From 6798b507248ad22a289d40668848fc49b2ca5c29 Mon Sep 17 00:00:00 2001 From: Szabolcs Nagy Date: Wed, 17 Feb 2021 14:50:26 +0000 Subject: v21.02 release * String routine changes * Added AArch64 ILP32 ABI support. * Fixed SVE strnlen return value. * Added MTE related __mtag_tag_region. * Added MTE related __mtag_tag_zero_region. * Minor code cleanups. --- README | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README b/README index ae465e9..9e1a34f 100644 --- a/README +++ b/README @@ -9,7 +9,7 @@ contributor-agreement.pdf. This is needed so upstreaming code to projects that require copyright assignment is possible. Regular quarterly releases are tagged as vYY.MM, the latest -release is v20.11. +release is v21.02. Source code layout: -- cgit v1.2.3 From 0ef8199a23954dc50a9d44dd8cd12d8e0b960db4 Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Mon, 4 Oct 2021 16:20:55 +0100 Subject: string: Add memset benchmark Add a randomized memset benchmark using string length and alignment distribution based on SPEC2017. --- string/bench/memset.c | 243 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 243 insertions(+) create mode 100644 string/bench/memset.c diff --git a/string/bench/memset.c b/string/bench/memset.c new file mode 100644 index 0000000..2d61969 --- /dev/null +++ b/string/bench/memset.c @@ -0,0 +1,243 @@ +/* + * memset benchmark. + * + * Copyright (c) 2021, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include "stringlib.h" +#include "benchlib.h" + +#define ITERS 5000 +#define ITERS2 20000000 +#define ITERS3 1000000 +#define NUM_TESTS 16384 +#define MIN_SIZE 32768 +#define MAX_SIZE (1024 * 1024) + +static uint8_t a[MAX_SIZE + 4096] __attribute__((__aligned__(64))); + +#define F(x) {#x, x}, + +static const struct fun +{ + const char *name; + void *(*fun)(void *, int, size_t); +} funtab[] = +{ +#if __aarch64__ + F(__memset_aarch64) +#elif __arm__ + F(__memset_arm) +#endif + F(memset) +#undef F + {0, 0} +}; + +typedef struct { uint32_t offset : 20, len : 12; } memset_test_t; +static memset_test_t test_arr[NUM_TESTS]; + +typedef struct { uint16_t size; uint16_t freq; } freq_data_t; +typedef struct { uint8_t align; uint16_t freq; } align_data_t; + +#define SIZE_NUM 65536 +#define SIZE_MASK (SIZE_NUM-1) +static uint8_t len_arr[SIZE_NUM]; + +/* Frequency data for memset sizes up to 4096 based on SPEC2017. 
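+   Each entry is a {size, freq} pair. The frequencies are scaled so the
+   table sums to SIZE_NUM (65536): init_memset_distribution expands it
+   into len_arr by repeating each size freq times, so indexing len_arr
+   with a random value samples lengths with the SPEC2017 probabilities.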
*/ +static freq_data_t memset_len_freq[] = +{ +{40,28817}, {32,15336}, { 16,3823}, {296,3545}, { 24,3454}, { 8,1412}, +{292,1202}, { 48, 927}, { 12, 613}, { 11, 539}, {284, 493}, {108, 414}, +{ 88, 380}, { 20, 295}, {312, 271}, { 72, 233}, { 2, 200}, { 4, 192}, +{ 15, 180}, { 14, 174}, { 13, 160}, { 56, 151}, { 36, 144}, { 64, 140}, +{4095,133}, { 10, 130}, { 9, 124}, { 3, 124}, { 28, 120}, { 0, 118}, +{288, 110}, {1152, 96}, {104, 90}, { 1, 86}, {832, 76}, {248, 74}, +{1024, 69}, {120, 64}, {512, 63}, {384, 60}, { 6, 59}, { 80, 54}, +{ 17, 50}, { 7, 49}, {520, 47}, {2048, 39}, {256, 37}, {864, 33}, +{1440, 28}, { 22, 27}, {2056, 24}, {260, 23}, { 68, 23}, { 5, 22}, +{ 18, 21}, {200, 18}, {2120, 18}, { 60, 17}, { 52, 16}, {336, 15}, +{ 44, 13}, {192, 13}, {160, 12}, {2064, 12}, {128, 12}, { 76, 11}, +{164, 11}, {152, 10}, {136, 9}, {488, 7}, { 96, 6}, {560, 6}, +{1016, 6}, {112, 5}, {232, 5}, {168, 5}, {952, 5}, {184, 5}, +{144, 4}, {252, 4}, { 84, 3}, {960, 3}, {3808, 3}, {244, 3}, +{280, 3}, {224, 3}, {156, 3}, {1088, 3}, {440, 3}, {216, 2}, +{304, 2}, { 23, 2}, { 25, 2}, { 26, 2}, {264, 2}, {328, 2}, +{1096, 2}, {240, 2}, {1104, 2}, {704, 2}, {1664, 2}, {360, 2}, +{808, 1}, {544, 1}, {236, 1}, {720, 1}, {368, 1}, {424, 1}, +{640, 1}, {1112, 1}, {552, 1}, {272, 1}, {776, 1}, {376, 1}, +{ 92, 1}, {536, 1}, {824, 1}, {496, 1}, {760, 1}, {792, 1}, +{504, 1}, {344, 1}, {1816, 1}, {880, 1}, {176, 1}, {320, 1}, +{352, 1}, {2008, 1}, {208, 1}, {408, 1}, {228, 1}, {2072, 1}, +{568, 1}, {220, 1}, {616, 1}, {600, 1}, {392, 1}, {696, 1}, +{2144, 1}, {1280, 1}, {2136, 1}, {632, 1}, {584, 1}, {456, 1}, +{472, 1}, {3440, 1}, {2088, 1}, {680, 1}, {2928, 1}, {212, 1}, +{648, 1}, {1752, 1}, {664, 1}, {3512, 1}, {1032, 1}, {528, 1}, +{4072, 1}, {204, 1}, {2880, 1}, {3392, 1}, {712, 1}, { 59, 1}, +{736, 1}, {592, 1}, {2520, 1}, {744, 1}, {196, 1}, {172, 1}, +{728, 1}, {2040, 1}, {1192, 1}, {3600, 1}, {0, 0} +}; + +#define ALIGN_NUM 1024 +#define ALIGN_MASK (ALIGN_NUM-1) +static uint8_t align_arr[ALIGN_NUM]; + +/* Alignment data for memset based on SPEC2017. */ +static align_data_t memset_align_freq[] = +{ + {16, 338}, {8, 307}, {32, 148}, {64, 131}, {4, 72}, {1, 23}, {2, 5}, {0, 0} +}; + +static void +init_memset_distribution (void) +{ + int i, j, freq, size, n; + + for (n = i = 0; (freq = memset_len_freq[i].freq) != 0; i++) + for (j = 0, size = memset_len_freq[i].size; j < freq; j++) + len_arr[n++] = size; + assert (n == SIZE_NUM); + + for (n = i = 0; (freq = memset_align_freq[i].freq) != 0; i++) + for (j = 0, size = memset_align_freq[i].align; j < freq; j++) + align_arr[n++] = size - 1; + assert (n == ALIGN_NUM); +} + +static size_t +init_memset (size_t max_size) +{ + size_t total = 0; + /* Create a random set of memsets with the given size and alignment + distributions. 
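+     Each offset is drawn uniformly below max_size, then rounded down to
+     a sampled alignment by clearing its low bits (align_arr stores
+     alignment-1 masks), so both position and alignment follow the
+     SPEC2017 profile.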
*/ + for (int i = 0; i < NUM_TESTS; i++) + { + test_arr[i].offset = (rand32 (0) & (max_size - 1)); + test_arr[i].offset &= ~align_arr[rand32 (0) & ALIGN_MASK]; + test_arr[i].len = len_arr[rand32 (0) & SIZE_MASK]; + total += test_arr[i].len; + } + + return total; +} + + +int main (void) +{ + init_memset_distribution (); + + memset (a, 1, sizeof (a)); + + printf("Random memset (bytes/ns):\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + size_t total_size = 0; + uint64_t tsum = 0; + printf ("%22s ", funtab[f].name); + rand32 (0x12345678); + + for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) + { + size_t memset_size = init_memset (size) * ITERS; + + for (int c = 0; c < NUM_TESTS; c++) + funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len); + + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS; i++) + for (int c = 0; c < NUM_TESTS; c++) + funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len); + t = clock_get_ns () - t; + total_size += memset_size; + tsum += t; + printf ("%dK: %.2f ", size / 1024, (double)memset_size / t); + } + printf( "avg %.2f\n", (double)total_size / tsum); + } + + size_t total_size = 0; + uint64_t tsum = 0; + printf ("%22s ", "memset_call"); + rand32 (0x12345678); + + for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) + { + size_t memset_size = init_memset (size) * ITERS; + + for (int c = 0; c < NUM_TESTS; c++) + memset (a + test_arr[c].offset, 0, test_arr[c].len); + + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS; i++) + for (int c = 0; c < NUM_TESTS; c++) + memset (a + test_arr[c].offset, 0, test_arr[c].len); + t = clock_get_ns () - t; + total_size += memset_size; + tsum += t; + printf ("%dK: %.2f ", size / 1024, (double)memset_size / t); + } + printf( "avg %.2f\n", (double)total_size / tsum); + + + printf ("\nMedium memset (bytes/ns):\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + printf ("%22s ", funtab[f].name); + + for (int size = 8; size <= 512; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + funtab[f].fun (a, 0, size); + t = clock_get_ns () - t; + printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); + } + printf ("\n"); + } + + printf ("%22s ", "memset_call"); + for (int size = 8; size <= 512; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + memset (a, 0, size); + t = clock_get_ns () - t; + printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); + } + + + printf ("\nLarge memset (bytes/ns):\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + printf ("%22s ", funtab[f].name); + + for (int size = 1024; size <= 65536; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS3; i++) + funtab[f].fun (a, 0, size); + t = clock_get_ns () - t; + printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); + } + printf ("\n"); + } + + printf ("%22s ", "memset_call"); + for (int size = 1024; size <= 65536; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS3; i++) + memset (a, 0, size); + t = clock_get_ns () - t; + printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); + } + printf ("\n\n"); + + return 0; +} -- cgit v1.2.3 From bc1f4b02cf12e50bca74474f76e7c29ef7fa0047 Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Mon, 4 Oct 2021 16:21:25 +0100 Subject: string: Improve strlen benchmark Increase the number of iterations of the random test. Minor code cleanup. 
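The random test measures latency rather than throughput by feeding each
result into the next call's address. A minimal C sketch of the idiom
(illustrative only; in the benchmark `mask` comes from `maskv`, which is
not shown here and is assumed to evaluate to zero at run time):

    size_t res = 0;
    for (int c = 0; c < NUM_TESTS; c++)
      /* (res & mask) is always zero, so the address is unchanged, but
         each call must still wait for the previous result before the
         next one can issue.  */
      res = strlen (a + strlen_tests[c] + (res & mask));
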
--- string/bench/strlen.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/string/bench/strlen.c b/string/bench/strlen.c index cc0f04b..b7eee6e 100644 --- a/string/bench/strlen.c +++ b/string/bench/strlen.c @@ -1,7 +1,7 @@ /* * strlen benchmark. * - * Copyright (c) 2020, Arm Limited. + * Copyright (c) 2020-2021, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -13,10 +13,10 @@ #include "stringlib.h" #include "benchlib.h" -#define ITERS 2000 +#define ITERS 5000 #define ITERS2 20000000 #define ITERS3 2000000 -#define NUM_STRLEN 16384 +#define NUM_TESTS 16384 #define MAX_ALIGN 32 #define MAX_STRLEN 256 @@ -49,7 +49,7 @@ static const struct fun }; #undef F -static uint16_t strlen_tests[NUM_STRLEN]; +static uint16_t strlen_tests[NUM_TESTS]; typedef struct { uint16_t size; uint16_t freq; } freq_data_t; typedef struct { uint8_t align; uint16_t freq; } align_data_t; @@ -117,7 +117,7 @@ init_strlen_tests (void) /* Create a random set of strlen input strings using the string length and alignment distributions. */ - for (int n = 0; n < NUM_STRLEN; n++) + for (int n = 0; n < NUM_TESTS; n++) { int align = strlen_align_arr[rand32 (0) & ALIGN_MASK]; int exp_len = strlen_len_arr[rand32 (0) & SIZE_MASK]; @@ -141,14 +141,14 @@ int main (void) size_t res = 0, strlen_size = 0, mask = maskv; printf ("%22s ", funtab[f].name); - for (int c = 0; c < NUM_STRLEN; c++) + for (int c = 0; c < NUM_TESTS; c++) strlen_size += funtab[f].fun (a + strlen_tests[c]); strlen_size *= ITERS; /* Measure latency of strlen result with (res & mask). */ uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS; i++) - for (int c = 0; c < NUM_STRLEN; c++) + for (int c = 0; c < NUM_TESTS; c++) res = funtab[f].fun (a + strlen_tests[c] + (res & mask)); t = clock_get_ns () - t; printf ("%.2f\n", (double)strlen_size / t); -- cgit v1.2.3 From 2760eebc2f456b2db6fd9a087d8cbfd534a2c27c Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Mon, 4 Oct 2021 16:21:36 +0100 Subject: string: Improve memcpy benchmark Improve memcpy benchmark. Double the number of random tests and the memory size. Add separate tests using a direct call to memcpy to compare with indirect call to GLIBC memcpy. Add a test for small aligned and unaligned memcpy. --- string/bench/memcpy.c | 162 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 119 insertions(+), 43 deletions(-) diff --git a/string/bench/memcpy.c b/string/bench/memcpy.c index d5d4ea7..80611a4 100644 --- a/string/bench/memcpy.c +++ b/string/bench/memcpy.c @@ -1,7 +1,7 @@ /* * memcpy benchmark. * - * Copyright (c) 2020, Arm Limited. + * Copyright (c) 2020-2021, Arm Limited. 
* SPDX-License-Identifier: MIT */ @@ -13,14 +13,15 @@ #include "stringlib.h" #include "benchlib.h" -#define ITERS 5000 +#define ITERS 5000 #define ITERS2 20000000 -#define ITERS3 500000 -#define MAX_COPIES 8192 -#define SIZE (256*1024) +#define ITERS3 200000 +#define NUM_TESTS 16384 +#define MIN_SIZE 32768 +#define MAX_SIZE (1024 * 1024) -static uint8_t a[SIZE + 4096] __attribute__((__aligned__(64))); -static uint8_t b[SIZE + 4096] __attribute__((__aligned__(64))); +static uint8_t a[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64))); +static uint8_t b[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64))); #define F(x) {#x, x}, @@ -30,7 +31,6 @@ static const struct fun void *(*fun)(void *, const void *, size_t); } funtab[] = { - F(memcpy) #if __aarch64__ F(__memcpy_aarch64) # if __ARM_NEON @@ -39,6 +39,7 @@ static const struct fun #elif __arm__ F(__memcpy_arm) #endif + F(memcpy) #undef F {0, 0} }; @@ -109,7 +110,7 @@ typedef struct uint64_t len : 16; } copy_t; -static copy_t copy[MAX_COPIES]; +static copy_t test_arr[NUM_TESTS]; typedef char *(*proto_t) (char *, const char *, size_t); @@ -140,14 +141,14 @@ init_copies (size_t max_size) size_t total = 0; /* Create a random set of copies with the given size and alignment distributions. */ - for (int i = 0; i < MAX_COPIES; i++) + for (int i = 0; i < NUM_TESTS; i++) { - copy[i].dst = (rand32 (0) & (max_size - 1)); - copy[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK]; - copy[i].src = (rand32 (0) & (max_size - 1)); - copy[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK]; - copy[i].len = size_arr[rand32 (0) & SIZE_MASK]; - total += copy[i].len; + test_arr[i].dst = (rand32 (0) & (max_size - 1)); + test_arr[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK]; + test_arr[i].src = (rand32 (0) & (max_size - 1)); + test_arr[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK]; + test_arr[i].len = size_arr[rand32 (0) & SIZE_MASK]; + total += test_arr[i].len; } return total; @@ -160,25 +161,27 @@ int main (void) memset (a, 1, sizeof (a)); memset (b, 2, sizeof (b)); - printf("Random memcpy:\n"); + printf("Random memcpy (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { size_t total = 0; uint64_t tsum = 0; - printf ("%22s (B/ns) ", funtab[f].name); + printf ("%22s ", funtab[f].name); rand32 (0x12345678); - for (int size = 16384; size <= SIZE; size *= 2) + for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) { size_t copy_size = init_copies (size) * ITERS; - for (int c = 0; c < MAX_COPIES; c++) - funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len); + for (int c = 0; c < NUM_TESTS; c++) + funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src, + test_arr[c].len); uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS; i++) - for (int c = 0; c < MAX_COPIES; c++) - funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len); + for (int c = 0; c < NUM_TESTS; c++) + funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src, + test_arr[c].len); t = clock_get_ns () - t; total += copy_size; tsum += t; @@ -187,74 +190,147 @@ int main (void) printf( "avg %.2f\n", (double)total / tsum); } - printf ("\nMedium memcpy:\n"); + size_t total = 0; + uint64_t tsum = 0; + printf ("%22s ", "memcpy_call"); + rand32 (0x12345678); + + for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) + { + size_t copy_size = init_copies (size) * ITERS; + + for (int c = 0; c < NUM_TESTS; c++) + memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len); + + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS; i++) + for (int c = 0; c < NUM_TESTS; c++) + 
memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len); + t = clock_get_ns () - t; + total += copy_size; + tsum += t; + printf ("%dK: %.2f ", size / 1024, (double)copy_size / t); + } + printf( "avg %.2f\n", (double)total / tsum); + + + printf ("\nAligned medium memcpy (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { - printf ("%22s (B/ns) ", funtab[f].name); + printf ("%22s ", funtab[f].name); - for (int size = 16; size <= 512; size *= 2) + for (int size = 8; size <= 512; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS2; i++) funtab[f].fun (b, a, size); t = clock_get_ns () - t; - printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, - size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t); + printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); } printf ("\n"); } - printf ("\nLarge memcpy:\n"); + printf ("%22s ", "memcpy_call"); + for (int size = 8; size <= 512; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + memcpy (b, a, size); + t = clock_get_ns () - t; + printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); + } + printf ("\n"); + + + printf ("\nUnaligned medium memcpy (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { - printf ("%22s (B/ns) ", funtab[f].name); + printf ("%22s ", funtab[f].name); - for (int size = 1024; size <= 32768; size *= 2) + for (int size = 8; size <= 512; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + funtab[f].fun (b + 3, a + 1, size); + t = clock_get_ns () - t; + printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); + } + printf ("\n"); + } + + printf ("%22s ", "memcpy_call"); + for (int size = 8; size <= 512; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + memcpy (b + 3, a + 1, size); + t = clock_get_ns () - t; + printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); + } + printf ("\n"); + + + printf ("\nLarge memcpy (bytes/ns):\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + printf ("%22s ", funtab[f].name); + + for (int size = 1024; size <= 65536; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS3; i++) funtab[f].fun (b, a, size); t = clock_get_ns () - t; - printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, - size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t); + printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); } printf ("\n"); } - printf ("\nUnaligned forwards memmove:\n"); + printf ("%22s ", "memcpy_call"); + for (int size = 1024; size <= 65536; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS3; i++) + memcpy (b, a, size); + t = clock_get_ns () - t; + printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); + } + printf ("\n"); + + + printf ("\nUnaligned forwards memmove (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { - printf ("%22s (B/ns) ", funtab[f].name); + printf ("%22s ", funtab[f].name); - for (int size = 1024; size <= 32768; size *= 2) + for (int size = 1024; size <= 65536; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS3; i++) funtab[f].fun (a, a + 256 + (i & 31), size); t = clock_get_ns () - t; - printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, - size < 1024 ? 
'B' : 'K', (double)size * ITERS3 / t); + printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); } printf ("\n"); } - printf ("\nUnaligned backwards memmove:\n"); + printf ("\nUnaligned backwards memmove (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { - printf ("%22s (B/ns) ", funtab[f].name); + printf ("%22s ", funtab[f].name); - for (int size = 1024; size <= 32768; size *= 2) + for (int size = 1024; size <= 65536; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS3; i++) funtab[f].fun (a + 256 + (i & 31), a, size); t = clock_get_ns () - t; - printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, - size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t); + printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); } printf ("\n"); } + printf ("\n"); return 0; } -- cgit v1.2.3 From 7a9fd1603e1179b044406fb9b6cc5770d736cde7 Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Fri, 29 Oct 2021 15:39:00 +0100 Subject: string: Optimize memcmp Rewrite memcmp to improve performance. On small and medium inputs performance is typically 25% better. Large inputs use a SIMD loop processing 64 bytes per iteration, which is 50% faster than the previous version. --- string/aarch64/memcmp.S | 237 +++++++++++++++++++++++++++++------------------- 1 file changed, 145 insertions(+), 92 deletions(-) diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S index 3b10266..7ca1135 100644 --- a/string/aarch64/memcmp.S +++ b/string/aarch64/memcmp.S @@ -1,103 +1,84 @@ /* memcmp - compare memory * - * Copyright (c) 2013-2020, Arm Limited. + * Copyright (c) 2013-2021, Arm Limited. * SPDX-License-Identifier: MIT */ /* Assumptions: * - * ARMv8-a, AArch64, unaligned accesses. + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. */ #include "../asmdefs.h" -/* Parameters and result. */ -#define src1 x0 -#define src2 x1 -#define limit x2 -#define result w0 - -/* Internal variables. */ -#define data1 x3 -#define data1w w3 -#define data1h x4 -#define data2 x5 -#define data2w w5 -#define data2h x6 -#define tmp1 x7 -#define tmp2 x8 +#define src1 x0 +#define src2 x1 +#define limit x2 +#define result w0 + +#define data1 x3 +#define data1w w3 +#define data2 x4 +#define data2w w4 +#define data3 x5 +#define data3w w5 +#define data4 x6 +#define data4w w6 +#define tmp x6 +#define src1end x7 +#define src2end x8 + ENTRY (__memcmp_aarch64) PTR_ARG (0) PTR_ARG (1) SIZE_ARG (2) - subs limit, limit, 8 - b.lo L(less8) - - ldr data1, [src1], 8 - ldr data2, [src2], 8 - cmp data1, data2 - b.ne L(return) - - subs limit, limit, 8 - b.gt L(more16) - - ldr data1, [src1, limit] - ldr data2, [src2, limit] - b L(return) - -L(more16): - ldr data1, [src1], 8 - ldr data2, [src2], 8 - cmp data1, data2 - bne L(return) - /* Jump directly to comparing the last 16 bytes for 32 byte (or less) - strings. */ - subs limit, limit, 16 + cmp limit, 16 + b.lo L(less16) + ldp data1, data3, [src1] + ldp data2, data4, [src2] + ccmp data1, data2, 0, ne + ccmp data3, data4, 0, eq + b.ne L(return2) + + add src1end, src1, limit + add src2end, src2, limit + cmp limit, 32 b.ls L(last_bytes) + cmp limit, 160 + b.hs L(loop_align) + sub limit, limit, 32 - /* We overlap loads between 0-32 bytes at either side of SRC1 when we - try to align, so limit it only to strings larger than 128 bytes. */ - cmp limit, 96 - b.ls L(loop16) - - /* Align src1 and adjust src2 with bytes not yet done. 
*/ - and tmp1, src1, 15 - add limit, limit, tmp1 - sub src1, src1, tmp1 - sub src2, src2, tmp1 - - /* Loop performing 16 bytes per iteration using aligned src1. - Limit is pre-decremented by 16 and must be larger than zero. - Exit if <= 16 bytes left to do or if the data is not equal. */ .p2align 4 -L(loop16): - ldp data1, data1h, [src1], 16 - ldp data2, data2h, [src2], 16 - subs limit, limit, 16 - ccmp data1, data2, 0, hi - ccmp data1h, data2h, 0, eq - b.eq L(loop16) - +L(loop32): + ldp data1, data3, [src1, 16] + ldp data2, data4, [src2, 16] cmp data1, data2 - bne L(return) - mov data1, data1h - mov data2, data2h + ccmp data3, data4, 0, eq + b.ne L(return2) + cmp limit, 16 + b.ls L(last_bytes) + + ldp data1, data3, [src1, 32] + ldp data2, data4, [src2, 32] cmp data1, data2 - bne L(return) + ccmp data3, data4, 0, eq + b.ne L(return2) + add src1, src1, 32 + add src2, src2, 32 +L(last64): + subs limit, limit, 32 + b.hi L(loop32) /* Compare last 1-16 bytes using unaligned access. */ L(last_bytes): - add src1, src1, limit - add src2, src2, limit - ldp data1, data1h, [src1] - ldp data2, data2h, [src2] - cmp data1, data2 - bne L(return) - mov data1, data1h - mov data2, data2h + ldp data1, data3, [src1end, -16] + ldp data2, data4, [src2end, -16] +L(return2): cmp data1, data2 + csel data1, data1, data3, ne + csel data2, data2, data4, ne /* Compare data bytes and set return value to 0, -1 or 1. */ L(return): @@ -105,33 +86,105 @@ L(return): rev data1, data1 rev data2, data2 #endif - cmp data1, data2 -L(ret_eq): + cmp data1, data2 cset result, ne cneg result, result, lo ret .p2align 4 - /* Compare up to 8 bytes. Limit is [-8..-1]. */ +L(less16): + add src1end, src1, limit + add src2end, src2, limit + tbz limit, 3, L(less8) + ldr data1, [src1] + ldr data2, [src2] + ldr data3, [src1end, -8] + ldr data4, [src2end, -8] + b L(return2) + + .p2align 4 L(less8): - adds limit, limit, 4 - b.lo L(less4) - ldr data1w, [src1], 4 - ldr data2w, [src2], 4 + tbz limit, 2, L(less4) + ldr data1w, [src1] + ldr data2w, [src2] + ldr data3w, [src1end, -4] + ldr data4w, [src2end, -4] + b L(return2) + +L(less4): + tbz limit, 1, L(less2) + ldrh data1w, [src1] + ldrh data2w, [src2] cmp data1w, data2w b.ne L(return) - sub limit, limit, 4 -L(less4): - adds limit, limit, 4 - beq L(ret_eq) -L(byte_loop): - ldrb data1w, [src1], 1 - ldrb data2w, [src2], 1 - subs limit, limit, 1 - ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ - b.eq L(byte_loop) +L(less2): + mov result, 0 + tbz limit, 0, L(return_zero) + ldrb data1w, [src1end, -1] + ldrb data2w, [src2end, -1] sub result, data1w, data2w +L(return_zero): ret -END (__memcmp_aarch64) +L(loop_align): + ldp data1, data3, [src1, 16] + ldp data2, data4, [src2, 16] + cmp data1, data2 + ccmp data3, data4, 0, eq + b.ne L(return2) + /* Align src2 and adjust src1, src2 and limit. */ + and tmp, src2, 15 + sub tmp, tmp, 16 + sub src2, src2, tmp + add limit, limit, tmp + sub src1, src1, tmp + sub limit, limit, 64 + 16 + + .p2align 4 +L(loop64): + ldr q0, [src1, 16] + ldr q1, [src2, 16] + subs limit, limit, 64 + ldr q2, [src1, 32] + ldr q3, [src2, 32] + eor v0.16b, v0.16b, v1.16b + eor v1.16b, v2.16b, v3.16b + ldr q2, [src1, 48] + ldr q3, [src2, 48] + umaxp v0.16b, v0.16b, v1.16b + ldr q4, [src1, 64]! + ldr q5, [src2, 64]! + eor v1.16b, v2.16b, v3.16b + eor v2.16b, v4.16b, v5.16b + umaxp v1.16b, v1.16b, v2.16b + umaxp v0.16b, v0.16b, v1.16b + umaxp v0.16b, v0.16b, v0.16b + fmov tmp, d0 + ccmp tmp, 0, 0, hi + b.eq L(loop64) + + /* If equal, process last 1-64 bytes using scalar loop. 
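+	   The loop above XORs four pairs of 16-byte blocks and folds the
+	   results with pairwise UMAXP into d0, so tmp is zero iff all 64
+	   bytes matched; on a mismatch, tmp is the syndrome used below to
+	   find the 8-byte word holding the first difference.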
*/ + add limit, limit, 64 + 16 + cbz tmp, L(last64) + + /* Determine the 8-byte aligned offset of the first difference. */ +#ifdef __AARCH64EB__ + rev16 tmp, tmp +#endif + rev tmp, tmp + clz tmp, tmp + bic tmp, tmp, 7 + sub tmp, tmp, 48 + ldr data1, [src1, tmp] + ldr data2, [src2, tmp] +#ifndef __AARCH64EB__ + rev data1, data1 + rev data2, data2 +#endif + mov result, 1 + cmp data1, data2 + cneg result, result, lo + ret + +END (__memcmp_aarch64) -- cgit v1.2.3 From 074e835776116824ea1a23cbb4260dd60b1aaba0 Mon Sep 17 00:00:00 2001 From: Szabolcs Nagy Date: Tue, 21 Dec 2021 10:22:22 +0000 Subject: math: fix constant in sinf and cosf gcc-12 -frounding-math started using runtime rounding mode for converting double constants to float, so abstop12(pio4) is no longer a compile time constant (this is required by iso c). Use float pio4f instead to make the generated code the same as before and avoid regressions on gcc-12. --- math/cosf.c | 2 +- math/sincosf.c | 2 +- math/sincosf.h | 2 +- math/sinf.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/math/cosf.c b/math/cosf.c index f29f194..67a3798 100644 --- a/math/cosf.c +++ b/math/cosf.c @@ -22,7 +22,7 @@ cosf (float y) int n; const sincos_t *p = &__sincosf_table[0]; - if (abstop12 (y) < abstop12 (pio4)) + if (abstop12 (y) < abstop12 (pio4f)) { double x2 = x * x; diff --git a/math/sincosf.c b/math/sincosf.c index 9746f1c..6fb299d 100644 --- a/math/sincosf.c +++ b/math/sincosf.c @@ -22,7 +22,7 @@ sincosf (float y, float *sinp, float *cosp) int n; const sincos_t *p = &__sincosf_table[0]; - if (abstop12 (y) < abstop12 (pio4)) + if (abstop12 (y) < abstop12 (pio4f)) { double x2 = x * x; diff --git a/math/sincosf.h b/math/sincosf.h index 1e80fc9..5912469 100644 --- a/math/sincosf.h +++ b/math/sincosf.h @@ -12,7 +12,7 @@ /* 2PI * 2^-64. */ static const double pi63 = 0x1.921FB54442D18p-62; /* PI / 4. */ -static const double pio4 = 0x1.921FB54442D18p-1; +static const float pio4f = 0x1.921FB6p-1f; /* The constants and polynomials for sine and cosine. */ typedef struct diff --git a/math/sinf.c b/math/sinf.c index ddbc1da..4d2cbd6 100644 --- a/math/sinf.c +++ b/math/sinf.c @@ -21,7 +21,7 @@ sinf (float y) int n; const sincos_t *p = &__sincosf_table[0]; - if (abstop12 (y) < abstop12 (pio4)) + if (abstop12 (y) < abstop12 (pio4f)) { s = x * x; -- cgit v1.2.3 From 8c107a34bb96a4c4a7de4ee6de210b30dbb6e45f Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Thu, 10 Feb 2022 10:23:09 +0000 Subject: string: Add SVE memcpy Add an initial SVE memcpy implementation. Copies up to 32 bytes use SVE vectors which improves the random memcpy benchmark significantly. --- string/aarch64/memcpy-sve.S | 180 ++++++++++++++++++++++++++++++++++++++++++++ string/bench/memcpy.c | 3 + string/include/stringlib.h | 2 + string/test/memcpy.c | 3 + string/test/memmove.c | 3 + 5 files changed, 191 insertions(+) create mode 100644 string/aarch64/memcpy-sve.S diff --git a/string/aarch64/memcpy-sve.S b/string/aarch64/memcpy-sve.S new file mode 100644 index 0000000..f85e800 --- /dev/null +++ b/string/aarch64/memcpy-sve.S @@ -0,0 +1,180 @@ +/* + * memcpy - copy memory area + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses. 
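+ * Copies of up to 32 bytes use WHILELO to build a byte predicate from
+ * count, so one or two predicated LD1B/ST1B pairs handle any small
+ * length with no scalar tail loop.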
+ * + */ + +#if __ARM_FEATURE_SVE + +#include "../asmdefs.h" + +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define tmp1 x6 +#define vlen x6 + +#define A_q q0 +#define B_q q1 +#define C_q q2 +#define D_q q3 +#define E_q q4 +#define F_q q5 +#define G_q q6 +#define H_q q7 + +/* This implementation handles overlaps and supports both memcpy and memmove + from a single entry point. It uses unaligned accesses and branchless + sequences to keep the code small, simple and improve performance. + SVE vectors are used to speedup small copies. + + Copies are split into 3 main cases: small copies of up to 32 bytes, medium + copies of up to 128 bytes, and large copies. The overhead of the overlap + check is negligible since it is only required for large copies. + + Large copies use a software pipelined loop processing 64 bytes per iteration. + The source pointer is 16-byte aligned to minimize unaligned accesses. + The loop tail is handled by always copying 64 bytes from the end. +*/ + +ENTRY_ALIAS (__memmove_aarch64_sve) +ENTRY (__memcpy_aarch64_sve) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + + cmp count, 128 + b.hi L(copy_long) + cmp count, 32 + b.hi L(copy32_128) + + whilelo p0.b, xzr, count + cntb vlen + tbnz vlen, 4, L(vlen128) + ld1b z0.b, p0/z, [src] + st1b z0.b, p0, [dstin] + ret + + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + add srcend, src, count + add dstend, dstin, count + ldp A_q, B_q, [src] + ldp C_q, D_q, [srcend, -32] + cmp count, 64 + b.hi L(copy128) + stp A_q, B_q, [dstin] + stp C_q, D_q, [dstend, -32] + ret + + /* Copy 65..128 bytes. */ +L(copy128): + ldp E_q, F_q, [src, 32] + cmp count, 96 + b.ls L(copy96) + ldp G_q, H_q, [srcend, -64] + stp G_q, H_q, [dstend, -64] +L(copy96): + stp A_q, B_q, [dstin] + stp E_q, F_q, [dstin, 32] + stp C_q, D_q, [dstend, -32] + ret + + /* Copy more than 128 bytes. */ +L(copy_long): + add srcend, src, count + add dstend, dstin, count + + /* Use backwards copy if there is an overlap. */ + sub tmp1, dstin, src + cmp tmp1, count + b.lo L(copy_long_backwards) + + /* Copy 16 bytes and then align src to 16-byte alignment. */ + ldr D_q, [src] + and tmp1, src, 15 + bic src, src, 15 + sub dst, dstin, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_q, B_q, [src, 16] + str D_q, [dstin] + ldp C_q, D_q, [src, 48] + subs count, count, 128 + 16 /* Test and readjust count. */ + b.ls L(copy64_from_end) +L(loop64): + stp A_q, B_q, [dst, 16] + ldp A_q, B_q, [src, 80] + stp C_q, D_q, [dst, 48] + ldp C_q, D_q, [src, 112] + add src, src, 64 + add dst, dst, 64 + subs count, count, 64 + b.hi L(loop64) + + /* Write the last iteration and copy 64 bytes from the end. */ +L(copy64_from_end): + ldp E_q, F_q, [srcend, -64] + stp A_q, B_q, [dst, 16] + ldp A_q, B_q, [srcend, -32] + stp C_q, D_q, [dst, 48] + stp E_q, F_q, [dstend, -64] + stp A_q, B_q, [dstend, -32] + ret + +L(vlen128): + whilelo p1.b, vlen, count + ld1b z0.b, p0/z, [src, 0, mul vl] + ld1b z1.b, p1/z, [src, 1, mul vl] + st1b z0.b, p0, [dstin, 0, mul vl] + st1b z1.b, p1, [dstin, 1, mul vl] + ret + + /* Large backwards copy for overlapping copies. + Copy 16 bytes and then align srcend to 16-byte alignment. 
*/ +L(copy_long_backwards): + cbz tmp1, L(return) + ldr D_q, [srcend, -16] + and tmp1, srcend, 15 + bic srcend, srcend, 15 + sub count, count, tmp1 + ldp A_q, B_q, [srcend, -32] + str D_q, [dstend, -16] + ldp C_q, D_q, [srcend, -64] + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls L(copy64_from_start) + +L(loop64_backwards): + str B_q, [dstend, -16] + str A_q, [dstend, -32] + ldp A_q, B_q, [srcend, -96] + str D_q, [dstend, -48] + str C_q, [dstend, -64]! + ldp C_q, D_q, [srcend, -128] + sub srcend, srcend, 64 + subs count, count, 64 + b.hi L(loop64_backwards) + + /* Write the last iteration and copy 64 bytes from the start. */ +L(copy64_from_start): + ldp E_q, F_q, [src, 32] + stp A_q, B_q, [dstend, -32] + ldp A_q, B_q, [src] + stp C_q, D_q, [dstend, -64] + stp E_q, F_q, [dstin, 32] + stp A_q, B_q, [dstin] +L(return): + ret + +END (__memcpy_aarch64_sve) +#endif diff --git a/string/bench/memcpy.c b/string/bench/memcpy.c index 80611a4..6bd2763 100644 --- a/string/bench/memcpy.c +++ b/string/bench/memcpy.c @@ -36,6 +36,9 @@ static const struct fun # if __ARM_NEON F(__memcpy_aarch64_simd) # endif +# if __ARM_FEATURE_SVE + F(__memcpy_aarch64_sve) +# endif #elif __arm__ F(__memcpy_arm) #endif diff --git a/string/include/stringlib.h b/string/include/stringlib.h index 378c3cd..ae1b289 100644 --- a/string/include/stringlib.h +++ b/string/include/stringlib.h @@ -42,6 +42,8 @@ void *__memcpy_aarch64_simd (void *__restrict, const void *__restrict, size_t); void *__memmove_aarch64_simd (void *, const void *, size_t); #endif # if __ARM_FEATURE_SVE +void *__memcpy_aarch64_sve (void *__restrict, const void *__restrict, size_t); +void *__memmove_aarch64_sve (void *__restrict, const void *__restrict, size_t); void *__memchr_aarch64_sve (const void *, int, size_t); int __memcmp_aarch64_sve (const void *, const void *, size_t); char *__strchr_aarch64_sve (const char *, int); diff --git a/string/test/memcpy.c b/string/test/memcpy.c index ce0ceee..21b35b9 100644 --- a/string/test/memcpy.c +++ b/string/test/memcpy.c @@ -28,6 +28,9 @@ static const struct fun # if __ARM_NEON F(__memcpy_aarch64_simd, 1) # endif +# if __ARM_FEATURE_SVE + F(__memcpy_aarch64_sve, 1) +# endif #elif __arm__ F(__memcpy_arm, 0) #endif diff --git a/string/test/memmove.c b/string/test/memmove.c index 689b68c..12a7057 100644 --- a/string/test/memmove.c +++ b/string/test/memmove.c @@ -28,6 +28,9 @@ static const struct fun # if __ARM_NEON F(__memmove_aarch64_simd, 1) # endif +# if __ARM_FEATURE_SVE + F(__memmove_aarch64_sve, 1) +# endif #endif {0, 0, 0} // clang-format on -- cgit v1.2.3 From 7b91c3cdb12b023004cb4dda30a1aa3424329ce6 Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Thu, 10 Feb 2022 10:27:27 +0000 Subject: string: Merge MTE versions of strcmp and strncmp Merge the MTE and non-MTE versions of strcmp and strncmp since the MTE versions are faster. 
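Both routines find the terminating NUL a word at a time using the
(X - 1) & ~(X | 0x7f) trick described in the comments below. A
standalone C sketch (has_nul_byte is an illustrative name, not part of
the sources):

    #include <stdint.h>

    /* Nonzero iff some byte of x is zero.  Subtracting 1 from every
       byte sets the top bit of each byte that was zero (borrows can
       also flag 0x01 bytes above a zero byte, which is why the
       big-endian code byte-reverses first); ~(x | 0x7f...) keeps only
       the top bits of bytes whose top bit was clear.  */
    static inline uint64_t
    has_nul_byte (uint64_t x)
    {
      return (x - 0x0101010101010101ULL) & ~(x | 0x7f7f7f7f7f7f7f7fULL);
    }

This is exactly what the assembly computes with sub/orr/bics on the
REP8_01 and REP8_7f constants.
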
--- string/aarch64/strcmp-mte.S | 189 -------------------------- string/aarch64/strcmp.S | 234 ++++++++++++++++++--------------- string/aarch64/strncmp-mte.S | 307 ------------------------------------------- string/aarch64/strncmp.S | 234 ++++++++++++++++++++------------- string/include/stringlib.h | 2 - string/test/strcmp.c | 3 +- string/test/strncmp.c | 3 +- 7 files changed, 268 insertions(+), 704 deletions(-) delete mode 100644 string/aarch64/strcmp-mte.S delete mode 100644 string/aarch64/strncmp-mte.S diff --git a/string/aarch64/strcmp-mte.S b/string/aarch64/strcmp-mte.S deleted file mode 100644 index 12d1a6b..0000000 --- a/string/aarch64/strcmp-mte.S +++ /dev/null @@ -1,189 +0,0 @@ -/* - * strcmp - compare two strings - * - * Copyright (c) 2012-2020, Arm Limited. - * SPDX-License-Identifier: MIT - */ - - -/* Assumptions: - * - * ARMv8-a, AArch64. - * MTE compatible. - */ - -#include "../asmdefs.h" - -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f - -#define src1 x0 -#define src2 x1 -#define result x0 - -#define data1 x2 -#define data1w w2 -#define data2 x3 -#define data2w w3 -#define has_nul x4 -#define diff x5 -#define off1 x5 -#define syndrome x6 -#define tmp x6 -#define data3 x7 -#define zeroones x8 -#define shift x9 -#define off2 x10 - -/* On big-endian early bytes are at MSB and on little-endian LSB. - LS_FW means shifting towards early bytes. */ -#ifdef __AARCH64EB__ -# define LS_FW lsl -#else -# define LS_FW lsr -#endif - -/* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. - Since carry propagation makes 0x1 bytes before a NUL byte appear - NUL too in big-endian, byte-reverse the data before the NUL check. */ - - -ENTRY (__strcmp_aarch64_mte) - PTR_ARG (0) - PTR_ARG (1) - sub off2, src2, src1 - mov zeroones, REP8_01 - and tmp, src1, 7 - tst off2, 7 - b.ne L(misaligned8) - cbnz tmp, L(mutual_align) - - .p2align 4 - -L(loop_aligned): - ldr data2, [src1, off2] - ldr data1, [src1], 8 -L(start_realigned): -#ifdef __AARCH64EB__ - rev tmp, data1 - sub has_nul, tmp, zeroones - orr tmp, tmp, REP8_7f -#else - sub has_nul, data1, zeroones - orr tmp, data1, REP8_7f -#endif - bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */ - ccmp data1, data2, 0, eq - b.eq L(loop_aligned) -#ifdef __AARCH64EB__ - rev has_nul, has_nul -#endif - eor diff, data1, data2 - orr syndrome, diff, has_nul -L(end): -#ifndef __AARCH64EB__ - rev syndrome, syndrome - rev data1, data1 - rev data2, data2 -#endif - clz shift, syndrome - /* The most-significant-non-zero bit of the syndrome marks either the - first bit that is different, or the top bit of the first zero byte. - Shifting left now will bring the critical information into the - top bits. */ - lsl data1, data1, shift - lsl data2, data2, shift - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, 56 - sub result, data1, data2, lsr 56 - ret - - .p2align 4 - -L(mutual_align): - /* Sources are mutually aligned, but are not currently at an - alignment boundary. Round down the addresses and then mask off - the bytes that precede the start point. */ - bic src1, src1, 7 - ldr data2, [src1, off2] - ldr data1, [src1], 8 - neg shift, src2, lsl 3 /* Bits to alignment -64. 
*/ - mov tmp, -1 - LS_FW tmp, tmp, shift - orr data1, data1, tmp - orr data2, data2, tmp - b L(start_realigned) - -L(misaligned8): - /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always - checking to make sure that we don't access beyond the end of SRC2. */ - cbz tmp, L(src1_aligned) -L(do_misaligned): - ldrb data1w, [src1], 1 - ldrb data2w, [src2], 1 - cmp data1w, 0 - ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ - b.ne L(done) - tst src1, 7 - b.ne L(do_misaligned) - -L(src1_aligned): - neg shift, src2, lsl 3 - bic src2, src2, 7 - ldr data3, [src2], 8 -#ifdef __AARCH64EB__ - rev data3, data3 -#endif - lsr tmp, zeroones, shift - orr data3, data3, tmp - sub has_nul, data3, zeroones - orr tmp, data3, REP8_7f - bics has_nul, has_nul, tmp - b.ne L(tail) - - sub off1, src2, src1 - - .p2align 4 - -L(loop_unaligned): - ldr data3, [src1, off1] - ldr data2, [src1, off2] -#ifdef __AARCH64EB__ - rev data3, data3 -#endif - sub has_nul, data3, zeroones - orr tmp, data3, REP8_7f - ldr data1, [src1], 8 - bics has_nul, has_nul, tmp - ccmp data1, data2, 0, eq - b.eq L(loop_unaligned) - - lsl tmp, has_nul, shift -#ifdef __AARCH64EB__ - rev tmp, tmp -#endif - eor diff, data1, data2 - orr syndrome, diff, tmp - cbnz syndrome, L(end) -L(tail): - ldr data1, [src1] - neg shift, shift - lsr data2, data3, shift - lsr has_nul, has_nul, shift -#ifdef __AARCH64EB__ - rev data2, data2 - rev has_nul, has_nul -#endif - eor diff, data1, data2 - orr syndrome, diff, has_nul - b L(end) - -L(done): - sub result, data1, data2 - ret - -END (__strcmp_aarch64_mte) - diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S index 7714ebf..6e77845 100644 --- a/string/aarch64/strcmp.S +++ b/string/aarch64/strcmp.S @@ -1,168 +1,184 @@ /* * strcmp - compare two strings * - * Copyright (c) 2012-2020, Arm Limited. + * Copyright (c) 2012-2022, Arm Limited. * SPDX-License-Identifier: MIT */ + /* Assumptions: * - * ARMv8-a, AArch64 + * ARMv8-a, AArch64. + * MTE compatible. */ #include "../asmdefs.h" #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 -/* Parameters and result. */ #define src1 x0 #define src2 x1 #define result x0 -/* Internal variables. */ #define data1 x2 #define data1w w2 #define data2 x3 #define data2w w3 #define has_nul x4 #define diff x5 +#define off1 x5 #define syndrome x6 -#define tmp1 x7 -#define tmp2 x8 -#define tmp3 x9 -#define zeroones x10 -#define pos x11 +#define tmp x6 +#define data3 x7 +#define zeroones x8 +#define shift x9 +#define off2 x10 + +/* On big-endian early bytes are at MSB and on little-endian LSB. + LS_FW means shifting towards early bytes. */ +#ifdef __AARCH64EB__ +# define LS_FW lsl +#else +# define LS_FW lsr +#endif + +/* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. + Since carry propagation makes 0x1 bytes before a NUL byte appear + NUL too in big-endian, byte-reverse the data before the NUL check. */ + - /* Start of performance-critical section -- one 64B cache line. 
*/ ENTRY (__strcmp_aarch64) PTR_ARG (0) PTR_ARG (1) - eor tmp1, src1, src2 - mov zeroones, #REP8_01 - tst tmp1, #7 + sub off2, src2, src1 + mov zeroones, REP8_01 + and tmp, src1, 7 + tst off2, 7 b.ne L(misaligned8) - ands tmp1, src1, #7 - b.ne L(mutual_align) - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. */ + cbnz tmp, L(mutual_align) + + .p2align 4 + L(loop_aligned): - ldr data1, [src1], #8 - ldr data2, [src2], #8 + ldr data2, [src1, off2] + ldr data1, [src1], 8 L(start_realigned): - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ +#ifdef __AARCH64EB__ + rev tmp, data1 + sub has_nul, tmp, zeroones + orr tmp, tmp, REP8_7f +#else + sub has_nul, data1, zeroones + orr tmp, data1, REP8_7f +#endif + bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */ + ccmp data1, data2, 0, eq + b.eq L(loop_aligned) +#ifdef __AARCH64EB__ + rev has_nul, has_nul +#endif + eor diff, data1, data2 orr syndrome, diff, has_nul - cbz syndrome, L(loop_aligned) - /* End of performance-critical section -- one 64B cache line. */ - L(end): -#ifndef __AARCH64EB__ +#ifndef __AARCH64EB__ rev syndrome, syndrome rev data1, data1 - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. - Shifting left now will bring the critical information into the - top bits. */ - clz pos, syndrome rev data2, data2 - lsl data1, data1, pos - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - ret -#else - /* For big-endian we cannot use the trick with the syndrome value - as carry-propagation can corrupt the upper bits if the trailing - bytes in the string contain 0x01. */ - /* However, if there is no NUL byte in the dword, we can generate - the result directly. We can't just subtract the bytes as the - MSB might be significant. */ - cbnz has_nul, 1f - cmp data1, data2 - cset result, ne - cneg result, result, lo - ret -1: - /* Re-compute the NUL-byte detection, using a byte-reversed value. */ - rev tmp3, data1 - sub tmp1, tmp3, zeroones - orr tmp2, tmp3, #REP8_7f - bic has_nul, tmp1, tmp2 - rev has_nul, has_nul - orr syndrome, diff, has_nul - clz pos, syndrome - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. +#endif + clz shift, syndrome + /* The most-significant-non-zero bit of the syndrome marks either the + first bit that is different, or the top bit of the first zero byte. Shifting left now will bring the critical information into the top bits. */ - lsl data1, data1, pos - lsl data2, data2, pos + lsl data1, data1, shift + lsl data2, data2, shift /* But we need to zero-extend (char is unsigned) the value and then perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 + lsr data1, data1, 56 + sub result, data1, data2, lsr 56 ret -#endif + + .p2align 4 L(mutual_align): /* Sources are mutually aligned, but are not currently at an alignment boundary. Round down the addresses and then mask off - the bytes that preceed the start point. */ - bic src1, src1, #7 - bic src2, src2, #7 - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. 
*/ - ldr data1, [src1], #8 - neg tmp1, tmp1 /* Bits to alignment -64. */ - ldr data2, [src2], #8 - mov tmp2, #~0 -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#endif - orr data1, data1, tmp2 - orr data2, data2, tmp2 + the bytes that precede the start point. */ + bic src1, src1, 7 + ldr data2, [src1, off2] + ldr data1, [src1], 8 + neg shift, src2, lsl 3 /* Bits to alignment -64. */ + mov tmp, -1 + LS_FW tmp, tmp, shift + orr data1, data1, tmp + orr data2, data2, tmp b L(start_realigned) L(misaligned8): /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always - checking to make sure that we don't access beyond page boundary in - SRC2. */ - tst src1, #7 - b.eq L(loop_misaligned) + checking to make sure that we don't access beyond the end of SRC2. */ + cbz tmp, L(src1_aligned) L(do_misaligned): - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - cmp data1w, #1 - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + cmp data1w, 0 + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ b.ne L(done) - tst src1, #7 + tst src1, 7 b.ne L(do_misaligned) -L(loop_misaligned): - /* Test if we are within the last dword of the end of a 4K page. If - yes then jump back to the misaligned loop to copy a byte at a time. */ - and tmp1, src2, #0xff8 - eor tmp1, tmp1, #0xff8 - cbz tmp1, L(do_misaligned) - ldr data1, [src1], #8 - ldr data2, [src2], #8 - - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ +L(src1_aligned): + neg shift, src2, lsl 3 + bic src2, src2, 7 + ldr data3, [src2], 8 +#ifdef __AARCH64EB__ + rev data3, data3 +#endif + lsr tmp, zeroones, shift + orr data3, data3, tmp + sub has_nul, data3, zeroones + orr tmp, data3, REP8_7f + bics has_nul, has_nul, tmp + b.ne L(tail) + + sub off1, src2, src1 + + .p2align 4 + +L(loop_unaligned): + ldr data3, [src1, off1] + ldr data2, [src1, off2] +#ifdef __AARCH64EB__ + rev data3, data3 +#endif + sub has_nul, data3, zeroones + orr tmp, data3, REP8_7f + ldr data1, [src1], 8 + bics has_nul, has_nul, tmp + ccmp data1, data2, 0, eq + b.eq L(loop_unaligned) + + lsl tmp, has_nul, shift +#ifdef __AARCH64EB__ + rev tmp, tmp +#endif + eor diff, data1, data2 + orr syndrome, diff, tmp + cbnz syndrome, L(end) +L(tail): + ldr data1, [src1] + neg shift, shift + lsr data2, data3, shift + lsr has_nul, has_nul, shift +#ifdef __AARCH64EB__ + rev data2, data2 + rev has_nul, has_nul +#endif + eor diff, data1, data2 orr syndrome, diff, has_nul - cbz syndrome, L(loop_misaligned) b L(end) L(done): diff --git a/string/aarch64/strncmp-mte.S b/string/aarch64/strncmp-mte.S deleted file mode 100644 index c9d6fc8..0000000 --- a/string/aarch64/strncmp-mte.S +++ /dev/null @@ -1,307 +0,0 @@ -/* - * strncmp - compare two strings - * - * Copyright (c) 2013-2021, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -/* Assumptions: - * - * ARMv8-a, AArch64 - */ - -#include "../asmdefs.h" - -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f - -/* Parameters and result. */ -#define src1 x0 -#define src2 x1 -#define limit x2 -#define result x0 - -/* Internal variables. 
*/ -#define data1 x3 -#define data1w w3 -#define data2 x4 -#define data2w w4 -#define has_nul x5 -#define diff x6 -#define syndrome x7 -#define tmp1 x8 -#define tmp2 x9 -#define tmp3 x10 -#define zeroones x11 -#define pos x12 -#define mask x13 -#define endloop x14 -#define count mask -#define offset pos -#define neg_offset x15 - -/* Define endian dependent shift operations. - On big-endian early bytes are at MSB and on little-endian LSB. - LS_FW means shifting towards early bytes. - LS_BK means shifting towards later bytes. - */ -#ifdef __AARCH64EB__ -#define LS_FW lsl -#define LS_BK lsr -#else -#define LS_FW lsr -#define LS_BK lsl -#endif - -ENTRY (__strncmp_aarch64_mte) - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) - cbz limit, L(ret0) - eor tmp1, src1, src2 - mov zeroones, #REP8_01 - tst tmp1, #7 - and count, src1, #7 - b.ne L(misaligned8) - cbnz count, L(mutual_align) - - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. */ - .p2align 4 -L(loop_aligned): - ldr data1, [src1], #8 - ldr data2, [src2], #8 -L(start_realigned): - subs limit, limit, #8 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, hi /* Last Dword or differences. */ - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ - ccmp endloop, #0, #0, eq - b.eq L(loop_aligned) - /* End of main loop */ - -L(full_check): -#ifndef __AARCH64EB__ - orr syndrome, diff, has_nul - add limit, limit, 8 /* Rewind limit to before last subs. */ -L(syndrome_check): - /* Limit was reached. Check if the NUL byte or the difference - is before the limit. */ - rev syndrome, syndrome - rev data1, data1 - clz pos, syndrome - rev data2, data2 - lsl data1, data1, pos - cmp limit, pos, lsr #3 - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - csel result, result, xzr, hi - ret -#else - /* Not reached the limit, must have found the end or a diff. */ - tbz limit, #63, L(not_limit) - add tmp1, limit, 8 - cbz limit, L(not_limit) - - lsl limit, tmp1, #3 /* Bits -> bytes. */ - mov mask, #~0 - lsr mask, mask, limit - bic data1, data1, mask - bic data2, data2, mask - - /* Make sure that the NUL byte is marked in the syndrome. */ - orr has_nul, has_nul, mask - -L(not_limit): - /* For big-endian we cannot use the trick with the syndrome value - as carry-propagation can corrupt the upper bits if the trailing - bytes in the string contain 0x01. */ - /* However, if there is no NUL byte in the dword, we can generate - the result directly. We can't just subtract the bytes as the - MSB might be significant. */ - cbnz has_nul, 1f - cmp data1, data2 - cset result, ne - cneg result, result, lo - ret -1: - /* Re-compute the NUL-byte detection, using a byte-reversed value. */ - rev tmp3, data1 - sub tmp1, tmp3, zeroones - orr tmp2, tmp3, #REP8_7f - bic has_nul, tmp1, tmp2 - rev has_nul, has_nul - orr syndrome, diff, has_nul - clz pos, syndrome - /* The most-significant-non-zero bit of the syndrome marks either the - first bit that is different, or the top bit of the first zero byte. - Shifting left now will bring the critical information into the - top bits. 
*/ -L(end_quick): - lsl data1, data1, pos - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - ret -#endif - -L(mutual_align): - /* Sources are mutually aligned, but are not currently at an - alignment boundary. Round down the addresses and then mask off - the bytes that precede the start point. - We also need to adjust the limit calculations, but without - overflowing if the limit is near ULONG_MAX. */ - bic src1, src1, #7 - bic src2, src2, #7 - ldr data1, [src1], #8 - neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ - ldr data2, [src2], #8 - mov tmp2, #~0 - LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */ - /* Adjust the limit and ensure it doesn't overflow. */ - adds limit, limit, count - csinv limit, limit, xzr, lo - orr data1, data1, tmp2 - orr data2, data2, tmp2 - b L(start_realigned) - - .p2align 4 - /* Don't bother with dwords for up to 16 bytes. */ -L(misaligned8): - cmp limit, #16 - b.hs L(try_misaligned_words) - -L(byte_loop): - /* Perhaps we can do better than this. */ - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - subs limit, limit, #1 - ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq L(byte_loop) -L(done): - sub result, data1, data2 - ret - /* Align the SRC1 to a dword by doing a bytewise compare and then do - the dword loop. */ -L(try_misaligned_words): - cbz count, L(src1_aligned) - - neg count, count - and count, count, #7 - sub limit, limit, count - -L(page_end_loop): - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - cmp data1w, #1 - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.ne L(done) - subs count, count, #1 - b.hi L(page_end_loop) - - /* The following diagram explains the comparison of misaligned strings. - The bytes are shown in natural order. For little-endian, it is - reversed in the registers. The "x" bytes are before the string. - The "|" separates data that is loaded at one time. - src1 | a a a a a a a a | b b b c c c c c | . . . - src2 | x x x x x a a a a a a a a b b b | c c c c c . . . - - After shifting in each step, the data looks like this: - STEP_A STEP_B STEP_C - data1 a a a a a a a a b b b c c c c c b b b c c c c c - data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c - - The bytes with "0" are eliminated from the syndrome via mask. - - Align SRC2 down to 16 bytes. This way we can read 16 bytes at a - time from SRC2. The comparison happens in 3 steps. After each step - the loop can exit, or read from SRC1 or SRC2. */ -L(src1_aligned): - /* Calculate offset from 8 byte alignment to string start in bits. No - need to mask offset since shifts are ignoring upper bits. */ - lsl offset, src2, #3 - bic src2, src2, #0xf - mov mask, -1 - neg neg_offset, offset - ldr data1, [src1], #8 - ldp tmp1, tmp2, [src2], #16 - LS_BK mask, mask, neg_offset - and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */ - /* Skip the first compare if data in tmp1 is irrelevant. */ - tbnz offset, 6, L(misaligned_mid_loop) - -L(loop_misaligned): - /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/ - LS_FW data2, tmp1, offset - LS_BK tmp1, tmp2, neg_offset - subs limit, limit, #8 - orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/ - sub has_nul, data1, zeroones - eor diff, data1, data2 /* Non-zero if differences found. 
*/ - orr tmp3, data1, #REP8_7f - csinv endloop, diff, xzr, hi /* If limit, set to all ones. */ - bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */ - orr tmp3, endloop, has_nul - cbnz tmp3, L(full_check) - - ldr data1, [src1], #8 -L(misaligned_mid_loop): - /* STEP_B: Compare first part of data1 to second part of tmp2. */ - LS_FW data2, tmp2, offset -#ifdef __AARCH64EB__ - /* For big-endian we do a byte reverse to avoid carry-propagation - problem described above. This way we can reuse the has_nul in the - next step and also use syndrome value trick at the end. */ - rev tmp3, data1 - #define data1_fixed tmp3 -#else - #define data1_fixed data1 -#endif - sub has_nul, data1_fixed, zeroones - orr tmp3, data1_fixed, #REP8_7f - eor diff, data2, data1 /* Non-zero if differences found. */ - bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */ -#ifdef __AARCH64EB__ - rev has_nul, has_nul -#endif - cmp limit, neg_offset, lsr #3 - orr syndrome, diff, has_nul - bic syndrome, syndrome, mask /* Ignore later bytes. */ - csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ - cbnz tmp3, L(syndrome_check) - - /* STEP_C: Compare second part of data1 to first part of tmp1. */ - ldp tmp1, tmp2, [src2], #16 - cmp limit, #8 - LS_BK data2, tmp1, neg_offset - eor diff, data2, data1 /* Non-zero if differences found. */ - orr syndrome, diff, has_nul - and syndrome, syndrome, mask /* Ignore earlier bytes. */ - csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ - cbnz tmp3, L(syndrome_check) - - ldr data1, [src1], #8 - sub limit, limit, #8 - b L(loop_misaligned) - -#ifdef __AARCH64EB__ -L(syndrome_check): - clz pos, syndrome - cmp pos, limit, lsl #3 - b.lo L(end_quick) -#endif - -L(ret0): - mov result, #0 - ret -END(__strncmp_aarch64_mte) - diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S index 738b653..7e636b4 100644 --- a/string/aarch64/strncmp.S +++ b/string/aarch64/strncmp.S @@ -1,20 +1,20 @@ /* * strncmp - compare two strings * - * Copyright (c) 2013-2021, Arm Limited. + * Copyright (c) 2013-2022, Arm Limited. * SPDX-License-Identifier: MIT */ /* Assumptions: * - * ARMv8-a, AArch64 + * ARMv8-a, AArch64. + * MTE compatible. */ #include "../asmdefs.h" #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 /* Parameters and result. */ #define src1 x0 @@ -35,10 +35,24 @@ #define tmp3 x10 #define zeroones x11 #define pos x12 -#define limit_wd x13 -#define mask x14 -#define endloop x15 +#define mask x13 +#define endloop x14 #define count mask +#define offset pos +#define neg_offset x15 + +/* Define endian dependent shift operations. + On big-endian early bytes are at MSB and on little-endian LSB. + LS_FW means shifting towards early bytes. + LS_BK means shifting towards later bytes. + */ +#ifdef __AARCH64EB__ +#define LS_FW lsl +#define LS_BK lsr +#else +#define LS_FW lsr +#define LS_BK lsl +#endif ENTRY (__strncmp_aarch64) PTR_ARG (0) @@ -51,9 +65,6 @@ ENTRY (__strncmp_aarch64) and count, src1, #7 b.ne L(misaligned8) cbnz count, L(mutual_align) - /* Calculate the number of full and partial words -1. */ - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ - lsr limit_wd, limit_wd, #3 /* Convert to Dwords. 
*/ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and @@ -63,56 +74,52 @@ L(loop_aligned): ldr data1, [src1], #8 ldr data2, [src2], #8 L(start_realigned): - subs limit_wd, limit_wd, #1 + subs limit, limit, #8 sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, pl /* Last Dword or differences. */ + csinv endloop, diff, xzr, hi /* Last Dword or differences. */ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp endloop, #0, #0, eq b.eq L(loop_aligned) /* End of main loop */ - /* Not reached the limit, must have found the end or a diff. */ - tbz limit_wd, #63, L(not_limit) - - /* Limit % 8 == 0 => all bytes significant. */ - ands limit, limit, #7 - b.eq L(not_limit) - - lsl limit, limit, #3 /* Bits -> bytes. */ - mov mask, #~0 -#ifdef __AARCH64EB__ - lsr mask, mask, limit -#else - lsl mask, mask, limit -#endif - bic data1, data1, mask - bic data2, data2, mask - - /* Make sure that the NUL byte is marked in the syndrome. */ - orr has_nul, has_nul, mask - -L(not_limit): +L(full_check): +#ifndef __AARCH64EB__ orr syndrome, diff, has_nul - -#ifndef __AARCH64EB__ + add limit, limit, 8 /* Rewind limit to before last subs. */ +L(syndrome_check): + /* Limit was reached. Check if the NUL byte or the difference + is before the limit. */ rev syndrome, syndrome rev data1, data1 - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. - Shifting left now will bring the critical information into the - top bits. */ clz pos, syndrome rev data2, data2 lsl data1, data1, pos + cmp limit, pos, lsr #3 lsl data2, data2, pos /* But we need to zero-extend (char is unsigned) the value and then perform a signed 32-bit subtraction. */ lsr data1, data1, #56 sub result, data1, data2, lsr #56 + csel result, result, xzr, hi ret #else + /* Not reached the limit, must have found the end or a diff. */ + tbz limit, #63, L(not_limit) + add tmp1, limit, 8 + cbz limit, L(not_limit) + + lsl limit, tmp1, #3 /* Bits -> bytes. */ + mov mask, #~0 + lsr mask, mask, limit + bic data1, data1, mask + bic data2, data2, mask + + /* Make sure that the NUL byte is marked in the syndrome. */ + orr has_nul, has_nul, mask + +L(not_limit): /* For big-endian we cannot use the trick with the syndrome value as carry-propagation can corrupt the upper bits if the trailing bytes in the string contain 0x01. */ @@ -133,10 +140,11 @@ L(not_limit): rev has_nul, has_nul orr syndrome, diff, has_nul clz pos, syndrome - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. + /* The most-significant-non-zero bit of the syndrome marks either the + first bit that is different, or the top bit of the first zero byte. Shifting left now will bring the critical information into the top bits. */ +L(end_quick): lsl data1, data1, pos lsl data2, data2, pos /* But we need to zero-extend (char is unsigned) the value and then @@ -158,22 +166,12 @@ L(mutual_align): neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ ldr data2, [src2], #8 mov tmp2, #~0 - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp3 /* Shift (count & 63). 
*/ -#endif - and tmp3, limit_wd, #7 - lsr limit_wd, limit_wd, #3 - /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */ - add limit, limit, count - add tmp3, tmp3, count + LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */ + /* Adjust the limit and ensure it doesn't overflow. */ + adds limit, limit, count + csinv limit, limit, xzr, lo orr data1, data1, tmp2 orr data2, data2, tmp2 - add limit_wd, limit_wd, tmp3, lsr #3 b L(start_realigned) .p2align 4 @@ -196,13 +194,11 @@ L(done): /* Align the SRC1 to a dword by doing a bytewise compare and then do the dword loop. */ L(try_misaligned_words): - lsr limit_wd, limit, #3 - cbz count, L(do_misaligned) + cbz count, L(src1_aligned) neg count, count and count, count, #7 sub limit, limit, count - lsr limit_wd, limit, #3 L(page_end_loop): ldrb data1w, [src1], #1 @@ -213,48 +209,100 @@ L(page_end_loop): subs count, count, #1 b.hi L(page_end_loop) -L(do_misaligned): - /* Prepare ourselves for the next page crossing. Unlike the aligned - loop, we fetch 1 less dword because we risk crossing bounds on - SRC2. */ - mov count, #8 - subs limit_wd, limit_wd, #1 - b.lo L(done_loop) -L(loop_misaligned): - and tmp2, src2, #0xff8 - eor tmp2, tmp2, #0xff8 - cbz tmp2, L(page_end_loop) + /* The following diagram explains the comparison of misaligned strings. + The bytes are shown in natural order. For little-endian, it is + reversed in the registers. The "x" bytes are before the string. + The "|" separates data that is loaded at one time. + src1 | a a a a a a a a | b b b c c c c c | . . . + src2 | x x x x x a a a a a a a a b b b | c c c c c . . . + + After shifting in each step, the data looks like this: + STEP_A STEP_B STEP_C + data1 a a a a a a a a b b b c c c c c b b b c c c c c + data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c + The bytes with "0" are eliminated from the syndrome via mask. + + Align SRC2 down to 16 bytes. This way we can read 16 bytes at a + time from SRC2. The comparison happens in 3 steps. After each step + the loop can exit, or read from SRC1 or SRC2. */ +L(src1_aligned): + /* Calculate offset from 8 byte alignment to string start in bits. No + need to mask offset since shifts are ignoring upper bits. */ + lsl offset, src2, #3 + bic src2, src2, #0xf + mov mask, -1 + neg neg_offset, offset ldr data1, [src1], #8 - ldr data2, [src2], #8 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ - ccmp diff, #0, #0, eq - b.ne L(not_limit) - subs limit_wd, limit_wd, #1 - b.pl L(loop_misaligned) + ldp tmp1, tmp2, [src2], #16 + LS_BK mask, mask, neg_offset + and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */ + /* Skip the first compare if data in tmp1 is irrelevant. */ + tbnz offset, 6, L(misaligned_mid_loop) -L(done_loop): - /* We found a difference or a NULL before the limit was reached. */ - and limit, limit, #7 - cbz limit, L(not_limit) - /* Read the last word. */ - sub src1, src1, 8 - sub src2, src2, 8 - ldr data1, [src1, limit] - ldr data2, [src2, limit] - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f +L(loop_misaligned): + /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/ + LS_FW data2, tmp1, offset + LS_BK tmp1, tmp2, neg_offset + subs limit, limit, #8 + orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/ + sub has_nul, data1, zeroones eor diff, data1, data2 /* Non-zero if differences found. 
*/ - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ - ccmp diff, #0, #0, eq - b.ne L(not_limit) + orr tmp3, data1, #REP8_7f + csinv endloop, diff, xzr, hi /* If limit, set to all ones. */ + bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */ + orr tmp3, endloop, has_nul + cbnz tmp3, L(full_check) + + ldr data1, [src1], #8 +L(misaligned_mid_loop): + /* STEP_B: Compare first part of data1 to second part of tmp2. */ + LS_FW data2, tmp2, offset +#ifdef __AARCH64EB__ + /* For big-endian we do a byte reverse to avoid carry-propagation + problem described above. This way we can reuse the has_nul in the + next step and also use syndrome value trick at the end. */ + rev tmp3, data1 + #define data1_fixed tmp3 +#else + #define data1_fixed data1 +#endif + sub has_nul, data1_fixed, zeroones + orr tmp3, data1_fixed, #REP8_7f + eor diff, data2, data1 /* Non-zero if differences found. */ + bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */ +#ifdef __AARCH64EB__ + rev has_nul, has_nul +#endif + cmp limit, neg_offset, lsr #3 + orr syndrome, diff, has_nul + bic syndrome, syndrome, mask /* Ignore later bytes. */ + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ + cbnz tmp3, L(syndrome_check) + + /* STEP_C: Compare second part of data1 to first part of tmp1. */ + ldp tmp1, tmp2, [src2], #16 + cmp limit, #8 + LS_BK data2, tmp1, neg_offset + eor diff, data2, data1 /* Non-zero if differences found. */ + orr syndrome, diff, has_nul + and syndrome, syndrome, mask /* Ignore earlier bytes. */ + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ + cbnz tmp3, L(syndrome_check) + + ldr data1, [src1], #8 + sub limit, limit, #8 + b L(loop_misaligned) + +#ifdef __AARCH64EB__ +L(syndrome_check): + clz pos, syndrome + cmp pos, limit, lsl #3 + b.lo L(end_quick) +#endif L(ret0): mov result, #0 ret - -END ( __strncmp_aarch64) +END(__strncmp_aarch64) diff --git a/string/include/stringlib.h b/string/include/stringlib.h index ae1b289..05142eb 100644 --- a/string/include/stringlib.h +++ b/string/include/stringlib.h @@ -35,8 +35,6 @@ char *__strchr_aarch64_mte (const char *, int); char * __strchrnul_aarch64_mte (const char *, int ); size_t __strlen_aarch64_mte (const char *); char *__strrchr_aarch64_mte (const char *, int); -int __strcmp_aarch64_mte (const char *, const char *); -int __strncmp_aarch64_mte (const char *, const char *, size_t); #if __ARM_NEON void *__memcpy_aarch64_simd (void *__restrict, const void *__restrict, size_t); void *__memmove_aarch64_simd (void *, const void *, size_t); diff --git a/string/test/strcmp.c b/string/test/strcmp.c index d57b54e..0262397 100644 --- a/string/test/strcmp.c +++ b/string/test/strcmp.c @@ -24,8 +24,7 @@ static const struct fun // clang-format off F(strcmp, 0) #if __aarch64__ - F(__strcmp_aarch64, 0) - F(__strcmp_aarch64_mte, 1) + F(__strcmp_aarch64, 1) # if __ARM_FEATURE_SVE F(__strcmp_aarch64_sve, 1) # endif diff --git a/string/test/strncmp.c b/string/test/strncmp.c index 018a8a4..f8c2167 100644 --- a/string/test/strncmp.c +++ b/string/test/strncmp.c @@ -24,8 +24,7 @@ static const struct fun // clang-format off F(strncmp, 0) #if __aarch64__ - F(__strncmp_aarch64, 0) - F(__strncmp_aarch64_mte, 1) + F(__strncmp_aarch64, 1) # if __ARM_FEATURE_SVE F(__strncmp_aarch64_sve, 1) # endif -- cgit v1.2.3 From 89ca9c3629eb6a62c28918db929a6fe80b141825 Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Thu, 10 Feb 2022 10:28:59 +0000 Subject: string: Merge MTE versions of strcpy and stpcpy Merge the MTE and non-MTE versions of 
strcpy and stpcpy since the MTE versions are faster. --- string/aarch64/stpcpy-mte.S | 10 -- string/aarch64/strcpy-mte.S | 161 ------------------ string/aarch64/strcpy.S | 394 ++++++++++++++------------------------------ string/include/stringlib.h | 2 - string/test/stpcpy.c | 3 +- string/test/strcpy.c | 3 +- 6 files changed, 124 insertions(+), 449 deletions(-) delete mode 100644 string/aarch64/stpcpy-mte.S delete mode 100644 string/aarch64/strcpy-mte.S diff --git a/string/aarch64/stpcpy-mte.S b/string/aarch64/stpcpy-mte.S deleted file mode 100644 index f1c7119..0000000 --- a/string/aarch64/stpcpy-mte.S +++ /dev/null @@ -1,10 +0,0 @@ -/* - * stpcpy - copy a string returning pointer to end. - * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#define BUILD_STPCPY 1 - -#include "strcpy-mte.S" diff --git a/string/aarch64/strcpy-mte.S b/string/aarch64/strcpy-mte.S deleted file mode 100644 index 88c222d..0000000 --- a/string/aarch64/strcpy-mte.S +++ /dev/null @@ -1,161 +0,0 @@ -/* - * strcpy/stpcpy - copy a string returning pointer to start/end. - * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -/* Assumptions: - * - * ARMv8-a, AArch64, Advanced SIMD. - * MTE compatible. - */ - -#include "../asmdefs.h" - -#define dstin x0 -#define srcin x1 -#define result x0 - -#define src x2 -#define dst x3 -#define len x4 -#define synd x4 -#define tmp x5 -#define wtmp w5 -#define shift x5 -#define data1 x6 -#define dataw1 w6 -#define data2 x7 -#define dataw2 w7 - -#define dataq q0 -#define vdata v0 -#define vhas_nul v1 -#define vrepmask v2 -#define vend v3 -#define dend d3 -#define dataq2 q1 - -#ifdef BUILD_STPCPY -# define STRCPY __stpcpy_aarch64_mte -# define IFSTPCPY(X,...) X,__VA_ARGS__ -#else -# define STRCPY __strcpy_aarch64_mte -# define IFSTPCPY(X,...) -#endif - -/* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ - -ENTRY (STRCPY) - PTR_ARG (0) - PTR_ARG (1) - bic src, srcin, 15 - mov wtmp, 0xf00f - ld1 {vdata.16b}, [src] - dup vrepmask.8h, wtmp - cmeq vhas_nul.16b, vdata.16b, 0 - lsl shift, srcin, 2 - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b - fmov synd, dend - lsr synd, synd, shift - cbnz synd, L(tail) - - ldr dataq, [src, 16]! 
- cmeq vhas_nul.16b, vdata.16b, 0 - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b - fmov synd, dend - cbz synd, L(start_loop) - -#ifndef __AARCH64EB__ - rbit synd, synd -#endif - sub tmp, src, srcin - clz len, synd - add len, tmp, len, lsr 2 - tbz len, 4, L(less16) - sub tmp, len, 15 - ldr dataq, [srcin] - ldr dataq2, [srcin, tmp] - str dataq, [dstin] - str dataq2, [dstin, tmp] - IFSTPCPY (add result, dstin, len) - ret - - .p2align 4,,8 -L(tail): - rbit synd, synd - clz len, synd - lsr len, len, 2 - - .p2align 4 -L(less16): - tbz len, 3, L(less8) - sub tmp, len, 7 - ldr data1, [srcin] - ldr data2, [srcin, tmp] - str data1, [dstin] - str data2, [dstin, tmp] - IFSTPCPY (add result, dstin, len) - ret - - .p2align 4 -L(less8): - subs tmp, len, 3 - b.lo L(less4) - ldr dataw1, [srcin] - ldr dataw2, [srcin, tmp] - str dataw1, [dstin] - str dataw2, [dstin, tmp] - IFSTPCPY (add result, dstin, len) - ret - -L(less4): - cbz len, L(zerobyte) - ldrh dataw1, [srcin] - strh dataw1, [dstin] -L(zerobyte): - strb wzr, [dstin, len] - IFSTPCPY (add result, dstin, len) - ret - - .p2align 4 -L(start_loop): - sub len, src, srcin - ldr dataq2, [srcin] - add dst, dstin, len - str dataq2, [dstin] - - .p2align 5 -L(loop): - str dataq, [dst], 16 - ldr dataq, [src, 16]! - cmeq vhas_nul.16b, vdata.16b, 0 - umaxp vend.16b, vhas_nul.16b, vhas_nul.16b - fmov synd, dend - cbz synd, L(loop) - - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ - fmov synd, dend -#ifndef __AARCH64EB__ - rbit synd, synd -#endif - clz len, synd - lsr len, len, 2 - sub tmp, len, 15 - ldr dataq, [src, tmp] - str dataq, [dst, tmp] - IFSTPCPY (add result, dst, len) - ret - -END (STRCPY) diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S index 6e9ed42..b99e494 100644 --- a/string/aarch64/strcpy.S +++ b/string/aarch64/strcpy.S @@ -1,311 +1,161 @@ /* * strcpy/stpcpy - copy a string returning pointer to start/end. * - * Copyright (c) 2013-2020, Arm Limited. + * Copyright (c) 2020-2022, Arm Limited. * SPDX-License-Identifier: MIT */ /* Assumptions: * - * ARMv8-a, AArch64, unaligned accesses, min page size 4k. + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. */ #include "../asmdefs.h" -/* To build as stpcpy, define BUILD_STPCPY before compiling this file. - - To test the page crossing code path more thoroughly, compile with - -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower - entry path. This option is not intended for production use. */ - -/* Arguments and results. */ #define dstin x0 #define srcin x1 +#define result x0 -/* Locals and temporaries. */ #define src x2 #define dst x3 -#define data1 x4 -#define data1w w4 -#define data2 x5 -#define data2w w5 -#define has_nul1 x6 -#define has_nul2 x7 -#define tmp1 x8 -#define tmp2 x9 -#define tmp3 x10 -#define tmp4 x11 -#define zeroones x12 -#define data1a x13 -#define data2a x14 -#define pos x15 -#define len x16 -#define to_align x17 +#define len x4 +#define synd x4 +#define tmp x5 +#define wtmp w5 +#define shift x5 +#define data1 x6 +#define dataw1 w6 +#define data2 x7 +#define dataw2 w7 + +#define dataq q0 +#define vdata v0 +#define vhas_nul v1 +#define vrepmask v2 +#define vend v3 +#define dend d3 +#define dataq2 q1 #ifdef BUILD_STPCPY -#define STRCPY __stpcpy_aarch64 +# define STRCPY __stpcpy_aarch64 +# define IFSTPCPY(X,...) X,__VA_ARGS__ #else -#define STRCPY __strcpy_aarch64 +# define STRCPY __strcpy_aarch64 +# define IFSTPCPY(X,...) 
#endif - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. */ - -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 +/* Core algorithm: - /* AArch64 systems have a minimum page size of 4k. We can do a quick - page size check for crossing this boundary on entry and if we - do not, then we can short-circuit much of the entry code. We - expect early page-crossing strings to be rare (probability of - 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite - predictable, even with random strings. - - We don't bother checking for larger page sizes, the cost of setting - up the correct page size is just not worth the extra gain from - a small reduction in the cases taking the slow path. Note that - we only care about whether the first fetch, which may be - misaligned, crosses a page boundary - after that we move to aligned - fetches for the remainder of the string. */ - -#ifdef STRCPY_TEST_PAGE_CROSS - /* Make everything that isn't Qword aligned look like a page cross. */ -#define MIN_PAGE_P2 4 -#else -#define MIN_PAGE_P2 12 -#endif - -#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2) + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. For even bytes, bits 0-3 are set if the relevant byte matched the + requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are + set likewise for odd bytes so that adjacent bytes can be merged. Since the + bits in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. */ ENTRY (STRCPY) PTR_ARG (0) PTR_ARG (1) - /* For moderately short strings, the fastest way to do the copy is to - calculate the length of the string in the same way as strlen, then - essentially do a memcpy of the result. This avoids the need for - multiple byte copies and further means that by the time we - reach the bulk copy loop we know we can always use DWord - accesses. We expect __strcpy_aarch64 to rarely be called repeatedly - with the same source string, so branch prediction is likely to - always be difficult - we mitigate against this by preferring - conditional select operations over branches whenever this is - feasible. */ - and tmp2, srcin, #(MIN_PAGE_SIZE - 1) - mov zeroones, #REP8_01 - and to_align, srcin, #15 - cmp tmp2, #(MIN_PAGE_SIZE - 16) - neg tmp1, to_align - /* The first fetch will straddle a (possible) page boundary iff - srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte - aligned string will never fail the page align check, so will - always take the fast path. */ - b.gt L(page_cross) - -L(page_cross_ok): - ldp data1, data2, [srcin] -#ifdef __AARCH64EB__ - /* Because we expect the end to be found within 16 characters - (profiling shows this is the most common case), it's worth - swapping the bytes now to save having to recalculate the - termination syndrome later. We preserve data1 and data2 - so that we can re-use the values later on. 
*/ - rev tmp2, data1 - sub tmp1, tmp2, zeroones - orr tmp2, tmp2, #REP8_7f - bics has_nul1, tmp1, tmp2 - b.ne L(fp_le8) - rev tmp4, data2 - sub tmp3, tmp4, zeroones - orr tmp4, tmp4, #REP8_7f -#else - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - bics has_nul1, tmp1, tmp2 - b.ne L(fp_le8) - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f + bic src, srcin, 15 + mov wtmp, 0xf00f + ld1 {vdata.16b}, [src] + dup vrepmask.8h, wtmp + cmeq vhas_nul.16b, vdata.16b, 0 + lsl shift, srcin, 2 + and vhas_nul.16b, vhas_nul.16b, vrepmask.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + lsr synd, synd, shift + cbnz synd, L(tail) + + ldr dataq, [src, 16]! + cmeq vhas_nul.16b, vdata.16b, 0 + and vhas_nul.16b, vhas_nul.16b, vrepmask.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbz synd, L(start_loop) + +#ifndef __AARCH64EB__ + rbit synd, synd #endif - bics has_nul2, tmp3, tmp4 - b.eq L(bulk_entry) + sub tmp, src, srcin + clz len, synd + add len, tmp, len, lsr 2 + tbz len, 4, L(less16) + sub tmp, len, 15 + ldr dataq, [srcin] + ldr dataq2, [srcin, tmp] + str dataq, [dstin] + str dataq2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) + ret - /* The string is short (<=16 bytes). We don't know exactly how - short though, yet. Work out the exact length so that we can - quickly select the optimal copy strategy. */ -L(fp_gt8): - rev has_nul2, has_nul2 - clz pos, has_nul2 - mov tmp2, #56 - add dst, dstin, pos, lsr #3 /* Bits to bytes. */ - sub pos, tmp2, pos -#ifdef __AARCH64EB__ - lsr data2, data2, pos -#else - lsl data2, data2, pos -#endif - str data2, [dst, #1] + .p2align 4,,8 +L(tail): + rbit synd, synd + clz len, synd + lsr len, len, 2 + + .p2align 4 +L(less16): + tbz len, 3, L(less8) + sub tmp, len, 7 + ldr data1, [srcin] + ldr data2, [srcin, tmp] str data1, [dstin] -#ifdef BUILD_STPCPY - add dstin, dst, #8 -#endif + str data2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) ret -L(fp_le8): - rev has_nul1, has_nul1 - clz pos, has_nul1 - add dst, dstin, pos, lsr #3 /* Bits to bytes. */ - subs tmp2, pos, #24 /* Pos in bits. */ - b.lt L(fp_lt4) -#ifdef __AARCH64EB__ - mov tmp2, #56 - sub pos, tmp2, pos - lsr data2, data1, pos - lsr data1, data1, #32 -#else - lsr data2, data1, tmp2 -#endif - /* 4->7 bytes to copy. */ - str data2w, [dst, #-3] - str data1w, [dstin] -#ifdef BUILD_STPCPY - mov dstin, dst -#endif + .p2align 4 +L(less8): + subs tmp, len, 3 + b.lo L(less4) + ldr dataw1, [srcin] + ldr dataw2, [srcin, tmp] + str dataw1, [dstin] + str dataw2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) ret -L(fp_lt4): - cbz pos, L(fp_lt2) - /* 2->3 bytes to copy. */ -#ifdef __AARCH64EB__ - lsr data1, data1, #48 -#endif - strh data1w, [dstin] - /* Fall-through, one byte (max) to go. */ -L(fp_lt2): - /* Null-terminated string. Last character must be zero! */ - strb wzr, [dst] -#ifdef BUILD_STPCPY - mov dstin, dst -#endif - ret - - .p2align 6 - /* Aligning here ensures that the entry code and main loop all lies - within one 64-byte cache line. */ -L(bulk_entry): - sub to_align, to_align, #16 - stp data1, data2, [dstin] - sub src, srcin, to_align - sub dst, dstin, to_align - b L(entry_no_page_cross) - /* The inner loop deals with two Dwords at a time. This has a - slightly higher start-up cost, but we should win quite quickly, - especially on cores with a high number of issue slots per - cycle, as we get much better parallelism out of the operations. 
*/ -L(main_loop): - stp data1, data2, [dst], #16 -L(entry_no_page_cross): - ldp data1, data2, [src], #16 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f - bic has_nul1, tmp1, tmp2 - bics has_nul2, tmp3, tmp4 - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ - b.eq L(main_loop) - - /* Since we know we are copying at least 16 bytes, the fastest way - to deal with the tail is to determine the location of the - trailing NUL, then (re)copy the 16 bytes leading up to that. */ - cmp has_nul1, #0 -#ifdef __AARCH64EB__ - /* For big-endian, carry propagation (if the final byte in the - string is 0x01) means we cannot use has_nul directly. The - easiest way to get the correct byte is to byte-swap the data - and calculate the syndrome a second time. */ - csel data1, data1, data2, ne - rev data1, data1 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - bic has_nul1, tmp1, tmp2 -#else - csel has_nul1, has_nul1, has_nul2, ne -#endif - rev has_nul1, has_nul1 - clz pos, has_nul1 - add tmp1, pos, #72 - add pos, pos, #8 - csel pos, pos, tmp1, ne - add src, src, pos, lsr #3 - add dst, dst, pos, lsr #3 - ldp data1, data2, [src, #-32] - stp data1, data2, [dst, #-16] -#ifdef BUILD_STPCPY - sub dstin, dst, #1 -#endif +L(less4): + cbz len, L(zerobyte) + ldrh dataw1, [srcin] + strh dataw1, [dstin] +L(zerobyte): + strb wzr, [dstin, len] + IFSTPCPY (add result, dstin, len) ret -L(page_cross): - bic src, srcin, #15 - /* Start by loading two words at [srcin & ~15], then forcing the - bytes that precede srcin to 0xff. This means they never look - like termination bytes. */ - ldp data1, data2, [src] - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ - tst to_align, #7 - csetm tmp2, ne -#ifdef __AARCH64EB__ - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#else - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ + .p2align 4 +L(start_loop): + sub len, src, srcin + ldr dataq2, [srcin] + add dst, dstin, len + str dataq2, [dstin] + + .p2align 5 +L(loop): + str dataq, [dst], 16 + ldr dataq, [src, 16]! + cmeq vhas_nul.16b, vdata.16b, 0 + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbz synd, L(loop) + + and vhas_nul.16b, vhas_nul.16b, vrepmask.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + fmov synd, dend +#ifndef __AARCH64EB__ + rbit synd, synd #endif - orr data1, data1, tmp2 - orr data2a, data2, tmp2 - cmp to_align, #8 - csinv data1, data1, xzr, lt - csel data2, data2, data2a, lt - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f - bic has_nul1, tmp1, tmp2 - bics has_nul2, tmp3, tmp4 - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ - b.eq L(page_cross_ok) - /* We now need to make data1 and data2 look like they've been - loaded directly from srcin. Do a rotate on the 128-bit value. */ - lsl tmp1, to_align, #3 /* Bytes->bits. 
*/
-	neg	tmp2, to_align, lsl #3
-#ifdef __AARCH64EB__
-	lsl	data1a, data1, tmp1
-	lsr	tmp4, data2, tmp2
-	lsl	data2, data2, tmp1
-	orr	tmp4, tmp4, data1a
-	cmp	to_align, #8
-	csel	data1, tmp4, data2, lt
-	rev	tmp2, data1
-	rev	tmp4, data2
-	sub	tmp1, tmp2, zeroones
-	orr	tmp2, tmp2, #REP8_7f
-	sub	tmp3, tmp4, zeroones
-	orr	tmp4, tmp4, #REP8_7f
-#else
-	lsr	data1a, data1, tmp1
-	lsl	tmp4, data2, tmp2
-	lsr	data2, data2, tmp1
-	orr	tmp4, tmp4, data1a
-	cmp	to_align, #8
-	csel	data1, tmp4, data2, lt
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, #REP8_7f
-#endif
-	bic	has_nul1, tmp1, tmp2
-	cbnz	has_nul1, L(fp_le8)
-	bic	has_nul2, tmp3, tmp4
-	b	L(fp_gt8)
+	clz	len, synd
+	lsr	len, len, 2
+	sub	tmp, len, 15
+	ldr	dataq, [src, tmp]
+	str	dataq, [dst, tmp]
+	IFSTPCPY (add result, dst, len)
+	ret

 END (STRCPY)
-
diff --git a/string/include/stringlib.h b/string/include/stringlib.h
index 05142eb..85e6302 100644
--- a/string/include/stringlib.h
+++ b/string/include/stringlib.h
@@ -29,8 +29,6 @@ size_t __strlen_aarch64 (const char *);
 size_t __strnlen_aarch64 (const char *, size_t);
 int __strncmp_aarch64 (const char *, const char *, size_t);
 void * __memchr_aarch64_mte (const void *, int, size_t);
-char *__strcpy_aarch64_mte (char *__restrict, const char *__restrict);
-char *__stpcpy_aarch64_mte (char *__restrict, const char *__restrict);
 char *__strchr_aarch64_mte (const char *, int);
 char * __strchrnul_aarch64_mte (const char *, int );
 size_t __strlen_aarch64_mte (const char *);
diff --git a/string/test/stpcpy.c b/string/test/stpcpy.c
index 1827e68..1b61245 100644
--- a/string/test/stpcpy.c
+++ b/string/test/stpcpy.c
@@ -28,8 +28,7 @@ static const struct fun
   // clang-format off
   F(stpcpy, 0)
 #if __aarch64__
-  F(__stpcpy_aarch64, 0)
-  F(__stpcpy_aarch64_mte, 1)
+  F(__stpcpy_aarch64, 1)
 # if __ARM_FEATURE_SVE
   F(__stpcpy_aarch64_sve, 1)
 # endif
diff --git a/string/test/strcpy.c b/string/test/strcpy.c
index e84cace..6de3bed 100644
--- a/string/test/strcpy.c
+++ b/string/test/strcpy.c
@@ -24,8 +24,7 @@ static const struct fun
   // clang-format off
   F(strcpy, 0)
 #if __aarch64__
-  F(__strcpy_aarch64, 0)
-  F(__strcpy_aarch64_mte, 1)
+  F(__strcpy_aarch64, 1)
 # if __ARM_FEATURE_SVE
   F(__strcpy_aarch64_sve, 1)
 # endif
--
cgit v1.2.3


From 189dfefe37d54c5b9d2a8bb2039091a638c691e1 Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy
Date: Thu, 10 Feb 2022 10:32:35 +0000
Subject: Update license to MIT OR Apache-2.0 WITH LLVM-exception

The outgoing license was MIT only. The new dual license allows
using the code under the Apache-2.0 WITH LLVM-exception license too.
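For illustration, a typical per-file notice after this change looks as
follows (a sketch only: the copyright years differ from file to file,
and assembly, Makefile and test sources use their own comment syntax):

  /*
   * Copyright (c) 2022, Arm Limited.
   * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
   */

SPDX tooling interprets the OR operator as a disjunctive choice, so
downstream users may elect either of the two licenses.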
--- LICENSE | 230 ++++++++++++++++++++++++++++++- Makefile | 2 +- README | 7 +- config.mk.dist | 2 +- math/Dir.mk | 2 +- math/cosf.c | 4 +- math/erf.c | 2 +- math/erf_data.c | 2 +- math/erff.c | 2 +- math/erff_data.c | 2 +- math/exp.c | 2 +- math/exp2.c | 2 +- math/exp2f.c | 2 +- math/exp2f_data.c | 2 +- math/exp_data.c | 2 +- math/expf.c | 2 +- math/include/mathlib.h | 2 +- math/log.c | 2 +- math/log2.c | 2 +- math/log2_data.c | 2 +- math/log2f.c | 2 +- math/log2f_data.c | 2 +- math/log_data.c | 2 +- math/logf.c | 2 +- math/logf_data.c | 2 +- math/math_config.h | 2 +- math/math_err.c | 2 +- math/math_errf.c | 2 +- math/pow.c | 2 +- math/pow_log_data.c | 2 +- math/powf.c | 2 +- math/powf_log2_data.c | 2 +- math/s_cos.c | 2 +- math/s_cosf.c | 2 +- math/s_exp.c | 2 +- math/s_exp2f.c | 2 +- math/s_exp2f_1u.c | 2 +- math/s_expf.c | 2 +- math/s_expf_1u.c | 2 +- math/s_log.c | 2 +- math/s_logf.c | 2 +- math/s_pow.c | 2 +- math/s_powf.c | 2 +- math/s_sin.c | 2 +- math/s_sinf.c | 2 +- math/sincosf.c | 4 +- math/sincosf.h | 4 +- math/sincosf_data.c | 2 +- math/sinf.c | 4 +- math/test/mathbench.c | 2 +- math/test/mathtest.c | 2 +- math/test/rtest/dotest.c | 2 +- math/test/rtest/intern.h | 2 +- math/test/rtest/main.c | 2 +- math/test/rtest/random.c | 2 +- math/test/rtest/random.h | 2 +- math/test/rtest/semi.c | 2 +- math/test/rtest/semi.h | 2 +- math/test/rtest/types.h | 2 +- math/test/rtest/wrappers.c | 2 +- math/test/rtest/wrappers.h | 2 +- math/test/runulp.sh | 2 +- math/test/testcases/directed/cosf.tst | 2 +- math/test/testcases/directed/erf.tst | 2 +- math/test/testcases/directed/erff.tst | 2 +- math/test/testcases/directed/exp.tst | 2 +- math/test/testcases/directed/exp2.tst | 2 +- math/test/testcases/directed/exp2f.tst | 2 +- math/test/testcases/directed/expf.tst | 2 +- math/test/testcases/directed/log.tst | 2 +- math/test/testcases/directed/log2.tst | 2 +- math/test/testcases/directed/log2f.tst | 2 +- math/test/testcases/directed/logf.tst | 2 +- math/test/testcases/directed/pow.tst | 2 +- math/test/testcases/directed/powf.tst | 2 +- math/test/testcases/directed/sincosf.tst | 2 +- math/test/testcases/directed/sinf.tst | 2 +- math/test/testcases/random/double.tst | 2 +- math/test/testcases/random/float.tst | 2 +- math/test/ulp.c | 2 +- math/test/ulp.h | 2 +- math/tools/cos.sollya | 2 +- math/tools/exp.sollya | 2 +- math/tools/exp2.sollya | 2 +- math/tools/log.sollya | 2 +- math/tools/log2.sollya | 2 +- math/tools/log2_abs.sollya | 2 +- math/tools/log_abs.sollya | 2 +- math/tools/plot.py | 2 +- math/tools/remez.jl | 2 +- math/tools/sin.sollya | 2 +- math/tools/v_exp.sollya | 2 +- math/tools/v_log.sollya | 2 +- math/tools/v_sin.sollya | 2 +- math/v_cos.c | 2 +- math/v_cosf.c | 2 +- math/v_exp.c | 2 +- math/v_exp.h | 2 +- math/v_exp2f.c | 2 +- math/v_exp2f_1u.c | 2 +- math/v_exp_data.c | 2 +- math/v_expf.c | 2 +- math/v_expf_1u.c | 2 +- math/v_log.c | 2 +- math/v_log.h | 2 +- math/v_log_data.c | 2 +- math/v_logf.c | 2 +- math/v_math.h | 2 +- math/v_pow.c | 2 +- math/v_powf.c | 2 +- math/v_sin.c | 2 +- math/v_sinf.c | 2 +- math/vn_cos.c | 2 +- math/vn_cosf.c | 2 +- math/vn_exp.c | 2 +- math/vn_exp2f.c | 2 +- math/vn_exp2f_1u.c | 2 +- math/vn_expf.c | 2 +- math/vn_expf_1u.c | 2 +- math/vn_log.c | 2 +- math/vn_logf.c | 2 +- math/vn_pow.c | 2 +- math/vn_powf.c | 2 +- math/vn_sin.c | 2 +- math/vn_sinf.c | 2 +- networking/Dir.mk | 2 +- networking/aarch64/chksum_simd.c | 2 +- networking/arm/chksum_simd.c | 2 +- networking/chksum.c | 2 +- networking/chksum_common.h | 2 +- networking/include/networking.h | 
2 +- networking/test/chksum.c | 2 +- string/Dir.mk | 2 +- string/aarch64/__mtag_tag_region.S | 2 +- string/aarch64/__mtag_tag_zero_region.S | 2 +- string/aarch64/check-arch.S | 2 +- string/aarch64/memchr-mte.S | 2 +- string/aarch64/memchr-sve.S | 2 +- string/aarch64/memchr.S | 2 +- string/aarch64/memcmp-sve.S | 2 +- string/aarch64/memcmp.S | 2 +- string/aarch64/memcpy-advsimd.S | 2 +- string/aarch64/memcpy-sve.S | 2 +- string/aarch64/memcpy.S | 2 +- string/aarch64/memrchr.S | 2 +- string/aarch64/memset.S | 2 +- string/aarch64/stpcpy-sve.S | 2 +- string/aarch64/stpcpy.S | 2 +- string/aarch64/strchr-mte.S | 2 +- string/aarch64/strchr-sve.S | 2 +- string/aarch64/strchr.S | 2 +- string/aarch64/strchrnul-mte.S | 2 +- string/aarch64/strchrnul-sve.S | 2 +- string/aarch64/strchrnul.S | 2 +- string/aarch64/strcmp-sve.S | 2 +- string/aarch64/strcmp.S | 2 +- string/aarch64/strcpy-sve.S | 2 +- string/aarch64/strcpy.S | 2 +- string/aarch64/strlen-mte.S | 2 +- string/aarch64/strlen-sve.S | 2 +- string/aarch64/strlen.S | 2 +- string/aarch64/strncmp-sve.S | 2 +- string/aarch64/strncmp.S | 2 +- string/aarch64/strnlen-sve.S | 2 +- string/aarch64/strnlen.S | 2 +- string/aarch64/strrchr-mte.S | 2 +- string/aarch64/strrchr-sve.S | 2 +- string/aarch64/strrchr.S | 2 +- string/arm/check-arch.S | 2 +- string/arm/memchr.S | 2 +- string/arm/memcpy.S | 2 +- string/arm/memset.S | 2 +- string/arm/strcmp-armv6m.S | 2 +- string/arm/strcmp.S | 2 +- string/arm/strcpy.c | 2 +- string/arm/strlen-armv6t2.S | 2 +- string/asmdefs.h | 2 +- string/bench/memcpy.c | 4 +- string/bench/memset.c | 2 +- string/bench/strlen.c | 2 +- string/include/benchlib.h | 2 +- string/include/stringlib.h | 4 +- string/test/__mtag_tag_region.c | 2 +- string/test/__mtag_tag_zero_region.c | 2 +- string/test/memchr.c | 2 +- string/test/memcmp.c | 2 +- string/test/memcpy.c | 4 +- string/test/memmove.c | 4 +- string/test/memrchr.c | 2 +- string/test/memset.c | 2 +- string/test/mte.h | 2 +- string/test/stpcpy.c | 4 +- string/test/strchr.c | 2 +- string/test/strchrnul.c | 2 +- string/test/strcmp.c | 4 +- string/test/strcpy.c | 4 +- string/test/stringtest.h | 2 +- string/test/strlen.c | 2 +- string/test/strncmp.c | 4 +- string/test/strnlen.c | 2 +- string/test/strrchr.c | 2 +- string/x86_64/check-arch.S | 2 +- 202 files changed, 445 insertions(+), 216 deletions(-) diff --git a/LICENSE b/LICENSE index 2543b82..20a4b77 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,11 @@ +MIT OR Apache-2.0 WITH LLVM-exception +===================================== + + MIT License +----------- -Copyright (c) 1999-2019, Arm Limited. +Copyright (c) 1999-2022, Arm Limited. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -19,3 +24,226 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +Apache-2.0 WITH LLVM-exception +------------------------------ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ + +--- LLVM Exceptions to the Apache 2.0 License ---- + +As an exception, if, as a result of your compiling your source code, portions +of this Software are embedded into an Object form of such source code, you +may redistribute such embedded portions in such Object form without complying +with the conditions of Sections 4(a), 4(b) and 4(d) of the License. + +In addition, if you combine or link compiled forms of this Software with +software that is licensed under the GPLv2 ("Combined Software") and if a +court of competent jurisdiction determines that the patent provision (Section +3), the indemnity provision (Section 9) or other Section of the License +conflicts with the conditions of the GPLv2, you may retroactively and +prospectively choose to deem waived or otherwise exclude such Section(s) of +the License, but only in their entirety and only with respect to the Combined +Software. diff --git a/Makefile b/Makefile index 169f89e..187a729 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # Makefile - requires GNU make # # Copyright (c) 2018-2020, Arm Limited. -# SPDX-License-Identifier: MIT +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception srcdir = . prefix = /usr diff --git a/README b/README index 9e1a34f..4654282 100644 --- a/README +++ b/README @@ -2,9 +2,10 @@ Arm Optimized Routines ---------------------- This repository contains implementations of library functions -provided by Arm under MIT License (See LICENSE). Contributions -to this project are accepted, but Contributors have to sign an -Assignment Agreement, please follow the instructions in +provided by Arm. The outbound license is available under a dual +license, at the user’s election, as reflected in the LICENSE file. +Contributions to this project are accepted, but Contributors have +to sign an Assignment Agreement, please follow the instructions in contributor-agreement.pdf. This is needed so upstreaming code to projects that require copyright assignment is possible. diff --git a/config.mk.dist b/config.mk.dist index 177e1ac..b33c107 100644 --- a/config.mk.dist +++ b/config.mk.dist @@ -1,7 +1,7 @@ # Example config.mk # # Copyright (c) 2018-2020, Arm Limited. -# SPDX-License-Identifier: MIT +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception # Subprojects to build SUBS = math string networking diff --git a/math/Dir.mk b/math/Dir.mk index 3b841ab..dac636c 100644 --- a/math/Dir.mk +++ b/math/Dir.mk @@ -1,7 +1,7 @@ # Makefile fragment - requires GNU make # # Copyright (c) 2019, Arm Limited. -# SPDX-License-Identifier: MIT +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception S := $(srcdir)/math B := build/math diff --git a/math/cosf.c b/math/cosf.c index 67a3798..6293ce8 100644 --- a/math/cosf.c +++ b/math/cosf.c @@ -1,8 +1,8 @@ /* * Single-precision cos function. * - * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/erf.c b/math/erf.c index 12d7e51..5f9f40d 100644 --- a/math/erf.c +++ b/math/erf.c @@ -2,7 +2,7 @@ * Double-precision erf(x) function. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/erf_data.c b/math/erf_data.c index 807875b..10cf1fa 100644 --- a/math/erf_data.c +++ b/math/erf_data.c @@ -2,7 +2,7 @@ * Shared data between erf and erfc. 
* * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/erff.c b/math/erff.c index a58e825..9fa476d 100644 --- a/math/erff.c +++ b/math/erff.c @@ -2,7 +2,7 @@ * Single-precision erf(x) function. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/erff_data.c b/math/erff_data.c index fa6b1ef..f822788 100644 --- a/math/erff_data.c +++ b/math/erff_data.c @@ -2,7 +2,7 @@ * Data for approximation of erff. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/exp.c b/math/exp.c index 7f5024c..1de500c 100644 --- a/math/exp.c +++ b/math/exp.c @@ -2,7 +2,7 @@ * Double-precision e^x function. * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/exp2.c b/math/exp2.c index 35ab39f..a1eee44 100644 --- a/math/exp2.c +++ b/math/exp2.c @@ -2,7 +2,7 @@ * Double-precision 2^x function. * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/exp2f.c b/math/exp2f.c index 94b3253..776c3dd 100644 --- a/math/exp2f.c +++ b/math/exp2f.c @@ -2,7 +2,7 @@ * Single-precision 2^x function. * * Copyright (c) 2017-2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/exp2f_data.c b/math/exp2f_data.c index 3fb0ad1..f0cb7fc 100644 --- a/math/exp2f_data.c +++ b/math/exp2f_data.c @@ -2,7 +2,7 @@ * Shared data between expf, exp2f and powf. * * Copyright (c) 2017-2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/exp_data.c b/math/exp_data.c index cba7683..714c845 100644 --- a/math/exp_data.c +++ b/math/exp_data.c @@ -2,7 +2,7 @@ * Shared data between exp, exp2 and pow. * * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/expf.c b/math/expf.c index 9b2f0c3..08a20d5 100644 --- a/math/expf.c +++ b/math/expf.c @@ -2,7 +2,7 @@ * Single-precision e^x function. * * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/include/mathlib.h b/math/include/mathlib.h index 279d829..c520c37 100644 --- a/math/include/mathlib.h +++ b/math/include/mathlib.h @@ -2,7 +2,7 @@ * Public API. * * Copyright (c) 2015-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _MATHLIB_H diff --git a/math/log.c b/math/log.c index d3b7bc6..43dfc2a 100644 --- a/math/log.c +++ b/math/log.c @@ -2,7 +2,7 @@ * Double-precision log(x) function. * * Copyright (c) 2018-2019, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/log2.c b/math/log2.c index 55102b7..3f9c21b 100644 --- a/math/log2.c +++ b/math/log2.c @@ -2,7 +2,7 @@ * Double-precision log2(x) function. * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/log2_data.c b/math/log2_data.c index 3fc9b47..293bd7d 100644 --- a/math/log2_data.c +++ b/math/log2_data.c @@ -2,7 +2,7 @@ * Data for log2. * * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/log2f.c b/math/log2f.c index acb629e..0a44fa2 100644 --- a/math/log2f.c +++ b/math/log2f.c @@ -2,7 +2,7 @@ * Single-precision log2 function. * * Copyright (c) 2017-2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/log2f_data.c b/math/log2f_data.c index f3546d7..4866ef7 100644 --- a/math/log2f_data.c +++ b/math/log2f_data.c @@ -2,7 +2,7 @@ * Data definition for log2f. * * Copyright (c) 2017-2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/log_data.c b/math/log_data.c index 96a098d..3ecc1f4 100644 --- a/math/log_data.c +++ b/math/log_data.c @@ -2,7 +2,7 @@ * Data for log. * * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/logf.c b/math/logf.c index cfbaee1..ea378d6 100644 --- a/math/logf.c +++ b/math/logf.c @@ -2,7 +2,7 @@ * Single-precision log function. * * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/logf_data.c b/math/logf_data.c index e8973ce..0424768 100644 --- a/math/logf_data.c +++ b/math/logf_data.c @@ -2,7 +2,7 @@ * Data definition for logf. * * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/math_config.h b/math/math_config.h index e851043..7ffc0cd 100644 --- a/math/math_config.h +++ b/math/math_config.h @@ -2,7 +2,7 @@ * Configuration for math routines. * * Copyright (c) 2017-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _MATH_CONFIG_H diff --git a/math/math_err.c b/math/math_err.c index 1bf9538..cfe0728 100644 --- a/math/math_err.c +++ b/math/math_err.c @@ -2,7 +2,7 @@ * Double-precision math error handling. * * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/math_errf.c b/math/math_errf.c index d5350b8..4233918 100644 --- a/math/math_errf.c +++ b/math/math_errf.c @@ -2,7 +2,7 @@ * Single-precision math error handling. * * Copyright (c) 2017-2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/pow.c b/math/pow.c index 86842c6..af719fe 100644 --- a/math/pow.c +++ b/math/pow.c @@ -2,7 +2,7 @@ * Double-precision x^y function. * * Copyright (c) 2018-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/pow_log_data.c b/math/pow_log_data.c index 45569c5..2a4c250 100644 --- a/math/pow_log_data.c +++ b/math/pow_log_data.c @@ -2,7 +2,7 @@ * Data for the log part of pow. * * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/powf.c b/math/powf.c index 6ba45d3..05c80bb 100644 --- a/math/powf.c +++ b/math/powf.c @@ -2,7 +2,7 @@ * Single-precision pow function. * * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/powf_log2_data.c b/math/powf_log2_data.c index 97e0d98..243836a 100644 --- a/math/powf_log2_data.c +++ b/math/powf_log2_data.c @@ -2,7 +2,7 @@ * Data definition for powf. * * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/s_cos.c b/math/s_cos.c index 53a95b0..e66d563 100644 --- a/math/s_cos.c +++ b/math/s_cos.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 #include "v_cos.c" diff --git a/math/s_cosf.c b/math/s_cosf.c index 914c02e..f615d26 100644 --- a/math/s_cosf.c +++ b/math/s_cosf.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 #include "v_cosf.c" diff --git a/math/s_exp.c b/math/s_exp.c index ac7246b..5da0099 100644 --- a/math/s_exp.c +++ b/math/s_exp.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 #include "v_exp.c" diff --git a/math/s_exp2f.c b/math/s_exp2f.c index df7dfd6..dcbfea9 100644 --- a/math/s_exp2f.c +++ b/math/s_exp2f.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 #include "v_exp2f.c" diff --git a/math/s_exp2f_1u.c b/math/s_exp2f_1u.c index 5e3852b..bf387e4 100644 --- a/math/s_exp2f_1u.c +++ b/math/s_exp2f_1u.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 #include "v_exp2f_1u.c" diff --git a/math/s_expf.c b/math/s_expf.c index 3492c46..dacda7f 100644 --- a/math/s_expf.c +++ b/math/s_expf.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 #include "v_expf.c" diff --git a/math/s_expf_1u.c b/math/s_expf_1u.c index eb7bbcb..0009644 100644 --- a/math/s_expf_1u.c +++ b/math/s_expf_1u.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2019, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 #include "v_expf_1u.c" diff --git a/math/s_log.c b/math/s_log.c index 23289cf..27d2eb2 100644 --- a/math/s_log.c +++ b/math/s_log.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 #include "v_log.c" diff --git a/math/s_logf.c b/math/s_logf.c index 9399350..7d98b2b 100644 --- a/math/s_logf.c +++ b/math/s_logf.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 #include "v_logf.c" diff --git a/math/s_pow.c b/math/s_pow.c index 2e34c9f..6eca2b2 100644 --- a/math/s_pow.c +++ b/math/s_pow.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 #include "v_pow.c" diff --git a/math/s_powf.c b/math/s_powf.c index 6d91a4a..1d55d90 100644 --- a/math/s_powf.c +++ b/math/s_powf.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 #include "v_powf.c" diff --git a/math/s_sin.c b/math/s_sin.c index 06982c2..0c61712 100644 --- a/math/s_sin.c +++ b/math/s_sin.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 #include "v_sin.c" diff --git a/math/s_sinf.c b/math/s_sinf.c index 68ca908..3aae611 100644 --- a/math/s_sinf.c +++ b/math/s_sinf.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 #include "v_sinf.c" diff --git a/math/sincosf.c b/math/sincosf.c index 6fb299d..446f21d 100644 --- a/math/sincosf.c +++ b/math/sincosf.c @@ -1,8 +1,8 @@ /* * Single-precision sin/cos function. * - * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/sincosf.h b/math/sincosf.h index 5912469..ec23ed7 100644 --- a/math/sincosf.h +++ b/math/sincosf.h @@ -1,8 +1,8 @@ /* * Header for sinf, cosf and sincosf. * - * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/sincosf_data.c b/math/sincosf_data.c index ab4ac47..2252529 100644 --- a/math/sincosf_data.c +++ b/math/sincosf_data.c @@ -2,7 +2,7 @@ * Data definition for sinf, cosf and sincosf. * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/sinf.c b/math/sinf.c index 4d2cbd6..8dd8ae4 100644 --- a/math/sinf.c +++ b/math/sinf.c @@ -1,8 +1,8 @@ /* * Single-precision sin function. * - * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2021, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/test/mathbench.c b/math/test/mathbench.c index 0c17826..8f305a2 100644 --- a/math/test/mathbench.c +++ b/math/test/mathbench.c @@ -2,7 +2,7 @@ * Microbenchmark for math functions. * * Copyright (c) 2018-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #undef _GNU_SOURCE diff --git a/math/test/mathtest.c b/math/test/mathtest.c index 3108967..85a42a7 100644 --- a/math/test/mathtest.c +++ b/math/test/mathtest.c @@ -2,7 +2,7 @@ * mathtest.c - test rig for mathlib * * Copyright (c) 1998-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/test/rtest/dotest.c b/math/test/rtest/dotest.c index 6be79e1..5b3e9b4 100644 --- a/math/test/rtest/dotest.c +++ b/math/test/rtest/dotest.c @@ -2,7 +2,7 @@ * dotest.c - actually generate mathlib test cases * * Copyright (c) 1999-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/test/rtest/intern.h b/math/test/rtest/intern.h index 12a9c74..3ebd7dd 100644 --- a/math/test/rtest/intern.h +++ b/math/test/rtest/intern.h @@ -2,7 +2,7 @@ * intern.h * * Copyright (c) 1999-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef mathtest_intern_h diff --git a/math/test/rtest/main.c b/math/test/rtest/main.c index 0d8ead8..3d533c9 100644 --- a/math/test/rtest/main.c +++ b/math/test/rtest/main.c @@ -2,7 +2,7 @@ * main.c * * Copyright (c) 1999-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/test/rtest/random.c b/math/test/rtest/random.c index 5612396..1de3258 100644 --- a/math/test/rtest/random.c +++ b/math/test/rtest/random.c @@ -2,7 +2,7 @@ * random.c - random number generator for producing mathlib test cases * * Copyright (c) 1998-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "types.h" diff --git a/math/test/rtest/random.h b/math/test/rtest/random.h index b4b22df..0b477d7 100644 --- a/math/test/rtest/random.h +++ b/math/test/rtest/random.h @@ -2,7 +2,7 @@ * random.h - header for random.c * * Copyright (c) 2009-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "types.h" diff --git a/math/test/rtest/semi.c b/math/test/rtest/semi.c index c9f0daf..70a7844 100644 --- a/math/test/rtest/semi.c +++ b/math/test/rtest/semi.c @@ -2,7 +2,7 @@ * semi.c: test implementations of mathlib seminumerical functions * * Copyright (c) 1999-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/test/rtest/semi.h b/math/test/rtest/semi.h index 17dc415..7a1444e 100644 --- a/math/test/rtest/semi.h +++ b/math/test/rtest/semi.h @@ -2,7 +2,7 @@ * semi.h: header for semi.c * * Copyright (c) 1999-2019, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef test_semi_h diff --git a/math/test/rtest/types.h b/math/test/rtest/types.h index 53cd557..e15b4e0 100644 --- a/math/test/rtest/types.h +++ b/math/test/rtest/types.h @@ -2,7 +2,7 @@ * types.h * * Copyright (c) 2005-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef mathtest_types_h diff --git a/math/test/rtest/wrappers.c b/math/test/rtest/wrappers.c index de45ac5..4410171 100644 --- a/math/test/rtest/wrappers.c +++ b/math/test/rtest/wrappers.c @@ -2,7 +2,7 @@ * wrappers.c - wrappers to modify output of MPFR/MPC test functions * * Copyright (c) 2014-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/test/rtest/wrappers.h b/math/test/rtest/wrappers.h index 7b09c85..0a8a587 100644 --- a/math/test/rtest/wrappers.h +++ b/math/test/rtest/wrappers.h @@ -2,7 +2,7 @@ * wrappers.h - wrappers to modify output of MPFR/MPC test functions * * Copyright (c) 2014-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ typedef struct { diff --git a/math/test/runulp.sh b/math/test/runulp.sh index 0190d9a..2b42ae1 100755 --- a/math/test/runulp.sh +++ b/math/test/runulp.sh @@ -3,7 +3,7 @@ # ULP error check script. # # Copyright (c) 2019-2020, Arm Limited. -# SPDX-License-Identifier: MIT +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception #set -x set -eu diff --git a/math/test/testcases/directed/cosf.tst b/math/test/testcases/directed/cosf.tst index 7916044..7ea0d45 100644 --- a/math/test/testcases/directed/cosf.tst +++ b/math/test/testcases/directed/cosf.tst @@ -1,7 +1,7 @@ ; cosf.tst - Directed test cases for SP cosine ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=cosf op1=7fc00001 result=7fc00001 errno=0 func=cosf op1=ffc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/erf.tst b/math/test/testcases/directed/erf.tst index 7fa4d18..12384ce 100644 --- a/math/test/testcases/directed/erf.tst +++ b/math/test/testcases/directed/erf.tst @@ -1,7 +1,7 @@ ; erf.tst - Directed test cases for erf ; ; Copyright (c) 2007-2020, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=erf op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=erf op1=fff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/math/test/testcases/directed/erff.tst b/math/test/testcases/directed/erff.tst index d05b7b1..28f8fa3 100644 --- a/math/test/testcases/directed/erff.tst +++ b/math/test/testcases/directed/erff.tst @@ -1,7 +1,7 @@ ; erff.tst ; ; Copyright (c) 2007-2020, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=erff op1=7fc00001 result=7fc00001 errno=0 func=erff op1=ffc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/exp.tst b/math/test/testcases/directed/exp.tst index 85d556c..0bb2ef4 100644 --- a/math/test/testcases/directed/exp.tst +++ b/math/test/testcases/directed/exp.tst @@ -1,7 +1,7 @@ ; Directed test cases for exp ; ; Copyright (c) 2018-2019, Arm Limited. 
-; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=exp op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=exp op1=fff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/math/test/testcases/directed/exp2.tst b/math/test/testcases/directed/exp2.tst index fa56c9f..7069f90 100644 --- a/math/test/testcases/directed/exp2.tst +++ b/math/test/testcases/directed/exp2.tst @@ -1,7 +1,7 @@ ; Directed test cases for exp2 ; ; Copyright (c) 2018-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=exp2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=exp2 op1=fff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/math/test/testcases/directed/exp2f.tst b/math/test/testcases/directed/exp2f.tst index 38cfc3f..6ca2eea 100644 --- a/math/test/testcases/directed/exp2f.tst +++ b/math/test/testcases/directed/exp2f.tst @@ -1,7 +1,7 @@ ; exp2f.tst - Directed test cases for exp2f ; ; Copyright (c) 2017-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=exp2f op1=7fc00001 result=7fc00001 errno=0 func=exp2f op1=ffc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/expf.tst b/math/test/testcases/directed/expf.tst index ff0f671..89ae8fe 100644 --- a/math/test/testcases/directed/expf.tst +++ b/math/test/testcases/directed/expf.tst @@ -1,7 +1,7 @@ ; expf.tst - Directed test cases for expf ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=expf op1=7fc00001 result=7fc00001 errno=0 func=expf op1=ffc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/log.tst b/math/test/testcases/directed/log.tst index a0aa398..686ea83 100644 --- a/math/test/testcases/directed/log.tst +++ b/math/test/testcases/directed/log.tst @@ -1,7 +1,7 @@ ; Directed test cases for log ; ; Copyright (c) 2018-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=log op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=log op1=fff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/math/test/testcases/directed/log2.tst b/math/test/testcases/directed/log2.tst index ff1286c..361bdde 100644 --- a/math/test/testcases/directed/log2.tst +++ b/math/test/testcases/directed/log2.tst @@ -1,7 +1,7 @@ ; Directed test cases for log2 ; ; Copyright (c) 2018-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=log2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=log2 op1=fff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/math/test/testcases/directed/log2f.tst b/math/test/testcases/directed/log2f.tst index 5832c4f..5fce051 100644 --- a/math/test/testcases/directed/log2f.tst +++ b/math/test/testcases/directed/log2f.tst @@ -1,7 +1,7 @@ ; log2f.tst - Directed test cases for log2f ; ; Copyright (c) 2017-2019, Arm Limited. 
-; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=log2f op1=7fc00001 result=7fc00001 errno=0 func=log2f op1=ffc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/logf.tst b/math/test/testcases/directed/logf.tst index 6e68a36..a6d1b9d 100644 --- a/math/test/testcases/directed/logf.tst +++ b/math/test/testcases/directed/logf.tst @@ -1,7 +1,7 @@ ; logf.tst - Directed test cases for logf ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=logf op1=7fc00001 result=7fc00001 errno=0 func=logf op1=ffc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/pow.tst b/math/test/testcases/directed/pow.tst index 1966581..879d128 100644 --- a/math/test/testcases/directed/pow.tst +++ b/math/test/testcases/directed/pow.tst @@ -1,7 +1,7 @@ ; Directed test cases for pow ; ; Copyright (c) 2018-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=pow op1=00000000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=00000000.00000000 op2=00000000.00000001 result=00000000.00000000 errno=0 diff --git a/math/test/testcases/directed/powf.tst b/math/test/testcases/directed/powf.tst index 3fa8b11..46d5224 100644 --- a/math/test/testcases/directed/powf.tst +++ b/math/test/testcases/directed/powf.tst @@ -1,7 +1,7 @@ ; powf.tst - Directed test cases for powf ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=powf op1=7f800001 op2=7f800001 result=7fc00001 errno=0 status=i func=powf op1=7f800001 op2=ff800001 result=7fc00001 errno=0 status=i diff --git a/math/test/testcases/directed/sincosf.tst b/math/test/testcases/directed/sincosf.tst index 4b33d22..cddb346 100644 --- a/math/test/testcases/directed/sincosf.tst +++ b/math/test/testcases/directed/sincosf.tst @@ -1,7 +1,7 @@ ; Directed test cases for SP sincos ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=sincosf_sinf op1=7fc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/sinf.tst b/math/test/testcases/directed/sinf.tst index ded80b1..041b13d 100644 --- a/math/test/testcases/directed/sinf.tst +++ b/math/test/testcases/directed/sinf.tst @@ -1,7 +1,7 @@ ; sinf.tst - Directed test cases for SP sine ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=sinf op1=7fc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/random/double.tst b/math/test/testcases/random/double.tst index c24ff80..8e885d6 100644 --- a/math/test/testcases/random/double.tst +++ b/math/test/testcases/random/double.tst @@ -1,7 +1,7 @@ !! double.tst - Random test case specification for DP functions !! !! Copyright (c) 1999-2019, Arm Limited. -!! SPDX-License-Identifier: MIT +!! SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception test exp 10000 test exp2 10000 diff --git a/math/test/testcases/random/float.tst b/math/test/testcases/random/float.tst index d02a227..ea4a5a0 100644 --- a/math/test/testcases/random/float.tst +++ b/math/test/testcases/random/float.tst @@ -1,7 +1,7 @@ !! single.tst - Random test case specification for SP functions !! !! Copyright (c) 1999-2019, Arm Limited. -!! 
SPDX-License-Identifier: MIT +!! SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception test sinf 10000 test cosf 10000 diff --git a/math/test/ulp.c b/math/test/ulp.c index 51479b8..336a9be 100644 --- a/math/test/ulp.c +++ b/math/test/ulp.c @@ -2,7 +2,7 @@ * ULP error checking tool for math functions. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/test/ulp.h b/math/test/ulp.h index a0c3016..327b4bd 100644 --- a/math/test/ulp.h +++ b/math/test/ulp.h @@ -2,7 +2,7 @@ * Generic functions for ULP error estimation. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* For each different math function type, diff --git a/math/tools/cos.sollya b/math/tools/cos.sollya index bd72d6b..6690adf 100644 --- a/math/tools/cos.sollya +++ b/math/tools/cos.sollya @@ -1,7 +1,7 @@ // polynomial for approximating cos(x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 8; // polynomial degree a = -pi/4; // interval diff --git a/math/tools/exp.sollya b/math/tools/exp.sollya index b7a462c..0668bdb 100644 --- a/math/tools/exp.sollya +++ b/math/tools/exp.sollya @@ -1,7 +1,7 @@ // polynomial for approximating e^x // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 5; // poly degree N = 128; // table entries diff --git a/math/tools/exp2.sollya b/math/tools/exp2.sollya index e760769..bd0a42d 100644 --- a/math/tools/exp2.sollya +++ b/math/tools/exp2.sollya @@ -1,7 +1,7 @@ // polynomial for approximating 2^x // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception // exp2f parameters deg = 3; // poly degree diff --git a/math/tools/log.sollya b/math/tools/log.sollya index 6df4db4..5288f55 100644 --- a/math/tools/log.sollya +++ b/math/tools/log.sollya @@ -1,7 +1,7 @@ // polynomial for approximating log(1+x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 12; // poly degree // |log(1+x)| > 0x1p-4 outside the interval diff --git a/math/tools/log2.sollya b/math/tools/log2.sollya index 4a364c0..85811be 100644 --- a/math/tools/log2.sollya +++ b/math/tools/log2.sollya @@ -1,7 +1,7 @@ // polynomial for approximating log2(1+x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 11; // poly degree // |log2(1+x)| > 0x1p-4 outside the interval diff --git a/math/tools/log2_abs.sollya b/math/tools/log2_abs.sollya index 82c4dac..d018ba0 100644 --- a/math/tools/log2_abs.sollya +++ b/math/tools/log2_abs.sollya @@ -1,7 +1,7 @@ // polynomial for approximating log2(1+x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 7; // poly degree // interval ~= 1/(2*N), where N is the table entries diff --git a/math/tools/log_abs.sollya b/math/tools/log_abs.sollya index a2ac190..5f9bfe4 100644 --- a/math/tools/log_abs.sollya +++ b/math/tools/log_abs.sollya @@ -1,7 +1,7 @@ // polynomial for approximating log(1+x) // // Copyright (c) 2019, Arm Limited. 
-// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 6; // poly degree // interval ~= 1/(2*N), where N is the table entries diff --git a/math/tools/plot.py b/math/tools/plot.py index 6c8b89f..a0fa023 100755 --- a/math/tools/plot.py +++ b/math/tools/plot.py @@ -3,7 +3,7 @@ # ULP error plot tool. # # Copyright (c) 2019, Arm Limited. -# SPDX-License-Identifier: MIT +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception import numpy as np import matplotlib.pyplot as plt diff --git a/math/tools/remez.jl b/math/tools/remez.jl index 2ff436f..1deab67 100755 --- a/math/tools/remez.jl +++ b/math/tools/remez.jl @@ -4,7 +4,7 @@ # remez.jl - implementation of the Remez algorithm for polynomial approximation # # Copyright (c) 2015-2019, Arm Limited. -# SPDX-License-Identifier: MIT +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception import Base.\ diff --git a/math/tools/sin.sollya b/math/tools/sin.sollya index a6e8511..a193000 100644 --- a/math/tools/sin.sollya +++ b/math/tools/sin.sollya @@ -1,7 +1,7 @@ // polynomial for approximating sin(x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 7; // polynomial degree a = -pi/4; // interval diff --git a/math/tools/v_exp.sollya b/math/tools/v_exp.sollya index c0abb63..5fa7de7 100644 --- a/math/tools/v_exp.sollya +++ b/math/tools/v_exp.sollya @@ -1,7 +1,7 @@ // polynomial for approximating e^x // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 4; // poly degree N = 128; // table entries diff --git a/math/tools/v_log.sollya b/math/tools/v_log.sollya index cc3d2c4..d982524 100644 --- a/math/tools/v_log.sollya +++ b/math/tools/v_log.sollya @@ -1,7 +1,7 @@ // polynomial used for __v_log(x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 6; // poly degree a = -0x1.fc1p-9; diff --git a/math/tools/v_sin.sollya b/math/tools/v_sin.sollya index 65cc995..63b9d65 100644 --- a/math/tools/v_sin.sollya +++ b/math/tools/v_sin.sollya @@ -1,7 +1,7 @@ // polynomial for approximating sin(x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 15; // polynomial degree a = -pi/2; // interval diff --git a/math/v_cos.c b/math/v_cos.c index 20ba6bd..eb7e337 100644 --- a/math/v_cos.c +++ b/math/v_cos.c @@ -2,7 +2,7 @@ * Double-precision vector cos function. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" diff --git a/math/v_cosf.c b/math/v_cosf.c index 150294b..e1d656c 100644 --- a/math/v_cosf.c +++ b/math/v_cosf.c @@ -2,7 +2,7 @@ * Single-precision vector cos function. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" diff --git a/math/v_exp.c b/math/v_exp.c index e459d53..039504d 100644 --- a/math/v_exp.c +++ b/math/v_exp.c @@ -2,7 +2,7 @@ * Double-precision vector e^x function. * * Copyright (c) 2019, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" diff --git a/math/v_exp.h b/math/v_exp.h index 305da19..1e7f7f3 100644 --- a/math/v_exp.h +++ b/math/v_exp.h @@ -2,7 +2,7 @@ * Declarations for double-precision e^x vector function. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" diff --git a/math/v_exp2f.c b/math/v_exp2f.c index e3ea5af..b817560 100644 --- a/math/v_exp2f.c +++ b/math/v_exp2f.c @@ -2,7 +2,7 @@ * Single-precision vector 2^x function. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" diff --git a/math/v_exp2f_1u.c b/math/v_exp2f_1u.c index 1caa14d..de1a32d 100644 --- a/math/v_exp2f_1u.c +++ b/math/v_exp2f_1u.c @@ -2,7 +2,7 @@ * Single-precision vector 2^x function. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" diff --git a/math/v_exp_data.c b/math/v_exp_data.c index 3653554..30421da 100644 --- a/math/v_exp_data.c +++ b/math/v_exp_data.c @@ -2,7 +2,7 @@ * Lookup table for double-precision e^x vector function. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_exp.h" diff --git a/math/v_expf.c b/math/v_expf.c index d403e00..2707ebc 100644 --- a/math/v_expf.c +++ b/math/v_expf.c @@ -2,7 +2,7 @@ * Single-precision vector e^x function. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" diff --git a/math/v_expf_1u.c b/math/v_expf_1u.c index 023bd24..8f0ae91 100644 --- a/math/v_expf_1u.c +++ b/math/v_expf_1u.c @@ -2,7 +2,7 @@ * Single-precision vector e^x function. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" diff --git a/math/v_log.c b/math/v_log.c index d84c740..47a8291 100644 --- a/math/v_log.c +++ b/math/v_log.c @@ -2,7 +2,7 @@ * Double-precision vector log(x) function. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" diff --git a/math/v_log.h b/math/v_log.h index bcc2fa6..a37bbc2 100644 --- a/math/v_log.h +++ b/math/v_log.h @@ -2,7 +2,7 @@ * Declarations for double-precision log(x) vector function. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" diff --git a/math/v_log_data.c b/math/v_log_data.c index 97ee5b0..ec1c8e5 100644 --- a/math/v_log_data.c +++ b/math/v_log_data.c @@ -2,7 +2,7 @@ * Lookup table for double-precision log(x) vector function. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_log.h" diff --git a/math/v_logf.c b/math/v_logf.c index 7373192..93a5375 100644 --- a/math/v_logf.c +++ b/math/v_logf.c @@ -2,7 +2,7 @@ * Single-precision vector log function. * * Copyright (c) 2019, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" diff --git a/math/v_math.h b/math/v_math.h index f2cc467..31df7ee 100644 --- a/math/v_math.h +++ b/math/v_math.h @@ -2,7 +2,7 @@ * Vector math abstractions. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _V_MATH_H diff --git a/math/v_pow.c b/math/v_pow.c index a209d57..05a83aa 100644 --- a/math/v_pow.c +++ b/math/v_pow.c @@ -2,7 +2,7 @@ * Double-precision vector pow function. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" diff --git a/math/v_powf.c b/math/v_powf.c index fb80fa6..ad8ab8d 100644 --- a/math/v_powf.c +++ b/math/v_powf.c @@ -2,7 +2,7 @@ * Single-precision vector powf function. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" diff --git a/math/v_sin.c b/math/v_sin.c index 2b9ed05..4e03576 100644 --- a/math/v_sin.c +++ b/math/v_sin.c @@ -2,7 +2,7 @@ * Double-precision vector sin function. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" diff --git a/math/v_sinf.c b/math/v_sinf.c index e66bfce..d2e18b5 100644 --- a/math/v_sinf.c +++ b/math/v_sinf.c @@ -2,7 +2,7 @@ * Single-precision vector sin function. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" diff --git a/math/vn_cos.c b/math/vn_cos.c index b57a549..4b5b237 100644 --- a/math/vn_cos.c +++ b/math/vn_cos.c @@ -2,7 +2,7 @@ * AdvSIMD vector PCS variant of __v_cos. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #ifdef __vpcs diff --git a/math/vn_cosf.c b/math/vn_cosf.c index 6321d46..86dd26e 100644 --- a/math/vn_cosf.c +++ b/math/vn_cosf.c @@ -2,7 +2,7 @@ * AdvSIMD vector PCS variant of __v_cosf. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #ifdef __vpcs diff --git a/math/vn_exp.c b/math/vn_exp.c index 06e269d..0d85b17 100644 --- a/math/vn_exp.c +++ b/math/vn_exp.c @@ -2,7 +2,7 @@ * AdvSIMD vector PCS variant of __v_exp. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #ifdef __vpcs diff --git a/math/vn_exp2f.c b/math/vn_exp2f.c index db9707e..da3bb40 100644 --- a/math/vn_exp2f.c +++ b/math/vn_exp2f.c @@ -2,7 +2,7 @@ * AdvSIMD vector PCS variant of __v_exp2f. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #ifdef __vpcs diff --git a/math/vn_exp2f_1u.c b/math/vn_exp2f_1u.c index 17bd0ab..3e3a247 100644 --- a/math/vn_exp2f_1u.c +++ b/math/vn_exp2f_1u.c @@ -2,7 +2,7 @@ * AdvSIMD vector PCS variant of __v_exp2f_1u. * * Copyright (c) 2019, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #ifdef __vpcs diff --git a/math/vn_expf.c b/math/vn_expf.c index 0652907..6e91a94 100644 --- a/math/vn_expf.c +++ b/math/vn_expf.c @@ -2,7 +2,7 @@ * AdvSIMD vector PCS variant of __v_expf. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #ifdef __vpcs diff --git a/math/vn_expf_1u.c b/math/vn_expf_1u.c index 3be7768..57ae6a3 100644 --- a/math/vn_expf_1u.c +++ b/math/vn_expf_1u.c @@ -2,7 +2,7 @@ * AdvSIMD vector PCS variant of __v_expf_1u. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #ifdef __vpcs diff --git a/math/vn_log.c b/math/vn_log.c index b58fe8f..902bff1 100644 --- a/math/vn_log.c +++ b/math/vn_log.c @@ -2,7 +2,7 @@ * AdvSIMD vector PCS variant of __v_log. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #ifdef __vpcs diff --git a/math/vn_logf.c b/math/vn_logf.c index cc5b8ae..07e4936 100644 --- a/math/vn_logf.c +++ b/math/vn_logf.c @@ -2,7 +2,7 @@ * AdvSIMD vector PCS variant of __v_logf. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #ifdef __vpcs diff --git a/math/vn_pow.c b/math/vn_pow.c index 2609501..1a980ff 100644 --- a/math/vn_pow.c +++ b/math/vn_pow.c @@ -2,7 +2,7 @@ * AdvSIMD vector PCS variant of __v_pow. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #ifdef __vpcs diff --git a/math/vn_powf.c b/math/vn_powf.c index 095d07e..a42ade3 100644 --- a/math/vn_powf.c +++ b/math/vn_powf.c @@ -2,7 +2,7 @@ * AdvSIMD vector PCS variant of __v_powf. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #ifdef __vpcs diff --git a/math/vn_sin.c b/math/vn_sin.c index 905c796..64b05c8 100644 --- a/math/vn_sin.c +++ b/math/vn_sin.c @@ -2,7 +2,7 @@ * AdvSIMD vector PCS variant of __v_sin. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #ifdef __vpcs diff --git a/math/vn_sinf.c b/math/vn_sinf.c index 1214e1a..6e880c6 100644 --- a/math/vn_sinf.c +++ b/math/vn_sinf.c @@ -2,7 +2,7 @@ * AdvSIMD vector PCS variant of __v_sinf. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #ifdef __vpcs diff --git a/networking/Dir.mk b/networking/Dir.mk index b496103..2589e0a 100644 --- a/networking/Dir.mk +++ b/networking/Dir.mk @@ -1,7 +1,7 @@ # Makefile fragment - requires GNU make # # Copyright (c) 2019-2020, Arm Limited. 
-# SPDX-License-Identifier: MIT +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception S := $(srcdir)/networking B := build/networking diff --git a/networking/aarch64/chksum_simd.c b/networking/aarch64/chksum_simd.c index 6d5be58..90c00eb 100644 --- a/networking/aarch64/chksum_simd.c +++ b/networking/aarch64/chksum_simd.c @@ -2,7 +2,7 @@ * AArch64-specific checksum implementation using NEON * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "networking.h" diff --git a/networking/arm/chksum_simd.c b/networking/arm/chksum_simd.c index 7f69adf..ae08fe5 100644 --- a/networking/arm/chksum_simd.c +++ b/networking/arm/chksum_simd.c @@ -2,7 +2,7 @@ * Armv7-A specific checksum implementation using NEON * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "networking.h" diff --git a/networking/chksum.c b/networking/chksum.c index 95ce5ba..329482f 100644 --- a/networking/chksum.c +++ b/networking/chksum.c @@ -3,7 +3,7 @@ * This sum is often used as a simple checksum in networking. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "networking.h" diff --git a/networking/chksum_common.h b/networking/chksum_common.h index 958c8cc..16f0f6c 100644 --- a/networking/chksum_common.h +++ b/networking/chksum_common.h @@ -2,7 +2,7 @@ * Common code for checksum implementations * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef CHKSUM_COMMON_H diff --git a/networking/include/networking.h b/networking/include/networking.h index a88feff..297dd4b 100644 --- a/networking/include/networking.h +++ b/networking/include/networking.h @@ -2,7 +2,7 @@ * Public API. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ unsigned short __chksum (const void *, unsigned int); diff --git a/networking/test/chksum.c b/networking/test/chksum.c index 41b9812..239b5b8 100644 --- a/networking/test/chksum.c +++ b/networking/test/chksum.c @@ -2,7 +2,7 @@ * Ones' complement checksum test & benchmark * * Copyright (c) 2016-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define _GNU_SOURCE diff --git a/string/Dir.mk b/string/Dir.mk index cf3453f..40ff5ac 100644 --- a/string/Dir.mk +++ b/string/Dir.mk @@ -1,7 +1,7 @@ # Makefile fragment - requires GNU make # # Copyright (c) 2019-2021, Arm Limited. -# SPDX-License-Identifier: MIT +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception S := $(srcdir)/string B := build/string diff --git a/string/aarch64/__mtag_tag_region.S b/string/aarch64/__mtag_tag_region.S index 84339f7..84b8c94 100644 --- a/string/aarch64/__mtag_tag_region.S +++ b/string/aarch64/__mtag_tag_region.S @@ -2,7 +2,7 @@ * __mtag_tag_region - tag memory * * Copyright (c) 2021, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: diff --git a/string/aarch64/__mtag_tag_zero_region.S b/string/aarch64/__mtag_tag_zero_region.S index f58364c..97ae68e 100644 --- a/string/aarch64/__mtag_tag_zero_region.S +++ b/string/aarch64/__mtag_tag_zero_region.S @@ -2,7 +2,7 @@ * __mtag_tag_zero_region - tag memory and fill it with zero bytes * * Copyright (c) 2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: diff --git a/string/aarch64/check-arch.S b/string/aarch64/check-arch.S index 5a54242..1565465 100644 --- a/string/aarch64/check-arch.S +++ b/string/aarch64/check-arch.S @@ -2,7 +2,7 @@ * check ARCH setting. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if !__aarch64__ diff --git a/string/aarch64/memchr-mte.S b/string/aarch64/memchr-mte.S index c2e967d..8441585 100644 --- a/string/aarch64/memchr-mte.S +++ b/string/aarch64/memchr-mte.S @@ -2,7 +2,7 @@ * memchr - find a character in a memory zone * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: diff --git a/string/aarch64/memchr-sve.S b/string/aarch64/memchr-sve.S index c22e659..820228e 100644 --- a/string/aarch64/memchr-sve.S +++ b/string/aarch64/memchr-sve.S @@ -2,7 +2,7 @@ * memchr - find a character in a memory zone * * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "../asmdefs.h" diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S index 353f0d1..5879c1c 100644 --- a/string/aarch64/memchr.S +++ b/string/aarch64/memchr.S @@ -2,7 +2,7 @@ * memchr - find a character in a memory zone * * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: diff --git a/string/aarch64/memcmp-sve.S b/string/aarch64/memcmp-sve.S index 78c5eca..d29588c 100644 --- a/string/aarch64/memcmp-sve.S +++ b/string/aarch64/memcmp-sve.S @@ -2,7 +2,7 @@ * memcmp - compare memory * * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "../asmdefs.h" diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S index 7ca1135..e19521f 100644 --- a/string/aarch64/memcmp.S +++ b/string/aarch64/memcmp.S @@ -1,7 +1,7 @@ /* memcmp - compare memory * * Copyright (c) 2013-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: diff --git a/string/aarch64/memcpy-advsimd.S b/string/aarch64/memcpy-advsimd.S index f97f2c3..d1368d0 100644 --- a/string/aarch64/memcpy-advsimd.S +++ b/string/aarch64/memcpy-advsimd.S @@ -2,7 +2,7 @@ * memcpy - copy memory area * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: diff --git a/string/aarch64/memcpy-sve.S b/string/aarch64/memcpy-sve.S index f85e800..66ae896 100644 --- a/string/aarch64/memcpy-sve.S +++ b/string/aarch64/memcpy-sve.S @@ -2,7 +2,7 @@ * memcpy - copy memory area * * Copyright (c) 2019-2022, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S index dd254f6..36aaf60 100644 --- a/string/aarch64/memcpy.S +++ b/string/aarch64/memcpy.S @@ -2,7 +2,7 @@ * memcpy - copy memory area * * Copyright (c) 2012-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: diff --git a/string/aarch64/memrchr.S b/string/aarch64/memrchr.S index 7b4be84..ff4f47a 100644 --- a/string/aarch64/memrchr.S +++ b/string/aarch64/memrchr.S @@ -2,7 +2,7 @@ * memrchr - find last character in a memory zone. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S index 9fcd975..ad0b0d6 100644 --- a/string/aarch64/memset.S +++ b/string/aarch64/memset.S @@ -2,7 +2,7 @@ * memset - fill memory with a constant byte * * Copyright (c) 2012-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: diff --git a/string/aarch64/stpcpy-sve.S b/string/aarch64/stpcpy-sve.S index 82dd971..5d3f14b 100644 --- a/string/aarch64/stpcpy-sve.S +++ b/string/aarch64/stpcpy-sve.S @@ -2,7 +2,7 @@ * stpcpy - copy a string returning pointer to end. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define BUILD_STPCPY 1 diff --git a/string/aarch64/stpcpy.S b/string/aarch64/stpcpy.S index 4f62aa4..155c68d 100644 --- a/string/aarch64/stpcpy.S +++ b/string/aarch64/stpcpy.S @@ -2,7 +2,7 @@ * stpcpy - copy a string returning pointer to end. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define BUILD_STPCPY 1 diff --git a/string/aarch64/strchr-mte.S b/string/aarch64/strchr-mte.S index dcb0e46..4ed6cce 100644 --- a/string/aarch64/strchr-mte.S +++ b/string/aarch64/strchr-mte.S @@ -2,7 +2,7 @@ * strchr - find a character in a string * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: diff --git a/string/aarch64/strchr-sve.S b/string/aarch64/strchr-sve.S index 13ba9f4..1b984b9 100644 --- a/string/aarch64/strchr-sve.S +++ b/string/aarch64/strchr-sve.S @@ -2,7 +2,7 @@ * strchr/strchrnul - find a character in a string * * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "../asmdefs.h" diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S index 1063cbf..3aab56c 100644 --- a/string/aarch64/strchr.S +++ b/string/aarch64/strchr.S @@ -2,7 +2,7 @@ * strchr - find a character in a string * * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: diff --git a/string/aarch64/strchrnul-mte.S b/string/aarch64/strchrnul-mte.S index 1b0d0a6..4a08b52 100644 --- a/string/aarch64/strchrnul-mte.S +++ b/string/aarch64/strchrnul-mte.S @@ -2,7 +2,7 @@ * strchrnul - find a character or nul in a string * * Copyright (c) 2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: diff --git a/string/aarch64/strchrnul-sve.S b/string/aarch64/strchrnul-sve.S index 428ff1a..0005f91 100644 --- a/string/aarch64/strchrnul-sve.S +++ b/string/aarch64/strchrnul-sve.S @@ -2,7 +2,7 @@ * strchrnul - find a character or nul in a string * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define BUILD_STRCHRNUL diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S index a4230d9..dc57f5f 100644 --- a/string/aarch64/strchrnul.S +++ b/string/aarch64/strchrnul.S @@ -2,7 +2,7 @@ * strchrnul - find a character or nul in a string * * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: diff --git a/string/aarch64/strcmp-sve.S b/string/aarch64/strcmp-sve.S index e6d2da5..6ce80e3 100644 --- a/string/aarch64/strcmp-sve.S +++ b/string/aarch64/strcmp-sve.S @@ -2,7 +2,7 @@ * __strcmp_aarch64_sve - compare two strings * * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "../asmdefs.h" diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S index 6e77845..bc1f74e 100644 --- a/string/aarch64/strcmp.S +++ b/string/aarch64/strcmp.S @@ -2,7 +2,7 @@ * strcmp - compare two strings * * Copyright (c) 2012-2022, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/string/aarch64/strcpy-sve.S b/string/aarch64/strcpy-sve.S index f515462..3ce951c 100644 --- a/string/aarch64/strcpy-sve.S +++ b/string/aarch64/strcpy-sve.S @@ -2,7 +2,7 @@ * strcpy/stpcpy - copy a string returning pointer to start/end. * * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "../asmdefs.h" diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S index b99e494..92b2850 100644 --- a/string/aarch64/strcpy.S +++ b/string/aarch64/strcpy.S @@ -2,7 +2,7 @@ * strcpy/stpcpy - copy a string returning pointer to start/end. * * Copyright (c) 2020-2022, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: diff --git a/string/aarch64/strlen-mte.S b/string/aarch64/strlen-mte.S index 7cf41d5..45103ff 100644 --- a/string/aarch64/strlen-mte.S +++ b/string/aarch64/strlen-mte.S @@ -2,7 +2,7 @@ * strlen - calculate the length of a string. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: diff --git a/string/aarch64/strlen-sve.S b/string/aarch64/strlen-sve.S index 2392493..0fd663f 100644 --- a/string/aarch64/strlen-sve.S +++ b/string/aarch64/strlen-sve.S @@ -2,7 +2,7 @@ * __strlen_aarch64_sve - compute the length of a string * * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "../asmdefs.h" diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S index a1b164a..98145f9 100644 --- a/string/aarch64/strlen.S +++ b/string/aarch64/strlen.S @@ -2,7 +2,7 @@ * strlen - calculate the length of a string. * * Copyright (c) 2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: diff --git a/string/aarch64/strncmp-sve.S b/string/aarch64/strncmp-sve.S index 234190e..08b9a7e 100644 --- a/string/aarch64/strncmp-sve.S +++ b/string/aarch64/strncmp-sve.S @@ -2,7 +2,7 @@ * strncmp - compare two strings with limit * * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "../asmdefs.h" diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S index 7e636b4..6957d07 100644 --- a/string/aarch64/strncmp.S +++ b/string/aarch64/strncmp.S @@ -2,7 +2,7 @@ * strncmp - compare two strings * * Copyright (c) 2013-2022, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: diff --git a/string/aarch64/strnlen-sve.S b/string/aarch64/strnlen-sve.S index 5b9ebf7..ec6f881 100644 --- a/string/aarch64/strnlen-sve.S +++ b/string/aarch64/strnlen-sve.S @@ -2,7 +2,7 @@ * strnlen - calculate the length of a string with limit. * * Copyright (c) 2019-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "../asmdefs.h" diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S index 48d2495..e09dd1b 100644 --- a/string/aarch64/strnlen.S +++ b/string/aarch64/strnlen.S @@ -2,7 +2,7 @@ * strnlen - calculate the length of a string with limit. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: diff --git a/string/aarch64/strrchr-mte.S b/string/aarch64/strrchr-mte.S index 1e4fb1a..dcee1bf 100644 --- a/string/aarch64/strrchr-mte.S +++ b/string/aarch64/strrchr-mte.S @@ -2,7 +2,7 @@ * strrchr - find last position of a character in a string. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: diff --git a/string/aarch64/strrchr-sve.S b/string/aarch64/strrchr-sve.S index d36d69a..f907166 100644 --- a/string/aarch64/strrchr-sve.S +++ b/string/aarch64/strrchr-sve.S @@ -2,7 +2,7 @@ * strrchr - find the last of a character in a string * * Copyright (c) 2019-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "../asmdefs.h" diff --git a/string/aarch64/strrchr.S b/string/aarch64/strrchr.S index 56185ff..a1b43ca 100644 --- a/string/aarch64/strrchr.S +++ b/string/aarch64/strrchr.S @@ -2,7 +2,7 @@ * strrchr - find last position of a character in a string. * * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: diff --git a/string/arm/check-arch.S b/string/arm/check-arch.S index 1cff934..f69e112 100644 --- a/string/arm/check-arch.S +++ b/string/arm/check-arch.S @@ -2,7 +2,7 @@ * check ARCH setting. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if !__arm__ diff --git a/string/arm/memchr.S b/string/arm/memchr.S index 3f1ac4d..1271ca1 100644 --- a/string/arm/memchr.S +++ b/string/arm/memchr.S @@ -2,7 +2,7 @@ * memchr - scan memory for a character * * Copyright (c) 2010-2021, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* diff --git a/string/arm/memcpy.S b/string/arm/memcpy.S index 86e6493..77f4553 100644 --- a/string/arm/memcpy.S +++ b/string/arm/memcpy.S @@ -2,7 +2,7 @@ * memcpy - copy memory area * * Copyright (c) 2013-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* diff --git a/string/arm/memset.S b/string/arm/memset.S index 11e9273..487b9d6 100644 --- a/string/arm/memset.S +++ b/string/arm/memset.S @@ -2,7 +2,7 @@ * memset - fill memory with a constant * * Copyright (c) 2010-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* diff --git a/string/arm/strcmp-armv6m.S b/string/arm/strcmp-armv6m.S index b75d414..8b11175 100644 --- a/string/arm/strcmp-armv6m.S +++ b/string/arm/strcmp-armv6m.S @@ -2,7 +2,7 @@ * strcmp for ARMv6-M (optimized for performance, not size) * * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1 diff --git a/string/arm/strcmp.S b/string/arm/strcmp.S index 51443e3..622efb9 100644 --- a/string/arm/strcmp.S +++ b/string/arm/strcmp.S @@ -2,7 +2,7 @@ * strcmp for ARMv7 * * Copyright (c) 2012-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1 diff --git a/string/arm/strcpy.c b/string/arm/strcpy.c index 02cf94f..b5728a2 100644 --- a/string/arm/strcpy.c +++ b/string/arm/strcpy.c @@ -2,7 +2,7 @@ * strcpy * * Copyright (c) 2008-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if defined (__thumb2__) && !defined (__thumb__) diff --git a/string/arm/strlen-armv6t2.S b/string/arm/strlen-armv6t2.S index 5ad30c9..01ebf1d 100644 --- a/string/arm/strlen-armv6t2.S +++ b/string/arm/strlen-armv6t2.S @@ -2,7 +2,7 @@ * strlen - calculate the length of a string * * Copyright (c) 2010-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2 diff --git a/string/asmdefs.h b/string/asmdefs.h index 340b427..0d6ebd7 100644 --- a/string/asmdefs.h +++ b/string/asmdefs.h @@ -2,7 +2,7 @@ * Macros for asm code. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _ASMDEFS_H diff --git a/string/bench/memcpy.c b/string/bench/memcpy.c index 6bd2763..1468663 100644 --- a/string/bench/memcpy.c +++ b/string/bench/memcpy.c @@ -1,8 +1,8 @@ /* * memcpy benchmark. * - * Copyright (c) 2020-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define _GNU_SOURCE diff --git a/string/bench/memset.c b/string/bench/memset.c index 2d61969..990e23b 100644 --- a/string/bench/memset.c +++ b/string/bench/memset.c @@ -2,7 +2,7 @@ * memset benchmark. * * Copyright (c) 2021, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define _GNU_SOURCE diff --git a/string/bench/strlen.c b/string/bench/strlen.c index b7eee6e..f05d0d5 100644 --- a/string/bench/strlen.c +++ b/string/bench/strlen.c @@ -2,7 +2,7 @@ * strlen benchmark. * * Copyright (c) 2020-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define _GNU_SOURCE diff --git a/string/include/benchlib.h b/string/include/benchlib.h index 0f2ce2e..f1bbea3 100644 --- a/string/include/benchlib.h +++ b/string/include/benchlib.h @@ -2,7 +2,7 @@ * Benchmark support functions. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/string/include/stringlib.h b/string/include/stringlib.h index 85e6302..f41a464 100644 --- a/string/include/stringlib.h +++ b/string/include/stringlib.h @@ -1,8 +1,8 @@ /* * Public API. * - * Copyright (c) 2019-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/string/test/__mtag_tag_region.c b/string/test/__mtag_tag_region.c index d8c02d9..c45fa66 100644 --- a/string/test/__mtag_tag_region.c +++ b/string/test/__mtag_tag_region.c @@ -2,7 +2,7 @@ * __mtag_tag_region test. * * Copyright (c) 2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST diff --git a/string/test/__mtag_tag_zero_region.c b/string/test/__mtag_tag_zero_region.c index 221c223..a4a7861 100644 --- a/string/test/__mtag_tag_zero_region.c +++ b/string/test/__mtag_tag_zero_region.c @@ -2,7 +2,7 @@ * __mtag_tag_zero_region test. * * Copyright (c) 2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST diff --git a/string/test/memchr.c b/string/test/memchr.c index 0ff77f5..c6a9448 100644 --- a/string/test/memchr.c +++ b/string/test/memchr.c @@ -2,7 +2,7 @@ * memchr test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/string/test/memcmp.c b/string/test/memcmp.c index 7a7cf9c..f9236b8 100644 --- a/string/test/memcmp.c +++ b/string/test/memcmp.c @@ -2,7 +2,7 @@ * memcmp test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/string/test/memcpy.c b/string/test/memcpy.c index 21b35b9..fa15a95 100644 --- a/string/test/memcpy.c +++ b/string/test/memcpy.c @@ -1,8 +1,8 @@ /* * memcpy test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/string/test/memmove.c b/string/test/memmove.c index 12a7057..5d509c0 100644 --- a/string/test/memmove.c +++ b/string/test/memmove.c @@ -1,8 +1,8 @@ /* * memmove test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/string/test/memrchr.c b/string/test/memrchr.c index adf96f0..4171a56 100644 --- a/string/test/memrchr.c +++ b/string/test/memrchr.c @@ -2,7 +2,7 @@ * memchr test. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _GNU_SOURCE diff --git a/string/test/memset.c b/string/test/memset.c index f172144..5543f44 100644 --- a/string/test/memset.c +++ b/string/test/memset.c @@ -2,7 +2,7 @@ * memset test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/string/test/mte.h b/string/test/mte.h index e67cbd9..40b0ecf 100644 --- a/string/test/mte.h +++ b/string/test/mte.h @@ -2,7 +2,7 @@ * Memory tagging testing code. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef __TEST_MTE_H diff --git a/string/test/stpcpy.c b/string/test/stpcpy.c index 1b61245..0300892 100644 --- a/string/test/stpcpy.c +++ b/string/test/stpcpy.c @@ -1,8 +1,8 @@ /* * stpcpy test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _GNU_SOURCE diff --git a/string/test/strchr.c b/string/test/strchr.c index f3ae982..66180ac 100644 --- a/string/test/strchr.c +++ b/string/test/strchr.c @@ -2,7 +2,7 @@ * strchr test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/string/test/strchrnul.c b/string/test/strchrnul.c index 6c30ab2..aad0bf5 100644 --- a/string/test/strchrnul.c +++ b/string/test/strchrnul.c @@ -2,7 +2,7 @@ * strchrnul test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _GNU_SOURCE diff --git a/string/test/strcmp.c b/string/test/strcmp.c index 0262397..4aa95f4 100644 --- a/string/test/strcmp.c +++ b/string/test/strcmp.c @@ -1,8 +1,8 @@ /* * strcmp test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/string/test/strcpy.c b/string/test/strcpy.c index 6de3bed..af297f9 100644 --- a/string/test/strcpy.c +++ b/string/test/strcpy.c @@ -1,8 +1,8 @@ /* * strcpy test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/string/test/stringtest.h b/string/test/stringtest.h index fe855fc..6bb7e1f 100644 --- a/string/test/stringtest.h +++ b/string/test/stringtest.h @@ -2,7 +2,7 @@ * Common string test code. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/string/test/strlen.c b/string/test/strlen.c index 6278380..68c51b1 100644 --- a/string/test/strlen.c +++ b/string/test/strlen.c @@ -2,7 +2,7 @@ * strlen test. * * Copyright (c) 2019-2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/string/test/strncmp.c b/string/test/strncmp.c index f8c2167..4bbab6f 100644 --- a/string/test/strncmp.c +++ b/string/test/strncmp.c @@ -1,8 +1,8 @@ /* * strncmp test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/string/test/strnlen.c b/string/test/strnlen.c index 0dea00e..a800fd1 100644 --- a/string/test/strnlen.c +++ b/string/test/strnlen.c @@ -2,7 +2,7 @@ * strnlen test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _GNU_SOURCE diff --git a/string/test/strrchr.c b/string/test/strrchr.c index fedbdc5..580ca49 100644 --- a/string/test/strrchr.c +++ b/string/test/strrchr.c @@ -2,7 +2,7 @@ * strrchr test. * * Copyright (c) 2019-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/string/x86_64/check-arch.S b/string/x86_64/check-arch.S index 26ade0a..5afcf7b 100644 --- a/string/x86_64/check-arch.S +++ b/string/x86_64/check-arch.S @@ -2,7 +2,7 @@ * check ARCH setting. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if !__x86_64__ -- cgit v1.2.3 From 998fec12c602b64261eca197f87d744183626907 Mon Sep 17 00:00:00 2001 From: Szabolcs Nagy Date: Fri, 29 Oct 2021 17:13:36 +0100 Subject: Add README.contributors Document contributor requirements. --- README | 4 ++- README.contributors | 44 ++++++++++++++++++++++++++ math/README.contributors | 78 ++++++++++++++++++++++++++++++++++++++++++++++ string/README.contributors | 30 ++++++++++++++++++ 4 files changed, 155 insertions(+), 1 deletion(-) create mode 100644 README.contributors create mode 100644 math/README.contributors create mode 100644 string/README.contributors diff --git a/README b/README index 4654282..0afca55 100644 --- a/README +++ b/README @@ -7,7 +7,9 @@ license, at the user’s election, as reflected in the LICENSE file. Contributions to this project are accepted, but Contributors have to sign an Assignment Agreement, please follow the instructions in contributor-agreement.pdf. This is needed so upstreaming code -to projects that require copyright assignment is possible. +to projects that require copyright assignment is possible. Further +contribution requirements are documented in README.contributors of +the appropriate subdirectory. Regular quarterly releases are tagged as vYY.MM, the latest release is v21.02. diff --git a/README.contributors b/README.contributors new file mode 100644 index 0000000..f8fcdde --- /dev/null +++ b/README.contributors @@ -0,0 +1,44 @@ +GENERIC CONTRIBUTION GUIDELINES +=============================== + +1. Sub-projects are maintained independently and thus have independent + contribution rules. If there exists a README.contributors in the + sub-directory to which the contribution is made, it must be followed. + +2. Legal: + - Contributors who are not employed by Arm must sign an Assignment Agreement. + See contributor-agreement.pdf. + - All code must be copyright owned by Arm Limited and the appropriate + copyright notice and license identifier must be present in every source + file. + +3. 
Build:
+   - Build should only depend on GNU make and POSIX utilities (shell, awk,
+     sed, etc.) and on a C toolchain.
+   - Build should pass with the default configuration (see config.mk.dist)
+     and other supported configurations, with both gcc and clang based
+     toolchains.  (The build should not depend on a recent toolchain; it
+     should be possible to disable the use of any new feature.)
+   - Currently there is no automated configuration; target-specific
+     configuration should be done via make variables in config.mk.  This is
+     the user interface to the build system, so it should be documented in
+     sufficient detail and kept reasonably stable.
+
+4. Testing:
+   - On aarch64 the tests must pass.  If the code may behave differently
+     under some supported configurations (e.g. CFLAGS), those should be
+     tested.
+   - New symbols are expected to have new associated test code and ideally
+     benchmark code too.
+
+5. Commits:
+   - Commit messages should be descriptive and should not refer to Arm
+     internal information (such as Jira tickets or internal discussions).
+     Non-obvious decisions should be recorded or explained in the commit
+     message if they are not explained in source comments.
+   - Ideally tools and scripts used to write the code should be added to the
+     repository or at least mentioned in the commit.
+   - Logically independent changes should not be mixed into the same commit.
+
+6. Style:
+   - Unless required otherwise by the sub-project, follow the clang-format
+     tool using the style from the gcc contrib/ directory.
diff --git a/math/README.contributors b/math/README.contributors
new file mode 100644
index 0000000..33e7ba3
--- /dev/null
+++ b/math/README.contributors
@@ -0,0 +1,78 @@
+STYLE REQUIREMENTS
+==================
+
+1. Most code in this sub-directory is expected to be upstreamed into glibc so
+   the GNU Coding Standard and glibc-specific conventions should be followed
+   to ease upstreaming.
+
+2. ABI and symbols: the code should be written so it is suitable for inclusion
+   into a libc with minimal changes. This e.g. means that internal symbols
+   should be hidden and in the implementation-reserved namespace according to
+   ISO C and POSIX rules. If possible the built shared libraries and static
+   library archives should be usable to override libc symbols at link time (or
+   at runtime via LD_PRELOAD). This requires the symbols to follow the glibc
+   ABI (other than symbol versioning); this cannot be done reliably for static
+   linking, so this is a best-effort requirement.
+
+3. API: include headers should be suitable for benchmarking and testing code
+   and should not conflict with libc headers.
+
+
+CONTRIBUTION GUIDELINES FOR math SUB-DIRECTORY
+==============================================
+
+1. Math functions have quality and performance requirements.
+
+2. Quality:
+   - Worst-case ULP error should be small in the entire input domain (for most
+     common double precision scalar functions the target is < 0.66 ULP error,
+     and < 1 ULP for single precision; even a performance-optimized function
+     variant should not have > 5 ULP error if the goal is to be a drop-in
+     replacement for a standard math function).  This should be tested
+     statistically (or on all inputs if that is possible in a reasonable
+     amount of time).  The ulp tool exists for this purpose, and runulp.sh
+     should be updated for new functions (an informal sketch of the
+     measurement is given at the end of this section).
+
+   - All standard rounding modes need to be supported, but in non-default
+     rounding modes the quality requirement can be relaxed.  (Non-nearest
+     rounded computation can be slow and inaccurate but has to be correct for
+     conformance reasons.)
+
+   - Special cases and error handling need to follow ISO C Annex F
+     requirements, POSIX requirements, IEEE 754-2008 requirements and glibc
+     requirements:
+     https://www.gnu.org/software/libc/manual/html_mono/libc.html#Errors-in-Math-Functions
+     This should be tested by direct tests (the glibc test system may be used
+     for this).
+
+   - Error handling code should be decoupled from the approximation code as
+     much as possible.  (There are helper functions that take care of errno as
+     well as exception raising.)
+
+   - Vector math code does not need to work in non-nearest rounding modes, and
+     error handling side effects need not happen (fenv exceptions and errno),
+     but the result should be correct (within quality requirements, which are
+     lower for vector code than for scalar code).
+
+   - Error bounds of the approximation should be clearly documented.
+
+   - The code should build and pass tests on arm, aarch64 and x86_64 GNU/Linux
+     systems.  (Routines and features can be disabled on specific targets, but
+     the build must complete.)  On aarch64, both little- and big-endian targets
+     are supported as well as valid combinations of architecture extensions.
+     The configurations that should be tested depend on the contribution.
+
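+   - As an informal illustration (a hedged sketch, not the project's actual
+     harness; the ulp tool implements a more careful version of this), the
+     ULP error of a float result y against a higher-precision reference r
+     can be estimated by dividing their difference by the ULP spacing at r:
+
+       /* Hypothetical helper, needs <math.h>; assumes r is finite,
+          non-zero and of similar magnitude to y.  */
+       static double
+       ulp_error (float y, double r)
+       {
+         int e;
+         (void) frexp (r, &e);            /* r = m * 2^e, 0.5 <= |m| < 1.  */
+         double one_ulp = ldexp (1.0, e - 24);   /* float ULP near r.  */
+         return ((double) y - r) / one_ulp;
+       }
+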
+3. Performance:
+   - Common math code should be benchmarked on modern aarch64
+     microarchitectures over typical inputs.
+
+   - Performance improvements should be documented (relative numbers can be
+     published; it is enough to use the mathbench microbenchmark tool, which
+     should be updated for new functions).
+
+   - Attention should be paid to the compilation flags: for aarch64, fma
+     contraction should be on and math errno turned off so some builtins can
+     be inlined.
+
+   - The code should be reasonably performant on x86_64 too; e.g. some
+     rounding instructions and fma may not be available on x86_64, so such
+     builtins turn into libc calls with slow code.  Such a slowdown is not
+     acceptable, so a faster fallback should be present (a sketch is given
+     below): glibc and bionic use the same code on all targets.  (This does
+     not apply to vector math code.)
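+   - As a hedged illustration of such a fallback (this mirrors converttoint
+     in pl/math/math_config.h), an integer conversion can avoid a slow libc
+     call when the builtin cannot be inlined:
+
+       /* Sketch: pick the conversion that inlines on the target.  */
+       static inline int32_t
+       to_int (double_t x)
+       {
+       #if HAVE_FAST_LROUND
+         return lround (x);       /* inlines to a single instruction.  */
+       #else
+         return (long) round (x);
+       #endif
+       }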
diff --git a/string/README.contributors b/string/README.contributors
new file mode 100644
index 0000000..0b4a51b
--- /dev/null
+++ b/string/README.contributors
@@ -0,0 +1,30 @@
+STYLE REQUIREMENTS
+==================
+
+1. Most code in this sub-directory is expected to be upstreamed into glibc so
+   the GNU Coding Standard and glibc-specific conventions should be followed
+   to ease upstreaming.
+
+2. ABI and symbols: the code should be written so it is suitable for inclusion
+   into a libc with minimal changes. This e.g. means that internal symbols
+   should be hidden and in the implementation-reserved namespace according to
+   ISO C and POSIX rules. If possible the built shared libraries and static
+   library archives should be usable to override libc symbols at link time (or
+   at runtime via LD_PRELOAD). This requires the symbols to follow the glibc
+   ABI (other than symbol versioning); this cannot be done reliably for static
+   linking, so this is a best-effort requirement.
+
+3. API: include headers should be suitable for benchmarking and testing code
+   and should not conflict with libc headers.
+
+
+CONTRIBUTION GUIDELINES FOR string SUB-DIRECTORY
+================================================
+1. Code:
+   - The assumptions of the code must be clearly documented.
+
+   - Assembly style should be consistent across different implementations.
+
+
+2. Performance:
+   - Benchmarking is needed on several microarchitectures.
-- cgit v1.2.3


From dfa594c3976183897080de6deac2e40fb1506b57 Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy
Date: Fri, 29 Oct 2021 17:17:31 +0100
Subject: Add pl subdirectory

This directory will contain optimized code that has different requirements
from the existing routines and is therefore maintained separately.
---
 README                 |  1 +
 pl/README.contributors | 23 +++++++++++++++++++++++
 2 files changed, 24 insertions(+)
 create mode 100644 pl/README.contributors

diff --git a/README b/README
index 0afca55..e24746d 100644
--- a/README
+++ b/README
@@ -27,6 +27,7 @@ networking/test/ - networking test and benchmark related sources.
 string/ - string routines subproject sources.
 string/include/ - string library public headers.
 string/test/ - string test and benchmark related sources.
+pl/... - separately maintained performance library code.

 The steps to build the target libraries and run the tests:

diff --git a/pl/README.contributors b/pl/README.contributors
new file mode 100644
index 0000000..3af9b1f
--- /dev/null
+++ b/pl/README.contributors
@@ -0,0 +1,23 @@
+Code in this sub-directory should follow the GNU Coding Standard, but it is
+not expected to be upstreamed into glibc without modification, so
+glibc-specific conventions need not be followed.
+
+The requirements for portable code apply to non-portable code with the
+following differences:
+
+
+1. Worst-case ULP error should be encoded in filenames (e.g. sin_u35.c). There
+   are no specific restrictions on acceptable ULP error, but if functions
+   provide significantly less accuracy than portable equivalents then a clear
+   justification for inclusion should be stated in comments at the top of the
+   source file. Error bounds of the approximation should be clearly documented
+   in comments.
+
+2. Functions are assumed to support round-to-nearest mode by default, unless
+   stated; other rounding modes are not required to be provided.
+
+3. Handling of special cases may be relaxed for vector functions. Checking
+   whether each vector lane contains special values such as NaN, Inf or
+   denormal numbers can prove too costly for vector functions. This is often
+   not required since vector functions are typically used along with
+   aggressive compiler optimization flags.
-- cgit v1.2.3


From 4c32619682de9d8632fe039153a3e26a6f095482 Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy
Date: Thu, 10 Feb 2022 11:13:40 +0000
Subject: Add MAINTAINERS file

This file describes who maintains each subdirectory in the project.
---
 MAINTAINERS | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 MAINTAINERS

diff --git a/MAINTAINERS b/MAINTAINERS
new file mode 100644
index 0000000..ed77c6a
--- /dev/null
+++ b/MAINTAINERS
@@ -0,0 +1,11 @@
+/
+	Szabolcs Nagy
+math/
+	Szabolcs Nagy
+networking/
+	Szabolcs Nagy
+pl/
+	Pierre Blanchard
+string/
+	Szabolcs Nagy
+	Wilco Dijkstra
-- cgit v1.2.3


From 82c8c8aeb2875223f63fc83187b0e8dd36fc8afe Mon Sep 17 00:00:00 2001
From: Pierre Blanchard
Date: Fri, 25 Mar 2022 15:04:38 +0000
Subject: Add build system and infra for the pl directory

- pl/ is built from the top-level Makefile by adding pl to SUBS.
- PLSUBS lists all pl/ subdirectories that can be built; it only contains
  math for now. Please modify this list in the top-level config.mk.
- pl libraries and infrastructure are built in build/pl/.
- As a result math/ and pl/math generate separate test and bench binaries.
- Use infrastructure provided in math/test to test and profile pl/math
  routines. The build system ensures the appropriate header files are first
  copied to build/pl/include/test to define wrappers and entries in ulp and
  mathbench.
- Copied scalar erff from math/ to pl/math/ to show the build system is
  functional.
- pl mathlib libraries are built separately from the main/portable mathlib
  libraries and installed alongside them.
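
Illustrative usage (assuming the usual make / make check flow documented
in the top-level README):

    # config.mk
    SUBS = math string networking pl
    PLSUBS = math

    make           # also builds build/pl/lib/libmathlib.{a,so}
    make check     # check-pl/math runs the mathtest, rtest and ulp checks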
---
 Makefile                                 |   3 +
 config.mk.dist                           |   3 +
 math/Dir.mk                              |   9 +-
 math/test/mathbench.c                    | 182 +----------------
 math/test/mathbench_funcs.h              | 100 ++++++++++
 math/test/mathbench_wrappers.h           | 104 ++++++++++
 math/test/mathtest.c                     |   5 +-
 math/test/ulp.c                          | 138 +------------
 math/test/ulp_funcs.h                    |  78 ++++++++
 math/test/ulp_wrappers.h                 |  71 +++++++
 pl/Dir.mk                                |  21 ++
 pl/math/Dir.mk                           | 151 ++++++++++++++
 pl/math/erff_1u5.c                       | 103 ++++++++++
 pl/math/erff_data.c                      |  16 ++
 pl/math/include/mathlib.h                |  26 +++
 pl/math/math_config.h                    | 326 +++++++++++++++++++++++++++++++
 pl/math/math_errf.c                      |  80 ++++++++
 pl/math/test/mathbench_funcs.h           |   7 +
 pl/math/test/mathbench_wrappers.h        |   7 +
 pl/math/test/runulp.sh                   |  45 +++++
 pl/math/test/testcases/directed/erff.tst |  17 ++
 pl/math/test/testcases/random/float.tst  |   6 +
 pl/math/test/ulp_funcs.h                 |   7 +
 pl/math/test/ulp_wrappers.h              |  27 +++
 24 files changed, 1213 insertions(+), 319 deletions(-)
 create mode 100644 math/test/mathbench_funcs.h
 create mode 100644 math/test/mathbench_wrappers.h
 create mode 100644 math/test/ulp_funcs.h
 create mode 100644 math/test/ulp_wrappers.h
 create mode 100644 pl/Dir.mk
 create mode 100644 pl/math/Dir.mk
 create mode 100644 pl/math/erff_1u5.c
 create mode 100644 pl/math/erff_data.c
 create mode 100644 pl/math/include/mathlib.h
 create mode 100644 pl/math/math_config.h
 create mode 100644 pl/math/math_errf.c
 create mode 100644 pl/math/test/mathbench_funcs.h
 create mode 100644 pl/math/test/mathbench_wrappers.h
 create mode 100755 pl/math/test/runulp.sh
 create mode 100644 pl/math/test/testcases/directed/erff.tst
 create mode 100644 pl/math/test/testcases/random/float.tst
 create mode 100644 pl/math/test/ulp_funcs.h
 create mode 100644 pl/math/test/ulp_wrappers.h

diff --git a/Makefile b/Makefile
index 187a729..22323af 100644
--- a/Makefile
+++ b/Makefile
@@ -11,6 +11,7 @@ includedir = $(prefix)/include

 # Configure these in config.mk, do not make changes in this file.
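+# (To enable the separately maintained pl/ subproject, add pl to SUBS
+# and list its subdirectories in PLSUBS; see config.mk.dist.)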
SUBS = math string networking +PLSUBS = math HOST_CC = cc HOST_CFLAGS = -std=c99 -O2 HOST_LDFLAGS = @@ -20,6 +21,7 @@ CPPFLAGS = CFLAGS = -std=c99 -O2 CFLAGS_SHARED = -fPIC CFLAGS_ALL = -Ibuild/include $(CPPFLAGS) $(CFLAGS) +CFLAGS_PL = -Ibuild/pl/include $(CPPFLAGS) $(CFLAGS) -DPL LDFLAGS = LDLIBS = AR = $(CROSS_COMPILE)ar @@ -51,6 +53,7 @@ $(DIRS): mkdir -p $@ $(filter %.os,$(ALL_FILES)): CFLAGS_ALL += $(CFLAGS_SHARED) +$(filter %.os,$(ALL_FILES)): CFLAGS_PL += $(CFLAGS_SHARED) build/%.o: $(srcdir)/%.S $(CC) $(CFLAGS_ALL) -c -o $@ $< diff --git a/config.mk.dist b/config.mk.dist index b33c107..78588de 100644 --- a/config.mk.dist +++ b/config.mk.dist @@ -6,6 +6,9 @@ # Subprojects to build SUBS = math string networking +# Subsubprojects to build if subproject pl is built +PLSUBS = math + # Target architecture: aarch64, arm or x86_64 ARCH = aarch64 diff --git a/math/Dir.mk b/math/Dir.mk index dac636c..534f997 100644 --- a/math/Dir.mk +++ b/math/Dir.mk @@ -15,6 +15,7 @@ math-test-srcs := \ math-test-host-srcs := $(wildcard $(S)/test/rtest/*.[cS]) math-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h)) +math-test-includes := $(patsubst $(S)/%,build/include/%,$(wildcard $(S)/test/*.h)) math-libs := \ build/lib/libmathlib.so \ @@ -42,10 +43,11 @@ math-files := \ $(math-tools) \ $(math-host-tools) \ $(math-includes) \ + $(math-test-includes) \ -all-math: $(math-libs) $(math-tools) $(math-includes) +all-math: $(math-libs) $(math-tools) $(math-includes) $(math-test-includes) -$(math-objs): $(math-includes) +$(math-objs): $(math-includes) $(math-test-includes) $(math-objs): CFLAGS_ALL += $(math-cflags) $(B)/test/mathtest.o: CFLAGS_ALL += -fmath-errno $(math-host-objs): CC = $(HOST_CC) @@ -83,6 +85,9 @@ build/bin/ulp: $(B)/test/ulp.o build/lib/libmathlib.a build/include/%.h: $(S)/include/%.h cp $< $@ +build/include/test/%.h: $(S)/test/%.h + cp $< $@ + build/bin/%.sh: $(S)/test/%.sh cp $< $@ diff --git a/math/test/mathbench.c b/math/test/mathbench.c index 8f305a2..5c8881a 100644 --- a/math/test/mathbench.c +++ b/math/test/mathbench.c @@ -89,7 +89,6 @@ dummyf (float x) { return x; } - #if WANT_VMATH #if __aarch64__ static v_double @@ -116,101 +115,11 @@ __vn_dummyf (v_float x) { return x; } - -__vpcs static v_float -xy__vn_powf (v_float x) -{ - return __vn_powf (x, x); -} - -__vpcs static v_float -xy_Z_powf (v_float x) -{ - return _ZGVnN4vv_powf (x, x); -} - -__vpcs static v_double -xy__vn_pow (v_double x) -{ - return __vn_pow (x, x); -} - -__vpcs static v_double -xy_Z_pow (v_double x) -{ - return _ZGVnN2vv_pow (x, x); -} #endif - -static v_float -xy__v_powf (v_float x) -{ - return __v_powf (x, x); -} - -static v_double -xy__v_pow (v_double x) -{ - return __v_pow (x, x); -} #endif - -static float -xy__s_powf (float x) -{ - return __s_powf (x, x); -} - -static double -xy__s_pow (double x) -{ - return __s_pow (x, x); -} #endif -static double -xypow (double x) -{ - return pow (x, x); -} - -static float -xypowf (float x) -{ - return powf (x, x); -} - -static double -xpow (double x) -{ - return pow (x, 23.4); -} - -static float -xpowf (float x) -{ - return powf (x, 23.4f); -} - -static double -ypow (double x) -{ - return pow (2.34, x); -} - -static float -ypowf (float x) -{ - return powf (2.34f, x); -} - -static float -sincosf_wrap (float x) -{ - float s, c; - sincosf (x, &s, &c); - return s + c; -} +#include "test/mathbench_wrappers.h" static const struct fun { @@ -238,105 +147,18 @@ static const struct fun #define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}}, #define 
VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}}, D (dummy, 1.0, 2.0) -D (exp, -9.9, 9.9) -D (exp, 0.5, 1.0) -D (exp2, -9.9, 9.9) -D (log, 0.01, 11.1) -D (log, 0.999, 1.001) -D (log2, 0.01, 11.1) -D (log2, 0.999, 1.001) -{"pow", 'd', 0, 0.01, 11.1, {.d = xypow}}, -D (xpow, 0.01, 11.1) -D (ypow, -9.9, 9.9) -D (erf, -6.0, 6.0) - F (dummyf, 1.0, 2.0) -F (expf, -9.9, 9.9) -F (exp2f, -9.9, 9.9) -F (logf, 0.01, 11.1) -F (log2f, 0.01, 11.1) -{"powf", 'f', 0, 0.01, 11.1, {.f = xypowf}}, -F (xpowf, 0.01, 11.1) -F (ypowf, -9.9, 9.9) -{"sincosf", 'f', 0, 0.1, 0.7, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 0.8, 3.1, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, -3.1, 3.1, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 3.3, 33.3, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 100, 1000, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 1e6, 1e32, {.f = sincosf_wrap}}, -F (sinf, 0.1, 0.7) -F (sinf, 0.8, 3.1) -F (sinf, -3.1, 3.1) -F (sinf, 3.3, 33.3) -F (sinf, 100, 1000) -F (sinf, 1e6, 1e32) -F (cosf, 0.1, 0.7) -F (cosf, 0.8, 3.1) -F (cosf, -3.1, 3.1) -F (cosf, 3.3, 33.3) -F (cosf, 100, 1000) -F (cosf, 1e6, 1e32) -F (erff, -4.0, 4.0) #if WANT_VMATH -D (__s_sin, -3.1, 3.1) -D (__s_cos, -3.1, 3.1) -D (__s_exp, -9.9, 9.9) -D (__s_log, 0.01, 11.1) -{"__s_pow", 'd', 0, 0.01, 11.1, {.d = xy__s_pow}}, -F (__s_expf, -9.9, 9.9) -F (__s_expf_1u, -9.9, 9.9) -F (__s_exp2f, -9.9, 9.9) -F (__s_exp2f_1u, -9.9, 9.9) -F (__s_logf, 0.01, 11.1) -{"__s_powf", 'f', 0, 0.01, 11.1, {.f = xy__s_powf}}, -F (__s_sinf, -3.1, 3.1) -F (__s_cosf, -3.1, 3.1) #if __aarch64__ VD (__v_dummy, 1.0, 2.0) -VD (__v_sin, -3.1, 3.1) -VD (__v_cos, -3.1, 3.1) -VD (__v_exp, -9.9, 9.9) -VD (__v_log, 0.01, 11.1) -{"__v_pow", 'd', 'v', 0.01, 11.1, {.vd = xy__v_pow}}, VF (__v_dummyf, 1.0, 2.0) -VF (__v_expf, -9.9, 9.9) -VF (__v_expf_1u, -9.9, 9.9) -VF (__v_exp2f, -9.9, 9.9) -VF (__v_exp2f_1u, -9.9, 9.9) -VF (__v_logf, 0.01, 11.1) -{"__v_powf", 'f', 'v', 0.01, 11.1, {.vf = xy__v_powf}}, -VF (__v_sinf, -3.1, 3.1) -VF (__v_cosf, -3.1, 3.1) #ifdef __vpcs VND (__vn_dummy, 1.0, 2.0) -VND (__vn_exp, -9.9, 9.9) -VND (_ZGVnN2v_exp, -9.9, 9.9) -VND (__vn_log, 0.01, 11.1) -VND (_ZGVnN2v_log, 0.01, 11.1) -{"__vn_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy__vn_pow}}, -{"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}}, -VND (__vn_sin, -3.1, 3.1) -VND (_ZGVnN2v_sin, -3.1, 3.1) -VND (__vn_cos, -3.1, 3.1) -VND (_ZGVnN2v_cos, -3.1, 3.1) VNF (__vn_dummyf, 1.0, 2.0) -VNF (__vn_expf, -9.9, 9.9) -VNF (_ZGVnN4v_expf, -9.9, 9.9) -VNF (__vn_expf_1u, -9.9, 9.9) -VNF (__vn_exp2f, -9.9, 9.9) -VNF (_ZGVnN4v_exp2f, -9.9, 9.9) -VNF (__vn_exp2f_1u, -9.9, 9.9) -VNF (__vn_logf, 0.01, 11.1) -VNF (_ZGVnN4v_logf, 0.01, 11.1) -{"__vn_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy__vn_powf}}, -{"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}}, -VNF (__vn_sinf, -3.1, 3.1) -VNF (_ZGVnN4v_sinf, -3.1, 3.1) -VNF (__vn_cosf, -3.1, 3.1) -VNF (_ZGVnN4v_cosf, -3.1, 3.1) #endif #endif #endif +#include "test/mathbench_funcs.h" {0}, #undef F #undef D diff --git a/math/test/mathbench_funcs.h b/math/test/mathbench_funcs.h new file mode 100644 index 0000000..ad6dd2a --- /dev/null +++ b/math/test/mathbench_funcs.h @@ -0,0 +1,100 @@ +/* + * Function entries for mathbench. + * + * Copyright (c) 2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +D (exp, -9.9, 9.9) +D (exp, 0.5, 1.0) +D (exp2, -9.9, 9.9) +D (log, 0.01, 11.1) +D (log, 0.999, 1.001) +D (log2, 0.01, 11.1) +D (log2, 0.999, 1.001) +{"pow", 'd', 0, 0.01, 11.1, {.d = xypow}}, +D (xpow, 0.01, 11.1) +D (ypow, -9.9, 9.9) +D (erf, -6.0, 6.0) + +F (expf, -9.9, 9.9) +F (exp2f, -9.9, 9.9) +F (logf, 0.01, 11.1) +F (log2f, 0.01, 11.1) +{"powf", 'f', 0, 0.01, 11.1, {.f = xypowf}}, +F (xpowf, 0.01, 11.1) +F (ypowf, -9.9, 9.9) +{"sincosf", 'f', 0, 0.1, 0.7, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 0.8, 3.1, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, -3.1, 3.1, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 3.3, 33.3, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 100, 1000, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 1e6, 1e32, {.f = sincosf_wrap}}, +F (sinf, 0.1, 0.7) +F (sinf, 0.8, 3.1) +F (sinf, -3.1, 3.1) +F (sinf, 3.3, 33.3) +F (sinf, 100, 1000) +F (sinf, 1e6, 1e32) +F (cosf, 0.1, 0.7) +F (cosf, 0.8, 3.1) +F (cosf, -3.1, 3.1) +F (cosf, 3.3, 33.3) +F (cosf, 100, 1000) +F (cosf, 1e6, 1e32) +F (erff, -4.0, 4.0) +#if WANT_VMATH +D (__s_sin, -3.1, 3.1) +D (__s_cos, -3.1, 3.1) +D (__s_exp, -9.9, 9.9) +D (__s_log, 0.01, 11.1) +{"__s_pow", 'd', 0, 0.01, 11.1, {.d = xy__s_pow}}, +F (__s_expf, -9.9, 9.9) +F (__s_expf_1u, -9.9, 9.9) +F (__s_exp2f, -9.9, 9.9) +F (__s_exp2f_1u, -9.9, 9.9) +F (__s_logf, 0.01, 11.1) +{"__s_powf", 'f', 0, 0.01, 11.1, {.f = xy__s_powf}}, +F (__s_sinf, -3.1, 3.1) +F (__s_cosf, -3.1, 3.1) +#if __aarch64__ +VD (__v_sin, -3.1, 3.1) +VD (__v_cos, -3.1, 3.1) +VD (__v_exp, -9.9, 9.9) +VD (__v_log, 0.01, 11.1) +{"__v_pow", 'd', 'v', 0.01, 11.1, {.vd = xy__v_pow}}, +VF (__v_expf, -9.9, 9.9) +VF (__v_expf_1u, -9.9, 9.9) +VF (__v_exp2f, -9.9, 9.9) +VF (__v_exp2f_1u, -9.9, 9.9) +VF (__v_logf, 0.01, 11.1) +{"__v_powf", 'f', 'v', 0.01, 11.1, {.vf = xy__v_powf}}, +VF (__v_sinf, -3.1, 3.1) +VF (__v_cosf, -3.1, 3.1) +#ifdef __vpcs +VND (__vn_exp, -9.9, 9.9) +VND (_ZGVnN2v_exp, -9.9, 9.9) +VND (__vn_log, 0.01, 11.1) +VND (_ZGVnN2v_log, 0.01, 11.1) +{"__vn_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy__vn_pow}}, +{"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}}, +VND (__vn_sin, -3.1, 3.1) +VND (_ZGVnN2v_sin, -3.1, 3.1) +VND (__vn_cos, -3.1, 3.1) +VND (_ZGVnN2v_cos, -3.1, 3.1) +VNF (__vn_expf, -9.9, 9.9) +VNF (_ZGVnN4v_expf, -9.9, 9.9) +VNF (__vn_expf_1u, -9.9, 9.9) +VNF (__vn_exp2f, -9.9, 9.9) +VNF (_ZGVnN4v_exp2f, -9.9, 9.9) +VNF (__vn_exp2f_1u, -9.9, 9.9) +VNF (__vn_logf, 0.01, 11.1) +VNF (_ZGVnN4v_logf, 0.01, 11.1) +{"__vn_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy__vn_powf}}, +{"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}}, +VNF (__vn_sinf, -3.1, 3.1) +VNF (_ZGVnN4v_sinf, -3.1, 3.1) +VNF (__vn_cosf, -3.1, 3.1) +VNF (_ZGVnN4v_cosf, -3.1, 3.1) +#endif +#endif +#endif diff --git a/math/test/mathbench_wrappers.h b/math/test/mathbench_wrappers.h new file mode 100644 index 0000000..8311f0f --- /dev/null +++ b/math/test/mathbench_wrappers.h @@ -0,0 +1,104 @@ +/* + * Function wrappers for mathbench. + * + * Copyright (c) 2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#if WANT_VMATH +#if __aarch64__ + +#ifdef __vpcs +__vpcs static v_float +xy__vn_powf (v_float x) +{ + return __vn_powf (x, x); +} + +__vpcs static v_float +xy_Z_powf (v_float x) +{ + return _ZGVnN4vv_powf (x, x); +} + +__vpcs static v_double +xy__vn_pow (v_double x) +{ + return __vn_pow (x, x); +} + +__vpcs static v_double +xy_Z_pow (v_double x) +{ + return _ZGVnN2vv_pow (x, x); +} +#endif // __vpcs + +static v_float +xy__v_powf (v_float x) +{ + return __v_powf (x, x); +} + +static v_double +xy__v_pow (v_double x) +{ + return __v_pow (x, x); +} +#endif // __aarch64__ + +static float +xy__s_powf (float x) +{ + return __s_powf (x, x); +} + +static double +xy__s_pow (double x) +{ + return __s_pow (x, x); +} +#endif // WANT_VMATH + +static double +xypow (double x) +{ + return pow (x, x); +} + +static float +xypowf (float x) +{ + return powf (x, x); +} + +static double +xpow (double x) +{ + return pow (x, 23.4); +} + +static float +xpowf (float x) +{ + return powf (x, 23.4f); +} + +static double +ypow (double x) +{ + return pow (2.34, x); +} + +static float +ypowf (float x) +{ + return powf (2.34f, x); +} + +static float +sincosf_wrap (float x) +{ + float s, c; + sincosf (x, &s, &c); + return s + c; +} diff --git a/math/test/mathtest.c b/math/test/mathtest.c index 85a42a7..21509b2 100644 --- a/math/test/mathtest.c +++ b/math/test/mathtest.c @@ -196,9 +196,11 @@ int is_complex_rettype(int rettype) { #define TFUNCARM(arg,ret,name,tolerance) { t_func, arg, ret, (void*)& ARM_PREFIX(name), m_none, tolerance, #name } #define MFUNC(arg,ret,name,tolerance) { t_macro, arg, ret, NULL, m_##name, tolerance, #name } +#ifndef PL /* sincosf wrappers for easier testing. */ static float sincosf_sinf(float x) { float s,c; sincosf(x, &s, &c); return s; } static float sincosf_cosf(float x) { float s,c; sincosf(x, &s, &c); return c; } +#endif test_func tfuncs[] = { /* trigonometric */ @@ -218,9 +220,10 @@ test_func tfuncs[] = { TFUNCARM(at_s,rt_s, tanf, 4*ULPUNIT), TFUNCARM(at_s,rt_s, sinf, 3*ULPUNIT/4), TFUNCARM(at_s,rt_s, cosf, 3*ULPUNIT/4), +#ifndef PL TFUNCARM(at_s,rt_s, sincosf_sinf, 3*ULPUNIT/4), TFUNCARM(at_s,rt_s, sincosf_cosf, 3*ULPUNIT/4), - +#endif /* hyperbolic */ TFUNC(at_d, rt_d, atanh, 4*ULPUNIT), TFUNC(at_d, rt_d, asinh, 4*ULPUNIT), diff --git a/math/test/ulp.c b/math/test/ulp.c index 336a9be..a38238e 100644 --- a/math/test/ulp.c +++ b/math/test/ulp.c @@ -214,70 +214,7 @@ struct conf double errlim; }; -/* Wrappers for sincos. */ -static float sincosf_sinf(float x) {(void)cosf(x); return sinf(x);} -static float sincosf_cosf(float x) {(void)sinf(x); return cosf(x);} -static double sincos_sin(double x) {(void)cos(x); return sin(x);} -static double sincos_cos(double x) {(void)sin(x); return cos(x);} -#if USE_MPFR -static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y,x,r); return mpfr_sin(y,x,r); } -static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,x,r); return mpfr_cos(y,x,r); } -#endif - -/* A bit of a hack: call vector functions twice with the same - input in lane 0 but a different value in other lanes: once - with an in-range value and then with a special case value. */ -static int secondcall; - -/* Wrappers for vector functions. 
*/ -#if __aarch64__ && WANT_VMATH -typedef __f32x4_t v_float; -typedef __f64x2_t v_double; -static const float fv[2] = {1.0f, -INFINITY}; -static const double dv[2] = {1.0, -INFINITY}; -static inline v_float argf(float x) { return (v_float){x,x,x,fv[secondcall]}; } -static inline v_double argd(double x) { return (v_double){x,dv[secondcall]}; } - -static float v_sinf(float x) { return __v_sinf(argf(x))[0]; } -static float v_cosf(float x) { return __v_cosf(argf(x))[0]; } -static float v_expf_1u(float x) { return __v_expf_1u(argf(x))[0]; } -static float v_expf(float x) { return __v_expf(argf(x))[0]; } -static float v_exp2f_1u(float x) { return __v_exp2f_1u(argf(x))[0]; } -static float v_exp2f(float x) { return __v_exp2f(argf(x))[0]; } -static float v_logf(float x) { return __v_logf(argf(x))[0]; } -static float v_powf(float x, float y) { return __v_powf(argf(x),argf(y))[0]; } -static double v_sin(double x) { return __v_sin(argd(x))[0]; } -static double v_cos(double x) { return __v_cos(argd(x))[0]; } -static double v_exp(double x) { return __v_exp(argd(x))[0]; } -static double v_log(double x) { return __v_log(argd(x))[0]; } -static double v_pow(double x, double y) { return __v_pow(argd(x),argd(y))[0]; } -#ifdef __vpcs -static float vn_sinf(float x) { return __vn_sinf(argf(x))[0]; } -static float vn_cosf(float x) { return __vn_cosf(argf(x))[0]; } -static float vn_expf_1u(float x) { return __vn_expf_1u(argf(x))[0]; } -static float vn_expf(float x) { return __vn_expf(argf(x))[0]; } -static float vn_exp2f_1u(float x) { return __vn_exp2f_1u(argf(x))[0]; } -static float vn_exp2f(float x) { return __vn_exp2f(argf(x))[0]; } -static float vn_logf(float x) { return __vn_logf(argf(x))[0]; } -static float vn_powf(float x, float y) { return __vn_powf(argf(x),argf(y))[0]; } -static double vn_sin(double x) { return __vn_sin(argd(x))[0]; } -static double vn_cos(double x) { return __vn_cos(argd(x))[0]; } -static double vn_exp(double x) { return __vn_exp(argd(x))[0]; } -static double vn_log(double x) { return __vn_log(argd(x))[0]; } -static double vn_pow(double x, double y) { return __vn_pow(argd(x),argd(y))[0]; } -static float Z_sinf(float x) { return _ZGVnN4v_sinf(argf(x))[0]; } -static float Z_cosf(float x) { return _ZGVnN4v_cosf(argf(x))[0]; } -static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; } -static float Z_exp2f(float x) { return _ZGVnN4v_exp2f(argf(x))[0]; } -static float Z_logf(float x) { return _ZGVnN4v_logf(argf(x))[0]; } -static float Z_powf(float x, float y) { return _ZGVnN4vv_powf(argf(x),argf(y))[0]; } -static double Z_sin(double x) { return _ZGVnN2v_sin(argd(x))[0]; } -static double Z_cos(double x) { return _ZGVnN2v_cos(argd(x))[0]; } -static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; } -static double Z_log(double x) { return _ZGVnN2v_log(argd(x))[0]; } -static double Z_pow(double x, double y) { return _ZGVnN2vv_pow(argd(x),argd(y))[0]; } -#endif -#endif +#include "test/ulp_wrappers.h" struct fun { @@ -322,78 +259,7 @@ static const struct fun fun[] = { #define F2(x) F (x##f, x##f, x, mpfr_##x, 2, 1, f2, 0) #define D1(x) F (x, x, x##l, mpfr_##x, 1, 0, d1, 0) #define D2(x) F (x, x, x##l, mpfr_##x, 2, 0, d2, 0) - F1 (sin) - F1 (cos) - F (sincosf_sinf, sincosf_sinf, sincos_sin, sincos_mpfr_sin, 1, 1, f1, 0) - F (sincosf_cosf, sincosf_cosf, sincos_cos, sincos_mpfr_cos, 1, 1, f1, 0) - F1 (exp) - F1 (exp2) - F1 (log) - F1 (log2) - F2 (pow) - F1 (erf) - D1 (exp) - D1 (exp2) - D1 (log) - D1 (log2) - D2 (pow) - D1 (erf) -#if WANT_VMATH - F (__s_sinf, __s_sinf, sin, mpfr_sin, 
1, 1, f1, 0) - F (__s_cosf, __s_cosf, cos, mpfr_cos, 1, 1, f1, 0) - F (__s_expf_1u, __s_expf_1u, exp, mpfr_exp, 1, 1, f1, 0) - F (__s_expf, __s_expf, exp, mpfr_exp, 1, 1, f1, 0) - F (__s_exp2f_1u, __s_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 0) - F (__s_exp2f, __s_exp2f, exp2, mpfr_exp2, 1, 1, f1, 0) - F (__s_powf, __s_powf, pow, mpfr_pow, 2, 1, f2, 0) - F (__s_logf, __s_logf, log, mpfr_log, 1, 1, f1, 0) - F (__s_sin, __s_sin, sinl, mpfr_sin, 1, 0, d1, 0) - F (__s_cos, __s_cos, cosl, mpfr_cos, 1, 0, d1, 0) - F (__s_exp, __s_exp, expl, mpfr_exp, 1, 0, d1, 0) - F (__s_log, __s_log, logl, mpfr_log, 1, 0, d1, 0) - F (__s_pow, __s_pow, powl, mpfr_pow, 2, 0, d2, 0) -#if __aarch64__ - F (__v_sinf, v_sinf, sin, mpfr_sin, 1, 1, f1, 1) - F (__v_cosf, v_cosf, cos, mpfr_cos, 1, 1, f1, 1) - F (__v_expf_1u, v_expf_1u, exp, mpfr_exp, 1, 1, f1, 1) - F (__v_expf, v_expf, exp, mpfr_exp, 1, 1, f1, 1) - F (__v_exp2f_1u, v_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1) - F (__v_exp2f, v_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1) - F (__v_logf, v_logf, log, mpfr_log, 1, 1, f1, 1) - F (__v_powf, v_powf, pow, mpfr_pow, 2, 1, f2, 1) - F (__v_sin, v_sin, sinl, mpfr_sin, 1, 0, d1, 1) - F (__v_cos, v_cos, cosl, mpfr_cos, 1, 0, d1, 1) - F (__v_exp, v_exp, expl, mpfr_exp, 1, 0, d1, 1) - F (__v_log, v_log, logl, mpfr_log, 1, 0, d1, 1) - F (__v_pow, v_pow, powl, mpfr_pow, 2, 0, d2, 1) -#ifdef __vpcs - F (__vn_sinf, vn_sinf, sin, mpfr_sin, 1, 1, f1, 1) - F (__vn_cosf, vn_cosf, cos, mpfr_cos, 1, 1, f1, 1) - F (__vn_expf_1u, vn_expf_1u, exp, mpfr_exp, 1, 1, f1, 1) - F (__vn_expf, vn_expf, exp, mpfr_exp, 1, 1, f1, 1) - F (__vn_exp2f_1u, vn_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1) - F (__vn_exp2f, vn_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1) - F (__vn_logf, vn_logf, log, mpfr_log, 1, 1, f1, 1) - F (__vn_powf, vn_powf, pow, mpfr_pow, 2, 1, f2, 1) - F (__vn_sin, vn_sin, sinl, mpfr_sin, 1, 0, d1, 1) - F (__vn_cos, vn_cos, cosl, mpfr_cos, 1, 0, d1, 1) - F (__vn_exp, vn_exp, expl, mpfr_exp, 1, 0, d1, 1) - F (__vn_log, vn_log, logl, mpfr_log, 1, 0, d1, 1) - F (__vn_pow, vn_pow, powl, mpfr_pow, 2, 0, d2, 1) - F (_ZGVnN4v_sinf, Z_sinf, sin, mpfr_sin, 1, 1, f1, 1) - F (_ZGVnN4v_cosf, Z_cosf, cos, mpfr_cos, 1, 1, f1, 1) - F (_ZGVnN4v_expf, Z_expf, exp, mpfr_exp, 1, 1, f1, 1) - F (_ZGVnN4v_exp2f, Z_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1) - F (_ZGVnN4v_logf, Z_logf, log, mpfr_log, 1, 1, f1, 1) - F (_ZGVnN4vv_powf, Z_powf, pow, mpfr_pow, 2, 1, f2, 1) - F (_ZGVnN2v_sin, Z_sin, sinl, mpfr_sin, 1, 0, d1, 1) - F (_ZGVnN2v_cos, Z_cos, cosl, mpfr_cos, 1, 0, d1, 1) - F (_ZGVnN2v_exp, Z_exp, expl, mpfr_exp, 1, 0, d1, 1) - F (_ZGVnN2v_log, Z_log, logl, mpfr_log, 1, 0, d1, 1) - F (_ZGVnN2vv_pow, Z_pow, powl, mpfr_pow, 2, 0, d2, 1) -#endif -#endif -#endif +#include "test/ulp_funcs.h" #undef F #undef F1 #undef F2 diff --git a/math/test/ulp_funcs.h b/math/test/ulp_funcs.h new file mode 100644 index 0000000..f5cea4d --- /dev/null +++ b/math/test/ulp_funcs.h @@ -0,0 +1,78 @@ +/* + * Function entries for ulp. + * + * Copyright (c) 2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + F1 (sin) + F1 (cos) + F (sincosf_sinf, sincosf_sinf, sincos_sin, sincos_mpfr_sin, 1, 1, f1, 0) + F (sincosf_cosf, sincosf_cosf, sincos_cos, sincos_mpfr_cos, 1, 1, f1, 0) + F1 (exp) + F1 (exp2) + F1 (log) + F1 (log2) + F2 (pow) + F1 (erf) + D1 (exp) + D1 (exp2) + D1 (log) + D1 (log2) + D2 (pow) + D1 (erf) +#if WANT_VMATH + F (__s_sinf, __s_sinf, sin, mpfr_sin, 1, 1, f1, 0) + F (__s_cosf, __s_cosf, cos, mpfr_cos, 1, 1, f1, 0) + F (__s_expf_1u, __s_expf_1u, exp, mpfr_exp, 1, 1, f1, 0) + F (__s_expf, __s_expf, exp, mpfr_exp, 1, 1, f1, 0) + F (__s_exp2f_1u, __s_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 0) + F (__s_exp2f, __s_exp2f, exp2, mpfr_exp2, 1, 1, f1, 0) + F (__s_powf, __s_powf, pow, mpfr_pow, 2, 1, f2, 0) + F (__s_logf, __s_logf, log, mpfr_log, 1, 1, f1, 0) + F (__s_sin, __s_sin, sinl, mpfr_sin, 1, 0, d1, 0) + F (__s_cos, __s_cos, cosl, mpfr_cos, 1, 0, d1, 0) + F (__s_exp, __s_exp, expl, mpfr_exp, 1, 0, d1, 0) + F (__s_log, __s_log, logl, mpfr_log, 1, 0, d1, 0) + F (__s_pow, __s_pow, powl, mpfr_pow, 2, 0, d2, 0) +#if __aarch64__ + F (__v_sinf, v_sinf, sin, mpfr_sin, 1, 1, f1, 1) + F (__v_cosf, v_cosf, cos, mpfr_cos, 1, 1, f1, 1) + F (__v_expf_1u, v_expf_1u, exp, mpfr_exp, 1, 1, f1, 1) + F (__v_expf, v_expf, exp, mpfr_exp, 1, 1, f1, 1) + F (__v_exp2f_1u, v_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1) + F (__v_exp2f, v_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1) + F (__v_logf, v_logf, log, mpfr_log, 1, 1, f1, 1) + F (__v_powf, v_powf, pow, mpfr_pow, 2, 1, f2, 1) + F (__v_sin, v_sin, sinl, mpfr_sin, 1, 0, d1, 1) + F (__v_cos, v_cos, cosl, mpfr_cos, 1, 0, d1, 1) + F (__v_exp, v_exp, expl, mpfr_exp, 1, 0, d1, 1) + F (__v_log, v_log, logl, mpfr_log, 1, 0, d1, 1) + F (__v_pow, v_pow, powl, mpfr_pow, 2, 0, d2, 1) +#ifdef __vpcs + F (__vn_sinf, vn_sinf, sin, mpfr_sin, 1, 1, f1, 1) + F (__vn_cosf, vn_cosf, cos, mpfr_cos, 1, 1, f1, 1) + F (__vn_expf_1u, vn_expf_1u, exp, mpfr_exp, 1, 1, f1, 1) + F (__vn_expf, vn_expf, exp, mpfr_exp, 1, 1, f1, 1) + F (__vn_exp2f_1u, vn_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1) + F (__vn_exp2f, vn_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1) + F (__vn_logf, vn_logf, log, mpfr_log, 1, 1, f1, 1) + F (__vn_powf, vn_powf, pow, mpfr_pow, 2, 1, f2, 1) + F (__vn_sin, vn_sin, sinl, mpfr_sin, 1, 0, d1, 1) + F (__vn_cos, vn_cos, cosl, mpfr_cos, 1, 0, d1, 1) + F (__vn_exp, vn_exp, expl, mpfr_exp, 1, 0, d1, 1) + F (__vn_log, vn_log, logl, mpfr_log, 1, 0, d1, 1) + F (__vn_pow, vn_pow, powl, mpfr_pow, 2, 0, d2, 1) + F (_ZGVnN4v_sinf, Z_sinf, sin, mpfr_sin, 1, 1, f1, 1) + F (_ZGVnN4v_cosf, Z_cosf, cos, mpfr_cos, 1, 1, f1, 1) + F (_ZGVnN4v_expf, Z_expf, exp, mpfr_exp, 1, 1, f1, 1) + F (_ZGVnN4v_exp2f, Z_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1) + F (_ZGVnN4v_logf, Z_logf, log, mpfr_log, 1, 1, f1, 1) + F (_ZGVnN4vv_powf, Z_powf, pow, mpfr_pow, 2, 1, f2, 1) + F (_ZGVnN2v_sin, Z_sin, sinl, mpfr_sin, 1, 0, d1, 1) + F (_ZGVnN2v_cos, Z_cos, cosl, mpfr_cos, 1, 0, d1, 1) + F (_ZGVnN2v_exp, Z_exp, expl, mpfr_exp, 1, 0, d1, 1) + F (_ZGVnN2v_log, Z_log, logl, mpfr_log, 1, 0, d1, 1) + F (_ZGVnN2vv_pow, Z_pow, powl, mpfr_pow, 2, 0, d2, 1) +#endif +#endif +#endif diff --git a/math/test/ulp_wrappers.h b/math/test/ulp_wrappers.h new file mode 100644 index 0000000..10abe0a --- /dev/null +++ b/math/test/ulp_wrappers.h @@ -0,0 +1,71 @@ +/* + * Function wrappers for ulp. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* Wrappers for sincos. 
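+   (ulp compares one return value at a time, hence the separate sin and
+   cos wrappers below.)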
*/ +static float sincosf_sinf(float x) {(void)cosf(x); return sinf(x);} +static float sincosf_cosf(float x) {(void)sinf(x); return cosf(x);} +static double sincos_sin(double x) {(void)cos(x); return sin(x);} +static double sincos_cos(double x) {(void)sin(x); return cos(x);} +#if USE_MPFR +static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y,x,r); return mpfr_sin(y,x,r); } +static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,x,r); return mpfr_cos(y,x,r); } +#endif + +/* A bit of a hack: call vector functions twice with the same + input in lane 0 but a different value in other lanes: once + with an in-range value and then with a special case value. */ +static int secondcall; + +/* Wrappers for vector functions. */ +#if __aarch64__ && WANT_VMATH +typedef __f32x4_t v_float; +typedef __f64x2_t v_double; +static const float fv[2] = {1.0f, -INFINITY}; +static const double dv[2] = {1.0, -INFINITY}; +static inline v_float argf(float x) { return (v_float){x,x,x,fv[secondcall]}; } +static inline v_double argd(double x) { return (v_double){x,dv[secondcall]}; } + +static float v_sinf(float x) { return __v_sinf(argf(x))[0]; } +static float v_cosf(float x) { return __v_cosf(argf(x))[0]; } +static float v_expf_1u(float x) { return __v_expf_1u(argf(x))[0]; } +static float v_expf(float x) { return __v_expf(argf(x))[0]; } +static float v_exp2f_1u(float x) { return __v_exp2f_1u(argf(x))[0]; } +static float v_exp2f(float x) { return __v_exp2f(argf(x))[0]; } +static float v_logf(float x) { return __v_logf(argf(x))[0]; } +static float v_powf(float x, float y) { return __v_powf(argf(x),argf(y))[0]; } +static double v_sin(double x) { return __v_sin(argd(x))[0]; } +static double v_cos(double x) { return __v_cos(argd(x))[0]; } +static double v_exp(double x) { return __v_exp(argd(x))[0]; } +static double v_log(double x) { return __v_log(argd(x))[0]; } +static double v_pow(double x, double y) { return __v_pow(argd(x),argd(y))[0]; } +#ifdef __vpcs +static float vn_sinf(float x) { return __vn_sinf(argf(x))[0]; } +static float vn_cosf(float x) { return __vn_cosf(argf(x))[0]; } +static float vn_expf_1u(float x) { return __vn_expf_1u(argf(x))[0]; } +static float vn_expf(float x) { return __vn_expf(argf(x))[0]; } +static float vn_exp2f_1u(float x) { return __vn_exp2f_1u(argf(x))[0]; } +static float vn_exp2f(float x) { return __vn_exp2f(argf(x))[0]; } +static float vn_logf(float x) { return __vn_logf(argf(x))[0]; } +static float vn_powf(float x, float y) { return __vn_powf(argf(x),argf(y))[0]; } +static double vn_sin(double x) { return __vn_sin(argd(x))[0]; } +static double vn_cos(double x) { return __vn_cos(argd(x))[0]; } +static double vn_exp(double x) { return __vn_exp(argd(x))[0]; } +static double vn_log(double x) { return __vn_log(argd(x))[0]; } +static double vn_pow(double x, double y) { return __vn_pow(argd(x),argd(y))[0]; } +static float Z_sinf(float x) { return _ZGVnN4v_sinf(argf(x))[0]; } +static float Z_cosf(float x) { return _ZGVnN4v_cosf(argf(x))[0]; } +static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; } +static float Z_exp2f(float x) { return _ZGVnN4v_exp2f(argf(x))[0]; } +static float Z_logf(float x) { return _ZGVnN4v_logf(argf(x))[0]; } +static float Z_powf(float x, float y) { return _ZGVnN4vv_powf(argf(x),argf(y))[0]; } +static double Z_sin(double x) { return _ZGVnN2v_sin(argd(x))[0]; } +static double Z_cos(double x) { return _ZGVnN2v_cos(argd(x))[0]; } +static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; } +static double 
Z_log(double x) { return _ZGVnN2v_log(argd(x))[0]; } +static double Z_pow(double x, double y) { return _ZGVnN2vv_pow(argd(x),argd(y))[0]; } +#endif +#endif diff --git a/pl/Dir.mk b/pl/Dir.mk new file mode 100644 index 0000000..2d00779 --- /dev/null +++ b/pl/Dir.mk @@ -0,0 +1,21 @@ +# Makefile fragment - requires GNU make +# +# Copyright (c) 2022, Arm Limited. +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +# These targets are defined if we prescribe pl in SUBS. +# It requires PLSUBS to be set. + +$(foreach sub,$(PLSUBS),$(eval include $(srcdir)/pl/$(sub)/Dir.mk)) + +pl-files := $($(PLSUBS:%=pl/%-files)) + +all-pl: $(PLSUBS:%=all-pl/%) + +check-pl: $(PLSUBS:%=check-pl/%) + +install-pl: $(PLSUBS:%=install-pl/%) + +clean-pl: $(PLSUBS:%=clean-pl/%) + +.PHONY: all-pl check-pl install-pl clean-pl diff --git a/pl/math/Dir.mk b/pl/math/Dir.mk new file mode 100644 index 0000000..13ecf87 --- /dev/null +++ b/pl/math/Dir.mk @@ -0,0 +1,151 @@ +# Makefile fragment - requires GNU make +# +# Copyright (c) 2019-2022, Arm Limited. +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +PLM := $(srcdir)/pl/math +AOR := $(srcdir)/math +B := $(srcdir)/build/pl/math + +math-lib-srcs := $(wildcard $(PLM)/*.[cS]) +math-test-srcs := \ + $(AOR)/test/mathtest.c \ + $(AOR)/test/mathbench.c \ + $(AOR)/test/ulp.c \ + +math-test-host-srcs := $(wildcard $(AOR)/test/rtest/*.[cS]) + +math-includes := $(patsubst $(PLM)/%,build/pl/%,$(wildcard $(PLM)/include/*.h)) +math-test-includes := $(patsubst $(PLM)/%,build/pl/include/%,$(wildcard $(PLM)/test/*.h)) + +math-libs := \ + build/pl/lib/libmathlib.so \ + build/pl/lib/libmathlib.a \ + +math-tools := \ + build/pl/bin/mathtest \ + build/pl/bin/mathbench \ + build/pl/bin/mathbench_libc \ + build/pl/bin/runulp.sh \ + build/pl/bin/ulp \ + +math-host-tools := \ + build/pl/bin/rtest \ + +math-lib-objs := $(patsubst $(PLM)/%,$(B)/%.o,$(basename $(math-lib-srcs))) +math-test-objs := $(patsubst $(AOR)/%,$(B)/%.o,$(basename $(math-test-srcs))) +math-host-objs := $(patsubst $(AOR)/%,$(B)/%.o,$(basename $(math-test-host-srcs))) +math-target-objs := $(math-lib-objs) $(math-test-objs) +math-objs := $(math-target-objs) $(math-target-objs:%.o=%.os) $(math-host-objs) + +pl/math-files := \ + $(math-objs) \ + $(math-libs) \ + $(math-tools) \ + $(math-host-tools) \ + $(math-includes) \ + $(math-test-includes) \ + +all-pl/math: $(math-libs) $(math-tools) $(math-includes) $(math-test-includes) + +$(math-objs): $(math-includes) $(math-test-includes) +$(math-objs): CFLAGS_PL += $(math-cflags) +$(B)/test/mathtest.o: CFLAGS_PL += -fmath-errno +$(math-host-objs): CC = $(HOST_CC) +$(math-host-objs): CFLAGS_PL = $(HOST_CFLAGS) + +$(B)/test/ulp.o: $(AOR)/test/ulp.h + +build/pl/lib/libmathlib.so: $(math-lib-objs:%.o=%.os) + $(CC) $(CFLAGS_PL) $(LDFLAGS) -shared -o $@ $^ + +build/pl/lib/libmathlib.a: $(math-lib-objs) + rm -f $@ + $(AR) rc $@ $^ + $(RANLIB) $@ + +$(math-host-tools): HOST_LDLIBS += -lm -lmpfr -lmpc +$(math-tools): LDLIBS += $(math-ldlibs) -lm + +# Some targets to build pl/math/test from math/test sources +build/pl/math/test/%.o: $(srcdir)/math/test/%.S + $(CC) $(CFLAGS_PL) -c -o $@ $< + +build/pl/math/test/%.o: $(srcdir)/math/test/%.c + $(CC) $(CFLAGS_PL) -c -o $@ $< + +build/pl/math/test/%.os: $(srcdir)/math/test/%.S + $(CC) $(CFLAGS_PL) -c -o $@ $< + +build/pl/math/test/%.os: $(srcdir)/math/test/%.c + $(CC) $(CFLAGS_PL) -c -o $@ $< + +# Some targets to build pl/ sources using appropriate flags +build/pl/%.o: $(srcdir)/pl/%.S + $(CC) $(CFLAGS_PL) -c -o $@ $< + 
+build/pl/%.o: $(srcdir)/pl/%.c + $(CC) $(CFLAGS_PL) -c -o $@ $< + +build/pl/%.os: $(srcdir)/pl/%.S + $(CC) $(CFLAGS_PL) -c -o $@ $< + +build/pl/%.os: $(srcdir)/pl/%.c + $(CC) $(CFLAGS_PL) -c -o $@ $< + +build/pl/bin/rtest: $(math-host-objs) + $(HOST_CC) $(HOST_CFLAGS) $(HOST_LDFLAGS) -o $@ $^ $(HOST_LDLIBS) + +build/pl/bin/mathtest: $(B)/test/mathtest.o build/pl/lib/libmathlib.a + $(CC) $(CFLAGS_PL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS) + +build/pl/bin/mathbench: $(B)/test/mathbench.o build/pl/lib/libmathlib.a + $(CC) $(CFLAGS_PL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS) + +# This is not ideal, but allows custom symbols in mathbench to get resolved. +build/pl/bin/mathbench_libc: $(B)/test/mathbench.o build/pl/lib/libmathlib.a + $(CC) $(CFLAGS_PL) $(LDFLAGS) -static -o $@ $< $(LDLIBS) -lc build/pl/lib/libmathlib.a -lm + +build/pl/bin/ulp: $(B)/test/ulp.o build/pl/lib/libmathlib.a + $(CC) $(CFLAGS_PL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS) + +build/pl/include/%.h: $(PLM)/include/%.h + cp $< $@ + +build/pl/include/test/%.h: $(PLM)/test/%.h + cp $< $@ + +build/pl/bin/%.sh: $(PLM)/test/%.sh + cp $< $@ + +math-tests := $(wildcard $(PLM)/test/testcases/directed/*.tst) +math-rtests := $(wildcard $(PLM)/test/testcases/random/*.tst) + +check-pl/math-test: $(math-tools) + cat $(math-tests) | $(EMULATOR) build/pl/bin/mathtest $(math-testflags) + +check-pl/math-rtest: $(math-host-tools) $(math-tools) + cat $(math-rtests) | build/pl/bin/rtest | $(EMULATOR) build/pl/bin/mathtest $(math-testflags) + +check-pl/math-ulp: $(math-tools) + ULPFLAGS="$(math-ulpflags)" build/pl/bin/runulp.sh $(EMULATOR) + +check-pl/math: check-pl/math-test check-pl/math-rtest check-pl/math-ulp + +$(DESTDIR)$(libdir)/pl/%.so: build/pl/lib/%.so + $(INSTALL) -D $< $@ + +$(DESTDIR)$(libdir)/pl/%: build/pl/lib/% + $(INSTALL) -m 644 -D $< $@ + +$(DESTDIR)$(includedir)/pl/%: build/pl/include/% + $(INSTALL) -m 644 -D $< $@ + +install-pl/math: \ + $(math-libs:build/pl/lib/%=$(DESTDIR)$(libdir)/pl/%) \ + $(math-includes:build/pl/include/%=$(DESTDIR)$(includedir)/pl/%) + +clean-pl/math: + rm -f $(pl/math-files) + +.PHONY: all-pl/math check-pl/math-test check-pl/math-rtest check-pl/math-ulp check-pl/math install-pl/math clean-pl/math diff --git a/pl/math/erff_1u5.c b/pl/math/erff_1u5.c new file mode 100644 index 0000000..1073603 --- /dev/null +++ b/pl/math/erff_1u5.c @@ -0,0 +1,103 @@ +/* + * Single-precision erf(x) function. + * + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include +#include +#include "math_config.h" + +#define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f +#define A __erff_data.erff_poly_A +#define B __erff_data.erff_poly_B + +/* Top 12 bits of a float. */ +static inline uint32_t +top12 (float x) +{ + return asuint (x) >> 20; +} + +/* Efficient implementation of erff using either a pure polynomial approximation + or the exponential of a polynomial. Worst-case error is 1.09ulps at + 0x1.c111acp-1. */ +float +erff (float x) +{ + float r, x2, u; + + /* Get top word. */ + uint32_t ix = asuint (x); + uint32_t sign = ix >> 31; + uint32_t ia12 = top12 (x) & 0x7ff; + + /* Limit of both intervals is 0.875 for performance reasons but coefficients + computed on [0.0, 0.921875] and [0.921875, 4.0], which brought accuracy + from 0.94 to 1.1ulps. */ + if (ia12 < 0x3f6) + { /* a = |x| < 0.875. */ + + /* Tiny and subnormal cases. */ + if (unlikely (ia12 < 0x318)) + { /* |x| < 2^(-28). */ + if (unlikely (ia12 < 0x040)) + { /* |x| < 2^(-119). 
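+                The result underflows here: it is computed with fmaf and
+                routed through check_uflowf so that errno and the underflow
+                exception are reported when that behaviour is enabled.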
*/ + float y = fmaf (TwoOverSqrtPiMinusOne, x, x); + return check_uflowf (y); + } + return x + TwoOverSqrtPiMinusOne * x; + } + + x2 = x * x; + + /* Normalized cases (|x| < 0.921875) - Use Horner scheme for x+x*P(x^2). + */ + r = A[5]; + r = fmaf (r, x2, A[4]); + r = fmaf (r, x2, A[3]); + r = fmaf (r, x2, A[2]); + r = fmaf (r, x2, A[1]); + r = fmaf (r, x2, A[0]); + r = fmaf (r, x, x); + } + else if (ia12 < 0x408) + { /* |x| < 4.0 - Use a custom Estrin scheme. */ + + float a = fabsf (x); + /* Use Estrin scheme on high order (small magnitude) coefficients. */ + r = fmaf (B[6], a, B[5]); + u = fmaf (B[4], a, B[3]); + x2 = x * x; + r = fmaf (r, x2, u); + /* Then switch to pure Horner scheme. */ + r = fmaf (r, a, B[2]); + r = fmaf (r, a, B[1]); + r = fmaf (r, a, B[0]); + r = fmaf (r, a, a); + /* Single precision exponential with ~0.5ulps ensures erff has maximum + relative error below 1ulp on [0.921875, 4.0] and below 1.1ulps on + [0.875, 4.0]. */ + r = expf (-r); + /* Explicit copysign (calling copysignf increases latency). */ + if (sign) + r = -1.0f + r; + else + r = 1.0f - r; + } + else + { /* |x| >= 4.0. */ + + /* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1. */ + if (unlikely (ia12 >= 0x7f8)) + return (1.f - (float) ((ix >> 31) << 1)) + 1.f / x; + + /* Explicit copysign (calling copysignf increases latency). */ + if (sign) + r = -1.0f; + else + r = 1.0f; + } + return r; +} diff --git a/pl/math/erff_data.c b/pl/math/erff_data.c new file mode 100644 index 0000000..eeb0b20 --- /dev/null +++ b/pl/math/erff_data.c @@ -0,0 +1,16 @@ +/* + * Data for approximation of erff. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Minimax approximation of erff. */ +const struct erff_data __erff_data + = {.erff_poly_A = {0x1.06eba6p-03f, -0x1.8126e0p-02f, 0x1.ce1a46p-04f, + -0x1.b68bd2p-06f, 0x1.473f48p-08f, -0x1.3a1a82p-11f}, + .erff_poly_B + = {0x1.079d0cp-3f, 0x1.450aa0p-1f, 0x1.b55cb0p-4f, -0x1.8d6300p-6f, + 0x1.fd1336p-9f, -0x1.91d2ccp-12f, 0x1.222900p-16f}}; diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h new file mode 100644 index 0000000..fc8eb0b --- /dev/null +++ b/pl/math/include/mathlib.h @@ -0,0 +1,26 @@ +/* + * Public API. + * + * Copyright (c) 2015-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef _MATHLIB_H +#define _MATHLIB_H + +float erff (float); + +#if __aarch64__ +#if __GNUC__ >= 5 +typedef __Float32x4_t __f32x4_t; +typedef __Float64x2_t __f64x2_t; +#elif __clang_major__*100+__clang_minor__ >= 305 +typedef __attribute__((__neon_vector_type__(4))) float __f32x4_t; +typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; +#else +#error Unsupported compiler +#endif + +#endif + +#endif diff --git a/pl/math/math_config.h b/pl/math/math_config.h new file mode 100644 index 0000000..0790416 --- /dev/null +++ b/pl/math/math_config.h @@ -0,0 +1,326 @@ +/* + * Configuration for math routines. + * + * Copyright (c) 2017-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef _MATH_CONFIG_H +#define _MATH_CONFIG_H + +#include +#include + +#ifndef WANT_ROUNDING +/* If defined to 1, return correct results for special cases in non-nearest + rounding modes (logf (1.0f) returns 0.0f with FE_DOWNWARD rather than -0.0f). + This may be set to 0 if there is no fenv support or if math functions only + get called in round to nearest mode. 
*/ +# define WANT_ROUNDING 1 +#endif +#ifndef WANT_ERRNO +/* If defined to 1, set errno in math functions according to ISO C. Many math + libraries do not set errno, so this is 0 by default. It may need to be + set to 1 if math.h has (math_errhandling & MATH_ERRNO) != 0. */ +# define WANT_ERRNO 0 +#endif +#ifndef WANT_ERRNO_UFLOW +/* Set errno to ERANGE if result underflows to 0 (in all rounding modes). */ +# define WANT_ERRNO_UFLOW (WANT_ROUNDING && WANT_ERRNO) +#endif + +/* Compiler can inline round as a single instruction. */ +#ifndef HAVE_FAST_ROUND +# if __aarch64__ +# define HAVE_FAST_ROUND 1 +# else +# define HAVE_FAST_ROUND 0 +# endif +#endif + +/* Compiler can inline lround, but not (long)round(x). */ +#ifndef HAVE_FAST_LROUND +# if __aarch64__ && (100*__GNUC__ + __GNUC_MINOR__) >= 408 && __NO_MATH_ERRNO__ +# define HAVE_FAST_LROUND 1 +# else +# define HAVE_FAST_LROUND 0 +# endif +#endif + +/* Compiler can inline fma as a single instruction. */ +#ifndef HAVE_FAST_FMA +# if defined FP_FAST_FMA || __aarch64__ +# define HAVE_FAST_FMA 1 +# else +# define HAVE_FAST_FMA 0 +# endif +#endif + +/* Provide *_finite symbols and some of the glibc hidden symbols + so libmathlib can be used with binaries compiled against glibc + to interpose math functions with both static and dynamic linking. */ +#ifndef USE_GLIBC_ABI +# if __GNUC__ +# define USE_GLIBC_ABI 1 +# else +# define USE_GLIBC_ABI 0 +# endif +#endif + +/* Optionally used extensions. */ +#ifdef __GNUC__ +# define HIDDEN __attribute__ ((__visibility__ ("hidden"))) +# define NOINLINE __attribute__ ((noinline)) +# define UNUSED __attribute__ ((unused)) +# define likely(x) __builtin_expect (!!(x), 1) +# define unlikely(x) __builtin_expect (x, 0) +# if __GNUC__ >= 9 +# define attribute_copy(f) __attribute__ ((copy (f))) +# else +# define attribute_copy(f) +# endif +# define strong_alias(f, a) \ + extern __typeof (f) a __attribute__ ((alias (#f))) attribute_copy (f); +# define hidden_alias(f, a) \ + extern __typeof (f) a __attribute__ ((alias (#f), visibility ("hidden"))) \ + attribute_copy (f); +#else +# define HIDDEN +# define NOINLINE +# define UNUSED +# define likely(x) (x) +# define unlikely(x) (x) +#endif + +#if HAVE_FAST_ROUND +/* When set, the roundtoint and converttoint functions are provided with + the semantics documented below. */ +# define TOINT_INTRINSICS 1 + +/* Round x to nearest int in all rounding modes, ties have to be rounded + consistently with converttoint so the results match. If the result + would be outside of [-2^31, 2^31-1] then the semantics is unspecified. */ +static inline double_t +roundtoint (double_t x) +{ + return round (x); +} + +/* Convert x to nearest int in all rounding modes, ties have to be rounded + consistently with roundtoint. If the result is not representible in an + int32_t then the semantics is unspecified. 
*/ +static inline int32_t +converttoint (double_t x) +{ +# if HAVE_FAST_LROUND + return lround (x); +# else + return (long) round (x); +# endif +} +#endif + +static inline uint32_t +asuint (float f) +{ + union + { + float f; + uint32_t i; + } u = {f}; + return u.i; +} + +static inline float +asfloat (uint32_t i) +{ + union + { + uint32_t i; + float f; + } u = {i}; + return u.f; +} + +static inline uint64_t +asuint64 (double f) +{ + union + { + double f; + uint64_t i; + } u = {f}; + return u.i; +} + +static inline double +asdouble (uint64_t i) +{ + union + { + uint64_t i; + double f; + } u = {i}; + return u.f; +} + +#ifndef IEEE_754_2008_SNAN +# define IEEE_754_2008_SNAN 1 +#endif +static inline int +issignalingf_inline (float x) +{ + uint32_t ix = asuint (x); + if (!IEEE_754_2008_SNAN) + return (ix & 0x7fc00000) == 0x7fc00000; + return 2 * (ix ^ 0x00400000) > 2u * 0x7fc00000; +} + +static inline int +issignaling_inline (double x) +{ + uint64_t ix = asuint64 (x); + if (!IEEE_754_2008_SNAN) + return (ix & 0x7ff8000000000000) == 0x7ff8000000000000; + return 2 * (ix ^ 0x0008000000000000) > 2 * 0x7ff8000000000000ULL; +} + +#if __aarch64__ && __GNUC__ +/* Prevent the optimization of a floating-point expression. */ +static inline float +opt_barrier_float (float x) +{ + __asm__ __volatile__ ("" : "+w" (x)); + return x; +} +static inline double +opt_barrier_double (double x) +{ + __asm__ __volatile__ ("" : "+w" (x)); + return x; +} +/* Force the evaluation of a floating-point expression for its side-effect. */ +static inline void +force_eval_float (float x) +{ + __asm__ __volatile__ ("" : "+w" (x)); +} +static inline void +force_eval_double (double x) +{ + __asm__ __volatile__ ("" : "+w" (x)); +} +#else +static inline float +opt_barrier_float (float x) +{ + volatile float y = x; + return y; +} +static inline double +opt_barrier_double (double x) +{ + volatile double y = x; + return y; +} +static inline void +force_eval_float (float x) +{ + volatile float y UNUSED = x; +} +static inline void +force_eval_double (double x) +{ + volatile double y UNUSED = x; +} +#endif + +/* Evaluate an expression as the specified type, normally a type + cast should be enough, but compilers implement non-standard + excess-precision handling, so when FLT_EVAL_METHOD != 0 then + these functions may need to be customized. */ +static inline float +eval_as_float (float x) +{ + return x; +} +static inline double +eval_as_double (double x) +{ + return x; +} + +/* Error handling tail calls for special cases, with a sign argument. + The sign of the return value is set if the argument is non-zero. */ + +/* The result overflows. */ +HIDDEN float __math_oflowf (uint32_t); +/* The result underflows to 0 in nearest rounding mode. */ +HIDDEN float __math_uflowf (uint32_t); +/* The result underflows to 0 in some directed rounding mode only. */ +HIDDEN float __math_may_uflowf (uint32_t); +/* Division by zero. */ +HIDDEN float __math_divzerof (uint32_t); +/* The result overflows. */ +HIDDEN double __math_oflow (uint32_t); +/* The result underflows to 0 in nearest rounding mode. */ +HIDDEN double __math_uflow (uint32_t); +/* The result underflows to 0 in some directed rounding mode only. */ +HIDDEN double __math_may_uflow (uint32_t); +/* Division by zero. */ +HIDDEN double __math_divzero (uint32_t); + +/* Error handling using input checking. */ + +/* Invalid input unless it is a quiet NaN. */ +HIDDEN float __math_invalidf (float); +/* Invalid input unless it is a quiet NaN. 
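+   (A quiet NaN is returned and, when WANT_ERRNO, errno is set to EDOM
+   for non-NaN arguments.)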
*/ +HIDDEN double __math_invalid (double); + +/* Error handling using output checking, only for errno setting. */ + +/* Check if the result overflowed to infinity. */ +HIDDEN double __math_check_oflow (double); +/* Check if the result underflowed to 0. */ +HIDDEN double __math_check_uflow (double); + +/* Check if the result overflowed to infinity. */ +static inline double +check_oflow (double x) +{ + return WANT_ERRNO ? __math_check_oflow (x) : x; +} + +/* Check if the result underflowed to 0. */ +static inline double +check_uflow (double x) +{ + return WANT_ERRNO ? __math_check_uflow (x) : x; +} + +/* Check if the result overflowed to infinity. */ +HIDDEN float __math_check_oflowf (float); +/* Check if the result underflowed to 0. */ +HIDDEN float __math_check_uflowf (float); + +/* Check if the result overflowed to infinity. */ +static inline float +check_oflowf (float x) +{ + return WANT_ERRNO ? __math_check_oflowf (x) : x; +} + +/* Check if the result underflowed to 0. */ +static inline float +check_uflowf (float x) +{ + return WANT_ERRNO ? __math_check_uflowf (x) : x; +} + +extern const struct erff_data +{ + float erff_poly_A[6]; + float erff_poly_B[7]; +} __erff_data HIDDEN; + +#endif diff --git a/pl/math/math_errf.c b/pl/math/math_errf.c new file mode 100644 index 0000000..f2aad46 --- /dev/null +++ b/pl/math/math_errf.c @@ -0,0 +1,80 @@ +/* + * Single-precision math error handling. + * + * Copyright (c) 2017-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#if WANT_ERRNO +#include +/* NOINLINE reduces code size and avoids making math functions non-leaf + when the error handling is inlined. */ +NOINLINE static float +with_errnof (float y, int e) +{ + errno = e; + return y; +} +#else +#define with_errnof(x, e) (x) +#endif + +/* NOINLINE reduces code size. */ +NOINLINE static float +xflowf (uint32_t sign, float y) +{ + y = eval_as_float (opt_barrier_float (sign ? -y : y) * y); + return with_errnof (y, ERANGE); +} + +HIDDEN float +__math_uflowf (uint32_t sign) +{ + return xflowf (sign, 0x1p-95f); +} + +#if WANT_ERRNO_UFLOW +/* Underflows to zero in some non-nearest rounding mode, setting errno + is valid even if the result is non-zero, but in the subnormal range. */ +HIDDEN float +__math_may_uflowf (uint32_t sign) +{ + return xflowf (sign, 0x1.4p-75f); +} +#endif + +HIDDEN float +__math_oflowf (uint32_t sign) +{ + return xflowf (sign, 0x1p97f); +} + +HIDDEN float +__math_divzerof (uint32_t sign) +{ + float y = opt_barrier_float (sign ? -1.0f : 1.0f) / 0.0f; + return with_errnof (y, ERANGE); +} + +HIDDEN float +__math_invalidf (float x) +{ + float y = (x - x) / (x - x); + return isnan (x) ? y : with_errnof (y, EDOM); +} + +/* Check result and set errno if necessary. */ + +HIDDEN float +__math_check_uflowf (float y) +{ + return y == 0.0f ? with_errnof (y, ERANGE) : y; +} + +HIDDEN float +__math_check_oflowf (float y) +{ + return isinf (y) ? with_errnof (y, ERANGE) : y; +} diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h new file mode 100644 index 0000000..64c1300 --- /dev/null +++ b/pl/math/test/mathbench_funcs.h @@ -0,0 +1,7 @@ +/* + * Function entries for mathbench. + * + * Copyright (c) 2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +F (erff, -4.0, 4.0) diff --git a/pl/math/test/mathbench_wrappers.h b/pl/math/test/mathbench_wrappers.h new file mode 100644 index 0000000..8f85079 --- /dev/null +++ b/pl/math/test/mathbench_wrappers.h @@ -0,0 +1,7 @@ +/* + * Function wrappers for mathbench. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh new file mode 100755 index 0000000..5d29f21 --- /dev/null +++ b/pl/math/test/runulp.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +# ULP error check script. +# +# Copyright (c) 2019-2022, Arm Limited. +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +#set -x +set -eu + +# cd to bin directory. +cd "${0%/*}" + +rmodes='n' +flags="${ULPFLAGS:--q}" +emu="$@" + +FAIL=0 +PASS=0 + +t() { + [ $r = "n" ] && Lt=$L || Lt=$Ldir + $emu ./ulp -r $r -e $Lt $flags "$@" && PASS=$((PASS+1)) || FAIL=$((FAIL+1)) +} + +Ldir=0.5 +for r in $rmodes +do + +L=0.6 +Ldir=0.9 +t erff 0 0xffff0000 10000 +t erff 0x1p-127 0x1p-26 40000 +t erff -0x1p-127 -0x1p-26 40000 +t erff 0x1p-26 0x1p3 40000 +t erff -0x1p-26 -0x1p3 40000 +t erff 0 inf 40000 +Ldir=0.5 + +done + +[ 0 -eq $FAIL ] || { + echo "FAILED $FAIL PASSED $PASS" + exit 1 +} diff --git a/pl/math/test/testcases/directed/erff.tst b/pl/math/test/testcases/directed/erff.tst new file mode 100644 index 0000000..48a3d6e --- /dev/null +++ b/pl/math/test/testcases/directed/erff.tst @@ -0,0 +1,17 @@ +; erff.tst +; +; Copyright (c) 2007-2022, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=erff op1=7fc00001 result=7fc00001 errno=0 +func=erff op1=ffc00001 result=7fc00001 errno=0 +func=erff op1=7f800001 result=7fc00001 errno=0 status=i +func=erff op1=ff800001 result=7fc00001 errno=0 status=i +func=erff op1=7f800000 result=3f800000 errno=0 +func=erff op1=ff800000 result=bf800000 errno=0 +func=erff op1=00000000 result=00000000 errno=ERANGE +func=erff op1=80000000 result=80000000 errno=ERANGE +func=erff op1=00000001 result=00000001 errno=0 status=ux +func=erff op1=80000001 result=80000001 errno=0 status=ux +func=erff op1=3f800000 result=3f57bb3d.3a0 errno=0 +func=erff op1=bf800000 result=bf57bb3d.3a0 errno=0 diff --git a/pl/math/test/testcases/random/float.tst b/pl/math/test/testcases/random/float.tst new file mode 100644 index 0000000..caf0bd3 --- /dev/null +++ b/pl/math/test/testcases/random/float.tst @@ -0,0 +1,6 @@ +!! float.tst - Random test case specification for SP functions +!! +!! Copyright (c) 2022, Arm Limited. +!! SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +test erff 10000 diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h new file mode 100644 index 0000000..68d9ec7 --- /dev/null +++ b/pl/math/test/ulp_funcs.h @@ -0,0 +1,7 @@ +/* + * Function entries for ulp. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +F1 (erf) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h new file mode 100644 index 0000000..38e0f63 --- /dev/null +++ b/pl/math/test/ulp_wrappers.h @@ -0,0 +1,27 @@ +/* + * Function wrappers for ulp. + * + * Copyright (c) 2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#if USE_MPFR +static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y,x,r); return mpfr_sin(y,x,r); } +static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,x,r); return mpfr_cos(y,x,r); } +#endif + +/* A bit of a hack: call vector functions twice with the same + input in lane 0 but a different value in other lanes: once + with an in-range value and then with a special case value. */ +static int secondcall; + +/* Wrappers for vector functions. */ +#if __aarch64__ && WANT_VMATH +typedef __f32x4_t v_float; +typedef __f64x2_t v_double; +static const float fv[2] = {1.0f, -INFINITY}; +static const double dv[2] = {1.0, -INFINITY}; +static inline v_float argf(float x) { return (v_float){x,x,x,fv[secondcall]}; } +static inline v_double argd(double x) { return (v_double){x,dv[secondcall]}; } + +#endif -- cgit v1.2.3 From e9e23e58ca7404802d4f1e752cef38861771df2d Mon Sep 17 00:00:00 2001 From: Pierre Blanchard Date: Mon, 11 Apr 2022 14:42:01 +0100 Subject: pl/math: Add scalar log10f - Scalar log10f simply implemented as log10(x):=log(x)*invlog10. - The implementation of the log is similar to that of math/. - The maximum measured ULP error is about 0.80ulp. --- pl/math/include/mathlib.h | 1 + pl/math/log10f.c | 90 ++++++++++++++++++++++++++++++ pl/math/logf_data.c | 36 ++++++++++++ pl/math/math_config.h | 18 ++++-- pl/math/math_errf.c | 2 - pl/math/test/mathbench_funcs.h | 1 + pl/math/test/runulp.sh | 9 ++- pl/math/test/testcases/directed/log10f.tst | 69 +++++++++++++++++++++++ pl/math/test/testcases/random/float.tst | 1 + pl/math/test/ulp_funcs.h | 1 + pl/math/tools/log10f.sollya | 46 +++++++++++++++ 11 files changed, 267 insertions(+), 7 deletions(-) create mode 100644 pl/math/log10f.c create mode 100644 pl/math/logf_data.c create mode 100644 pl/math/test/testcases/directed/log10f.tst create mode 100644 pl/math/tools/log10f.sollya diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index fc8eb0b..2da8d7f 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -9,6 +9,7 @@ #define _MATHLIB_H float erff (float); +float log10f (float); #if __aarch64__ #if __GNUC__ >= 5 diff --git a/pl/math/log10f.c b/pl/math/log10f.c new file mode 100644 index 0000000..79f5d12 --- /dev/null +++ b/pl/math/log10f.c @@ -0,0 +1,90 @@ +/* + * Single-precision log10 function. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include +#include + +/* Data associated to logf: + + LOGF_TABLE_BITS = 4 + LOGF_POLY_ORDER = 4 + + ULP error: 0.818 (nearest rounding.) + Relative error: 1.957 * 2^-26 (before rounding.). */ + +#define T __logf_data.tab +#define A __logf_data.poly +#define Ln2 __logf_data.ln2 +#define InvLn10 __logf_data.invln10 +#define N (1 << LOGF_TABLE_BITS) +#define OFF 0x3f330000 + +/* This naive implementation of log10f mimics that of log + then simply scales the result by 1/log(10) to switch from base e to + base 10. Hence, most computations are carried out in double precision. + Scaling before rounding to single precision is both faster and more accurate. + + ULP error: 0.797 ulp (nearest rounding.). */ +float +log10f (float x) +{ + /* double_t for better performance on targets with FLT_EVAL_METHOD==2. 
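+     (With FLT_EVAL_METHOD == 2, double_t is long double, so using it for
+     intermediates avoids repeated narrowing to double.)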
*/ + double_t z, r, r2, y, y0, invc, logc; + uint32_t ix, iz, tmp; + int k, i; + + ix = asuint (x); +#if WANT_ROUNDING + /* Fix sign of zero with downward rounding when x==1. */ + if (unlikely (ix == 0x3f800000)) + return 0; +#endif + if (unlikely (ix - 0x00800000 >= 0x7f800000 - 0x00800000)) + { + /* x < 0x1p-126 or inf or nan. */ + if (ix * 2 == 0) + return __math_divzerof (1); + if (ix == 0x7f800000) /* log(inf) == inf. */ + return x; + if ((ix & 0x80000000) || ix * 2 >= 0xff000000) + return __math_invalidf (x); + /* x is subnormal, normalize it. */ + ix = asuint (x * 0x1p23f); + ix -= 23 << 23; + } + + /* x = 2^k z; where z is in range [OFF,2*OFF] and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = ix - OFF; + i = (tmp >> (23 - LOGF_TABLE_BITS)) % N; + k = (int32_t) tmp >> 23; /* arithmetic shift. */ + iz = ix - (tmp & 0x1ff << 23); + invc = T[i].invc; + logc = T[i].logc; + z = (double_t) asfloat (iz); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ + r = z * invc - 1; + y0 = logc + (double_t) k * Ln2; + + /* Pipelined polynomial evaluation to approximate log1p(r). */ + r2 = r * r; + y = A[1] * r + A[2]; + y = A[0] * r2 + y; + y = y * r2 + (y0 + r); + + /* Multiply by 1/log(10). */ + y = y * InvLn10; + + return eval_as_float (y); +} +#if USE_GLIBC_ABI +strong_alias (log10f, __log10f_finite) +hidden_alias (log10f, __ieee754_log10f) +#endif diff --git a/pl/math/logf_data.c b/pl/math/logf_data.c new file mode 100644 index 0000000..279a265 --- /dev/null +++ b/pl/math/logf_data.c @@ -0,0 +1,36 @@ +/* + * Data definition for logf and log10f. + * + * Copyright (c) 2017-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +const struct logf_data __logf_data = { + .tab = + { + {0x1.661ec79f8f3bep+0, -0x1.57bf7808caadep-2}, + {0x1.571ed4aaf883dp+0, -0x1.2bef0a7c06ddbp-2}, + {0x1.49539f0f010bp+0, -0x1.01eae7f513a67p-2}, + {0x1.3c995b0b80385p+0, -0x1.b31d8a68224e9p-3}, + {0x1.30d190c8864a5p+0, -0x1.6574f0ac07758p-3}, + {0x1.25e227b0b8eap+0, -0x1.1aa2bc79c81p-3}, + {0x1.1bb4a4a1a343fp+0, -0x1.a4e76ce8c0e5ep-4}, + {0x1.12358f08ae5bap+0, -0x1.1973c5a611cccp-4}, + {0x1.0953f419900a7p+0, -0x1.252f438e10c1ep-5}, + {0x1p+0, 0x0p+0}, + {0x1.e608cfd9a47acp-1, 0x1.aa5aa5df25984p-5}, + {0x1.ca4b31f026aap-1, 0x1.c5e53aa362eb4p-4}, + {0x1.b2036576afce6p-1, 0x1.526e57720db08p-3}, + {0x1.9c2d163a1aa2dp-1, 0x1.bc2860d22477p-3}, + {0x1.886e6037841edp-1, 0x1.1058bc8a07ee1p-2}, + {0x1.767dcf5534862p-1, 0x1.4043057b6ee09p-2}, + }, + .ln2 = 0x1.62e42fefa39efp-1, + .invln10 = 0x1.bcb7b1526e50ep-2, + .poly = { + -0x1.00ea348b88334p-2, + 0x1.5575b0be00b6ap-2, + -0x1.ffffef20a4123p-2, + }}; diff --git a/pl/math/math_config.h b/pl/math/math_config.h index 0790416..1e1d5ab 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -24,10 +24,6 @@ set to 1 if math.h has (math_errhandling & MATH_ERRNO) != 0. */ # define WANT_ERRNO 0 #endif -#ifndef WANT_ERRNO_UFLOW -/* Set errno to ERANGE if result underflows to 0 (in all rounding modes). */ -# define WANT_ERRNO_UFLOW (WANT_ROUNDING && WANT_ERRNO) -#endif /* Compiler can inline round as a single instruction. */ #ifndef HAVE_FAST_ROUND @@ -323,4 +319,18 @@ extern const struct erff_data float erff_poly_B[7]; } __erff_data HIDDEN; +/* Data for logf and log10f. 
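+   (log10f reuses the logf table and polynomial and scales the result by
+   1/ln(10), stored below as invln10.)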
*/ +#define LOGF_TABLE_BITS 4 +#define LOGF_POLY_ORDER 4 +extern const struct logf_data +{ + struct + { + double invc, logc; + } tab[1 << LOGF_TABLE_BITS]; + double ln2; + double invln10; + double poly[LOGF_POLY_ORDER - 1]; /* First order coefficient is 1. */ +} __logf_data HIDDEN; + #endif diff --git a/pl/math/math_errf.c b/pl/math/math_errf.c index f2aad46..5b4945f 100644 --- a/pl/math/math_errf.c +++ b/pl/math/math_errf.c @@ -35,7 +35,6 @@ __math_uflowf (uint32_t sign) return xflowf (sign, 0x1p-95f); } -#if WANT_ERRNO_UFLOW /* Underflows to zero in some non-nearest rounding mode, setting errno is valid even if the result is non-zero, but in the subnormal range. */ HIDDEN float @@ -43,7 +42,6 @@ __math_may_uflowf (uint32_t sign) { return xflowf (sign, 0x1.4p-75f); } -#endif HIDDEN float __math_oflowf (uint32_t sign) diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 64c1300..a48015b 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -5,3 +5,4 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ F (erff, -4.0, 4.0) +F (log10f, 0.01, 11.1) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 5d29f21..8f5620e 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -35,7 +35,14 @@ t erff -0x1p-127 -0x1p-26 40000 t erff 0x1p-26 0x1p3 40000 t erff -0x1p-26 -0x1p3 40000 t erff 0 inf 40000 -Ldir=0.5 + +L=0.30 +Ldir= +t log10f 0 0xffff0000 10000 +t log10f 0x1p-127 0x1p-26 50000 +t log10f 0x1p-26 0x1p3 50000 +t log10f 0x1p-4 0x1p4 50000 +t log10f 0 inf 50000 done diff --git a/pl/math/test/testcases/directed/log10f.tst b/pl/math/test/testcases/directed/log10f.tst new file mode 100644 index 0000000..5fdd635 --- /dev/null +++ b/pl/math/test/testcases/directed/log10f.tst @@ -0,0 +1,69 @@ +; log10f.tst +; +; Copyright (c) 2007-2022, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=log10f op1=7fc00001 result=7fc00001 errno=0 +func=log10f op1=ffc00001 result=7fc00001 errno=0 +func=log10f op1=7f800001 result=7fc00001 errno=0 status=i +func=log10f op1=ff800001 result=7fc00001 errno=0 status=i +func=log10f op1=ff810000 result=7fc00001 errno=0 status=i +func=log10f op1=7f800000 result=7f800000 errno=0 +func=log10f op1=3f800000 result=00000000 errno=0 +func=log10f op1=ff800000 result=7fc00001 errno=EDOM status=i +func=log10f op1=00000000 result=ff800000 errno=ERANGE status=z +func=log10f op1=80000000 result=ff800000 errno=ERANGE status=z +func=log10f op1=80000001 result=7fc00001 errno=EDOM status=i + +; Directed tests for the special-case handling of log10 of things +; very near 1 +func=log10f op1=3f81a618 result=3bb62472.b92 error=0 +func=log10f op1=3f876783 result=3cc811f4.26c error=0 +func=log10f op1=3f816af8 result=3b9cc4c7.057 error=0 +func=log10f op1=3f7bed7d result=bbe432cb.e23 error=0 +func=log10f op1=3f803ece result=3a59ff3a.a84 error=0 +func=log10f op1=3f80089f result=38ef9728.aa6 error=0 +func=log10f op1=3f86ab72 result=3cb4b711.457 error=0 +func=log10f op1=3f780854 result=bc60f953.904 error=0 +func=log10f op1=3f7c6d76 result=bbc7fd01.01c error=0 +func=log10f op1=3f85dff6 result=3c9fa76f.81f error=0 +func=log10f op1=3f7b87f4 result=bbfa9edc.be4 error=0 +func=log10f op1=3f81c710 result=3bc4457b.745 error=0 +func=log10f op1=3f80946d result=3b00a140.c06 error=0 +func=log10f op1=3f7e87ea result=bb23cd70.828 error=0 +func=log10f op1=3f811437 result=3b6ee960.b40 error=0 +func=log10f op1=3f858dcf result=3c971d9b.2ea error=0 +func=log10f op1=3f7f61a3 result=ba89b814.4e0 error=0 +func=log10f op1=3f82d642 result=3c1bfb8d.517 error=0 +func=log10f op1=3f80f3bc result=3b52ebe8.c75 error=0 +func=log10f op1=3f85eff9 result=3ca150d9.7e8 error=0 +func=log10f op1=3f843eb8 result=3c68263f.771 error=0 +func=log10f op1=3f78e691 result=bc481cf4.50a error=0 +func=log10f op1=3f87c56f result=3cd1b268.5e6 error=0 +func=log10f op1=3f83b711 result=3c4b94c5.918 error=0 +func=log10f op1=3f823b2b result=3bf5eb02.e2a error=0 +func=log10f op1=3f7f2c4e result=bab82c80.519 error=0 +func=log10f op1=3f83fc92 result=3c5a3ba1.543 error=0 +func=log10f op1=3f793956 result=bc3ee04e.03c error=0 +func=log10f op1=3f839ba5 result=3c45caca.92a error=0 +func=log10f op1=3f862f30 result=3ca7de76.16f error=0 +func=log10f op1=3f832a20 result=3c2dc6e9.afd error=0 +func=log10f op1=3f810296 result=3b5fb92a.429 error=0 +func=log10f op1=3f7e58c9 result=bb38655a.0a4 error=0 +func=log10f op1=3f8362e7 result=3c39cc65.d15 error=0 +func=log10f op1=3f7fdb85 result=b97d9016.40b error=0 +func=log10f op1=3f84484e result=3c6a29f2.f74 error=0 +func=log10f op1=3f861862 result=3ca5819e.f2d error=0 +func=log10f op1=3f7c027b result=bbdf912d.440 error=0 +func=log10f op1=3f867803 result=3caf6744.34d error=0 +func=log10f op1=3f789a89 result=bc509bce.458 error=0 +func=log10f op1=3f8361d9 result=3c399347.379 error=0 +func=log10f op1=3f7d3ac3 result=bb9ad93a.93d error=0 +func=log10f op1=3f7ee241 result=baf8bd12.a62 error=0 +func=log10f op1=3f83a1fd result=3c4721bd.0a4 error=0 +func=log10f op1=3f840da3 result=3c5dd375.675 error=0 +func=log10f op1=3f79c2fe result=bc2f8a60.8c5 error=0 +func=log10f op1=3f854a93 result=3c901cc9.add error=0 +func=log10f op1=3f87a50a result=3cce6125.cd6 error=0 +func=log10f op1=3f818bf5 result=3baaee68.a55 error=0 +func=log10f op1=3f830a44 result=3c2705c4.d87 error=0 diff --git a/pl/math/test/testcases/random/float.tst 
b/pl/math/test/testcases/random/float.tst index caf0bd3..468896b 100644 --- a/pl/math/test/testcases/random/float.tst +++ b/pl/math/test/testcases/random/float.tst @@ -4,3 +4,4 @@ !! SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception test erff 10000 +test log10f 10000 diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 68d9ec7..7cf4576 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -5,3 +5,4 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ F1 (erf) +F1 (log10) diff --git a/pl/math/tools/log10f.sollya b/pl/math/tools/log10f.sollya new file mode 100644 index 0000000..c8cfa51 --- /dev/null +++ b/pl/math/tools/log10f.sollya @@ -0,0 +1,46 @@ +// polynomial for approximating log10f(1+x) +// +// Copyright (c) 2019-2022, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 9; // poly degree +// |log10(1+x)| > 0x1p-4 outside the interval +a = -1/3; +b = 1/3; +//b = 2*(a+1)-1; + +display = hexadecimal; +print("log10(2) = ", single(log10(2))); + +ln10 = evaluate(log(10),0); +invln10 = single(1/ln10); + +// find log10(1+x)/x polynomial with minimal relative error +// (minimal relative error polynomial for log10(1+x) is the same * x) +deg = deg-1; // because of /x + +// f = log(1+x)/x; using taylor series +f = 0; +for i from 0 to 60 do { f = f + (-x)^i/(i+1); }; +f = f/ln10; + +// return p that minimizes |f(x) - poly(x) - x^d*p(x)|/|f(x)| +approx = proc(poly,d) { + return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); +}; + +// first coeff is fixed, iteratively find optimal double prec coeffs +poly = invln10; +for i from 1 to deg do { + p = roundcoefficients(approx(poly,i), [|SG ...|]); + poly = poly + x^i*coeff(p,0); +}; +display = hexadecimal; +print("invln10:", invln10); +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do coeff(poly,i); + +display = decimal; +print("in [",a,b,"]"); -- cgit v1.2.3 From b3c0d1f33b1b10026a8b89610351621d5bd7f423 Mon Sep 17 00:00:00 2001 From: Pierre Blanchard Date: Mon, 11 Apr 2022 16:54:16 +0100 Subject: pl/math: Add scalar log10 - Scalar log10 uses scaled polynomial coefficients and scales the rest of the variables at runtime. - The implementation is similar to that of math/log.c except that we merged hi and lo terms in the log10 constant to provide a faster low accurate variant (>1ulp). - The maximum measured ULP error is 1.7ulps. 
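
As an illustrative aside (a hypothetical sketch, not code from this patch;
it assumes the POSIX M_LN2/M_LN10 constants from math.h): the base
conversion can be sanity-checked against the naive identity
log10(x) = log(x)/ln(10) using the same x = 2^k * z split:

    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      double x = 3.5;
      int k;
      /* Split off the exponent: x = z * 2^k with z in [0.5, 1).  */
      double z = frexp (x, &k);
      /* log10(x) = (k*ln2 + log(z)) / ln(10).  */
      double y = (k * M_LN2 + log (z)) / M_LN10;
      /* Should agree with log10 (x) to within ~1 ulp.  */
      printf ("%a vs %a\n", y, log10 (x));
      return 0;
    }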
--- pl/math/include/mathlib.h | 2 + pl/math/log10_2u.c | 132 ++++++++++++ pl/math/log10_data.c | 334 ++++++++++++++++++++++++++++++ pl/math/math_config.h | 17 ++ pl/math/math_err.c | 78 +++++++ pl/math/test/mathbench_funcs.h | 2 + pl/math/test/runulp.sh | 6 + pl/math/test/testcases/directed/log10.tst | 16 ++ pl/math/test/testcases/random/double.tst | 6 + pl/math/test/ulp_funcs.h | 1 + pl/math/tools/log10.sollya | 44 ++++ 11 files changed, 638 insertions(+) create mode 100644 pl/math/log10_2u.c create mode 100644 pl/math/log10_data.c create mode 100644 pl/math/math_err.c create mode 100644 pl/math/test/testcases/directed/log10.tst create mode 100644 pl/math/test/testcases/random/double.tst create mode 100644 pl/math/tools/log10.sollya diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 2da8d7f..63c30e6 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -11,6 +11,8 @@ float erff (float); float log10f (float); +double log10 (double); + #if __aarch64__ #if __GNUC__ >= 5 typedef __Float32x4_t __f32x4_t; diff --git a/pl/math/log10_2u.c b/pl/math/log10_2u.c new file mode 100644 index 0000000..d0c3123 --- /dev/null +++ b/pl/math/log10_2u.c @@ -0,0 +1,132 @@ +/* + * Double-precision log10(x) function. + * + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include +#include +#include + +/* Polynomial coefficients and lookup tables. */ +#define T __log10_data.tab +#define T2 __log10_data.tab2 +#define B __log10_data.poly1 +#define A __log10_data.poly +#define Ln2hi __log10_data.ln2hi +#define Ln2lo __log10_data.ln2lo +#define InvLn10 __log10_data.invln10 +#define N (1 << LOG10_TABLE_BITS) +#define OFF 0x3fe6000000000000 +#define LO asuint64 (1.0 - 0x1p-5) +#define HI asuint64 (1.0 + 0x1.1p-5) + +/* Top 16 bits of a double. */ +static inline uint32_t +top16 (double x) +{ + return asuint64 (x) >> 48; +} + +/* Fast and low accuracy implementation of log10. + The implementation is similar to that of math/log, except that: + - Polynomials are computed for log10(1+r) with r on same intervals as log. + - Lookup parameters are scaled (at runtime) to switch from base e to base 10. + Max ULP error: < 1.7 ulp (nearest rounding.) + with (LOG10_POLY1_ORDER = 10, LOG10_POLY_ORDER = 6, N = 128) + Maximum measured at 1.655 ulp for x in [0.0746, 0.0747]: + log10(0x1.ee008434a44a4p-1) got -0x1.fd415bb39db27p-7 + want -0x1.fd415bb39db29p-7 + +0.344511 ulp err 1.15549. */ +double +log10 (double x) +{ + /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ + double_t w, z, r, r2, r3, y, invc, logc, kd; + uint64_t ix, iz, tmp; + uint32_t top; + int k, i; + + ix = asuint64 (x); + top = top16 (x); + + if (unlikely (ix - LO < HI - LO)) + { + /* Handle close to 1.0 inputs separately. */ + /* Fix sign of zero with downward rounding when x==1. */ + if (WANT_ROUNDING && unlikely (ix == asuint64 (1.0))) + return 0; + r = x - 1.0; + r2 = r * r; + r3 = r * r2; + /* Worst-case error is around 0.727 ULP. */ + y = r3 + * (B[1] + r * B[2] + r2 * B[3] + + r3 * (B[4] + r * B[5] + r2 * B[6] + r3 * (B[7] + r * B[8]))); + w = B[0] * r2; /* B[0] == -0.5. */ + /* Scale by 1/ln(10). Polynomial already contains scaling. */ + y = (y + w) + r * InvLn10; + + return eval_as_double (y); + } + if (unlikely (top - 0x0010 >= 0x7ff0 - 0x0010)) + { + /* x < 0x1p-1022 or inf or nan. */ + if (ix * 2 == 0) + return __math_divzero (1); + if (ix == asuint64 (INFINITY)) /* log10(inf) == inf. 
*/ + return x; + if ((top & 0x8000) || (top & 0x7ff0) == 0x7ff0) + return __math_invalid (x); + /* x is subnormal, normalize it. */ + ix = asuint64 (x * 0x1p52); + ix -= 52ULL << 52; + } + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = ix - OFF; + i = (tmp >> (52 - LOG10_TABLE_BITS)) % N; + k = (int64_t) tmp >> 52; /* arithmetic shift. */ + iz = ix - (tmp & 0xfffULL << 52); + invc = T[i].invc; + logc = T[i].logc; + z = asdouble (iz); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ + /* r ~= z/c - 1, |r| < 1/(2*N). */ +#if HAVE_FAST_FMA + /* rounding error: 0x1p-55/N. */ + r = fma (z, invc, -1.0); +#else + /* rounding error: 0x1p-55/N + 0x1p-66. */ + r = (z - T2[i].chi - T2[i].clo) * invc; +#endif + kd = (double_t) k; + + /* w = log(c) + k*Ln2hi. */ + w = kd * Ln2hi + logc; + + /* log10(x) = (w + r)/log(10) + (log10(1+r) - r/log(10)). */ + r2 = r * r; /* rounding error: 0x1p-54/N^2. */ + y = r2 * A[0] + r * r2 * (A[1] + r * A[2] + r2 * (A[3] + r * A[4])); + + /* Scale by 1/ln(10). Polynomial already contains scaling. */ + y = y + ((r + kd * Ln2lo) + w) * InvLn10; + + return eval_as_double (y); +} +#if USE_GLIBC_ABI +strong_alias (log10, __log10_finite) +hidden_alias (log10, __ieee754_log10) +#if LDBL_MANT_DIG == 53 +long double +log10l (long double x) +{ + return log10 (x); +} +#endif +#endif diff --git a/pl/math/log10_data.c b/pl/math/log10_data.c new file mode 100644 index 0000000..e844203 --- /dev/null +++ b/pl/math/log10_data.c @@ -0,0 +1,334 @@ +/* + * Data for log10. + * + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define N (1 << LOG10_TABLE_BITS) + +const struct log10_data __log10_data = { +.ln2hi = 0x1.62e42fefa3800p-1, +.ln2lo = 0x1.ef35793c76730p-45, +.invln10 = 0x1.bcb7b1526e50ep-2, +.poly1 = { +#if LOG10_POLY1_ORDER == 10 +// relative error: 0x1.d34d5238p-63 +// in -0x1p-5 0x1.1p-5 (|log10(1+x)| > 0x1p-5 outside this interval) +-0x1.bcb7b1526e50ep-3, +0x1.287a7636f4314p-3, +-0x1.bcb7b1526eeebp-4, +0x1.63c62776b50e6p-4, +-0x1.287a76329b69dp-4, +0x1.fc3f7e81f44c2p-5, +-0x1.bcb7b7893672ap-5, +0x1.8c0fa601b4779p-5, +-0x1.64403e39d7278p-5, +#endif +}, +.poly = { +#if N == 128 && LOG10_POLY_ORDER == 6 +// relative error: 0x1.29fc52bp-56 +// in -0x1.fp-9 0x1.fp-9 +-0x1.bcb7b1526e50fp-3, +0x1.287a7636c4076p-3, +-0x1.bcb7b151bffaep-4, +0x1.63c77372810dep-4, +-0x1.287bdeec963c2p-4, +#endif +}, +/* Algorithm: + + x = 2^k z + log(x) = k ln2 + log(c) + log(z/c) + log(z/c) = poly(z/c - 1) + +where z is in [1.6p-1; 1.6p0] which is split into N subintervals and z falls +into the ith one, then table entries are computed as + + tab[i].invc = 1/c + tab[i].logc = (double)log(c) + tab2[i].chi = (double)c + tab2[i].clo = (double)(c - (double)c) + +where c is near the center of the subinterval and is chosen by trying +-2^29 +floating point invc candidates around 1/center and selecting one for which + + 1) the rounding error in 0x1.8p9 + logc is 0, + 2) the rounding error in z - chi - clo is < 0x1p-66 and + 3) the rounding error in (double)log(c) is minimized (< 0x1p-66). 
+ +Note: 1) ensures that k*ln2hi + logc can be computed without rounding error, +2) ensures that z/c - 1 can be computed as (z - chi - clo)*invc with close to +a single rounding error when there is no fast fma for z*invc - 1, 3) ensures +that logc + poly(z/c - 1) has small error, however near x == 1 when +|log(x)| < 0x1p-4, this is not enough so that is special cased. */ +.tab = { +#if N == 128 +{0x1.734f0c3e0de9fp+0, -0x1.7cc7f79e69000p-2}, +{0x1.713786a2ce91fp+0, -0x1.76feec20d0000p-2}, +{0x1.6f26008fab5a0p+0, -0x1.713e31351e000p-2}, +{0x1.6d1a61f138c7dp+0, -0x1.6b85b38287800p-2}, +{0x1.6b1490bc5b4d1p+0, -0x1.65d5590807800p-2}, +{0x1.69147332f0cbap+0, -0x1.602d076180000p-2}, +{0x1.6719f18224223p+0, -0x1.5a8ca86909000p-2}, +{0x1.6524f99a51ed9p+0, -0x1.54f4356035000p-2}, +{0x1.63356aa8f24c4p+0, -0x1.4f637c36b4000p-2}, +{0x1.614b36b9ddc14p+0, -0x1.49da7fda85000p-2}, +{0x1.5f66452c65c4cp+0, -0x1.445923989a800p-2}, +{0x1.5d867b5912c4fp+0, -0x1.3edf439b0b800p-2}, +{0x1.5babccb5b90dep+0, -0x1.396ce448f7000p-2}, +{0x1.59d61f2d91a78p+0, -0x1.3401e17bda000p-2}, +{0x1.5805612465687p+0, -0x1.2e9e2ef468000p-2}, +{0x1.56397cee76bd3p+0, -0x1.2941b3830e000p-2}, +{0x1.54725e2a77f93p+0, -0x1.23ec58cda8800p-2}, +{0x1.52aff42064583p+0, -0x1.1e9e129279000p-2}, +{0x1.50f22dbb2bddfp+0, -0x1.1956d2b48f800p-2}, +{0x1.4f38f4734ded7p+0, -0x1.141679ab9f800p-2}, +{0x1.4d843cfde2840p+0, -0x1.0edd094ef9800p-2}, +{0x1.4bd3ec078a3c8p+0, -0x1.09aa518db1000p-2}, +{0x1.4a27fc3e0258ap+0, -0x1.047e65263b800p-2}, +{0x1.4880524d48434p+0, -0x1.feb224586f000p-3}, +{0x1.46dce1b192d0bp+0, -0x1.f474a7517b000p-3}, +{0x1.453d9d3391854p+0, -0x1.ea4443d103000p-3}, +{0x1.43a2744b4845ap+0, -0x1.e020d44e9b000p-3}, +{0x1.420b54115f8fbp+0, -0x1.d60a22977f000p-3}, +{0x1.40782da3ef4b1p+0, -0x1.cc00104959000p-3}, +{0x1.3ee8f5d57fe8fp+0, -0x1.c202956891000p-3}, +{0x1.3d5d9a00b4ce9p+0, -0x1.b81178d811000p-3}, +{0x1.3bd60c010c12bp+0, -0x1.ae2c9ccd3d000p-3}, +{0x1.3a5242b75dab8p+0, -0x1.a45402e129000p-3}, +{0x1.38d22cd9fd002p+0, -0x1.9a877681df000p-3}, +{0x1.3755bc5847a1cp+0, -0x1.90c6d69483000p-3}, +{0x1.35dce49ad36e2p+0, -0x1.87120a645c000p-3}, +{0x1.34679984dd440p+0, -0x1.7d68fb4143000p-3}, +{0x1.32f5cceffcb24p+0, -0x1.73cb83c627000p-3}, +{0x1.3187775a10d49p+0, -0x1.6a39a9b376000p-3}, +{0x1.301c8373e3990p+0, -0x1.60b3154b7a000p-3}, +{0x1.2eb4ebb95f841p+0, -0x1.5737d76243000p-3}, +{0x1.2d50a0219a9d1p+0, -0x1.4dc7b8fc23000p-3}, +{0x1.2bef9a8b7fd2ap+0, -0x1.4462c51d20000p-3}, +{0x1.2a91c7a0c1babp+0, -0x1.3b08abc830000p-3}, +{0x1.293726014b530p+0, -0x1.31b996b490000p-3}, +{0x1.27dfa5757a1f5p+0, -0x1.2875490a44000p-3}, +{0x1.268b39b1d3bbfp+0, -0x1.1f3b9f879a000p-3}, +{0x1.2539d838ff5bdp+0, -0x1.160c8252ca000p-3}, +{0x1.23eb7aac9083bp+0, -0x1.0ce7f57f72000p-3}, +{0x1.22a012ba940b6p+0, -0x1.03cdc49fea000p-3}, +{0x1.2157996cc4132p+0, -0x1.f57bdbc4b8000p-4}, +{0x1.201201dd2fc9bp+0, -0x1.e370896404000p-4}, +{0x1.1ecf4494d480bp+0, -0x1.d17983ef94000p-4}, +{0x1.1d8f5528f6569p+0, -0x1.bf9674ed8a000p-4}, +{0x1.1c52311577e7cp+0, -0x1.adc79202f6000p-4}, +{0x1.1b17c74cb26e9p+0, -0x1.9c0c3e7288000p-4}, +{0x1.19e010c2c1ab6p+0, -0x1.8a646b372c000p-4}, +{0x1.18ab07bb670bdp+0, -0x1.78d01b3ac0000p-4}, +{0x1.1778a25efbcb6p+0, -0x1.674f145380000p-4}, +{0x1.1648d354c31dap+0, -0x1.55e0e6d878000p-4}, +{0x1.151b990275fddp+0, -0x1.4485cdea1e000p-4}, +{0x1.13f0ea432d24cp+0, -0x1.333d94d6aa000p-4}, +{0x1.12c8b7210f9dap+0, -0x1.22079f8c56000p-4}, +{0x1.11a3028ecb531p+0, -0x1.10e4698622000p-4}, +{0x1.107fbda8434afp+0, -0x1.ffa6c6ad20000p-5}, +{0x1.0f5ee0f4e6bb3p+0, 
-0x1.dda8d4a774000p-5}, +{0x1.0e4065d2a9fcep+0, -0x1.bbcece4850000p-5}, +{0x1.0d244632ca521p+0, -0x1.9a1894012c000p-5}, +{0x1.0c0a77ce2981ap+0, -0x1.788583302c000p-5}, +{0x1.0af2f83c636d1p+0, -0x1.5715e67d68000p-5}, +{0x1.09ddb98a01339p+0, -0x1.35c8a49658000p-5}, +{0x1.08cabaf52e7dfp+0, -0x1.149e364154000p-5}, +{0x1.07b9f2f4e28fbp+0, -0x1.e72c082eb8000p-6}, +{0x1.06ab58c358f19p+0, -0x1.a55f152528000p-6}, +{0x1.059eea5ecf92cp+0, -0x1.63d62cf818000p-6}, +{0x1.04949cdd12c90p+0, -0x1.228fb8caa0000p-6}, +{0x1.038c6c6f0ada9p+0, -0x1.c317b20f90000p-7}, +{0x1.02865137932a9p+0, -0x1.419355daa0000p-7}, +{0x1.0182427ea7348p+0, -0x1.81203c2ec0000p-8}, +{0x1.008040614b195p+0, -0x1.0040979240000p-9}, +{0x1.fe01ff726fa1ap-1, 0x1.feff384900000p-9}, +{0x1.fa11cc261ea74p-1, 0x1.7dc41353d0000p-7}, +{0x1.f6310b081992ep-1, 0x1.3cea3c4c28000p-6}, +{0x1.f25f63ceeadcdp-1, 0x1.b9fc114890000p-6}, +{0x1.ee9c8039113e7p-1, 0x1.1b0d8ce110000p-5}, +{0x1.eae8078cbb1abp-1, 0x1.58a5bd001c000p-5}, +{0x1.e741aa29d0c9bp-1, 0x1.95c8340d88000p-5}, +{0x1.e3a91830a99b5p-1, 0x1.d276aef578000p-5}, +{0x1.e01e009609a56p-1, 0x1.07598e598c000p-4}, +{0x1.dca01e577bb98p-1, 0x1.253f5e30d2000p-4}, +{0x1.d92f20b7c9103p-1, 0x1.42edd8b380000p-4}, +{0x1.d5cac66fb5ccep-1, 0x1.606598757c000p-4}, +{0x1.d272caa5ede9dp-1, 0x1.7da76356a0000p-4}, +{0x1.cf26e3e6b2ccdp-1, 0x1.9ab434e1c6000p-4}, +{0x1.cbe6da2a77902p-1, 0x1.b78c7bb0d6000p-4}, +{0x1.c8b266d37086dp-1, 0x1.d431332e72000p-4}, +{0x1.c5894bd5d5804p-1, 0x1.f0a3171de6000p-4}, +{0x1.c26b533bb9f8cp-1, 0x1.067152b914000p-3}, +{0x1.bf583eeece73fp-1, 0x1.147858292b000p-3}, +{0x1.bc4fd75db96c1p-1, 0x1.2266ecdca3000p-3}, +{0x1.b951e0c864a28p-1, 0x1.303d7a6c55000p-3}, +{0x1.b65e2c5ef3e2cp-1, 0x1.3dfc33c331000p-3}, +{0x1.b374867c9888bp-1, 0x1.4ba366b7a8000p-3}, +{0x1.b094b211d304ap-1, 0x1.5933928d1f000p-3}, +{0x1.adbe885f2ef7ep-1, 0x1.66acd2418f000p-3}, +{0x1.aaf1d31603da2p-1, 0x1.740f8ec669000p-3}, +{0x1.a82e63fd358a7p-1, 0x1.815c0f51af000p-3}, +{0x1.a5740ef09738bp-1, 0x1.8e92954f68000p-3}, +{0x1.a2c2a90ab4b27p-1, 0x1.9bb3602f84000p-3}, +{0x1.a01a01393f2d1p-1, 0x1.a8bed1c2c0000p-3}, +{0x1.9d79f24db3c1bp-1, 0x1.b5b515c01d000p-3}, +{0x1.9ae2505c7b190p-1, 0x1.c2967ccbcc000p-3}, +{0x1.9852ef297ce2fp-1, 0x1.cf635d5486000p-3}, +{0x1.95cbaeea44b75p-1, 0x1.dc1bd3446c000p-3}, +{0x1.934c69de74838p-1, 0x1.e8c01b8cfe000p-3}, +{0x1.90d4f2f6752e6p-1, 0x1.f5509c0179000p-3}, +{0x1.8e6528effd79dp-1, 0x1.00e6c121fb800p-2}, +{0x1.8bfce9fcc007cp-1, 0x1.071b80e93d000p-2}, +{0x1.899c0dabec30ep-1, 0x1.0d46b9e867000p-2}, +{0x1.87427aa2317fbp-1, 0x1.13687334bd000p-2}, +{0x1.84f00acb39a08p-1, 0x1.1980d67234800p-2}, +{0x1.82a49e8653e55p-1, 0x1.1f8ffe0cc8000p-2}, +{0x1.8060195f40260p-1, 0x1.2595fd7636800p-2}, +{0x1.7e22563e0a329p-1, 0x1.2b9300914a800p-2}, +{0x1.7beb377dcb5adp-1, 0x1.3187210436000p-2}, +{0x1.79baa679725c2p-1, 0x1.377266dec1800p-2}, +{0x1.77907f2170657p-1, 0x1.3d54ffbaf3000p-2}, +{0x1.756cadbd6130cp-1, 0x1.432eee32fe000p-2}, +#endif +}, +#if !HAVE_FAST_FMA +.tab2 = { +#if N == 128 +{0x1.61000014fb66bp-1, 0x1.e026c91425b3cp-56}, +{0x1.63000034db495p-1, 0x1.dbfea48005d41p-55}, +{0x1.650000d94d478p-1, 0x1.e7fa786d6a5b7p-55}, +{0x1.67000074e6fadp-1, 0x1.1fcea6b54254cp-57}, +{0x1.68ffffedf0faep-1, -0x1.c7e274c590efdp-56}, +{0x1.6b0000763c5bcp-1, -0x1.ac16848dcda01p-55}, +{0x1.6d0001e5cc1f6p-1, 0x1.33f1c9d499311p-55}, +{0x1.6efffeb05f63ep-1, -0x1.e80041ae22d53p-56}, +{0x1.710000e86978p-1, 0x1.bff6671097952p-56}, +{0x1.72ffffc67e912p-1, 0x1.c00e226bd8724p-55}, +{0x1.74fffdf81116ap-1, -0x1.e02916ef101d2p-57}, 
+{0x1.770000f679c9p-1, -0x1.7fc71cd549c74p-57}, +{0x1.78ffffa7ec835p-1, 0x1.1bec19ef50483p-55}, +{0x1.7affffe20c2e6p-1, -0x1.07e1729cc6465p-56}, +{0x1.7cfffed3fc9p-1, -0x1.08072087b8b1cp-55}, +{0x1.7efffe9261a76p-1, 0x1.dc0286d9df9aep-55}, +{0x1.81000049ca3e8p-1, 0x1.97fd251e54c33p-55}, +{0x1.8300017932c8fp-1, -0x1.afee9b630f381p-55}, +{0x1.850000633739cp-1, 0x1.9bfbf6b6535bcp-55}, +{0x1.87000204289c6p-1, -0x1.bbf65f3117b75p-55}, +{0x1.88fffebf57904p-1, -0x1.9006ea23dcb57p-55}, +{0x1.8b00022bc04dfp-1, -0x1.d00df38e04b0ap-56}, +{0x1.8cfffe50c1b8ap-1, -0x1.8007146ff9f05p-55}, +{0x1.8effffc918e43p-1, 0x1.3817bd07a7038p-55}, +{0x1.910001efa5fc7p-1, 0x1.93e9176dfb403p-55}, +{0x1.9300013467bb9p-1, 0x1.f804e4b980276p-56}, +{0x1.94fffe6ee076fp-1, -0x1.f7ef0d9ff622ep-55}, +{0x1.96fffde3c12d1p-1, -0x1.082aa962638bap-56}, +{0x1.98ffff4458a0dp-1, -0x1.7801b9164a8efp-55}, +{0x1.9afffdd982e3ep-1, -0x1.740e08a5a9337p-55}, +{0x1.9cfffed49fb66p-1, 0x1.fce08c19bep-60}, +{0x1.9f00020f19c51p-1, -0x1.a3faa27885b0ap-55}, +{0x1.a10001145b006p-1, 0x1.4ff489958da56p-56}, +{0x1.a300007bbf6fap-1, 0x1.cbeab8a2b6d18p-55}, +{0x1.a500010971d79p-1, 0x1.8fecadd78793p-55}, +{0x1.a70001df52e48p-1, -0x1.f41763dd8abdbp-55}, +{0x1.a90001c593352p-1, -0x1.ebf0284c27612p-55}, +{0x1.ab0002a4f3e4bp-1, -0x1.9fd043cff3f5fp-57}, +{0x1.acfffd7ae1ed1p-1, -0x1.23ee7129070b4p-55}, +{0x1.aefffee510478p-1, 0x1.a063ee00edea3p-57}, +{0x1.b0fffdb650d5bp-1, 0x1.a06c8381f0ab9p-58}, +{0x1.b2ffffeaaca57p-1, -0x1.9011e74233c1dp-56}, +{0x1.b4fffd995badcp-1, -0x1.9ff1068862a9fp-56}, +{0x1.b7000249e659cp-1, 0x1.aff45d0864f3ep-55}, +{0x1.b8ffff987164p-1, 0x1.cfe7796c2c3f9p-56}, +{0x1.bafffd204cb4fp-1, -0x1.3ff27eef22bc4p-57}, +{0x1.bcfffd2415c45p-1, -0x1.cffb7ee3bea21p-57}, +{0x1.beffff86309dfp-1, -0x1.14103972e0b5cp-55}, +{0x1.c0fffe1b57653p-1, 0x1.bc16494b76a19p-55}, +{0x1.c2ffff1fa57e3p-1, -0x1.4feef8d30c6edp-57}, +{0x1.c4fffdcbfe424p-1, -0x1.43f68bcec4775p-55}, +{0x1.c6fffed54b9f7p-1, 0x1.47ea3f053e0ecp-55}, +{0x1.c8fffeb998fd5p-1, 0x1.383068df992f1p-56}, +{0x1.cb0002125219ap-1, -0x1.8fd8e64180e04p-57}, +{0x1.ccfffdd94469cp-1, 0x1.e7ebe1cc7ea72p-55}, +{0x1.cefffeafdc476p-1, 0x1.ebe39ad9f88fep-55}, +{0x1.d1000169af82bp-1, 0x1.57d91a8b95a71p-56}, +{0x1.d30000d0ff71dp-1, 0x1.9c1906970c7dap-55}, +{0x1.d4fffea790fc4p-1, -0x1.80e37c558fe0cp-58}, +{0x1.d70002edc87e5p-1, -0x1.f80d64dc10f44p-56}, +{0x1.d900021dc82aap-1, -0x1.47c8f94fd5c5cp-56}, +{0x1.dafffd86b0283p-1, 0x1.c7f1dc521617ep-55}, +{0x1.dd000296c4739p-1, 0x1.8019eb2ffb153p-55}, +{0x1.defffe54490f5p-1, 0x1.e00d2c652cc89p-57}, +{0x1.e0fffcdabf694p-1, -0x1.f8340202d69d2p-56}, +{0x1.e2fffdb52c8ddp-1, 0x1.b00c1ca1b0864p-56}, +{0x1.e4ffff24216efp-1, 0x1.2ffa8b094ab51p-56}, +{0x1.e6fffe88a5e11p-1, -0x1.7f673b1efbe59p-58}, +{0x1.e9000119eff0dp-1, -0x1.4808d5e0bc801p-55}, +{0x1.eafffdfa51744p-1, 0x1.80006d54320b5p-56}, +{0x1.ed0001a127fa1p-1, -0x1.002f860565c92p-58}, +{0x1.ef00007babcc4p-1, -0x1.540445d35e611p-55}, +{0x1.f0ffff57a8d02p-1, -0x1.ffb3139ef9105p-59}, +{0x1.f30001ee58ac7p-1, 0x1.a81acf2731155p-55}, +{0x1.f4ffff5823494p-1, 0x1.a3f41d4d7c743p-55}, +{0x1.f6ffffca94c6bp-1, -0x1.202f41c987875p-57}, +{0x1.f8fffe1f9c441p-1, 0x1.77dd1f477e74bp-56}, +{0x1.fafffd2e0e37ep-1, -0x1.f01199a7ca331p-57}, +{0x1.fd0001c77e49ep-1, 0x1.181ee4bceacb1p-56}, +{0x1.feffff7e0c331p-1, -0x1.e05370170875ap-57}, +{0x1.00ffff465606ep+0, -0x1.a7ead491c0adap-55}, +{0x1.02ffff3867a58p+0, -0x1.77f69c3fcb2ep-54}, +{0x1.04ffffdfc0d17p+0, 0x1.7bffe34cb945bp-54}, +{0x1.0700003cd4d82p+0, 0x1.20083c0e456cbp-55}, 
+{0x1.08ffff9f2cbe8p+0, -0x1.dffdfbe37751ap-57}, +{0x1.0b000010cda65p+0, -0x1.13f7faee626ebp-54}, +{0x1.0d00001a4d338p+0, 0x1.07dfa79489ff7p-55}, +{0x1.0effffadafdfdp+0, -0x1.7040570d66bcp-56}, +{0x1.110000bbafd96p+0, 0x1.e80d4846d0b62p-55}, +{0x1.12ffffae5f45dp+0, 0x1.dbffa64fd36efp-54}, +{0x1.150000dd59ad9p+0, 0x1.a0077701250aep-54}, +{0x1.170000f21559ap+0, 0x1.dfdf9e2e3deeep-55}, +{0x1.18ffffc275426p+0, 0x1.10030dc3b7273p-54}, +{0x1.1b000123d3c59p+0, 0x1.97f7980030188p-54}, +{0x1.1cffff8299eb7p+0, -0x1.5f932ab9f8c67p-57}, +{0x1.1effff48ad4p+0, 0x1.37fbf9da75bebp-54}, +{0x1.210000c8b86a4p+0, 0x1.f806b91fd5b22p-54}, +{0x1.2300003854303p+0, 0x1.3ffc2eb9fbf33p-54}, +{0x1.24fffffbcf684p+0, 0x1.601e77e2e2e72p-56}, +{0x1.26ffff52921d9p+0, 0x1.ffcbb767f0c61p-56}, +{0x1.2900014933a3cp+0, -0x1.202ca3c02412bp-56}, +{0x1.2b00014556313p+0, -0x1.2808233f21f02p-54}, +{0x1.2cfffebfe523bp+0, -0x1.8ff7e384fdcf2p-55}, +{0x1.2f0000bb8ad96p+0, -0x1.5ff51503041c5p-55}, +{0x1.30ffffb7ae2afp+0, -0x1.10071885e289dp-55}, +{0x1.32ffffeac5f7fp+0, -0x1.1ff5d3fb7b715p-54}, +{0x1.350000ca66756p+0, 0x1.57f82228b82bdp-54}, +{0x1.3700011fbf721p+0, 0x1.000bac40dd5ccp-55}, +{0x1.38ffff9592fb9p+0, -0x1.43f9d2db2a751p-54}, +{0x1.3b00004ddd242p+0, 0x1.57f6b707638e1p-55}, +{0x1.3cffff5b2c957p+0, 0x1.a023a10bf1231p-56}, +{0x1.3efffeab0b418p+0, 0x1.87f6d66b152bp-54}, +{0x1.410001532aff4p+0, 0x1.7f8375f198524p-57}, +{0x1.4300017478b29p+0, 0x1.301e672dc5143p-55}, +{0x1.44fffe795b463p+0, 0x1.9ff69b8b2895ap-55}, +{0x1.46fffe80475ep+0, -0x1.5c0b19bc2f254p-54}, +{0x1.48fffef6fc1e7p+0, 0x1.b4009f23a2a72p-54}, +{0x1.4afffe5bea704p+0, -0x1.4ffb7bf0d7d45p-54}, +{0x1.4d000171027dep+0, -0x1.9c06471dc6a3dp-54}, +{0x1.4f0000ff03ee2p+0, 0x1.77f890b85531cp-54}, +{0x1.5100012dc4bd1p+0, 0x1.004657166a436p-57}, +{0x1.530001605277ap+0, -0x1.6bfcece233209p-54}, +{0x1.54fffecdb704cp+0, -0x1.902720505a1d7p-55}, +{0x1.56fffef5f54a9p+0, 0x1.bbfe60ec96412p-54}, +{0x1.5900017e61012p+0, 0x1.87ec581afef9p-55}, +{0x1.5b00003c93e92p+0, -0x1.f41080abf0ccp-54}, +{0x1.5d0001d4919bcp+0, -0x1.8812afb254729p-54}, +{0x1.5efffe7b87a89p+0, -0x1.47eb780ed6904p-54}, +#endif +}, +#endif /* !HAVE_FAST_FMA */ +}; diff --git a/pl/math/math_config.h b/pl/math/math_config.h index 1e1d5ab..db16a3d 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -333,4 +333,21 @@ extern const struct logf_data double poly[LOGF_POLY_ORDER - 1]; /* First order coefficient is 1. */ } __logf_data HIDDEN; +/* Data for low accuracy log10 (with 1/ln(10) included in coefficients). */ +#define LOG10_TABLE_BITS 7 +#define LOG10_POLY_ORDER 6 +#define LOG10_POLY1_ORDER 10 +extern const struct log10_data +{ + double ln2hi; + double ln2lo; + double invln10; + double poly[LOG10_POLY_ORDER - 1]; /* First coefficient is 1/log(10). */ + double poly1[LOG10_POLY1_ORDER - 1]; + struct {double invc, logc;} tab[1 << LOG10_TABLE_BITS]; +#if !HAVE_FAST_FMA + struct {double chi, clo;} tab2[1 << LOG10_TABLE_BITS]; +#endif +} __log10_data HIDDEN; + #endif diff --git a/pl/math/math_err.c b/pl/math/math_err.c new file mode 100644 index 0000000..fb98361 --- /dev/null +++ b/pl/math/math_err.c @@ -0,0 +1,78 @@ +/* + * Double-precision math error handling. + * + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#if WANT_ERRNO +#include +/* NOINLINE reduces code size and avoids making math functions non-leaf + when the error handling is inlined. 
*/ +NOINLINE static double +with_errno (double y, int e) +{ + errno = e; + return y; +} +#else +#define with_errno(x, e) (x) +#endif + +/* NOINLINE reduces code size. */ +NOINLINE static double +xflow (uint32_t sign, double y) +{ + y = eval_as_double (opt_barrier_double (sign ? -y : y) * y); + return with_errno (y, ERANGE); +} + +HIDDEN double +__math_uflow (uint32_t sign) +{ + return xflow (sign, 0x1p-767); +} + +/* Underflows to zero in some non-nearest rounding mode, setting errno + is valid even if the result is non-zero, but in the subnormal range. */ +HIDDEN double +__math_may_uflow (uint32_t sign) +{ + return xflow (sign, 0x1.8p-538); +} + +HIDDEN double +__math_oflow (uint32_t sign) +{ + return xflow (sign, 0x1p769); +} + +HIDDEN double +__math_divzero (uint32_t sign) +{ + double y = opt_barrier_double (sign ? -1.0 : 1.0) / 0.0; + return with_errno (y, ERANGE); +} + +HIDDEN double +__math_invalid (double x) +{ + double y = (x - x) / (x - x); + return isnan (x) ? y : with_errno (y, EDOM); +} + +/* Check result and set errno if necessary. */ + +HIDDEN double +__math_check_uflow (double y) +{ + return y == 0.0 ? with_errno (y, ERANGE) : y; +} + +HIDDEN double +__math_check_oflow (double y) +{ + return isinf (y) ? with_errno (y, ERANGE) : y; +} diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index a48015b..63028c4 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -6,3 +6,5 @@ */ F (erff, -4.0, 4.0) F (log10f, 0.01, 11.1) + +D (log10, 0.01, 11.1) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 8f5620e..06474c8 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -44,6 +44,12 @@ t log10f 0x1p-26 0x1p3 50000 t log10f 0x1p-4 0x1p4 50000 t log10f 0 inf 50000 +L=1.15 +Ldir= +t log10 0 0xffff000000000000 10000 +t log10 0x1p-4 0x1p4 40000 +t log10 0 inf 40000 + done [ 0 -eq $FAIL ] || { diff --git a/pl/math/test/testcases/directed/log10.tst b/pl/math/test/testcases/directed/log10.tst new file mode 100644 index 0000000..a8da6a7 --- /dev/null +++ b/pl/math/test/testcases/directed/log10.tst @@ -0,0 +1,16 @@ +; log10.tst +; +; Copyright (c) 2007-2022, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=log10 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=log10 op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=log10 op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=log10 op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=log10 op1=fff02000.00000000 result=7ff80000.00000001 errno=0 status=i +func=log10 op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 +func=log10 op1=3ff00000.00000000 result=00000000.00000000 errno=0 +func=log10 op1=fff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=log10 op1=00000000.00000000 result=fff00000.00000000 errno=ERANGE status=z +func=log10 op1=80000000.00000000 result=fff00000.00000000 errno=ERANGE status=z +func=log10 op1=80000000.00000001 result=7ff80000.00000001 errno=EDOM status=i diff --git a/pl/math/test/testcases/random/double.tst b/pl/math/test/testcases/random/double.tst new file mode 100644 index 0000000..03d14d4 --- /dev/null +++ b/pl/math/test/testcases/random/double.tst @@ -0,0 +1,6 @@ +!! double.tst - Random test case specification for DP functions +!! +!! Copyright (c) 1999-2022, Arm Limited. +!! 
SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +test log10 10000 diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 7cf4576..5d14d5e 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -6,3 +6,4 @@ */ F1 (erf) F1 (log10) +D1 (log10) diff --git a/pl/math/tools/log10.sollya b/pl/math/tools/log10.sollya new file mode 100644 index 0000000..a353a20 --- /dev/null +++ b/pl/math/tools/log10.sollya @@ -0,0 +1,44 @@ +// polynomial for approximating log10(1+x) +// +// Copyright (c) 2019-2022, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 6; // poly degree +// |log10(1+x)| > 0x1p-5 outside the interval +a = -0x1.p-5; +b = 0x1.p-5; + +ln10 = evaluate(log(10),0); +invln10hi = double(1/ln10 + 0x1p21) - 0x1p21; // round away last 21 bits +invln10lo = double(1/ln10 - invln10hi); + +// find log10(1+x)/x polynomial with minimal relative error +// (minimal relative error polynomial for log10(1+x) is the same * x) +deg = deg-1; // because of /x + +// f = log(1+x)/x; using taylor series +f = 0; +for i from 0 to 60 do { f = f + (-x)^i/(i+1); }; +f = f/ln10; + +// return p that minimizes |f(x) - poly(x) - x^d*p(x)|/|f(x)| +approx = proc(poly,d) { + return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); +}; + +// first coeff is fixed, iteratively find optimal double prec coeffs +poly = invln10hi + invln10lo; +for i from 1 to deg do { + p = roundcoefficients(approx(poly,i), [|D ...|]); + poly = poly + x^i*coeff(p,0); +}; +display = hexadecimal; +print("invln10hi:", invln10hi); +print("invln10lo:", invln10lo); +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do coeff(poly,i); + +display = decimal; +print("in [",a,b,"]"); -- cgit v1.2.3 From 38fb9e7f26def75531d37982e2d4439886cffd79 Mon Sep 17 00:00:00 2001 From: Pierre Blanchard Date: Tue, 19 Apr 2022 17:54:25 +0100 Subject: pl/math: Add Vector/Neon log10f - Neon log10f uses the same approach as math/v_logf but uses coefficients associated with a polynomial approximation of log10(1+x), see pl/math/tools/v_log10f.sollya to reproduce coefficients. - Extended precision can be used to get a 1ulp variant, incurring a performance penalty. - The maximum measured ULP error is 3.31ulps. - A sollya file for scalar log10f is also provided. - Copy math/v_math.h into pl/math/ in case we need to specialize behavior. 
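
For illustration only (a hypothetical usage sketch, not part of the patch),
the vector PCS ABI name declared in mathlib.h can be called directly from
NEON code, computing one log10f per lane; float32x4_t is assumed to match
the __f32x4_t typedef on GCC/Clang for AArch64:

    #include <arm_neon.h>
    #include "mathlib.h"

    /* Apply log10f to each lane of a NEON vector.  */
    static inline float32x4_t
    log10_lanes (float32x4_t x)
    {
      return _ZGVnN4v_log10f (x);
    }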
--- pl/math/include/mathlib.h | 15 + pl/math/s_log10f_3u5.c | 6 + pl/math/test/mathbench_funcs.h | 11 + pl/math/test/runulp.sh | 44 +++ pl/math/test/ulp_funcs.h | 10 + pl/math/test/ulp_wrappers.h | 5 + pl/math/tools/log10f.sollya | 31 +- pl/math/tools/v_log10f.sollya | 45 +++ pl/math/v_log10f_3u5.c | 86 ++++++ pl/math/v_math.h | 638 +++++++++++++++++++++++++++++++++++++++++ pl/math/vn_log10f_3u5.c | 12 + 11 files changed, 883 insertions(+), 20 deletions(-) create mode 100644 pl/math/s_log10f_3u5.c create mode 100644 pl/math/tools/v_log10f.sollya create mode 100644 pl/math/v_log10f_3u5.c create mode 100644 pl/math/v_math.h create mode 100644 pl/math/vn_log10f_3u5.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 63c30e6..faf407e 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -13,6 +13,8 @@ float log10f (float); double log10 (double); +float __s_log10f (float); + #if __aarch64__ #if __GNUC__ >= 5 typedef __Float32x4_t __f32x4_t; @@ -24,6 +26,19 @@ typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; #error Unsupported compiler #endif +/* Vector functions following the base PCS. */ +__f32x4_t __v_log10f (__f32x4_t); + +#if __GNUC__ >= 9 || __clang_major__ >= 8 +#define __vpcs __attribute__((__aarch64_vector_pcs__)) + +/* Vector functions following the vector PCS. */ +__vpcs __f32x4_t __vn_log10f (__f32x4_t); + +/* Vector functions following the vector PCS using ABI names. */ +__vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t); + +#endif #endif #endif diff --git a/pl/math/s_log10f_3u5.c b/pl/math/s_log10f_3u5.c new file mode 100644 index 0000000..dc804b6 --- /dev/null +++ b/pl/math/s_log10f_3u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_log10f_3u5.c" diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 63028c4..1dd7209 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -8,3 +8,14 @@ F (erff, -4.0, 4.0) F (log10f, 0.01, 11.1) D (log10, 0.01, 11.1) + +#if WANT_VMATH +F (__s_log10f, 0.01, 11.1) +#if __aarch64__ +VF (__v_log10f, 0.01, 11.1) +#ifdef __vpcs +VNF (__vn_log10f, 0.01, 11.1) +VNF (_ZGVnN4v_log10f, 0.01, 11.1) +#endif +#endif +#endif diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 06474c8..7705258 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -23,6 +23,10 @@ t() { $emu ./ulp -r $r -e $Lt $flags "$@" && PASS=$((PASS+1)) || FAIL=$((FAIL+1)) } +check() { + $emu ./ulp -f -q "$@" #>/dev/null +} + Ldir=0.5 for r in $rmodes do @@ -52,6 +56,46 @@ t log10 0 inf 40000 done +# vector functions +Ldir=0.5 +r='n' +flags="${ULPFLAGS:--q} -f" +runs= +check __s_log10f 1 && runs=1 +runv= +check __v_log10f 1 && runv=1 +runvn= +check __vn_log10f 1 && runvn=1 + +range_log10f=' + 0 0xffff0000 10000 + 0x1p-4 0x1p4 500000 +' +# error limits +L_log10f=2.81 + +while read G F R +do + [ "$R" = 1 ] || continue + case "$G" in \#*) continue ;; esac + eval range="\${range_$G}" + eval L="\${L_$G}" + while read X + do + [ -n "$X" ] || continue + case "$X" in \#*) continue ;; esac + t $F $X + done << EOF +$range +EOF +done << EOF +# group symbol run +log10f __s_log10f $runs +log10f __v_log10f $runv +log10f __vn_log10f $runvn +log10f _ZGVnN4v_log10f $runvn +EOF + [ 0 -eq $FAIL ] || { echo "FAILED $FAIL PASSED $PASS" exit 1 diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 5d14d5e..de9285e 100644 --- 
a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -7,3 +7,13 @@ F1 (erf) F1 (log10) D1 (log10) +#if WANT_VMATH +F (__s_log10f, __s_log10f, log10, mpfr_log10, 1, 1, f1, 0) +#if __aarch64__ +F (__v_log10f, v_log10f, log10, mpfr_log10, 1, 1, f1, 1) +#ifdef __vpcs +F (__vn_log10f, vn_log10f, log10, mpfr_log10, 1, 1, f1, 1) +F (_ZGVnN4v_log10f, Z_log10f, log10, mpfr_log10, 1, 1, f1, 1) +#endif +#endif +#endif diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 38e0f63..7cdd3e8 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -24,4 +24,9 @@ static const double dv[2] = {1.0, -INFINITY}; static inline v_float argf(float x) { return (v_float){x,x,x,fv[secondcall]}; } static inline v_double argd(double x) { return (v_double){x,dv[secondcall]}; } +static float v_log10f(float x) { return __v_log10f(argf(x))[0]; } +#ifdef __vpcs +static float vn_log10f(float x) { return __vn_log10f(argf(x))[0]; } +static float Z_log10f(float x) { return _ZGVnN4v_log10f(argf(x))[0]; } +#endif #endif diff --git a/pl/math/tools/log10f.sollya b/pl/math/tools/log10f.sollya index c8cfa51..26a4a76 100644 --- a/pl/math/tools/log10f.sollya +++ b/pl/math/tools/log10f.sollya @@ -3,26 +3,20 @@ // Copyright (c) 2019-2022, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception -deg = 9; // poly degree -// |log10(1+x)| > 0x1p-4 outside the interval -a = -1/3; -b = 1/3; -//b = 2*(a+1)-1; +// Computation of log10f(1+x) will be carried out in double precision -display = hexadecimal; -print("log10(2) = ", single(log10(2))); - -ln10 = evaluate(log(10),0); -invln10 = single(1/ln10); +deg = 4; // poly degree +// [OFF; 2*OFF] is divided in 2^4 intervals with OFF~0.7 +a = -0.04375; +b = 0.04375; -// find log10(1+x)/x polynomial with minimal relative error -// (minimal relative error polynomial for log10(1+x) is the same * x) +// find log(1+x)/x polynomial with minimal relative error +// (minimal relative error polynomial for log(1+x) is the same * x) deg = deg-1; // because of /x // f = log(1+x)/x; using taylor series f = 0; for i from 0 to 60 do { f = f + (-x)^i/(i+1); }; -f = f/ln10; // return p that minimizes |f(x) - poly(x) - x^d*p(x)|/|f(x)| approx = proc(poly,d) { @@ -30,17 +24,14 @@ approx = proc(poly,d) { }; // first coeff is fixed, iteratively find optimal double prec coeffs -poly = invln10; +poly = 1; for i from 1 to deg do { - p = roundcoefficients(approx(poly,i), [|SG ...|]); + p = roundcoefficients(approx(poly,i), [|D ...|]); poly = poly + x^i*coeff(p,0); }; + display = hexadecimal; -print("invln10:", invln10); print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); print("in [",a,b,"]"); print("coeffs:"); -for i from 0 to deg do coeff(poly,i); - -display = decimal; -print("in [",a,b,"]"); +for i from 0 to deg do double(coeff(poly,i)); diff --git a/pl/math/tools/v_log10f.sollya b/pl/math/tools/v_log10f.sollya new file mode 100644 index 0000000..c24c2c9 --- /dev/null +++ b/pl/math/tools/v_log10f.sollya @@ -0,0 +1,45 @@ +// polynomial for approximating v_log10f(1+x) +// +// Copyright (c) 2019-2022, Arm Limited. 
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 9; // poly degree +// |log10(1+x)| > 0x1p-4 outside the interval +a = -1/3; +b = 1/3; + +display = hexadecimal; +print("log10(2) = ", single(log10(2))); + +ln10 = evaluate(log(10),0); +invln10 = single(1/ln10); + +// find log10(1+x)/x polynomial with minimal relative error +// (minimal relative error polynomial for log10(1+x) is the same * x) +deg = deg-1; // because of /x + +// f = log(1+x)/x; using taylor series +f = 0; +for i from 0 to 60 do { f = f + (-x)^i/(i+1); }; +f = f/ln10; + +// return p that minimizes |f(x) - poly(x) - x^d*p(x)|/|f(x)| +approx = proc(poly,d) { + return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); +}; + +// first coeff is fixed, iteratively find optimal double prec coeffs +poly = invln10; +for i from 1 to deg do { + p = roundcoefficients(approx(poly,i), [|SG ...|]); + poly = poly + x^i*coeff(p,0); +}; +display = hexadecimal; +print("invln10:", invln10); +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do single(coeff(poly,i)); + +display = decimal; +print("in [",a,b,"]"); diff --git a/pl/math/v_log10f_3u5.c b/pl/math/v_log10f_3u5.c new file mode 100644 index 0000000..e105956 --- /dev/null +++ b/pl/math/v_log10f_3u5.c @@ -0,0 +1,86 @@ +/* + * Single-precision vector log10 function. + * + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" +#if V_SUPPORTED + +static const float Poly[] = { + /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in + [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */ + -0x1.bcb79cp-3f, 0x1.2879c8p-3f, -0x1.bcd472p-4f, 0x1.6408f8p-4f, + -0x1.246f8p-4f, 0x1.f0e514p-5f, -0x1.0fc92cp-4f, 0x1.f5f76ap-5f}; +#define P8 v_f32 (Poly[7]) +#define P7 v_f32 (Poly[6]) +#define P6 v_f32 (Poly[5]) +#define P5 v_f32 (Poly[4]) +#define P4 v_f32 (Poly[3]) +#define P3 v_f32 (Poly[2]) +#define P2 v_f32 (Poly[1]) +#define P1 v_f32 (Poly[0]) + +#define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218. */ +#define Log10_2 v_f32 (0x1.344136p-2f) +#define InvLn10 v_f32 (0x1.bcb7b2p-2f) +#define Min v_u32 (0x00800000) +#define Max v_u32 (0x7f800000) +#define Mask v_u32 (0x007fffff) +#define Off v_u32 (0x3f2aaaab) /* 0.666667. */ + +VPCS_ATTR +__attribute__ ((noinline)) static v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) +{ + /* Fall back to scalar code. */ + return v_call_f32 (log10f, x, y, cmp); +} + +/* Our fast implementation of v_log10f uses a similar approach as v_logf. + With the same offset as v_logf (i.e., 2/3) it delivers about 3.3ulps with + order 9. This is more efficient than using a low order polynomial computed in + double precision. + Maximum error: 3.305ulps (nearest rounding.) + __v_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4 + want 0x1.ffe2f4p-4 -0.304916 ulp err 2.80492. */ +VPCS_ATTR +v_f32_t V_NAME (log10f) (v_f32_t x) +{ + v_f32_t n, o, p, q, r, r2, y; + v_u32_t u, cmp; + + u = v_as_u32_f32 (x); + cmp = v_cond_u32 (u - Min >= Max - Min); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + u -= Off; + n = v_to_f32_s32 (v_as_s32_u32 (u) >> 23); /* signextend. */ + u &= Mask; + u += Off; + r = v_as_f32_u32 (u) - v_f32 (1.0f); + + /* y = log10(1+r) + n*log10(2). */ + r2 = r * r; + /* (n*ln2 + r)*InvLn10 + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + + r2*(P7+r*P8))). 
*/ + o = v_fma_f32 (P8, r, P7); + p = v_fma_f32 (P6, r, P5); + q = v_fma_f32 (P4, r, P3); + y = v_fma_f32 (P2, r, P1); + p = v_fma_f32 (o, r2, p); + q = v_fma_f32 (p, r2, q); + y = v_fma_f32 (q, r2, y); + /* Using p = Log10(2)*n + r*InvLn(10) is slightly faster + but less accurate. */ + p = v_fma_f32 (Ln2, n, r); + y = v_fma_f32 (y, r2, p * InvLn10); + + if (unlikely (v_any_u32 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS +#endif diff --git a/pl/math/v_math.h b/pl/math/v_math.h new file mode 100644 index 0000000..97c3731 --- /dev/null +++ b/pl/math/v_math.h @@ -0,0 +1,638 @@ +/* + * Vector math abstractions. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef _V_MATH_H +#define _V_MATH_H + +#ifndef WANT_VMATH +/* Enable the build of vector math code. */ +# define WANT_VMATH 1 +#endif +#if WANT_VMATH + +/* The goal of this header is to allow vector (only Neon for now) + and scalar build of the same algorithm. */ + +#if SCALAR +#define V_NAME(x) __s_##x +#elif VPCS && __aarch64__ +#define V_NAME(x) __vn_##x +#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs)) +#else +#define V_NAME(x) __v_##x +#endif + +#ifndef VPCS_ATTR +#define VPCS_ATTR +#endif +#ifndef VPCS_ALIAS +#define VPCS_ALIAS +#endif + +#include +#include "math_config.h" + +typedef float f32_t; +typedef uint32_t u32_t; +typedef int32_t s32_t; +typedef double f64_t; +typedef uint64_t u64_t; +typedef int64_t s64_t; + +/* reinterpret as type1 from type2. */ +static inline u32_t +as_u32_f32 (f32_t x) +{ + union { f32_t f; u32_t u; } r = {x}; + return r.u; +} +static inline f32_t +as_f32_u32 (u32_t x) +{ + union { u32_t u; f32_t f; } r = {x}; + return r.f; +} +static inline s32_t +as_s32_u32 (u32_t x) +{ + union { u32_t u; s32_t i; } r = {x}; + return r.i; +} +static inline u32_t +as_u32_s32 (s32_t x) +{ + union { s32_t i; u32_t u; } r = {x}; + return r.u; +} +static inline u64_t +as_u64_f64 (f64_t x) +{ + union { f64_t f; u64_t u; } r = {x}; + return r.u; +} +static inline f64_t +as_f64_u64 (u64_t x) +{ + union { u64_t u; f64_t f; } r = {x}; + return r.f; +} +static inline s64_t +as_s64_u64 (u64_t x) +{ + union { u64_t u; s64_t i; } r = {x}; + return r.i; +} +static inline u64_t +as_u64_s64 (s64_t x) +{ + union { s64_t i; u64_t u; } r = {x}; + return r.u; +} + +#if SCALAR +#define V_SUPPORTED 1 +typedef f32_t v_f32_t; +typedef u32_t v_u32_t; +typedef s32_t v_s32_t; +typedef f64_t v_f64_t; +typedef u64_t v_u64_t; +typedef s64_t v_s64_t; + +static inline int +v_lanes32 (void) +{ + return 1; +} + +static inline v_f32_t +v_f32 (f32_t x) +{ + return x; +} +static inline v_u32_t +v_u32 (u32_t x) +{ + return x; +} +static inline v_s32_t +v_s32 (s32_t x) +{ + return x; +} + +static inline f32_t +v_get_f32 (v_f32_t x, int i) +{ + return x; +} +static inline u32_t +v_get_u32 (v_u32_t x, int i) +{ + return x; +} +static inline s32_t +v_get_s32 (v_s32_t x, int i) +{ + return x; +} + +static inline void +v_set_f32 (v_f32_t *x, int i, f32_t v) +{ + *x = v; +} +static inline void +v_set_u32 (v_u32_t *x, int i, u32_t v) +{ + *x = v; +} +static inline void +v_set_s32 (v_s32_t *x, int i, s32_t v) +{ + *x = v; +} + +/* true if any elements of a v_cond result is non-zero. */ +static inline int +v_any_u32 (v_u32_t x) +{ + return x != 0; +} +/* to wrap the result of relational operators. */ +static inline v_u32_t +v_cond_u32 (v_u32_t x) +{ + return x ? 
-1 : 0; +} +static inline v_f32_t +v_abs_f32 (v_f32_t x) +{ + return __builtin_fabsf (x); +} +static inline v_f32_t +v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) +{ + return __builtin_fmaf (x, y, z); +} +static inline v_f32_t +v_round_f32 (v_f32_t x) +{ + return __builtin_roundf (x); +} +static inline v_s32_t +v_round_s32 (v_f32_t x) +{ + return __builtin_lroundf (x); /* relies on -fno-math-errno. */ +} +/* convert to type1 from type2. */ +static inline v_f32_t +v_to_f32_s32 (v_s32_t x) +{ + return x; +} +static inline v_f32_t +v_to_f32_u32 (v_u32_t x) +{ + return x; +} +/* reinterpret as type1 from type2. */ +static inline v_u32_t +v_as_u32_f32 (v_f32_t x) +{ + union { v_f32_t f; v_u32_t u; } r = {x}; + return r.u; +} +static inline v_f32_t +v_as_f32_u32 (v_u32_t x) +{ + union { v_u32_t u; v_f32_t f; } r = {x}; + return r.f; +} +static inline v_s32_t +v_as_s32_u32 (v_u32_t x) +{ + union { v_u32_t u; v_s32_t i; } r = {x}; + return r.i; +} +static inline v_u32_t +v_as_u32_s32 (v_s32_t x) +{ + union { v_s32_t i; v_u32_t u; } r = {x}; + return r.u; +} +static inline v_f32_t +v_lookup_f32 (const f32_t *tab, v_u32_t idx) +{ + return tab[idx]; +} +static inline v_u32_t +v_lookup_u32 (const u32_t *tab, v_u32_t idx) +{ + return tab[idx]; +} +static inline v_f32_t +v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p) +{ + return f (x); +} +static inline v_f32_t +v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, + v_u32_t p) +{ + return f (x1, x2); +} + +static inline int +v_lanes64 (void) +{ + return 1; +} +static inline v_f64_t +v_f64 (f64_t x) +{ + return x; +} +static inline v_u64_t +v_u64 (u64_t x) +{ + return x; +} +static inline v_s64_t +v_s64 (s64_t x) +{ + return x; +} +static inline f64_t +v_get_f64 (v_f64_t x, int i) +{ + return x; +} +static inline void +v_set_f64 (v_f64_t *x, int i, f64_t v) +{ + *x = v; +} +/* true if any elements of a v_cond result is non-zero. */ +static inline int +v_any_u64 (v_u64_t x) +{ + return x != 0; +} +/* to wrap the result of relational operators. */ +static inline v_u64_t +v_cond_u64 (v_u64_t x) +{ + return x ? -1 : 0; +} +static inline v_f64_t +v_abs_f64 (v_f64_t x) +{ + return __builtin_fabs (x); +} +static inline v_f64_t +v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) +{ + return __builtin_fma (x, y, z); +} +static inline v_f64_t +v_round_f64 (v_f64_t x) +{ + return __builtin_round (x); +} +static inline v_s64_t +v_round_s64 (v_f64_t x) +{ + return __builtin_lround (x); /* relies on -fno-math-errno. */ +} +/* convert to type1 from type2. */ +static inline v_f64_t +v_to_f64_s64 (v_s64_t x) +{ + return x; +} +static inline v_f64_t +v_to_f64_u64 (v_u64_t x) +{ + return x; +} +/* reinterpret as type1 from type2. 
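+   (A union copy is used throughout this header rather than pointer
+   casts, keeping the reinterpretation well-defined under the
+   compilers' type-punning rules.)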
*/ +static inline v_u64_t +v_as_u64_f64 (v_f64_t x) +{ + union { v_f64_t f; v_u64_t u; } r = {x}; + return r.u; +} +static inline v_f64_t +v_as_f64_u64 (v_u64_t x) +{ + union { v_u64_t u; v_f64_t f; } r = {x}; + return r.f; +} +static inline v_s64_t +v_as_s64_u64 (v_u64_t x) +{ + union { v_u64_t u; v_s64_t i; } r = {x}; + return r.i; +} +static inline v_u64_t +v_as_u64_s64 (v_s64_t x) +{ + union { v_s64_t i; v_u64_t u; } r = {x}; + return r.u; +} +static inline v_f64_t +v_lookup_f64 (const f64_t *tab, v_u64_t idx) +{ + return tab[idx]; +} +static inline v_u64_t +v_lookup_u64 (const u64_t *tab, v_u64_t idx) +{ + return tab[idx]; +} +static inline v_f64_t +v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p) +{ + return f (x); +} + +#elif __aarch64__ +#define V_SUPPORTED 1 +#include +typedef float32x4_t v_f32_t; +typedef uint32x4_t v_u32_t; +typedef int32x4_t v_s32_t; +typedef float64x2_t v_f64_t; +typedef uint64x2_t v_u64_t; +typedef int64x2_t v_s64_t; + +static inline int +v_lanes32 (void) +{ + return 4; +} + +static inline v_f32_t +v_f32 (f32_t x) +{ + return (v_f32_t){x, x, x, x}; +} +static inline v_u32_t +v_u32 (u32_t x) +{ + return (v_u32_t){x, x, x, x}; +} +static inline v_s32_t +v_s32 (s32_t x) +{ + return (v_s32_t){x, x, x, x}; +} + +static inline f32_t +v_get_f32 (v_f32_t x, int i) +{ + return x[i]; +} +static inline u32_t +v_get_u32 (v_u32_t x, int i) +{ + return x[i]; +} +static inline s32_t +v_get_s32 (v_s32_t x, int i) +{ + return x[i]; +} + +static inline void +v_set_f32 (v_f32_t *x, int i, f32_t v) +{ + (*x)[i] = v; +} +static inline void +v_set_u32 (v_u32_t *x, int i, u32_t v) +{ + (*x)[i] = v; +} +static inline void +v_set_s32 (v_s32_t *x, int i, s32_t v) +{ + (*x)[i] = v; +} + +/* true if any elements of a v_cond result is non-zero. */ +static inline int +v_any_u32 (v_u32_t x) +{ + /* assume elements in x are either 0 or -1u. */ + return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0; +} +/* to wrap the result of relational operators. */ +static inline v_u32_t +v_cond_u32 (v_u32_t x) +{ + return x; +} +static inline v_f32_t +v_abs_f32 (v_f32_t x) +{ + return vabsq_f32 (x); +} +static inline v_f32_t +v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) +{ + return vfmaq_f32 (z, x, y); +} +static inline v_f32_t +v_round_f32 (v_f32_t x) +{ + return vrndaq_f32 (x); +} +static inline v_s32_t +v_round_s32 (v_f32_t x) +{ + return vcvtaq_s32_f32 (x); +} +/* convert to type1 from type2. */ +static inline v_f32_t +v_to_f32_s32 (v_s32_t x) +{ + return (v_f32_t){x[0], x[1], x[2], x[3]}; +} +static inline v_f32_t +v_to_f32_u32 (v_u32_t x) +{ + return (v_f32_t){x[0], x[1], x[2], x[3]}; +} +/* reinterpret as type1 from type2. */ +static inline v_u32_t +v_as_u32_f32 (v_f32_t x) +{ + union { v_f32_t f; v_u32_t u; } r = {x}; + return r.u; +} +static inline v_f32_t +v_as_f32_u32 (v_u32_t x) +{ + union { v_u32_t u; v_f32_t f; } r = {x}; + return r.f; +} +static inline v_s32_t +v_as_s32_u32 (v_u32_t x) +{ + union { v_u32_t u; v_s32_t i; } r = {x}; + return r.i; +} +static inline v_u32_t +v_as_u32_s32 (v_s32_t x) +{ + union { v_s32_t i; v_u32_t u; } r = {x}; + return r.u; +} +static inline v_f32_t +v_lookup_f32 (const f32_t *tab, v_u32_t idx) +{ + return (v_f32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; +} +static inline v_u32_t +v_lookup_u32 (const u32_t *tab, v_u32_t idx) +{ + return (v_u32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; +} +static inline v_f32_t +v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p) +{ + return (v_f32_t){p[0] ? 
f (x[0]) : y[0], p[1] ? f (x[1]) : y[1], + p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]}; +} +static inline v_f32_t +v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, + v_u32_t p) +{ + return ( + v_f32_t){p[0] ? f (x1[0], x2[0]) : y[0], p[1] ? f (x1[1], x2[1]) : y[1], + p[2] ? f (x1[2], x2[2]) : y[2], p[3] ? f (x1[3], x2[3]) : y[3]}; +} + +static inline int +v_lanes64 (void) +{ + return 2; +} +static inline v_f64_t +v_f64 (f64_t x) +{ + return (v_f64_t){x, x}; +} +static inline v_u64_t +v_u64 (u64_t x) +{ + return (v_u64_t){x, x}; +} +static inline v_s64_t +v_s64 (s64_t x) +{ + return (v_s64_t){x, x}; +} +static inline f64_t +v_get_f64 (v_f64_t x, int i) +{ + return x[i]; +} +static inline void +v_set_f64 (v_f64_t *x, int i, f64_t v) +{ + (*x)[i] = v; +} +/* true if any elements of a v_cond result is non-zero. */ +static inline int +v_any_u64 (v_u64_t x) +{ + /* assume elements in x are either 0 or -1u. */ + return vpaddd_u64 (x) != 0; +} +/* to wrap the result of relational operators. */ +static inline v_u64_t +v_cond_u64 (v_u64_t x) +{ + return x; +} +static inline v_f64_t +v_abs_f64 (v_f64_t x) +{ + return vabsq_f64 (x); +} +static inline v_f64_t +v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) +{ + return vfmaq_f64 (z, x, y); +} +static inline v_f64_t +v_round_f64 (v_f64_t x) +{ + return vrndaq_f64 (x); +} +static inline v_s64_t +v_round_s64 (v_f64_t x) +{ + return vcvtaq_s64_f64 (x); +} +/* convert to type1 from type2. */ +static inline v_f64_t +v_to_f64_s64 (v_s64_t x) +{ + return (v_f64_t){x[0], x[1]}; +} +static inline v_f64_t +v_to_f64_u64 (v_u64_t x) +{ + return (v_f64_t){x[0], x[1]}; +} +/* reinterpret as type1 from type2. */ +static inline v_u64_t +v_as_u64_f64 (v_f64_t x) +{ + union { v_f64_t f; v_u64_t u; } r = {x}; + return r.u; +} +static inline v_f64_t +v_as_f64_u64 (v_u64_t x) +{ + union { v_u64_t u; v_f64_t f; } r = {x}; + return r.f; +} +static inline v_s64_t +v_as_s64_u64 (v_u64_t x) +{ + union { v_u64_t u; v_s64_t i; } r = {x}; + return r.i; +} +static inline v_u64_t +v_as_u64_s64 (v_s64_t x) +{ + union { v_s64_t i; v_u64_t u; } r = {x}; + return r.u; +} +static inline v_f64_t +v_lookup_f64 (const f64_t *tab, v_u64_t idx) +{ + return (v_f64_t){tab[idx[0]], tab[idx[1]]}; +} +static inline v_u64_t +v_lookup_u64 (const u64_t *tab, v_u64_t idx) +{ + return (v_u64_t){tab[idx[0]], tab[idx[1]]}; +} +static inline v_f64_t +v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p) +{ + return (v_f64_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1]}; +} +#endif + +#endif +#endif diff --git a/pl/math/vn_log10f_3u5.c b/pl/math/vn_log10f_3u5.c new file mode 100644 index 0000000..b419d0a --- /dev/null +++ b/pl/math/vn_log10f_3u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_log10f. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_log10f, _ZGVnN4v_log10f) +#include "v_log10f_3u5.c" +#endif -- cgit v1.2.3 From 4b77ad76dd0ff3c70119b1a47b4b8ea659e04a83 Mon Sep 17 00:00:00 2001 From: Pierre Blanchard Date: Tue, 19 Apr 2022 18:30:38 +0100 Subject: pl/math: Add Vector/Neon log10 - Neon log10 is a slight modification of math/v_log, where log10(c) values have been computed by scaling ln(c) values. - Coefficients are first computed for log, then scaled by 1/log(10) then rounded to double precision. - The maximum measured ULP error is 2.5ulps. 
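Once built, the new entry points can be called directly through the Neon types. A minimal caller sketch, assuming an AArch64 toolchain with vector-PCS support and linking against the built library; the prototype mirrors the mathlib.h declarations below:

#include <arm_neon.h>
#include <stdio.h>

/* Vector-PCS ABI name, as declared in pl/math/include/mathlib.h.  */
__attribute__ ((aarch64_vector_pcs)) float64x2_t _ZGVnN2v_log10 (float64x2_t);

int
main (void)
{
  /* One call computes two lanes: log10(100) = 2, log10(1000) = 3.  */
  float64x2_t x = {100.0, 1000.0};
  float64x2_t y = _ZGVnN2v_log10 (x);
  printf ("%g %g\n", vgetq_lane_f64 (y, 0), vgetq_lane_f64 (y, 1));
  return 0;
}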
--- pl/math/include/mathlib.h | 4 ++ pl/math/s_log10_2u5.c | 6 ++ pl/math/test/mathbench_funcs.h | 5 ++ pl/math/test/runulp.sh | 12 ++++ pl/math/test/ulp_funcs.h | 4 ++ pl/math/test/ulp_wrappers.h | 4 ++ pl/math/tools/v_log10.sollya | 38 ++++++++++ pl/math/v_log10.h | 19 +++++ pl/math/v_log10_2u5.c | 116 ++++++++++++++++++++++++++++++ pl/math/v_log10_data.c | 157 +++++++++++++++++++++++++++++++++++++++++ pl/math/vn_log10_2u5.c | 12 ++++ 11 files changed, 377 insertions(+) create mode 100644 pl/math/s_log10_2u5.c create mode 100644 pl/math/tools/v_log10.sollya create mode 100644 pl/math/v_log10.h create mode 100644 pl/math/v_log10_2u5.c create mode 100644 pl/math/v_log10_data.c create mode 100644 pl/math/vn_log10_2u5.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index faf407e..df1f884 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -14,6 +14,7 @@ float log10f (float); double log10 (double); float __s_log10f (float); +double __s_log10 (double); #if __aarch64__ #if __GNUC__ >= 5 @@ -28,15 +29,18 @@ typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; /* Vector functions following the base PCS. */ __f32x4_t __v_log10f (__f32x4_t); +__f64x2_t __v_log10 (__f64x2_t); #if __GNUC__ >= 9 || __clang_major__ >= 8 #define __vpcs __attribute__((__aarch64_vector_pcs__)) /* Vector functions following the vector PCS. */ __vpcs __f32x4_t __vn_log10f (__f32x4_t); +__vpcs __f64x2_t __vn_log10 (__f64x2_t); /* Vector functions following the vector PCS using ABI names. */ __vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t); #endif #endif diff --git a/pl/math/s_log10_2u5.c b/pl/math/s_log10_2u5.c new file mode 100644 index 0000000..ad7f50b --- /dev/null +++ b/pl/math/s_log10_2u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_log10_2u5.c" diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 1dd7209..8b70d47 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -11,11 +11,16 @@ D (log10, 0.01, 11.1) #if WANT_VMATH F (__s_log10f, 0.01, 11.1) +D (__s_log10, 0.01, 11.1) #if __aarch64__ +VD (__v_log10, 0.01, 11.1) VF (__v_log10f, 0.01, 11.1) #ifdef __vpcs VNF (__vn_log10f, 0.01, 11.1) VNF (_ZGVnN4v_log10f, 0.01, 11.1) + +VND (__vn_log10, 0.01, 11.1) +VND (_ZGVnN2v_log10, 0.01, 11.1) #endif #endif #endif diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 7705258..c2e8455 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -67,11 +67,18 @@ check __v_log10f 1 && runv=1 runvn= check __vn_log10f 1 && runvn=1 +range_log10=' + 0 0xffff000000000000 10000 + 0x1p-4 0x1p4 400000 + 0 inf 400000 +' + range_log10f=' 0 0xffff0000 10000 0x1p-4 0x1p4 500000 ' # error limits +L_log10=1.16 L_log10f=2.81 while read G F R @@ -90,6 +97,11 @@ $range EOF done << EOF # group symbol run +log10 __s_log10 $runs +log10 __v_log10 $runv +log10 __vn_log10 $runvn +log10 _ZGVnN2v_log10 $runvn + log10f __s_log10f $runs log10f __v_log10f $runv log10f __vn_log10f $runvn diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index de9285e..dd1837e 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -9,11 +9,15 @@ F1 (log10) D1 (log10) #if WANT_VMATH F (__s_log10f, __s_log10f, log10, mpfr_log10, 1, 1, f1, 0) +F (__s_log10, __s_log10, log10l, mpfr_log10, 1, 0, d1, 0) #if __aarch64__ F (__v_log10f, v_log10f, log10, mpfr_log10, 1, 1, f1, 1) +F (__v_log10, v_log10, log10l, mpfr_log10, 1, 0, d1, 1) #ifdef __vpcs F (__vn_log10f, vn_log10f, log10, mpfr_log10, 1, 1, f1, 1) +F (__vn_log10, vn_log10, log10l, mpfr_log10, 1, 0, d1, 1) F (_ZGVnN4v_log10f, Z_log10f, log10, mpfr_log10, 1, 1, f1, 1) +F (_ZGVnN2v_log10, Z_log10, log10l, mpfr_log10, 1, 0, d1, 1) #endif #endif #endif diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 7cdd3e8..1386fbd 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -25,8 +25,12 @@ static inline v_float argf(float x) { return (v_float){x,x,x,fv[secondcall]}; } static inline v_double argd(double x) { return (v_double){x,dv[secondcall]}; } static float v_log10f(float x) { return __v_log10f(argf(x))[0]; } +static double v_log10(double x) { return __v_log10(argd(x))[0]; } #ifdef __vpcs static float vn_log10f(float x) { return __vn_log10f(argf(x))[0]; } +static double vn_log10(double x) { return __vn_log10(argd(x))[0]; } + static float Z_log10f(float x) { return _ZGVnN4v_log10f(argf(x))[0]; } +static double Z_log10(double x) { return _ZGVnN2v_log10(argd(x))[0]; } #endif #endif diff --git a/pl/math/tools/v_log10.sollya b/pl/math/tools/v_log10.sollya new file mode 100644 index 0000000..76c1648 --- /dev/null +++ b/pl/math/tools/v_log10.sollya @@ -0,0 +1,38 @@ +// polynomial used for __v_log10(x) +// +// Copyright (c) 2019-2022, Arm Limited. 
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 6; // poly degree +a = -0x1.fc1p-9; +b = 0x1.009p-8; + +// find log(1+x)/x polynomial with minimal relative error +// (minimal relative error polynomial for log(1+x) is the same * x) +deg = deg-1; // because of /x + +// f = log(1+x)/x; using taylor series +f = 0; +for i from 0 to 60 do { f = f + (-x)^i/(i+1); }; + +// return p that minimizes |f(x) - poly(x) - x^d*p(x)|/|f(x)| +approx = proc(poly,d) { + return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); +}; + +// first coeff is fixed, iteratively find optimal double prec coeffs +poly = 1; +for i from 1 to deg do { + p = roundcoefficients(approx(poly,i), [|D ...|]); + poly = poly + x^i*coeff(p,0); +}; + +// scale coefficients by 1/ln(10) +ln10 = evaluate(log(10),0); +poly = poly/ln10; + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do double(coeff(poly,i)); diff --git a/pl/math/v_log10.h b/pl/math/v_log10.h new file mode 100644 index 0000000..8564911 --- /dev/null +++ b/pl/math/v_log10.h @@ -0,0 +1,19 @@ +/* + * Declarations for double-precision log10(x) vector function. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#if WANT_VMATH + +#define V_LOG10_TABLE_BITS 7 + +extern const struct v_log10_data +{ + f64_t invc; + f64_t log10c; +} __v_log10_data[1 << V_LOG10_TABLE_BITS] HIDDEN; + +#endif diff --git a/pl/math/v_log10_2u5.c b/pl/math/v_log10_2u5.c new file mode 100644 index 0000000..64a2b50 --- /dev/null +++ b/pl/math/v_log10_2u5.c @@ -0,0 +1,116 @@ +/* + * Double-precision vector log10(x) function. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_log10.h" +#include "include/mathlib.h" +#include "v_math.h" +#if V_SUPPORTED + +/* Constants used to switch from base e to base 10. */ +#define ivln10 v_f64 (0x1.bcb7b1526e50ep-2) +#define log10_2 v_f64 (0x1.34413509f79ffp-2) + +static const f64_t Poly[] = { + /* computed from log coeffs divided by log(10) in extended precision then + rounded to double precision. */ + -0x1.bcb7b1526e506p-3, 0x1.287a7636be1d1p-3, -0x1.bcb7b158af938p-4, + 0x1.63c78734e6d07p-4, -0x1.287461742fee4p-4, +}; + +#define A0 v_f64 (Poly[0]) +#define A1 v_f64 (Poly[1]) +#define A2 v_f64 (Poly[2]) +#define A3 v_f64 (Poly[3]) +#define A4 v_f64 (Poly[4]) +#define Ln2 v_f64 (0x1.62e42fefa39efp-1) +#define N (1 << V_LOG10_TABLE_BITS) +#define OFF v_u64 (0x3fe6900900000000) + +struct entry +{ + v_f64_t invc; + v_f64_t log10c; +}; + +static inline struct entry +lookup (v_u64_t i) +{ + struct entry e; +#ifdef SCALAR + e.invc = __v_log10_data[i].invc; + e.log10c = __v_log10_data[i].log10c; +#else + e.invc[0] = __v_log10_data[i[0]].invc; + e.log10c[0] = __v_log10_data[i[0]].log10c; + e.invc[1] = __v_log10_data[i[1]].invc; + e.log10c[1] = __v_log10_data[i[1]].log10c; +#endif + return e; +} + +VPCS_ATTR +__attribute__ ((noinline)) static v_f64_t +specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) +{ + return v_call_f64 (log10, x, y, cmp); +} + +/* Our implementation of v_log10 is a slight modification of v_log (1.660ulps). + Max ULP error: < 2.5 ulp (nearest rounding.) + Maximum measured at 2.46 ulp for x in [0.96, 0.97] + __v_log10(0x1.13192407fcb46p+0) got 0x1.fff6be3cae4bbp-6 + want 0x1.fff6be3cae4b9p-6 + -0.459999 ulp err 1.96. 
*/ +VPCS_ATTR +v_f64_t V_NAME (log10) (v_f64_t x) +{ + v_f64_t z, r, r2, p, y, kd, hi; + v_u64_t ix, iz, tmp, top, i, cmp; + v_s64_t k; + struct entry e; + + ix = v_as_u64_f64 (x); + top = ix >> 48; + cmp = v_cond_u64 (top - v_u64 (0x0010) >= v_u64 (0x7ff0 - 0x0010)); + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = ix - OFF; + i = (tmp >> (52 - V_LOG10_TABLE_BITS)) % N; + k = v_as_s64_u64 (tmp) >> 52; /* arithmetic shift. */ + iz = ix - (tmp & v_u64 (0xfffULL << 52)); + z = v_as_f64_u64 (iz); + e = lookup (i); + + /* log10(x) = log1p(z/c-1)/log(10) + log10(c) + k*log10(2). */ + r = v_fma_f64 (z, e.invc, v_f64 (-1.0)); + kd = v_to_f64_s64 (k); + + /* hi = r / log(10) + log10(c) + k*log10(2). + Constants in `v_log10_data.c` are computed (in extended precision) as + e.log10c := e.logc * ivln10. */ + v_f64_t w = v_fma_f64 (r, ivln10, e.log10c); + + /* y = log10(1+r) + n * log10(2). */ + hi = v_fma_f64 (kd, log10_2, w); + + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ + r2 = r * r; + y = v_fma_f64 (A3, r, A2); + p = v_fma_f64 (A1, r, A0); + y = v_fma_f64 (A4, r2, y); + y = v_fma_f64 (y, r2, p); + y = v_fma_f64 (y, r2, hi); + + if (unlikely (v_any_u64 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS + +#endif diff --git a/pl/math/v_log10_data.c b/pl/math/v_log10_data.c new file mode 100644 index 0000000..7fdb519 --- /dev/null +++ b/pl/math/v_log10_data.c @@ -0,0 +1,157 @@ +/* + * Lookup table for double-precision log10(x) vector function. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_log10.h" + +#define N (1 << V_LOG10_TABLE_BITS) + +/* Algorithm: + + x = 2^k z + log10(x) = k log10(2) + log10(c) + poly(z/c - 1) / log(10) + +where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,N=128) +and log(c) and 1/c for the ith subinterval comes from a lookup table: + + tab[i].invc = 1/c + tab[i].log10c = (double)log10(c) + +where c is near the center of the subinterval and is chosen by trying several +floating point invc candidates around 1/center and selecting one for which +the error in (double)log(c) is minimized (< 0x1p-74), except the subinterval +that contains 1 and the previous one got tweaked to avoid cancellation. +NB: invc should be optimized to minimize error in (double)log10(c) instead. 
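+
+As a worked instance of the formulas above: 5 = 2^2 * 1.25 with
+z = 1.25 in [a;2a), so log10(5) is evaluated as
+2*log10(2) + tab[i].log10c + poly(1.25*tab[i].invc - 1)/log(10)
+for the subinterval i that contains 1.25.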
*/ +const struct v_log10_data __v_log10_data[N] = { + {0x1.6a133d0dec120p+0, -0x1.345825f221684p-3}, + {0x1.6815f2f3e42edp+0, -0x1.2f71a1f0c554ep-3}, + {0x1.661e39be1ac9ep+0, -0x1.2a91fdb30b1f4p-3}, + {0x1.642bfa30ac371p+0, -0x1.25b9260981a04p-3}, + {0x1.623f1d916f323p+0, -0x1.20e7081762193p-3}, + {0x1.60578da220f65p+0, -0x1.1c1b914aeefacp-3}, + {0x1.5e75349dea571p+0, -0x1.1756af5de404dp-3}, + {0x1.5c97fd387a75ap+0, -0x1.12985059c90bfp-3}, + {0x1.5abfd2981f200p+0, -0x1.0de0628f63df4p-3}, + {0x1.58eca051dc99cp+0, -0x1.092ed492e08eep-3}, + {0x1.571e526d9df12p+0, -0x1.0483954caf1dfp-3}, + {0x1.5554d555b3fcbp+0, -0x1.ffbd27a9adbcp-4}, + {0x1.539015e2a20cdp+0, -0x1.f67f7f2e3d1ap-4}, + {0x1.51d0014ee0164p+0, -0x1.ed4e1071ceebep-4}, + {0x1.50148538cd9eep+0, -0x1.e428bb47413c4p-4}, + {0x1.4e5d8f9f698a1p+0, -0x1.db0f6003028d6p-4}, + {0x1.4cab0edca66bep+0, -0x1.d201df6749831p-4}, + {0x1.4afcf1a9db874p+0, -0x1.c9001ac5c9672p-4}, + {0x1.495327136e16fp+0, -0x1.c009f3c78c79p-4}, + {0x1.47ad9e84af28fp+0, -0x1.b71f4cb642e53p-4}, + {0x1.460c47b39ae15p+0, -0x1.ae400818526b2p-4}, + {0x1.446f12b278001p+0, -0x1.a56c091954f87p-4}, + {0x1.42d5efdd720ecp+0, -0x1.9ca3332f096eep-4}, + {0x1.4140cfe001a0fp+0, -0x1.93e56a3f23e55p-4}, + {0x1.3fafa3b421f69p+0, -0x1.8b3292a3903bp-4}, + {0x1.3e225c9c8ece5p+0, -0x1.828a9112d9618p-4}, + {0x1.3c98ec29a211ap+0, -0x1.79ed4ac35f5acp-4}, + {0x1.3b13442a413fep+0, -0x1.715aa51ed28c4p-4}, + {0x1.399156baa3c54p+0, -0x1.68d2861c999e9p-4}, + {0x1.38131639b4cdbp+0, -0x1.6054d40ded21p-4}, + {0x1.36987540fbf53p+0, -0x1.57e17576bc9a2p-4}, + {0x1.352166b648f61p+0, -0x1.4f7851798bb0bp-4}, + {0x1.33adddb3eb575p+0, -0x1.47194f5690ae3p-4}, + {0x1.323dcd99fc1d3p+0, -0x1.3ec456d58ec47p-4}, + {0x1.30d129fefc7d2p+0, -0x1.36794ff3e5f55p-4}, + {0x1.2f67e6b72fe7dp+0, -0x1.2e382315725e4p-4}, + {0x1.2e01f7cf8b187p+0, -0x1.2600b8ed82e91p-4}, + {0x1.2c9f518ddc86ep+0, -0x1.1dd2fa85efc12p-4}, + {0x1.2b3fe86e5f413p+0, -0x1.15aed136e3961p-4}, + {0x1.29e3b1211b25cp+0, -0x1.0d94269d1a30dp-4}, + {0x1.288aa08b373cfp+0, -0x1.0582e4a7659f5p-4}, + {0x1.2734abcaa8467p+0, -0x1.faf5eb655742dp-5}, + {0x1.25e1c82459b81p+0, -0x1.eaf888487e8eep-5}, + {0x1.2491eb1ad59c5p+0, -0x1.db0d75ef25a82p-5}, + {0x1.23450a54048b5p+0, -0x1.cb348a49e6431p-5}, + {0x1.21fb1bb09e578p+0, -0x1.bb6d9c69acdd8p-5}, + {0x1.20b415346d8f7p+0, -0x1.abb88368aa7ap-5}, + {0x1.1f6fed179a1acp+0, -0x1.9c1517476af14p-5}, + {0x1.1e2e99b93c7b3p+0, -0x1.8c833051bfa4dp-5}, + {0x1.1cf011a7a882ap+0, -0x1.7d02a78e7fb31p-5}, + {0x1.1bb44b97dba5ap+0, -0x1.6d93565e97c5fp-5}, + {0x1.1a7b3e66cdd4fp+0, -0x1.5e351695db0c5p-5}, + {0x1.1944e11dc56cdp+0, -0x1.4ee7c2ba67adcp-5}, + {0x1.18112aebb1a6ep+0, -0x1.3fab35ba16c01p-5}, + {0x1.16e013231b7e9p+0, -0x1.307f4ad854bc9p-5}, + {0x1.15b1913f156cfp+0, -0x1.2163ddf4f988cp-5}, + {0x1.14859cdedde13p+0, -0x1.1258cb5d19e22p-5}, + {0x1.135c2dc68cfa4p+0, -0x1.035defdba3188p-5}, + {0x1.12353bdb01684p+0, -0x1.e8e651191bce4p-6}, + {0x1.1110bf25b85b4p+0, -0x1.cb30a62be444cp-6}, + {0x1.0feeafd2f8577p+0, -0x1.ad9a9b3043823p-6}, + {0x1.0ecf062c51c3bp+0, -0x1.9023ecda1ccdep-6}, + {0x1.0db1baa076c8bp+0, -0x1.72cc592bd82dp-6}, + {0x1.0c96c5bb3048ep+0, -0x1.55939eb1f9c6ep-6}, + {0x1.0b7e20263e070p+0, -0x1.38797ca6cc5ap-6}, + {0x1.0a67c2acd0ce3p+0, -0x1.1b7db35c2c072p-6}, + {0x1.0953a6391e982p+0, -0x1.fd400812ee9a2p-7}, + {0x1.0841c3caea380p+0, -0x1.c3c05fb4620f1p-7}, + {0x1.07321489b13eap+0, -0x1.8a7bf3c40e2e3p-7}, + {0x1.062491aee9904p+0, -0x1.517249c15a75cp-7}, + {0x1.05193497a7cc5p+0, -0x1.18a2ea5330c91p-7}, + {0x1.040ff6b5f5e9fp+0, 
-0x1.c01abc8cdc4e2p-8}, + {0x1.0308d19aa6127p+0, -0x1.4f6261750dec9p-8}, + {0x1.0203beedb0c67p+0, -0x1.be37b6612afa7p-9}, + {0x1.010037d38bcc2p+0, -0x1.bc3a8398ac26p-10}, + {1.0, 0.0}, + {0x1.fc06d493cca10p-1, 0x1.bb796219f30a5p-9}, + {0x1.f81e6ac3b918fp-1, 0x1.b984fdcba61cep-8}, + {0x1.f44546ef18996p-1, 0x1.49cf12adf8e8cp-7}, + {0x1.f07b10382c84bp-1, 0x1.b6075b5217083p-7}, + {0x1.ecbf7070e59d4p-1, 0x1.10b7466fc30ddp-6}, + {0x1.e91213f715939p-1, 0x1.4603e4db6a3a1p-6}, + {0x1.e572a9a75f7b7p-1, 0x1.7aeb10e99e105p-6}, + {0x1.e1e0e2c530207p-1, 0x1.af6e49b0f0e36p-6}, + {0x1.de5c72d8a8be3p-1, 0x1.e38f064f41179p-6}, + {0x1.dae50fa5658ccp-1, 0x1.0ba75abbb7623p-5}, + {0x1.d77a71145a2dap-1, 0x1.25575ee2dba86p-5}, + {0x1.d41c51166623ep-1, 0x1.3ed83f477f946p-5}, + {0x1.d0ca6ba0bb29fp-1, 0x1.582aa79af60efp-5}, + {0x1.cd847e8e59681p-1, 0x1.714f400fa83aep-5}, + {0x1.ca4a499693e00p-1, 0x1.8a46ad3901cb9p-5}, + {0x1.c71b8e399e821p-1, 0x1.a311903b6b87p-5}, + {0x1.c3f80faf19077p-1, 0x1.bbb086f216911p-5}, + {0x1.c0df92dc2b0ecp-1, 0x1.d4242bdda648ep-5}, + {0x1.bdd1de3cbb542p-1, 0x1.ec6d167c2af1p-5}, + {0x1.baceb9e1007a3p-1, 0x1.0245ed8221426p-4}, + {0x1.b7d5ef543e55ep-1, 0x1.0e40856c74f64p-4}, + {0x1.b4e749977d953p-1, 0x1.1a269a31120fep-4}, + {0x1.b20295155478ep-1, 0x1.25f8718fc076cp-4}, + {0x1.af279f8e82be2p-1, 0x1.31b64ffc95bfp-4}, + {0x1.ac5638197fdf3p-1, 0x1.3d60787ca5063p-4}, + {0x1.a98e2f102e087p-1, 0x1.48f72ccd187fdp-4}, + {0x1.a6cf5606d05c1p-1, 0x1.547aad6602f1cp-4}, + {0x1.a4197fc04d746p-1, 0x1.5feb3989d3acbp-4}, + {0x1.a16c80293dc01p-1, 0x1.6b490f3978c79p-4}, + {0x1.9ec82c4dc5bc9p-1, 0x1.76946b3f5e703p-4}, + {0x1.9c2c5a491f534p-1, 0x1.81cd895717c83p-4}, + {0x1.9998e1480b618p-1, 0x1.8cf4a4055c30ep-4}, + {0x1.970d9977c6c2dp-1, 0x1.9809f4c48c0ebp-4}, + {0x1.948a5c023d212p-1, 0x1.a30db3f9899efp-4}, + {0x1.920f0303d6809p-1, 0x1.ae001905458fcp-4}, + {0x1.8f9b698a98b45p-1, 0x1.b8e15a2e3a2cdp-4}, + {0x1.8d2f6b81726f6p-1, 0x1.c3b1ace2b0996p-4}, + {0x1.8acae5bb55badp-1, 0x1.ce71456edfa62p-4}, + {0x1.886db5d9275b8p-1, 0x1.d9205759882c4p-4}, + {0x1.8617ba567c13cp-1, 0x1.e3bf1513af0dfp-4}, + {0x1.83c8d27487800p-1, 0x1.ee4db0412c414p-4}, + {0x1.8180de3c5dbe7p-1, 0x1.f8cc5998de3a5p-4}, + {0x1.7f3fbe71cdb71p-1, 0x1.019da085eaeb1p-3}, + {0x1.7d055498071c1p-1, 0x1.06cd4acdb4e3dp-3}, + {0x1.7ad182e54f65ap-1, 0x1.0bf542bef813fp-3}, + {0x1.78a42c3c90125p-1, 0x1.11159f14da262p-3}, + {0x1.767d342f76944p-1, 0x1.162e761c10d1cp-3}, + {0x1.745c7ef26b00ap-1, 0x1.1b3fddc60d43ep-3}, + {0x1.7241f15769d0fp-1, 0x1.2049ebac86aa6p-3}, + {0x1.702d70d396e41p-1, 0x1.254cb4fb7836ap-3}, + {0x1.6e1ee3700cd11p-1, 0x1.2a484e8d0d252p-3}, + {0x1.6c162fc9cbe02p-1, 0x1.2f3ccce1c860bp-3}, +}; diff --git a/pl/math/vn_log10_2u5.c b/pl/math/vn_log10_2u5.c new file mode 100644 index 0000000..b94499b --- /dev/null +++ b/pl/math/vn_log10_2u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_log10. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_log10, _ZGVnN2v_log10) +#include "v_log10_2u5.c" +#endif -- cgit v1.2.3 From 6ad849d459f5fd15053053df1625f592e911a4ed Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Thu, 21 Apr 2022 11:49:17 +0100 Subject: pl/math: Add scalar erfc erfc has dependencies from exp, which have been copied across from the main math directory. The maximum measured error is 4.37 ULPs for x = 0x1.3a64c308e7789p+0 (x~=1.228). 
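One detail worth noting from the patch: interval selection needs no log call, because interval n has lower bound 2^(n/4) - 1, so the exponent field of (|x|+1)^4 directly yields the interval index. A standalone sketch of the same trick, using frexp in place of the patch's direct bit extraction (illustrative only, not the patch's code):

#include <math.h>
#include <stdio.h>

static int
itv_idx (double x)
{
  double z = fabs (x) + 1.0;
  z = z * z;
  z = z * z; /* z = (|x|+1)^4, so floor(log2 z) = floor(4*log2(|x|+1)).  */
  int e;
  frexp (z, &e); /* z = m * 2^e, m in [0.5,1), so floor(log2 z) = e - 1.  */
  return e - 1;
}

int
main (void)
{
  for (double x = 0.25; x < 32.0; x *= 2.0)
    printf ("x = %g -> interval %d\n", x, itv_idx (x));
  return 0;
}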
--- pl/math/erfc_4u5.c | 163 +++++ pl/math/erfc_data.c | 145 ++++ pl/math/exp.c | 163 +++++ pl/math/exp_data.c | 1120 ++++++++++++++++++++++++++++++ pl/math/math_config.h | 29 + pl/math/test/mathbench_funcs.h | 1 + pl/math/test/runulp.sh | 9 + pl/math/test/testcases/directed/erfc.tst | 23 + pl/math/test/ulp_funcs.h | 1 + pl/math/tools/erfc.sollya | 23 + 10 files changed, 1677 insertions(+) create mode 100644 pl/math/erfc_4u5.c create mode 100644 pl/math/erfc_data.c create mode 100644 pl/math/exp.c create mode 100644 pl/math/exp_data.c create mode 100644 pl/math/test/testcases/directed/erfc.tst create mode 100644 pl/math/tools/erfc.sollya diff --git a/pl/math/erfc_4u5.c b/pl/math/erfc_4u5.c new file mode 100644 index 0000000..810da82 --- /dev/null +++ b/pl/math/erfc_4u5.c @@ -0,0 +1,163 @@ +/* + * Double-precision erfc(x) function. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include +#include +#include +#include "math_config.h" + +#define AbsMask (0x7fffffffffffffff) + +#define xint __erfc_data.interval_bounds +#define PX __erfc_data.poly + +/* Accurate exponential from optimized routines. */ +double +__exp_dd (double x, double xtail); + +/* Evaluate order-12 polynomials using + pairwise summation and Horner scheme + in double precision. */ +static inline double +eval_poly_horner (double z, int i) +{ + double r1, r2, r3, r4, r5, r6, z2; + r1 = fma (z, PX[i][1], PX[i][0]); + r2 = fma (z, PX[i][3], PX[i][2]); + r3 = fma (z, PX[i][5], PX[i][4]); + r4 = fma (z, PX[i][7], PX[i][6]); + r5 = fma (z, PX[i][9], PX[i][8]); + r6 = fma (z, PX[i][11], PX[i][10]); + z2 = z * z; + double r = PX[i][12]; + r = fma (z2, r, r6); + r = fma (z2, r, r5); + r = fma (z2, r, r4); + r = fma (z2, r, r3); + r = fma (z2, r, r2); + r = fma (z2, r, r1); + return r; +} + +/* Accurate evaluation of exp(x^2) + using compensated product (x^2 ~ x*x + e2) + and the __exp_dd(y,d) routine, that is the + computation of exp(y+d) with a small correction d< 6.0. */ +static inline double +approx_erfc_hi (double x, int i) +{ + double a = fabs (x); + double z = a - xint[i]; + double p = eval_poly_horner (z, i); + double e_mx2 = eval_accurate_gaussian (a); + return p * e_mx2; +} + +static inline int +get_itv_idx (double x) +{ + /* Interval bounds are a logarithmic scale, i.e. interval n has + lower bound 2^(n/4) - 1. Use the exponent of (|x|+1)^4 to obtain + the interval index. */ + double a = asdouble (asuint64 (x) & AbsMask); + double z = a + 1.0; + z = z * z; + z = z * z; + return (asuint64 (z) >> 52) - 1023; +} + +/* Approximation of erfc for |x| < 6.0. */ +static inline double +approx_erfc_lo (double x, uint32_t sign, int i) +{ + double a = fabs (x); + double z = a - xint[i]; + double p = eval_poly_horner (z, i); + double e_mx2 = eval_accurate_gaussian (a); + if (sign) + return fma (-p, e_mx2, 2.0); + else + return p * e_mx2; +} + +/* Top 12 bits of a double (sign and exponent bits). */ +static inline uint32_t +abstop12 (double x) +{ + return (asuint64 (x) >> 52) & 0x7ff; +} + +/* Top 32 bits of a double. */ +static inline uint32_t +top32 (double x) +{ + return asuint64 (x) >> 32; +} + +/* Fast erfc implementation. + The approximation uses polynomial approximation of + exp(x^2) * erfc(x) with fixed orders on 20 intervals. + Maximum measured error is 4.37 ULPs in [1.2281, 1.2282]. + erfc(0x1.3a64c308e7789p+0) got 0x1.519b08721640cp-4 + want 0x1.519b087216408p-4 + -0.367612 ulp err 3.86761. */ +double +erfc (double x) +{ + /* Get top words. 
*/ + uint32_t ix = top32 (x); /* We need to compare at most 32 bits. */ + uint32_t ia = ix & 0x7fffffff; + uint32_t sign = ix >> 31; + + /* Handle special cases and small values with a single comparison: + abstop12(x)-abstop12(small) >= abstop12(INFINITY)-abstop12(small) + Special cases erfc(nan)=nan, erfc(+inf)=0 and erfc(-inf)=2 + Errno EDOM does not have to be set in case of erfc(nan). + Only ERANGE may be set in case of underflow. + Small values (|x| accurate up to 0.5 ULP (top12(0x1p-50) = 0x3c7) + |x|<0x1.0p-50 => accurate up to 1.0 ULP (top12(0x1p-50) = 0x3cd). */ + if (unlikely (abstop12 (x) - 0x3cd >= (abstop12 (INFINITY) & 0x7ff) - 0x3cd)) + { + if (abstop12 (x) >= 0x7ff) + return (double) (sign << 1) + 1.0 / x; /* special cases. */ + else + return 1.0 - x; /* small case. */ + } + else if (ia < 0x40180000) + { /* |x| < 6.0. */ + return approx_erfc_lo (x, sign, get_itv_idx (x)); + } + else if (sign) + { /* x <= -6.0. */ + return 2.0; + } + else if (ia < 0x403c0000) + { /* 6.0 <= x < 28. */ + return approx_erfc_hi (x, get_itv_idx (x)); + } + else + { /* x > 28. */ + return __math_uflow (0); + } +} diff --git a/pl/math/erfc_data.c b/pl/math/erfc_data.c new file mode 100644 index 0000000..02b7db1 --- /dev/null +++ b/pl/math/erfc_data.c @@ -0,0 +1,145 @@ +/* + * Data used in double-precision erfc(x) function. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Polynomial coefficients for approximating erfc(x)*exp(x*x) in double + precision. Generated using the Remez algorithm on each interval separately + (see erfc.sollya for more detail). */ +const struct erfc_data __erfc_data = { + +/* Bounds for 20 intervals spanning [0x1.0p-50., 31.]. Interval bounds are a + logarithmic scale, i.e. interval n has lower bound 2^(n/4) - 1, with the + exception of the first interval. */ +.interval_bounds = { + 0x1.0p-50, /* Tiny boundary. */ + 0x1.837f05c490126p-3, /* 0.189. */ + 0x1.a827997709f7ap-2, /* 0.414. */ + 0x1.5d13f326fe9c8p-1, /* 0.682. */ + 0x1.0p0, /* 1.000. */ + 0x1.60dfc14636e2ap0, /* 1.378. */ + 0x1.d413cccfe779ap0, /* 1.828. */ + 0x1.2e89f995ad3adp1, /* 2.364. */ + 0x1.8p1, /* 3.000. */ + 0x1.e0dfc14636e2ap1, /* 3.757. */ + 0x1.2a09e667f3bcdp2, /* 4.657. */ + 0x1.6e89f995ad3adp2, /* 5.727. */ + 0x1.cp2, /* 7.000. */ + 0x1.106fe0a31b715p3, /* 8.514. */ + 0x1.4a09e667f3bcdp3, /* 10.31. */ + 0x1.8e89f995ad3adp3, /* 12.45. */ + 0x1.ep3, /* 15.00. */ + 0x1.206fe0a31b715p4, /* 18.03. */ + 0x1.5a09e667f3bcdp4, /* 21.63. */ + 0x1.9e89f995ad3adp4, /* 25.91. */ + 0x1.fp4 /* 31.00. */ +}, + +/* Coefficients for each order 12 polynomial on each of the 20 intervals. 
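+   Row i is consumed by eval_poly_horner in erfc_4u5.c as a degree-12
+   polynomial in z = |x| - interval_bounds[i], approximating
+   erfc(x)*exp(x*x) on that interval.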
*/ +.poly = { + {0x1.ffffffffffff6p-1, -0x1.20dd750429b66p0, 0x1.fffffffffffdcp-1, + -0x1.812746b03713ap-1, 0x1.ffffffffbe94cp-2, -0x1.341f6bb6ec9a6p-2, + 0x1.555553a70ec2ep-3, -0x1.6023b4617a388p-4, 0x1.5550f0e40bfbap-5, + -0x1.38c290c0c8de8p-6, 0x1.0e84002c6274ep-7, -0x1.a599eb0ac5d04p-9, + 0x1.c9bfafa73899cp-11}, + {0x1.a2b43dbd503c8p-1, -0x1.a3495b7c9e6a4p-1, 0x1.535f3fb8cb92ap-1, + -0x1.d96ee9c714f44p-2, 0x1.26956676d2c64p-2, -0x1.4e2820da90c08p-3, + 0x1.5ea0cffac775ap-4, -0x1.57fb82ca373e8p-5, 0x1.3e0e8f48ba0f8p-6, + -0x1.16a695af1bbd4p-7, 0x1.cc836241a87d4p-9, -0x1.531de41264fdap-10, + 0x1.526a8a14e9bfcp-12}, + {0x1.532e75821ed48p-1, -0x1.28be350460782p-1, 0x1.b08873adbf108p-2, + -0x1.14377569249e2p-2, 0x1.3e1ece8cd10dap-3, -0x1.5087e2e6dc2e8p-4, + 0x1.4b3adb3bb335ap-5, -0x1.32342d711a4f4p-6, 0x1.0bc4f6ce2b656p-7, + -0x1.bcdaa331f2144p-9, 0x1.5c21c9e0ca954p-10, -0x1.dfdc9b3b5c402p-12, + 0x1.b451af7dd52fep-14}, + {0x1.10f9745a4f44ap-1, -0x1.9b03213e6963ap-2, 0x1.09b942bc8de66p-2, + -0x1.32755394481e4p-3, 0x1.42819b18af0e4p-4, -0x1.3a6d643aaa572p-5, + 0x1.1f17897603eaep-6, -0x1.eefb8d3f89d42p-8, 0x1.95559544f2fbp-9, + -0x1.3c2a67c33338p-10, 0x1.cffa784efe6cp-12, -0x1.282646774689cp-13, + 0x1.e654e67532b44p-16}, + {0x1.b5d8780f956b2p-2, -0x1.17c4e3f17c04dp-2, 0x1.3c27283c328dbp-3, + -0x1.44837f88ea4bdp-4, 0x1.33cad0e887482p-5, -0x1.10fcf0bc8963cp-6, + 0x1.c8cb68153ec42p-8, -0x1.6aef9a9842c54p-9, 0x1.1334345d6467cp-10, + -0x1.8ebe8763a2a8cp-12, 0x1.0f457219dec0dp-13, -0x1.3d2501dcd2a0fp-15, + 0x1.d213a128a75c9p-18}, + {0x1.5ee444130b7dbp-2, -0x1.78396ab208478p-3, 0x1.6e617ec5c0cc3p-4, + -0x1.49e60f63656b5p-5, 0x1.16064fddbbcb9p-6, -0x1.ba80af6a31018p-8, + 0x1.4ec374269d4ecp-9, -0x1.e40be960703a4p-11, 0x1.4fb029f35a144p-12, + -0x1.be45fd71a60eap-14, 0x1.161235cd2a3e7p-15, -0x1.264890eb1b5ebp-17, + 0x1.7f90154bde15dp-20}, + {0x1.19a22c064d4eap-2, -0x1.f645498cae217p-4, 0x1.a0565950e3f08p-5, + -0x1.446605c21c178p-6, 0x1.df1231d75622fp-8, -0x1.515167553de25p-9, + 0x1.c72c1b4a2a57fp-11, -0x1.276ae9394ecf1p-12, 0x1.71d2696d6c8c3p-14, + -0x1.bd4152984ce1dp-16, 0x1.f5afd2b450df7p-18, -0x1.dafdaddc7f943p-20, + 0x1.1020f4741f79ep-22}, + {0x1.c57f0542a7637p-3, -0x1.4e5535c17afc8p-4, 0x1.d312725242824p-6, + -0x1.3727cbc12a4bbp-7, 0x1.8d6730fc45b6bp-9, -0x1.e8855055c9b53p-11, + 0x1.21f73b70cc792p-12, -0x1.4d4fe06f13831p-14, 0x1.73867a82f7484p-16, + -0x1.8fab204d1d75ep-18, 0x1.91d9ba10367f4p-20, -0x1.5077ce4b334ddp-22, + 0x1.501716d098f14p-25}, + {0x1.6e9827d229d2dp-3, -0x1.bd6ae4d14b135p-5, 0x1.043fe1a989f11p-6, + -0x1.259061b98cf96p-8, 0x1.409cc2b1c4fc2p-10, -0x1.53dec152f6abfp-12, + 0x1.5e72cb4cc919fp-14, -0x1.6018b68100642p-16, 0x1.58d859380fb24p-18, + -0x1.471723286dad5p-20, 0x1.21c1a0f7a6593p-22, -0x1.a872678d91154p-25, + 0x1.6eb74e2e99662p-28}, + {0x1.29a8a4e95063ep-3, -0x1.29a8a316d3318p-5, 0x1.21876b3fe4f84p-7, + -0x1.1276f2d8ee36cp-9, 0x1.fbff52181a454p-12, -0x1.cb9ce9bde195ep-14, + 0x1.9710786fa90c5p-16, -0x1.6145ad5b471dcp-18, 0x1.2c52fac57009cp-20, + -0x1.f02a8711f07cfp-23, 0x1.7eb574960398cp-25, -0x1.e58ce325343aap-28, + 0x1.68510d1c32842p-31}, + {0x1.e583024e2bc8p-4, -0x1.8fb458acb5b0fp-6, 0x1.42b9dffac2531p-8, + -0x1.ff9fe9a553dddp-11, 0x1.8e7e86883ba0bp-13, -0x1.313af0bb12375p-15, + 0x1.cc29ccb17372ep-18, -0x1.55895fbb1ae42p-20, 0x1.f2bd2d6c7fd07p-23, + -0x1.62ec031844613p-25, 0x1.d7d69ce7c1847p-28, -0x1.0106b95e4db03p-30, + 0x1.45aabbe505f6ap-34}, + {0x1.8d9cbafa30408p-4, -0x1.0dd14614ed20fp-6, 0x1.6943976ea9dcap-9, + -0x1.dd6f05f4d7ce8p-12, 0x1.37891334aa621p-14, 
-0x1.91a8207766e1ep-17, + 0x1.ffcb0c613d75cp-20, -0x1.425116a6c88dfp-22, 0x1.90cb7c902d428p-25, + -0x1.e70fc740c3b6dp-28, 0x1.14a09ae5851ep-30, -0x1.00f9e03eae993p-33, + 0x1.14989aac741c2p-37}, + {0x1.46dc6bf900f68p-4, -0x1.6e4b45246f8dp-7, 0x1.96a3de47cfdb5p-10, + -0x1.bf5070eb6823bp-13, 0x1.e7af6e4aa8ef8p-16, -0x1.078bf26142831p-18, + 0x1.1a6e547aa40bep-21, -0x1.2c1c68f62f614p-24, 0x1.3bb8b473dd9e7p-27, + -0x1.45576cacb45a1p-30, 0x1.39ab71899b44ep-33, -0x1.ee307d46e2866p-37, + 0x1.c21ba1b404f5ap-41}, + {0x1.0d9a17e032288p-4, -0x1.f3e942ff4e097p-8, 0x1.cc77f09db5af8p-11, + -0x1.a56e8bffaab5cp-14, 0x1.7f49e36974e03p-17, -0x1.5a73fc0025d2fp-20, + 0x1.3742ae06a8be6p-23, -0x1.15ecf5317789bp-26, 0x1.ec74dd2b109fp-30, + -0x1.ac28325f88dc1p-33, 0x1.5ca9e8d7841b2p-36, -0x1.cfef04667185fp-40, + 0x1.6487c50052867p-44}, + {0x1.be0c73cc19eddp-5, -0x1.56ce6f6c0cb33p-8, 0x1.0645980ec8568p-11, + -0x1.8f86f88695a8cp-15, 0x1.2ef80cb1dca7cp-18, -0x1.c97ff7c599a6dp-22, + 0x1.57f0ac907d436p-25, -0x1.016be8d812c69p-28, 0x1.7ef6d33c73b75p-32, + -0x1.17f9784eda0d4p-35, 0x1.7fd8662b486f1p-39, -0x1.ae21758156d89p-43, + 0x1.165732f1ae138p-47}, + {0x1.71eafbd9f5877p-5, -0x1.d83714d904525p-9, 0x1.2c74dbaccea28p-12, + -0x1.7d27f3cdea565p-16, 0x1.e20b13581fcf8p-20, -0x1.2fe336f089679p-23, + 0x1.7dfce36129db3p-27, -0x1.dea026ee03f14p-31, 0x1.2a6019f7c64b1p-34, + -0x1.6e0eeb9f98eeap-38, 0x1.a58b4ed07d741p-42, -0x1.8d12c77071e4cp-46, + 0x1.b0241c6d5b761p-51}, + {0x1.33714a024097ep-5, -0x1.467f441a50cbdp-9, 0x1.59fa2994d0e65p-13, + -0x1.6dd369d9306cap-17, 0x1.81fb2b2af9413p-21, -0x1.96604d3c1bb6ep-25, + 0x1.aaef2da14243p-29, -0x1.bf7f1b935d3ebp-33, 0x1.d3261ebcd2061p-37, + -0x1.e04c803bbd875p-41, 0x1.cff98a43bacdep-45, -0x1.6ef39a63cf675p-49, + 0x1.4f8abb4398a0dp-54}, + {0x1.fff97acd75487p-6, -0x1.c502e8e46ec0cp-10, 0x1.903b0650672eap-14, + -0x1.6110aa5fb096fp-18, 0x1.36fd4c3e4040cp-22, -0x1.118489fe28728p-26, + 0x1.e06601208ac47p-31, -0x1.a52b90c21650ap-35, 0x1.6ffc42c05429bp-39, + -0x1.3ce3322a6972ep-43, 0x1.009d8ef37ff8cp-47, -0x1.5498d2cc51c99p-52, + 0x1.058cd4ea9bf04p-57}, + {0x1.aaf347fc8c45bp-6, -0x1.3b2fd709cf97dp-10, 0x1.d0ddfb8593f4p-15, + -0x1.5673f4aa86542p-19, 0x1.f8048954325f6p-24, -0x1.72839959ab3e9p-28, + 0x1.101597113be2ap-32, -0x1.8f1cf0ff4adeep-37, 0x1.23dca407fd66p-41, + -0x1.a4f387e57a6a5p-46, 0x1.1dafd753f65e9p-50, -0x1.3e15343c973d6p-55, + 0x1.9a2af47d77e44p-61}, + {0x1.64839d636f92bp-6, -0x1.b7adf7536232dp-11, 0x1.0eec0b6357148p-15, + -0x1.4da09b7f2c52bp-20, 0x1.9a8b146de838ep-25, -0x1.f8d1f145e7b6fp-30, + 0x1.3624435b3ba11p-34, -0x1.7cba19b4af977p-39, 0x1.d2282481ba91ep-44, + -0x1.198c1e91f9564p-48, 0x1.4046224f8ccp-53, -0x1.2b1dc676c096fp-58, + 0x1.43d3358c64dafp-64} +} +}; diff --git a/pl/math/exp.c b/pl/math/exp.c new file mode 100644 index 0000000..f95c46f --- /dev/null +++ b/pl/math/exp.c @@ -0,0 +1,163 @@ +/* + * Double-precision e^x function. + * + * Copyright (c) 2018-2019, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include +#include +#include +#include "math_config.h" + +#define N (1 << EXP_TABLE_BITS) +#define InvLn2N __exp_data.invln2N +#define NegLn2hiN __exp_data.negln2hiN +#define NegLn2loN __exp_data.negln2loN +#define Shift __exp_data.shift +#define T __exp_data.tab +#define C2 __exp_data.poly[5 - EXP_POLY_ORDER] +#define C3 __exp_data.poly[6 - EXP_POLY_ORDER] +#define C4 __exp_data.poly[7 - EXP_POLY_ORDER] +#define C5 __exp_data.poly[8 - EXP_POLY_ORDER] +#define C6 __exp_data.poly[9 - EXP_POLY_ORDER] + +/* Handle cases that may overflow or underflow when computing the result that + is scale*(1+TMP) without intermediate rounding. The bit representation of + scale is in SBITS, however it has a computed exponent that may have + overflown into the sign bit so that needs to be adjusted before using it as + a double. (int32_t)KI is the k used in the argument reduction and exponent + adjustment of scale, positive k here means the result may overflow and + negative k means the result may underflow. */ +static inline double +specialcase (double_t tmp, uint64_t sbits, uint64_t ki) +{ + double_t scale, y; + + if ((ki & 0x80000000) == 0) + { + /* k > 0, the exponent of scale might have overflowed by <= 460. */ + sbits -= 1009ull << 52; + scale = asdouble (sbits); + y = 0x1p1009 * (scale + scale * tmp); + return check_oflow (eval_as_double (y)); + } + /* k < 0, need special care in the subnormal range. */ + sbits += 1022ull << 52; + scale = asdouble (sbits); + y = scale + scale * tmp; + if (y < 1.0) + { + /* Round y to the right precision before scaling it into the subnormal + range to avoid double rounding that can cause 0.5+E/2 ulp error where + E is the worst-case ulp error outside the subnormal range. So this + is only useful if the goal is better than 1 ulp worst-case error. */ + double_t hi, lo; + lo = scale - y + scale * tmp; + hi = 1.0 + y; + lo = 1.0 - hi + y + lo; + y = eval_as_double (hi + lo) - 1.0; + /* Avoid -0.0 with downward rounding. */ + if (WANT_ROUNDING && y == 0.0) + y = 0.0; + /* The underflow exception needs to be signaled explicitly. */ + force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022); + } + y = 0x1p-1022 * y; + return check_uflow (eval_as_double (y)); +} + +/* Top 12 bits of a double (sign and exponent bits). */ +static inline uint32_t +top12 (double x) +{ + return asuint64 (x) >> 52; +} + +/* Computes exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. + If hastail is 0 then xtail is assumed to be 0 too. */ +static inline double +exp_inline (double x, double xtail, int hastail) +{ + uint32_t abstop; + uint64_t ki, idx, top, sbits; + /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ + double_t kd, z, r, r2, scale, tail, tmp; + + abstop = top12 (x) & 0x7ff; + if (unlikely (abstop - top12 (0x1p-54) >= top12 (512.0) - top12 (0x1p-54))) + { + if (abstop - top12 (0x1p-54) >= 0x80000000) + /* Avoid spurious underflow for tiny x. */ + /* Note: 0 is common input. */ + return WANT_ROUNDING ? 1.0 + x : 1.0; + if (abstop >= top12 (1024.0)) + { + if (asuint64 (x) == asuint64 (-INFINITY)) + return 0.0; + if (abstop >= top12 (INFINITY)) + return 1.0 + x; + if (asuint64 (x) >> 63) + return __math_uflow (0); + else + return __math_oflow (0); + } + /* Large x is special cased below. */ + abstop = 0; + } + + /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ + /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. 
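+     For example, with N = 128 and x = 10.0: k = round(x*N/ln2) = 1847
+     and r = x - k*ln2/N ~= -0.0019, well inside [-ln2/256, ln2/256].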
*/ + z = InvLn2N * x; +#if TOINT_INTRINSICS + kd = roundtoint (z); + ki = converttoint (z); +#elif EXP_USE_TOINT_NARROW + /* z - kd is in [-0.5-2^-16, 0.5] in all rounding modes. */ + kd = eval_as_double (z + Shift); + ki = asuint64 (kd) >> 16; + kd = (double_t) (int32_t) ki; +#else + /* z - kd is in [-1, 1] in non-nearest rounding modes. */ + kd = eval_as_double (z + Shift); + ki = asuint64 (kd); + kd -= Shift; +#endif + r = x + kd * NegLn2hiN + kd * NegLn2loN; + /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ + if (hastail) + r += xtail; + /* 2^(k/N) ~= scale * (1 + tail). */ + idx = 2 * (ki % N); + top = ki << (52 - EXP_TABLE_BITS); + tail = asdouble (T[idx]); + /* This is only a valid scale when -1023*N < k < 1024*N. */ + sbits = T[idx + 1] + top; + /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (tail + exp(r) - 1). */ + /* Evaluation is optimized assuming superscalar pipelined execution. */ + r2 = r * r; + /* Without fma the worst case error is 0.25/N ulp larger. */ + /* Worst case error is less than 0.5+1.11/N+(abs poly error * 2^53) ulp. */ +#if EXP_POLY_ORDER == 4 + tmp = tail + r + r2 * C2 + r * r2 * (C3 + r * C4); +#elif EXP_POLY_ORDER == 5 + tmp = tail + r + r2 * (C2 + r * C3) + r2 * r2 * (C4 + r * C5); +#elif EXP_POLY_ORDER == 6 + tmp = tail + r + r2 * (0.5 + r * C3) + r2 * r2 * (C4 + r * C5 + r2 * C6); +#endif + if (unlikely (abstop == 0)) + return specialcase (tmp, sbits, ki); + scale = asdouble (sbits); + /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there + is no spurious underflow here even without fma. */ + return eval_as_double (scale + scale * tmp); +} + +/* May be useful for implementing pow where more than double + precision input is needed. */ +double +__exp_dd (double x, double xtail) +{ + return exp_inline (x, xtail, 1); +} + diff --git a/pl/math/exp_data.c b/pl/math/exp_data.c new file mode 100644 index 0000000..714c845 --- /dev/null +++ b/pl/math/exp_data.c @@ -0,0 +1,1120 @@ +/* + * Shared data between exp, exp2 and pow. + * + * Copyright (c) 2018, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define N (1 << EXP_TABLE_BITS) + +const struct exp_data __exp_data = { +// N/ln2 +.invln2N = 0x1.71547652b82fep0 * N, +// -ln2/N +#if N == 64 +.negln2hiN = -0x1.62e42fefa0000p-7, +.negln2loN = -0x1.cf79abc9e3b3ap-46, +#elif N == 128 +.negln2hiN = -0x1.62e42fefa0000p-8, +.negln2loN = -0x1.cf79abc9e3b3ap-47, +#elif N == 256 +.negln2hiN = -0x1.62e42fefc0000p-9, +.negln2loN = 0x1.c610ca86c3899p-45, +#elif N == 512 +.negln2hiN = -0x1.62e42fef80000p-10, +.negln2loN = -0x1.1cf79abc9e3b4p-45, +#endif +// Used for rounding when !TOINT_INTRINSICS +#if EXP_USE_TOINT_NARROW +.shift = 0x1800000000.8p0, +#else +.shift = 0x1.8p52, +#endif +// exp polynomial coefficients. 
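+// Exactly one block below is compiled in, selected by N and
+// EXP_POLY_ORDER; exp.c reaches it through the C2..C6 macros
+// (poly[5 - EXP_POLY_ORDER] onwards), which is why the order-4
+// variants carry a leading unused 1.0 entry.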
+.poly = { +#if N == 64 && EXP_POLY_ORDER == 5 && !EXP_POLY_WIDE +// abs error: 1.5543*2^-60 +// ulp error: 0.529 (0.533 without fma) +// if |x| < ln2/128+eps +// abs error if |x| < ln2/64: 1.7157*2^-50 +0x1.fffffffffdbcdp-2, +0x1.555555555444cp-3, +0x1.555573c6a9f7dp-5, +0x1.1111266d28935p-7, +#elif N == 64 && EXP_POLY_ORDER == 6 && EXP_POLY_WIDE +// abs error: 1.6735*2^-64 +// ulp error: 0.518 (0.522 without fma) +// if |x| < ln2/64 +0x1.5555555548f9ap-3, +0x1.555555554bf5dp-5, +0x1.11115b75f0f4dp-7, +0x1.6c171a6b6303ep-10, +#elif N == 128 && EXP_POLY_ORDER == 5 && !EXP_POLY_WIDE +// abs error: 1.555*2^-66 +// ulp error: 0.509 (0.511 without fma) +// if |x| < ln2/256+eps +// abs error if |x| < ln2/256+0x1p-15: 1.09*2^-65 +// abs error if |x| < ln2/128: 1.7145*2^-56 +0x1.ffffffffffdbdp-2, +0x1.555555555543cp-3, +0x1.55555cf172b91p-5, +0x1.1111167a4d017p-7, +#elif N == 128 && EXP_POLY_ORDER == 5 && EXP_POLY_WIDE +// abs error: 1.5542*2^-60 +// ulp error: 0.521 (0.523 without fma) +// if |x| < ln2/128 +0x1.fffffffffdbcep-2, +0x1.55555555543c2p-3, +0x1.555573c64f2e3p-5, +0x1.111126b4eff73p-7, +#elif N == 128 && EXP_POLY_ORDER == 6 && EXP_POLY_WIDE +// abs error: 1.6861*2^-71 +// ulp error: 0.509 (0.511 without fma) +// if |x| < ln2/128 +0x1.55555555548fdp-3, +0x1.555555555658fp-5, +0x1.111123a859bb6p-7, +0x1.6c16ba6920cabp-10, +#elif N == 256 && EXP_POLY_ORDER == 4 && !EXP_POLY_WIDE +// abs error: 1.43*2^-58 +// ulp error: 0.549 (0.550 without fma) +// if |x| < ln2/512 +0x1p0, // unused +0x1.fffffffffffd4p-2, +0x1.5555571d6ef9p-3, +0x1.5555576a5adcep-5, +#elif N == 256 && EXP_POLY_ORDER == 5 && EXP_POLY_WIDE +// abs error: 1.5547*2^-66 +// ulp error: 0.505 (0.506 without fma) +// if |x| < ln2/256 +0x1.ffffffffffdbdp-2, +0x1.555555555543cp-3, +0x1.55555cf16e1edp-5, +0x1.1111167a4b553p-7, +#elif N == 512 && EXP_POLY_ORDER == 4 && !EXP_POLY_WIDE +// abs error: 1.4300*2^-63 +// ulp error: 0.504 +// if |x| < ln2/1024 +// abs error if |x| < ln2/512: 1.0689*2^-55 +0x1p0, // unused +0x1.ffffffffffffdp-2, +0x1.555555c75bb6p-3, +0x1.555555dec04a8p-5, +#endif +}, +.exp2_shift = 0x1.8p52 / N, +// exp2 polynomial coefficients. 
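+// Since 2^r = exp(r*ln2), the leading exp2 coefficient in every block below
+// is ln2 (0x1.62e42fefa39efp-1 up to rounding) and the next one is close to
+// (ln2)^2/2!.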
+.exp2_poly = { +#if N == 64 && EXP2_POLY_ORDER == 6 && EXP2_POLY_WIDE +// abs error: 1.3054*2^-63 +// ulp error: 0.515 +// if |x| < 1/64 +0x1.62e42fefa39efp-1, +0x1.ebfbdff82c58fp-3, +0x1.c6b08d7045cf1p-5, +0x1.3b2ab6fb8fd0ep-7, +0x1.5d884afec48d7p-10, +0x1.43097dc684ae1p-13, +#elif N == 128 && EXP2_POLY_ORDER == 5 && !EXP2_POLY_WIDE +// abs error: 1.2195*2^-65 +// ulp error: 0.507 (0.511 without fma) +// if |x| < 1/256 +// abs error if |x| < 1/128: 1.9941*2^-56 +0x1.62e42fefa39efp-1, +0x1.ebfbdff82c424p-3, +0x1.c6b08d70cf4b5p-5, +0x1.3b2abd24650ccp-7, +0x1.5d7e09b4e3a84p-10, +#elif N == 256 && EXP2_POLY_ORDER == 5 && EXP2_POLY_WIDE +// abs error: 1.2195*2^-65 +// ulp error: 0.504 (0.508 without fma) +// if |x| < 1/256 +0x1.62e42fefa39efp-1, +0x1.ebfbdff82c424p-3, +0x1.c6b08d70cf4b5p-5, +0x1.3b2abd24650ccp-7, +0x1.5d7e09b4e3a84p-10, +#elif N == 512 && EXP2_POLY_ORDER == 4 && !EXP2_POLY_WIDE +// abs error: 1.4411*2^-64 +// ulp error: 0.5024 (0.5063 without fma) +// if |x| < 1/1024 +// abs error if |x| < 1/512: 1.9430*2^-56 +0x1.62e42fefa39ecp-1, +0x1.ebfbdff82c58bp-3, +0x1.c6b08e46de41fp-5, +0x1.3b2ab786ee1dap-7, +#endif +}, +// 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N) +// tab[2*k] = asuint64(T[k]) +// tab[2*k+1] = asuint64(H[k]) - (k << 52)/N +.tab = { +#if N == 64 +0x0, 0x3ff0000000000000, +0xbc7160139cd8dc5d, 0x3fefec9a3e778061, +0x3c8cd2523567f613, 0x3fefd9b0d3158574, +0x3c60f74e61e6c861, 0x3fefc74518759bc8, +0x3c979aa65d837b6d, 0x3fefb5586cf9890f, +0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2, +0xbc9556522a2fbd0e, 0x3fef9301d0125b51, +0xbc91c923b9d5f416, 0x3fef829aaea92de0, +0xbc801b15eaa59348, 0x3fef72b83c7d517b, +0x3c8b898c3f1353bf, 0x3fef635beb6fcb75, +0x3c9aecf73e3a2f60, 0x3fef54873168b9aa, +0x3c8a6f4144a6c38d, 0x3fef463b88628cd6, +0x3c968efde3a8a894, 0x3fef387a6e756238, +0x3c80472b981fe7f2, 0x3fef2b4565e27cdd, +0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1, +0x3c8b3782720c0ab4, 0x3fef1285a6e4030b, +0x3c834d754db0abb6, 0x3fef06fe0a31b715, +0x3c8fdd395dd3f84a, 0x3feefc08b26416ff, +0xbc924aedcc4b5068, 0x3feef1a7373aa9cb, +0xbc71d1e83e9436d2, 0x3feee7db34e59ff7, +0x3c859f48a72a4c6d, 0x3feedea64c123422, +0xbc58a78f4817895b, 0x3feed60a21f72e2a, +0x3c4363ed60c2ac11, 0x3feece086061892d, +0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0, +0x3c7690cebb7aafb0, 0x3feebfdad5362a27, +0xbc8f94340071a38e, 0x3feeb9b2769d2ca7, +0xbc78dec6bd0f385f, 0x3feeb42b569d4f82, +0x3c93350518fdd78e, 0x3feeaf4736b527da, +0x3c9063e1e21c5409, 0x3feeab07dd485429, +0x3c9432e62b64c035, 0x3feea76f15ad2148, +0xbc8c33c53bef4da8, 0x3feea47eb03a5585, +0xbc93cedd78565858, 0x3feea23882552225, +0xbc93b3efbf5e2228, 0x3feea09e667f3bcd, +0xbc6367efb86da9ee, 0x3fee9fb23c651a2f, +0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74, +0xbc8619321e55e68a, 0x3fee9feb564267c9, +0xbc7b32dcb94da51d, 0x3feea11473eb0187, +0x3c65ebe1abd66c55, 0x3feea2f336cf4e62, +0xbc9369b6f13b3734, 0x3feea589994cce13, +0xbc94d450d872576e, 0x3feea8d99b4492ed, +0x3c8db72fc1f0eab4, 0x3feeace5422aa0db, +0x3c7bf68359f35f44, 0x3feeb1ae99157736, +0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5, +0xbc92434322f4f9aa, 0x3feebd829fde4e50, +0x3c71affc2b91ce27, 0x3feec49182a3f090, +0xbc87c50422622263, 0x3feecc667b5de565, +0xbc91bbd1d3bcbb15, 0x3feed503b23e255d, +0x3c8469846e735ab3, 0x3feede6b5579fdbf, +0x3c8c1a7792cb3387, 0x3feee89f995ad3ad, +0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb, +0xbc68d6f438ad9334, 0x3feeff76f2fb5e47, +0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2, +0x3c736eae30af0cb3, 0x3fef199bdd85529c, +0x3c84e08fd10959ac, 0x3fef27f12e57d14b, +0x3c676b2c6c921968, 0x3fef3720dcef9069, +0xbc8fad5d3ffffa6f, 
0x3fef472d4a07897c, +0x3c74a385a63d07a7, 0x3fef5818dcfba487, +0x3c8e5a50d5c192ac, 0x3fef69e603db3285, +0xbc82d52107b43e1f, 0x3fef7c97337b9b5f, +0x3c74b604603a88d3, 0x3fef902ee78b3ff6, +0xbc8ff7128fd391f0, 0x3fefa4afa2a490da, +0x3c8ec3bc41aa2008, 0x3fefba1bee615a27, +0x3c8a64a931d185ee, 0x3fefd0765b6e4540, +0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8, +#elif N == 128 +0x0, 0x3ff0000000000000, +0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335, +0xbc7160139cd8dc5d, 0x3fefec9a3e778061, +0xbc905e7a108766d1, 0x3fefe315e86e7f85, +0x3c8cd2523567f613, 0x3fefd9b0d3158574, +0xbc8bce8023f98efa, 0x3fefd06b29ddf6de, +0x3c60f74e61e6c861, 0x3fefc74518759bc8, +0x3c90a3e45b33d399, 0x3fefbe3ecac6f383, +0x3c979aa65d837b6d, 0x3fefb5586cf9890f, +0x3c8eb51a92fdeffc, 0x3fefac922b7247f7, +0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2, +0xbc6a033489906e0b, 0x3fef9b66affed31b, +0xbc9556522a2fbd0e, 0x3fef9301d0125b51, +0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc, +0xbc91c923b9d5f416, 0x3fef829aaea92de0, +0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51, +0xbc801b15eaa59348, 0x3fef72b83c7d517b, +0xbc8f1ff055de323d, 0x3fef6af9388c8dea, +0x3c8b898c3f1353bf, 0x3fef635beb6fcb75, +0xbc96d99c7611eb26, 0x3fef5be084045cd4, +0x3c9aecf73e3a2f60, 0x3fef54873168b9aa, +0xbc8fe782cb86389d, 0x3fef4d5022fcd91d, +0x3c8a6f4144a6c38d, 0x3fef463b88628cd6, +0x3c807a05b0e4047d, 0x3fef3f49917ddc96, +0x3c968efde3a8a894, 0x3fef387a6e756238, +0x3c875e18f274487d, 0x3fef31ce4fb2a63f, +0x3c80472b981fe7f2, 0x3fef2b4565e27cdd, +0xbc96b87b3f71085e, 0x3fef24dfe1f56381, +0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1, +0xbc3d219b1a6fbffa, 0x3fef187fd0dad990, +0x3c8b3782720c0ab4, 0x3fef1285a6e4030b, +0x3c6e149289cecb8f, 0x3fef0cafa93e2f56, +0x3c834d754db0abb6, 0x3fef06fe0a31b715, +0x3c864201e2ac744c, 0x3fef0170fc4cd831, +0x3c8fdd395dd3f84a, 0x3feefc08b26416ff, +0xbc86a3803b8e5b04, 0x3feef6c55f929ff1, +0xbc924aedcc4b5068, 0x3feef1a7373aa9cb, +0xbc9907f81b512d8e, 0x3feeecae6d05d866, +0xbc71d1e83e9436d2, 0x3feee7db34e59ff7, +0xbc991919b3ce1b15, 0x3feee32dc313a8e5, +0x3c859f48a72a4c6d, 0x3feedea64c123422, +0xbc9312607a28698a, 0x3feeda4504ac801c, +0xbc58a78f4817895b, 0x3feed60a21f72e2a, +0xbc7c2c9b67499a1b, 0x3feed1f5d950a897, +0x3c4363ed60c2ac11, 0x3feece086061892d, +0x3c9666093b0664ef, 0x3feeca41ed1d0057, +0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0, +0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de, +0x3c7690cebb7aafb0, 0x3feebfdad5362a27, +0x3c931dbdeb54e077, 0x3feebcb299fddd0d, +0xbc8f94340071a38e, 0x3feeb9b2769d2ca7, +0xbc87deccdc93a349, 0x3feeb6daa2cf6642, +0xbc78dec6bd0f385f, 0x3feeb42b569d4f82, +0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f, +0x3c93350518fdd78e, 0x3feeaf4736b527da, +0x3c7b98b72f8a9b05, 0x3feead12d497c7fd, +0x3c9063e1e21c5409, 0x3feeab07dd485429, +0x3c34c7855019c6ea, 0x3feea9268a5946b7, +0x3c9432e62b64c035, 0x3feea76f15ad2148, +0xbc8ce44a6199769f, 0x3feea5e1b976dc09, +0xbc8c33c53bef4da8, 0x3feea47eb03a5585, +0xbc845378892be9ae, 0x3feea34634ccc320, +0xbc93cedd78565858, 0x3feea23882552225, +0x3c5710aa807e1964, 0x3feea155d44ca973, +0xbc93b3efbf5e2228, 0x3feea09e667f3bcd, +0xbc6a12ad8734b982, 0x3feea012750bdabf, +0xbc6367efb86da9ee, 0x3fee9fb23c651a2f, +0xbc80dc3d54e08851, 0x3fee9f7df9519484, +0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74, +0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174, +0xbc8619321e55e68a, 0x3fee9feb564267c9, +0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f, +0xbc7b32dcb94da51d, 0x3feea11473eb0187, +0x3c94ecfd5467c06b, 0x3feea1ed0130c132, +0x3c65ebe1abd66c55, 0x3feea2f336cf4e62, +0xbc88a1c52fb3cf42, 0x3feea427543e1a12, +0xbc9369b6f13b3734, 0x3feea589994cce13, +0xbc805e843a19ff1e, 0x3feea71a4623c7ad, 
+0xbc94d450d872576e, 0x3feea8d99b4492ed, +0x3c90ad675b0e8a00, 0x3feeaac7d98a6699, +0x3c8db72fc1f0eab4, 0x3feeace5422aa0db, +0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c, +0x3c7bf68359f35f44, 0x3feeb1ae99157736, +0xbc93091fa71e3d83, 0x3feeb45b0b91ffc6, +0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5, +0xbc6c23f97c90b959, 0x3feeba44cbc8520f, +0xbc92434322f4f9aa, 0x3feebd829fde4e50, +0xbc85ca6cd7668e4b, 0x3feec0f170ca07ba, +0x3c71affc2b91ce27, 0x3feec49182a3f090, +0x3c6dd235e10a73bb, 0x3feec86319e32323, +0xbc87c50422622263, 0x3feecc667b5de565, +0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33, +0xbc91bbd1d3bcbb15, 0x3feed503b23e255d, +0x3c90cc319cee31d2, 0x3feed99e1330b358, +0x3c8469846e735ab3, 0x3feede6b5579fdbf, +0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a, +0x3c8c1a7792cb3387, 0x3feee89f995ad3ad, +0xbc907b8f4ad1d9fa, 0x3feeee07298db666, +0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb, +0xbc90a40e3da6f640, 0x3feef9728de5593a, +0xbc68d6f438ad9334, 0x3feeff76f2fb5e47, +0xbc91eee26b588a35, 0x3fef05b030a1064a, +0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2, +0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09, +0x3c736eae30af0cb3, 0x3fef199bdd85529c, +0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a, +0x3c84e08fd10959ac, 0x3fef27f12e57d14b, +0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5, +0x3c676b2c6c921968, 0x3fef3720dcef9069, +0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa, +0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c, +0xbc900dae3875a949, 0x3fef4f87080d89f2, +0x3c74a385a63d07a7, 0x3fef5818dcfba487, +0xbc82919e2040220f, 0x3fef60e316c98398, +0x3c8e5a50d5c192ac, 0x3fef69e603db3285, +0x3c843a59ac016b4b, 0x3fef7321f301b460, +0xbc82d52107b43e1f, 0x3fef7c97337b9b5f, +0xbc892ab93b470dc9, 0x3fef864614f5a129, +0x3c74b604603a88d3, 0x3fef902ee78b3ff6, +0x3c83c5ec519d7271, 0x3fef9a51fbc74c83, +0xbc8ff7128fd391f0, 0x3fefa4afa2a490da, +0xbc8dae98e223747d, 0x3fefaf482d8e67f1, +0x3c8ec3bc41aa2008, 0x3fefba1bee615a27, +0x3c842b94c3a9eb32, 0x3fefc52b376bba97, +0x3c8a64a931d185ee, 0x3fefd0765b6e4540, +0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14, +0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8, +0x3c5305c14160cc89, 0x3feff3c22b8f71f1, +#elif N == 256 +0x0, 0x3ff0000000000000, +0xbc84e82fc61851ac, 0x3feffb1afa5abcbf, +0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335, +0xbc82985dd8521d32, 0x3feff168143b0281, +0xbc7160139cd8dc5d, 0x3fefec9a3e778061, +0x3c651e617061bfbd, 0x3fefe7d42e11bbcc, +0xbc905e7a108766d1, 0x3fefe315e86e7f85, +0x3c845fad437fa426, 0x3fefde5f72f654b1, +0x3c8cd2523567f613, 0x3fefd9b0d3158574, +0xbc954529642b232f, 0x3fefd50a0e3c1f89, +0xbc8bce8023f98efa, 0x3fefd06b29ddf6de, +0x3c8293708ef5c32e, 0x3fefcbd42b72a836, +0x3c60f74e61e6c861, 0x3fefc74518759bc8, +0xbc95b9280905b2a4, 0x3fefc2bdf66607e0, +0x3c90a3e45b33d399, 0x3fefbe3ecac6f383, +0x3c84f31f32c4b7e7, 0x3fefb9c79b1f3919, +0x3c979aa65d837b6d, 0x3fefb5586cf9890f, +0x3c9407fb30d06420, 0x3fefb0f145e46c85, +0x3c8eb51a92fdeffc, 0x3fefac922b7247f7, +0xbc9a5d04b3b9911b, 0x3fefa83b23395dec, +0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2, +0xbc937a01f0739546, 0x3fef9fa55fdfa9c5, +0xbc6a033489906e0b, 0x3fef9b66affed31b, +0x3c8b8268b04ef0a5, 0x3fef973028d7233e, +0xbc9556522a2fbd0e, 0x3fef9301d0125b51, +0xbc9ac46e44a2ebcc, 0x3fef8edbab5e2ab6, +0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc, +0xbc65704e90c9f860, 0x3fef86a814f204ab, +0xbc91c923b9d5f416, 0x3fef829aaea92de0, +0xbc897cea57e46280, 0x3fef7e95934f312e, +0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51, +0x3c56f01429e2b9d2, 0x3fef76a45471c3c2, +0xbc801b15eaa59348, 0x3fef72b83c7d517b, +0x3c6e653b2459034b, 0x3fef6ed48695bbc0, +0xbc8f1ff055de323d, 0x3fef6af9388c8dea, +0x3c92cc7ea345b7dc, 0x3fef672658375d2f, +0x3c8b898c3f1353bf, 
0x3fef635beb6fcb75, +0x3c957bfb2876ea9e, 0x3fef5f99f8138a1c, +0xbc96d99c7611eb26, 0x3fef5be084045cd4, +0x3c8cdc1873af2155, 0x3fef582f95281c6b, +0x3c9aecf73e3a2f60, 0x3fef54873168b9aa, +0xbc9493684653a131, 0x3fef50e75eb44027, +0xbc8fe782cb86389d, 0x3fef4d5022fcd91d, +0xbc98e2899077520a, 0x3fef49c18438ce4d, +0x3c8a6f4144a6c38d, 0x3fef463b88628cd6, +0x3c9120fcd4f59273, 0x3fef42be3578a819, +0x3c807a05b0e4047d, 0x3fef3f49917ddc96, +0x3c89b788c188c9b8, 0x3fef3bdda27912d1, +0x3c968efde3a8a894, 0x3fef387a6e756238, +0x3c877afbca90ef84, 0x3fef351ffb82140a, +0x3c875e18f274487d, 0x3fef31ce4fb2a63f, +0x3c91512f082876ee, 0x3fef2e85711ece75, +0x3c80472b981fe7f2, 0x3fef2b4565e27cdd, +0x3c9a02f0c7d75ec6, 0x3fef280e341ddf29, +0xbc96b87b3f71085e, 0x3fef24dfe1f56381, +0xbc803297e78260bf, 0x3fef21ba7591bb70, +0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1, +0xbc95b77e5ccd9fbf, 0x3fef1b8a66d10f13, +0xbc3d219b1a6fbffa, 0x3fef187fd0dad990, +0xbc91e75c40b4251e, 0x3fef157e39771b2f, +0x3c8b3782720c0ab4, 0x3fef1285a6e4030b, +0x3c98a911f1f7785a, 0x3fef0f961f641589, +0x3c6e149289cecb8f, 0x3fef0cafa93e2f56, +0xbc61e7c998db7dbb, 0x3fef09d24abd886b, +0x3c834d754db0abb6, 0x3fef06fe0a31b715, +0x3c85425c11faadf4, 0x3fef0432edeeb2fd, +0x3c864201e2ac744c, 0x3fef0170fc4cd831, +0xbc979517a03e2847, 0x3feefeb83ba8ea32, +0x3c8fdd395dd3f84a, 0x3feefc08b26416ff, +0xbc800e2a46da4bee, 0x3feef96266e3fa2d, +0xbc86a3803b8e5b04, 0x3feef6c55f929ff1, +0xbc87430803972b34, 0x3feef431a2de883b, +0xbc924aedcc4b5068, 0x3feef1a7373aa9cb, +0xbc954de30ae02d94, 0x3feeef26231e754a, +0xbc9907f81b512d8e, 0x3feeecae6d05d866, +0xbc94f2487e1c03ec, 0x3feeea401b7140ef, +0xbc71d1e83e9436d2, 0x3feee7db34e59ff7, +0x3c914a5432fcb2f4, 0x3feee57fbfec6cf4, +0xbc991919b3ce1b15, 0x3feee32dc313a8e5, +0x3c79c3bba5562a2f, 0x3feee0e544ede173, +0x3c859f48a72a4c6d, 0x3feedea64c123422, +0xbc85a71612e21658, 0x3feedc70df1c5175, +0xbc9312607a28698a, 0x3feeda4504ac801c, +0x3c86421f6f1d24d6, 0x3feed822c367a024, +0xbc58a78f4817895b, 0x3feed60a21f72e2a, +0xbc9348a6815fce65, 0x3feed3fb2709468a, +0xbc7c2c9b67499a1b, 0x3feed1f5d950a897, +0x3c835c43984d9871, 0x3feecffa3f84b9d4, +0x3c4363ed60c2ac11, 0x3feece086061892d, +0xbc632afc8d9473a0, 0x3feecc2042a7d232, +0x3c9666093b0664ef, 0x3feeca41ed1d0057, +0xbc95fc5e44de020e, 0x3feec86d668b3237, +0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0, +0xbc7ea0148327c42f, 0x3feec4e1e192aed2, +0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de, +0xbc7a843ad1a88022, 0x3feec17dea6db7d7, +0x3c7690cebb7aafb0, 0x3feebfdad5362a27, +0x3c892ca3bf144e63, 0x3feebe41b817c114, +0x3c931dbdeb54e077, 0x3feebcb299fddd0d, +0xbc902c99b04aa8b0, 0x3feebb2d81d8abff, +0xbc8f94340071a38e, 0x3feeb9b2769d2ca7, +0x3c73e34f67e67118, 0x3feeb8417f4531ee, +0xbc87deccdc93a349, 0x3feeb6daa2cf6642, +0xbc75a3b1197ba0f0, 0x3feeb57de83f4eef, +0xbc78dec6bd0f385f, 0x3feeb42b569d4f82, +0x3c81bd2888075068, 0x3feeb2e2f4f6ad27, +0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f, +0xbc896be8ae89ef8f, 0x3feeb070dde910d2, +0x3c93350518fdd78e, 0x3feeaf4736b527da, +0xbc88e6ac90348602, 0x3feeae27dbe2c4cf, +0x3c7b98b72f8a9b05, 0x3feead12d497c7fd, +0xbc91af7f1365c3ac, 0x3feeac0827ff07cc, +0x3c9063e1e21c5409, 0x3feeab07dd485429, +0xbc943a3540d1898a, 0x3feeaa11fba87a03, +0x3c34c7855019c6ea, 0x3feea9268a5946b7, +0xbc951f58ddaa8090, 0x3feea84590998b93, +0x3c9432e62b64c035, 0x3feea76f15ad2148, +0xbc82e1648e50a17c, 0x3feea6a320dceb71, +0xbc8ce44a6199769f, 0x3feea5e1b976dc09, +0x3c95f30eda98a575, 0x3feea52ae6cdf6f4, +0xbc8c33c53bef4da8, 0x3feea47eb03a5585, +0x3c917ecda8a72159, 0x3feea3dd1d1929fd, +0xbc845378892be9ae, 0x3feea34634ccc320, 
+0xbc9345f3cee1ae6e, 0x3feea2b9febc8fb7, +0xbc93cedd78565858, 0x3feea23882552225, +0xbc85c33fdf910406, 0x3feea1c1c70833f6, +0x3c5710aa807e1964, 0x3feea155d44ca973, +0x3c81079ab5789604, 0x3feea0f4b19e9538, +0xbc93b3efbf5e2228, 0x3feea09e667f3bcd, +0x3c727df161cd7778, 0x3feea052fa75173e, +0xbc6a12ad8734b982, 0x3feea012750bdabf, +0x3c93f9924a05b767, 0x3fee9fdcddd47645, +0xbc6367efb86da9ee, 0x3fee9fb23c651a2f, +0xbc87557939a8b5ef, 0x3fee9f9298593ae5, +0xbc80dc3d54e08851, 0x3fee9f7df9519484, +0x3c51ed2f56fa9d1a, 0x3fee9f7466f42e87, +0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74, +0xbc88e67a9006c909, 0x3fee9f8286ead08a, +0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174, +0x3c86597566977ac8, 0x3fee9fbd35d7cbfd, +0xbc8619321e55e68a, 0x3fee9feb564267c9, +0x3c92c0b7028a5c3a, 0x3feea024b1ab6e09, +0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f, +0x3c8a30faf49cc78c, 0x3feea0b938ac1cf6, +0xbc7b32dcb94da51d, 0x3feea11473eb0187, +0xbc92dad3519d7b5b, 0x3feea17b0976cfdb, +0x3c94ecfd5467c06b, 0x3feea1ed0130c132, +0x3c87d51410fd15c2, 0x3feea26a62ff86f0, +0x3c65ebe1abd66c55, 0x3feea2f336cf4e62, +0xbc760a3629969871, 0x3feea3878491c491, +0xbc88a1c52fb3cf42, 0x3feea427543e1a12, +0x3c8b18c6e3fdef5d, 0x3feea4d2add106d9, +0xbc9369b6f13b3734, 0x3feea589994cce13, +0x3c90ec1ddcb1390a, 0x3feea64c1eb941f7, +0xbc805e843a19ff1e, 0x3feea71a4623c7ad, +0xbc522cea4f3afa1e, 0x3feea7f4179f5b21, +0xbc94d450d872576e, 0x3feea8d99b4492ed, +0x3c7c88549b958471, 0x3feea9cad931a436, +0x3c90ad675b0e8a00, 0x3feeaac7d98a6699, +0x3c931143962f7877, 0x3feeabd0a478580f, +0x3c8db72fc1f0eab4, 0x3feeace5422aa0db, +0x3c93e9e96f112479, 0x3feeae05bad61778, +0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c, +0xbc8dac42a4a38df0, 0x3feeb06a5e0866d9, +0x3c7bf68359f35f44, 0x3feeb1ae99157736, +0x3c8b99dd98b1ed84, 0x3feeb2fed0282c8a, +0xbc93091fa71e3d83, 0x3feeb45b0b91ffc6, +0xbc7885ad50cbb750, 0x3feeb5c353aa2fe2, +0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5, +0xbc82d5e85f3e0301, 0x3feeb8b82b5f98e5, +0xbc6c23f97c90b959, 0x3feeba44cbc8520f, +0xbc51669428996971, 0x3feebbdd9a7670b3, +0xbc92434322f4f9aa, 0x3feebd829fde4e50, +0x3c71f2b2c1c4c014, 0x3feebf33e47a22a2, +0xbc85ca6cd7668e4b, 0x3feec0f170ca07ba, +0xbc9294f304f166b6, 0x3feec2bb4d53fe0d, +0x3c71affc2b91ce27, 0x3feec49182a3f090, +0xbc8a1e58414c07d3, 0x3feec674194bb8d5, +0x3c6dd235e10a73bb, 0x3feec86319e32323, +0xbc79740b58a20091, 0x3feeca5e8d07f29e, +0xbc87c50422622263, 0x3feecc667b5de565, +0x3c9165830a2b96c2, 0x3feece7aed8eb8bb, +0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33, +0xbc903d5cbe27874b, 0x3feed2c980460ad8, +0xbc91bbd1d3bcbb15, 0x3feed503b23e255d, +0x3c5986178980fce0, 0x3feed74a8af46052, +0x3c90cc319cee31d2, 0x3feed99e1330b358, +0xbc89472975b1f2a5, 0x3feedbfe53c12e59, +0x3c8469846e735ab3, 0x3feede6b5579fdbf, +0x3c7d8157a34b7e7f, 0x3feee0e521356eba, +0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a, +0x3c8c8a4e231ebb7d, 0x3feee5ff3a3c2774, +0x3c8c1a7792cb3387, 0x3feee89f995ad3ad, +0xbc888c8d11a142e5, 0x3feeeb4ce622f2ff, +0xbc907b8f4ad1d9fa, 0x3feeee07298db666, +0x3c889c2ea41433c7, 0x3feef0ce6c9a8952, +0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb, +0xbc7274aedac8ff80, 0x3feef68415b749b1, +0xbc90a40e3da6f640, 0x3feef9728de5593a, +0x3c85c620ce76df06, 0x3feefc6e29f1c52a, +0xbc68d6f438ad9334, 0x3feeff76f2fb5e47, +0xbc8fda52e1b51e41, 0x3fef028cf22749e4, +0xbc91eee26b588a35, 0x3fef05b030a1064a, +0xbc32141a7b3e2cd8, 0x3fef08e0b79a6f1f, +0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2, +0xbc302899507554e5, 0x3fef0f69c3f3a207, +0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09, +0xbc80dda2d4c0010c, 0x3fef16286141b33d, +0x3c736eae30af0cb3, 0x3fef199bdd85529c, +0xbc8a007daadf8d68, 
0x3fef1d1cd9fa652c, +0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a, +0x3c836909391181d3, 0x3fef244778fafb22, +0x3c84e08fd10959ac, 0x3fef27f12e57d14b, +0xbc811cd7dbdf9547, 0x3fef2ba88988c933, +0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5, +0xbc7ac28b7bef6621, 0x3fef33405751c4db, +0x3c676b2c6c921968, 0x3fef3720dcef9069, +0xbc7030587207b9e1, 0x3fef3b0f2e6d1675, +0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa, +0xbc8cc734592af7fc, 0x3fef43155b5bab74, +0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c, +0x3c87752a44f587e8, 0x3fef4b532b08c968, +0xbc900dae3875a949, 0x3fef4f87080d89f2, +0x3c85b66fefeef52e, 0x3fef53c8eacaa1d6, +0x3c74a385a63d07a7, 0x3fef5818dcfba487, +0x3c5159d9d908a96e, 0x3fef5c76e862e6d3, +0xbc82919e2040220f, 0x3fef60e316c98398, +0x3c8c254d16117a68, 0x3fef655d71ff6075, +0x3c8e5a50d5c192ac, 0x3fef69e603db3285, +0xbc8d8c329fbd0e03, 0x3fef6e7cd63a8315, +0x3c843a59ac016b4b, 0x3fef7321f301b460, +0xbc8ea6e6fbd5f2a6, 0x3fef77d5641c0658, +0xbc82d52107b43e1f, 0x3fef7c97337b9b5f, +0xbc63e8e3eab2cbb4, 0x3fef81676b197d17, +0xbc892ab93b470dc9, 0x3fef864614f5a129, +0xbc8b7966cd0d2cd9, 0x3fef8b333b16ee12, +0x3c74b604603a88d3, 0x3fef902ee78b3ff6, +0xbc776caa4c2ff1cf, 0x3fef953924676d76, +0x3c83c5ec519d7271, 0x3fef9a51fbc74c83, +0xbc81d5fc525d9940, 0x3fef9f7977cdb740, +0xbc8ff7128fd391f0, 0x3fefa4afa2a490da, +0x3c855cd8aaea3d21, 0x3fefa9f4867cca6e, +0xbc8dae98e223747d, 0x3fefaf482d8e67f1, +0x3c8269947c2bed4a, 0x3fefb4aaa2188510, +0x3c8ec3bc41aa2008, 0x3fefba1bee615a27, +0xbc83b6137e9afe9e, 0x3fefbf9c1cb6412a, +0x3c842b94c3a9eb32, 0x3fefc52b376bba97, +0xbc69fa74878ba7c7, 0x3fefcac948dd7274, +0x3c8a64a931d185ee, 0x3fefd0765b6e4540, +0x3c901f3a75ee0efe, 0x3fefd632798844f8, +0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14, +0xbc516a9ce6ed84fa, 0x3fefe1d802243c89, +0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8, +0xbc699c7db2effc76, 0x3fefedba3692d514, +0x3c5305c14160cc89, 0x3feff3c22b8f71f1, +0x3c64b458677f9840, 0x3feff9d96b2a23d9, +#elif N == 512 +0x0, 0x3ff0000000000000, +0xbc75d87ade1f60d5, 0x3feffd8c86da1c0a, +0xbc84e82fc61851ac, 0x3feffb1afa5abcbf, +0x3c9bffdaa7ac4bac, 0x3feff8ab5b2cbd11, +0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335, +0x3c75c18e5ae0563a, 0x3feff3d1e77170b4, +0xbc82985dd8521d32, 0x3feff168143b0281, +0xbc705b1125cf49a5, 0x3fefef003103b10e, +0xbc7160139cd8dc5d, 0x3fefec9a3e778061, +0x3c9f879abbff3f87, 0x3fefea363d42b027, +0x3c651e617061bfbd, 0x3fefe7d42e11bbcc, +0x3c9b14003824712a, 0x3fefe57411915a8a, +0xbc905e7a108766d1, 0x3fefe315e86e7f85, +0x3c61cbf0f38af658, 0x3fefe0b9b35659d8, +0x3c845fad437fa426, 0x3fefde5f72f654b1, +0xbc9a3316383dcbc5, 0x3fefdc0727fc1762, +0x3c8cd2523567f613, 0x3fefd9b0d3158574, +0x3c9901c9e0e797fd, 0x3fefd75c74f0bec2, +0xbc954529642b232f, 0x3fefd50a0e3c1f89, +0xbc89b3236d111646, 0x3fefd2b99fa6407c, +0xbc8bce8023f98efa, 0x3fefd06b29ddf6de, +0xbc8cb191be99b1b0, 0x3fefce1ead925493, +0x3c8293708ef5c32e, 0x3fefcbd42b72a836, +0xbc9acb71e83765b7, 0x3fefc98ba42e7d30, +0x3c60f74e61e6c861, 0x3fefc74518759bc8, +0x3c5cd3e58b03697e, 0x3fefc50088f8093f, +0xbc95b9280905b2a4, 0x3fefc2bdf66607e0, +0xbc8bfb07d4755452, 0x3fefc07d61701716, +0x3c90a3e45b33d399, 0x3fefbe3ecac6f383, +0x3c8aedeb3e7b14cd, 0x3fefbc02331b9715, +0x3c84f31f32c4b7e7, 0x3fefb9c79b1f3919, +0x3c9a8eb1f3d914b4, 0x3fefb78f03834e52, +0x3c979aa65d837b6d, 0x3fefb5586cf9890f, +0xbc85b9eb0402507b, 0x3fefb323d833d93f, +0x3c9407fb30d06420, 0x3fefb0f145e46c85, +0xbc93f0f225bbf3ee, 0x3fefaec0b6bdae53, +0x3c8eb51a92fdeffc, 0x3fefac922b7247f7, +0xbc9c3fe7282d1784, 0x3fefaa65a4b520ba, +0xbc9a5d04b3b9911b, 0x3fefa83b23395dec, +0x3c9c8be44bf4cde8, 0x3fefa612a7b26300, 
+0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2, +0x3c820c5444c93c44, 0x3fefa1c7c55189c6, +0xbc937a01f0739546, 0x3fef9fa55fdfa9c5, +0xbc84c6baeb580d7a, 0x3fef9d8503328e6d, +0xbc6a033489906e0b, 0x3fef9b66affed31b, +0x3c8657aa1b0d9f83, 0x3fef994a66f951ce, +0x3c8b8268b04ef0a5, 0x3fef973028d7233e, +0x3c62f2c7fd6ee145, 0x3fef9517f64d9ef1, +0xbc9556522a2fbd0e, 0x3fef9301d0125b51, +0xbc6b0b2789925e90, 0x3fef90edb6db2dc1, +0xbc9ac46e44a2ebcc, 0x3fef8edbab5e2ab6, +0xbc93aad17d197fae, 0x3fef8ccbae51a5c8, +0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc, +0xbc989c464a07ad70, 0x3fef88b1e264a0e9, +0xbc65704e90c9f860, 0x3fef86a814f204ab, +0xbc72c338fce197f4, 0x3fef84a058cbae1e, +0xbc91c923b9d5f416, 0x3fef829aaea92de0, +0xbc6dca724cea0eb6, 0x3fef809717425438, +0xbc897cea57e46280, 0x3fef7e95934f312e, +0x3c464770b955d34d, 0x3fef7c962388149e, +0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51, +0xbc962811c114424f, 0x3fef789d83606e12, +0x3c56f01429e2b9d2, 0x3fef76a45471c3c2, +0x3c8ec58e74904dd4, 0x3fef74ad3c92df73, +0xbc801b15eaa59348, 0x3fef72b83c7d517b, +0x3c8d63b0ab2d5bbf, 0x3fef70c554eaea89, +0x3c6e653b2459034b, 0x3fef6ed48695bbc0, +0xbc9ca9effbeeac92, 0x3fef6ce5d23816c9, +0xbc8f1ff055de323d, 0x3fef6af9388c8dea, +0x3c8bda920de0f6e2, 0x3fef690eba4df41f, +0x3c92cc7ea345b7dc, 0x3fef672658375d2f, +0xbc9a597f9a5ff71c, 0x3fef654013041dc2, +0x3c8b898c3f1353bf, 0x3fef635beb6fcb75, +0x3c50835b125aa573, 0x3fef6179e2363cf8, +0x3c957bfb2876ea9e, 0x3fef5f99f8138a1c, +0x3c8aaa13d61aec1f, 0x3fef5dbc2dc40bf0, +0xbc96d99c7611eb26, 0x3fef5be084045cd4, +0x3c8a4f81aa7110bd, 0x3fef5a06fb91588f, +0x3c8cdc1873af2155, 0x3fef582f95281c6b, +0xbc6817fd6a313e3e, 0x3fef565a51860746, +0x3c9aecf73e3a2f60, 0x3fef54873168b9aa, +0xbc96236af85fd26a, 0x3fef52b6358e15e8, +0xbc9493684653a131, 0x3fef50e75eb44027, +0x3c7795eb4523abe7, 0x3fef4f1aad999e82, +0xbc8fe782cb86389d, 0x3fef4d5022fcd91d, +0x3c8fe58b91b40095, 0x3fef4b87bf9cda38, +0xbc98e2899077520a, 0x3fef49c18438ce4d, +0x3c91ecaa860c614a, 0x3fef47fd7190241e, +0x3c8a6f4144a6c38d, 0x3fef463b88628cd6, +0xbc3e45c83ba0bbcb, 0x3fef447bc96ffc18, +0x3c9120fcd4f59273, 0x3fef42be3578a819, +0xbc29fd3bea07b4ee, 0x3fef4102cd3d09b9, +0x3c807a05b0e4047d, 0x3fef3f49917ddc96, +0x3c87f1c7350e256d, 0x3fef3d9282fc1f27, +0x3c89b788c188c9b8, 0x3fef3bdda27912d1, +0x3c420dac6c124f4f, 0x3fef3a2af0b63bff, +0x3c968efde3a8a894, 0x3fef387a6e756238, +0xbc99501d09bc09fd, 0x3fef36cc1c78903a, +0x3c877afbca90ef84, 0x3fef351ffb82140a, +0x3c73baf864dc8675, 0x3fef33760c547f15, +0x3c875e18f274487d, 0x3fef31ce4fb2a63f, +0x3c91b0575c1eaf54, 0x3fef3028c65fa1ff, +0x3c91512f082876ee, 0x3fef2e85711ece75, +0xbc90364bc9ce33ab, 0x3fef2ce450b3cb82, +0x3c80472b981fe7f2, 0x3fef2b4565e27cdd, +0xbc7548165d85ed32, 0x3fef29a8b16f0a30, +0x3c9a02f0c7d75ec6, 0x3fef280e341ddf29, +0x3c7c3b977a68e32c, 0x3fef2675eeb3ab98, +0xbc96b87b3f71085e, 0x3fef24dfe1f56381, +0xbc93a255f697ecfe, 0x3fef234c0ea83f36, +0xbc803297e78260bf, 0x3fef21ba7591bb70, +0x3c8d2d19edc1e550, 0x3fef202b17779965, +0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1, +0xbc76b2173113dd8c, 0x3fef1d130f50d65c, +0xbc95b77e5ccd9fbf, 0x3fef1b8a66d10f13, +0x3c811aa5f853590b, 0x3fef1a03fc675d1f, +0xbc3d219b1a6fbffa, 0x3fef187fd0dad990, +0x3c61d61a34c8aa02, 0x3fef16fde4f2e280, +0xbc91e75c40b4251e, 0x3fef157e39771b2f, +0xbc91f892bf6b286d, 0x3fef1400cf2f6c18, +0x3c8b3782720c0ab4, 0x3fef1285a6e4030b, +0x3c7590c65c20e680, 0x3fef110cc15d5346, +0x3c98a911f1f7785a, 0x3fef0f961f641589, +0x3c86fe320b5c1e9d, 0x3fef0e21c1c14833, +0x3c6e149289cecb8f, 0x3fef0cafa93e2f56, +0xbc903cd8b2f25790, 0x3fef0b3fd6a454d2, +0xbc61e7c998db7dbb, 
0x3fef09d24abd886b, +0x3c7b3bf786a54a87, 0x3fef08670653dfe4, +0x3c834d754db0abb6, 0x3fef06fe0a31b715, +0x3c74bb6c41732885, 0x3fef05975721b004, +0x3c85425c11faadf4, 0x3fef0432edeeb2fd, +0xbc99d7399abb9a8b, 0x3fef02d0cf63eeac, +0x3c864201e2ac744c, 0x3fef0170fc4cd831, +0xbc5451d60c6ac9eb, 0x3fef001375752b40, +0xbc979517a03e2847, 0x3feefeb83ba8ea32, +0x3c8787a210ceafd9, 0x3feefd5f4fb45e20, +0x3c8fdd395dd3f84a, 0x3feefc08b26416ff, +0xbc888d1e4629943d, 0x3feefab46484ebb4, +0xbc800e2a46da4bee, 0x3feef96266e3fa2d, +0xbc93369c544088b6, 0x3feef812ba4ea77d, +0xbc86a3803b8e5b04, 0x3feef6c55f929ff1, +0x3c85373ce4eb6dfb, 0x3feef57a577dd72b, +0xbc87430803972b34, 0x3feef431a2de883b, +0x3c83adec8265a67f, 0x3feef2eb428335b4, +0xbc924aedcc4b5068, 0x3feef1a7373aa9cb, +0xbc835388bcac6bc5, 0x3feef06581d3f669, +0xbc954de30ae02d94, 0x3feeef26231e754a, +0x3c727cdb4e4b6640, 0x3feeede91be9c811, +0xbc9907f81b512d8e, 0x3feeecae6d05d866, +0x3c86c2696a26af35, 0x3feeeb761742d808, +0xbc94f2487e1c03ec, 0x3feeea401b7140ef, +0x3c888f6ff06b979a, 0x3feee90c7a61d55b, +0xbc71d1e83e9436d2, 0x3feee7db34e59ff7, +0xbc89d5efaabc2030, 0x3feee6ac4bcdf3ea, +0x3c914a5432fcb2f4, 0x3feee57fbfec6cf4, +0xbc76b8867f91c9d6, 0x3feee4559212ef89, +0xbc991919b3ce1b15, 0x3feee32dc313a8e5, +0x3c94c9c0b5157fe6, 0x3feee20853c10f28, +0x3c79c3bba5562a2f, 0x3feee0e544ede173, +0xbc62455345b51c8e, 0x3feedfc4976d27fa, +0x3c859f48a72a4c6d, 0x3feedea64c123422, +0xbc93331de45477d0, 0x3feedd8a63b0a09b, +0xbc85a71612e21658, 0x3feedc70df1c5175, +0xbc95f84d39b39b16, 0x3feedb59bf29743f, +0xbc9312607a28698a, 0x3feeda4504ac801c, +0xbc72ba4dc7c4d562, 0x3feed932b07a35df, +0x3c86421f6f1d24d6, 0x3feed822c367a024, +0xbc844f25dc02691f, 0x3feed7153e4a136a, +0xbc58a78f4817895b, 0x3feed60a21f72e2a, +0xbc888d328eb9b501, 0x3feed5016f44d8f5, +0xbc9348a6815fce65, 0x3feed3fb2709468a, +0x3c7f0bec42ddb15a, 0x3feed2f74a1af3f1, +0xbc7c2c9b67499a1b, 0x3feed1f5d950a897, +0xbc615f0a2b9cd452, 0x3feed0f6d5817663, +0x3c835c43984d9871, 0x3feecffa3f84b9d4, +0xbc8c2e465a919e1d, 0x3feecf0018321a1a, +0x3c4363ed60c2ac11, 0x3feece086061892d, +0xbc865dfd02bd08f1, 0x3feecd1318eb43ec, +0xbc632afc8d9473a0, 0x3feecc2042a7d232, +0xbc8e68cec89b1762, 0x3feecb2fde7006f4, +0x3c9666093b0664ef, 0x3feeca41ed1d0057, +0xbc48ae858eb682ca, 0x3feec9566f8827d0, +0xbc95fc5e44de020e, 0x3feec86d668b3237, +0x3c5dd71277c0915f, 0x3feec786d3001fe5, +0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0, +0x3c92001325ecd7fb, 0x3feec5c10fa920a1, +0xbc7ea0148327c42f, 0x3feec4e1e192aed2, +0x3c65ace6e2870332, 0x3feec4052c5916c4, +0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de, +0xbc9595c55690ffaf, 0x3feec2532feaada6, +0xbc7a843ad1a88022, 0x3feec17dea6db7d7, +0xbc8b401ba9fb5199, 0x3feec0ab213d5283, +0x3c7690cebb7aafb0, 0x3feebfdad5362a27, +0x3c6df82bf324cc57, 0x3feebf0d073537ca, +0x3c892ca3bf144e63, 0x3feebe41b817c114, +0x3c97cae38641c7bb, 0x3feebd78e8bb586b, +0x3c931dbdeb54e077, 0x3feebcb299fddd0d, +0x3c62d80c5c4a2b67, 0x3feebbeeccbd7b2a, +0xbc902c99b04aa8b0, 0x3feebb2d81d8abff, +0x3c8f39c10d12eaf0, 0x3feeba6eba2e35f0, +0xbc8f94340071a38e, 0x3feeb9b2769d2ca7, +0xbc80b582d74a55d9, 0x3feeb8f8b804f127, +0x3c73e34f67e67118, 0x3feeb8417f4531ee, +0xbc6b4e327ff434ca, 0x3feeb78ccd3deb0d, +0xbc87deccdc93a349, 0x3feeb6daa2cf6642, +0xbc592dca38593e20, 0x3feeb62b00da3b14, +0xbc75a3b1197ba0f0, 0x3feeb57de83f4eef, +0xbc85daca9994833e, 0x3feeb4d359dfd53d, +0xbc78dec6bd0f385f, 0x3feeb42b569d4f82, +0xbc980b4321bc6dae, 0x3feeb385df598d78, +0x3c81bd2888075068, 0x3feeb2e2f4f6ad27, +0xbc8390afec5241c5, 0x3feeb24298571b06, +0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f, 
+0x3c8f15cdafe7d586, 0x3feeb1098bed1bdf, +0xbc896be8ae89ef8f, 0x3feeb070dde910d2, +0xbc910aa91ae9b67f, 0x3feeafdac1351819, +0x3c93350518fdd78e, 0x3feeaf4736b527da, +0x3c957e1b67462375, 0x3feeaeb63f4d854c, +0xbc88e6ac90348602, 0x3feeae27dbe2c4cf, +0x3c8124d5051552a7, 0x3feead9c0d59ca07, +0x3c7b98b72f8a9b05, 0x3feead12d497c7fd, +0xbc3ca103952ecf1f, 0x3feeac8c32824135, +0xbc91af7f1365c3ac, 0x3feeac0827ff07cc, +0x3c773345c02a4fd6, 0x3feeab86b5f43d92, +0x3c9063e1e21c5409, 0x3feeab07dd485429, +0xbc909d2a0fce20f2, 0x3feeaa8b9ee20d1e, +0xbc943a3540d1898a, 0x3feeaa11fba87a03, +0xbc924f2cb4f81746, 0x3feea99af482fc8f, +0x3c34c7855019c6ea, 0x3feea9268a5946b7, +0xbc943592a0a9846b, 0x3feea8b4be135acc, +0xbc951f58ddaa8090, 0x3feea84590998b93, +0xbc956bc85d444f4f, 0x3feea7d902d47c65, +0x3c9432e62b64c035, 0x3feea76f15ad2148, +0x3c914d1e4218319f, 0x3feea707ca0cbf0f, +0xbc82e1648e50a17c, 0x3feea6a320dceb71, +0x3c971c93709313f4, 0x3feea6411b078d26, +0xbc8ce44a6199769f, 0x3feea5e1b976dc09, +0x3c7f88303b60d222, 0x3feea584fd15612a, +0x3c95f30eda98a575, 0x3feea52ae6cdf6f4, +0x3c70125ca18d4b5b, 0x3feea4d3778bc944, +0xbc8c33c53bef4da8, 0x3feea47eb03a5585, +0x3c9592ea73798b11, 0x3feea42c91c56acd, +0x3c917ecda8a72159, 0x3feea3dd1d1929fd, +0xbc9371d6d7d75739, 0x3feea390532205d8, +0xbc845378892be9ae, 0x3feea34634ccc320, +0xbc8ac05fd996f807, 0x3feea2fec30678b7, +0xbc9345f3cee1ae6e, 0x3feea2b9febc8fb7, +0xbc91f5067d03653a, 0x3feea277e8dcc390, +0xbc93cedd78565858, 0x3feea23882552225, +0x3c917339c86ce3ad, 0x3feea1fbcc140be7, +0xbc85c33fdf910406, 0x3feea1c1c70833f6, +0xbc77e66065ba2500, 0x3feea18a7420a036, +0x3c5710aa807e1964, 0x3feea155d44ca973, +0x3c964c827ee6b49a, 0x3feea123e87bfb7a, +0x3c81079ab5789604, 0x3feea0f4b19e9538, +0xbc928311a3c73480, 0x3feea0c830a4c8d4, +0xbc93b3efbf5e2228, 0x3feea09e667f3bcd, +0x3c882c79e185e981, 0x3feea077541ee718, +0x3c727df161cd7778, 0x3feea052fa75173e, +0xbc8b48cea80b043b, 0x3feea0315a736c75, +0xbc6a12ad8734b982, 0x3feea012750bdabf, +0xbc4f4863bc8e5180, 0x3fee9ff64b30aa09, +0x3c93f9924a05b767, 0x3fee9fdcddd47645, +0x3c954835dd4b7548, 0x3fee9fc62dea2f8a, +0xbc6367efb86da9ee, 0x3fee9fb23c651a2f, +0xbc8bf41f59b59f8a, 0x3fee9fa10a38cee8, +0xbc87557939a8b5ef, 0x3fee9f9298593ae5, +0xbc8f652fde52775c, 0x3fee9f86e7ba9fef, +0xbc80dc3d54e08851, 0x3fee9f7df9519484, +0xbc7b0300defbcf98, 0x3fee9f77ce1303f6, +0x3c51ed2f56fa9d1a, 0x3fee9f7466f42e87, +0xbc89dab646035dc0, 0x3fee9f73c4eaa988, +0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74, +0xbc91f0c230588dde, 0x3fee9f7ad3ef9011, +0xbc88e67a9006c909, 0x3fee9f8286ead08a, +0x3c9106450507a28c, 0x3fee9f8d02d50b8f, +0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174, +0xbc9129729a10f3a0, 0x3fee9faa5953c849, +0x3c86597566977ac8, 0x3fee9fbd35d7cbfd, +0x3c781a70a5124f67, 0x3fee9fd2df29ce7c, +0xbc8619321e55e68a, 0x3fee9feb564267c9, +0x3c941626ea62646d, 0x3feea0069c1a861d, +0x3c92c0b7028a5c3a, 0x3feea024b1ab6e09, +0xbc940b9f54365b7c, 0x3feea04597eeba8f, +0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f, +0x3c873455e0e826c1, 0x3feea08fda749e5d, +0x3c8a30faf49cc78c, 0x3feea0b938ac1cf6, +0x3c94f006ad874e3e, 0x3feea0e56b7fcf03, +0xbc7b32dcb94da51d, 0x3feea11473eb0187, +0xbc8f6d693d0973bb, 0x3feea14652e958aa, +0xbc92dad3519d7b5b, 0x3feea17b0976cfdb, +0x3c58c5ee2b7e7848, 0x3feea1b2988fb9ec, +0x3c94ecfd5467c06b, 0x3feea1ed0130c132, +0xbc88b25e045d207b, 0x3feea22a4456e7a3, +0x3c87d51410fd15c2, 0x3feea26a62ff86f0, +0xbc69cb3314060ca7, 0x3feea2ad5e2850ac, +0x3c65ebe1abd66c55, 0x3feea2f336cf4e62, +0x3c87a0b15d19e0bb, 0x3feea33bedf2e1b9, +0xbc760a3629969871, 0x3feea3878491c491, +0x3c94aa7212bfa73c, 
0x3feea3d5fbab091f, +0xbc88a1c52fb3cf42, 0x3feea427543e1a12, +0xbc81e688272a8a12, 0x3feea47b8f4abaa9, +0x3c8b18c6e3fdef5d, 0x3feea4d2add106d9, +0x3c4ab7b7112ec9d5, 0x3feea52cb0d1736a, +0xbc9369b6f13b3734, 0x3feea589994cce13, +0x3c8a1e274eed4476, 0x3feea5e968443d9a, +0x3c90ec1ddcb1390a, 0x3feea64c1eb941f7, +0x3c94a533a59324da, 0x3feea6b1bdadb46d, +0xbc805e843a19ff1e, 0x3feea71a4623c7ad, +0x3c7a56d2760d087d, 0x3feea785b91e07f1, +0xbc522cea4f3afa1e, 0x3feea7f4179f5b21, +0x3c91682c1c6e8b05, 0x3feea86562ab00ec, +0xbc94d450d872576e, 0x3feea8d99b4492ed, +0x3c89ea99cf7a9591, 0x3feea950c27004c2, +0x3c7c88549b958471, 0x3feea9cad931a436, +0xbc59e57d8f92ff8e, 0x3feeaa47e08e1957, +0x3c90ad675b0e8a00, 0x3feeaac7d98a6699, +0x3c909b176e05a9cd, 0x3feeab4ac52be8f7, +0x3c931143962f7877, 0x3feeabd0a478580f, +0x3c711607f1952c95, 0x3feeac597875c644, +0x3c8db72fc1f0eab4, 0x3feeace5422aa0db, +0x3c869608f0f86431, 0x3feead74029db01e, +0x3c93e9e96f112479, 0x3feeae05bad61778, +0xbc7f1ced15c5c5c0, 0x3feeae9a6bdb5598, +0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c, +0x3c614b97be3f7b4e, 0x3feeafccbc6c19e6, +0xbc8dac42a4a38df0, 0x3feeb06a5e0866d9, +0x3c81c1701c359530, 0x3feeb10afc931857, +0x3c7bf68359f35f44, 0x3feeb1ae99157736, +0xbc8edb1bf6809287, 0x3feeb2553499284b, +0x3c8b99dd98b1ed84, 0x3feeb2fed0282c8a, +0xbc8ba58ce7a736d3, 0x3feeb3ab6ccce12c, +0xbc93091fa71e3d83, 0x3feeb45b0b91ffc6, +0xbc93fc025e1db9ce, 0x3feeb50dad829e70, +0xbc7885ad50cbb750, 0x3feeb5c353aa2fe2, +0xbc8d737c7d71382e, 0x3feeb67bff148396, +0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5, +0x3c6ae88c43905293, 0x3feeb7f669e2802b, +0xbc82d5e85f3e0301, 0x3feeb8b82b5f98e5, +0xbc93d1f7661fe51b, 0x3feeb97cf65253d1, +0xbc6c23f97c90b959, 0x3feeba44cbc8520f, +0x3c651b68797ffc1c, 0x3feebb0faccf9243, +0xbc51669428996971, 0x3feebbdd9a7670b3, +0x3c54579c5ceed70b, 0x3feebcae95cba768, +0xbc92434322f4f9aa, 0x3feebd829fde4e50, +0x3c87298413381667, 0x3feebe59b9bddb5b, +0x3c71f2b2c1c4c014, 0x3feebf33e47a22a2, +0xbc905000be64e965, 0x3feec01121235681, +0xbc85ca6cd7668e4b, 0x3feec0f170ca07ba, +0xbc89fb12e3454b73, 0x3feec1d4d47f2598, +0xbc9294f304f166b6, 0x3feec2bb4d53fe0d, +0x3c7be2a03697693b, 0x3feec3a4dc5a3dd3, +0x3c71affc2b91ce27, 0x3feec49182a3f090, +0x3c90622b15810eea, 0x3feec581414380f2, +0xbc8a1e58414c07d3, 0x3feec674194bb8d5, +0x3be9a5ecc875d327, 0x3feec76a0bcfc15e, +0x3c6dd235e10a73bb, 0x3feec86319e32323, +0x3c88ea486a3350ef, 0x3feec95f4499c647, +0xbc79740b58a20091, 0x3feeca5e8d07f29e, +0xbc7a2ee551d4c40f, 0x3feecb60f4424fcb, +0xbc87c50422622263, 0x3feecc667b5de565, +0x3c89c31f7e38028b, 0x3feecd6f23701b15, +0x3c9165830a2b96c2, 0x3feece7aed8eb8bb, +0xbc5fac13f4e005a3, 0x3feecf89dacfe68c, +0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33, +0x3c7d8aced7162e89, 0x3feed1b1231475f7, +0xbc903d5cbe27874b, 0x3feed2c980460ad8, +0xbc848f50cea7269f, 0x3feed3e504f696b1, +0xbc91bbd1d3bcbb15, 0x3feed503b23e255d, +0x3c821eb9a08a0542, 0x3feed625893523d4, +0x3c5986178980fce0, 0x3feed74a8af46052, +0xbc6133a953131cfd, 0x3feed872b8950a73, +0x3c90cc319cee31d2, 0x3feed99e1330b358, +0x3c89e95e6f4a0ae4, 0x3feedacc9be14dca, +0xbc89472975b1f2a5, 0x3feedbfe53c12e59, +0xbc90260cf07cb311, 0x3feedd333beb0b7e, +0x3c8469846e735ab3, 0x3feede6b5579fdbf, +0x3c1bca400a7b939d, 0x3feedfa6a1897fd2, +0x3c7d8157a34b7e7f, 0x3feee0e521356eba, +0x3c9140bc34dfc19f, 0x3feee226d59a09ee, +0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a, +0xbc8c9b1da461ab87, 0x3feee4b3e100301e, +0x3c8c8a4e231ebb7d, 0x3feee5ff3a3c2774, +0x3c8c115f23ebea8e, 0x3feee74dcca5a413, +0x3c8c1a7792cb3387, 0x3feee89f995ad3ad, +0xbc6dcab99f23f84e, 0x3feee9f4a17a4735, 
+0xbc888c8d11a142e5, 0x3feeeb4ce622f2ff, +0x3c60a43e8b7e4bfe, 0x3feeeca868742ee4, +0xbc907b8f4ad1d9fa, 0x3feeee07298db666, +0x3c915b1397075f04, 0x3feeef692a8fa8cd, +0x3c889c2ea41433c7, 0x3feef0ce6c9a8952, +0xbc839f7a1f04d2b0, 0x3feef236f0cf3f3a, +0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb, +0xbc86a510f31e13e6, 0x3feef511c43bbd62, +0xbc7274aedac8ff80, 0x3feef68415b749b1, +0xbc92887ea88e7340, 0x3feef7f9ade433c6, +0xbc90a40e3da6f640, 0x3feef9728de5593a, +0xbc6e57ac604759ba, 0x3feefaeeb6ddfc87, +0x3c85c620ce76df06, 0x3feefc6e29f1c52a, +0x3c8e6c6db4f83226, 0x3feefdf0e844bfc6, +0xbc68d6f438ad9334, 0x3feeff76f2fb5e47, +0xbc8d1bf10460dba0, 0x3fef01004b3a7804, +0xbc8fda52e1b51e41, 0x3fef028cf22749e4, +0x3c8e5d80813dddfc, 0x3fef041ce8e77680, +0xbc91eee26b588a35, 0x3fef05b030a1064a, +0x3c8caff9640f2dcb, 0x3fef0746ca7a67a7, +0xbc32141a7b3e2cd8, 0x3fef08e0b79a6f1f, +0x3c7a77557fd62db3, 0x3fef0a7df9285775, +0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2, +0xbc651ba6128db749, 0x3fef0dc27e2cb5e5, +0xbc302899507554e5, 0x3fef0f69c3f3a207, +0xbc7c0ffefdc5e251, 0x3fef111462c95b60, +0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09, +0xbc8b6cd058bfd6fa, 0x3fef1473b0468d30, +0xbc80dda2d4c0010c, 0x3fef16286141b33d, +0x3c923759b8aca76d, 0x3fef17e06ff301f4, +0x3c736eae30af0cb3, 0x3fef199bdd85529c, +0xbc895498a73dac7d, 0x3fef1b5aab23e61e, +0xbc8a007daadf8d68, 0x3fef1d1cd9fa652c, +0x3c851de924583108, 0x3fef1ee26b34e065, +0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a, +0xbc8c5fe4051ba06c, 0x3fef2277b9881650, +0x3c836909391181d3, 0x3fef244778fafb22, +0xbc6d1816c0a9ac07, 0x3fef261a9f8630ad, +0x3c84e08fd10959ac, 0x3fef27f12e57d14b, +0xbc7af5c67c4e8235, 0x3fef29cb269e601f, +0xbc811cd7dbdf9547, 0x3fef2ba88988c933, +0xbc8304ef0045d575, 0x3fef2d89584661a1, +0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5, +0x3c8725f94f910375, 0x3fef31553dfa8313, +0xbc7ac28b7bef6621, 0x3fef33405751c4db, +0x3c7b53e99f9191e8, 0x3fef352ee13da7cb, +0x3c676b2c6c921968, 0x3fef3720dcef9069, +0xbc810a79e6d7e2b8, 0x3fef39164b994d23, +0xbc7030587207b9e1, 0x3fef3b0f2e6d1675, +0x3c840635f6d2a9c0, 0x3fef3d0b869d8f0f, +0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa, +0x3c549eeef9ec910c, 0x3fef410e9be12cb9, +0xbc8cc734592af7fc, 0x3fef43155b5bab74, +0xbc8335827ffb9dce, 0x3fef451f95018d17, +0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c, +0x3c645563980ef762, 0x3fef493e7ba2c38c, +0x3c87752a44f587e8, 0x3fef4b532b08c968, +0xbc8cd0205eb2aab2, 0x3fef4d6b596f948c, +0xbc900dae3875a949, 0x3fef4f87080d89f2, +0xbc8aab80ceab2b4a, 0x3fef51a638197a3c, +0x3c85b66fefeef52e, 0x3fef53c8eacaa1d6, +0xbc8f870f40a8ba1b, 0x3fef55ef2158a91f, +0x3c74a385a63d07a7, 0x3fef5818dcfba487, +0x3c83c119f18464c5, 0x3fef5a461eec14be, +0x3c5159d9d908a96e, 0x3fef5c76e862e6d3, +0xbc5a628c2be4e7c7, 0x3fef5eab3a99745b, +0xbc82919e2040220f, 0x3fef60e316c98398, +0xbc72550d76be719a, 0x3fef631e7e2d479d, +0x3c8c254d16117a68, 0x3fef655d71ff6075, +0xbc82090274667d12, 0x3fef679ff37adb4a, +0x3c8e5a50d5c192ac, 0x3fef69e603db3285, +0x3c75f7d28150cac4, 0x3fef6c2fa45c4dfd, +0xbc8d8c329fbd0e03, 0x3fef6e7cd63a8315, +0x3c890de9296f4cd1, 0x3fef70cd9ab294e4, +0x3c843a59ac016b4b, 0x3fef7321f301b460, +0x3c832ff9978b34bc, 0x3fef7579e065807d, +0xbc8ea6e6fbd5f2a6, 0x3fef77d5641c0658, +0xbc7303b63dda1980, 0x3fef7a347f63c159, +0xbc82d52107b43e1f, 0x3fef7c97337b9b5f, +0xbc81f2ba385f2f95, 0x3fef7efd81a2ece1, +0xbc63e8e3eab2cbb4, 0x3fef81676b197d17, +0x3c768d9144ae12fc, 0x3fef83d4f11f8220, +0xbc892ab93b470dc9, 0x3fef864614f5a129, +0x3c853687f542403b, 0x3fef88bad7dcee90, +0xbc8b7966cd0d2cd9, 0x3fef8b333b16ee12, +0xbc736ed2de40b407, 0x3fef8daf3fe592e8, +0x3c74b604603a88d3, 
0x3fef902ee78b3ff6,
+0xbc614ef56c770f3b, 0x3fef92b2334ac7ee,
+0xbc776caa4c2ff1cf, 0x3fef953924676d76,
+0x3c8df7d1353d8e88, 0x3fef97c3bc24e350,
+0x3c83c5ec519d7271, 0x3fef9a51fbc74c83,
+0xbc850bed64091b8a, 0x3fef9ce3e4933c7e,
+0xbc81d5fc525d9940, 0x3fef9f7977cdb740,
+0x3c89d852381c317f, 0x3fefa212b6bc3181,
+0xbc8ff7128fd391f0, 0x3fefa4afa2a490da,
+0x3c68a00e3cca04c4, 0x3fefa7503ccd2be5,
+0x3c855cd8aaea3d21, 0x3fefa9f4867cca6e,
+0xbc5a1f25ce94cae7, 0x3fefac9c80faa594,
+0xbc8dae98e223747d, 0x3fefaf482d8e67f1,
+0xbc6fb5f3ee307976, 0x3fefb1f78d802dc2,
+0x3c8269947c2bed4a, 0x3fefb4aaa2188510,
+0x3c737e8ae802b851, 0x3fefb7616ca06dd6,
+0x3c8ec3bc41aa2008, 0x3fefba1bee615a27,
+0x3c875119560e34af, 0x3fefbcda28a52e59,
+0xbc83b6137e9afe9e, 0x3fefbf9c1cb6412a,
+0xbc7431c3840929c6, 0x3fefc261cbdf5be7,
+0x3c842b94c3a9eb32, 0x3fefc52b376bba97,
+0xbc8cb472d2e86b99, 0x3fefc7f860a70c22,
+0xbc69fa74878ba7c7, 0x3fefcac948dd7274,
+0x3c83f5df2fde16a8, 0x3fefcd9df15b82ac,
+0x3c8a64a931d185ee, 0x3fefd0765b6e4540,
+0x3c8eef18336b62e3, 0x3fefd35288633625,
+0x3c901f3a75ee0efe, 0x3fefd632798844f8,
+0x3c80d23f87b50a2a, 0x3fefd916302bd526,
+0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14,
+0x3c8302dee657c8e6, 0x3fefdee8f32a4b45,
+0xbc516a9ce6ed84fa, 0x3fefe1d802243c89,
+0xbc7b0caa080df170, 0x3fefe4cadbdac61d,
+0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8,
+0x3c7617a9f2fd24e5, 0x3fefeabbf4c0ba54,
+0xbc699c7db2effc76, 0x3fefedba3692d514,
+0x3c75f103b8fd5ca7, 0x3feff0bc4866e8ad,
+0x3c5305c14160cc89, 0x3feff3c22b8f71f1,
+0x3c8e70b094fa075a, 0x3feff6cbe15f6314,
+0x3c64b458677f9840, 0x3feff9d96b2a23d9,
+0xbc72ec9a3e5d680a, 0x3feffceaca4391b6,
+#endif
+},
+};
diff --git a/pl/math/math_config.h b/pl/math/math_config.h
index db16a3d..18d2a6e 100644
--- a/pl/math/math_config.h
+++ b/pl/math/math_config.h
@@ -350,4 +350,33 @@ extern const struct log10_data
 #endif
 } __log10_data HIDDEN;
 
+#define EXP_TABLE_BITS 7
+#define EXP_POLY_ORDER 5
+/* Use polynomial that is optimized for a wider input range. This may be
+   needed for good precision in non-nearest rounding and !TOINT_INTRINSICS. */
+#define EXP_POLY_WIDE 0
+/* Use close to nearest rounding toint when !TOINT_INTRINSICS. This may be
+   needed for good precision in non-nearest rounding and !EXP_POLY_WIDE. */
+#define EXP_USE_TOINT_NARROW 0
+#define EXP2_POLY_ORDER 5
+#define EXP2_POLY_WIDE 0
+extern const struct exp_data
+{
+  double invln2N;
+  double shift;
+  double negln2hiN;
+  double negln2loN;
+  double poly[4]; /* Last four coefficients.
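+                     With EXP_POLY_ORDER == 5 these are C2..C5, addressed
+                     from exp.c as poly[5 - EXP_POLY_ORDER] onwards.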
*/
+  double exp2_shift;
+  double exp2_poly[EXP2_POLY_ORDER];
+  uint64_t tab[2*(1 << EXP_TABLE_BITS)];
+} __exp_data HIDDEN;
+
+#define ERFC_NUM_INTERVALS 20
+#define ERFC_POLY_ORDER 12
+extern const struct erfc_data
+{
+  double interval_bounds[ERFC_NUM_INTERVALS + 1];
+  double poly[ERFC_NUM_INTERVALS][ERFC_POLY_ORDER + 1];
+} __erfc_data HIDDEN;
 #endif
diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h
index 8b70d47..a0b3230 100644
--- a/pl/math/test/mathbench_funcs.h
+++ b/pl/math/test/mathbench_funcs.h
@@ -7,6 +7,7 @@
 F (erff, -4.0, 4.0)
 F (log10f, 0.01, 11.1)
 
+D (erfc, -6.0, 28.0)
 D (log10, 0.01, 11.1)
 
 #if WANT_VMATH
diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh
index c2e8455..e335c62 100755
--- a/pl/math/test/runulp.sh
+++ b/pl/math/test/runulp.sh
@@ -54,6 +54,15 @@ t log10 0 0xffff000000000000 10000
 t log10 0x1p-4 0x1p4 40000
 t log10 0 inf 40000
 
+L=3.5
+t erfc 0 0xffff0000 10000
+t erfc 0x1p-1022 0x1p-26 40000
+t erfc -0x1p-1022 -0x1p-26 40000
+t erfc 0x1p-26 0x1p5 40000
+t erfc -0x1p-26 -0x1p3 40000
+t erfc 0 inf 40000
+Ldir=0.5
+
 done
 
 # vector functions
diff --git a/pl/math/test/testcases/directed/erfc.tst b/pl/math/test/testcases/directed/erfc.tst
new file mode 100644
index 0000000..9ccf196
--- /dev/null
+++ b/pl/math/test/testcases/directed/erfc.tst
@@ -0,0 +1,23 @@
+; erfc.tst - Directed test cases for erfc
+;
+; Copyright (c) 2022, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=erfc op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
+func=erfc op1=fff80000.00000001 result=7ff80000.00000001 errno=0
+func=erfc op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=erfc op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=erfc op1=7ff00000.00000000 result=00000000.00000000 errno=0
+func=erfc op1=7fefffff.ffffffff result=00000000.00000000 errno=ERANGE status=ux
+; We deliberately turned off errno setting in erfc, as the standard simply
+; states that errno `may` be set to ERANGE in case of underflow.
+; As a result the following condition on errno cannot be satisfied.
+;
+; func=erfc op1=403b44af.48b01531 result=00000000.00000000 errno=ERANGE status=ux
+;
+func=erfc op1=c03b44af.48b01531 result=40000000.00000000 errno=0
+func=erfc op1=403bffff.ffffffff result=00000000.00000000 errno=ERANGE status=ux
+func=erfc op1=c03bffff.ffffffff result=40000000.00000000 errno=0
+func=erfc op1=fff00000.00000000 result=40000000.00000000 errno=0
+func=erfc op1=00000000.00000000 result=3ff00000.00000000 errno=0
+func=erfc op1=80000000.00000000 result=3ff00000.00000000 errno=0
diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h
index dd1837e..299c063 100644
--- a/pl/math/test/ulp_funcs.h
+++ b/pl/math/test/ulp_funcs.h
@@ -6,6 +6,7 @@
  */
 F1 (erf)
 F1 (log10)
+D1 (erfc)
 D1 (log10)
 #if WANT_VMATH
 F (__s_log10f, __s_log10f, log10, mpfr_log10, 1, 1, f1, 0)
diff --git a/pl/math/tools/erfc.sollya b/pl/math/tools/erfc.sollya
new file mode 100644
index 0000000..55c1495
--- /dev/null
+++ b/pl/math/tools/erfc.sollya
@@ -0,0 +1,23 @@
+// polynomial for approximating erfc(x)*exp(x*x)
+//
+// Copyright (c) 2022, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+deg = 12; // poly degree
+
+// interval bounds
+a = 0x1.60dfc14636e2ap0;
+b = 0x1.d413cccfe779ap0;
+
+f = proc(y) {
+  t = y + a;
+  return erfc(t) * exp(t*t);
+};
+
+poly = remez(f(x), deg, [0;b-a], 1, 1e-16);
+
+display = hexadecimal;
+print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30));
+print("in [",a,b,"]");
+print("coeffs:");
+for i from 0 to deg do round(coeff(poly,i), 52, RN);
--
cgit v1.2.3


From 5b61e12675c1e0f6b3b5af9be351caeb8d6116fd Mon Sep 17 00:00:00 2001
From: Joe Ramsay
Date: Thu, 21 Apr 2022 13:48:21 +0100
Subject: pl/math: Add scalar erfcf

Scalar erfcf is implemented as a piecewise polynomial over 4 intervals
within the non-boring range. The maximum measured error is 2.0 ulp,
obtained for x = 2.0412941.
---
 pl/math/erfcf_2u.c                        | 160 ++++++++++++++++++++++++++++++
 pl/math/erfcf_data.c                      |  57 +++++++++++
 pl/math/include/mathlib.h                 |   1 +
 pl/math/math_config.h                     |   7 ++
 pl/math/test/mathbench_funcs.h            |   1 +
 pl/math/test/runulp.sh                    |   8 ++
 pl/math/test/testcases/directed/erfcf.tst |  14 +++
 pl/math/test/ulp_funcs.h                  |   1 +
 pl/math/tools/erfcf.sollya                |  31 ++++++
 9 files changed, 280 insertions(+)
 create mode 100644 pl/math/erfcf_2u.c
 create mode 100644 pl/math/erfcf_data.c
 create mode 100644 pl/math/test/testcases/directed/erfcf.tst
 create mode 100644 pl/math/tools/erfcf.sollya

diff --git a/pl/math/erfcf_2u.c b/pl/math/erfcf_2u.c
new file mode 100644
index 0000000..6222847
--- /dev/null
+++ b/pl/math/erfcf_2u.c
@@ -0,0 +1,160 @@
+/*
+ * Single-precision erfc(x) function.
+ *
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+#define P(i) __erfcf_poly_data.poly[i]
+
+/* Accurate exponential from optimized routines. */
+double
+__exp_dd (double x, double xtail);
+
+/* Evaluate order-15 polynomials using pairwise summation and a Horner
+   scheme in double precision. */
+static inline double
+eval_poly_horner_lvl2 (double z, const double *coeff)
+{
+  double r1, r2, r3, r4, r5, r6, r7, r8;
+  double R1, R2, R3, R4;
+  double Q1, Q2;
+  double z2, z4, z8;
+  z2 = z * z;
+  r1 = fma (z, coeff[1], coeff[0]);
+  r2 = fma (z, coeff[3], coeff[2]);
+  z4 = z2 * z2;
+  z8 = z4 * z4;
+  R1 = fma (z2, r2, r1);
+  r3 = fma (z, coeff[5], coeff[4]);
+  r4 = fma (z, coeff[7], coeff[6]);
+  R2 = fma (z2, r4, r3);
+  Q1 = fma (z4, R2, R1);
+  r5 = fma (z, coeff[9], coeff[8]);
+  r6 = fma (z, coeff[11], coeff[10]);
+  R3 = fma (z2, r6, r5);
+  r7 = fma (z, coeff[13], coeff[12]);
+  r8 = fma (z, coeff[15], coeff[14]);
+  R4 = fma (z2, r8, r7);
+  Q2 = fma (z4, R4, R3);
+  return fma (z8, Q2, Q1);
+}
+
+static inline double
+eval_exp_mx2 (double x)
+{
+  return __exp_dd (-(x * x), 0.0);
+}
+
+/* Approximation of erfcf for |x| > 4.0. */
+static inline float
+approx_erfcf_hi (float x, uint32_t sign, const double *coeff)
+{
+  if (sign)
+    {
+      return 2.0f;
+    }
+
+  /* Polynomial contribution. */
+  double z = (double) fabs (x);
+  float p = (float) eval_poly_horner_lvl2 (z, coeff);
+  /* Gaussian contribution. */
+  float e_mx2 = (float) eval_exp_mx2 (z);
+
+  return p * e_mx2;
+}
+
+/* Approximation of erfcf for |x| < 4.0. */
+static inline float
+approx_erfcf_lo (float x, uint32_t sign, const double *coeff)
+{
+  /* Polynomial contribution. */
+  double z = (double) fabs (x);
+  float p = (float) eval_poly_horner_lvl2 (z, coeff);
+  /* Gaussian contribution.
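+     erfcf is computed as p(x) * exp(-x*x), where p approximates
+     erfc(x)*exp(x*x); evaluating exp(-x*x) in double precision via
+     __exp_dd keeps the product accurate.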
*/
+  float e_mx2 = (float) eval_exp_mx2 (z);
+
+  if (sign)
+    return fmaf (-p, e_mx2, 2.0f);
+  else
+    return p * e_mx2;
+}
+
+/* Top 12 bits of a float, with the sign bit masked off. */
+static inline uint32_t
+abstop12 (float x)
+{
+  return (asuint (x) >> 20) & 0x7ff;
+}
+
+/* Top 12 bits of a float (sign, exponent and leading mantissa bits). */
+static inline uint32_t
+top12 (float x)
+{
+  return asuint (x) >> 20;
+}
+
+/* Fast erfcf approximation using a polynomial approximation
+   multiplied by a gaussian.
+   Most of the computation is carried out in double precision,
+   and is very sensitive to the accuracy of the polynomial and exp
+   evaluation.
+   Worst-case error is 1.968ulps, obtained for x = 2.0412941.
+   erfcf(0x1.05492p+1) got 0x1.fe10f6p-9 want 0x1.fe10f2p-9 ulp
+   err 1.96788. */
+float
+erfcf (float x)
+{
+  /* Get top words and sign. */
+  uint32_t ix = asuint (x); /* We need to compare at most 32 bits. */
+  uint32_t sign = ix >> 31;
+  uint32_t ia12 = top12 (x) & 0x7ff;
+
+  /* Handle special cases and small values with a single comparison:
+       abstop12(x)-abstop12(small) >= abstop12(INFINITY)-abstop12(small)
+
+     Special cases
+       erfcf(nan)=nan, erfcf(+inf)=0 and erfcf(-inf)=2
+
+     Errno
+       EDOM does not have to be set in case of erfcf(nan).
+       Only ERANGE may be set in case of underflow.
+
+     Small values (|x| < 0x1p-26)
+       erfcf(x) is approximated by 1.0f - x, which is accurate to
+       0.5 ULP (top12(0x1p-26) = 0x328). */
+  if (unlikely (abstop12 (x) - 0x328 >= (abstop12 (INFINITY) & 0x7f8) - 0x328))
+    {
+      if (abstop12 (x) >= 0x7f8)
+        return (float) (sign << 1) + 1.0f / x; /* Special cases. */
+      else
+        return 1.0f - x; /* Small case. */
+    }
+
+  /* Normalized numbers divided into 4 intervals
+     with bounds: 2.0, 4.0, 8.0 and 10.0. 10 was chosen as the upper bound for
+     the interesting region as it is the smallest value, representable as a
+     12-bit integer, for which returning 0 gives <1.5 ULP. */
+  if (ia12 < 0x400)
+    {
+      return approx_erfcf_lo (x, sign, P (0));
+    }
+  if (ia12 < 0x408)
+    {
+      return approx_erfcf_lo (x, sign, P (1));
+    }
+  if (ia12 < 0x410)
+    {
+      return approx_erfcf_hi (x, sign, P (2));
+    }
+  if (ia12 < 0x412)
+    {
+      return approx_erfcf_hi (x, sign, P (3));
+    }
+  if (sign)
+    {
+      return 2.0f;
+    }
+  return __math_uflowf (0);
+}
diff --git a/pl/math/erfcf_data.c b/pl/math/erfcf_data.c
new file mode 100644
index 0000000..34fe033
--- /dev/null
+++ b/pl/math/erfcf_data.c
@@ -0,0 +1,57 @@
+/*
+ * Data used in the single-precision erfc(x) function.
+ *
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Polynomial coefficients for approximating erfc(x)*exp(x*x) in double
+   precision. Generated using the Remez algorithm on each interval separately
+   (see erfcf.sollya for more detail).
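+   The four intervals are [0x1p-26, 2], [2, 4], [4, 8] and [8, 10],
+   matching the ia12 thresholds 0x400, 0x408, 0x410 and 0x412 tested in
+   erfcf_2u.c; erfcf.sollya shows the bounds for the first interval.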
*/ +const struct erfcf_poly_data __erfcf_poly_data + = {.poly + = {{ +#if ERFCF_POLY_NCOEFFS == 16 + 0x1.ffffffffe7c59p-1, -0x1.20dd74f8cecc5p0, 0x1.fffffc67a0fbdp-1, + -0x1.81270c3ced2d6p-1, 0x1.fffc0c6606e45p-2, -0x1.340a779e8a8e3p-2, + 0x1.54c1663fc5a01p-3, -0x1.5d468c9269dafp-4, 0x1.4afe6b00df9d5p-5, + -0x1.1d22d2720cb91p-6, 0x1.afa399a5761b1p-8, -0x1.113851b5858adp-9, + 0x1.0f992e4d5c6a4p-11, -0x1.86534d558052ap-14, 0x1.63e537bfb7cd5p-17, + -0x1.32712a6275c4dp-21 +#endif + }, + + { +#if ERFCF_POLY_NCOEFFS == 16 + 0x1.fea5663f75cd1p-1, -0x1.1cb5a82adf1c4p0, 0x1.e7c8da942d86fp-1, + -0x1.547ba0456bac7p-1, 0x1.8a6fc0f4421a4p-2, -0x1.7c14f9301ee58p-3, + 0x1.2f67c8351577p-4, -0x1.8e733f6d159d9p-6, 0x1.aa6a0ec249067p-8, + -0x1.6f4ec45b11f3fp-10, 0x1.f4c00c4b33ba8p-13, -0x1.0795faf7846d2p-15, + 0x1.9cef9031810ddp-19, -0x1.c4d60c3fecdb6p-23, 0x1.360547ec2229dp-27, + -0x1.8ec1581647f9fp-33 +#endif + }, + + { +#if ERFCF_POLY_NCOEFFS == 16 + 0x1.dae421147c591p-1, -0x1.c211957a0abfcp-1, 0x1.28a8d87aa1b12p-1, + -0x1.224d2a58cbef4p-2, 0x1.b3d45dcaef898p-4, -0x1.ff99d8b33e7a9p-6, + 0x1.dac66375b99f6p-8, -0x1.5e1786f0f91ap-10, 0x1.9a2588deaec4fp-13, + -0x1.7b886b183b235p-16, 0x1.1209e7da8ff82p-19, -0x1.2e5c870c6ed8p-23, + 0x1.ec6a89422928ep-28, -0x1.16e7d837b61bcp-32, 0x1.88868a73e4b43p-38, + -0x1.027034672f11cp-44 +#endif + }, + + { +#if ERFCF_POLY_NCOEFFS == 16 + 0x1.8ae320c1bad5ap-1, -0x1.1cdd6aa6929aap-1, 0x1.0e39a7b285f58p-2, + -0x1.6fb12a95e351dp-4, 0x1.77dd0649e352cp-6, -0x1.28a9e9560c461p-8, + 0x1.6f7d7778e9433p-11, -0x1.68363698afe4ap-14, 0x1.17e94cdf35d82p-17, + -0x1.5766a817bd3ffp-21, 0x1.48d892094a2c1p-25, -0x1.e1b6511ab6d0bp-30, + 0x1.04c7b8143f6a4p-34, -0x1.898831961065bp-40, 0x1.71ae8a56142a6p-46, + -0x1.45abac612344bp-53 +#endif + }}}; diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index df1f884..1f1fc1f 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -8,6 +8,7 @@ #ifndef _MATHLIB_H #define _MATHLIB_H +float erfcf(float); float erff (float); float log10f (float); diff --git a/pl/math/math_config.h b/pl/math/math_config.h index 18d2a6e..1d5f730 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -379,4 +379,11 @@ extern const struct erfc_data double interval_bounds[ERFC_NUM_INTERVALS + 1]; double poly[ERFC_NUM_INTERVALS][ERFC_POLY_ORDER + 1]; } __erfc_data HIDDEN; + +#define ERFCF_POLY_NCOEFFS 16 +extern const struct erfcf_poly_data +{ + double poly[4][ERFCF_POLY_NCOEFFS]; +} __erfcf_poly_data HIDDEN; + #endif diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index a0b3230..1713412 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -4,6 +4,7 @@ * Copyright (c) 2022, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +F (erfcf, -4.0, 10.0) F (erff, -4.0, 4.0) F (log10f, 0.01, 11.1) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index e335c62..787be0d 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -63,6 +63,14 @@ t erfc -0x1p-26 -0x1p3 40000 t erfc 0 inf 40000 Ldir=0.5 +L=1.45 +t erfcf 0 0xffff0000 10000 +t erfcf 0x1p-127 0x1p-26 40000 +t erfcf -0x1p-127 -0x1p-26 40000 +t erfcf 0x1p-26 0x1p5 40000 +t erfcf -0x1p-26 -0x1p3 40000 +t erfcf 0 inf 40000 + done # vector functions diff --git a/pl/math/test/testcases/directed/erfcf.tst b/pl/math/test/testcases/directed/erfcf.tst new file mode 100644 index 0000000..4cea316 --- /dev/null +++ b/pl/math/test/testcases/directed/erfcf.tst @@ -0,0 +1,14 @@ +; erfcf.tst - Directed test cases for erfcf +; +; Copyright (c) 2007-2022, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=erfcf op1=7fc00001 result=7fc00001 errno=0 +func=erfcf op1=ffc00001 result=7fc00001 errno=0 +func=erfcf op1=7f800001 result=7fc00001 errno=0 status=i +func=erfcf op1=ff800001 result=7fc00001 errno=0 status=i +func=erfcf op1=7f800000 result=00000000 errno=0 +func=erfcf op1=7f7fffff result=00000000 errno=ERANGE status=ux +func=erfcf op1=ff800000 result=40000000 errno=0 +func=erfcf op1=00000000 result=3f800000 errno=0 +func=erfcf op1=80000000 result=3f800000 errno=0 diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 299c063..77d9bd8 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -4,6 +4,7 @@ * Copyright (c) 2022, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +F1 (erfc) F1 (erf) F1 (log10) D1 (erfc) diff --git a/pl/math/tools/erfcf.sollya b/pl/math/tools/erfcf.sollya new file mode 100644 index 0000000..bfb8451 --- /dev/null +++ b/pl/math/tools/erfcf.sollya @@ -0,0 +1,31 @@ +// polynomial for approximating erfc(x)*exp(x*x) +// +// Copyright (c) 2022, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 15; // poly degree + +// interval bounds +a = 0x1.0p-26; +b = 2; + +f = proc(y) { + return erfc(y) * exp(y*y); +}; + +approx = proc(poly, d) { + return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); +}; + +poly = 0; +for i from 0 to deg do { + p = roundcoefficients(approx(poly,i), [|D ...|]); + poly = poly + x^i*coeff(p,0); + print(i); +}; + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do coeff(poly,i); -- cgit v1.2.3 From 287e0c05f75986b4c6fdae171a37ca11344bb370 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Fri, 29 Apr 2022 11:46:41 +0100 Subject: pl/math: Add Vector/Neon erfc Neon erfc uses a helper, v_exp_tail, which is a vector variant of __exp_dd (as used by scalar erfc). Maximum measured error is 3.5ulp. 
--- pl/math/include/mathlib.h | 5 ++ pl/math/math_config.h | 7 ++ pl/math/s_erfc_3u7.c | 6 ++ pl/math/s_exp_tail.c | 6 ++ pl/math/test/mathbench_funcs.h | 5 ++ pl/math/test/runulp.sh | 14 ++++ pl/math/test/ulp_funcs.h | 4 + pl/math/test/ulp_wrappers.h | 3 + pl/math/tools/v_erfc.sollya | 46 +++++++++++ pl/math/v_erfc_3u7.c | 182 +++++++++++++++++++++++++++++++++++++++++ pl/math/v_erfc_data.c | 96 ++++++++++++++++++++++ pl/math/v_exp_tail.c | 75 +++++++++++++++++ pl/math/v_exp_tail.h | 21 +++++ pl/math/v_exp_tail_data.c | 97 ++++++++++++++++++++++ pl/math/v_math.h | 13 +++ pl/math/vn_erfc_3u7.c | 12 +++ pl/math/vn_exp_tail.c | 11 +++ 17 files changed, 603 insertions(+) create mode 100644 pl/math/s_erfc_3u7.c create mode 100644 pl/math/s_exp_tail.c create mode 100644 pl/math/tools/v_erfc.sollya create mode 100644 pl/math/v_erfc_3u7.c create mode 100644 pl/math/v_erfc_data.c create mode 100644 pl/math/v_exp_tail.c create mode 100644 pl/math/v_exp_tail.h create mode 100644 pl/math/v_exp_tail_data.c create mode 100644 pl/math/vn_erfc_3u7.c create mode 100644 pl/math/vn_exp_tail.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 1f1fc1f..e06e449 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -15,6 +15,8 @@ float log10f (float); double log10 (double); float __s_log10f (float); + +double __s_erfc (double); double __s_log10 (double); #if __aarch64__ @@ -29,6 +31,7 @@ typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; #endif /* Vector functions following the base PCS. */ +__f64x2_t __v_erfc (__f64x2_t); __f32x4_t __v_log10f (__f32x4_t); __f64x2_t __v_log10 (__f64x2_t); @@ -36,10 +39,12 @@ __f64x2_t __v_log10 (__f64x2_t); #define __vpcs __attribute__((__aarch64_vector_pcs__)) /* Vector functions following the vector PCS. */ +__vpcs __f64x2_t __vn_erfc (__f64x2_t); __vpcs __f32x4_t __vn_log10f (__f32x4_t); __vpcs __f64x2_t __vn_log10 (__f64x2_t); /* Vector functions following the vector PCS using ABI names. */ +__vpcs __f64x2_t _ZGVnN2v_erfc (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t); diff --git a/pl/math/math_config.h b/pl/math/math_config.h index 1d5f730..710990c 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -379,6 +379,11 @@ extern const struct erfc_data double interval_bounds[ERFC_NUM_INTERVALS + 1]; double poly[ERFC_NUM_INTERVALS][ERFC_POLY_ORDER + 1]; } __erfc_data HIDDEN; +extern const struct v_erfc_data +{ + double interval_bounds[ERFC_NUM_INTERVALS + 1]; + double poly[ERFC_NUM_INTERVALS + 1][ERFC_POLY_ORDER + 1]; +} __v_erfc_data HIDDEN; #define ERFCF_POLY_NCOEFFS 16 extern const struct erfcf_poly_data @@ -386,4 +391,6 @@ extern const struct erfcf_poly_data double poly[4][ERFCF_POLY_NCOEFFS]; } __erfcf_poly_data HIDDEN; +#define V_EXP_TAIL_TABLE_BITS 8 +extern const uint64_t __v_exp_tail_data[1 << V_EXP_TAIL_TABLE_BITS] HIDDEN; #endif diff --git a/pl/math/s_erfc_3u7.c b/pl/math/s_erfc_3u7.c new file mode 100644 index 0000000..880d7a7 --- /dev/null +++ b/pl/math/s_erfc_3u7.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_erfc_3u7.c" diff --git a/pl/math/s_exp_tail.c b/pl/math/s_exp_tail.c new file mode 100644 index 0000000..4db47bb --- /dev/null +++ b/pl/math/s_exp_tail.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_exp_tail.c" diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 1713412..99b4856 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -12,12 +12,17 @@ D (erfc, -6.0, 28.0) D (log10, 0.01, 11.1) #if WANT_VMATH +D (__s_erfc, -6.0, 28.0) F (__s_log10f, 0.01, 11.1) D (__s_log10, 0.01, 11.1) #if __aarch64__ +VD (__v_erfc, -6.0, 28.0) VD (__v_log10, 0.01, 11.1) VF (__v_log10f, 0.01, 11.1) #ifdef __vpcs +VND(__vn_erfc, -6.0, 28.0) +VND(_ZGVnN2v_erfc, -6.0, 28.0) + VNF (__vn_log10f, 0.01, 11.1) VNF (_ZGVnN4v_log10f, 0.01, 11.1) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 787be0d..07ba642 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -84,6 +84,15 @@ check __v_log10f 1 && runv=1 runvn= check __vn_log10f 1 && runvn=1 +range_erfc=' + 0 0xffff0000 10000 + 0x1p-1022 0x1p-26 40000 + -0x1p-1022 -0x1p-26 40000 + 0x1p-26 0x1p5 40000 + -0x1p-26 -0x1p3 40000 + 0 inf 40000 +' + range_log10=' 0 0xffff000000000000 10000 0x1p-4 0x1p4 400000 @@ -95,6 +104,7 @@ range_log10f=' 0x1p-4 0x1p4 500000 ' # error limits +L_erfc=3.7 L_log10=1.16 L_log10f=2.81 @@ -114,6 +124,10 @@ $range EOF done << EOF # group symbol run +erfc __s_erfc $runs +erfc __v_erfc $runv +erfc __vn_erfc $runvn +erfc _ZGVnN2v_erfc $runvn log10 __s_log10 $runs log10 __v_log10 $runv log10 __vn_log10 $runvn diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 77d9bd8..18f50c3 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -10,14 +10,18 @@ F1 (log10) D1 (erfc) D1 (log10) #if WANT_VMATH +F (__s_erfc, __s_erfc, erfcl, mpfr_erfc, 1, 0, d1, 0) F (__s_log10f, __s_log10f, log10, mpfr_log10, 1, 1, f1, 0) F (__s_log10, __s_log10, log10l, mpfr_log10, 1, 0, d1, 0) #if __aarch64__ +F (__v_erfc, v_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) F (__v_log10f, v_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (__v_log10, v_log10, log10l, mpfr_log10, 1, 0, d1, 1) #ifdef __vpcs +F (__vn_erfc, vn_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) F (__vn_log10f, vn_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (__vn_log10, vn_log10, log10l, mpfr_log10, 1, 0, d1, 1) +F (_ZGVnN2v_erfc, Z_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) F (_ZGVnN4v_log10f, Z_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (_ZGVnN2v_log10, Z_log10, log10l, mpfr_log10, 1, 0, d1, 1) #endif diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 1386fbd..5357852 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -25,12 +25,15 @@ static inline v_float argf(float x) { return (v_float){x,x,x,fv[secondcall]}; } static inline v_double argd(double x) { return (v_double){x,dv[secondcall]}; } static float v_log10f(float x) { return __v_log10f(argf(x))[0]; } +static double v_erfc(double x) { return __v_erfc(argd(x))[0]; } static double v_log10(double x) { return __v_log10(argd(x))[0]; } #ifdef __vpcs static float vn_log10f(float x) { return __vn_log10f(argf(x))[0]; } +static double vn_erfc(double x) { return __vn_erfc(argd(x))[0]; } static double vn_log10(double x) { return __vn_log10(argd(x))[0]; } static float Z_log10f(float x) { return _ZGVnN4v_log10f(argf(x))[0]; } +static double Z_erfc(double x) { return _ZGVnN2v_erfc(argd(x))[0]; } static double Z_log10(double x) { return _ZGVnN2v_log10(argd(x))[0]; } #endif #endif diff --git a/pl/math/tools/v_erfc.sollya b/pl/math/tools/v_erfc.sollya new file mode 100644 index 0000000..e4e5fb1 --- /dev/null +++ 
b/pl/math/tools/v_erfc.sollya
@@ -0,0 +1,46 @@
+// polynomial for approximating erfc(x)*exp(x*x)
+//
+// Copyright (c) 2022, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+deg = 12; // poly degree
+
+itv = parse(__argv[0]);
+
+bounds = [|3.725290298461914e-9,
+           0.18920711500272103,
+           0.41421356237309515,
+           0.681792830507429,
+           1,
+           1.378414230005442,
+           1.8284271247461903,
+           2.363585661014858,
+           3,
+           3.756828460010884,
+           4.656854249492381,
+           5.727171322029716,
+           7,
+           8.513656920021768,
+           10.313708498984761,
+           12.454342644059432,
+           15,
+           18.027313840043536,
+           21.627416997969522,
+           25.908685288118864,
+           31|];
+
+a = bounds[itv];
+b = bounds[itv + 1];
+
+f = proc(y) {
+  t = y + a;
+  return erfc(t) * exp(t*t);
+};
+
+poly = fpminimax(f(x), deg, [|double ...|], [0;b-a]);
+
+display = hexadecimal;
+print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30));
+print("in [",a,b,"]");
+print("coeffs:");
+for i from 0 to deg do coeff(poly, i);
diff --git a/pl/math/v_erfc_3u7.c b/pl/math/v_erfc_3u7.c
new file mode 100644
index 0000000..d3e80ef
--- /dev/null
+++ b/pl/math/v_erfc_3u7.c
@@ -0,0 +1,182 @@
+/*
+ * Double-precision vector erfc(x) function.
+ *
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "v_math.h"
+#if V_SUPPORTED
+
+/* Accurate exponential (vector variant of exp_dd). */
+v_f64_t V_NAME (exp_tail) (v_f64_t, v_f64_t);
+
+#define One v_f64 (1.0)
+#define AbsMask v_u64 (0x7fffffffffffffff)
+#define Scale v_f64 (0x1.0000002p27)
+
+/* Coeffs for polynomial approximation on [0x1.0p-28., 31.]. */
+#define PX __v_erfc_data.poly
+#define xint __v_erfc_data.interval_bounds
+
+/* Special cases (fall back to scalar calls). */
+VPCS_ATTR
+__attribute__ ((noinline)) static v_f64_t
+specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
+{
+  return v_call_f64 (erfc, x, y, cmp);
+}
+
+/* A structure to perform look-up in coeffs and other parameter
+   tables. */
+struct entry
+{
+  v_f64_t P[ERFC_POLY_ORDER + 1];
+  v_f64_t xi;
+};
+
+static inline struct entry
+lookup (v_u64_t i)
+{
+  struct entry e;
+#ifdef SCALAR
+  for (int j = 0; j <= ERFC_POLY_ORDER; ++j)
+    e.P[j] = PX[i][j];
+  e.xi = xint[i];
+#else
+  for (int j = 0; j <= ERFC_POLY_ORDER; ++j)
+    {
+      e.P[j][0] = PX[i[0]][j];
+      e.P[j][1] = PX[i[1]][j];
+    }
+  e.xi[0] = xint[i[0]];
+  e.xi[1] = xint[i[1]];
+#endif
+  return e;
+}
+
+/* Evaluate order-12 polynomials using Horner scheme. */
+static inline v_f64_t
+v_eval_poly (v_f64_t z, struct entry e)
+{
+  v_f64_t r = e.P[12];
+  r = v_fma_f64 (z, r, e.P[11]);
+  r = v_fma_f64 (z, r, e.P[10]);
+  r = v_fma_f64 (z, r, e.P[9]);
+  r = v_fma_f64 (z, r, e.P[8]);
+  r = v_fma_f64 (z, r, e.P[7]);
+  r = v_fma_f64 (z, r, e.P[6]);
+  r = v_fma_f64 (z, r, e.P[5]);
+  r = v_fma_f64 (z, r, e.P[4]);
+  r = v_fma_f64 (z, r, e.P[3]);
+  r = v_fma_f64 (z, r, e.P[2]);
+  r = v_fma_f64 (z, r, e.P[1]);
+  r = v_fma_f64 (z, r, e.P[0]);
+
+  return r;
+}
+
+/* Accurate evaluation of exp(x^2) using compensated product
+   (x^2 ~ x*x + e2) and custom exp(y+d) routine for small
+   corrections d<<y. */
+static inline v_f64_t
+v_eval_gauss (v_f64_t a)
+{
+  v_f64_t x2 = a * a;
+  /* e2 is the exact rounding error of the product a * a. */
+  v_f64_t e2 = v_fma_f64 (a, a, -x2);
+  return V_NAME (exp_tail) (-x2, -e2);
+}
+
+/* Optimized double precision vector complementary error function erfc.
+   Maximum measured error is 3.5 ULP. */
+VPCS_ATTR
+v_f64_t V_NAME (erfc) (v_f64_t x)
+{
+  v_f64_t z, p, y;
+  v_u64_t ix, atop, sign, i, cmp;
+
+  ix = v_as_u64_f64 (x);
+  /* Compute fac as 2.0 if x < 0, 0.0 otherwise. */
+  v_f64_t fac = v_as_f64_u64 ((ix >> 63) << 62);
+  /* Use 12-bit for small, nan and inf case detection. */
+  atop = (ix >> 52) & 0x7ff;
+  cmp = v_cond_u64 (atop - v_u64 (0x3cd) >= v_u64 (0x7ff - 0x3cd));
+
+  struct entry dat;
+
+  /* All entries of the vector are out of bounds, take a short path.
+     Use smallest possible number above 28 representable in 12 bits.
*/ + v_u64_t out_of_bounds = v_cond_u64 (atop >= v_u64 (0x404)); + + /* Use sign to produce either 0 if x > 0, 2 otherwise. */ + if (v_all_u64 (out_of_bounds) && likely (v_any_u64 (~cmp))) + return fac; + + /* erfc(|x|) = P(|x|-x_i)*exp(-x^2). */ + + v_f64_t a = v_abs_f64 (x); + + /* Interval bounds are a logarithmic scale, i.e. interval n has + lower bound 2^(n/4) - 1. Use the exponent of (|x|+1)^4 to obtain + the interval index. */ + v_f64_t xp1 = a + v_f64 (1.0); + xp1 = xp1 * xp1; + xp1 = xp1 * xp1; + v_u64_t ixp1 = v_as_u64_f64 (xp1); + i = (ixp1 >> 52) - v_u64 (1023); + + /* Index cannot exceed number of polynomials. */ +#ifdef SCALAR + i = i <= (ERFC_NUM_INTERVALS) ? i : ERFC_NUM_INTERVALS; +#else + i = (v_u64_t){i[0] <= ERFC_NUM_INTERVALS ? i[0] : ERFC_NUM_INTERVALS, + i[1] <= ERFC_NUM_INTERVALS ? i[1] : ERFC_NUM_INTERVALS}; +#endif + /* Get coeffs of i-th polynomial. */ + dat = lookup (i); + + /* Evaluate Polynomial: P(|x|-x_i). */ + z = a - dat.xi; + p = v_eval_poly (z, dat); + + /* Evaluate Gaussian: exp(-x^2). */ + v_f64_t e = v_eval_gauss (a); + + /* Copy sign. */ + sign = v_as_u64_f64 (x) & ~AbsMask; + p = v_as_f64_u64 (v_as_u64_f64 (p) ^ sign); + + /* Assemble result as 2.0 - p * e if x < 0, p * e otherwise. */ + y = v_fma_f64 (p, e, fac); + + /* No need to fix value of y if x is out of bound, as + P[ERFC_NUM_INTERVALS]=0. */ + if (unlikely (v_any_u64 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS +#endif diff --git a/pl/math/v_erfc_data.c b/pl/math/v_erfc_data.c new file mode 100644 index 0000000..c53a669 --- /dev/null +++ b/pl/math/v_erfc_data.c @@ -0,0 +1,96 @@ +/* + * Polynomial coefficients for double-precision erfc(x) vector function. + * + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Coefficients for 20 order-12 polynomials used in v_erfc. The intervals have + the same bounds as the scalar algorithm, with the exception of the lower + bound of the first interval which is larger. This is because the vector + variants fall back to the scalar for tiny arguments, meaning that we can use + a slightly different approach which is more precise for larger inputs but + unacceptably imprecise for tiny inputs. */ + +const struct v_erfc_data __v_erfc_data = { + +/* Bounds for 20 intervals spanning [0x1.0p-28., 31.]. Interval bounds are a + logarithmic scale, i.e. interval n has lower bound 2^(n/4) - 1, with the + exception of the first interval. */ +.interval_bounds = { + 0x1p-28, /* If xmin=2^-28, 0 otherwise. */ + 0x1.837f0518db8a9p-3, /* 0.189. */ + 0x1.a827999fcef32p-2, /* 0.414. */ + 0x1.5d13f32b5a75bp-1, /* 0.682. */ + 0x1.0p0, /* 1.000. */ + 0x1.60dfc14636e2ap0, /* 1.378. */ + 0x1.d413cccfe779ap0, /* 1.828. */ + 0x1.2e89f995ad3adp1, /* 2.364. */ + 0x1.8p1, /* 3.000. */ + 0x1.e0dfc14636e2ap1, /* 3.757. */ + 0x1.2a09e667f3bcdp2, /* 4.657. */ + 0x1.6e89f995ad3adp2, /* 5.727. */ + 0x1.cp2, /* 7.000. */ + 0x1.106fe0a31b715p3, /* 8.514. */ + 0x1.4a09e667f3bcdp3, /* 10.31. */ + 0x1.8e89f995ad3adp3, /* 12.45. */ + 0x1.ep3, /* 15.00. */ + 0x1.206fe0a31b715p4, /* 18.03. */ + 0x1.5a09e667f3bcdp4, /* 21.63. */ + 0x1.9e89f995ad3adp4, /* 25.91. */ + 0x1.fp4 /* 31.00. */ +}, + +/* Generated using fpminimax algorithm on each interval separately. The + polynomial approximates erfc(x + a) * exp((x + a) ^ 2) in the interval + [0;b-a], where [a;b] is the interval in which the input lies. 
Note this is + slightly different from the scalar polynomial, which approximates + erfc(x + a) * exp(x ^ 2). See v_erfc.sollya for more details. */ +.poly = { +/* 3.725290298461914e-9 < x < 0.18920711500272103. */ +{0x1.ffffffdbe4516p-1, -0x1.20dd74e429b54p0, 0x1.ffffffb7c6a67p-1, -0x1.8127466fa2ec9p-1, 0x1.ffffff6eeff5ap-2, -0x1.341f668c90dccp-2, 0x1.5554aca74e5d6p-3, -0x1.6014d9d3fed0dp-4, 0x1.546b5f2c85127p-5, -0x1.2f7ec79acc129p-6, 0x1.a27e53703b7abp-8, 0x1.7b18bce311fa3p-12, -0x1.1897cda04df3ap-9}, +/* 0.18920711500272103 < x < 0.41421356237309515. */ +{0x1.a2b43de077724p-1, -0x1.a3495bb58664cp-1, 0x1.535f3ff4547e6p-1, -0x1.d96eea2951a7cp-2, 0x1.269566a956371p-2, -0x1.4e281de026b47p-3, 0x1.5ea071b652a2fp-4, -0x1.57f46cfca7024p-5, 0x1.3db28243f06abp-6, -0x1.138745eef6f26p-7, 0x1.a9cd70bad344p-9, -0x1.c6e4fda8920c4p-11, 0x1.624709ca2bc71p-16}, +/* 0.41421356237309515 < x < 0.681792830507429. */ +{0x1.532e75764e513p-1, -0x1.28be34f327f9dp-1, 0x1.b088738cca84cp-2, -0x1.14377551bd5c8p-2, 0x1.3e1ecedd64246p-3, -0x1.5087f3110eb57p-4, 0x1.4b3c61efcb562p-5, -0x1.324cc70a4f459p-6, 0x1.0cd19a96af21bp-7, -0x1.cc2ccc725d07p-9, 0x1.a3ba67a7d02b4p-10, -0x1.b1943295882abp-11, 0x1.53a1c5fdf8e67p-12}, +/* 0.681792830507429 < x < 1. */ +{0x1.10f974588f63dp-1, -0x1.9b032139e3367p-2, 0x1.09b942b8a951dp-2, -0x1.327553909cb88p-3, 0x1.42819b6c9a14p-4, -0x1.3a6d6f1924825p-5, 0x1.1f1864dd6f28fp-6, -0x1.ef12c5e9f3232p-8, 0x1.962ac63d55aa1p-9, -0x1.4146d9206419cp-10, 0x1.f823f62268229p-12, -0x1.837ab488d5ed8p-13, 0x1.aa021ae16edfep-15}, +/* 1 < x < 1.378414230005442. */ +{0x1.b5d8780f956b2p-2, -0x1.17c4e3f17c034p-2, 0x1.3c27283c31939p-3, -0x1.44837f88a0ecdp-4, 0x1.33cad0dc779c8p-5, -0x1.10fcef8294e8dp-6, 0x1.c8cb3e5a6a5a6p-8, -0x1.6aedbd3a05f1cp-9, 0x1.1325c0bf9a0cap-10, -0x1.8e28d61a0f646p-12, 0x1.0d554e2ab3652p-13, -0x1.35b5f9ac296ebp-15, 0x1.b8faf07e2527dp-18}, +/* 1.378414230005442 < x < 1.8284271247461903. */ +{0x1.5ee444130b7dbp-2, -0x1.78396ab2083e8p-3, 0x1.6e617ec5bc039p-4, -0x1.49e60f6238765p-5, 0x1.16064fb4428c9p-6, -0x1.ba80a8575a434p-8, 0x1.4ec30f2efeb8p-9, -0x1.e40456c735f09p-11, 0x1.4f7ee6b7885b7p-12, -0x1.bc9997995fdecp-14, 0x1.1169f7327ff2p-15, -0x1.174826d000852p-17, 0x1.5506a7433e925p-20}, +/* 1.8284271247461903 < x < 2.363585661014858. */ +{0x1.19a22c064d4eap-2, -0x1.f645498cae1b3p-4, 0x1.a0565950e1256p-5, -0x1.446605c186f6dp-6, 0x1.df1231b47ff04p-8, -0x1.515164d13dfafp-9, 0x1.c72bde869ad61p-11, -0x1.2768fbf9b1d6ep-12, 0x1.71bd3a1b851e9p-14, -0x1.bca5b5942017cp-16, 0x1.f2d480b3a2e63p-18, -0x1.d339662d53467p-20, 0x1.06d67ebf792bp-22}, +/* 2.363585661014858 < x < 3. */ +{0x1.c57f0542a7637p-3, -0x1.4e5535c17af25p-4, 0x1.d31272523acfep-6, -0x1.3727cbbfd1bfcp-7, 0x1.8d6730b8c5a4cp-9, -0x1.e88548286036fp-11, 0x1.21f6e89456853p-12, -0x1.4d4b7787bd3c2p-14, 0x1.735dc84e7ff16p-16, -0x1.8eb02db832048p-18, 0x1.8dfb8add3b86ep-20, -0x1.47a340d76c72bp-22, 0x1.3e5925ffebe6bp-25}, +/* 3 < x < 3.756828460010884. */ +{0x1.6e9827d229d2dp-3, -0x1.bd6ae4d14b1adp-5, 0x1.043fe1a98c3b9p-6, -0x1.259061ba34453p-8, 0x1.409cc2cc96bedp-10, -0x1.53dec3fd6c443p-12, 0x1.5e72f7baf3554p-14, -0x1.601aa94bf21eep-16, 0x1.58e730ceaa91dp-18, -0x1.4762cbd256163p-20, 0x1.22b8bea5d4a5ap-22, -0x1.ac197af37fcadp-25, 0x1.74cdf138a0b73p-28}, +/* 3.756828460010884 < x < 4.656854249492381. 
*/ +{0x1.29a8a4e95063ep-3, -0x1.29a8a316d331dp-5, 0x1.21876b3fe50cfp-7, -0x1.1276f2d8eefd9p-9, 0x1.fbff521741e5cp-12, -0x1.cb9ce996b9601p-14, 0x1.971075371ef81p-16, -0x1.61458571e4738p-18, 0x1.2c51c21b7ab9ep-20, -0x1.f01e444a666c3p-23, 0x1.7e8f2979b67f1p-25, -0x1.e505367843027p-28, 0x1.67809d68de49cp-31}, +/* 4.656854249492381 < x < 5.727171322029716. */ +{0x1.e583024e2bc7fp-4, -0x1.8fb458acb5acep-6, 0x1.42b9dffac075cp-8, -0x1.ff9fe9a48522p-11, 0x1.8e7e866f4f073p-13, -0x1.313aeee1c2d45p-15, 0x1.cc299efd7374cp-18, -0x1.5587e53442d66p-20, 0x1.f2aca160f159bp-23, -0x1.62ae4834dcda7p-25, 0x1.d6b070147cb37p-28, -0x1.fee399e7be1bfp-31, 0x1.41d6f9fbc9515p-34}, +/* 5.727171322029716 < x < 7. */ +{0x1.8d9cbafa30408p-4, -0x1.0dd14614ed1cfp-6, 0x1.6943976ea6bf4p-9, -0x1.dd6f05f3b914cp-12, 0x1.37891317e7bcfp-14, -0x1.91a81ce9014a2p-17, 0x1.ffcac303208b9p-20, -0x1.424f1af78feb3p-22, 0x1.90b8edbca12a5p-25, -0x1.e69bea0338c7fp-28, 0x1.13b974a710373p-30, -0x1.fdc9aa9359794p-34, 0x1.105fc772b5a66p-37}, +/* 7 < x < 8.513656920021768. */ +{0x1.46dc6bf900f68p-4, -0x1.6e4b45246f95p-7, 0x1.96a3de47d4bd7p-10, -0x1.bf5070eccb409p-13, 0x1.e7af6e83607a2p-16, -0x1.078bf5306f9eep-18, 0x1.1a6e8327243adp-21, -0x1.2c1e7368c7809p-24, 0x1.3bc83557dac43p-27, -0x1.45a6405b2e649p-30, 0x1.3aac4888689ebp-33, -0x1.f1fa23448a168p-37, 0x1.c868668755778p-41}, +/* 8.513656920021768 < x < 10.313708498984761. */ +{0x1.0d9a17e032288p-4, -0x1.f3e942ff4df7p-8, 0x1.cc77f09dabc5cp-11, -0x1.a56e8bfd32da8p-14, 0x1.7f49e31164409p-17, -0x1.5a73f46a6afc9p-20, 0x1.374240ce973d2p-23, -0x1.15e8d473b728cp-26, 0x1.ec3ec79699378p-30, -0x1.ab3b8aba63362p-33, 0x1.5a1381cfe2866p-36, -0x1.c78e252ce77ccp-40, 0x1.589857ceaaaeep-44}, +/* 10.313708498984761 < x < 12.454342644059432. */ +{0x1.be0c73cc19eddp-5, -0x1.56ce6f6c0cbb1p-8, 0x1.0645980ecbbfcp-11, -0x1.8f86f887f6598p-15, 0x1.2ef80cd9e00b1p-18, -0x1.c97ffd66720e4p-22, 0x1.57f0eeecf030ap-25, -0x1.016df7d5e28d9p-28, 0x1.7f0d022922f1dp-32, -0x1.1849731f004aep-35, 0x1.8149e7ca0fb3cp-39, -0x1.b1fe4abe62d81p-43, 0x1.1ae4d60247651p-47}, +/* 12.454342644059432 < x < 15. */ +{0x1.71eafbd9f5877p-5, -0x1.d83714d90461fp-9, 0x1.2c74dbacd45fdp-12, -0x1.7d27f3cfe160ep-16, 0x1.e20b13b8d32e3p-20, -0x1.2fe33cb2bce33p-23, 0x1.7dfd564d69a07p-27, -0x1.dea62ef0f7d7ep-31, 0x1.2a7b946273ea5p-34, -0x1.6eb665bad5b72p-38, 0x1.a8191750e8bf9p-42, -0x1.92d8a86cbd0fcp-46, 0x1.bba272feef841p-51}, +/* 15 < x < 18.027313840043536. */ +{0x1.33714a024097ep-5, -0x1.467f441a50bc3p-9, 0x1.59fa2994c6f7ap-13, -0x1.6dd369d642b7dp-17, 0x1.81fb2aaf2e37p-21, -0x1.966040990b623p-25, 0x1.aaee55e15a079p-29, -0x1.bf756fc8ef04p-33, 0x1.d2daf554e0157p-37, -0x1.dec63e10d317p-41, 0x1.cae915bab7704p-45, -0x1.6537fbb62a8edp-49, 0x1.3f14bd5531da8p-54}, +/* 18.027313840043536 < x < 21.627416997969522. */ +{0x1.fff97acd75487p-6, -0x1.c502e8e46eb81p-10, 0x1.903b065062756p-14, -0x1.6110aa5e81885p-18, 0x1.36fd4c13c4f1fp-22, -0x1.11848650be987p-26, 0x1.e06596bf6a27p-31, -0x1.a527876771d55p-35, 0x1.6fe1b92a40eb8p-39, -0x1.3c6eb50b23bc6p-43, 0x1.fead2230125dp-48, -0x1.5073427c5207dp-52, 0x1.ff420973fa51dp-58}, +/* 21.627416997969522 < x < 25.908685288118864. */ +{0x1.aaf347fc8c45bp-6, -0x1.3b2fd709cf8e5p-10, 0x1.d0ddfb858b60ap-15, -0x1.5673f4a8bb08ep-19, 0x1.f80488e89ddb9p-24, -0x1.728391905fcf3p-28, 0x1.101538d7e30bap-32, -0x1.8f16f49d0fa3bp-37, 0x1.23bbaea534034p-41, -0x1.a40119533ee1p-46, 0x1.1b75770e435fdp-50, -0x1.3804bdeb33efdp-55, 0x1.8ba4e7838a4dp-61}, +/* 25.908685288118864 < x < 31. 
*/ +{0x1.64839d636f92bp-6, -0x1.b7adf753623afp-11, 0x1.0eec0b635a0c4p-15, -0x1.4da09b802ef48p-20, 0x1.9a8b149f5ddf1p-25, -0x1.f8d1f722c65bap-30, 0x1.36247d9a20e19p-34, -0x1.7cbd25180c1d3p-39, 0x1.d243c7a5c8331p-44, -0x1.19e00cc6b1e08p-48, 0x1.418cb6823f2d9p-53, -0x1.2dfdc526c43acp-58, 0x1.49885a987486fp-64}, +/* Dummy interval for x>31 */ +{0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, + 0x0p0, 0x0p0, 0x0p0} +} +}; diff --git a/pl/math/v_exp_tail.c b/pl/math/v_exp_tail.c new file mode 100644 index 0000000..cf834e6 --- /dev/null +++ b/pl/math/v_exp_tail.c @@ -0,0 +1,75 @@ +/* + * Double-precision vector e^(x+tail) function. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "v_math.h" +#if V_SUPPORTED +#include "v_exp_tail.h" + +#define C1 v_f64 (C1_scal) +#define C2 v_f64 (C2_scal) +#define C3 v_f64 (C3_scal) +#define InvLn2 v_f64 (InvLn2_scal) +#define Ln2hi v_f64 (Ln2hi_scal) +#define Ln2lo v_f64 (Ln2lo_scal) + +#define IndexMask v_u64 (IndexMask_scal) +#define Shift v_f64 (Shift_scal) +#define Thres v_f64 (Thres_scal) + +VPCS_ATTR +static v_f64_t +specialcase (v_f64_t s, v_f64_t y, v_f64_t n) +{ + v_f64_t absn = v_abs_f64 (n); + + /* 2^(n/N) may overflow, break it up into s1*s2. */ + v_u64_t b = v_cond_u64 (n <= v_f64 (0.0)) & v_u64 (0x6000000000000000); + v_f64_t s1 = v_as_f64_u64 (v_u64 (0x7000000000000000) - b); + v_f64_t s2 = v_as_f64_u64 (v_as_u64_f64 (s) - v_u64 (0x3010000000000000) + b); + v_u64_t cmp = v_cond_u64 (absn > v_f64 (1280.0 * N)); + v_f64_t r1 = s1 * s1; + v_f64_t r0 = v_fma_f64 (y, s2, s2) * s1; + return v_as_f64_u64 ((cmp & v_as_u64_f64 (r1)) | (~cmp & v_as_u64_f64 (r0))); +} + +VPCS_ATTR +v_f64_t V_NAME (exp_tail) (v_f64_t x, v_f64_t xtail) +{ + v_f64_t n, r, s, y, z; + v_u64_t cmp, u, e, i; + + cmp = v_cond_u64 (v_abs_f64 (x) > Thres); + + /* n = round(x/(ln2/N)). */ + z = v_fma_f64 (x, InvLn2, Shift); + u = v_as_u64_f64 (z); + n = z - Shift; + + /* r = x - n*ln2/N. */ + r = x; + r = v_fma_f64 (-Ln2hi, n, r); + r = v_fma_f64 (-Ln2lo, n, r); + + e = u << (52 - V_EXP_TAIL_TABLE_BITS); + i = u & IndexMask; + + /* y = tail + exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */ + y = v_fma_f64 (C3, r, C2); + y = v_fma_f64 (y, r, C1); + y = v_fma_f64 (y, r, v_f64 (1.0)); + y = v_fma_f64 (y, r, xtail); + + /* s = 2^(n/N). */ + u = v_lookup_u64 (Tab, i); + s = v_as_f64_u64 (u + e); + + if (unlikely (v_any_u64 (cmp))) + return specialcase (s, y, n); + return v_fma_f64 (y, s, s); +} +#endif diff --git a/pl/math/v_exp_tail.h b/pl/math/v_exp_tail.h new file mode 100644 index 0000000..e1417d3 --- /dev/null +++ b/pl/math/v_exp_tail.h @@ -0,0 +1,21 @@ +/* + * Constants for double-precision e^(x+tail) vector function. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define C1_scal 0x1.fffffffffffd4p-2 +#define C2_scal 0x1.5555571d6b68cp-3 +#define C3_scal 0x1.5555576a59599p-5 +#define InvLn2_scal 0x1.71547652b82fep8 /* N/ln2. */ +#define Ln2hi_scal 0x1.62e42fefa39efp-9 /* ln2/N. 
*/ +#define Ln2lo_scal 0x1.abc9e3b39803f3p-64 + +#define N (1 << V_EXP_TAIL_TABLE_BITS) +#define Tab __v_exp_tail_data +#define IndexMask_scal (N - 1) +#define Shift_scal 0x1.8p+52 +#define Thres_scal 704.0 diff --git a/pl/math/v_exp_tail_data.c b/pl/math/v_exp_tail_data.c new file mode 100644 index 0000000..97e1bc1 --- /dev/null +++ b/pl/math/v_exp_tail_data.c @@ -0,0 +1,97 @@ +/* + * Lookup table for double-precision e^(x+tail) vector function. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* 2^(j/N), j=0..N (where N = 256). */ +const uint64_t __v_exp_tail_data[] + = {0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335, + 0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc, + 0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574, + 0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836, + 0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383, + 0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85, + 0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2, + 0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e, + 0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc, + 0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e, + 0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b, + 0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f, + 0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4, + 0x3fef582f95281c6b, 0x3fef54873168b9aa, 0x3fef50e75eb44027, + 0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6, + 0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1, + 0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f, + 0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29, + 0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1, + 0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f, + 0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56, + 0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd, + 0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff, + 0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b, + 0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866, + 0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4, + 0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422, + 0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024, + 0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897, + 0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232, + 0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0, + 0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7, + 0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d, + 0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee, + 0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82, + 0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2, + 0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd, + 0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03, + 0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148, + 0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4, + 0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320, + 0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6, + 0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd, + 0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645, + 0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484, + 0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a, + 0x3fee9f9a48a58174, 
0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9, + 0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6, + 0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132, + 0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491, + 0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13, + 0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21, + 0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699, + 0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778, + 0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736, + 0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2, + 0x3feeb737b0cdc5e5, 0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f, + 0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2, + 0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090, + 0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e, + 0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33, + 0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052, + 0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf, + 0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774, + 0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666, + 0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1, + 0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47, + 0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f, + 0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09, + 0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c, + 0x3fef20ab5fffd07a, 0x3fef244778fafb22, 0x3fef27f12e57d14b, + 0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db, + 0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa, + 0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968, + 0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487, + 0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075, + 0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460, + 0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17, + 0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6, + 0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740, + 0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1, + 0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a, + 0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540, + 0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89, + 0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1, + 0x3feff9d96b2a23d9}; diff --git a/pl/math/v_math.h b/pl/math/v_math.h index 97c3731..e9e7d44 100644 --- a/pl/math/v_math.h +++ b/pl/math/v_math.h @@ -282,6 +282,12 @@ v_any_u64 (v_u64_t x) { return x != 0; } +/* true if all elements of a v_cond result is non-zero. */ +static inline int +v_all_u64 (v_u64_t x) +{ + return x; +} /* to wrap the result of relational operators. */ static inline v_u64_t v_cond_u64 (v_u64_t x) @@ -555,6 +561,13 @@ v_any_u64 (v_u64_t x) /* assume elements in x are either 0 or -1u. */ return vpaddd_u64 (x) != 0; } +/* true if all elements of a v_cond result is 1. */ +static inline int +v_all_u64 (v_u64_t x) +{ + /* assume elements in x are either 0 or -1u. */ + return vpaddd_s64 (vreinterpretq_s64_u64 (x)) == -2; +} /* to wrap the result of relational operators. */ static inline v_u64_t v_cond_u64 (v_u64_t x) diff --git a/pl/math/vn_erfc_3u7.c b/pl/math/vn_erfc_3u7.c new file mode 100644 index 0000000..324b541 --- /dev/null +++ b/pl/math/vn_erfc_3u7.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_erfc. + * + * Copyright (c) 2019-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_erfc, _ZGVnN2v_erfc);
+#include "v_erfc_3u7.c"
+#endif
diff --git a/pl/math/vn_exp_tail.c b/pl/math/vn_exp_tail.c
new file mode 100644
index 0000000..04b5aaa
--- /dev/null
+++ b/pl/math/vn_exp_tail.c
@@ -0,0 +1,11 @@
+/*
+ * AdvSIMD vector PCS variant of __v_exp_tail.
+ *
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#include "v_exp_tail.c"
+#endif
-- 
cgit v1.2.3

From a18ff357bce38643a06704062f67037ec67c8b34 Mon Sep 17 00:00:00 2001
From: Joe Ramsay
Date: Fri, 29 Apr 2022 14:32:04 +0100
Subject: pl/math: Add vector/Neon erfcf

Neon erfcf uses the same algorithm as the scalar variant, including
sharing the same coefficients. The maximum measured error is 0.75ulp,
measured at -3.93e-6.
---
 pl/math/erfcf.h                |  52 +++++++++++
 pl/math/erfcf_2u.c             |  40 +--------
 pl/math/include/mathlib.h      |   4 +
 pl/math/s_erfcf_1u.c           |   6 ++
 pl/math/test/mathbench_funcs.h |   5 ++
 pl/math/test/runulp.sh         |  14 +++
 pl/math/test/ulp_funcs.h       |   4 +
 pl/math/test/ulp_wrappers.h    |   3 +
 pl/math/v_erfcf_1u.c           | 192 +++++++++++++++++++++++++++++++++++++++++
 pl/math/vn_erfcf_1u.c          |  12 +++
 10 files changed, 293 insertions(+), 39 deletions(-)
 create mode 100644 pl/math/erfcf.h
 create mode 100644 pl/math/s_erfcf_1u.c
 create mode 100644 pl/math/v_erfcf_1u.c
 create mode 100644 pl/math/vn_erfcf_1u.c

diff --git a/pl/math/erfcf.h b/pl/math/erfcf.h
new file mode 100644
index 0000000..6adc6b4
--- /dev/null
+++ b/pl/math/erfcf.h
@@ -0,0 +1,52 @@
+/*
+ * Shared functions for scalar and vector single-precision erfc(x) functions.
+ *
+ * Copyright (c) 2021-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef PL_MATH_ERFCF_H
+#define PL_MATH_ERFCF_H
+
+#include <math.h>
+
+/* Accurate exponential from optimized-routines. */
+double
+__exp_dd (double x, double xtail);
+
+/* Evaluate order-12 polynomials using pairwise summation and Horner scheme in
+   double precision. */
+static inline double
+eval_poly_horner_lvl2 (double z, const double *coeff)
+{
+  double r1, r2, r3, r4, r5, r6, r7, r8;
+  double R1, R2, R3, R4;
+  double Q1, Q2;
+  double z2, z4, z8;
+  z2 = z * z;
+  r1 = fma (z, coeff[1], coeff[0]);
+  r2 = fma (z, coeff[3], coeff[2]);
+  z4 = z2 * z2;
+  z8 = z4 * z4;
+  R1 = fma (z2, r2, r1);
+  r3 = fma (z, coeff[5], coeff[4]);
+  r4 = fma (z, coeff[7], coeff[6]);
+  R2 = fma (z2, r4, r3);
+  Q1 = fma (z4, R2, R1);
+  r5 = fma (z, coeff[9], coeff[8]);
+  r6 = fma (z, coeff[11], coeff[10]);
+  R3 = fma (z2, r6, r5);
+  r7 = fma (z, coeff[13], coeff[12]);
+  r8 = fma (z, coeff[15], coeff[14]);
+  R4 = fma (z2, r8, r7);
+  Q2 = fma (z4, R4, R3);
+  return fma (z8, Q2, Q1);
+}
+
+static inline double
+eval_exp_mx2 (double x)
+{
+  return __exp_dd (-(x * x), 0.0);
+}
+
+#endif // PL_MATH_ERFCF_H
diff --git a/pl/math/erfcf_2u.c b/pl/math/erfcf_2u.c
index 6222847..80dba83 100644
--- a/pl/math/erfcf_2u.c
+++ b/pl/math/erfcf_2u.c
@@ -5,49 +5,11 @@
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
+#include "erfcf.h"
 #include "math_config.h"
 
 #define P(i) __erfcf_poly_data.poly[i]
 
-/* Accurate exponential from optimized routines. */
-double
-__exp_dd (double x, double xtail);
-
-/* Evaluate order-12 polynomials using pairwise summation and Horner scheme in
-   double precision.
*/ -static inline double -eval_poly_horner_lvl2 (double z, const double *coeff) -{ - double r1, r2, r3, r4, r5, r6, r7, r8; - double R1, R2, R3, R4; - double Q1, Q2; - double z2, z4, z8; - z2 = z * z; - r1 = fma (z, coeff[1], coeff[0]); - r2 = fma (z, coeff[3], coeff[2]); - z4 = z2 * z2; - z8 = z4 * z4; - R1 = fma (z2, r2, r1); - r3 = fma (z, coeff[5], coeff[4]); - r4 = fma (z, coeff[7], coeff[6]); - R2 = fma (z2, r4, r3); - Q1 = fma (z4, R2, R1); - r5 = fma (z, coeff[9], coeff[8]); - r6 = fma (z, coeff[11], coeff[10]); - R3 = fma (z2, r6, r5); - r7 = fma (z, coeff[13], coeff[12]); - r8 = fma (z, coeff[15], coeff[14]); - R4 = fma (z2, r8, r7); - Q2 = fma (z4, R4, R3); - return fma (z8, Q2, Q1); -} - -static inline double -eval_exp_mx2 (double x) -{ - return __exp_dd (-(x * x), 0.0); -} - /* Approximation of erfcf for |x| > 4.0. */ static inline float approx_erfcf_hi (float x, uint32_t sign, const double *coeff) diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index e06e449..e496cda 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -14,6 +14,7 @@ float log10f (float); double log10 (double); +float __s_erfcf (float); float __s_log10f (float); double __s_erfc (double); @@ -31,6 +32,7 @@ typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; #endif /* Vector functions following the base PCS. */ +__f32x4_t __v_erfcf (__f32x4_t); __f64x2_t __v_erfc (__f64x2_t); __f32x4_t __v_log10f (__f32x4_t); __f64x2_t __v_log10 (__f64x2_t); @@ -39,11 +41,13 @@ __f64x2_t __v_log10 (__f64x2_t); #define __vpcs __attribute__((__aarch64_vector_pcs__)) /* Vector functions following the vector PCS. */ +__vpcs __f32x4_t __vn_erfcf (__f32x4_t); __vpcs __f64x2_t __vn_erfc (__f64x2_t); __vpcs __f32x4_t __vn_log10f (__f32x4_t); __vpcs __f64x2_t __vn_log10 (__f64x2_t); /* Vector functions following the vector PCS using ABI names. */ +__vpcs __f32x4_t _ZGVnN4v_erfcf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_erfc (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t); diff --git a/pl/math/s_erfcf_1u.c b/pl/math/s_erfcf_1u.c new file mode 100644 index 0000000..615db16 --- /dev/null +++ b/pl/math/s_erfcf_1u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_erfcf_1u.c" diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 99b4856..b35b2f0 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -12,14 +12,19 @@ D (erfc, -6.0, 28.0) D (log10, 0.01, 11.1) #if WANT_VMATH +F (__s_erfcf, -6.0, 28.0) D (__s_erfc, -6.0, 28.0) F (__s_log10f, 0.01, 11.1) D (__s_log10, 0.01, 11.1) #if __aarch64__ +VF (__v_erfcf, -6.0, 28.0) VD (__v_erfc, -6.0, 28.0) VD (__v_log10, 0.01, 11.1) VF (__v_log10f, 0.01, 11.1) #ifdef __vpcs +VNF(__vn_erfcf, -6.0, 28.0) +VNF(_ZGVnN4v_erfcf, -6.0, 28.0) + VND(__vn_erfc, -6.0, 28.0) VND(_ZGVnN2v_erfc, -6.0, 28.0) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 07ba642..3787233 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -93,6 +93,15 @@ range_erfc=' 0 inf 40000 ' +range_erfcf=' + 0 0xffff0000 10000 + 0x1p-127 0x1p-26 40000 + -0x1p-127 -0x1p-26 40000 + 0x1p-26 0x1p5 40000 + -0x1p-26 -0x1p3 40000 + 0 inf 40000 +' + range_log10=' 0 0xffff000000000000 10000 0x1p-4 0x1p4 400000 @@ -105,6 +114,7 @@ range_log10f=' ' # error limits L_erfc=3.7 +L_erfcf=1.0 L_log10=1.16 L_log10f=2.81 @@ -133,6 +143,10 @@ log10 __v_log10 $runv log10 __vn_log10 $runvn log10 _ZGVnN2v_log10 $runvn +erfcf __s_erfcf $runs +erfcf __v_erfcf $runv +erfcf __vn_erfcf $runvn +erfcf _ZGVnN4v_erfcf $runvn log10f __s_log10f $runs log10f __v_log10f $runv log10f __vn_log10f $runvn diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 18f50c3..48fd1ad 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -10,17 +10,21 @@ F1 (log10) D1 (erfc) D1 (log10) #if WANT_VMATH +F (__s_erfcf, __s_erfcf, erfc, mpfr_erfc, 1, 1, f1, 0) F (__s_erfc, __s_erfc, erfcl, mpfr_erfc, 1, 0, d1, 0) F (__s_log10f, __s_log10f, log10, mpfr_log10, 1, 1, f1, 0) F (__s_log10, __s_log10, log10l, mpfr_log10, 1, 0, d1, 0) #if __aarch64__ +F (__v_erfcf, v_erfcf, erfc, mpfr_erfc, 1, 1, f1, 1) F (__v_erfc, v_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) F (__v_log10f, v_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (__v_log10, v_log10, log10l, mpfr_log10, 1, 0, d1, 1) #ifdef __vpcs +F (__vn_erfcf, vn_erfcf, erfc, mpfr_erfc, 1, 1, f1, 1) F (__vn_erfc, vn_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) F (__vn_log10f, vn_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (__vn_log10, vn_log10, log10l, mpfr_log10, 1, 0, d1, 1) +F (_ZGVnN4v_erfcf, Z_erfcf, erfc, mpfr_erfc, 1, 1, f1, 1) F (_ZGVnN2v_erfc, Z_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) F (_ZGVnN4v_log10f, Z_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (_ZGVnN2v_log10, Z_log10, log10l, mpfr_log10, 1, 0, d1, 1) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 5357852..c364525 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -24,14 +24,17 @@ static const double dv[2] = {1.0, -INFINITY}; static inline v_float argf(float x) { return (v_float){x,x,x,fv[secondcall]}; } static inline v_double argd(double x) { return (v_double){x,dv[secondcall]}; } +static float v_erfcf(float x) { return __v_erfcf(argf(x))[0]; } static float v_log10f(float x) { return __v_log10f(argf(x))[0]; } static double v_erfc(double x) { return __v_erfc(argd(x))[0]; } static double v_log10(double x) { return __v_log10(argd(x))[0]; } #ifdef __vpcs +static float vn_erfcf(float x) { return __vn_erfcf(argf(x))[0]; } static float vn_log10f(float x) { return __vn_log10f(argf(x))[0]; } static double vn_erfc(double x) { return __vn_erfc(argd(x))[0]; 
}
 static double vn_log10(double x) { return __vn_log10(argd(x))[0]; }
+static float Z_erfcf(float x) { return _ZGVnN4v_erfcf(argf(x))[0]; }
 static float Z_log10f(float x) { return _ZGVnN4v_log10f(argf(x))[0]; }
 static double Z_erfc(double x) { return _ZGVnN2v_erfc(argd(x))[0]; }
 static double Z_log10(double x) { return _ZGVnN2v_log10(argd(x))[0]; }
 #endif
 #endif
diff --git a/pl/math/v_erfcf_1u.c b/pl/math/v_erfcf_1u.c
new file mode 100644
index 0000000..057ef5c
--- /dev/null
+++ b/pl/math/v_erfcf_1u.c
@@ -0,0 +1,192 @@
+/*
+ * Single-precision vector erfc(x) function.
+ *
+ * Copyright (c) 2021-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "erfcf.h"
+
+#if V_SUPPORTED
+
+#define P(ia12) __erfcf_poly_data.poly[interval_index (ia12)]
+
+VPCS_ATTR v_f64_t V_NAME (exp_tail) (v_f64_t, v_f64_t);
+
+static inline uint32_t
+interval_index (uint32_t ia12)
+{
+  // clang-format off
+  return (ia12 < 0x400 ? 0 :
+          (ia12 < 0x408 ? 1 :
+           (ia12 < 0x410 ? 2 :
+                           3)));
+  // clang-format on
+}
+
+/* The C macro wraps the coeffs argument in order to make the
+   polynomial evaluation more readable. In the scalarised variant the
+   second pointer is ignored. */
+#ifdef SCALAR
+#define C(i) coeff1[i]
+#else
+#define C(i) ((v_f64_t){coeff1[i], coeff2[i]})
+#endif
+
+static inline v_f64_t
+v_eval_poly_estrin (v_f64_t z, const double *coeff1, const double *coeff2)
+{
+  v_f64_t z2 = z * z;
+  v_f64_t z4 = z2 * z2;
+  v_f64_t z8 = z4 * z4;
+
+  v_f64_t c0_zc1 = v_fma_f64 (z, C (1), C (0));
+  v_f64_t c2_zc3 = v_fma_f64 (z, C (3), C (2));
+  v_f64_t c4_zc5 = v_fma_f64 (z, C (5), C (4));
+  v_f64_t c6_zc7 = v_fma_f64 (z, C (7), C (6));
+  v_f64_t c8_zc9 = v_fma_f64 (z, C (9), C (8));
+  v_f64_t c10_zc11 = v_fma_f64 (z, C (11), C (10));
+  v_f64_t c12_zc13 = v_fma_f64 (z, C (13), C (12));
+  v_f64_t c14_zc15 = v_fma_f64 (z, C (15), C (14));
+
+  v_f64_t c0_z2c3 = v_fma_f64 (z2, c2_zc3, c0_zc1);
+  v_f64_t c4_z2c7 = v_fma_f64 (z2, c6_zc7, c4_zc5);
+  v_f64_t c8_z2c11 = v_fma_f64 (z2, c10_zc11, c8_zc9);
+  v_f64_t c12_z2c15 = v_fma_f64 (z2, c14_zc15, c12_zc13);
+
+  v_f64_t c0_z4c7 = v_fma_f64 (z4, c4_z2c7, c0_z2c3);
+  v_f64_t c8_z4c15 = v_fma_f64 (z4, c12_z2c15, c8_z2c11);
+
+  return v_fma_f64 (z8, c8_z4c15, c0_z4c7);
+}
+
+#undef C
+
+static inline v_f64_t
+v_approx_erfcf_poly_gauss (v_f64_t x, const double *coeff1,
+                           const double *coeff2)
+{
+  v_f64_t poly = v_eval_poly_estrin (x, coeff1, coeff2);
+  v_f64_t gauss = V_NAME (exp_tail) (-(x * x), v_f64 (0.0));
+  return poly * gauss;
+}
+
+static inline float
+approx_poly_gauss (float abs_x, const double *coeff)
+{
+  return (float) (eval_poly_horner_lvl2 (abs_x, coeff) * eval_exp_mx2 (abs_x));
+}
+
+static v_f32_t
+v_approx_erfcf (v_f32_t abs_x, v_u32_t sign, v_u32_t ia12, v_u32_t lanes)
+{
+#ifdef SCALAR
+  float y = approx_poly_gauss (abs_x, P (ia12));
+  return sign ? 2 - y : y;
+#else
+  float32x2_t lo32 = {0, 0};
+  float32x2_t hi32 = {0, 0};
+  /* The polynomial and Gaussian components must be calculated in
+     double precision in order to meet the required ULP error. This
+     means we have to promote low and high halves of the
+     single-precision input vector to two separate double-precision
+     input vectors. This incurs some overhead, and there is also
+     overhead to loading the polynomial coefficients as this cannot be
+     done in a vector fashion. This would be wasted effort for
+     elements which lie in the 'boring' zone, as they will be
+     overwritten later. Hence we use the lanes parameter to only do
Hence we use the lanes parameter to only do + the promotion on a pair of lanes if both of those lanes are + interesting and not special cases. If one lane is inactive, we + use a scalar routine which is shared with the scalar variant. */ + if (lanes[0] & lanes[1]) + { + lo32 = vcvt_f32_f64 ( + v_approx_erfcf_poly_gauss (vcvt_f64_f32 (vget_low_f32 (abs_x)), + P (ia12[0]), P (ia12[1]))); + } + else if (lanes[0]) + { + lo32[0] = approx_poly_gauss (abs_x[0], P (ia12[0])); + } + else if (lanes[1]) + { + lo32[1] = approx_poly_gauss (abs_x[1], P (ia12[1])); + } + + if (lanes[2] & lanes[3]) + { + hi32 + = vcvt_f32_f64 (v_approx_erfcf_poly_gauss (vcvt_high_f64_f32 (abs_x), + P (ia12[2]), P (ia12[3]))); + } + else if (lanes[2]) + { + hi32[0] = approx_poly_gauss (abs_x[2], P (ia12[2])); + } + else if (lanes[3]) + { + hi32[1] = approx_poly_gauss (abs_x[3], P (ia12[3])); + } + + v_f32_t y = vcombine_f32 (lo32, hi32); + + if (v_any_u32 (sign)) + { + y = vbslq_f32 (vceqzq_u32 (sign), y, 2 - y); + } + + return y; +#endif +} + +/* Optimized single-precision vector complementary error function + erfcf. Max measured error: 0.750092 at various values between + -0x1.06521p-20 and -0x1.add1dap-17. For example: + __v_erfc(-0x1.08185p-18) got 0x1.00004cp+0 want 0x1.00004ap+0 + +0.249908 ulp err 0.250092. */ +VPCS_ATTR +v_f32_t V_NAME (erfcf) (v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t ia = ix & 0x7fffffff; + v_u32_t ia12 = ia >> 20; + v_u32_t sign = ix >> 31; + v_u32_t inf_ia12 = v_u32 (0x7f8); + + v_u32_t special_cases + = v_cond_u32 ((ia12 - 0x328) >= ((inf_ia12 & 0x7f8) - 0x328)); + v_u32_t in_bounds + = v_cond_u32 ((ia < 0x408ccccd) | (~sign & (ix < 0x4120f5c3))); + v_f32_t boring_zone = v_as_f32_u32 (sign << 30); + +#ifdef SCALAR + if (unlikely (special_cases)) + { + if (ia12 >= 0x7f8) + return (float) (sign << 1) + 1.0f / x; /* Special cases. */ + else + return 1.0f - x; /* Small case. */ + } + else if (likely (!in_bounds)) + { + return sign ? boring_zone : __math_uflowf (boring_zone); + } +#endif + + v_f32_t y = v_approx_erfcf (v_as_f32_u32 (ia), sign, ia12, + in_bounds & ~special_cases); + +#ifndef SCALAR + y = vbslq_f32 (~in_bounds, boring_zone, y); + + if (unlikely (v_any_u32 (special_cases))) + { + y = v_call_f32 (erfcf, x, y, special_cases); + } +#endif + + return y; +} +VPCS_ALIAS +#endif diff --git a/pl/math/vn_erfcf_1u.c b/pl/math/vn_erfcf_1u.c new file mode 100644 index 0000000..0262c86 --- /dev/null +++ b/pl/math/vn_erfcf_1u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_erfcf. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_erfcf, _ZGVnN4v_erfcf); +#include "v_erfcf_1u.c" +#endif -- cgit v1.2.3 From d0e8b691d41654f4853f94f7dc3e570dc4c088b9 Mon Sep 17 00:00:00 2001 From: Pierre Blanchard Date: Tue, 24 May 2022 17:17:42 +0100 Subject: pl/math: Fix outside source compilation. 
--- pl/math/Dir.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pl/math/Dir.mk b/pl/math/Dir.mk index 13ecf87..4a96dc6 100644 --- a/pl/math/Dir.mk +++ b/pl/math/Dir.mk @@ -5,7 +5,7 @@ PLM := $(srcdir)/pl/math AOR := $(srcdir)/math -B := $(srcdir)/build/pl/math +B := build/pl/math math-lib-srcs := $(wildcard $(PLM)/*.[cS]) math-test-srcs := \ -- cgit v1.2.3 From b129d871b3a168ba9c3a3b1584a3b110bc69ee7c Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Tue, 14 Jun 2022 11:34:05 +0100 Subject: pl/math: Add vector/Neon erf Maximum measured error is 1.75 ULP. --- pl/math/include/mathlib.h | 4 ++ pl/math/math_config.h | 8 +++ pl/math/s_erf_2u.c | 6 +++ pl/math/test/mathbench_funcs.h | 6 +++ pl/math/test/runulp.sh | 15 ++++++ pl/math/test/ulp_funcs.h | 4 ++ pl/math/test/ulp_wrappers.h | 3 ++ pl/math/tools/v_erf.sollya | 20 +++++++ pl/math/v_erf_2u.c | 104 +++++++++++++++++++++++++++++++++++ pl/math/v_erf_data.c | 119 +++++++++++++++++++++++++++++++++++++++++ pl/math/v_math.h | 18 +++++++ pl/math/vn_erf_2u.c | 12 +++++ 12 files changed, 319 insertions(+) create mode 100644 pl/math/s_erf_2u.c create mode 100644 pl/math/tools/v_erf.sollya create mode 100644 pl/math/v_erf_2u.c create mode 100644 pl/math/v_erf_data.c create mode 100644 pl/math/vn_erf_2u.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index e496cda..b20dfa4 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -17,6 +17,7 @@ double log10 (double); float __s_erfcf (float); float __s_log10f (float); +double __s_erf (double); double __s_erfc (double); double __s_log10 (double); @@ -32,6 +33,7 @@ typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; #endif /* Vector functions following the base PCS. */ +__f64x2_t __v_erf (__f64x2_t); __f32x4_t __v_erfcf (__f32x4_t); __f64x2_t __v_erfc (__f64x2_t); __f32x4_t __v_log10f (__f32x4_t); @@ -41,12 +43,14 @@ __f64x2_t __v_log10 (__f64x2_t); #define __vpcs __attribute__((__aarch64_vector_pcs__)) /* Vector functions following the vector PCS. */ +__vpcs __f64x2_t __vn_erf (__f64x2_t); __vpcs __f32x4_t __vn_erfcf (__f32x4_t); __vpcs __f64x2_t __vn_erfc (__f64x2_t); __vpcs __f32x4_t __vn_log10f (__f32x4_t); __vpcs __f64x2_t __vn_log10 (__f64x2_t); /* Vector functions following the vector PCS using ABI names. */ +__vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_erfcf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_erfc (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t); diff --git a/pl/math/math_config.h b/pl/math/math_config.h index 710990c..51bada5 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -393,4 +393,12 @@ extern const struct erfcf_poly_data #define V_EXP_TAIL_TABLE_BITS 8 extern const uint64_t __v_exp_tail_data[1 << V_EXP_TAIL_TABLE_BITS] HIDDEN; + +#define V_ERF_NINTS 49 +#define V_ERF_NCOEFFS 10 +extern const struct v_erf_data +{ + double shifts[V_ERF_NINTS]; + double coeffs[V_ERF_NCOEFFS][V_ERF_NINTS]; +} __v_erf_data HIDDEN; #endif diff --git a/pl/math/s_erf_2u.c b/pl/math/s_erf_2u.c new file mode 100644 index 0000000..e5c25e0 --- /dev/null +++ b/pl/math/s_erf_2u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_erf_2u.c" diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index b35b2f0..882f496 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -8,20 +8,26 @@ F (erfcf, -4.0, 10.0) F (erff, -4.0, 4.0) F (log10f, 0.01, 11.1) +D (erf, -6,6) D (erfc, -6.0, 28.0) D (log10, 0.01, 11.1) #if WANT_VMATH +D (__s_erf, -6.0, 6.0) F (__s_erfcf, -6.0, 28.0) D (__s_erfc, -6.0, 28.0) F (__s_log10f, 0.01, 11.1) D (__s_log10, 0.01, 11.1) #if __aarch64__ +VD (__v_erf, -6.0, 6.0) VF (__v_erfcf, -6.0, 28.0) VD (__v_erfc, -6.0, 28.0) VD (__v_log10, 0.01, 11.1) VF (__v_log10f, 0.01, 11.1) #ifdef __vpcs +VND(__vn_erf, -6.0, 6.0) +VND(_ZGVnN2v_erf, -6.0, 6.0) + VNF(__vn_erfcf, -6.0, 28.0) VNF(_ZGVnN4v_erfcf, -6.0, 28.0) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 3787233..bef0c27 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -112,11 +112,22 @@ range_log10f=' 0 0xffff0000 10000 0x1p-4 0x1p4 500000 ' + +range_erf=' + 0 0xffff0000 10000 + 0x1p-127 0x1p-26 40000 +-0x1p-127 -0x1p-26 40000 + 0x1p-26 0x1p3 40000 +-0x1p-26 -0x1p3 40000 + 0 inf 40000 +' + # error limits L_erfc=3.7 L_erfcf=1.0 L_log10=1.16 L_log10f=2.81 +L_erf=1.76 while read G F R do @@ -134,6 +145,10 @@ $range EOF done << EOF # group symbol run +erf __s_erf $runs +erf __v_erf $runv +erf __vn_erf $runvn +erf _ZGVnN2v_erf $runvn erfc __s_erfc $runs erfc __v_erfc $runv erfc __vn_erfc $runvn diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 48fd1ad..717f8c5 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -10,20 +10,24 @@ F1 (log10) D1 (erfc) D1 (log10) #if WANT_VMATH +F (__s_erf, __s_erf, erfl, mpfr_erf, 1, 0, d1, 0) F (__s_erfcf, __s_erfcf, erfc, mpfr_erfc, 1, 1, f1, 0) F (__s_erfc, __s_erfc, erfcl, mpfr_erfc, 1, 0, d1, 0) F (__s_log10f, __s_log10f, log10, mpfr_log10, 1, 1, f1, 0) F (__s_log10, __s_log10, log10l, mpfr_log10, 1, 0, d1, 0) #if __aarch64__ +F (__v_erf, v_erf, erfl, mpfr_erf, 1, 0, d1, 1) F (__v_erfcf, v_erfcf, erfc, mpfr_erfc, 1, 1, f1, 1) F (__v_erfc, v_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) F (__v_log10f, v_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (__v_log10, v_log10, log10l, mpfr_log10, 1, 0, d1, 1) #ifdef __vpcs +F (__vn_erf, vn_erf, erfl, mpfr_erf, 1, 0, d1, 1) F (__vn_erfcf, vn_erfcf, erfc, mpfr_erfc, 1, 1, f1, 1) F (__vn_erfc, vn_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) F (__vn_log10f, vn_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (__vn_log10, vn_log10, log10l, mpfr_log10, 1, 0, d1, 1) +F (_ZGVnN2v_erf, Z_erf, erfl, mpfr_erf, 1, 0, d1, 1) F (_ZGVnN4v_erfcf, Z_erfcf, erfc, mpfr_erfc, 1, 1, f1, 1) F (_ZGVnN2v_erfc, Z_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) F (_ZGVnN4v_log10f, Z_log10f, log10, mpfr_log10, 1, 1, f1, 1) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index c364525..80789f9 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -26,16 +26,19 @@ static inline v_double argd(double x) { return (v_double){x,dv[secondcall]}; } static float v_erfcf(float x) { return __v_erfcf(argf(x))[0]; } static float v_log10f(float x) { return __v_log10f(argf(x))[0]; } +static double v_erf(double x) { return __v_erf(argd(x))[0]; } static double v_erfc(double x) { return __v_erfc(argd(x))[0]; } static double v_log10(double x) { return __v_log10(argd(x))[0]; } #ifdef __vpcs static float vn_erfcf(float x) { return __vn_erfcf(argf(x))[0]; } static float vn_log10f(float x) { 
return __vn_log10f(argf(x))[0]; } +static double vn_erf(double x) { return __vn_erf(argd(x))[0]; } static double vn_erfc(double x) { return __vn_erfc(argd(x))[0]; } static double vn_log10(double x) { return __vn_log10(argd(x))[0]; } static float Z_erfcf(float x) { return _ZGVnN4v_erfcf(argf(x))[0]; } static float Z_log10f(float x) { return _ZGVnN4v_log10f(argf(x))[0]; } +static double Z_erf(double x) { return _ZGVnN2v_erf(argd(x))[0]; } static double Z_erfc(double x) { return _ZGVnN2v_erfc(argd(x))[0]; } static double Z_log10(double x) { return _ZGVnN2v_log10(argd(x))[0]; } #endif diff --git a/pl/math/tools/v_erf.sollya b/pl/math/tools/v_erf.sollya new file mode 100644 index 0000000..c9deae9 --- /dev/null +++ b/pl/math/tools/v_erf.sollya @@ -0,0 +1,20 @@ +// polynomial for approximating erf(x). +// To generate coefficients for interval i (0 to 47) do: +// $ sollya v_erf.sollya $i +// +// Copyright (c) 2022, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +scale = 1/8; +deg = 9; + +itv = parse(__argv[0]); +if (itv == 0) then { a = 0x1p-1022; } +else { a = itv * scale; }; + +prec=256; + +poly = fpminimax(erf(scale*x+a), deg, [|D ...|], [0; 1]); + +display = hexadecimal; +for i from 0 to deg do coeff(poly, i); \ No newline at end of file diff --git a/pl/math/v_erf_2u.c b/pl/math/v_erf_2u.c new file mode 100644 index 0000000..7a08a2c --- /dev/null +++ b/pl/math/v_erf_2u.c @@ -0,0 +1,104 @@ +/* + * Double-precision vector erf(x) function. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "include/mathlib.h" +#include "math_config.h" +#include "v_math.h" +#if V_SUPPORTED + +#define AbsMask v_u64 (0x7fffffffffffffff) +#define AbsXMax v_f64 (0x1.8p+2) +#define Scale v_f64 (0x1p+3) + +/* Special cases (fall back to scalar calls). */ +VPCS_ATTR +__attribute__ ((noinline)) static v_f64_t +specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) +{ + return v_call_f64 (erf, x, y, cmp); +} + +/* A structure to perform look-up in coeffs and other parameter tables. */ +struct entry +{ + v_f64_t P[V_ERF_NCOEFFS]; + v_f64_t shift; +}; + +static inline struct entry +lookup (v_u64_t i) +{ + struct entry e; +#ifdef SCALAR + for (int j = 0; j < V_ERF_NCOEFFS; ++j) + e.P[j] = __v_erf_data.coeffs[j][i]; + e.shift = __v_erf_data.shifts[i]; +#else + for (int j = 0; j < V_ERF_NCOEFFS; ++j) + { + e.P[j][0] = __v_erf_data.coeffs[j][i[0]]; + e.P[j][1] = __v_erf_data.coeffs[j][i[1]]; + } + e.shift[0] = __v_erf_data.shifts[i[0]]; + e.shift[1] = __v_erf_data.shifts[i[1]]; +#endif + return e; +} + +/* Optimized double precision vector error function erf. Maximum + observed error is 1.75 ULP, in [0.110, 0.111]: + verf(0x1.c5e0c2d5d0543p-4) got 0x1.fe0ed62a54987p-4 + want 0x1.fe0ed62a54985p-4. */ +VPCS_ATTR +v_f64_t V_NAME (erf) (v_f64_t x) +{ + /* Handle both inf/nan as well as small values (|x|<2^-28) + If any condition in the lane is true then a loop over + scalar calls will be performed. */ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t atop = (ix >> 48) & v_u64 (0x7fff); + v_u64_t special_case + = v_cond_u64 (atop - v_u64 (0x3e30) >= v_u64 (0x7ff0 - 0x3e30)); + + /* Get sign and absolute value. */ + v_u64_t sign = v_as_u64_f64 (x) & ~AbsMask; + v_f64_t a = v_min_f64 (v_abs_f64 (x), AbsXMax); + + /* Compute index by truncating 8 * a with a=|x| saturated to 6.0. 
*/ + +#ifdef SCALAR + v_u64_t i = v_trunc_u64 (a * Scale); +#else + v_u64_t i = vcvtq_n_u64_f64 (a, 3); +#endif + /* Get polynomial coefficients and shift parameter using lookup. */ + struct entry dat = lookup (i); + + /* Evaluate polynomial on transformed argument. */ + v_f64_t z = v_fma_f64 (a, Scale, dat.shift); + + v_f64_t r1 = v_fma_f64 (z, dat.P[1], dat.P[0]); + v_f64_t r2 = v_fma_f64 (z, dat.P[3], dat.P[2]); + v_f64_t r3 = v_fma_f64 (z, dat.P[5], dat.P[4]); + v_f64_t r4 = v_fma_f64 (z, dat.P[7], dat.P[6]); + v_f64_t r5 = v_fma_f64 (z, dat.P[9], dat.P[8]); + + v_f64_t z2 = z * z; + v_f64_t y = v_fma_f64 (z2, r5, r4); + y = v_fma_f64 (z2, y, r3); + y = v_fma_f64 (z2, y, r2); + y = v_fma_f64 (z2, y, r1); + + /* y=erf(x) if x>0, -erf(-x) otherwise. */ + y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign); + + if (unlikely (v_any_u64 (special_case))) + return specialcase (x, y, special_case); + return y; +} +VPCS_ALIAS +#endif diff --git a/pl/math/v_erf_data.c b/pl/math/v_erf_data.c new file mode 100644 index 0000000..1694f28 --- /dev/null +++ b/pl/math/v_erf_data.c @@ -0,0 +1,119 @@ +/* + * Polynomial coefficients and shifts for double-precision erf(x) vector + * function. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* 48 intervals of the form [x_i, x_{i+1}] with x_i = i / 8 for + i=1,...,47 (x_0 = 2^-1022). There is an extra dummy interval for + [6, +inf] with all coeffs = 0 except for P_0 = 1.0, as erf(x) == 1 + above 6. + + Coefficients for each interval generated using fpminimax algorithm. See + v_erf.sollya for details. Note the array is transposed, so for a set of + coefficients C generated on interval i, C[j] is at coeffs[j][i]. */ + +const struct v_erf_data __v_erf_data + = {.shifts + = {-0x1p-1019, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, + -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, + -26, -27, -28, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, + -39, -40, -41, -42, -43, -44, -45, -46, -47, 0}, + .coeffs = { + // clang-format off + +{0x1.20dd750429b6dp-1022, 0x1.1f5e1a35c3b8ap-3, 0x1.1af54e232d609p-2, 0x1.9dd0d2b721f39p-2, 0x1.0a7ef5c18edd2p-1, 0x1.3f196dcd0f135p-1, + 0x1.6c1c9759d0e5fp-1, 0x1.91724951b8fc6p-1, 0x1.af767a741088bp-1, 0x1.c6dad2829ec62p-1, 0x1.d8865d98abe00p-1, 0x1.e5768c3b4a3fcp-1, + 0x1.eea5557137ae0p-1, 0x1.f4f693b67bd77p-1, 0x1.f92d077f8d56dp-1, 0x1.fbe61eef4cf6ap-1, 0x1.fd9ae142795e3p-1, 0x1.fea4218d6594ap-1, + 0x1.ff404760319b4p-1, 0x1.ff9960f3eb327p-1, 0x1.ffcaa8f4c9beap-1, 0x1.ffe514bbdc197p-1, 0x1.fff2cfb0453d9p-1, 0x1.fff9ba420e834p-1, + 0x1.fffd1ac4135f9p-1, 0x1.fffeb3ebb267bp-1, 0x1.ffff6f9f67e55p-1, 0x1.ffffc316d9ed0p-1, 0x1.ffffe710d565ep-1, 0x1.fffff618c3da6p-1, + 0x1.fffffc2f171e3p-1, 0x1.fffffe92ced93p-1, 0x1.ffffff7b91176p-1, 0x1.ffffffd169d0cp-1, 0x1.fffffff01a8b6p-1, 0x1.fffffffabd229p-1, + 0x1.fffffffe4fa30p-1, 0x1.ffffffff79626p-1, 0x1.ffffffffd759dp-1, 0x1.fffffffff4188p-1, 0x1.fffffffffc9e8p-1, 0x1.ffffffffff11ap-1, + 0x1.ffffffffffc05p-1, 0x1.ffffffffffef8p-1, 0x1.fffffffffffbep-1, 0x1.ffffffffffff0p-1, 0x1.ffffffffffffcp-1, 0x1.fffffffffffffp-1, 1.0}, + +{0x1.20dd750429b6dp-3, 0x1.1c62fa1e86989p-3, 0x1.0f5d1602f7dfbp-3, 0x1.f5f0cdaf152b2p-4, 0x1.c1efca49a5051p-4, 0x1.86e9694134b22p-4, + 0x1.492e42d78d39cp-4, 0x1.0cab61f084b1bp-4, 0x1.a911f096fbb79p-5, 0x1.45e99bcbb78d4p-5, 0x1.e4652fadcbaa3p-6, 0x1.5ce595c455bccp-6, + 0x1.e723726b81ff1p-7, 0x1.499d478bca4acp-7, 0x1.b055303221566p-8, 0x1.12ceb37ffa389p-8, 
0x1.529b9e8cfa59fp-9, 0x1.94624e78e084fp-10, + 0x1.d4143a9e023f5p-11, 0x1.06918b63537c2p-11, 0x1.1d83170fcc34bp-12, 0x1.2ce898808f08ep-13, 0x1.3360ccd26e06ap-14, 0x1.30538fbb986fbp-15, + 0x1.2408e9bb1b657p-16, 0x1.0f9e1b4e4baaep-17, 0x1.e9b5e8d71b5e3p-19, 0x1.abe09e85af38ap-20, 0x1.6a5972347c568p-21, 0x1.296a70eff1bd9p-22, + 0x1.d9371ee6bfc07p-24, 0x1.6ce1a88a01b3ap-25, 0x1.10b14985663f9p-26, 0x1.8b0d07ade43d8p-28, 0x1.155a098eceb0fp-29, 0x1.7974d3b397e7cp-31, + 0x1.f1e3bf5a6493ap-33, 0x1.3e47781d91b97p-34, 0x1.8a7038368986cp-36, 0x1.d9d4d7be5992cp-38, 0x1.137dabebc1319p-39, 0x1.367541123e46cp-41, + 0x1.58007ab162c1dp-43, 0x1.709f0d280b3f5p-45, 0x1.30a3dcf531ebfp-47, 0x1.d2707c055dedcp-50, 0x1.0d97f61945387p-49, 0x1.1dbc3ab728933p-50, 0}, + +{0x1.2411381609db0p-51, -0x1.1c62fa1e75c0ap-9, -0x1.0f5d1602eb436p-8, -0x1.78749a4346714p-8, -0x1.c1efca49a7b15p-8, -0x1.e8a3c39178d95p-8, + -0x1.edc5644363883p-8, -0x1.d62beb64e19eep-8, -0x1.a911f096f7a87p-8, -0x1.6ea6cf452dca3p-8, -0x1.2ebf3dccb166cp-8, -0x1.dfbbadedfcde6p-9, + -0x1.6d5a95d08c346p-9, -0x1.0bcfca21880c9p-9, -0x1.7a4a8a2bf1a0bp-10, -0x1.01a1c8481a466p-10, -0x1.529b9e8d29ddap-11, -0x1.ada873604cf20p-12, + -0x1.074b60f960c25p-12, -0x1.37ccd585732c6p-13, -0x1.64e3dcd73a1d3p-14, -0x1.8af14827e93bap-15, -0x1.a6a519ae712fbp-16, -0x1.b5781ea681265p-17, + -0x1.b60d5ed744563p-18, -0x1.a8670acc75c29p-19, -0x1.8de3ce2154088p-20, -0x1.690584329096ap-21, -0x1.3d0e478659a54p-22, -0x1.0d8875cb088d0p-23, + -0x1.bba3c56e56d69p-25, -0x1.617a60b4bcd87p-26, -0x1.10b16afb9ce08p-27, -0x1.9766e11f62828p-29, -0x1.26afbc55ef33cp-30, -0x1.9cd52c0e709a9p-32, + -0x1.18175f6758766p-33, -0x1.705a68dde7f3ap-35, -0x1.d65ba6d52556dp-37, -0x1.23af5c3865987p-38, -0x1.51c72cd64a6bcp-40, -0x1.79f63bbc02f5ap-42, + -0x1.2346f2840d7bfp-43, -0x1.8110f614395a8p-45, 0x1.c3309f1fe85a4p-46, 0x1.09e6fb6ee0b85p-46, -0x1.959834938224fp-46, -0x1.0e9a684ecee47p-46, 0}, + +{-0x1.812746b057b58p-11, -0x1.6f552dbf96b31p-11, -0x1.3c97445cee1b0p-11, -0x1.e106c523a966dp-12, -0x1.2bf5318638e21p-12, -0x1.c8105034ea92fp-14, + 0x1.b6e85963275c5p-15, 0x1.7c9d756585d29p-13, 0x1.1b614b0e78122p-12, 0x1.4cb3cf0b42031p-12, 0x1.571d01cf7eeb3p-12, 0x1.4374d82fe7f2ep-12, + 0x1.1c2a02b9199a0p-12, 0x1.d6631e131dabap-13, 0x1.7148c3d9d22bap-13, 0x1.143d1c76ae7c6p-13, 0x1.8b0ae3afc07e6p-14, 0x1.0ea475d5b3822p-14, + 0x1.63ef6208bd4adp-15, 0x1.c1ec100ec3e71p-16, 0x1.119da13709716p-16, 0x1.407fbd00318a5p-17, 0x1.69cf481b4666cp-18, 0x1.89e17d2b19c42p-19, + 0x1.9db7531fa76f6p-20, 0x1.a37382bd61dc8p-21, 0x1.9aa4a8e8fe8dfp-22, 0x1.8451fcde36f23p-23, 0x1.62cd605193fe9p-24, 0x1.394b0d46af85cp-25, + 0x1.0b6c0d1191ec9p-26, 0x1.b9581bcc8f4ebp-28, 0x1.603ea0f602119p-29, 0x1.0ff28bc88022cp-30, 0x1.95ecc71a0b4bep-32, 0x1.24ffe516534d4p-33, + 0x1.9aa89abeffd90p-35, 0x1.1ab57210158fap-36, 0x1.8b0c503eafbcbp-38, 0x1.166413b8ba611p-39, 0x1.5848fad1e38e9p-42, 0x1.3573cc6d6d4e6p-49, + 0x1.404c0dc8b5ffcp-42, 0x1.38779160f5f11p-43, -0x1.1dc84293acf27p-42, -0x1.2892755467252p-43, 0x1.8e40aed4a9e02p-43, 0x1.0cef3bce98bedp-43, 0}, + +{0x1.4ade8e6d47ef0p-43, 0x1.196c9ee6491cfp-16, 0x1.040e8be6a9625p-15, 0x1.5529ad049b967p-15, 0x1.76f27e1744b44p-15, 0x1.6963c95cd8395p-15, + 0x1.349b5d6ae76a6p-15, 0x1.cc6056b95eed3p-16, 0x1.1b614adacb10dp-16, 0x1.ca5080f4ec9b9p-18, -0x1.93a9d54fb750bp-20, -0x1.f3b8d7695d38cp-18, + -0x1.6d5a929bfde5fp-17, -0x1.974c013452be9p-17, -0x1.8a0da620ab60fp-17, -0x1.5a3166e1f5682p-17, -0x1.1a2c5ad80a584p-17, -0x1.afe552a6507eep-18, + -0x1.38a9879a760b8p-18, -0x1.ae595d5041755p-19, -0x1.1a89c93c4b9c8p-19, 
-0x1.62d4c3dc10fdbp-20, -0x1.ab0c620cf63d1p-21, -0x1.ed4aeff35fd90p-22, + -0x1.11c8e63fae76dp-22, -0x1.2454a1fb4749ap-23, -0x1.2c7f7846b0e7bp-24, -0x1.298c17acfd63ap-25, -0x1.1c0f6cc5baa18p-26, -0x1.0574c9f0e63fap-27, + -0x1.d0a5c4232f4cep-29, -0x1.8d9d301253af8p-30, -0x1.49cb78be34c81p-31, -0x1.08fc30eb50526p-32, -0x1.96e2f50cad458p-34, -0x1.2c888ddad994bp-35, + -0x1.c5dd3068e7fcap-37, -0x1.935b876ed56ffp-38, -0x1.e74a7c256ba0dp-39, -0x1.1681c73733b50p-39, 0x1.855ab0b8664dep-41, 0x1.4aebdf7fb67e5p-41, + -0x1.2aef07c393759p-40, -0x1.37e52b17505e6p-41, 0x1.394b997da7ed5p-40, 0x1.4345440ea9876p-41, -0x1.af227669dca68p-41, -0x1.23589e4f3cc49p-41, 0}, + +{0x1.ce2f1b1646d4bp-19, 0x1.aaba29a029bd5p-19, 0x1.47e57fbf662a0p-19, 0x1.74882f55f1bd4p-20, 0x1.dfed759bd9091p-23, -0x1.c124b2acb3ee8p-21, + -0x1.b429a82901889p-20, -0x1.1350ee93fbfb3p-19, -0x1.1b613a5e1e196p-19, -0x1.f65ceb61aa63ap-20, -0x1.82814da1daaa1p-20, -0x1.f5729185c040ep-21, + -0x1.e72489bfea503p-22, -0x1.17d784c065f21p-24, 0x1.b2229e5122850p-23, 0x1.779b916c44358p-22, 0x1.ace7a08f66cb0p-22, 0x1.9973788b8f181p-22, + 0x1.5d3bceb9c39d5p-22, 0x1.11da976499339p-22, 0x1.90eaa0d25df91p-23, 0x1.146c19a9f0ae8p-23, 0x1.693a52f5ccd0bp-24, 0x1.c122683fc1404p-25, + 0x1.0a866e311e50ap-25, 0x1.2e85588e08741p-26, 0x1.493501a3ee15cp-27, 0x1.572eec204dc18p-28, 0x1.590e0157d4dabp-29, 0x1.4c0619d7359e8p-30, + 0x1.36608b7b22d22p-31, 0x1.0e3f514a0d7fep-32, 0x1.e04d29135056ep-34, 0x1.aa936eb977e33p-35, 0x1.3ce1ec4a299b6p-36, 0x1.aba42bc751130p-38, + 0x1.0861b5dc819e3p-38, 0x1.3bc7b1f0f8afbp-38, 0x1.7d6c896bf3579p-38, 0x1.14f24be91338cp-38, -0x1.2896024cf2ca9p-39, -0x1.c2e8399d1e8e7p-40, + 0x1.7836a61cc0f4bp-39, 0x1.8a98e07f8cdfcp-40, -0x1.8f332379c6ce4p-39, -0x1.9bbec3ab83755p-40, 0x1.126c9c6d24bd6p-39, 0x1.72eaeac065cc2p-40, 0}, + +{0x1.240b25b9a9823p-39, -0x1.733f879c52150p-24, -0x1.4c00873f3742fp-23, -0x1.9a6fe48163775p-23, -0x1.99ed7481d2399p-23, -0x1.52aea61425cf7p-23, + -0x1.b853c3ad1c781p-24, -0x1.53c3e486c1845p-25, 0x1.2e2a4e7a0286dp-26, 0x1.fd0e266132929p-25, 0x1.5cf1d8fe5611fp-24, 0x1.6b140ba72ac56p-24, + 0x1.3cab2fa73a9c4p-24, 0x1.d864967df5009p-25, 0x1.25b4551256078p-25, 0x1.0d029bc50b0cdp-26, 0x1.e126485c5dceep-30, -0x1.dd5e4bed818c0p-28, + -0x1.7cd1b44dbfdc3p-27, -0x1.981def704f39ep-27, -0x1.6f0e87a0f3e35p-27, -0x1.267c0dc9b6e95p-27, -0x1.b2ec3078bf153p-28, -0x1.2b066605239f5p-28, + -0x1.840473ed3d070p-29, -0x1.daf9b9b8c06cap-30, -0x1.1661520cf8a32p-30, -0x1.2fa49c29e30b5p-31, -0x1.4ddfd9d6a7cf4p-32, -0x1.4a55b8564425ap-33, + -0x1.5df1ca746f291p-34, -0x1.dd6b8d1ec2e4fp-36, -0x1.34c63d902f888p-36, -0x1.b55b65a1655c0p-37, -0x1.9c1cfd1e2142cp-39, 0x1.98f2b73f288c4p-43, + -0x1.3baba91a10af8p-39, -0x1.8cb03e5359e2bp-38, -0x1.16063ce2129afp-37, -0x1.9fd74120d8e00p-38, 0x1.cf0caf7defe71p-39, 0x1.5d029f324f3a7p-39, + -0x1.21268c2290cb5p-38, -0x1.2f6de12d74afdp-39, 0x1.332ead763d55ap-38, 0x1.3cd3a7103e138p-39, -0x1.a64e5d1cdb028p-39, -0x1.1d674b3db2a42p-39, 0}, + +{-0x1.b84a0abf33534p-27, -0x1.89c6cd0cf2b65p-27, -0x1.09bb37091d4aep-27, -0x1.68f777b72ca95p-29, 0x1.60a5240c5ece1p-29, 0x1.c7421c28ef551p-28, + 0x1.2e75b6acb2116p-27, 0x1.30f14412b258cp-27, 0x1.f153992d28a09p-28, 0x1.3b80153a3c97bp-28, 0x1.df36fe4b5094cp-30, -0x1.724a2b185f507p-31, + -0x1.37cb36ce4237dp-29, -0x1.963d70f677f90p-29, -0x1.8d5c135b0af66p-29, -0x1.42fbc01c11a3bp-29, -0x1.baba060b7adb1p-30, -0x1.eaf481fbc6feap-31, + -0x1.5b5d0a354e49cp-32, 0x1.fb57bbdb6f854p-35, 0x1.2423823b5dcaep-32, 0x1.64e9c7f44ececp-32, 0x1.59b6fb115bcefp-32, 0x1.179a1737c24d9p-32, + 0x1.a9515bcf95bb0p-33, 
0x1.1ca83baba64bdp-33, 0x1.826e7ef89b3cap-34, 0x1.7ab5cb5ca2db0p-35, 0x1.2ce997226e82dp-35, 0x1.fdd14ca5a6d38p-37, + 0x1.d35252de2a363p-37, -0x1.8dd5e799b3695p-39, 0x1.047fd46786432p-38, 0x1.aa8639c65a4a4p-38, 0x1.10495d2cdaee5p-41, -0x1.24b2b7e751230p-40, + 0x1.e2ec0b9e9b211p-40, 0x1.6203cc50754ffp-38, 0x1.f95c0def7238bp-38, 0x1.7b31a463405b9p-38, -0x1.a826fa90b3c96p-39, -0x1.3f6315812b719p-39, + 0x1.0862d42832ac6p-38, 0x1.1575d5fa4614cp-39, -0x1.18eb527929cedp-38, -0x1.21bd844e0e3b8p-39, 0x1.8233e415548a0p-39, 0x1.0501b16f5819bp-39, 0}, + +{0x1.9b4497171a29dp-39, 0x1.7f9c0bcd4b3e7p-32, 0x1.4928133bccac3p-31, 0x1.7b5a70f49485bp-31, 0x1.4f71ee2c4aff3p-31, 0x1.bca22e6a9cd38p-32, + 0x1.1c93a34970852p-33, -0x1.03d86c164d20cp-33, -0x1.448222383eb95p-32, -0x1.95aa76b3417ddp-32, -0x1.80448ecd34689p-32, -0x1.19d3f547d1f1fp-32, + -0x1.2c65995a6a63fp-33, -0x1.01b5832823cc6p-35, 0x1.97d70f56a4524p-35, 0x1.7d57df58d20a9p-34, 0x1.a3d6fe32773b9p-34, 0x1.6ff53581ac827p-34, + 0x1.faff84d277a6fp-35, 0x1.39ff19e23455bp-35, 0x1.9b1e383b8e03dp-37, 0x1.fd37bce839816p-40, -0x1.31b58a910d109p-37, -0x1.480a28743a67fp-37, + -0x1.9a8b926ca51b4p-37, -0x1.14d6b0b9c8256p-37, -0x1.227dfd10a7f51p-37, -0x1.d1d5ba9e5676cp-42, -0x1.71c57d72b90eap-38, -0x1.018922e3bb1eap-40, + -0x1.e0970faab38e6p-39, 0x1.a442b8ab5ed33p-39, -0x1.3a6f0acbd7293p-40, -0x1.7c53be7062a3ap-39, -0x1.c562622693573p-44, 0x1.458e668db57cdp-41, + -0x1.d5f41a61e90a0p-41, -0x1.60d1f7c57cb11p-39, -0x1.f8fa4c98324fep-39, -0x1.7b178840b90e3p-39, 0x1.a8558cdf5220ap-40, 0x1.3f7acb241cdbbp-40, + -0x1.086dc81118428p-39, -0x1.15828db8b2da6p-40, 0x1.18f9d5a5099c3p-39, 0x1.21cd05249b8c9p-40, -0x1.82493a2d7a1fep-40, -0x1.0510a8a58c1abp-40, 0}, + +{0x1.4c0cf8eccd2e0p-35, 0x1.de696ed8004cbp-36, 0x1.62392d5363e58p-37, -0x1.21d68e1a8e4c7p-37, -0x1.867b57075ec9dp-36, -0x1.058af4c30abafp-35, + -0x1.dbb6594ed5127p-36, -0x1.6006d1f354794p-36, -0x1.311e96adfec96p-37, 0x1.2c82e5ef56703p-39, 0x1.6f2c1413cbe8ep-37, 0x1.c46886dd6c5d6p-37, + 0x1.92e273bf63d54p-37, 0x1.2982faf5df034p-37, 0x1.5ad37b1dc30c4p-38, 0x1.97104fd2630f8p-40, -0x1.38bcd955ecbb9p-40, -0x1.7779727d36c91p-39, + -0x1.4862c13c3ccf5p-39, -0x1.53facd6319433p-39, -0x1.de2f6e88b0926p-41, -0x1.fb0967f0fa611p-41, 0x1.5fadb405af344p-42, 0x1.e90319ef64411p-43, + 0x1.fc013fac4d3d7p-41, 0x1.0546d08a05cacp-41, 0x1.fa1b10c35012ep-41, -0x1.000d4354b8049p-41, 0x1.b68ee44b2b84bp-41, 0x1.cfa36d83ea2afp-48, + 0x1.5c41a6c8aaf3ap-41, -0x1.7edb2342ceb28p-41, 0x1.d9211942a37d9p-43, 0x1.39b815d399ba2p-41, 0x1.1fc46969db91bp-46, -0x1.1736507c25bafp-43, + 0x1.89bbcfdb5c677p-43, 0x1.28f22b295bc86p-41, 0x1.a9396e0b45a3bp-41, 0x1.3f409ac2dbfafp-41, -0x1.65682520f07a7p-42, -0x1.0d1586492d3b1p-42, + 0x1.bd6c9f236abc3p-42, 0x1.d376a4bd795bep-43, -0x1.d94e87dd31275p-42, -0x1.e82d04ff5649fp-43, 0x1.455b18d5d810fp-42, 0x1.b7c6a4ab711bdp-43, 0} + // clang-format on + }}; diff --git a/pl/math/v_math.h b/pl/math/v_math.h index e9e7d44..68325b5 100644 --- a/pl/math/v_math.h +++ b/pl/math/v_math.h @@ -305,6 +305,10 @@ v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) return __builtin_fma (x, y, z); } static inline v_f64_t +v_min_f64(v_f64_t x, v_f64_t y) { + return x < y ? x : y; +} +static inline v_f64_t v_round_f64 (v_f64_t x) { return __builtin_round (x); @@ -314,6 +318,11 @@ v_round_s64 (v_f64_t x) { return __builtin_lround (x); /* relies on -fno-math-errno. */ } +static inline v_u64_t +v_trunc_u64 (v_f64_t x) +{ + return __builtin_trunc (x); +} /* convert to type1 from type2. 
*/ static inline v_f64_t v_to_f64_s64 (v_s64_t x) @@ -585,6 +594,10 @@ v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) return vfmaq_f64 (z, x, y); } static inline v_f64_t +v_min_f64(v_f64_t x, v_f64_t y) { + return vminq_f64(x, y); +} +static inline v_f64_t v_round_f64 (v_f64_t x) { return vrndaq_f64 (x); @@ -594,6 +607,11 @@ v_round_s64 (v_f64_t x) { return vcvtaq_s64_f64 (x); } +static inline v_u64_t +v_trunc_u64 (v_f64_t x) +{ + return vcvtq_u64_f64 (x); +} /* convert to type1 from type2. */ static inline v_f64_t v_to_f64_s64 (v_s64_t x) diff --git a/pl/math/vn_erf_2u.c b/pl/math/vn_erf_2u.c new file mode 100644 index 0000000..e0e10bb --- /dev/null +++ b/pl/math/vn_erf_2u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_erf. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_erf, _ZGVnN2v_erf); +#include "v_erf_2u.c" +#endif -- cgit v1.2.3 From dcaedbcdfe4a1ae9bb287af01e73082a43cd78df Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Tue, 14 Jun 2022 11:34:14 +0100 Subject: pl/math: Add vector/Neon erff New routine is accurate to 1.5 ULP. Vector erff has a dependency on v_expf, which has been copied across from the main math/ directory. --- pl/math/include/mathlib.h | 4 ++ pl/math/math_config.h | 6 +++ pl/math/s_erff_1u5.c | 6 +++ pl/math/s_expf.c | 6 +++ pl/math/test/mathbench_funcs.h | 5 ++ pl/math/test/runulp.sh | 14 ++++++ pl/math/test/ulp_funcs.h | 4 ++ pl/math/test/ulp_wrappers.h | 3 ++ pl/math/v_erff_1u5.c | 104 +++++++++++++++++++++++++++++++++++++++++ pl/math/v_erff_data.c | 18 +++++++ pl/math/v_expf.c | 83 ++++++++++++++++++++++++++++++++ pl/math/v_math.h | 40 ++++++++++++++++ pl/math/vn_erff_1u5.c | 12 +++++ pl/math/vn_expf.c | 12 +++++ 14 files changed, 317 insertions(+) create mode 100644 pl/math/s_erff_1u5.c create mode 100644 pl/math/s_expf.c create mode 100644 pl/math/v_erff_1u5.c create mode 100644 pl/math/v_erff_data.c create mode 100644 pl/math/v_expf.c create mode 100644 pl/math/vn_erff_1u5.c create mode 100644 pl/math/vn_expf.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index b20dfa4..e2da2eb 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -15,6 +15,7 @@ float log10f (float); double log10 (double); float __s_erfcf (float); +float __s_erff (float); float __s_log10f (float); double __s_erf (double); @@ -33,6 +34,7 @@ typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; #endif /* Vector functions following the base PCS. */ +__f32x4_t __v_erff (__f32x4_t); __f64x2_t __v_erf (__f64x2_t); __f32x4_t __v_erfcf (__f32x4_t); __f64x2_t __v_erfc (__f64x2_t); @@ -43,6 +45,7 @@ __f64x2_t __v_log10 (__f64x2_t); #define __vpcs __attribute__((__aarch64_vector_pcs__)) /* Vector functions following the vector PCS. */ +__vpcs __f32x4_t __vn_erff (__f32x4_t); __vpcs __f64x2_t __vn_erf (__f64x2_t); __vpcs __f32x4_t __vn_erfcf (__f32x4_t); __vpcs __f64x2_t __vn_erfc (__f64x2_t); @@ -50,6 +53,7 @@ __vpcs __f32x4_t __vn_log10f (__f32x4_t); __vpcs __f64x2_t __vn_log10 (__f64x2_t); /* Vector functions following the vector PCS using ABI names. 
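   These names follow the AArch64 vector function ABI mangling
   _ZGV<isa><mask><lanes><args>_<name>: for instance _ZGVnN4v_erff is the
   AdvSIMD ('n'), unmasked ('N'), 4-lane, single-vector-argument ('v')
   variant of erff.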
*/ +__vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_erfcf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_erfc (__f64x2_t); diff --git a/pl/math/math_config.h b/pl/math/math_config.h index 51bada5..4b009c1 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -401,4 +401,10 @@ extern const struct v_erf_data double shifts[V_ERF_NINTS]; double coeffs[V_ERF_NCOEFFS][V_ERF_NINTS]; } __v_erf_data HIDDEN; + +#define V_ERFF_NCOEFFS 7 +extern const struct v_erff_data +{ + float coeffs[V_ERFF_NCOEFFS][2]; +} __v_erff_data HIDDEN; #endif diff --git a/pl/math/s_erff_1u5.c b/pl/math/s_erff_1u5.c new file mode 100644 index 0000000..f6817eb --- /dev/null +++ b/pl/math/s_erff_1u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_erff_1u5.c" diff --git a/pl/math/s_expf.c b/pl/math/s_expf.c new file mode 100644 index 0000000..dacda7f --- /dev/null +++ b/pl/math/s_expf.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_expf.c" diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 882f496..bd90ae3 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -13,18 +13,23 @@ D (erfc, -6.0, 28.0) D (log10, 0.01, 11.1) #if WANT_VMATH +F (__s_erff, -4.0, 4.0) D (__s_erf, -6.0, 6.0) F (__s_erfcf, -6.0, 28.0) D (__s_erfc, -6.0, 28.0) F (__s_log10f, 0.01, 11.1) D (__s_log10, 0.01, 11.1) #if __aarch64__ +VF (__v_erff, -4.0, 4.0) VD (__v_erf, -6.0, 6.0) VF (__v_erfcf, -6.0, 28.0) VD (__v_erfc, -6.0, 28.0) VD (__v_log10, 0.01, 11.1) VF (__v_log10f, 0.01, 11.1) #ifdef __vpcs +VNF(__vn_erff, -4.0, 4.0) +VNF(_ZGVnN4v_erff, -4.0, 4.0) + VND(__vn_erf, -6.0, 6.0) VND(_ZGVnN2v_erf, -6.0, 6.0) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index bef0c27..72cad47 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -122,12 +122,22 @@ range_erf=' 0 inf 40000 ' +range_erff=' + 0 0xffff0000 10000 + 0x1p-127 0x1p-26 40000 +-0x1p-127 -0x1p-26 40000 + 0x1p-26 0x1p3 40000 +-0x1p-26 -0x1p3 40000 + 0 inf 40000 +' + # error limits L_erfc=3.7 L_erfcf=1.0 L_log10=1.16 L_log10f=2.81 L_erf=1.76 +L_erff=1.5 while read G F R do @@ -158,6 +168,10 @@ log10 __v_log10 $runv log10 __vn_log10 $runvn log10 _ZGVnN2v_log10 $runvn +erff __s_erff $runs +erff __v_erff $runv +erff __vn_erff $runvn +erff _ZGVnN4v_erff $runvn erfcf __s_erfcf $runs erfcf __v_erfcf $runv erfcf __vn_erfcf $runvn diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 717f8c5..8142a69 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -10,23 +10,27 @@ F1 (log10) D1 (erfc) D1 (log10) #if WANT_VMATH +F (__s_erff, __s_erff, erf, mpfr_erf, 1, 1, f1, 0) F (__s_erf, __s_erf, erfl, mpfr_erf, 1, 0, d1, 0) F (__s_erfcf, __s_erfcf, erfc, mpfr_erfc, 1, 1, f1, 0) F (__s_erfc, __s_erfc, erfcl, mpfr_erfc, 1, 0, d1, 0) F (__s_log10f, __s_log10f, log10, mpfr_log10, 1, 1, f1, 0) F (__s_log10, __s_log10, log10l, mpfr_log10, 1, 0, d1, 0) #if __aarch64__ +F (__v_erff, v_erff, erf, mpfr_erf, 1, 1, f1, 1) F (__v_erf, v_erf, erfl, mpfr_erf, 1, 0, d1, 1) F (__v_erfcf, v_erfcf, erfc, mpfr_erfc, 1, 1, f1, 1) F (__v_erfc, v_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) F (__v_log10f, v_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (__v_log10, v_log10, log10l, mpfr_log10, 1, 0, d1, 1) #ifdef __vpcs +F (__vn_erff, 
vn_erff, erf, mpfr_erf, 1, 1, f1, 1) F (__vn_erf, vn_erf, erfl, mpfr_erf, 1, 0, d1, 1) F (__vn_erfcf, vn_erfcf, erfc, mpfr_erfc, 1, 1, f1, 1) F (__vn_erfc, vn_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) F (__vn_log10f, vn_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (__vn_log10, vn_log10, log10l, mpfr_log10, 1, 0, d1, 1) +F (_ZGVnN4v_erff, Z_erff, erf, mpfr_erf, 1, 1, f1, 1) F (_ZGVnN2v_erf, Z_erf, erfl, mpfr_erf, 1, 0, d1, 1) F (_ZGVnN4v_erfcf, Z_erfcf, erfc, mpfr_erfc, 1, 1, f1, 1) F (_ZGVnN2v_erfc, Z_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 80789f9..d6db464 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -24,18 +24,21 @@ static const double dv[2] = {1.0, -INFINITY}; static inline v_float argf(float x) { return (v_float){x,x,x,fv[secondcall]}; } static inline v_double argd(double x) { return (v_double){x,dv[secondcall]}; } +static float v_erff(float x) { return __v_erff(argf(x))[0]; } static float v_erfcf(float x) { return __v_erfcf(argf(x))[0]; } static float v_log10f(float x) { return __v_log10f(argf(x))[0]; } static double v_erf(double x) { return __v_erf(argd(x))[0]; } static double v_erfc(double x) { return __v_erfc(argd(x))[0]; } static double v_log10(double x) { return __v_log10(argd(x))[0]; } #ifdef __vpcs +static float vn_erff(float x) { return __vn_erff(argf(x))[0]; } static float vn_erfcf(float x) { return __vn_erfcf(argf(x))[0]; } static float vn_log10f(float x) { return __vn_log10f(argf(x))[0]; } static double vn_erf(double x) { return __vn_erf(argd(x))[0]; } static double vn_erfc(double x) { return __vn_erfc(argd(x))[0]; } static double vn_log10(double x) { return __vn_log10(argd(x))[0]; } +static float Z_erff(float x) { return _ZGVnN4v_erff(argf(x))[0]; } static float Z_erfcf(float x) { return _ZGVnN4v_erfcf(argf(x))[0]; } static float Z_log10f(float x) { return _ZGVnN4v_log10f(argf(x))[0]; } static double Z_erf(double x) { return _ZGVnN2v_erf(argd(x))[0]; } diff --git a/pl/math/v_erff_1u5.c b/pl/math/v_erff_1u5.c new file mode 100644 index 0000000..7c910bd --- /dev/null +++ b/pl/math/v_erff_1u5.c @@ -0,0 +1,104 @@ +/* + * Single-precision vector erf(x) function. + * + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "include/mathlib.h" +#include "math_config.h" +#include "v_math.h" +#if V_SUPPORTED + +VPCS_ATTR v_f32_t V_NAME (expf) (v_f32_t); + +#define AbsMask v_u32 (0x7fffffff) + +/* Special cases (fall back to scalar calls). */ +VPCS_ATTR +__attribute__ ((noinline)) static v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) +{ + return v_call_f32 (erff, x, y, cmp); +} + +/* A structure to perform look-up in coeffs and other parameter tables. */ +struct entry +{ + v_f32_t P[V_ERFF_NCOEFFS]; +}; + +static inline struct entry +lookup (v_u32_t i) +{ + struct entry e; +#ifdef SCALAR + for (int j = 0; j < V_ERFF_NCOEFFS; ++j) + e.P[j] = __v_erff_data.coeffs[j][i]; +#else + for (int j = 0; j < V_ERFF_NCOEFFS; ++j) + { + e.P[j][0] = __v_erff_data.coeffs[j][i[0]]; + e.P[j][1] = __v_erff_data.coeffs[j][i[1]]; + e.P[j][2] = __v_erff_data.coeffs[j][i[2]]; + e.P[j][3] = __v_erff_data.coeffs[j][i[3]]; + } +#endif + return e; +} + +/* Optimized single precision vector error function erf. + Maximum measured at +/- 0.931, 1.25ULP: + v_erff(-0x1.dc59fap-1) got -0x1.9f9c88p-1 + want -0x1.9f9c8ap-1. */ +VPCS_ATTR +v_f32_t V_NAME (erff) (v_f32_t x) +{ + /* Handle both inf/nan as well as small values (|x|<2^-28). 
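+     0x3180 is the top half of 0x1p-28f, so after the unsigned subtract
+     any smaller |x| wraps around and compares high, letting a single
+     compare test both ends of the range.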
If any condition + in the lane is true then a loop over scalar calls will be performed. */ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t atop = (ix >> 16) & v_u32 (0x7fff); + v_u32_t cmp = v_cond_u32 (atop - v_u32 (0x3180) >= v_u32 (0x7ff0 - 0x3180)); + + /* Get sign and absolute value. */ + v_u32_t sign = ix & ~AbsMask; + /* |x| < 0.921875. */ + v_u32_t red = v_calt_f32 (x, v_f32 (0.921875f)); + /* |x| > 4.0. */ + v_u32_t bor = v_cagt_f32 (x, v_f32 (4.0f)); + /* Avoid dependency in abs(x) in division (and comparison). */ + v_u32_t i = v_sel_u32 (red, v_u32 (0), v_u32 (1)); + + /* Get polynomial coefficients. */ + struct entry dat = lookup (i); + + v_f32_t a = v_abs_f32 (x); + v_f32_t z = v_sel_f32 (red, x * x, a); + + /* Evaluate Polynomial of |x| or x^2. */ + v_f32_t r = dat.P[6]; + r = v_fma_f32 (z, r, dat.P[5]); + r = v_fma_f32 (z, r, dat.P[4]); + r = v_fma_f32 (z, r, dat.P[3]); + r = v_fma_f32 (z, r, dat.P[2]); + r = v_fma_f32 (z, r, dat.P[1]); + r = v_sel_f32 (red, r, v_fma_f32 (z, r, dat.P[0])); + r = v_fma_f32 (a, r, a); + + /* y = |x| + |x|*P(|x|) if |x| < 0.921875 + 1 - exp (-(|x|+|x|*P(x^2))) otherwise. */ + v_f32_t y = v_sel_f32 (red, r, v_f32 (1.0f) - V_NAME (expf) (-r)); + + /* Boring domain (absolute value is required to get the sign of erf(-nan) + right). */ + y = v_sel_f32 (bor, v_f32 (1.0f), v_abs_f32 (y)); + + /* y=erf(x) if x>0, -erf(-x) otherwise. */ + y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign); + + if (unlikely (v_any_u32 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS +#endif diff --git a/pl/math/v_erff_data.c b/pl/math/v_erff_data.c new file mode 100644 index 0000000..0661d20 --- /dev/null +++ b/pl/math/v_erff_data.c @@ -0,0 +1,18 @@ +/* + * Data for approximation of vector erff. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Minimax approximation of erff. */ +const struct v_erff_data __v_erff_data + = {.coeffs = {{0x0p0f, 0x1.079d0cp-3f}, + {0x1.06eba6p-03f, 0x1.450aa0p-1}, + {-0x1.8126e0p-02f, 0x1.b55cb0p-4f}, + {0x1.ce1a46p-04f, -0x1.8d6300p-6f}, + {-0x1.b68bd2p-06f, 0x1.fd1336p-9f}, + {0x1.473f48p-08f, -0x1.91d2ccp-12f}, + {-0x1.3a1a82p-11f, 0x1.222900p-16f}}}; diff --git a/pl/math/v_expf.c b/pl/math/v_expf.c new file mode 100644 index 0000000..2707ebc --- /dev/null +++ b/pl/math/v_expf.c @@ -0,0 +1,83 @@ +/* + * Single-precision vector e^x function. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" +#if V_SUPPORTED + +static const float Poly[] = { + /* maxerr: 1.45358 +0.5 ulp. */ + 0x1.0e4020p-7f, + 0x1.573e2ep-5f, + 0x1.555e66p-3f, + 0x1.fffdb6p-2f, + 0x1.ffffecp-1f, +}; +#define C0 v_f32 (Poly[0]) +#define C1 v_f32 (Poly[1]) +#define C2 v_f32 (Poly[2]) +#define C3 v_f32 (Poly[3]) +#define C4 v_f32 (Poly[4]) + +#define Shift v_f32 (0x1.8p23f) +#define InvLn2 v_f32 (0x1.715476p+0f) +#define Ln2hi v_f32 (0x1.62e4p-1f) +#define Ln2lo v_f32 (0x1.7f7d1cp-20f) + +VPCS_ATTR +static v_f32_t +specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale) +{ + /* 2^n may overflow, break it up into s1*s2. 
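+     For positive n the factors are s1 = 2^127 and s2 = 2^(n-127), so the
+     scale is only ever formed as a product of two representable numbers;
+     b rebiases the split for n <= 0 so that s2 stays clear of the
+     subnormal range.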
*/ + v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000); + v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); + v_f32_t s2 = v_as_f32_u32 (e - b); + v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f)); + v_u32_t r2 = v_as_u32_f32 (s1 * s1); + v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1); + /* Similar to r1 but avoids double rounding in the subnormal range. */ + v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale)); + return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0)); +} + +VPCS_ATTR +v_f32_t +V_NAME(expf) (v_f32_t x) +{ + v_f32_t n, r, r2, scale, p, q, poly, absn, z; + v_u32_t cmp, e; + + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ +#if 1 + z = v_fma_f32 (x, InvLn2, Shift); + n = z - Shift; + r = v_fma_f32 (n, -Ln2hi, x); + r = v_fma_f32 (n, -Ln2lo, r); + e = v_as_u32_f32 (z) << 23; +#else + z = x * InvLn2; + n = v_round_f32 (z); + r = v_fma_f32 (n, -Ln2hi, x); + r = v_fma_f32 (n, -Ln2lo, r); + e = v_as_u32_s32 (v_round_s32 (z)) << 23; +#endif + scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); + absn = v_abs_f32 (n); + cmp = v_cond_u32 (absn > v_f32 (126.0f)); + r2 = r * r; + p = v_fma_f32 (C0, r, C1); + q = v_fma_f32 (C2, r, C3); + q = v_fma_f32 (p, r2, q); + p = C4 * r; + poly = v_fma_f32 (q, r2, p); + if (unlikely (v_any_u32 (cmp))) + return specialcase (poly, n, e, absn, cmp, scale); + return v_fma_f32 (poly, scale, scale); +} +VPCS_ALIAS +#endif diff --git a/pl/math/v_math.h b/pl/math/v_math.h index 68325b5..1f9217e 100644 --- a/pl/math/v_math.h +++ b/pl/math/v_math.h @@ -173,6 +173,16 @@ v_abs_f32 (v_f32_t x) { return __builtin_fabsf (x); } +static inline v_u32_t +v_cagt_f32 (v_f32_t x, v_f32_t y) +{ + return fabsf (x) > fabsf (y); +} +static inline v_u32_t +v_calt_f32 (v_f32_t x, v_f32_t y) +{ + return fabsf (x) < fabsf (y); +} static inline v_f32_t v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) { @@ -188,6 +198,16 @@ v_round_s32 (v_f32_t x) { return __builtin_lroundf (x); /* relies on -fno-math-errno. */ } +static inline v_f32_t +v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y) +{ + return p ? x : y; +} +static inline v_u32_t +v_sel_u32 (v_u32_t p, v_u32_t x, v_u32_t y) +{ + return p ? x : y; +} /* convert to type1 from type2. */ static inline v_f32_t v_to_f32_s32 (v_s32_t x) @@ -457,6 +477,16 @@ v_abs_f32 (v_f32_t x) { return vabsq_f32 (x); } +static inline v_u32_t +v_cagt_f32 (v_f32_t x, v_f32_t y) +{ + return vcagtq_f32 (x, y); +} +static inline v_u32_t +v_calt_f32 (v_f32_t x, v_f32_t y) +{ + return vcaltq_f32 (x, y); +} static inline v_f32_t v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) { @@ -472,6 +502,16 @@ v_round_s32 (v_f32_t x) { return vcvtaq_s32_f32 (x); } +static inline v_f32_t +v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y) +{ + return vbslq_f32 (p, x, y); +} +static inline v_u32_t +v_sel_u32 (v_u32_t p, v_u32_t x, v_u32_t y) +{ + return vbslq_u32 (p, x, y); +} /* convert to type1 from type2. */ static inline v_f32_t v_to_f32_s32 (v_s32_t x) diff --git a/pl/math/vn_erff_1u5.c b/pl/math/vn_erff_1u5.c new file mode 100644 index 0000000..89126f9 --- /dev/null +++ b/pl/math/vn_erff_1u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_erff. + * + * Copyright (c) 2019-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_erff, _ZGVnN4v_erff); +#include "v_erff_1u5.c" +#endif diff --git a/pl/math/vn_expf.c b/pl/math/vn_expf.c new file mode 100644 index 0000000..6e91a94 --- /dev/null +++ b/pl/math/vn_expf.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_expf. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_expf, _ZGVnN4v_expf) +#include "v_expf.c" +#endif -- cgit v1.2.3 From a3f88d5ad9e0f9a8a5f0baecb7148cbd1f059839 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Tue, 14 Jun 2022 14:58:59 +0100 Subject: Add Joe Ramsay to MAINTAINERS --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index ed77c6a..6c5823a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6,6 +6,7 @@ networking/ Szabolcs Nagy pl/ Pierre Blanchard + Joe Ramsay string/ Szabolcs Nagy Wilco Dijkstra -- cgit v1.2.3 From bc9ff56f52ff973786da5204b021eaac2f3a8a95 Mon Sep 17 00:00:00 2001 From: Pierre Blanchard Date: Tue, 14 Jun 2022 15:55:50 +0100 Subject: math: Enable compilation of SVE routines. Enable SVE compilation by uncommenting following line in config.mk: math-cflags += -march=armv8.2-a+sve -DWANT_SVE_MATH=1 --- config.mk.dist | 3 +++ 1 file changed, 3 insertions(+) diff --git a/config.mk.dist b/config.mk.dist index 78588de..b7fc243 100644 --- a/config.mk.dist +++ b/config.mk.dist @@ -62,6 +62,9 @@ math-cflags += -ffp-contract=fast -fno-math-errno # Disable vector math code #math-cflags += -DWANT_VMATH=0 +# Enable SVE vector code +#math-cflags += -march=armv8.2-a+sve -DWANT_SVE_MATH=1 + # Disable fenv checks #math-ulpflags = -q -f #math-testflags = -nostatus -- cgit v1.2.3 From ebb48105b083d89741facd3dd5d51da068c3efaa Mon Sep 17 00:00:00 2001 From: Pierre Blanchard Date: Wed, 15 Jun 2022 08:31:13 +0100 Subject: math: Add support for SVE in ulp assessment. This patch provides features for assessing accuracy of SVE routines. It also provides dummy entries and wrappers to test ulp assessment without having to provide an actual implementation. --- math/test/ulp.c | 101 ++++++++++++++++++++++++++++++++++++++++++++ math/test/ulp_wrappers.h | 12 ------ pl/math/test/ulp_funcs.h | 1 + pl/math/test/ulp_wrappers.h | 13 +----- 4 files changed, 103 insertions(+), 24 deletions(-) diff --git a/math/test/ulp.c b/math/test/ulp.c index a38238e..6fdc395 100644 --- a/math/test/ulp.c +++ b/math/test/ulp.c @@ -214,8 +214,89 @@ struct conf double errlim; }; +/* A bit of a hack: call vector functions twice with the same + input in lane 0 but a different value in other lanes: once + with an in-range value and then with a special case value. */ +static int secondcall; + +/* Wrappers for vector functions. 
*/ +#if __aarch64__ && WANT_VMATH +typedef __f32x4_t v_float; +typedef __f64x2_t v_double; +static const float fv[2] = {1.0f, -INFINITY}; +static const double dv[2] = {1.0, -INFINITY}; +static inline v_float argf(float x) { return (v_float){x,x,x,fv[secondcall]}; } +static inline v_double argd(double x) { return (v_double){x,dv[secondcall]}; } +#if WANT_SVE_MATH +#include +typedef __SVFloat32_t sv_float; +typedef __SVFloat64_t sv_double; + +static inline sv_float svargf(float x) { + int n = svcntw(); + float base[n]; + for (int i=0; i Date: Wed, 15 Jun 2022 08:31:57 +0100 Subject: math: Add support for SVE in mathbench. Provide dummy example to test benchmarks without having to provide an actual implementation. --- math/test/mathbench.c | 107 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) diff --git a/math/test/mathbench.c b/math/test/mathbench.c index 5c8881a..a3093f3 100644 --- a/math/test/mathbench.c +++ b/math/test/mathbench.c @@ -66,6 +66,43 @@ v_float_dup (float x) { return (v_float){x, x, x, x}; } +#if WANT_SVE_MATH +#include +typedef svbool_t sv_bool; +typedef svfloat64_t sv_double; + +#define sv_double_len() svcntd() + +static inline sv_double +sv_double_load (const double *p) +{ + svbool_t pg = svptrue_b64(); + return svld1(pg, p); +} + +static inline sv_double +sv_double_dup (double x) +{ + return svdup_n_f64(x); +} + +typedef svfloat32_t sv_float; + +#define sv_float_len() svcntw() + +static inline sv_float +sv_float_load (const float *p) +{ + svbool_t pg = svptrue_b32(); + return svld1(pg, p); +} + +static inline sv_float +sv_float_dup (float x) +{ + return svdup_n_f32(x); +} +#endif #else /* dummy definitions to make things compile. */ typedef double v_double; @@ -115,6 +152,20 @@ __vn_dummyf (v_float x) { return x; } +#endif +#if WANT_SVE_MATH +static sv_double +__sv_dummy (sv_double x, sv_bool pg) +{ + return x; +} + +static sv_float +__sv_dummyf (sv_float x, sv_bool pg) +{ + return x; +} + #endif #endif #endif @@ -137,6 +188,10 @@ static const struct fun #ifdef __vpcs __vpcs v_double (*vnd) (v_double); __vpcs v_float (*vnf) (v_float); +#endif +#if WANT_SVE_MATH + sv_double (*svd) (sv_double, sv_bool); + sv_float (*svf) (sv_float, sv_bool); #endif } fun; } funtab[] = { @@ -146,6 +201,8 @@ static const struct fun #define VF(func, lo, hi) {#func, 'f', 'v', lo, hi, {.vf = func}}, #define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}}, #define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}}, +#define SVD(func, lo, hi) {#func, 'd', 's', lo, hi, {.svd = func}}, +#define SVF(func, lo, hi) {#func, 'f', 's', lo, hi, {.svf = func}}, D (dummy, 1.0, 2.0) F (dummyf, 1.0, 2.0) #if WANT_VMATH @@ -156,6 +213,10 @@ VF (__v_dummyf, 1.0, 2.0) VND (__vn_dummy, 1.0, 2.0) VNF (__vn_dummyf, 1.0, 2.0) #endif +#if WANT_SVE_MATH +SVD (__sv_dummy, 1.0, 2.0) +SVF (__sv_dummyf, 1.0, 2.0) +#endif #endif #endif #include "test/mathbench_funcs.h" @@ -166,6 +227,8 @@ VNF (__vn_dummyf, 1.0, 2.0) #undef VD #undef VNF #undef VND +#undef SVF +#undef SVD }; static void @@ -330,6 +393,40 @@ runf_vn_latency (__vpcs v_float f (v_float)) } #endif +#if WANT_SVE_MATH +static void +run_sv_thruput (sv_double f (sv_double, sv_bool)) +{ + for (int i = 0; i < N; i += sv_double_len ()) + f (sv_double_load (A+i), svptrue_b64 ()); +} + +static void +runf_sv_thruput (sv_float f (sv_float, sv_bool)) +{ + for (int i = 0; i < N; i += sv_float_len ()) + f (sv_float_load (Af+i), svptrue_b32 ()); +} + +static void +run_sv_latency (sv_double f (sv_double, sv_bool)) +{ + 
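+  /* Chain the result of each call into the next input via an FMA with
+     zero so that iterations serialise, measuring latency rather than
+     throughput.  */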
sv_double z = sv_double_dup (zero); + sv_double prev = z; + for (int i = 0; i < N; i += sv_double_len ()) + prev = f (svmad_f64_x (svptrue_b64 (), prev, z, sv_double_load (A+i)), svptrue_b64 ()); +} + +static void +runf_sv_latency (sv_float f (sv_float, sv_bool)) +{ + sv_float z = sv_float_dup (zero); + sv_float prev = z; + for (int i = 0; i < N; i += sv_float_len ()) + prev = f (svmad_f32_x (svptrue_b32 (), prev, z, sv_float_load (Af+i)), svptrue_b32 ()); +} +#endif + static uint64_t tic (void) { @@ -392,6 +489,16 @@ bench1 (const struct fun *f, int type, double lo, double hi) else if (f->prec == 'f' && type == 'l' && f->vec == 'n') TIMEIT (runf_vn_latency, f->fun.vnf); #endif +#if WANT_SVE_MATH + else if (f->prec == 'd' && type == 't' && f->vec == 's') + TIMEIT (run_sv_thruput, f->fun.svd); + else if (f->prec == 'd' && type == 'l' && f->vec == 's') + TIMEIT (run_sv_latency, f->fun.svd); + else if (f->prec == 'f' && type == 't' && f->vec == 's') + TIMEIT (runf_sv_thruput, f->fun.svf); + else if (f->prec == 'f' && type == 'l' && f->vec == 's') + TIMEIT (runf_sv_latency, f->fun.svf); +#endif if (type == 't') { -- cgit v1.2.3 From acd2080cd05cfbc598785c51cd2b4f69ea0eb57c Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Thu, 16 Jun 2022 10:18:47 +0100 Subject: pl/math: Turn clang-format off in headers These headers do not follow the glibc style. --- pl/math/include/mathlib.h | 4 +++- pl/math/test/mathbench_funcs.h | 18 ++++++++++-------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index e2da2eb..7813390 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -1,3 +1,4 @@ +// clang-format off /* * Public API. * @@ -8,7 +9,7 @@ #ifndef _MATHLIB_H #define _MATHLIB_H -float erfcf(float); +float erfcf (float); float erff (float); float log10f (float); @@ -64,3 +65,4 @@ __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t); #endif #endif +// clang-format on diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index bd90ae3..5703089 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -1,3 +1,4 @@ +// clang-format off /* * Function entries for mathbench. * @@ -27,17 +28,17 @@ VD (__v_erfc, -6.0, 28.0) VD (__v_log10, 0.01, 11.1) VF (__v_log10f, 0.01, 11.1) #ifdef __vpcs -VNF(__vn_erff, -4.0, 4.0) -VNF(_ZGVnN4v_erff, -4.0, 4.0) +VNF (__vn_erff, -4.0, 4.0) +VNF (_ZGVnN4v_erff, -4.0, 4.0) -VND(__vn_erf, -6.0, 6.0) -VND(_ZGVnN2v_erf, -6.0, 6.0) +VND (__vn_erf, -6.0, 6.0) +VND (_ZGVnN2v_erf, -6.0, 6.0) -VNF(__vn_erfcf, -6.0, 28.0) -VNF(_ZGVnN4v_erfcf, -6.0, 28.0) +VNF (__vn_erfcf, -6.0, 28.0) +VNF (_ZGVnN4v_erfcf, -6.0, 28.0) -VND(__vn_erfc, -6.0, 28.0) -VND(_ZGVnN2v_erfc, -6.0, 28.0) +VND (__vn_erfc, -6.0, 28.0) +VND (_ZGVnN2v_erfc, -6.0, 28.0) VNF (__vn_log10f, 0.01, 11.1) VNF (_ZGVnN4v_log10f, 0.01, 11.1) @@ -47,3 +48,4 @@ VND (_ZGVnN2v_log10, 0.01, 11.1) #endif #endif #endif +// clang-format on -- cgit v1.2.3 From 3596995bfbfe81561b7f0bb76ec160fb8231f411 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Thu, 16 Jun 2022 11:19:56 +0100 Subject: pl/math: Refine ulp assessment for log10 Somehow the tests currently miss a lot of values which give significantly worse error than the previous estimate. Attempted to come up with a new absolute-worst-case estimate, however it is still very much an estimate as values close to the worst-case appear right across the range of positive doubles, with tiny and huge values being a shade worse. 
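For reference, the error reported here is the distance from the correctly
rounded result in units in the last place of the reference value. A
minimal sketch of the idea for doubles (a hypothetical helper, not the
actual ulp harness, which compares against MPFR and handles rounding
modes, subnormals and specials):

    #include <math.h>

    /* Error of got against a higher-precision reference want, in ulps
       of want.  Sketch only: assumes normal, finite values.  */
    static double
    ulp_error (double got, double want)
    {
      int exp;
      frexp (want, &exp);                 /* want = m * 2^exp, |m| in [0.5, 1).  */
      double ulp = ldexp (1.0, exp - 53); /* spacing of doubles near want.  */
      return fabs (got - want) / ulp;
    }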
--- pl/math/log10_2u.c | 132 ----------------------------------------------- pl/math/log10_2u1.c | 135 +++++++++++++++++++++++++++++++++++++++++++++++++ pl/math/test/runulp.sh | 2 +- 3 files changed, 136 insertions(+), 133 deletions(-) delete mode 100644 pl/math/log10_2u.c create mode 100644 pl/math/log10_2u1.c diff --git a/pl/math/log10_2u.c b/pl/math/log10_2u.c deleted file mode 100644 index d0c3123..0000000 --- a/pl/math/log10_2u.c +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Double-precision log10(x) function. - * - * Copyright (c) 2020-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" -#include -#include -#include - -/* Polynomial coefficients and lookup tables. */ -#define T __log10_data.tab -#define T2 __log10_data.tab2 -#define B __log10_data.poly1 -#define A __log10_data.poly -#define Ln2hi __log10_data.ln2hi -#define Ln2lo __log10_data.ln2lo -#define InvLn10 __log10_data.invln10 -#define N (1 << LOG10_TABLE_BITS) -#define OFF 0x3fe6000000000000 -#define LO asuint64 (1.0 - 0x1p-5) -#define HI asuint64 (1.0 + 0x1.1p-5) - -/* Top 16 bits of a double. */ -static inline uint32_t -top16 (double x) -{ - return asuint64 (x) >> 48; -} - -/* Fast and low accuracy implementation of log10. - The implementation is similar to that of math/log, except that: - - Polynomials are computed for log10(1+r) with r on same intervals as log. - - Lookup parameters are scaled (at runtime) to switch from base e to base 10. - Max ULP error: < 1.7 ulp (nearest rounding.) - with (LOG10_POLY1_ORDER = 10, LOG10_POLY_ORDER = 6, N = 128) - Maximum measured at 1.655 ulp for x in [0.0746, 0.0747]: - log10(0x1.ee008434a44a4p-1) got -0x1.fd415bb39db27p-7 - want -0x1.fd415bb39db29p-7 - +0.344511 ulp err 1.15549. */ -double -log10 (double x) -{ - /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ - double_t w, z, r, r2, r3, y, invc, logc, kd; - uint64_t ix, iz, tmp; - uint32_t top; - int k, i; - - ix = asuint64 (x); - top = top16 (x); - - if (unlikely (ix - LO < HI - LO)) - { - /* Handle close to 1.0 inputs separately. */ - /* Fix sign of zero with downward rounding when x==1. */ - if (WANT_ROUNDING && unlikely (ix == asuint64 (1.0))) - return 0; - r = x - 1.0; - r2 = r * r; - r3 = r * r2; - /* Worst-case error is around 0.727 ULP. */ - y = r3 - * (B[1] + r * B[2] + r2 * B[3] - + r3 * (B[4] + r * B[5] + r2 * B[6] + r3 * (B[7] + r * B[8]))); - w = B[0] * r2; /* B[0] == -0.5. */ - /* Scale by 1/ln(10). Polynomial already contains scaling. */ - y = (y + w) + r * InvLn10; - - return eval_as_double (y); - } - if (unlikely (top - 0x0010 >= 0x7ff0 - 0x0010)) - { - /* x < 0x1p-1022 or inf or nan. */ - if (ix * 2 == 0) - return __math_divzero (1); - if (ix == asuint64 (INFINITY)) /* log10(inf) == inf. */ - return x; - if ((top & 0x8000) || (top & 0x7ff0) == 0x7ff0) - return __math_invalid (x); - /* x is subnormal, normalize it. */ - ix = asuint64 (x * 0x1p52); - ix -= 52ULL << 52; - } - - /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. - The range is split into N subintervals. - The ith subinterval contains z and c is near its center. */ - tmp = ix - OFF; - i = (tmp >> (52 - LOG10_TABLE_BITS)) % N; - k = (int64_t) tmp >> 52; /* arithmetic shift. */ - iz = ix - (tmp & 0xfffULL << 52); - invc = T[i].invc; - logc = T[i].logc; - z = asdouble (iz); - - /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ - /* r ~= z/c - 1, |r| < 1/(2*N). */ -#if HAVE_FAST_FMA - /* rounding error: 0x1p-55/N. 
*/ - r = fma (z, invc, -1.0); -#else - /* rounding error: 0x1p-55/N + 0x1p-66. */ - r = (z - T2[i].chi - T2[i].clo) * invc; -#endif - kd = (double_t) k; - - /* w = log(c) + k*Ln2hi. */ - w = kd * Ln2hi + logc; - - /* log10(x) = (w + r)/log(10) + (log10(1+r) - r/log(10)). */ - r2 = r * r; /* rounding error: 0x1p-54/N^2. */ - y = r2 * A[0] + r * r2 * (A[1] + r * A[2] + r2 * (A[3] + r * A[4])); - - /* Scale by 1/ln(10). Polynomial already contains scaling. */ - y = y + ((r + kd * Ln2lo) + w) * InvLn10; - - return eval_as_double (y); -} -#if USE_GLIBC_ABI -strong_alias (log10, __log10_finite) -hidden_alias (log10, __ieee754_log10) -#if LDBL_MANT_DIG == 53 -long double -log10l (long double x) -{ - return log10 (x); -} -#endif -#endif diff --git a/pl/math/log10_2u1.c b/pl/math/log10_2u1.c new file mode 100644 index 0000000..29860ab --- /dev/null +++ b/pl/math/log10_2u1.c @@ -0,0 +1,135 @@ +/* + * Double-precision log10(x) function. + * + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include +#include +#include + +/* Polynomial coefficients and lookup tables. */ +#define T __log10_data.tab +#define T2 __log10_data.tab2 +#define B __log10_data.poly1 +#define A __log10_data.poly +#define Ln2hi __log10_data.ln2hi +#define Ln2lo __log10_data.ln2lo +#define InvLn10 __log10_data.invln10 +#define N (1 << LOG10_TABLE_BITS) +#define OFF 0x3fe6000000000000 +#define LO asuint64 (1.0 - 0x1p-5) +#define HI asuint64 (1.0 + 0x1.1p-5) + +/* Top 16 bits of a double. */ +static inline uint32_t +top16 (double x) +{ + return asuint64 (x) >> 48; +} + +/* Fast and low accuracy implementation of log10. + The implementation is similar to that of math/log, except that: + - Polynomials are computed for log10(1+r) with r on same intervals as log. + - Lookup parameters are scaled (at runtime) to switch from base e to base 10. + Max ULP error: < 1.7 ulp (nearest rounding.) + with (LOG10_POLY1_ORDER = 10, LOG10_POLY_ORDER = 6, N = 128) + Many errors above 2.08 ulp are observed across the whole range of doubles. + The greatest observed error is 2.09 ulp, at around 2.66e-127: + log10(0x1.713b77689f011p-421) got -0x1.fa4c5bacfbe41p+6 + want -0x1.fa4c5bacfbe43p+6. */ +double +log10 (double x) +{ + /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ + double_t w, z, r, r2, r3, y, invc, logc, kd; + uint64_t ix, iz, tmp; + uint32_t top; + int k, i; + + ix = asuint64 (x); + top = top16 (x); + + if (unlikely (ix - LO < HI - LO)) + { + /* Handle close to 1.0 inputs separately. */ + /* Fix sign of zero with downward rounding when x==1. */ + if (WANT_ROUNDING && unlikely (ix == asuint64 (1.0))) + return 0; + r = x - 1.0; + r2 = r * r; + r3 = r * r2; + /* Worst-case error is around 0.727 ULP. */ + y = r3 + * (B[1] + r * B[2] + r2 * B[3] + + r3 * (B[4] + r * B[5] + r2 * B[6] + r3 * (B[7] + r * B[8]))); + w = B[0] * r2; /* B[0] == -0.5. */ + /* Scale by 1/ln(10). Polynomial already contains scaling. */ + y = (y + w) + r * InvLn10; + + return eval_as_double (y); + } + if (unlikely (top - 0x0010 >= 0x7ff0 - 0x0010)) + { + /* x < 0x1p-1022 or inf or nan. */ + if (ix * 2 == 0) + return __math_divzero (1); + if (ix == asuint64 (INFINITY)) /* log10(inf) == inf. */ + return x; + if ((top & 0x8000) || (top & 0x7ff0) == 0x7ff0) + return __math_invalid (x); + /* x is subnormal, normalize it. */ + ix = asuint64 (x * 0x1p52); + ix -= 52ULL << 52; + } + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. 
+ The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = ix - OFF; + i = (tmp >> (52 - LOG10_TABLE_BITS)) % N; + k = (int64_t) tmp >> 52; /* arithmetic shift. */ + iz = ix - (tmp & 0xfffULL << 52); + invc = T[i].invc; + logc = T[i].logc; + z = asdouble (iz); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ + /* r ~= z/c - 1, |r| < 1/(2*N). */ +#if HAVE_FAST_FMA + /* rounding error: 0x1p-55/N. */ + r = fma (z, invc, -1.0); +#else + /* rounding error: 0x1p-55/N + 0x1p-66. */ + r = (z - T2[i].chi - T2[i].clo) * invc; +#endif + kd = (double_t) k; + + /* w = log(c) + k*Ln2hi. */ + w = kd * Ln2hi + logc; + + /* log10(x) = (w + r)/log(10) + (log10(1+r) - r/log(10)). */ + r2 = r * r; /* rounding error: 0x1p-54/N^2. */ + y = r2 * A[0] + r * r2 * (A[1] + r * A[2] + r2 * (A[3] + r * A[4])); + + /* Scale by 1/ln(10). Polynomial already contains scaling. */ + y = y + ((r + kd * Ln2lo) + w) * InvLn10; + + return eval_as_double (y); +} + +// clang-format off +#if USE_GLIBC_ABI +strong_alias (log10, __log10_finite) +hidden_alias (log10, __ieee754_log10) +#if LDBL_MANT_DIG == 53 +long double +log10l (long double x) +{ + return log10 (x); +} +#endif +#endif +// clang-format on diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 72cad47..6f936f8 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -48,7 +48,7 @@ t log10f 0x1p-26 0x1p3 50000 t log10f 0x1p-4 0x1p4 50000 t log10f 0 inf 50000 -L=1.15 +L=1.6 Ldir= t log10 0 0xffff000000000000 10000 t log10 0x1p-4 0x1p4 40000 -- cgit v1.2.3 From f75d8045e2be293c4d1b8a54267e4db4feec5f0c Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Thu, 16 Jun 2022 12:06:36 +0100 Subject: pl/math: Add scalar atan2 Ran tests and benchmarks. The new routine is accurate to 2.0 ulps. --- pl/math/atan2_2u.c | 158 ++++++++++++++++++++++++++++++ pl/math/atan_common.h | 73 ++++++++++++++ pl/math/atan_data.c | 20 ++++ pl/math/include/mathlib.h | 1 + pl/math/math_config.h | 6 ++ pl/math/test/mathbench_funcs.h | 3 +- pl/math/test/mathbench_wrappers.h | 5 + pl/math/test/runulp.sh | 7 ++ pl/math/test/testcases/directed/atan2.tst | 110 +++++++++++++++++++++ pl/math/test/ulp_funcs.h | 1 + pl/math/tools/atan.sollya | 23 +++++ 11 files changed, 406 insertions(+), 1 deletion(-) create mode 100644 pl/math/atan2_2u.c create mode 100644 pl/math/atan_common.h create mode 100644 pl/math/atan_data.c create mode 100644 pl/math/test/testcases/directed/atan2.tst create mode 100644 pl/math/tools/atan.sollya diff --git a/pl/math/atan2_2u.c b/pl/math/atan2_2u.c new file mode 100644 index 0000000..9bd88ef --- /dev/null +++ b/pl/math/atan2_2u.c @@ -0,0 +1,158 @@ +/* + * Double-precision scalar atan2(x) function. + * + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "atan_common.h" + +#define Pi (0x1.921fb54442d18p+1) +#define PiOver2 (0x1.921fb54442d18p+0) +#define PiOver4 (0x1.921fb54442d18p-1) +#define SignMask (0x8000000000000000) +#define ExpMask (0x7ff0000000000000) + +/* We calculate atan2 by P(n/d), where n and d are similar to the input + arguments, and P is a polynomial. Evaluating P(x) requires calculating x^8, + which may underflow if n and d have very different magnitude. + POW8_EXP_UFLOW_BOUND is the lower bound of the difference in exponents of n + and d for which P underflows, and is used to special-case such inputs. 
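+   With an exponent difference of 62 or more, z = n/d is below 2^-61, so
+   z^3 * P(z^2) is more than 2^120 times smaller than z itself, far below
+   the 53-bit precision of a double, and the highest power z^19
+   underflows to zero; atan(z) then simply rounds to z.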
*/ +#define POW8_EXP_UFLOW_BOUND 62 + +static inline int64_t +biased_exponent (double f) +{ + uint64_t fi = asuint64 (f); + int64_t ex = (fi & ExpMask) >> 52; + if (unlikely (ex == 0)) + { + /* Subnormal case - we still need to get the exponent right for subnormal + numbers as division may take us back inside the normal range. */ + return ex - __builtin_clz (fi << 12); + } + return ex; +} + +/* Fast implementation of scalar atan2. Errors are greatest when y and + x are reasonably close together. Maximum observed error is 2.0 ulps: + atan2(0x1.8d9621df2f329p+2, 0x1.884cf49437972p+2) + got 0x1.958cd0e8c618bp-1 want 0x1.958cd0e8c618dp-1. */ +double +atan2 (double y, double x) +{ + uint64_t ix = asuint64 (x); + uint64_t iy = asuint64 (y); + + uint64_t sign_x = ix & SignMask; + uint64_t sign_y = iy & SignMask; + + uint64_t iax = ix & ~SignMask; + uint64_t iay = iy & ~SignMask; + + /* x or y is NaN. */ + if ((iax > 0x7ff0000000000000) || (iay > 0x7ff0000000000000)) + { + if (unlikely ((iax > 0x7f80000000000000) && (iay > 0x7f80000000000000))) + { + /* Both are NaN. Force sign to be +ve. */ + return (asdouble (iax) + asdouble (iay)); + } + return x + y; + } + + /* m = 2 * sign(x) + sign(y). */ + uint32_t m = ((iy >> 63) & 1) | ((ix >> 62) & 2); + + int64_t exp_diff = biased_exponent (x) - biased_exponent (y); + + /* y = 0. */ + if (iay == 0) + { + switch (m) + { + case 0: + case 1: + return y; /* atan(+-0,+anything)=+-0. */ + case 2: + return Pi; /* atan(+0,-anything) = pi. */ + case 3: + return -Pi; /* atan(-0,-anything) =-pi. */ + } + } + /* Special case for (x, y) either on or very close to the y axis. Either x = + 0, or y is much larger than x (difference in exponents >= + POW8_EXP_UFLOW_BOUND). */ + if (unlikely (iax == 0 || exp_diff <= -POW8_EXP_UFLOW_BOUND)) + return sign_y ? -PiOver2 : PiOver2; + + /* Special case for either x is INF or (x, y) is very close to x axis and x is + negative. */ + if (unlikely (iax == 0x7ff0000000000000 + || (exp_diff >= POW8_EXP_UFLOW_BOUND && m >= 2))) + { + if (iay == 0x7ff0000000000000) + { + switch (m) + { + case 0: + return PiOver4; /* atan(+INF,+INF). */ + case 1: + return -PiOver4; /* atan(-INF,+INF). */ + case 2: + return 3.0 * PiOver4; /* atan(+INF,-INF). */ + case 3: + return -3.0 * PiOver4; /* atan(-INF,-INF). */ + } + } + else + { + switch (m) + { + case 0: + return 0.0; /* atan(+...,+INF). */ + case 1: + return -0.0; /* atan(-...,+INF). */ + case 2: + return Pi; /* atan(+...,-INF). */ + case 3: + return -Pi; /* atan(-...,-INF). */ + } + } + } + /* y is INF. */ + if (iay == 0x7ff0000000000000) + return sign_y ? -PiOver2 : PiOver2; + + uint64_t sign_xy = sign_x ^ sign_y; + + double ax = asdouble (iax); + double ay = asdouble (iay); + uint64_t pred_aygtax = (ay > ax); + + /* Set up z for call to atan. */ + double n = pred_aygtax ? -ax : ay; + double d = pred_aygtax ? ay : ax; + double z = n / d; + + double ret; + if (unlikely (m < 2 && exp_diff >= POW8_EXP_UFLOW_BOUND)) + { + /* If (x, y) is very close to x axis and x is positive, the polynomial + will underflow and evaluate to z. */ + ret = z; + } + else + { + /* Work out the correct shift. */ + double shift = sign_x ? -2.0 : 0.0; + shift = pred_aygtax ? shift + 1.0 : shift; + shift *= PiOver2; + + ret = eval_poly (z, z, shift); + } + + /* Account for the sign of x and y. 
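+     ret was computed as if y had the same sign as x (the sign_xy == 0
+     case); since atan2 (-y, x) = -atan2 (y, x), xoring in
+     sign_xy = sign(x) ^ sign(y) fixes up the remaining case with a
+     single bit flip.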
*/ + return asdouble (asuint64 (ret) ^ sign_xy); +} diff --git a/pl/math/atan_common.h b/pl/math/atan_common.h new file mode 100644 index 0000000..1690e7e --- /dev/null +++ b/pl/math/atan_common.h @@ -0,0 +1,73 @@ +/* + * Double-precision polynomial evaluation function for scalar and vector atan(x) + * and atan2(y,x). + * + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#if V_SUPPORTED + +#include "v_math.h" + +#define DBL_T v_f64_t +#define FMA v_fma_f64 +#define P(i) v_f64 (__atan_poly_data.poly[i]) + +#else + +#define DBL_T double +#define FMA fma +#define P(i) __atan_poly_data.poly[i] + +#endif + +/* Polynomial used in fast atan(x) and atan2(y,x) implementations + The order 19 polynomial P approximates (atan(sqrt(x))-sqrt(x))/x^(3/2). */ +static inline DBL_T +eval_poly (DBL_T z, DBL_T az, DBL_T shift) +{ + /* Use full Estrin scheme for P(z^2) with deg(P)=19. */ + DBL_T z2 = z * z; + /* Level 1. */ + DBL_T P_1_0 = FMA (P (1), z2, P (0)); + DBL_T P_3_2 = FMA (P (3), z2, P (2)); + DBL_T P_5_4 = FMA (P (5), z2, P (4)); + DBL_T P_7_6 = FMA (P (7), z2, P (6)); + DBL_T P_9_8 = FMA (P (9), z2, P (8)); + DBL_T P_11_10 = FMA (P (11), z2, P (10)); + DBL_T P_13_12 = FMA (P (13), z2, P (12)); + DBL_T P_15_14 = FMA (P (15), z2, P (14)); + DBL_T P_17_16 = FMA (P (17), z2, P (16)); + DBL_T P_19_18 = FMA (P (19), z2, P (18)); + + /* Level 2. */ + DBL_T x2 = z2 * z2; + DBL_T P_3_0 = FMA (P_3_2, x2, P_1_0); + DBL_T P_7_4 = FMA (P_7_6, x2, P_5_4); + DBL_T P_11_8 = FMA (P_11_10, x2, P_9_8); + DBL_T P_15_12 = FMA (P_15_14, x2, P_13_12); + DBL_T P_19_16 = FMA (P_19_18, x2, P_17_16); + + /* Level 3. */ + DBL_T x4 = x2 * x2; + DBL_T P_7_0 = FMA (P_7_4, x4, P_3_0); + DBL_T P_15_8 = FMA (P_15_12, x4, P_11_8); + + /* Level 4. */ + DBL_T x8 = x4 * x4; + DBL_T y = FMA (P_19_16, x8, P_15_8); + y = FMA (y, x8, P_7_0); + + /* Finalize. y = shift + z + z^3 * P(z^2). */ + y = FMA (y, z2 * az, az); + y = y + shift; + + return y; +} + +#undef DBL_T +#undef FMA +#undef P diff --git a/pl/math/atan_data.c b/pl/math/atan_data.c new file mode 100644 index 0000000..fa34d11 --- /dev/null +++ b/pl/math/atan_data.c @@ -0,0 +1,20 @@ +/* + * Double-precision polynomial coefficients for vector atan(x) and atan2(y,x). + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +const struct atan_poly_data __atan_poly_data = { + .poly = {/* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-1022, 1.0]. See atan.sollya for details of how these were + generated. 
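+	   The leading coefficients track the Taylor expansion
+	   atan(x) = x - x^3/3 + x^5/5 - ...: -0x1.5555555555555p-2 is
+	   nearly -1/3 and 0x1.99999999996c1p-3 nearly 1/5, with later
+	   terms perturbed by fpminimax to balance the error over the
+	   interval.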
*/ + -0x1.5555555555555p-2, 0x1.99999999996c1p-3, -0x1.2492492478f88p-3, + 0x1.c71c71bc3951cp-4, -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4, + -0x1.11100ee084227p-4, 0x1.e1d0f9696f63bp-5, -0x1.aebfe7b418581p-5, + 0x1.842dbe9b0d916p-5, -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5, + -0x1.00e6eece7de8p-5, 0x1.860897b29e5efp-6, -0x1.0051381722a59p-6, + 0x1.14e9dc19a4a4ep-7, -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10, + -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16}}; diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 7813390..53d4365 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -13,6 +13,7 @@ float erfcf (float); float erff (float); float log10f (float); +double atan2 (double, double); double log10 (double); float __s_erfcf (float); diff --git a/pl/math/math_config.h b/pl/math/math_config.h index 4b009c1..b5a7bb9 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -407,4 +407,10 @@ extern const struct v_erff_data { float coeffs[V_ERFF_NCOEFFS][2]; } __v_erff_data HIDDEN; + +#define ATAN_POLY_NCOEFFS 20 +extern const struct atan_poly_data +{ + double poly[ATAN_POLY_NCOEFFS]; +} __atan_poly_data HIDDEN; #endif diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 5703089..d67205c 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -9,6 +9,7 @@ F (erfcf, -4.0, 10.0) F (erff, -4.0, 4.0) F (log10f, 0.01, 11.1) +{"atan2", 'd', 0, -10.0, 10.0, {.d = atan2_wrap}}, D (erf, -6,6) D (erfc, -6.0, 28.0) D (log10, 0.01, 11.1) @@ -48,4 +49,4 @@ VND (_ZGVnN2v_log10, 0.01, 11.1) #endif #endif #endif -// clang-format on + // clang-format on diff --git a/pl/math/test/mathbench_wrappers.h b/pl/math/test/mathbench_wrappers.h index 8f85079..fce0add 100644 --- a/pl/math/test/mathbench_wrappers.h +++ b/pl/math/test/mathbench_wrappers.h @@ -5,3 +5,8 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +static double +atan2_wrap (double x) +{ + return atan2 (5.0, x); +} diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 6f936f8..bcfbb38 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -71,6 +71,13 @@ t erfcf 0x1p-26 0x1p5 40000 t erfcf -0x1p-26 -0x1p3 40000 t erfcf 0 inf 40000 +L=2.0 +t atan2 -10.0 10.0 50000 +t atan2 -1.0 1.0 40000 +t atan2 0.0 1.0 40000 +t atan2 1.0 100.0 40000 +t atan2 1e6 1e32 40000 + done # vector functions diff --git a/pl/math/test/testcases/directed/atan2.tst b/pl/math/test/testcases/directed/atan2.tst new file mode 100644 index 0000000..df16d41 --- /dev/null +++ b/pl/math/test/testcases/directed/atan2.tst @@ -0,0 +1,110 @@ +; atan2.tst +; +; Copyright (c) 1999-2022, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=atan2 op1=7ff00000.00000001 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff00000.00000001 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff00000.00000001 op2=7ff00000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff00000.00000001 op2=fff00000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff00000.00000001 op2=00000000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff00000.00000001 op2=80000000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff00000.00000001 op2=3ff00000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff00000.00000001 op2=bff00000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff00000.00000001 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff00000.00000001 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff00000.00000001 op2=7ff00000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff00000.00000001 op2=fff00000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff00000.00000001 op2=00000000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff00000.00000001 op2=80000000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff00000.00000001 op2=3ff00000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff00000.00000001 op2=bff00000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff80000.00000001 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff80000.00000001 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff80000.00000001 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=7ff80000.00000001 op2=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=7ff80000.00000001 op2=7ff00000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=7ff80000.00000001 op2=fff00000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=7ff80000.00000001 op2=00000000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=7ff80000.00000001 op2=80000000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=7ff80000.00000001 op2=3ff00000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=7ff80000.00000001 op2=bff00000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=fff80000.00000001 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff80000.00000001 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff80000.00000001 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=fff80000.00000001 op2=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=fff80000.00000001 op2=7ff00000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=fff80000.00000001 op2=fff00000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=fff80000.00000001 op2=00000000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=fff80000.00000001 op2=80000000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=fff80000.00000001 op2=3ff00000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=fff80000.00000001 op2=bff00000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=7ff00000.00000000 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i 
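+;
+; A short key to the vector format above, for readability (inferred from
+; the mathtest/ulp harness conventions rather than stated anywhere in
+; this patch, so treat it as an informal note): op1, op2 and result are
+; raw IEEE-754 double bit patterns written as high.low 32-bit words,
+; e.g. 7ff00000.00000000 is +Inf and 3ff00000.00000000 is 1.0. An
+; operand such as 7ff00000.00000001 is a signalling NaN, so the invalid
+; flag (status=i) is expected and a quiet NaN comes back, while
+; quiet-NaN operands like 7ff80000.00000001 propagate without raising
+; anything. A third hex group in a result (e.g. 3fe921fb.54442d18.469
+; for pi/4) appears to carry bits of the true answer beyond double
+; precision, letting the checker score sub-ulp error against the bounds
+; set in runulp.sh.
+;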
+func=atan2 op1=7ff00000.00000000 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff00000.00000000 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=7ff00000.00000000 op2=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=7ff00000.00000000 op2=7ff00000.00000000 result=3fe921fb.54442d18.469 errno=0 +func=atan2 op1=7ff00000.00000000 op2=fff00000.00000000 result=4002d97c.7f3321d2.34f errno=0 +func=atan2 op1=7ff00000.00000000 op2=00000000.00000000 result=3ff921fb.54442d18.469 errno=0 +func=atan2 op1=7ff00000.00000000 op2=80000000.00000000 result=3ff921fb.54442d18.469 errno=0 +func=atan2 op1=7ff00000.00000000 op2=3ff00000.00000000 result=3ff921fb.54442d18.469 errno=0 +func=atan2 op1=7ff00000.00000000 op2=bff00000.00000000 result=3ff921fb.54442d18.469 errno=0 +func=atan2 op1=fff00000.00000000 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff00000.00000000 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff00000.00000000 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=fff00000.00000000 op2=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=fff00000.00000000 op2=7ff00000.00000000 result=bfe921fb.54442d18.469 errno=0 +func=atan2 op1=fff00000.00000000 op2=fff00000.00000000 result=c002d97c.7f3321d2.34f errno=0 +func=atan2 op1=fff00000.00000000 op2=00000000.00000000 result=bff921fb.54442d18.469 errno=0 +func=atan2 op1=fff00000.00000000 op2=80000000.00000000 result=bff921fb.54442d18.469 errno=0 +func=atan2 op1=fff00000.00000000 op2=3ff00000.00000000 result=bff921fb.54442d18.469 errno=0 +func=atan2 op1=fff00000.00000000 op2=bff00000.00000000 result=bff921fb.54442d18.469 errno=0 +func=atan2 op1=00000000.00000000 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=00000000.00000000 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=00000000.00000000 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=00000000.00000000 op2=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=00000000.00000000 op2=7ff00000.00000000 result=00000000.00000000 errno=0 +func=atan2 op1=00000000.00000000 op2=fff00000.00000000 result=400921fb.54442d18.469 errno=0 +func=atan2 op1=00000000.00000000 op2=00000000.00000000 result=00000000.00000000 errno=0 +func=atan2 op1=00000000.00000000 op2=80000000.00000000 result=400921fb.54442d18.469 errno=0 +func=atan2 op1=00000000.00000000 op2=3ff00000.00000000 result=00000000.00000000 errno=0 +func=atan2 op1=00000000.00000000 op2=bff00000.00000000 result=400921fb.54442d18.469 errno=0 +; No exception is raised on certain machines (different version of glibc) +; Same issue encountered with other function similar to x close to 0 +; Could be due to function so boring no flop is involved in some implementations +func=atan2 op1=00000000.00000001 op2=3ff00000.00000000 result=00000000.00000001 errno=0 maybestatus=ux +func=atan2 op1=80000000.00000000 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=80000000.00000000 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=80000000.00000000 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=80000000.00000000 op2=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=80000000.00000000 op2=7ff00000.00000000 result=80000000.00000000 errno=0 +func=atan2 op1=80000000.00000000 op2=fff00000.00000000 
result=c00921fb.54442d18.469 errno=0 +func=atan2 op1=80000000.00000000 op2=00000000.00000000 result=80000000.00000000 errno=0 +func=atan2 op1=80000000.00000000 op2=80000000.00000000 result=c00921fb.54442d18.469 errno=0 +func=atan2 op1=80000000.00000000 op2=3ff00000.00000000 result=80000000.00000000 errno=0 +func=atan2 op1=80000000.00000000 op2=bff00000.00000000 result=c00921fb.54442d18.469 errno=0 +; No exception is raised on certain machines (different version of glibc) +; Same issue encountered with other function similar to x close to 0 +; Could be due to function so boring no flop is involved in some implementations +func=atan2 op1=80000000.00000001 op2=3ff00000.00000000 result=80000000.00000001 errno=0 maybestatus=ux +func=atan2 op1=3ff00000.00000000 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=3ff00000.00000000 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=3ff00000.00000000 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=3ff00000.00000000 op2=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=3ff00000.00000000 op2=7ff00000.00000000 result=00000000.00000000 errno=0 +func=atan2 op1=3ff00000.00000000 op2=fff00000.00000000 result=400921fb.54442d18.469 errno=0 +func=atan2 op1=3ff00000.00000000 op2=00000000.00000000 result=3ff921fb.54442d18.469 errno=0 +func=atan2 op1=3ff00000.00000000 op2=80000000.00000000 result=3ff921fb.54442d18.469 errno=0 +func=atan2 op1=3ff00000.00000000 op2=3ff00000.00000000 result=3fe921fb.54442d18.469 errno=0 +func=atan2 op1=3ff00000.00000000 op2=bff00000.00000000 result=4002d97c.7f3321d2.34f errno=0 +func=atan2 op1=bff00000.00000000 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=bff00000.00000000 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=bff00000.00000000 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=bff00000.00000000 op2=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=bff00000.00000000 op2=7ff00000.00000000 result=80000000.00000000 errno=0 +func=atan2 op1=bff00000.00000000 op2=fff00000.00000000 result=c00921fb.54442d18.469 errno=0 +func=atan2 op1=bff00000.00000000 op2=00000000.00000000 result=bff921fb.54442d18.469 errno=0 +func=atan2 op1=bff00000.00000000 op2=80000000.00000000 result=bff921fb.54442d18.469 errno=0 +func=atan2 op1=bff00000.00000000 op2=3ff00000.00000000 result=bfe921fb.54442d18.469 errno=0 +func=atan2 op1=bff00000.00000000 op2=bff00000.00000000 result=c002d97c.7f3321d2.34f errno=0 +func=atan2 op1=3ff00000.00000000 op2=3ff00000.00000000 result=3fe921fb.54442d18 errno=0 diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 0592794..f84afe3 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -8,6 +8,7 @@ F1 (erfc) F1 (erf) F1 (log10) +D2 (atan2) D1 (erfc) D1 (log10) #if WANT_VMATH diff --git a/pl/math/tools/atan.sollya b/pl/math/tools/atan.sollya new file mode 100644 index 0000000..f1f33c5 --- /dev/null +++ b/pl/math/tools/atan.sollya @@ -0,0 +1,23 @@ +// polynomial for approximating atan(x) and atan2(y, x) +// +// Copyright (c) 2022, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +// atan is odd, so approximate with an odd polynomial: +// x + ax^3 + bx^5 + cx^7 + ... +// We generate a, b, c, ... such that we can approximate atan(x) by: +// x + x^3 * (a + bx^2 + cx^4 + ...) 
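+
+// Worked through with deg = 20 as set below: mons starts as the list
+// [|1,...,20|] and the loop maps each entry m to 2*m+1, giving the odd
+// monomials [|3,5,...,41|]. fpminimax then fits atan(x)-x with one
+// double coefficient per odd power x^3..x^41 over [2^-1022, 1], and
+// those 20 values are exactly the ATAN_POLY_NCOEFFS entries of P in
+// atan(x) ~ x + x^3*P(x^2) consumed by atan_common.h.
+// To sanity-check a fit of this kind one could append (illustrative
+// lines, not part of the original script):
+//   err = dirtyinfnorm(atan(x)-x-poly, [a;b]);
+//   print("approx abs error:", err);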
+ +// Assemble monomials +deg = 20; +mons = [|1,...,deg|]; +for i from 0 to deg-1 do mons[i] = mons[i] * 2 + 1; + +a = 0x1.0p-1022; +b = 1; + +poly = fpminimax(atan(x)-x, mons, [|double ...|], [a;b]); + +display = hexadecimal; +print("coeffs:"); +for i from 0 to deg-1 do coeff(poly,mons[i]); -- cgit v1.2.3 From 2eedcfa316be6a8b74e062e903a7551fb4b7ddd8 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Thu, 16 Jun 2022 12:07:10 +0100 Subject: pl/math: Add scalar atan2f Ran make check and benchmarks. New routine is accurate to 3.0 ULP. --- pl/math/atan2f_3u.c | 146 +++++++++++++++++++++++++++++ pl/math/atanf_common.h | 55 +++++++++++ pl/math/atanf_data.c | 15 +++ pl/math/include/mathlib.h | 1 + pl/math/math_config.h | 6 ++ pl/math/test/mathbench_funcs.h | 1 + pl/math/test/mathbench_wrappers.h | 6 ++ pl/math/test/runulp.sh | 7 ++ pl/math/test/testcases/directed/atan2f.tst | 121 ++++++++++++++++++++++++ pl/math/test/ulp_funcs.h | 2 +- pl/math/tools/atanf.sollya | 20 ++++ 11 files changed, 379 insertions(+), 1 deletion(-) create mode 100644 pl/math/atan2f_3u.c create mode 100644 pl/math/atanf_common.h create mode 100644 pl/math/atanf_data.c create mode 100644 pl/math/test/testcases/directed/atan2f.tst create mode 100644 pl/math/tools/atanf.sollya diff --git a/pl/math/atan2f_3u.c b/pl/math/atan2f_3u.c new file mode 100644 index 0000000..7d83b67 --- /dev/null +++ b/pl/math/atan2f_3u.c @@ -0,0 +1,146 @@ +/* + * Single-precision scalar atan2(x) function. + * + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include + +#include "math_config.h" +#include "atanf_common.h" + +#define Pi (0x1.921fb6p+1f) +#define PiOver2 (0x1.921fb6p+0f) +#define PiOver4 (0x1.921fb6p-1f) +#define SignMask (0x80000000) + +static inline int32_t +biased_exponent (float f) +{ + uint32_t fi = asuint (f); + int32_t ex = (int32_t) ((fi & 0x7f800000) >> 23); + if (unlikely (ex == 0)) + { + /* Subnormal case - we still need to get the exponent right for subnormal + numbers as division may take us back inside the normal range. */ + return ex - __builtin_clz (fi << 9); + } + return ex; +} + +/* Fast implementation of scalar atan2f. Largest observed error is + 2.88ulps in [99.0, 101.0] x [99.0, 101.0]: + atan2f(0x1.9332d8p+6, 0x1.8cb6c4p+6) got 0x1.964646p-1 + want 0x1.964640p-1. */ +float +atan2f (float y, float x) +{ + uint32_t ix = asuint (x); + uint32_t iy = asuint (y); + + uint32_t sign_x = ix & SignMask; + uint32_t sign_y = iy & SignMask; + + uint32_t iax = ix & ~SignMask; + uint32_t iay = iy & ~SignMask; + + /* x or y is NaN. */ + if ((iax > 0x7f800000) || (iay > 0x7f800000)) + { + if (unlikely ((iax > 0x7f800000) && (iay > 0x7f800000))) + { + /* Both are NaN. Force sign to be +ve. */ + return (asfloat (iax) + asfloat (iay)); + } + return x + y; + } + + /* m = 2 * sign(x) + sign(y). */ + uint32_t m = ((iy >> 31) & 1) | ((ix >> 30) & 2); + + /* The following follows glibc ieee754 implementation, except + that we do not use +-tiny shifts (non-nearest rounding mode). */ + + int32_t exp_diff = biased_exponent (x) - biased_exponent (y); + + /* Special case for (x, y) either on or very close to the x axis. Either y = + 0, or y is tiny and x is huge (difference in exponents >= 126). In the + second case, we only want to use this special case when x is negative (i.e. + quadrants 2 or 3). */ + if (unlikely (iay == 0 || (exp_diff >= 126 && m >= 2))) + { + switch (m) + { + case 0: + case 1: + return y; /* atan(+-0,+anything)=+-0. 
*/
+      case 2:
+        return Pi; /* atan(+0,-anything) = pi.  */
+      case 3:
+        return -Pi; /* atan(-0,-anything) =-pi.  */
+      }
+    }
+  /* Special case for (x, y) either on or very close to the y axis. Either x =
+     0, or x is tiny and y is huge (difference in exponents >= 126).  */
+  if (unlikely (iax == 0 || exp_diff <= -126))
+    return sign_y ? -PiOver2 : PiOver2;
+
+  /* x is INF.  */
+  if (iax == 0x7f800000)
+    {
+      if (iay == 0x7f800000)
+        {
+          switch (m)
+            {
+            case 0:
+              return PiOver4; /* atan(+INF,+INF).  */
+            case 1:
+              return -PiOver4; /* atan(-INF,+INF).  */
+            case 2:
+              return 3.0f * PiOver4; /* atan(+INF,-INF).  */
+            case 3:
+              return -3.0f * PiOver4; /* atan(-INF,-INF).  */
+            }
+        }
+      else
+        {
+          switch (m)
+            {
+            case 0:
+              return 0.0f; /* atan(+...,+INF).  */
+            case 1:
+              return -0.0f; /* atan(-...,+INF).  */
+            case 2:
+              return Pi; /* atan(+...,-INF).  */
+            case 3:
+              return -Pi; /* atan(-...,-INF).  */
+            }
+        }
+    }
+  /* y is INF.  */
+  if (iay == 0x7f800000)
+    return sign_y ? -PiOver2 : PiOver2;
+
+  uint32_t sign_xy = sign_x ^ sign_y;
+
+  float ax = asfloat (iax);
+  float ay = asfloat (iay);
+
+  bool pred_aygtax = (ay > ax);
+
+  /* Set up z for call to atanf.  */
+  float n = pred_aygtax ? -ax : ay;
+  float d = pred_aygtax ? ay : ax;
+  float z = n / d;
+
+  /* Work out the correct shift.  */
+  float shift = sign_x ? -2.0f : 0.0f;
+  shift = pred_aygtax ? shift + 1.0f : shift;
+  shift *= PiOver2;
+
+  float ret = eval_poly (z, z, shift);
+
+  /* Account for the sign of x and y.  */
+  return asfloat (asuint (ret) ^ sign_xy);
+}
diff --git a/pl/math/atanf_common.h b/pl/math/atanf_common.h
new file mode 100644
index 0000000..55cee89
--- /dev/null
+++ b/pl/math/atanf_common.h
@@ -0,0 +1,55 @@
+/*
+ * Single-precision polynomial evaluation function for scalar and vector
+ * atan(x) and atan2(y,x).
+ *
+ * Copyright (c) 2021-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef PL_MATH_ATANF_COMMON_H
+#define PL_MATH_ATANF_COMMON_H
+
+#include "math_config.h"
+
+#if V_SUPPORTED
+
+#include "v_math.h"
+
+#define FLT_T v_f32_t
+#define FMA v_fma_f32
+#define P(i) v_f32 (__atanf_poly_data.poly[i])
+
+#else
+
+#define FLT_T float
+#define FMA fmaf
+#define P(i) __atanf_poly_data.poly[i]
+
+#endif
+
+/* Polynomial used in fast atanf(x) and atan2f(y,x) implementations.
+   The order 7 polynomial P approximates (atan(sqrt(x))-sqrt(x))/x^(3/2).  */
+static inline FLT_T
+eval_poly (FLT_T z, FLT_T az, FLT_T shift)
+{
+  /* Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However,
+     a standard implementation using z8 creates spurious underflow
+     in the very last fma (when z^8 is small enough).
+     Therefore, we split the last fma into a mul and an fma.
+     Horner and single-level Estrin have higher errors that exceed
+     the threshold.  */
+  FLT_T z2 = z * z;
+  FLT_T z4 = z2 * z2;
+
+  /* Then assemble polynomial.  */
+  FLT_T y
+    = FMA (z4,
+           z4 * FMA (z4, (FMA (z2, P (7), P (6))), (FMA (z2, P (5), P (4)))),
+           FMA (z4, (FMA (z2, P (3), P (2))), (FMA (z2, P (1), P (0)))));
+
+  /* Finalize:
+     y = shift + z + z^3 * P(z^2).  */
+  return FMA (y, z2 * az, az) + shift;
+}
+
+#endif // PL_MATH_ATANF_COMMON_H
diff --git a/pl/math/atanf_data.c b/pl/math/atanf_data.c
new file mode 100644
index 0000000..8ea952a
--- /dev/null
+++ b/pl/math/atanf_data.c
@@ -0,0 +1,15 @@
+/*
+ * Single-precision polynomial coefficients for vector atan(x) and atan2(y,x).
+ *
+ * Copyright (c) 2019-2022, Arm Limited.
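+ *
+ * Note: the eight values below fill ATANF_POLY_NCOEFFS, declared in the
+ * math_config.h hunk later in this patch; they come from the degree-7
+ * fit in atanf.sollya, one coefficient per odd power x^3..x^17 of the
+ * final atanf(x) ~ x + x^3*P(x^2) form.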
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-128, 1.0]. + */ +const struct atanf_poly_data __atanf_poly_data = { + .poly = {/* See atanf.sollya for details of how these were generated. */ + -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f, + -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, 0x1.01fd88p-8f}}; diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 53d4365..1092ea8 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -9,6 +9,7 @@ #ifndef _MATHLIB_H #define _MATHLIB_H +float atan2f (float, float); float erfcf (float); float erff (float); float log10f (float); diff --git a/pl/math/math_config.h b/pl/math/math_config.h index b5a7bb9..47b192c 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -413,4 +413,10 @@ extern const struct atan_poly_data { double poly[ATAN_POLY_NCOEFFS]; } __atan_poly_data HIDDEN; + +#define ATANF_POLY_NCOEFFS 8 +extern const struct atanf_poly_data +{ + float poly[ATANF_POLY_NCOEFFS]; +} __atanf_poly_data HIDDEN; #endif diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index d67205c..9828bae 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -5,6 +5,7 @@ * Copyright (c) 2022, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +{"atan2f", 'f', 0, -10.0, 10.0, {.f = atan2f_wrap}}, F (erfcf, -4.0, 10.0) F (erff, -4.0, 4.0) F (log10f, 0.01, 11.1) diff --git a/pl/math/test/mathbench_wrappers.h b/pl/math/test/mathbench_wrappers.h index fce0add..19cba74 100644 --- a/pl/math/test/mathbench_wrappers.h +++ b/pl/math/test/mathbench_wrappers.h @@ -10,3 +10,9 @@ atan2_wrap (double x) { return atan2 (5.0, x); } + +static float +atan2f_wrap (float x) +{ + return atan2f (5.0f, x); +} diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index bcfbb38..db8460b 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -78,6 +78,13 @@ t atan2 0.0 1.0 40000 t atan2 1.0 100.0 40000 t atan2 1e6 1e32 40000 +L=3.0 +t atan2f -10.0 10.0 50000 +t atan2f -1.0 1.0 40000 +t atan2f 0.0 1.0 40000 +t atan2f 1.0 100.0 40000 +t atan2f 1e6 1e32 40000 + done # vector functions diff --git a/pl/math/test/testcases/directed/atan2f.tst b/pl/math/test/testcases/directed/atan2f.tst new file mode 100644 index 0000000..708e867 --- /dev/null +++ b/pl/math/test/testcases/directed/atan2f.tst @@ -0,0 +1,121 @@ +; atan2f.tst +; +; Copyright (c) 1999-2022, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=atan2f op1=7f800001 op2=7f800001 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800001 op2=ff800001 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800001 op2=7fc00001 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800001 op2=ffc00001 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800001 op2=7f800000 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800001 op2=ff800000 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800001 op2=00000000 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800001 op2=80000000 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800001 op2=3f800000 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800001 op2=bf800000 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800001 op2=7f800001 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800001 op2=ff800001 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800001 op2=7fc00001 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800001 op2=ffc00001 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800001 op2=7f800000 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800001 op2=ff800000 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800001 op2=00000000 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800001 op2=80000000 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800001 op2=3f800000 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800001 op2=bf800000 result=7fc00001 errno=0 status=i +func=atan2f op1=7fc00001 op2=7f800001 result=7fc00001 errno=0 status=i +func=atan2f op1=7fc00001 op2=ff800001 result=7fc00001 errno=0 status=i +func=atan2f op1=7fc00001 op2=7fc00001 result=7fc00001 errno=0 +func=atan2f op1=7fc00001 op2=ffc00001 result=7fc00001 errno=0 +func=atan2f op1=7fc00001 op2=7f800000 result=7fc00001 errno=0 +func=atan2f op1=7fc00001 op2=ff800000 result=7fc00001 errno=0 +func=atan2f op1=7fc00001 op2=00000000 result=7fc00001 errno=0 +func=atan2f op1=7fc00001 op2=80000000 result=7fc00001 errno=0 +func=atan2f op1=7fc00001 op2=3f800000 result=7fc00001 errno=0 +func=atan2f op1=7fc00001 op2=bf800000 result=7fc00001 errno=0 +func=atan2f op1=ffc00001 op2=7f800001 result=7fc00001 errno=0 status=i +func=atan2f op1=ffc00001 op2=ff800001 result=7fc00001 errno=0 status=i +func=atan2f op1=ffc00001 op2=7fc00001 result=ffc00001 errno=0 +func=atan2f op1=ffc00001 op2=ffc00001 result=ffc00001 errno=0 +func=atan2f op1=ffc00001 op2=7f800000 result=ffc00001 errno=0 +func=atan2f op1=ffc00001 op2=ff800000 result=ffc00001 errno=0 +func=atan2f op1=ffc00001 op2=00000000 result=ffc00001 errno=0 +func=atan2f op1=ffc00001 op2=80000000 result=ffc00001 errno=0 +func=atan2f op1=ffc00001 op2=3f800000 result=ffc00001 errno=0 +func=atan2f op1=ffc00001 op2=bf800000 result=ffc00001 errno=0 +func=atan2f op1=7f800000 op2=7f800001 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800000 op2=ff800001 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800000 op2=7fc00001 result=7fc00001 errno=0 +func=atan2f op1=7f800000 op2=ffc00001 result=7fc00001 errno=0 +func=atan2f op1=7f800000 op2=7f800000 result=3f490fda.a22 errno=0 +func=atan2f op1=7f800000 op2=ff800000 result=4016cbe3.f99 errno=0 +func=atan2f op1=7f800000 op2=00000000 result=3fc90fda.a22 errno=0 +func=atan2f op1=7f800000 op2=80000000 result=3fc90fda.a22 errno=0 +func=atan2f op1=7f800000 op2=3f800000 result=3fc90fda.a22 errno=0 +func=atan2f op1=7f800000 op2=bf800000 result=3fc90fda.a22 errno=0 +func=atan2f op1=ff800000 op2=7f800001 result=7fc00001 errno=0 status=i 
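+;
+; Note on the single-precision encodings above (informal, inferred from
+; the IEEE-754 binary32 format rather than stated in this file):
+; 7f800001/ff800001 are signalling NaNs, so they are expected to raise
+; the invalid exception (status=i) and produce a quiet NaN, while
+; 7fc00001/ffc00001 are already quiet NaNs and propagate silently,
+; which is why those rows carry no status field.
+;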
+func=atan2f op1=ff800000 op2=ff800001 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800000 op2=7fc00001 result=7fc00001 errno=0 +func=atan2f op1=ff800000 op2=ffc00001 result=ffc00001 errno=0 +func=atan2f op1=ff800000 op2=7f800000 result=bf490fda.a22 errno=0 +func=atan2f op1=ff800000 op2=ff800000 result=c016cbe3.f99 errno=0 +func=atan2f op1=ff800000 op2=00000000 result=bfc90fda.a22 errno=0 +func=atan2f op1=ff800000 op2=80000000 result=bfc90fda.a22 errno=0 +func=atan2f op1=ff800000 op2=3f800000 result=bfc90fda.a22 errno=0 +func=atan2f op1=ff800000 op2=bf800000 result=bfc90fda.a22 errno=0 +func=atan2f op1=00000000 op2=7f800001 result=7fc00001 errno=0 status=i +func=atan2f op1=00000000 op2=ff800001 result=7fc00001 errno=0 status=i +func=atan2f op1=00000000 op2=7fc00001 result=7fc00001 errno=0 +func=atan2f op1=00000000 op2=ffc00001 result=ffc00001 errno=0 +func=atan2f op1=00000000 op2=7f800000 result=00000000 errno=0 +func=atan2f op1=00000000 op2=ff800000 result=40490fda.a22 errno=0 +func=atan2f op1=00000000 op2=00000000 result=00000000 errno=0 +func=atan2f op1=00000000 op2=80000000 result=40490fda.a22 errno=0 +func=atan2f op1=00000000 op2=3f800000 result=00000000 errno=0 +func=atan2f op1=00000000 op2=bf800000 result=40490fda.a22 errno=0 +; No exception is raised on certain machines (different version of glibc) +; Same issue encountered with other function similar to x close to 0 +; Could be due to function so boring no flop is involved in some implementations +func=atan2f op1=00000001 op2=3f800000 result=00000001 errno=0 maybestatus=ux + +func=atan2f op1=80000000 op2=7f800001 result=7fc00001 errno=0 status=i +func=atan2f op1=80000000 op2=ff800001 result=7fc00001 errno=0 status=i +func=atan2f op1=80000000 op2=7fc00001 result=7fc00001 errno=0 +func=atan2f op1=80000000 op2=ffc00001 result=ffc00001 errno=0 +func=atan2f op1=80000000 op2=7f800000 result=80000000 errno=0 +func=atan2f op1=80000000 op2=ff800000 result=c0490fda.a22 errno=0 +func=atan2f op1=80000000 op2=00000000 result=80000000 errno=0 +func=atan2f op1=80000000 op2=80000000 result=c0490fda.a22 errno=0 +func=atan2f op1=80000000 op2=3f800000 result=80000000 errno=0 +func=atan2f op1=80000000 op2=bf800000 result=c0490fda.a22 errno=0 +; No exception is raised on certain machines (different version of glibc) +; Same issue encountered with other function similar to x close to 0 +; Could be due to function so boring no flop is involved in some implementations +func=atan2f op1=80000001 op2=3f800000 result=80000001 errno=0 maybestatus=ux + +func=atan2f op1=3f800000 op2=7f800001 result=7fc00001 errno=0 status=i +func=atan2f op1=3f800000 op2=ff800001 result=7fc00001 errno=0 status=i +func=atan2f op1=3f800000 op2=7fc00001 result=7fc00001 errno=0 +func=atan2f op1=3f800000 op2=ffc00001 result=ffc00001 errno=0 +func=atan2f op1=3f800000 op2=7f800000 result=00000000 errno=0 +func=atan2f op1=3f800000 op2=ff800000 result=40490fda.a22 errno=0 +func=atan2f op1=3f800000 op2=00000000 result=3fc90fda.a22 errno=0 +func=atan2f op1=3f800000 op2=80000000 result=3fc90fda.a22 errno=0 +func=atan2f op1=3f800000 op2=3f800000 result=3f490fda.a22 errno=0 +func=atan2f op1=3f800000 op2=bf800000 result=4016cbe3.f99 errno=0 +func=atan2f op1=bf800000 op2=7f800001 result=7fc00001 errno=0 status=i +func=atan2f op1=bf800000 op2=ff800001 result=7fc00001 errno=0 status=i +func=atan2f op1=bf800000 op2=7fc00001 result=7fc00001 errno=0 +func=atan2f op1=bf800000 op2=ffc00001 result=ffc00001 errno=0 +func=atan2f op1=bf800000 op2=7f800000 result=80000000 errno=0 +func=atan2f 
op1=bf800000 op2=ff800000 result=c0490fda.a22 errno=0 +func=atan2f op1=bf800000 op2=00000000 result=bfc90fda.a22 errno=0 +func=atan2f op1=bf800000 op2=80000000 result=bfc90fda.a22 errno=0 +func=atan2f op1=bf800000 op2=3f800000 result=bf490fda.a22 errno=0 +func=atan2f op1=bf800000 op2=bf800000 result=c016cbe3.f99 errno=0 +func=atan2f op1=8005f16d op2=002bb601 result=be0a60a5.d88 error=0 +func=atan2f op1=80818ec8 op2=80ba5db9 result=c0222eda.f42 error=0 + +func=atan2f op1=ff7fffff op2=ff7fffff result=c016cbe3.f99 errno=0 +func=atan2f op1=bfc00001 op2=7f7fffff result=80300000.700 errno=0 status=u +func=atan2f op1=80800001 op2=40000000 result=80400000.800 errno=0 status=u diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index f84afe3..16cfb88 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -4,7 +4,7 @@ * Copyright (c) 2022, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ - +F2 (atan2) F1 (erfc) F1 (erf) F1 (log10) diff --git a/pl/math/tools/atanf.sollya b/pl/math/tools/atanf.sollya new file mode 100644 index 0000000..42b8c36 --- /dev/null +++ b/pl/math/tools/atanf.sollya @@ -0,0 +1,20 @@ +// polynomial for approximating atanf(x) +// +// Copyright (c) 2022, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +// Generate list of monomials: +// Taylor series of atan is of the form x + ax^3 + bx^5 + cx^7 + ... +// So generate a, b, c, ... such that we can approximate atan(x) by: +// x + x^3 * (a + bx^2 + cx^4 + ...) + +deg = 7; + +a = 1.1754943508222875e-38; +b = 1; + +poly = fpminimax((atan(sqrt(x))-sqrt(x))/x^(3/2), deg, [|single ...|], [a;b]); + +display = hexadecimal; +print("coeffs:"); +for i from 0 to deg do coeff(poly,i); -- cgit v1.2.3 From 9440393e38953b415e790860691860169b690434 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Thu, 16 Jun 2022 12:07:42 +0100 Subject: pl/math: Add vector/Neon atan2 Successfully ran tests and benchmarks. New routine is accurate to 3.0 ulps. --- pl/math/include/mathlib.h | 4 ++ pl/math/s_atan2_3u.c | 6 +++ pl/math/test/mathbench_funcs.h | 5 +++ pl/math/test/mathbench_wrappers.h | 33 +++++++++++++++++ pl/math/test/runulp.sh | 13 +++++++ pl/math/test/ulp_funcs.h | 4 ++ pl/math/test/ulp_wrappers.h | 5 +++ pl/math/v_atan2_3u.c | 78 +++++++++++++++++++++++++++++++++++++++ pl/math/v_math.h | 33 +++++++++++++++++ pl/math/vn_atan2_3u.c | 12 ++++++ 10 files changed, 193 insertions(+) create mode 100644 pl/math/s_atan2_3u.c create mode 100644 pl/math/v_atan2_3u.c create mode 100644 pl/math/vn_atan2_3u.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 1092ea8..8296e77 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -21,6 +21,7 @@ float __s_erfcf (float); float __s_erff (float); float __s_log10f (float); +double __s_atan2 (double, double); double __s_erf (double); double __s_erfc (double); double __s_log10 (double); @@ -37,6 +38,7 @@ typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; #endif /* Vector functions following the base PCS. */ +__f64x2_t __v_atan2 (__f64x2_t, __f64x2_t); __f32x4_t __v_erff (__f32x4_t); __f64x2_t __v_erf (__f64x2_t); __f32x4_t __v_erfcf (__f32x4_t); @@ -48,6 +50,7 @@ __f64x2_t __v_log10 (__f64x2_t); #define __vpcs __attribute__((__aarch64_vector_pcs__)) /* Vector functions following the vector PCS. 
*/ +__vpcs __f64x2_t __vn_atan2 (__f64x2_t, __f64x2_t); __vpcs __f32x4_t __vn_erff (__f32x4_t); __vpcs __f64x2_t __vn_erf (__f64x2_t); __vpcs __f32x4_t __vn_erfcf (__f32x4_t); @@ -56,6 +59,7 @@ __vpcs __f32x4_t __vn_log10f (__f32x4_t); __vpcs __f64x2_t __vn_log10 (__f64x2_t); /* Vector functions following the vector PCS using ABI names. */ +__vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t); __vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_erfcf (__f32x4_t); diff --git a/pl/math/s_atan2_3u.c b/pl/math/s_atan2_3u.c new file mode 100644 index 0000000..5955e3c --- /dev/null +++ b/pl/math/s_atan2_3u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_atan2_3u.c" diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 9828bae..2e80aa0 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -16,6 +16,7 @@ D (erfc, -6.0, 28.0) D (log10, 0.01, 11.1) #if WANT_VMATH +{"__s_atan2", 'd', 0, -10.0, 10.0, {.d = __s_atan2_wrap}}, F (__s_erff, -4.0, 4.0) D (__s_erf, -6.0, 6.0) F (__s_erfcf, -6.0, 28.0) @@ -23,6 +24,7 @@ D (__s_erfc, -6.0, 28.0) F (__s_log10f, 0.01, 11.1) D (__s_log10, 0.01, 11.1) #if __aarch64__ +{"__v_atan2", 'd', 'v', -10.0, 10.0, {.vd = __v_atan2_wrap}}, VF (__v_erff, -4.0, 4.0) VD (__v_erf, -6.0, 6.0) VF (__v_erfcf, -6.0, 28.0) @@ -30,6 +32,9 @@ VD (__v_erfc, -6.0, 28.0) VD (__v_log10, 0.01, 11.1) VF (__v_log10f, 0.01, 11.1) #ifdef __vpcs +{"__vn_atan2", 'd', 'n', -10.0, 10.0, {.vnd = __vn_atan2_wrap}}, +{"_ZGVnN2vv_atan2", 'd', 'n', -10.0, 10.0, {.vnd = _Z_atan2_wrap}}, + VNF (__vn_erff, -4.0, 4.0) VNF (_ZGVnN4v_erff, -4.0, 4.0) diff --git a/pl/math/test/mathbench_wrappers.h b/pl/math/test/mathbench_wrappers.h index 19cba74..57c0c8f 100644 --- a/pl/math/test/mathbench_wrappers.h +++ b/pl/math/test/mathbench_wrappers.h @@ -16,3 +16,36 @@ atan2f_wrap (float x) { return atan2f (5.0f, x); } + +#if WANT_VMATH +#if __aarch64__ + +static double +__s_atan2_wrap (double x) +{ + return __s_atan2 (5.0, x); +} + +static v_double +__v_atan2_wrap (v_double x) +{ + return __v_atan2 (v_double_dup (5.0), x); +} + +#ifdef __vpcs + +__vpcs static v_double +__vn_atan2_wrap (v_double x) +{ + return __vn_atan2 (v_double_dup (5.0), x); +} + +__vpcs static v_double +_Z_atan2_wrap (v_double x) +{ + return _ZGVnN2vv_atan2 (v_double_dup (5.0), x); +} + +#endif // __vpcs +#endif // __arch64__ +#endif // WANT_VMATH diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index db8460b..b0cbbe2 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -145,6 +145,14 @@ range_erff=' 0 inf 40000 ' +range_atan2=' + -10.0 10.0 50000 + -1.0 1.0 40000 + 0.0 1.0 40000 + 1.0 100.0 40000 + 1e6 1e32 40000 +' + # error limits L_erfc=3.7 L_erfcf=1.0 @@ -152,6 +160,7 @@ L_log10=1.16 L_log10f=2.81 L_erf=1.76 L_erff=1.5 +L_atan2=2.9 while read G F R do @@ -169,6 +178,10 @@ $range EOF done << EOF # group symbol run +atan2 __s_atan2 $runs +atan2 __v_atan2 $runv +atan2 __vn_atan2 $runvn +atan2 _ZGVnN2vv_atan2 $runvn erf __s_erf $runs erf __v_erf $runv erf __vn_erf $runvn diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 16cfb88..27ea586 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -12,6 +12,7 @@ D2 (atan2) D1 (erfc) D1 (log10) #if WANT_VMATH +F (__s_atan2, __s_atan2, atan2l, mpfr_atan2, 2, 0, d2, 0) F (__s_erff, __s_erff, erf, 
mpfr_erf, 1, 1, f1, 0) F (__s_erf, __s_erf, erfl, mpfr_erf, 1, 0, d1, 0) F (__s_erfcf, __s_erfcf, erfc, mpfr_erfc, 1, 1, f1, 0) @@ -19,6 +20,7 @@ F (__s_erfc, __s_erfc, erfcl, mpfr_erfc, 1, 0, d1, 0) F (__s_log10f, __s_log10f, log10, mpfr_log10, 1, 1, f1, 0) F (__s_log10, __s_log10, log10l, mpfr_log10, 1, 0, d1, 0) #if __aarch64__ +F (__v_atan2, v_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1) F (__v_erff, v_erff, erf, mpfr_erf, 1, 1, f1, 1) F (__v_erf, v_erf, erfl, mpfr_erf, 1, 0, d1, 1) F (__v_erfcf, v_erfcf, erfc, mpfr_erfc, 1, 1, f1, 1) @@ -26,12 +28,14 @@ F (__v_erfc, v_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) F (__v_log10f, v_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (__v_log10, v_log10, log10l, mpfr_log10, 1, 0, d1, 1) #ifdef __vpcs +F (__vn_atan2, vn_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1) F (__vn_erff, vn_erff, erf, mpfr_erf, 1, 1, f1, 1) F (__vn_erf, vn_erf, erfl, mpfr_erf, 1, 0, d1, 1) F (__vn_erfcf, vn_erfcf, erfc, mpfr_erfc, 1, 1, f1, 1) F (__vn_erfc, vn_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) F (__vn_log10f, vn_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (__vn_log10, vn_log10, log10l, mpfr_log10, 1, 0, d1, 1) +F (_ZGVnN2vv_atan2, Z_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1) F (_ZGVnN4v_erff, Z_erff, erf, mpfr_erf, 1, 1, f1, 1) F (_ZGVnN2v_erf, Z_erf, erfl, mpfr_erf, 1, 0, d1, 1) F (_ZGVnN4v_erfcf, Z_erfcf, erfc, mpfr_erfc, 1, 1, f1, 1) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index ab04d96..75b63b1 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -1,3 +1,4 @@ +// clang-format off /* * Function wrappers for ulp. * @@ -16,6 +17,7 @@ static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y, static float v_erff(float x) { return __v_erff(argf(x))[0]; } static float v_erfcf(float x) { return __v_erfcf(argf(x))[0]; } static float v_log10f(float x) { return __v_log10f(argf(x))[0]; } +static double v_atan2(double x, double y) { return __v_atan2(argd(x), argd(y))[0]; } static double v_erf(double x) { return __v_erf(argd(x))[0]; } static double v_erfc(double x) { return __v_erfc(argd(x))[0]; } static double v_log10(double x) { return __v_log10(argd(x))[0]; } @@ -23,6 +25,7 @@ static double v_log10(double x) { return __v_log10(argd(x))[0]; } static float vn_erff(float x) { return __vn_erff(argf(x))[0]; } static float vn_erfcf(float x) { return __vn_erfcf(argf(x))[0]; } static float vn_log10f(float x) { return __vn_log10f(argf(x))[0]; } +static double vn_atan2(double x, double y) { return __vn_atan2(argd(x), argd(y))[0]; } static double vn_erf(double x) { return __vn_erf(argd(x))[0]; } static double vn_erfc(double x) { return __vn_erfc(argd(x))[0]; } static double vn_log10(double x) { return __vn_log10(argd(x))[0]; } @@ -30,8 +33,10 @@ static double vn_log10(double x) { return __vn_log10(argd(x))[0]; } static float Z_erff(float x) { return _ZGVnN4v_erff(argf(x))[0]; } static float Z_erfcf(float x) { return _ZGVnN4v_erfcf(argf(x))[0]; } static float Z_log10f(float x) { return _ZGVnN4v_log10f(argf(x))[0]; } +static double Z_atan2(double x, double y) { return _ZGVnN2vv_atan2(argd(x), argd(y))[0]; } static double Z_erf(double x) { return _ZGVnN2v_erf(argd(x))[0]; } static double Z_erfc(double x) { return _ZGVnN2v_erfc(argd(x))[0]; } static double Z_log10(double x) { return _ZGVnN2v_log10(argd(x))[0]; } #endif #endif +// clang-format on diff --git a/pl/math/v_atan2_3u.c b/pl/math/v_atan2_3u.c new file mode 100644 index 0000000..184b220 --- /dev/null +++ b/pl/math/v_atan2_3u.c @@ -0,0 +1,78 @@ +/* + * Double-precision vector 
atan2(x) function. + * + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#if V_SUPPORTED + +#include "atan_common.h" + +#define PiOver2 v_f64 (0x1.921fb54442d18p+0) +#define SignMask v_u64 (0x8000000000000000) + +/* Special cases i.e. 0, infinity, NaN (fall back to scalar calls). */ +VPCS_ATTR +__attribute__ ((noinline)) static v_f64_t +specialcase (v_f64_t y, v_f64_t x, v_f64_t ret, v_u64_t cmp) +{ + return v_call2_f64 (atan2, y, x, ret, cmp); +} + +/* Returns 1 if input is the bit representation of 0, infinity or nan. */ +static inline v_u64_t +zeroinfnan (v_u64_t i) +{ + return v_cond_u64 (2 * i - 1 >= v_u64 (2 * asuint64 (INFINITY) - 1)); +} + +/* Fast implementation of vector atan2. + Maximum observed error is 2.8 ulps: + v_atan2(0x1.9651a429a859ap+5, 0x1.953075f4ee26p+5) + got 0x1.92d628ab678ccp-1 + want 0x1.92d628ab678cfp-1. */ +VPCS_ATTR +v_f64_t V_NAME (atan2) (v_f64_t y, v_f64_t x) +{ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t iy = v_as_u64_f64 (y); + + v_u64_t special_cases = zeroinfnan (ix) | zeroinfnan (iy); + + v_u64_t sign_x = ix & SignMask; + v_u64_t sign_y = iy & SignMask; + v_u64_t sign_xy = sign_x ^ sign_y; + + v_f64_t ax = v_abs_f64 (x); + v_f64_t ay = v_abs_f64 (y); + + v_u64_t pred_xlt0 = x < 0.0; + v_u64_t pred_aygtax = ay > ax; + + /* Set up z for call to atan. */ + v_f64_t n = v_sel_f64 (pred_aygtax, -ax, ay); + v_f64_t d = v_sel_f64 (pred_aygtax, ay, ax); + v_f64_t z = v_div_f64 (n, d); + + /* Work out the correct shift. */ + v_f64_t shift = v_sel_f64 (pred_xlt0, v_f64 (-2.0), v_f64 (0.0)); + shift = v_sel_f64 (pred_aygtax, shift + 1.0, shift); + shift *= PiOver2; + + v_f64_t ret = eval_poly (z, z, shift); + + /* Account for the sign of x and y. */ + ret = v_as_f64_u64 (v_as_u64_f64 (ret) ^ sign_xy); + + if (unlikely (v_any_u64 (special_cases))) + { + return specialcase (y, x, ret, special_cases); + } + + return ret; +} +VPCS_ALIAS + +#endif diff --git a/pl/math/v_math.h b/pl/math/v_math.h index 1f9217e..43292cf 100644 --- a/pl/math/v_math.h +++ b/pl/math/v_math.h @@ -320,6 +320,11 @@ v_abs_f64 (v_f64_t x) return __builtin_fabs (x); } static inline v_f64_t +v_div_f64 (v_f64_t x, v_f64_t y) +{ + return x / y; +} +static inline v_f64_t v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) { return __builtin_fma (x, y, z); @@ -333,6 +338,11 @@ v_round_f64 (v_f64_t x) { return __builtin_round (x); } +static inline v_f64_t +v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y) +{ + return p ? x : y; +} static inline v_s64_t v_round_s64 (v_f64_t x) { @@ -394,6 +404,12 @@ v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p) { return f (x); } +static inline v_f64_t +v_call2_f64 (f64_t (*f) (f64_t, f64_t), v_f64_t x1, v_f64_t x2, v_f64_t y, + v_u64_t p) +{ + return f (x1, x2); +} #elif __aarch64__ #define V_SUPPORTED 1 @@ -629,6 +645,11 @@ v_abs_f64 (v_f64_t x) return vabsq_f64 (x); } static inline v_f64_t +v_div_f64 (v_f64_t x, v_f64_t y) +{ + return vdivq_f64 (x, y); +} +static inline v_f64_t v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) { return vfmaq_f64 (z, x, y); @@ -642,6 +663,11 @@ v_round_f64 (v_f64_t x) { return vrndaq_f64 (x); } +static inline v_f64_t +v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y) +{ + return vbslq_f64 (p, x, y); +} static inline v_s64_t v_round_s64 (v_f64_t x) { @@ -703,6 +729,13 @@ v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p) { return (v_f64_t){p[0] ? f (x[0]) : y[0], p[1] ? 
f (x[1]) : y[1]}; } +static inline v_f64_t +v_call2_f64 (f64_t (*f) (f64_t, f64_t), v_f64_t x1, v_f64_t x2, v_f64_t y, + v_u64_t p) +{ + return (v_f64_t){p[0] ? f (x1[0], x2[0]) : y[0], + p[1] ? f (x1[1], x2[1]) : y[1]}; +} #endif #endif diff --git a/pl/math/vn_atan2_3u.c b/pl/math/vn_atan2_3u.c new file mode 100644 index 0000000..b7c46e9 --- /dev/null +++ b/pl/math/vn_atan2_3u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_atan2. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_atan2, _ZGVnN2vv_atan2) +#include "v_atan2_3u.c" +#endif -- cgit v1.2.3 From ea4649a6ea3b79033755217613bd8ab4791c1dea Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Thu, 16 Jun 2022 12:08:16 +0100 Subject: pl/math: Add vector/Neon atan Successfully ran tests and benchmarks. New routine is accurate to 3 ulps. --- pl/math/include/mathlib.h | 4 ++++ pl/math/s_atan_3u.c | 6 +++++ pl/math/test/mathbench_funcs.h | 6 +++++ pl/math/test/runulp.sh | 14 ++++++++++++ pl/math/test/ulp_funcs.h | 4 ++++ pl/math/test/ulp_wrappers.h | 3 +++ pl/math/v_atan_3u.c | 51 ++++++++++++++++++++++++++++++++++++++++++ pl/math/v_math.h | 10 +++++++++ pl/math/vn_atan_3u.c | 12 ++++++++++ 9 files changed, 110 insertions(+) create mode 100644 pl/math/s_atan_3u.c create mode 100644 pl/math/v_atan_3u.c create mode 100644 pl/math/vn_atan_3u.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 8296e77..cda3371 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -21,6 +21,7 @@ float __s_erfcf (float); float __s_erff (float); float __s_log10f (float); +double __s_atan (double); double __s_atan2 (double, double); double __s_erf (double); double __s_erfc (double); @@ -38,6 +39,7 @@ typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; #endif /* Vector functions following the base PCS. */ +__f64x2_t __v_atan (__f64x2_t); __f64x2_t __v_atan2 (__f64x2_t, __f64x2_t); __f32x4_t __v_erff (__f32x4_t); __f64x2_t __v_erf (__f64x2_t); @@ -50,6 +52,7 @@ __f64x2_t __v_log10 (__f64x2_t); #define __vpcs __attribute__((__aarch64_vector_pcs__)) /* Vector functions following the vector PCS. */ +__vpcs __f64x2_t __vn_atan (__f64x2_t); __vpcs __f64x2_t __vn_atan2 (__f64x2_t, __f64x2_t); __vpcs __f32x4_t __vn_erff (__f32x4_t); __vpcs __f64x2_t __vn_erf (__f64x2_t); @@ -59,6 +62,7 @@ __vpcs __f32x4_t __vn_log10f (__f32x4_t); __vpcs __f64x2_t __vn_log10 (__f64x2_t); /* Vector functions following the vector PCS using ABI names. */ +__vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t); __vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t); __vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t); diff --git a/pl/math/s_atan_3u.c b/pl/math/s_atan_3u.c new file mode 100644 index 0000000..1cdc4ed --- /dev/null +++ b/pl/math/s_atan_3u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2021-2022, Arm Limited. 
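+ *
+ * Note: defining SCALAR before including the vector source is how this
+ * file builds the __s_atan variant declared in mathlib.h: with SCALAR
+ * set, v_math.h takes its scalar branch (see the v_call2_f64 fallback
+ * in the previous patch), so v_f64_t is plain double and every v_*
+ * helper reduces to ordinary scalar code, letting one source file
+ * provide both the scalar and the Neon routine.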
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_atan_3u.c" diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 2e80aa0..f2befc8 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -10,12 +10,14 @@ F (erfcf, -4.0, 10.0) F (erff, -4.0, 4.0) F (log10f, 0.01, 11.1) +D (atan, -10.0, 10.0) {"atan2", 'd', 0, -10.0, 10.0, {.d = atan2_wrap}}, D (erf, -6,6) D (erfc, -6.0, 28.0) D (log10, 0.01, 11.1) #if WANT_VMATH +D (__s_atan, -10.0, 10.0) {"__s_atan2", 'd', 0, -10.0, 10.0, {.d = __s_atan2_wrap}}, F (__s_erff, -4.0, 4.0) D (__s_erf, -6.0, 6.0) @@ -24,6 +26,7 @@ D (__s_erfc, -6.0, 28.0) F (__s_log10f, 0.01, 11.1) D (__s_log10, 0.01, 11.1) #if __aarch64__ +VD (__v_atan, -10.0, 10.0) {"__v_atan2", 'd', 'v', -10.0, 10.0, {.vd = __v_atan2_wrap}}, VF (__v_erff, -4.0, 4.0) VD (__v_erf, -6.0, 6.0) @@ -32,6 +35,9 @@ VD (__v_erfc, -6.0, 28.0) VD (__v_log10, 0.01, 11.1) VF (__v_log10f, 0.01, 11.1) #ifdef __vpcs +VND (__vn_atan, -10.0, 10.0) +VND (_ZGVnN2v_atan, -10.0, 10.0) + {"__vn_atan2", 'd', 'n', -10.0, 10.0, {.vnd = __vn_atan2_wrap}}, {"_ZGVnN2vv_atan2", 'd', 'n', -10.0, 10.0, {.vnd = _Z_atan2_wrap}}, diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index b0cbbe2..7ad2318 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -153,6 +153,14 @@ range_atan2=' 1e6 1e32 40000 ' +range_atan=' + -10.0 10.0 50000 + -1.0 1.0 40000 + 0.0 1.0 40000 + 1.0 100.0 40000 + 1e6 1e32 40000 +' + # error limits L_erfc=3.7 L_erfcf=1.0 @@ -161,6 +169,7 @@ L_log10f=2.81 L_erf=1.76 L_erff=1.5 L_atan2=2.9 +L_atan=3.0 while read G F R do @@ -178,6 +187,11 @@ $range EOF done << EOF # group symbol run + +atan __s_atan $runs +atan __v_atan $runv +atan __vn_atan $runvn +atan _ZGVnN2v_atan $runvn atan2 __s_atan2 $runs atan2 __v_atan2 $runv atan2 __vn_atan2 $runvn diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 27ea586..244c96b 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -12,6 +12,7 @@ D2 (atan2) D1 (erfc) D1 (log10) #if WANT_VMATH +F (__s_atan, __s_atan, atanl, mpfr_atan, 1, 0, d1, 0) F (__s_atan2, __s_atan2, atan2l, mpfr_atan2, 2, 0, d2, 0) F (__s_erff, __s_erff, erf, mpfr_erf, 1, 1, f1, 0) F (__s_erf, __s_erf, erfl, mpfr_erf, 1, 0, d1, 0) @@ -20,6 +21,7 @@ F (__s_erfc, __s_erfc, erfcl, mpfr_erfc, 1, 0, d1, 0) F (__s_log10f, __s_log10f, log10, mpfr_log10, 1, 1, f1, 0) F (__s_log10, __s_log10, log10l, mpfr_log10, 1, 0, d1, 0) #if __aarch64__ +F (__v_atan, v_atan, atanl, mpfr_atan, 1, 0, d1, 1) F (__v_atan2, v_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1) F (__v_erff, v_erff, erf, mpfr_erf, 1, 1, f1, 1) F (__v_erf, v_erf, erfl, mpfr_erf, 1, 0, d1, 1) @@ -28,6 +30,7 @@ F (__v_erfc, v_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) F (__v_log10f, v_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (__v_log10, v_log10, log10l, mpfr_log10, 1, 0, d1, 1) #ifdef __vpcs +F (__vn_atan, vn_atan, atanl, mpfr_atan, 1, 0, d1, 1) F (__vn_atan2, vn_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1) F (__vn_erff, vn_erff, erf, mpfr_erf, 1, 1, f1, 1) F (__vn_erf, vn_erf, erfl, mpfr_erf, 1, 0, d1, 1) @@ -35,6 +38,7 @@ F (__vn_erfcf, vn_erfcf, erfc, mpfr_erfc, 1, 1, f1, 1) F (__vn_erfc, vn_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) F (__vn_log10f, vn_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (__vn_log10, vn_log10, log10l, mpfr_log10, 1, 0, d1, 1) +F (_ZGVnN2v_atan, Z_atan, atanl, mpfr_atan, 1, 0, d1, 1) F (_ZGVnN2vv_atan2, Z_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1) F (_ZGVnN4v_erff, Z_erff, erf, mpfr_erf, 
1, 1, f1, 1) F (_ZGVnN2v_erf, Z_erf, erfl, mpfr_erf, 1, 0, d1, 1) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 75b63b1..f1bfdf2 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -17,6 +17,7 @@ static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y, static float v_erff(float x) { return __v_erff(argf(x))[0]; } static float v_erfcf(float x) { return __v_erfcf(argf(x))[0]; } static float v_log10f(float x) { return __v_log10f(argf(x))[0]; } +static double v_atan(double x) { return __v_atan(argd(x))[0]; } static double v_atan2(double x, double y) { return __v_atan2(argd(x), argd(y))[0]; } static double v_erf(double x) { return __v_erf(argd(x))[0]; } static double v_erfc(double x) { return __v_erfc(argd(x))[0]; } @@ -25,6 +26,7 @@ static double v_log10(double x) { return __v_log10(argd(x))[0]; } static float vn_erff(float x) { return __vn_erff(argf(x))[0]; } static float vn_erfcf(float x) { return __vn_erfcf(argf(x))[0]; } static float vn_log10f(float x) { return __vn_log10f(argf(x))[0]; } +static double vn_atan(double x) { return __vn_atan(argd(x))[0]; } static double vn_atan2(double x, double y) { return __vn_atan2(argd(x), argd(y))[0]; } static double vn_erf(double x) { return __vn_erf(argd(x))[0]; } static double vn_erfc(double x) { return __vn_erfc(argd(x))[0]; } @@ -33,6 +35,7 @@ static double vn_log10(double x) { return __vn_log10(argd(x))[0]; } static float Z_erff(float x) { return _ZGVnN4v_erff(argf(x))[0]; } static float Z_erfcf(float x) { return _ZGVnN4v_erfcf(argf(x))[0]; } static float Z_log10f(float x) { return _ZGVnN4v_log10f(argf(x))[0]; } +static double Z_atan(double x) { return _ZGVnN2v_atan(argd(x))[0]; } static double Z_atan2(double x, double y) { return _ZGVnN2vv_atan2(argd(x), argd(y))[0]; } static double Z_erf(double x) { return _ZGVnN2v_erf(argd(x))[0]; } static double Z_erfc(double x) { return _ZGVnN2v_erfc(argd(x))[0]; } diff --git a/pl/math/v_atan_3u.c b/pl/math/v_atan_3u.c new file mode 100644 index 0000000..bf11399 --- /dev/null +++ b/pl/math/v_atan_3u.c @@ -0,0 +1,51 @@ +/* + * Double-precision vector atan(x) function. + * + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#if V_SUPPORTED + +#include "atan_common.h" + +#define PiOver2 v_f64 (0x1.921fb54442d18p+0) +#define AbsMask v_u64 (0x7fffffffffffffff) + +/* Fast implementation of vector atan. + Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using + z=1/x and shift = pi/2. Maximum observed error is 3.0 ulps, in + [0x1.00e766b50e9f2p+0, 0x1.00e78cab70984p+0]: + v_atan(0x1.00e76c0e723e4p+0) got 0x1.9306b8d822418p-1 + want 0x1.9306b8d82241bp-1. */ +VPCS_ATTR +v_f64_t V_NAME (atan) (v_f64_t x) +{ + /* No need to trigger special case. Small cases, infs and nans + are supported by our approximation technique. */ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t sign = ix & ~AbsMask; + + /* Argument reduction: + y := arctan(x) for x < 1 + y := pi/2 + arctan(-1/x) for x > 1 + Hence, use z=-1/a if x>=1, otherwise z=a. */ + v_u64_t red = v_cagt_f64 (x, v_f64 (1.0)); + /* Avoid dependency in abs(x) in division (and comparison). */ + v_f64_t z = v_sel_f64 (red, v_div_f64 (v_f64 (-1.0), x), x); + v_f64_t shift = v_sel_f64 (red, PiOver2, v_f64 (0.0)); + /* Use absolute value only when needed (odd powers of z). */ + v_f64_t az = v_abs_f64 (z); + az = v_sel_f64 (red, -az, az); + + /* Calculate the polynomial approximation. 
*/ + v_f64_t y = eval_poly (z, az, shift); + + /* y = atan(x) if x>0, -atan(-x) otherwise. */ + y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign); + + return y; +} +VPCS_ALIAS +#endif diff --git a/pl/math/v_math.h b/pl/math/v_math.h index 43292cf..3733557 100644 --- a/pl/math/v_math.h +++ b/pl/math/v_math.h @@ -319,6 +319,11 @@ v_abs_f64 (v_f64_t x) { return __builtin_fabs (x); } +static inline v_u64_t +v_cagt_f64 (v_f64_t x, v_f64_t y) +{ + return fabs (x) > fabs (y); +} static inline v_f64_t v_div_f64 (v_f64_t x, v_f64_t y) { @@ -644,6 +649,11 @@ v_abs_f64 (v_f64_t x) { return vabsq_f64 (x); } +static inline v_u64_t +v_cagt_f64 (v_f64_t x, v_f64_t y) +{ + return vcagtq_f64 (x, y); +} static inline v_f64_t v_div_f64 (v_f64_t x, v_f64_t y) { diff --git a/pl/math/vn_atan_3u.c b/pl/math/vn_atan_3u.c new file mode 100644 index 0000000..93bd7cf --- /dev/null +++ b/pl/math/vn_atan_3u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_atan. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_atan, _ZGVnN2v_atan) +#include "v_atan_3u.c" +#endif -- cgit v1.2.3 From 47eb0a883fb82fcd394353920e2cca4d0a0ffe9d Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Fri, 17 Jun 2022 11:09:34 +0100 Subject: pl/math: Add vector/Neon atan2f Successfully ran tests and benchmarks. New routine is accurate to 3 ulps. --- pl/math/include/mathlib.h | 4 ++ pl/math/s_atan2f_3u.c | 6 +++ pl/math/test/mathbench_funcs.h | 5 +++ pl/math/test/mathbench_wrappers.h | 24 ++++++++++++ pl/math/test/runulp.sh | 13 +++++++ pl/math/test/ulp_funcs.h | 4 ++ pl/math/test/ulp_wrappers.h | 3 ++ pl/math/v_atan2f_3u.c | 78 +++++++++++++++++++++++++++++++++++++++ pl/math/v_math.h | 10 +++++ pl/math/vn_atan2f_3u.c | 12 ++++++ 10 files changed, 159 insertions(+) create mode 100644 pl/math/s_atan2f_3u.c create mode 100644 pl/math/v_atan2f_3u.c create mode 100644 pl/math/vn_atan2f_3u.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index cda3371..baee70f 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -17,6 +17,7 @@ float log10f (float); double atan2 (double, double); double log10 (double); +float __s_atan2f (float, float); float __s_erfcf (float); float __s_erff (float); float __s_log10f (float); @@ -40,6 +41,7 @@ typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; /* Vector functions following the base PCS. */ __f64x2_t __v_atan (__f64x2_t); +__f32x4_t __v_atan2f (__f32x4_t, __f32x4_t); __f64x2_t __v_atan2 (__f64x2_t, __f64x2_t); __f32x4_t __v_erff (__f32x4_t); __f64x2_t __v_erf (__f64x2_t); @@ -53,6 +55,7 @@ __f64x2_t __v_log10 (__f64x2_t); /* Vector functions following the vector PCS. */ __vpcs __f64x2_t __vn_atan (__f64x2_t); +__vpcs __f32x4_t __vn_atan2f (__f32x4_t, __f32x4_t); __vpcs __f64x2_t __vn_atan2 (__f64x2_t, __f64x2_t); __vpcs __f32x4_t __vn_erff (__f32x4_t); __vpcs __f64x2_t __vn_erf (__f64x2_t); @@ -63,6 +66,7 @@ __vpcs __f64x2_t __vn_log10 (__f64x2_t); /* Vector functions following the vector PCS using ABI names. 
*/ __vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t); __vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t); __vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t); diff --git a/pl/math/s_atan2f_3u.c b/pl/math/s_atan2f_3u.c new file mode 100644 index 0000000..5002d32 --- /dev/null +++ b/pl/math/s_atan2f_3u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_atan2f_3u.c" diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index f2befc8..f1455aa 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -18,6 +18,7 @@ D (log10, 0.01, 11.1) #if WANT_VMATH D (__s_atan, -10.0, 10.0) +{"__s_atan2f", 'f', 0, -10.0, 10.0, {.f = __s_atan2f_wrap}}, {"__s_atan2", 'd', 0, -10.0, 10.0, {.d = __s_atan2_wrap}}, F (__s_erff, -4.0, 4.0) D (__s_erf, -6.0, 6.0) @@ -27,6 +28,7 @@ F (__s_log10f, 0.01, 11.1) D (__s_log10, 0.01, 11.1) #if __aarch64__ VD (__v_atan, -10.0, 10.0) +{"__v_atan2f", 'f', 'v', -10.0, 10.0, {.vf = __v_atan2f_wrap}}, {"__v_atan2", 'd', 'v', -10.0, 10.0, {.vd = __v_atan2_wrap}}, VF (__v_erff, -4.0, 4.0) VD (__v_erf, -6.0, 6.0) @@ -38,6 +40,9 @@ VF (__v_log10f, 0.01, 11.1) VND (__vn_atan, -10.0, 10.0) VND (_ZGVnN2v_atan, -10.0, 10.0) +{"__vn_atan2f", 'f', 'n', -10.0, 10.0, {.vnf = __vn_atan2f_wrap}}, +{"_ZGVnN4vv_atan2f", 'f', 'n', -10.0, 10.0, {.vnf = _Z_atan2f_wrap}}, + {"__vn_atan2", 'd', 'n', -10.0, 10.0, {.vnd = __vn_atan2_wrap}}, {"_ZGVnN2vv_atan2", 'd', 'n', -10.0, 10.0, {.vnd = _Z_atan2_wrap}}, diff --git a/pl/math/test/mathbench_wrappers.h b/pl/math/test/mathbench_wrappers.h index 57c0c8f..37edeae 100644 --- a/pl/math/test/mathbench_wrappers.h +++ b/pl/math/test/mathbench_wrappers.h @@ -26,12 +26,24 @@ __s_atan2_wrap (double x) return __s_atan2 (5.0, x); } +static float +__s_atan2f_wrap (float x) +{ + return __s_atan2f (5.0f, x); +} + static v_double __v_atan2_wrap (v_double x) { return __v_atan2 (v_double_dup (5.0), x); } +static v_float +__v_atan2f_wrap (v_float x) +{ + return __v_atan2f (v_float_dup (5.0f), x); +} + #ifdef __vpcs __vpcs static v_double @@ -40,12 +52,24 @@ __vn_atan2_wrap (v_double x) return __vn_atan2 (v_double_dup (5.0), x); } +__vpcs static v_float +__vn_atan2f_wrap (v_float x) +{ + return __vn_atan2f (v_float_dup (5.0f), x); +} + __vpcs static v_double _Z_atan2_wrap (v_double x) { return _ZGVnN2vv_atan2 (v_double_dup (5.0), x); } +__vpcs static v_float +_Z_atan2f_wrap (v_float x) +{ + return _ZGVnN4vv_atan2f (v_float_dup (5.0f), x); +} + #endif // __vpcs #endif // __arch64__ #endif // WANT_VMATH diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 7ad2318..66ac3bf 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -161,6 +161,14 @@ range_atan=' 1e6 1e32 40000 ' +range_atan2f=' + -10.0 10.0 50000 + -1.0 1.0 40000 + 0.0 1.0 40000 + 1.0 100.0 40000 + 1e6 1e32 40000 +' + # error limits L_erfc=3.7 L_erfcf=1.0 @@ -170,6 +178,7 @@ L_erf=1.76 L_erff=1.5 L_atan2=2.9 L_atan=3.0 +L_atan2f=3.0 while read G F R do @@ -209,6 +218,10 @@ log10 __v_log10 $runv log10 __vn_log10 $runvn log10 _ZGVnN2v_log10 $runvn +atan2f __s_atan2f $runs +atan2f __v_atan2f $runv +atan2f __vn_atan2f $runvn +atan2f _ZGVnN4vv_atan2f $runvn erff __s_erff $runs erff __v_erff $runv erff __vn_erff $runvn diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 244c96b..5bc7411 100644 --- 
a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -13,6 +13,7 @@ D1 (erfc) D1 (log10) #if WANT_VMATH F (__s_atan, __s_atan, atanl, mpfr_atan, 1, 0, d1, 0) +F (__s_atan2f, __s_atan2f, atan2, mpfr_atan2, 2, 1, f2, 0) F (__s_atan2, __s_atan2, atan2l, mpfr_atan2, 2, 0, d2, 0) F (__s_erff, __s_erff, erf, mpfr_erf, 1, 1, f1, 0) F (__s_erf, __s_erf, erfl, mpfr_erf, 1, 0, d1, 0) @@ -22,6 +23,7 @@ F (__s_log10f, __s_log10f, log10, mpfr_log10, 1, 1, f1, 0) F (__s_log10, __s_log10, log10l, mpfr_log10, 1, 0, d1, 0) #if __aarch64__ F (__v_atan, v_atan, atanl, mpfr_atan, 1, 0, d1, 1) +F (__v_atan2f, v_atan2f, atan2, mpfr_atan2, 2, 1, f2, 1) F (__v_atan2, v_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1) F (__v_erff, v_erff, erf, mpfr_erf, 1, 1, f1, 1) F (__v_erf, v_erf, erfl, mpfr_erf, 1, 0, d1, 1) @@ -31,6 +33,7 @@ F (__v_log10f, v_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (__v_log10, v_log10, log10l, mpfr_log10, 1, 0, d1, 1) #ifdef __vpcs F (__vn_atan, vn_atan, atanl, mpfr_atan, 1, 0, d1, 1) +F (__vn_atan2f, vn_atan2f, atan2, mpfr_atan2, 2, 1, f2, 1) F (__vn_atan2, vn_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1) F (__vn_erff, vn_erff, erf, mpfr_erf, 1, 1, f1, 1) F (__vn_erf, vn_erf, erfl, mpfr_erf, 1, 0, d1, 1) @@ -39,6 +42,7 @@ F (__vn_erfc, vn_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) F (__vn_log10f, vn_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (__vn_log10, vn_log10, log10l, mpfr_log10, 1, 0, d1, 1) F (_ZGVnN2v_atan, Z_atan, atanl, mpfr_atan, 1, 0, d1, 1) +F (_ZGVnN4vv_atan2f, Z_atan2f, atan2, mpfr_atan2, 2, 1, f2, 1) F (_ZGVnN2vv_atan2, Z_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1) F (_ZGVnN4v_erff, Z_erff, erf, mpfr_erf, 1, 1, f1, 1) F (_ZGVnN2v_erf, Z_erf, erfl, mpfr_erf, 1, 0, d1, 1) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index f1bfdf2..0603e45 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -14,6 +14,7 @@ static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y, /* Wrappers for vector functions. */ #if __aarch64__ && WANT_VMATH +static float v_atan2f(float x, float y) { return __v_atan2f(argf(x), argf(y))[0]; } static float v_erff(float x) { return __v_erff(argf(x))[0]; } static float v_erfcf(float x) { return __v_erfcf(argf(x))[0]; } static float v_log10f(float x) { return __v_log10f(argf(x))[0]; } @@ -23,6 +24,7 @@ static double v_erf(double x) { return __v_erf(argd(x))[0]; } static double v_erfc(double x) { return __v_erfc(argd(x))[0]; } static double v_log10(double x) { return __v_log10(argd(x))[0]; } #ifdef __vpcs +static float vn_atan2f(float x, float y) { return __vn_atan2f(argf(x), argf(y))[0]; } static float vn_erff(float x) { return __vn_erff(argf(x))[0]; } static float vn_erfcf(float x) { return __vn_erfcf(argf(x))[0]; } static float vn_log10f(float x) { return __vn_log10f(argf(x))[0]; } @@ -32,6 +34,7 @@ static double vn_erf(double x) { return __vn_erf(argd(x))[0]; } static double vn_erfc(double x) { return __vn_erfc(argd(x))[0]; } static double vn_log10(double x) { return __vn_log10(argd(x))[0]; } +static float Z_atan2f(float x, float y) { return _ZGVnN4vv_atan2f(argf(x), argf(y))[0]; } static float Z_erff(float x) { return _ZGVnN4v_erff(argf(x))[0]; } static float Z_erfcf(float x) { return _ZGVnN4v_erfcf(argf(x))[0]; } static float Z_log10f(float x) { return _ZGVnN4v_log10f(argf(x))[0]; } diff --git a/pl/math/v_atan2f_3u.c b/pl/math/v_atan2f_3u.c new file mode 100644 index 0000000..4212351 --- /dev/null +++ b/pl/math/v_atan2f_3u.c @@ -0,0 +1,78 @@ +/* + * Single-precision vector atan2(x) function. 
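+ * Computes atan2f (y, x) lane-wise on AdvSIMD vectors.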
+ * + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#if V_SUPPORTED + +#include "atanf_common.h" + +/* Useful constants. */ +#define PiOver2 v_f32 (0x1.921fb6p+0f) +#define SignMask v_u32 (0x80000000) + +/* Special cases i.e. 0, infinity and nan (fall back to scalar calls). */ +VPCS_ATTR +__attribute__ ((noinline)) static v_f32_t +specialcase (v_f32_t y, v_f32_t x, v_f32_t ret, v_u32_t cmp) +{ + return v_call2_f32 (atan2f, y, x, ret, cmp); +} + +/* Returns 1 if input is the bit representation of 0, infinity or nan. */ +static inline v_u32_t +zeroinfnan (v_u32_t i) +{ + return v_cond_u32 (2 * i - 1 >= v_u32 (2 * 0x7f800000lu - 1)); +} + +/* Fast implementation of vector atan2f. Maximum observed error is + 2.95 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]: + v_atan2(0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1 + want 0x1.967f00p-1. */ +VPCS_ATTR +v_f32_t V_NAME (atan2f) (v_f32_t y, v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t iy = v_as_u32_f32 (y); + + v_u32_t special_cases = zeroinfnan (ix) | zeroinfnan (iy); + + v_u32_t sign_x = ix & SignMask; + v_u32_t sign_y = iy & SignMask; + v_u32_t sign_xy = sign_x ^ sign_y; + + v_f32_t ax = v_abs_f32 (x); + v_f32_t ay = v_abs_f32 (y); + + v_u32_t pred_xlt0 = x < 0.0f; + v_u32_t pred_aygtax = ay > ax; + + /* Set up z for call to atanf. */ + v_f32_t n = v_sel_f32 (pred_aygtax, -ax, ay); + v_f32_t d = v_sel_f32 (pred_aygtax, ay, ax); + v_f32_t z = v_div_f32 (n, d); + + /* Work out the correct shift. */ + v_f32_t shift = v_sel_f32 (pred_xlt0, v_f32 (-2.0f), v_f32 (0.0f)); + shift = v_sel_f32 (pred_aygtax, shift + 1.0f, shift); + shift *= PiOver2; + + v_f32_t ret = eval_poly (z, z, shift); + + /* Account for the sign of y. */ + ret = v_as_f32_u32 (v_as_u32_f32 (ret) ^ sign_xy); + + if (unlikely (v_any_u32 (special_cases))) + { + return specialcase (y, x, ret, special_cases); + } + + return ret; +} +VPCS_ALIAS + +#endif diff --git a/pl/math/v_math.h b/pl/math/v_math.h index 3733557..ddc5dab 100644 --- a/pl/math/v_math.h +++ b/pl/math/v_math.h @@ -184,6 +184,11 @@ v_calt_f32 (v_f32_t x, v_f32_t y) return fabsf (x) < fabsf (y); } static inline v_f32_t +v_div_f32 (v_f32_t x, v_f32_t y) +{ + return x / y; +} +static inline v_f32_t v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) { return __builtin_fmaf (x, y, z); @@ -509,6 +514,11 @@ v_calt_f32 (v_f32_t x, v_f32_t y) return vcaltq_f32 (x, y); } static inline v_f32_t +v_div_f32 (v_f32_t x, v_f32_t y) +{ + return vdivq_f32 (x, y); +} +static inline v_f32_t v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) { return vfmaq_f32 (z, x, y); diff --git a/pl/math/vn_atan2f_3u.c b/pl/math/vn_atan2f_3u.c new file mode 100644 index 0000000..23aad38 --- /dev/null +++ b/pl/math/vn_atan2f_3u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_atan2f. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_atan2f, _ZGVnN4vv_atan2f) +#include "v_atan2f_3u.c" +#endif -- cgit v1.2.3 From 570d607f2d53dc1d416ec8500487a9e261b15bb9 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Fri, 17 Jun 2022 11:09:50 +0100 Subject: pl/math: Add vector/Neon atanf Successfully ran tests and benchmarks. New routine is accurate to 3 ulps. 
--- pl/math/include/mathlib.h | 4 ++++ pl/math/s_atanf_3u.c | 6 ++++++ pl/math/test/mathbench_funcs.h | 6 ++++++ pl/math/test/runulp.sh | 13 +++++++++++ pl/math/test/ulp_funcs.h | 4 ++++ pl/math/test/ulp_wrappers.h | 3 +++ pl/math/v_atanf_3u.c | 49 ++++++++++++++++++++++++++++++++++++++++++ pl/math/vn_atanf_3u.c | 12 +++++++++++ 8 files changed, 97 insertions(+) create mode 100644 pl/math/s_atanf_3u.c create mode 100644 pl/math/v_atanf_3u.c create mode 100644 pl/math/vn_atanf_3u.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index baee70f..0b7a745 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -17,6 +17,7 @@ float log10f (float); double atan2 (double, double); double log10 (double); +float __s_atanf (float); float __s_atan2f (float, float); float __s_erfcf (float); float __s_erff (float); @@ -40,6 +41,7 @@ typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; #endif /* Vector functions following the base PCS. */ +__f32x4_t __v_atanf (__f32x4_t); __f64x2_t __v_atan (__f64x2_t); __f32x4_t __v_atan2f (__f32x4_t, __f32x4_t); __f64x2_t __v_atan2 (__f64x2_t, __f64x2_t); @@ -54,6 +56,7 @@ __f64x2_t __v_log10 (__f64x2_t); #define __vpcs __attribute__((__aarch64_vector_pcs__)) /* Vector functions following the vector PCS. */ +__vpcs __f32x4_t __vn_atanf (__f32x4_t); __vpcs __f64x2_t __vn_atan (__f64x2_t); __vpcs __f32x4_t __vn_atan2f (__f32x4_t, __f32x4_t); __vpcs __f64x2_t __vn_atan2 (__f64x2_t, __f64x2_t); @@ -65,6 +68,7 @@ __vpcs __f32x4_t __vn_log10f (__f32x4_t); __vpcs __f64x2_t __vn_log10 (__f64x2_t); /* Vector functions following the vector PCS using ABI names. */ +__vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t); __vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t); __vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t); diff --git a/pl/math/s_atanf_3u.c b/pl/math/s_atanf_3u.c new file mode 100644 index 0000000..4e8a2f7 --- /dev/null +++ b/pl/math/s_atanf_3u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_atanf_3u.c" diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index f1455aa..0f8e0ca 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -5,6 +5,7 @@ * Copyright (c) 2022, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +F (atanf, -10.0, 10.0) {"atan2f", 'f', 0, -10.0, 10.0, {.f = atan2f_wrap}}, F (erfcf, -4.0, 10.0) F (erff, -4.0, 4.0) @@ -17,6 +18,7 @@ D (erfc, -6.0, 28.0) D (log10, 0.01, 11.1) #if WANT_VMATH +F (__s_atanf, -10.0, 10.0) D (__s_atan, -10.0, 10.0) {"__s_atan2f", 'f', 0, -10.0, 10.0, {.f = __s_atan2f_wrap}}, {"__s_atan2", 'd', 0, -10.0, 10.0, {.d = __s_atan2_wrap}}, @@ -27,6 +29,7 @@ D (__s_erfc, -6.0, 28.0) F (__s_log10f, 0.01, 11.1) D (__s_log10, 0.01, 11.1) #if __aarch64__ +VF (__v_atanf, -10.0, 10.0) VD (__v_atan, -10.0, 10.0) {"__v_atan2f", 'f', 'v', -10.0, 10.0, {.vf = __v_atan2f_wrap}}, {"__v_atan2", 'd', 'v', -10.0, 10.0, {.vd = __v_atan2_wrap}}, @@ -37,6 +40,9 @@ VD (__v_erfc, -6.0, 28.0) VD (__v_log10, 0.01, 11.1) VF (__v_log10f, 0.01, 11.1) #ifdef __vpcs +VNF (__vn_atanf, -10.0, 10.0) +VNF (_ZGVnN4v_atanf, -10.0, 10.0) + VND (__vn_atan, -10.0, 10.0) VND (_ZGVnN2v_atan, -10.0, 10.0) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 66ac3bf..674d718 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -169,6 +169,14 @@ range_atan2f=' 1e6 1e32 40000 ' +range_atanf=' + -10.0 10.0 50000 + -1.0 1.0 40000 + 0.0 1.0 40000 + 1.0 100.0 40000 + 1e6 1e32 40000 +' + # error limits L_erfc=3.7 L_erfcf=1.0 @@ -179,6 +187,7 @@ L_erff=1.5 L_atan2=2.9 L_atan=3.0 L_atan2f=3.0 +L_atanf=3.0 while read G F R do @@ -218,6 +227,10 @@ log10 __v_log10 $runv log10 __vn_log10 $runvn log10 _ZGVnN2v_log10 $runvn +atanf __s_atanf $runs +atanf __v_atanf $runv +atanf __vn_atanf $runvn +atanf _ZGVnN4v_atanf $runvn atan2f __s_atan2f $runs atan2f __v_atan2f $runv atan2f __vn_atan2f $runvn diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 5bc7411..40fae51 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -12,6 +12,7 @@ D2 (atan2) D1 (erfc) D1 (log10) #if WANT_VMATH +F (__s_atanf, __s_atanf, atan, mpfr_atan, 1, 1, f1, 0) F (__s_atan, __s_atan, atanl, mpfr_atan, 1, 0, d1, 0) F (__s_atan2f, __s_atan2f, atan2, mpfr_atan2, 2, 1, f2, 0) F (__s_atan2, __s_atan2, atan2l, mpfr_atan2, 2, 0, d2, 0) @@ -22,6 +23,7 @@ F (__s_erfc, __s_erfc, erfcl, mpfr_erfc, 1, 0, d1, 0) F (__s_log10f, __s_log10f, log10, mpfr_log10, 1, 1, f1, 0) F (__s_log10, __s_log10, log10l, mpfr_log10, 1, 0, d1, 0) #if __aarch64__ +F (__v_atanf, v_atanf, atan, mpfr_atan, 1, 1, f1, 1) F (__v_atan, v_atan, atanl, mpfr_atan, 1, 0, d1, 1) F (__v_atan2f, v_atan2f, atan2, mpfr_atan2, 2, 1, f2, 1) F (__v_atan2, v_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1) @@ -32,6 +34,7 @@ F (__v_erfc, v_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) F (__v_log10f, v_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (__v_log10, v_log10, log10l, mpfr_log10, 1, 0, d1, 1) #ifdef __vpcs +F (__vn_atanf, vn_atanf, atan, mpfr_atan, 1, 1, f1, 1) F (__vn_atan, vn_atan, atanl, mpfr_atan, 1, 0, d1, 1) F (__vn_atan2f, vn_atan2f, atan2, mpfr_atan2, 2, 1, f2, 1) F (__vn_atan2, vn_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1) @@ -41,6 +44,7 @@ F (__vn_erfcf, vn_erfcf, erfc, mpfr_erfc, 1, 1, f1, 1) F (__vn_erfc, vn_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) F (__vn_log10f, vn_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (__vn_log10, vn_log10, log10l, mpfr_log10, 1, 0, d1, 1) +F (_ZGVnN4v_atanf, Z_atanf, atan, mpfr_atan, 1, 1, f1, 1) F (_ZGVnN2v_atan, Z_atan, atanl, mpfr_atan, 1, 0, d1, 1) F (_ZGVnN4vv_atan2f, Z_atan2f, atan2, mpfr_atan2, 2, 1, f2, 1) F (_ZGVnN2vv_atan2, Z_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 
0603e45..fa4ba4c 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -14,6 +14,7 @@ static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y, /* Wrappers for vector functions. */ #if __aarch64__ && WANT_VMATH +static float v_atanf(float x) { return __v_atanf(argf(x))[0]; } static float v_atan2f(float x, float y) { return __v_atan2f(argf(x), argf(y))[0]; } static float v_erff(float x) { return __v_erff(argf(x))[0]; } static float v_erfcf(float x) { return __v_erfcf(argf(x))[0]; } @@ -24,6 +25,7 @@ static double v_erf(double x) { return __v_erf(argd(x))[0]; } static double v_erfc(double x) { return __v_erfc(argd(x))[0]; } static double v_log10(double x) { return __v_log10(argd(x))[0]; } #ifdef __vpcs +static float vn_atanf(float x) { return __vn_atanf(argf(x))[0]; } static float vn_atan2f(float x, float y) { return __vn_atan2f(argf(x), argf(y))[0]; } static float vn_erff(float x) { return __vn_erff(argf(x))[0]; } static float vn_erfcf(float x) { return __vn_erfcf(argf(x))[0]; } @@ -34,6 +36,7 @@ static double vn_erf(double x) { return __vn_erf(argd(x))[0]; } static double vn_erfc(double x) { return __vn_erfc(argd(x))[0]; } static double vn_log10(double x) { return __vn_log10(argd(x))[0]; } +static float Z_atanf(float x) { return _ZGVnN4v_atanf(argf(x))[0]; } static float Z_atan2f(float x, float y) { return _ZGVnN4vv_atan2f(argf(x), argf(y))[0]; } static float Z_erff(float x) { return _ZGVnN4v_erff(argf(x))[0]; } static float Z_erfcf(float x) { return _ZGVnN4v_erfcf(argf(x))[0]; } diff --git a/pl/math/v_atanf_3u.c b/pl/math/v_atanf_3u.c new file mode 100644 index 0000000..7c84244 --- /dev/null +++ b/pl/math/v_atanf_3u.c @@ -0,0 +1,49 @@ +/* + * Single-precision vector atan(x) function. + * + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#if V_SUPPORTED + +#include "atanf_common.h" + +#define PiOver2 v_f32 (0x1.921fb6p+0f) +#define AbsMask v_u32 (0x7fffffff) + +/* Fast implementation of vector atanf based on + atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] + using z=-1/x and shift = pi/2. Maximum observed error is 2.9ulps: + v_atanf(0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1. */ +VPCS_ATTR +v_f32_t V_NAME (atanf) (v_f32_t x) +{ + /* No need to trigger special case. Small cases, infs and nans + are supported by our approximation technique. */ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t sign = ix & ~AbsMask; + + /* Argument reduction: + y := arctan(x) for x < 1 + y := pi/2 + arctan(-1/x) for x > 1 + Hence, use z=-1/a if x>=1, otherwise z=a. */ + v_u32_t red = v_cagt_f32 (x, v_f32 (1.0)); + /* Avoid dependency in abs(x) in division (and comparison). */ + v_f32_t z = v_sel_f32 (red, v_div_f32 (v_f32 (-1.0f), x), x); + v_f32_t shift = v_sel_f32 (red, PiOver2, v_f32 (0.0f)); + /* Use absolute value only when needed (odd powers of z). */ + v_f32_t az = v_abs_f32 (z); + az = v_sel_f32 (red, -az, az); + + /* Calculate the polynomial approximation. */ + v_f32_t y = eval_poly (z, az, shift); + + /* y = atan(x) if x>0, -atan(-x) otherwise. */ + y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign); + + return y; +} +VPCS_ALIAS +#endif diff --git a/pl/math/vn_atanf_3u.c b/pl/math/vn_atanf_3u.c new file mode 100644 index 0000000..17ba6b8 --- /dev/null +++ b/pl/math/vn_atanf_3u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_atanf. + * + * Copyright (c) 2021-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_atanf, _ZGVnN4v_atanf) +#include "v_atanf_3u.c" +#endif -- cgit v1.2.3 From 7f8252e7a0302c08f88a30fedebe7333662f1c01 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Mon, 20 Jun 2022 16:43:08 +0100 Subject: pl/math: Improve accuracy in log10 Increase polynomial order to 12, and update summation scheme to match AOR log. New coefficients are copied from AOR log. --- pl/math/log10_2u.c | 145 +++++++++++++++++++++++++++++++++++++++++++++++++ pl/math/log10_2u1.c | 135 --------------------------------------------- pl/math/log10_data.c | 39 +++++++------ pl/math/math_config.h | 2 +- pl/math/test/runulp.sh | 2 +- 5 files changed, 168 insertions(+), 155 deletions(-) create mode 100644 pl/math/log10_2u.c delete mode 100644 pl/math/log10_2u1.c diff --git a/pl/math/log10_2u.c b/pl/math/log10_2u.c new file mode 100644 index 0000000..3330389 --- /dev/null +++ b/pl/math/log10_2u.c @@ -0,0 +1,145 @@ +/* + * Double-precision log10(x) function. + * + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include +#include +#include + +/* Polynomial coefficients and lookup tables. */ +#define T __log10_data.tab +#define T2 __log10_data.tab2 +#define B __log10_data.poly1 +#define A __log10_data.poly +#define Ln2hi __log10_data.ln2hi +#define Ln2lo __log10_data.ln2lo +#define InvLn10 __log10_data.invln10 +#define N (1 << LOG10_TABLE_BITS) +#define OFF 0x3fe6000000000000 +#define LO asuint64 (1.0 - 0x1p-4) +#define HI asuint64 (1.0 + 0x1.09p-4) + +/* Top 16 bits of a double. */ +static inline uint32_t +top16 (double x) +{ + return asuint64 (x) >> 48; +} + +/* Fast and low accuracy implementation of log10. + The implementation is similar to that of math/log, except that: + - Polynomials are computed for log10(1+r) with r on same intervals as log. + - Lookup parameters are scaled (at runtime) to switch from base e to base 10. + Many errors above 1.59 ulp are observed across the whole range of doubles. + The greatest observed error is 1.61 ulp, at around 0.965: + log10(0x1.dc8710333a29bp-1) got -0x1.fee26884905a6p-6 + want -0x1.fee26884905a8p-6. */ +double +log10 (double x) +{ + /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ + double_t w, z, r, r2, r3, y, invc, logc, kd, hi, lo; + uint64_t ix, iz, tmp; + uint32_t top; + int k, i; + + ix = asuint64 (x); + top = top16 (x); + + if (unlikely (ix - LO < HI - LO)) + { + /* Handle close to 1.0 inputs separately. */ + /* Fix sign of zero with downward rounding when x==1. */ + if (WANT_ROUNDING && unlikely (ix == asuint64 (1.0))) + return 0; + r = x - 1.0; + r2 = r * r; + r3 = r * r2; + y = r3 + * (B[1] + r * B[2] + r2 * B[3] + + r3 + * (B[4] + r * B[5] + r2 * B[6] + + r3 * (B[7] + r * B[8] + r2 * B[9] + r3 * B[10]))); + /* Worst-case error is around 0.507 ULP. */ + w = r * 0x1p27; + double_t rhi = r + w - w; + double_t rlo = r - rhi; + w = rhi * rhi * B[0]; + hi = r + w; + lo = r - hi + w; + lo += B[0] * rlo * (rhi + r); + y += lo; + y += hi; + /* Scale by 1/ln(10). Polynomial already contains scaling. */ + y = y * InvLn10; + + return eval_as_double (y); + } + if (unlikely (top - 0x0010 >= 0x7ff0 - 0x0010)) + { + /* x < 0x1p-1022 or inf or nan. */ + if (ix * 2 == 0) + return __math_divzero (1); + if (ix == asuint64 (INFINITY)) /* log10(inf) == inf. 
*/ + return x; + if ((top & 0x8000) || (top & 0x7ff0) == 0x7ff0) + return __math_invalid (x); + /* x is subnormal, normalize it. */ + ix = asuint64 (x * 0x1p52); + ix -= 52ULL << 52; + } + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = ix - OFF; + i = (tmp >> (52 - LOG10_TABLE_BITS)) % N; + k = (int64_t) tmp >> 52; /* arithmetic shift. */ + iz = ix - (tmp & 0xfffULL << 52); + invc = T[i].invc; + logc = T[i].logc; + z = asdouble (iz); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ + /* r ~= z/c - 1, |r| < 1/(2*N). */ +#if HAVE_FAST_FMA + /* rounding error: 0x1p-55/N. */ + r = fma (z, invc, -1.0); +#else + /* rounding error: 0x1p-55/N + 0x1p-66. */ + r = (z - T2[i].chi - T2[i].clo) * invc; +#endif + kd = (double_t) k; + + /* w = log(c) + k*Ln2hi. */ + w = kd * Ln2hi + logc; + hi = w + r; + lo = w - hi + r + kd * Ln2lo; + + /* log10(x) = (w + r)/log(10) + (log10(1+r) - r/log(10)). */ + r2 = r * r; /* rounding error: 0x1p-54/N^2. */ + + /* Scale by 1/ln(10). Polynomial already contains scaling. */ + y = lo + r2 * A[0] + r * r2 * (A[1] + r * A[2] + r2 * (A[3] + r * A[4])) + hi; + y = y * InvLn10; + + return eval_as_double (y); +} + +// clang-format off +#if USE_GLIBC_ABI +strong_alias (log10, __log10_finite) +hidden_alias (log10, __ieee754_log10) +#if LDBL_MANT_DIG == 53 +long double +log10l (long double x) +{ + return log10 (x); +} +#endif +#endif +// clang-format on diff --git a/pl/math/log10_2u1.c b/pl/math/log10_2u1.c deleted file mode 100644 index 29860ab..0000000 --- a/pl/math/log10_2u1.c +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Double-precision log10(x) function. - * - * Copyright (c) 2020-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" -#include -#include -#include - -/* Polynomial coefficients and lookup tables. */ -#define T __log10_data.tab -#define T2 __log10_data.tab2 -#define B __log10_data.poly1 -#define A __log10_data.poly -#define Ln2hi __log10_data.ln2hi -#define Ln2lo __log10_data.ln2lo -#define InvLn10 __log10_data.invln10 -#define N (1 << LOG10_TABLE_BITS) -#define OFF 0x3fe6000000000000 -#define LO asuint64 (1.0 - 0x1p-5) -#define HI asuint64 (1.0 + 0x1.1p-5) - -/* Top 16 bits of a double. */ -static inline uint32_t -top16 (double x) -{ - return asuint64 (x) >> 48; -} - -/* Fast and low accuracy implementation of log10. - The implementation is similar to that of math/log, except that: - - Polynomials are computed for log10(1+r) with r on same intervals as log. - - Lookup parameters are scaled (at runtime) to switch from base e to base 10. - Max ULP error: < 1.7 ulp (nearest rounding.) - with (LOG10_POLY1_ORDER = 10, LOG10_POLY_ORDER = 6, N = 128) - Many errors above 2.08 ulp are observed across the whole range of doubles. - The greatest observed error is 2.09 ulp, at around 2.66e-127: - log10(0x1.713b77689f011p-421) got -0x1.fa4c5bacfbe41p+6 - want -0x1.fa4c5bacfbe43p+6. */ -double -log10 (double x) -{ - /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ - double_t w, z, r, r2, r3, y, invc, logc, kd; - uint64_t ix, iz, tmp; - uint32_t top; - int k, i; - - ix = asuint64 (x); - top = top16 (x); - - if (unlikely (ix - LO < HI - LO)) - { - /* Handle close to 1.0 inputs separately. */ - /* Fix sign of zero with downward rounding when x==1. 
*/ - if (WANT_ROUNDING && unlikely (ix == asuint64 (1.0))) - return 0; - r = x - 1.0; - r2 = r * r; - r3 = r * r2; - /* Worst-case error is around 0.727 ULP. */ - y = r3 - * (B[1] + r * B[2] + r2 * B[3] - + r3 * (B[4] + r * B[5] + r2 * B[6] + r3 * (B[7] + r * B[8]))); - w = B[0] * r2; /* B[0] == -0.5. */ - /* Scale by 1/ln(10). Polynomial already contains scaling. */ - y = (y + w) + r * InvLn10; - - return eval_as_double (y); - } - if (unlikely (top - 0x0010 >= 0x7ff0 - 0x0010)) - { - /* x < 0x1p-1022 or inf or nan. */ - if (ix * 2 == 0) - return __math_divzero (1); - if (ix == asuint64 (INFINITY)) /* log10(inf) == inf. */ - return x; - if ((top & 0x8000) || (top & 0x7ff0) == 0x7ff0) - return __math_invalid (x); - /* x is subnormal, normalize it. */ - ix = asuint64 (x * 0x1p52); - ix -= 52ULL << 52; - } - - /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. - The range is split into N subintervals. - The ith subinterval contains z and c is near its center. */ - tmp = ix - OFF; - i = (tmp >> (52 - LOG10_TABLE_BITS)) % N; - k = (int64_t) tmp >> 52; /* arithmetic shift. */ - iz = ix - (tmp & 0xfffULL << 52); - invc = T[i].invc; - logc = T[i].logc; - z = asdouble (iz); - - /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ - /* r ~= z/c - 1, |r| < 1/(2*N). */ -#if HAVE_FAST_FMA - /* rounding error: 0x1p-55/N. */ - r = fma (z, invc, -1.0); -#else - /* rounding error: 0x1p-55/N + 0x1p-66. */ - r = (z - T2[i].chi - T2[i].clo) * invc; -#endif - kd = (double_t) k; - - /* w = log(c) + k*Ln2hi. */ - w = kd * Ln2hi + logc; - - /* log10(x) = (w + r)/log(10) + (log10(1+r) - r/log(10)). */ - r2 = r * r; /* rounding error: 0x1p-54/N^2. */ - y = r2 * A[0] + r * r2 * (A[1] + r * A[2] + r2 * (A[3] + r * A[4])); - - /* Scale by 1/ln(10). Polynomial already contains scaling. 
*/ - y = y + ((r + kd * Ln2lo) + w) * InvLn10; - - return eval_as_double (y); -} - -// clang-format off -#if USE_GLIBC_ABI -strong_alias (log10, __log10_finite) -hidden_alias (log10, __ieee754_log10) -#if LDBL_MANT_DIG == 53 -long double -log10l (long double x) -{ - return log10 (x); -} -#endif -#endif -// clang-format on diff --git a/pl/math/log10_data.c b/pl/math/log10_data.c index e844203..e02e9b1 100644 --- a/pl/math/log10_data.c +++ b/pl/math/log10_data.c @@ -14,29 +14,32 @@ const struct log10_data __log10_data = { .ln2lo = 0x1.ef35793c76730p-45, .invln10 = 0x1.bcb7b1526e50ep-2, .poly1 = { -#if LOG10_POLY1_ORDER == 10 -// relative error: 0x1.d34d5238p-63 -// in -0x1p-5 0x1.1p-5 (|log10(1+x)| > 0x1p-5 outside this interval) --0x1.bcb7b1526e50ep-3, -0x1.287a7636f4314p-3, --0x1.bcb7b1526eeebp-4, -0x1.63c62776b50e6p-4, --0x1.287a76329b69dp-4, -0x1.fc3f7e81f44c2p-5, --0x1.bcb7b7893672ap-5, -0x1.8c0fa601b4779p-5, --0x1.64403e39d7278p-5, +#if LOG10_POLY1_ORDER == 12 +// relative error: 0x1.c04d76cp-63 +// in -0x1p-4 0x1.09p-4 (|log(1+x)| > 0x1p-4 outside the interval) +-0x1p-1, +0x1.5555555555577p-2, +-0x1.ffffffffffdcbp-3, +0x1.999999995dd0cp-3, +-0x1.55555556745a7p-3, +0x1.24924a344de3p-3, +-0x1.fffffa4423d65p-4, +0x1.c7184282ad6cap-4, +-0x1.999eb43b068ffp-4, +0x1.78182f7afd085p-4, +-0x1.5521375d145cdp-4, #endif }, .poly = { #if N == 128 && LOG10_POLY_ORDER == 6 -// relative error: 0x1.29fc52bp-56 +// relative error: 0x1.926199e8p-56 +// abs error: 0x1.882ff33p-65 // in -0x1.fp-9 0x1.fp-9 --0x1.bcb7b1526e50fp-3, -0x1.287a7636c4076p-3, --0x1.bcb7b151bffaep-4, -0x1.63c77372810dep-4, --0x1.287bdeec963c2p-4, +-0x1.0000000000001p-1, +0x1.555555551305bp-2, +-0x1.fffffffeb459p-3, +0x1.999b324f10111p-3, +-0x1.55575e506c89fp-3, #endif }, /* Algorithm: diff --git a/pl/math/math_config.h b/pl/math/math_config.h index 47b192c..231d6bc 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -336,7 +336,7 @@ extern const struct logf_data /* Data for low accuracy log10 (with 1/ln(10) included in coefficients). */ #define LOG10_TABLE_BITS 7 #define LOG10_POLY_ORDER 6 -#define LOG10_POLY1_ORDER 10 +#define LOG10_POLY1_ORDER 12 extern const struct log10_data { double ln2hi; diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 674d718..17b13ae 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -48,7 +48,7 @@ t log10f 0x1p-26 0x1p3 50000 t log10f 0x1p-4 0x1p4 50000 t log10f 0 inf 50000 -L=1.6 +L=1.15 Ldir= t log10 0 0xffff000000000000 10000 t log10 0x1p-4 0x1p4 40000 -- cgit v1.2.3 From 90510ecbb921d79799e7d62eda43696f25b8e51e Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Wed, 22 Jun 2022 10:00:21 +0100 Subject: pl/math: Use single-precision fma in atan2f The polynomial was mistakenly using double-precision fma, where single is sufficiently accurate. New underflow special cases have been handled accordingly. --- pl/math/atan2f_3u.c | 39 ++++++++++++++++++++++++++++----------- pl/math/atanf_common.h | 4 ++-- 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/pl/math/atan2f_3u.c b/pl/math/atan2f_3u.c index 7d83b67..d2f1749 100644 --- a/pl/math/atan2f_3u.c +++ b/pl/math/atan2f_3u.c @@ -15,6 +15,12 @@ #define PiOver4 (0x1.921fb6p-1f) #define SignMask (0x80000000) +/* We calculate atan2f by P(n/d), where n and d are similar to the input + arguments, and P is a polynomial. The polynomial may underflow. + POLY_UFLOW_BOUND is the lower bound of the difference in exponents of n and d + for which P underflows, and is used to special-case such inputs. 
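+ (With exponent difference e, z = n/d is roughly 2^-e, so z^3 * P(z^2) is about 2^-2e times smaller than z; once e >= 24 that term is below one ulp of z in single precision and the sum rounds to z.)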
*/ +#define POLY_UFLOW_BOUND 24 + static inline int32_t biased_exponent (float f) { @@ -65,10 +71,10 @@ atan2f (float y, float x) int32_t exp_diff = biased_exponent (x) - biased_exponent (y); /* Special case for (x, y) either on or very close to the x axis. Either y = - 0, or y is tiny and x is huge (difference in exponents >= 126). In the - second case, we only want to use this special case when x is negative (i.e. - quadrants 2 or 3). */ - if (unlikely (iay == 0 || (exp_diff >= 126 && m >= 2))) + 0, or y is tiny and x is huge (difference in exponents >= + POLY_UFLOW_BOUND). In the second case, we only want to use this special + case when x is negative (i.e. quadrants 2 or 3). */ + if (unlikely (iay == 0 || (exp_diff >= POLY_UFLOW_BOUND && m >= 2))) { switch (m) { @@ -82,8 +88,9 @@ atan2f (float y, float x) } } /* Special case for (x, y) either on or very close to the y axis. Either x = - 0, or x is tiny and y is huge (difference in exponents >= 126). */ - if (unlikely (iax == 0 || exp_diff <= -126)) + 0, or x is tiny and y is huge (difference in exponents >= + POLY_UFLOW_BOUND). */ + if (unlikely (iax == 0 || exp_diff <= -POLY_UFLOW_BOUND)) return sign_y ? -PiOver2 : PiOver2; /* x is INF. */ @@ -134,12 +141,22 @@ atan2f (float y, float x) float d = pred_aygtax ? ay : ax; float z = n / d; - /* Work out the correct shift. */ - float shift = sign_x ? -2.0f : 0.0f; - shift = pred_aygtax ? shift + 1.0f : shift; - shift *= PiOver2; + float ret; + if (unlikely (m < 2 && exp_diff >= POLY_UFLOW_BOUND)) + { + /* If (x, y) is very close to x axis and x is positive, the polynomial + will underflow and evaluate to z. */ + ret = z; + } + else + { + /* Work out the correct shift. */ + float shift = sign_x ? -2.0f : 0.0f; + shift = pred_aygtax ? shift + 1.0f : shift; + shift *= PiOver2; - float ret = eval_poly (z, z, shift); + ret = eval_poly (z, z, shift); + } /* Account for the sign of x and y. */ return asfloat (asuint (ret) ^ sign_xy); diff --git a/pl/math/atanf_common.h b/pl/math/atanf_common.h index 55cee89..436b88b 100644 --- a/pl/math/atanf_common.h +++ b/pl/math/atanf_common.h @@ -21,8 +21,8 @@ #else -#define FLT_T double -#define FMA fma +#define FLT_T float +#define FMA fmaf #define P(i) __atanf_poly_data.poly[i] #endif -- cgit v1.2.3 From 3e6f95c1adfc784fbb5746dc5156df3f120222b0 Mon Sep 17 00:00:00 2001 From: Victor Do Nascimento Date: Wed, 8 Jun 2022 12:16:36 +0100 Subject: string: Fix header file issue in strlen test Remove unnecessary sys/mman.h dependency. --- string/test/strlen.c | 1 - 1 file changed, 1 deletion(-) diff --git a/string/test/strlen.c b/string/test/strlen.c index 68c51b1..0c20018 100644 --- a/string/test/strlen.c +++ b/string/test/strlen.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include "mte.h" #include "stringlib.h" -- cgit v1.2.3 From ca02337e93b28f6a9442f479ee019d553996fbd1 Mon Sep 17 00:00:00 2001 From: Victor Do Nascimento Date: Wed, 8 Jun 2022 12:19:29 +0100 Subject: string: Fix header file issue in arm strcmp-armv6m.S Fix missing include directive for use of ENTRY_ALIGN and END macros. 
--- string/arm/strcmp-armv6m.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/string/arm/strcmp-armv6m.S b/string/arm/strcmp-armv6m.S index 8b11175..0e49d09 100644 --- a/string/arm/strcmp-armv6m.S +++ b/string/arm/strcmp-armv6m.S @@ -5,6 +5,8 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +#include "../asmdefs.h" + #if __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1 .thumb_func -- cgit v1.2.3 From 40b662ce7b65d5eaefa40fd8046d6f3c6b3238c1 Mon Sep 17 00:00:00 2001 From: Victor Do Nascimento Date: Wed, 22 Jun 2022 14:56:00 +0100 Subject: string: add .fnstart and .fnend directives to ENTRY/END macros Modify the ENTRY_ALIGN and END assembler macros to mark the start and end of functions for arm unwind tables. Enables the pacbti epilogue function to emit .save{} directives for stack unwinding. --- string/asmdefs.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/string/asmdefs.h b/string/asmdefs.h index 0d6ebd7..e84626f 100644 --- a/string/asmdefs.h +++ b/string/asmdefs.h @@ -8,6 +8,11 @@ #ifndef _ASMDEFS_H #define _ASMDEFS_H +#if defined (__arm__) +#define ARM_FNSTART .fnstart +#define ARM_FNEND .fnend +#endif + #if defined(__aarch64__) /* Branch Target Identitication support. */ @@ -52,6 +57,7 @@ GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) .type name,%function; \ .align alignment; \ name: \ + ARM_FNSTART; \ .cfi_startproc; \ BTI_C; @@ -64,6 +70,7 @@ GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) .type name,%function; \ .align alignment; \ name: \ + ARM_FNSTART; \ .cfi_startproc; #endif @@ -77,6 +84,7 @@ GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) #define END(name) \ .cfi_endproc; \ + ARM_FNEND; \ .size name, .-name; #define L(l) .L ## l -- cgit v1.2.3 From 7b5e5cb19d5f2d1c09d6196a4acfbd8a943e1f36 Mon Sep 17 00:00:00 2001 From: Victor Do Nascimento Date: Wed, 22 Jun 2022 14:59:37 +0100 Subject: string: Add M-profile PACBTI-enablement header file Header adds assembler macro to handle Pointer Authentication and Branch Target Identification assembly instructions in function prologues and epilogues according to selected flags at compile-time. --- string/pacbti.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 string/pacbti.h diff --git a/string/pacbti.h b/string/pacbti.h new file mode 100644 index 0000000..4b6e7df --- /dev/null +++ b/string/pacbti.h @@ -0,0 +1,36 @@ +/* + * Macros for pacbti asm code. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* Macro to handle function entry depending on branch-protection + schemes */ + .macro pacbti_prologue +#if __ARM_FEATURE_PAC_DEFAULT +#if __ARM_FEATURE_BTI_DEFAULT + pacbti ip, lr, sp +#else + pac ip, lr, sp +#endif /* __ARM_FEATURE_BTI_DEFAULT */ + str ip, [sp, #-4]! 
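+ /* ip now holds the return-address authentication code; it is kept on the stack so the matching epilogue can authenticate lr before returning. */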
+ .save {ra_auth_code} + .cfi_def_cfa_offset 4 + .cfi_offset 143, -4 +#elif __ARM_FEATURE_BTI_DEFAULT + bti +#endif /* __ARM_FEATURE_PAC_DEFAULT */ + .endm + +/* Macro to handle different branch exchange cases depending on + branch-protection schemes */ + .macro pacbti_epilogue +#if __ARM_FEATURE_PAC_DEFAULT + ldr ip, [sp], #4 + .cfi_restore 143 + .cfi_def_cfa_offset 0 + aut ip, lr, sp +#endif /* __ARM_FEATURE_PAC_DEFAULT */ + bx lr + .endm -- cgit v1.2.3 From fa00b1bde8444483823723958fed97ae91775437 Mon Sep 17 00:00:00 2001 From: Victor Do Nascimento Date: Wed, 22 Jun 2022 15:04:08 +0100 Subject: string: Add M-profile PACBTI implementation of strcmp Ensure BTI indirect branch landing pads (BTI) and pointer authentication code generation (PAC) and verification instructions (BXAUT) are conditionally added to assembly when branch protection is requested. NOTE: ENTRY_ALIGN() macro factored out, as the .fnstart & .cfi_startproc directives needed to be moved prior to L(fastpath_exit). --- string/arm/strcmp.S | 45 +++++++++++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/string/arm/strcmp.S b/string/arm/strcmp.S index 622efb9..db96cc0 100644 --- a/string/arm/strcmp.S +++ b/string/arm/strcmp.S @@ -13,6 +13,7 @@ the compares. */ #include "../asmdefs.h" +#include "../pacbti.h" /* Build Options: STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first @@ -98,8 +99,9 @@ ldrd r4, r5, [sp], #16 .cfi_restore 4 .cfi_restore 5 + .cfi_adjust_cfa_offset -16 sub result, result, r1, lsr #24 - bx lr + pacbti_epilogue #else /* To use the big-endian trick we'd have to reverse all three words. that's slower than this approach. */ @@ -119,21 +121,28 @@ ldrd r4, r5, [sp], #16 .cfi_restore 4 .cfi_restore 5 + .cfi_adjust_cfa_offset -16 sub result, result, r1 - bx lr + pacbti_epilogue #endif .endm .p2align 5 L(strcmp_start_addr): + .fnstart + .cfi_startproc #if STRCMP_NO_PRECHECK == 0 L(fastpath_exit): sub r0, r2, r3 - bx lr + pacbti_epilogue nop #endif -ENTRY_ALIGN (__strcmp_arm, 0) + .global __strcmp_arm + .type __strcmp_arm,%function + .align 0 +__strcmp_arm: + pacbti_prologue #if STRCMP_NO_PRECHECK == 0 ldrb r2, [src1] ldrb r3, [src2] @@ -143,13 +152,25 @@ ENTRY_ALIGN (__strcmp_arm, 0) bne L(fastpath_exit) #endif strd r4, r5, [sp, #-16]! - .cfi_def_cfa_offset 16 + .save {r4, r5} + .cfi_adjust_cfa_offset 16 +#ifdef __ARM_FEATURE_PAC_DEFAULT + .cfi_offset 4, -20 + .cfi_offset 5, -16 +#else .cfi_offset 4, -16 .cfi_offset 5, -12 +#endif /* __ARM_FEATURE_PAC_DEFAULT */ orr tmp1, src1, src2 strd r6, r7, [sp, #8] + .save {r6, r7} +#ifdef __ARM_FEATURE_PAC_DEFAULT + .cfi_offset 6, -12 + .cfi_offset 7, -8 +#else .cfi_offset 6, -8 .cfi_offset 7, -4 +#endif /* __ARM_FEATURE_PAC_DEFAULT */ mvn const_m1, #0 lsl r2, tmp1, #29 cbz r2, L(loop_aligned8) @@ -318,7 +339,9 @@ L(misaligned_exit): mov result, tmp1 ldr r4, [sp], #16 .cfi_restore 4 + .cfi_adjust_cfa_offset -16 + + pacbti_epilogue #if STRCMP_NO_PRECHECK == 0 L(aligned_m1): @@ -368,9 +391,9 @@ L(overlap3): /* R6/7 Not used in this sequence. */ .cfi_restore 6 .cfi_restore 7 + .cfi_adjust_cfa_offset -16 neg result, result - bx lr - + pacbti_epilogue 6: .cfi_restore_state S2LO data1, data1, #24 @@ -445,7 +468,8 @@ L(strcmp_done_equal): /* R6/7 not used in this sequence. */ .cfi_restore 6 .cfi_restore 7 + .cfi_adjust_cfa_offset -16 - bx lr + pacbti_epilogue L(strcmp_tail): .cfi_restore_state @@ -467,8 +491,9 @@ L(strcmp_tail): /* R6/7 not used in this sequence. 
*/ .cfi_restore 6 .cfi_restore 7 + .cfi_adjust_cfa_offset -16 sub result, result, data2, lsr #24 - bx lr + pacbti_epilogue END (__strcmp_arm) -- cgit v1.2.3 From f931b0e187330bdd38dd0149cc16d9a103502936 Mon Sep 17 00:00:00 2001 From: Victor Do Nascimento Date: Wed, 22 Jun 2022 15:06:02 +0100 Subject: string: Add M-profile PACBTI implementation of strlen Ensure BTI indirect branch landing pads (BTI) and pointer authentication code generation (PAC) and verification instructions (BXAUT) are conditionally added to assembly when branch protection is requested. --- string/arm/strlen-armv6t2.S | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/string/arm/strlen-armv6t2.S b/string/arm/strlen-armv6t2.S index 01ebf1d..5beaa7d 100644 --- a/string/arm/strlen-armv6t2.S +++ b/string/arm/strlen-armv6t2.S @@ -14,6 +14,7 @@ */ #include "../asmdefs.h" +#include "../pacbti.h" #ifdef __ARMEB__ #define S2LO lsl #define S2HI lsr @@ -41,8 +42,31 @@ #define tmp2 r5 ENTRY (__strlen_armv6t2) + /* common pacbti_prologue macro from pacbti.h not used. + handwritten prologue saves one push instruction. */ +#if __ARM_FEATURE_PAC_DEFAULT +#if __ARM_FEATURE_BTI_DEFAULT + pacbti ip, lr, sp +#else + pac ip, lr, sp +#endif /* __ARM_FEATURE_BTI_DEFAULT */ + push {r4, r5, ip} + .save {r4, r5, ra_auth_code} + .cfi_def_cfa_offset 12 + .cfi_offset 143, -4 + .cfi_offset 4, -8 + .cfi_offset 5, -12 +#else +#if __ARM_FEATURE_BTI_DEFAULT + bti +#endif /* __ARM_FEATURE_BTI_DEFAULT */ + push {r4, r5} + .save {r4, r5} + .cfi_def_cfa_offset 8 + .cfi_offset 4, -4 + .cfi_offset 5, -8 +#endif /* __ARM_FEATURE_PAC_DEFAULT */ pld [srcin, #0] - strd r4, r5, [sp, #-8]! bic src, srcin, #7 mvn const_m1, #0 ands tmp1, srcin, #7 /* (8 - bytes) to alignment. */ pld [src, #32] bne L(misaligned8) @@ -101,8 +125,11 @@ L(null_found): #endif clz data1a, data1a ldrd r4, r5, [sp], #8 + .cfi_restore 5 + .cfi_restore 4 + .cfi_adjust_cfa_offset -8 add result, result, data1a, lsr #3 /* Bits -> Bytes. 
*/ - bx lr + pacbti_epilogue L(misaligned8): ldrd data1a, data1b, [src] -- cgit v1.2.3 From 8bf2238fc0571d673b593c1ec2f72f4179a8ec79 Mon Sep 17 00:00:00 2001 From: Victor Do Nascimento Date: Wed, 22 Jun 2022 15:07:31 +0100 Subject: string: Add M-profile PACBTI implementation of memchr Ensure BTI indirect branch landing pads (BTI) and pointer authentication code generation (PAC) and verification instructions (BXAUT) are conditionally added to assembly when branch protection is requested. --- string/arm/memchr.S | 40 +++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/string/arm/memchr.S b/string/arm/memchr.S index 1271ca1..4f7d8f4 100644 --- a/string/arm/memchr.S +++ b/string/arm/memchr.S @@ -23,7 +23,11 @@ @ Removed unneeded cbz from align loop .syntax unified +#if __ARM_ARCH_8M_MAIN__ + /* keep config inherited from -march= */ +#else .arch armv7-a +#endif @ this lets us check a flag in a 00/ff byte easily in either endianness #ifdef __ARMEB__ #define CHARTSTMASK(c) 1<<(31-(c*8)) #else #define CHARTSTMASK(c) 1<<(c*8) #endif .thumb +#include "../pacbti.h" @ --------------------------------------------------------------------------- .thumb_func .align 2 .p2align 4,,15 .global __memchr_arm .type __memchr_arm,%function + .fnstart + .cfi_startproc __memchr_arm: @ r0 = start of memory to scan @ r1 = character to look for @ r2 = length @ returns r0 = pointer to character or NULL if not found + pacbti_prologue and r1,r1,#0xff @ Don't think we can trust the caller to actually pass a char cmp r2,#16 @ If it's short don't bother with anything clever @@ -64,6 +73,19 @@ __memchr_arm: 10: @ At this point, we are aligned, we know we have at least 8 bytes to work with push {r4,r5,r6,r7} + .save {r4-r7} + .cfi_adjust_cfa_offset 16 +#ifdef __ARM_FEATURE_PAC_DEFAULT + .cfi_offset 4, -8 + .cfi_offset 5, -12 + .cfi_offset 6, -16 + .cfi_offset 7, -20 +#else + .cfi_offset 4, -4 + .cfi_offset 5, -8 + .cfi_offset 6, -12 + .cfi_offset 7, -16 +#endif /* __ARM_FEATURE_PAC_DEFAULT */ orr r1, r1, r1, lsl #8 @ expand the match word across to all bytes orr r1, r1, r1, lsl #16 bic r4, r2, #7 @ Number of double words to work with @@ -83,6 +105,11 @@ __memchr_arm: bne 15b @ (Flags from the subs above) If not run out of bytes then go around again pop {r4,r5,r6,r7} + .cfi_restore 7 + .cfi_restore 6 + .cfi_restore 5 + .cfi_restore 4 + .cfi_adjust_cfa_offset -16 and r1,r1,#0xff @ Get r1 back to a single character from the expansion above and r2,r2,#7 @ Leave the count remaining as the number after the double words have been done @@ -98,11 +125,11 @@ __memchr_arm: 40: movs r0,#0 @ not found - bx lr + pacbti_epilogue 50: subs r0,r0,#1 @ found - bx lr + pacbti_epilogue 60: @ We're here because the fast path found a hit - now we have to track down exactly which word it was @ r0 points to the start of the double word after the one that was tested @@ -126,7 +153,14 @@ __memchr_arm: 61: pop {r4,r5,r6,r7} + .cfi_restore 7 + .cfi_restore 6 + .cfi_restore 5 + .cfi_restore 4 + .cfi_adjust_cfa_offset -16 subs r0,r0,#1 - bx lr + pacbti_epilogue + .cfi_endproc + .fnend .size __memchr_arm, . 
- __memchr_arm -- cgit v1.2.3 From e373f6595230087a8ddea449bfb14b47150b4059 Mon Sep 17 00:00:00 2001 From: Szabolcs Nagy Date: Fri, 24 Jun 2022 12:59:22 +0100 Subject: string: Fix ARM_FNSTART on non-arm targets Fix build failure introduced by commit 40b662ce7b65d5eaefa40fd8046d6f3c6b3238c1 string: add .fnstart and .fnend directives to ENTRY/END macros --- string/asmdefs.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/string/asmdefs.h b/string/asmdefs.h index e84626f..d122b26 100644 --- a/string/asmdefs.h +++ b/string/asmdefs.h @@ -11,6 +11,9 @@ #if defined (__arm__) #define ARM_FNSTART .fnstart #define ARM_FNEND .fnend +#else +#define ARM_FNSTART +#define ARM_FNEND #endif #if defined(__aarch64__) -- cgit v1.2.3 From 9776149753ec2d217ecd52c1786ac73548fed8d8 Mon Sep 17 00:00:00 2001 From: Victor Do Nascimento Date: Mon, 4 Jul 2022 15:27:25 +0100 Subject: string: simplify M-profile memchr PACBTI epilogue Merge stack pop instructions prior to returning from function. This also introduces fixes to CFI offset calculations to reflect the register ordering on push and pop instructions, with the lowest-numbered register saved to the lowest memory address. --- string/arm/memchr.S | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/string/arm/memchr.S b/string/arm/memchr.S index 4f7d8f4..bc1608f 100644 --- a/string/arm/memchr.S +++ b/string/arm/memchr.S @@ -76,15 +76,15 @@ __memchr_arm: .save {r4-r7} .cfi_adjust_cfa_offset 16 #ifdef __ARM_FEATURE_PAC_DEFAULT - .cfi_offset 4, -8 - .cfi_offset 5, -12 - .cfi_offset 6, -16 - .cfi_offset 7, -20 -#else - .cfi_offset 4, -4 - .cfi_offset 5, -8 + .cfi_offset 4, -20 + .cfi_offset 5, -16 .cfi_offset 6, -12 - .cfi_offset 7, -16 + .cfi_offset 7, -8 +#else + .cfi_offset 4, -16 + .cfi_offset 5, -12 + .cfi_offset 6, -8 + .cfi_offset 7, -4 #endif /* __ARM_FEATURE_PAC_DEFAULT */ orr r1, r1, r1, lsl #8 @ expand the match word across to all bytes orr r1, r1, r1, lsl #16 @@ -152,14 +152,22 @@ __memchr_arm: addeq r0,r0,#1 61: + subs r0,r0,#1 +#if __ARM_FEATURE_PAC_DEFAULT + pop {r4,r5,r6,r7,ip} + .cfi_restore 143 +#else pop {r4,r5,r6,r7} +#endif /* __ARM_FEATURE_PAC_DEFAULT */ .cfi_restore 7 .cfi_restore 6 .cfi_restore 5 .cfi_restore 4 - .cfi_adjust_cfa_offset -16 - subs r0,r0,#1 - pacbti_epilogue + .cfi_def_cfa_offset 0 +#if __ARM_FEATURE_PAC_DEFAULT + aut ip, lr, sp +#endif /* __ARM_FEATURE_PAC_DEFAULT */ + bx lr .cfi_endproc .fnend -- cgit v1.2.3 From 94ed5b928a498bccb72d582156bf0f39c104a481 Mon Sep 17 00:00:00 2001 From: Victor Do Nascimento Date: Mon, 4 Jul 2022 15:28:03 +0100 Subject: string: simplify M-profile strlen PACBTI epilogue Merge stack pop instructions prior to returning from function. This also introduces fixes to CFI offset calculations to reflect the register ordering on push and pop instructions, with the lowest-numbered register saved to the lowest memory address. 
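Concretely, after 'push {r4, r5, ip}' on the full-descending stack the intended mapping is (a sketch of the corrected offsets; DWARF register 143 denotes ra_auth_code, i.e. the PAC held in ip):

sp + 8 : ip -> .cfi_offset 143, -4
sp + 4 : r5 -> .cfi_offset 5, -8
sp + 0 : r4 -> .cfi_offset 4, -12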
--- string/arm/strlen-armv6t2.S | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/string/arm/strlen-armv6t2.S b/string/arm/strlen-armv6t2.S index 5beaa7d..a981600 100644 --- a/string/arm/strlen-armv6t2.S +++ b/string/arm/strlen-armv6t2.S @@ -54,8 +54,8 @@ ENTRY (__strlen_armv6t2) .save {r4, r5, ra_auth_code} .cfi_def_cfa_offset 12 .cfi_offset 143, -4 - .cfi_offset 4, -8 - .cfi_offset 5, -12 + .cfi_offset 5, -8 + .cfi_offset 4, -12 #else #if __ARM_FEATURE_BTI_DEFAULT bti @@ -63,8 +63,8 @@ ENTRY (__strlen_armv6t2) push {r4, r5} .save {r4, r5} .cfi_def_cfa_offset 8 - .cfi_offset 4, -4 - .cfi_offset 5, -8 + .cfi_offset 4, -8 + .cfi_offset 5, -4 #endif /* __ARM_FEATURE_PAC_DEFAULT */ pld [srcin, #0] bic src, srcin, #7 @@ -124,12 +124,21 @@ L(null_found): rev data1a, data1a #endif clz data1a, data1a - ldrd r4, r5, [sp], #8 + add result, result, data1a, lsr #3 /* Bits -> Bytes. */ +#if __ARM_FEATURE_PAC_DEFAULT + pop {r4, r5, ip} + .cfi_restore 4 .cfi_restore 5 + .cfi_restore 143 + .cfi_def_cfa_offset 0 + aut ip, lr, sp +#else + ldrd r4, r5, [sp], #8 .cfi_restore 4 - .cfi_adjust_cfa_offset -8 - add result, result, data1a, lsr #3 /* Bits -> Bytes. */ - pacbti_epilogue + .cfi_restore 5 + .cfi_def_cfa_offset 0 +#endif /* __ARM_FEATURE_PAC_DEFAULT */ + bx lr L(misaligned8): ldrd data1a, data1b, [src] -- cgit v1.2.3 From 2be383412b3c5470d8d45f49d73f4a7c503022a2 Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Wed, 22 Jun 2022 12:37:06 +0100 Subject: string: Optimize string functions with shrn instruction Optimize __memchr_aarch64_mte __memrchr_aarch64 __strchrnul_aarch64_mte __stpcpy_aarch64 __strcpy_aarch64 __strlen_aarch64_mte using the shrn instruction for computing the nibble mask instead of and + addp, which reduces instruction count. --- string/aarch64/memchr-mte.S | 26 +++++++++----------------- string/aarch64/memrchr.S | 26 +++++++++----------------- string/aarch64/strchrnul-mte.S | 29 +++++++++++------------------ string/aarch64/strcpy.S | 32 ++++++++++++-------------------- string/aarch64/strlen-mte.S | 14 ++++---------- 5 files changed, 45 insertions(+), 82 deletions(-) diff --git a/string/aarch64/memchr-mte.S b/string/aarch64/memchr-mte.S index 8441585..0f434cf 100644 --- a/string/aarch64/memchr-mte.S +++ b/string/aarch64/memchr-mte.S @@ -23,25 +23,21 @@ #define synd x5 #define shift x6 #define tmp x7 -#define wtmp w7 #define vrepchr v0 #define qdata q1 #define vdata v1 #define vhas_chr v2 -#define vrepmask v3 -#define vend v4 -#define dend d4 +#define vend v3 +#define dend d3 /* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits + per byte. We take 4 bits of every comparison byte with shift right and narrow + by 4 instruction. Since the bits in the nibble mask reflect the order in + which things occur in the original string, counting leading zeros identifies + exactly which byte matched. 
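+ (Concretely: cmeq leaves each byte 0x00 or 0xff, so every 16-bit lane is one of 0x0000, 0x00ff, 0xff00 or 0xffff; shrn by 4 extracts bits 4-11 of each lane, leaving exactly one nibble per input byte in the 64-bit mask.)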
*/ ENTRY (__memchr_aarch64_mte) PTR_ARG (0) @@ -50,12 +46,9 @@ ENTRY (__memchr_aarch64_mte) cbz cntin, L(nomatch) ld1 {vdata.16b}, [src] dup vrepchr.16b, chrin - mov wtmp, 0xf00f - dup vrepmask.8h, wtmp cmeq vhas_chr.16b, vdata.16b, vrepchr.16b lsl shift, srcin, 2 - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov synd, dend lsr synd, synd, shift cbz synd, L(start_loop) @@ -94,8 +87,7 @@ L(loop32_2): fmov synd, dend cbz synd, L(loop32) L(end): - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov synd, dend add tmp, srcin, cntin sub cntrem, tmp, src diff --git a/string/aarch64/memrchr.S b/string/aarch64/memrchr.S index ff4f47a..47fbce2 100644 --- a/string/aarch64/memrchr.S +++ b/string/aarch64/memrchr.S @@ -23,7 +23,6 @@ #define synd x5 #define shift x6 #define tmp x7 -#define wtmp w7 #define end x8 #define endm1 x9 @@ -31,19 +30,16 @@ #define qdata q1 #define vdata v1 #define vhas_chr v2 -#define vrepmask v3 -#define vend v4 -#define dend d4 +#define vend v3 +#define dend d3 /* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits + per byte. We take 4 bits of every comparison byte with shift right and narrow + by 4 instruction. Since the bits in the nibble mask reflect the order in + which things occur in the original string, counting leading zeros identifies + exactly which byte matched. */ ENTRY (__memrchr_aarch64) PTR_ARG (0) @@ -53,12 +49,9 @@ ENTRY (__memrchr_aarch64) cbz cntin, L(nomatch) ld1 {vdata.16b}, [src] dup vrepchr.16b, chrin - mov wtmp, 0xf00f - dup vrepmask.8h, wtmp cmeq vhas_chr.16b, vdata.16b, vrepchr.16b neg shift, end, lsl 2 - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov synd, dend lsl synd, synd, shift cbz synd, L(start_loop) @@ -95,8 +88,7 @@ L(loop32_2): fmov synd, dend cbz synd, L(loop32) L(end): - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov synd, dend add tmp, src, 15 diff --git a/string/aarch64/strchrnul-mte.S b/string/aarch64/strchrnul-mte.S index 4a08b52..9be5cbc 100644 --- a/string/aarch64/strchrnul-mte.S +++ b/string/aarch64/strchrnul-mte.S @@ -20,38 +20,32 @@ #define src x2 #define tmp1 x1 #define tmp2 x3 -#define tmp2w w3 #define vrepchr v0 #define vdata v1 #define qdata q1 #define vhas_nul v2 #define vhas_chr v3 -#define vrepmask v4 -#define vend v5 -#define dend d5 +#define vend v4 +#define dend d4 -/* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. 
Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ +/* + Core algorithm: + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits + per byte. We take 4 bits of every comparison byte with shift right and narrow + by 4 instruction. Since the bits in the nibble mask reflect the order in + which things occur in the original string, counting leading zeros identifies + exactly which byte matched. */ ENTRY (__strchrnul_aarch64_mte) PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin ld1 {vdata.16b}, [src] - mov tmp2w, 0xf00f - dup vrepmask.8h, tmp2w cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b lsl tmp2, srcin, 2 - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov tmp1, dend lsr tmp1, tmp1, tmp2 /* Mask padding bits. */ cbz tmp1, L(loop) @@ -70,8 +64,7 @@ L(loop): fmov tmp1, dend cbz tmp1, L(loop) - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov tmp1, dend #ifndef __AARCH64EB__ rbit tmp1, tmp1 diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S index 92b2850..ba4a7d8 100644 --- a/string/aarch64/strcpy.S +++ b/string/aarch64/strcpy.S @@ -22,7 +22,6 @@ #define len x4 #define synd x4 #define tmp x5 -#define wtmp w5 #define shift x5 #define data1 x6 #define dataw1 w6 @@ -32,9 +31,8 @@ #define dataq q0 #define vdata v0 #define vhas_nul v1 -#define vrepmask v2 -#define vend v3 -#define dend d3 +#define vend v2 +#define dend d2 #define dataq2 q1 #ifdef BUILD_STPCPY @@ -45,34 +43,29 @@ # define IFSTPCPY(X,...) #endif -/* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ +/* + Core algorithm: + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits + per byte. We take 4 bits of every comparison byte with shift right and narrow + by 4 instruction. Since the bits in the nibble mask reflect the order in + which things occur in the original string, counting leading zeros identifies + exactly which byte matched. */ ENTRY (STRCPY) PTR_ARG (0) PTR_ARG (1) bic src, srcin, 15 - mov wtmp, 0xf00f ld1 {vdata.16b}, [src] - dup vrepmask.8h, wtmp cmeq vhas_nul.16b, vdata.16b, 0 lsl shift, srcin, 2 - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b + shrn vend.8b, vhas_nul.8h, 4 fmov synd, dend lsr synd, synd, shift cbnz synd, L(tail) ldr dataq, [src, 16]! 
cmeq vhas_nul.16b, vdata.16b, 0 - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b + shrn vend.8b, vhas_nul.8h, 4 fmov synd, dend cbz synd, L(start_loop) @@ -144,8 +137,7 @@ L(loop): fmov synd, dend cbz synd, L(loop) - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov synd, dend #ifndef __AARCH64EB__ rbit synd, synd diff --git a/string/aarch64/strlen-mte.S b/string/aarch64/strlen-mte.S index 45103ff..0d33ebb 100644 --- a/string/aarch64/strlen-mte.S +++ b/string/aarch64/strlen-mte.S @@ -19,15 +19,13 @@ #define src x1 #define synd x2 #define tmp x3 -#define wtmp w3 #define shift x4 #define data q0 #define vdata v0 #define vhas_nul v1 -#define vrepmask v2 -#define vend v3 -#define dend d3 +#define vend v2 +#define dend d2 /* Core algorithm: @@ -41,13 +39,10 @@ ENTRY (__strlen_aarch64_mte) PTR_ARG (0) bic src, srcin, 15 - mov wtmp, 0xf00f ld1 {vdata.16b}, [src] - dup vrepmask.8h, wtmp cmeq vhas_nul.16b, vdata.16b, 0 lsl shift, srcin, 2 - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov synd, dend lsr synd, synd, shift cbz synd, L(loop) @@ -65,8 +60,7 @@ L(loop): fmov synd, dend cbz synd, L(loop) - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ sub result, src, srcin fmov synd, dend #ifndef __AARCH64EB__ -- cgit v1.2.3 From d9a816bb547d3acc29e343c85dc568ff86add1c9 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Mon, 11 Jul 2022 09:31:38 +0100 Subject: pl/math: Remove some stray semi-colons These cause compile warnings and are unnecessary as strong_alias is a macro. 
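To illustrate (an editorial sketch, assuming a glibc-style definition of
strong_alias; the exact definition lives in the pl/math headers and may
differ in detail):

  /* strong_alias already expands to a complete declaration, terminated
     by its own ';'.  */
  #define strong_alias(fn, name) \
    extern __typeof (fn) name __attribute__ ((alias (#fn)));

  /* So the old form */
  #define VPCS_ALIAS strong_alias (__vn_erf, _ZGVnN2v_erf);
  /* expanded to "extern ... __attribute__ ((alias ("__vn_erf")));;" --
     the second ';' is an empty top-level declaration, which pedantic
     compilers warn about.  */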
--- pl/math/vn_erf_2u.c | 2 +- pl/math/vn_erfc_3u7.c | 2 +- pl/math/vn_erfcf_1u.c | 2 +- pl/math/vn_erff_1u5.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pl/math/vn_erf_2u.c b/pl/math/vn_erf_2u.c index e0e10bb..2841eca 100644 --- a/pl/math/vn_erf_2u.c +++ b/pl/math/vn_erf_2u.c @@ -7,6 +7,6 @@ #include "include/mathlib.h" #ifdef __vpcs #define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_erf, _ZGVnN2v_erf); +#define VPCS_ALIAS strong_alias (__vn_erf, _ZGVnN2v_erf) #include "v_erf_2u.c" #endif diff --git a/pl/math/vn_erfc_3u7.c b/pl/math/vn_erfc_3u7.c index 324b541..db06bc3 100644 --- a/pl/math/vn_erfc_3u7.c +++ b/pl/math/vn_erfc_3u7.c @@ -7,6 +7,6 @@ #include "include/mathlib.h" #ifdef __vpcs #define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_erfc, _ZGVnN2v_erfc); +#define VPCS_ALIAS strong_alias (__vn_erfc, _ZGVnN2v_erfc) #include "v_erfc_3u7.c" #endif diff --git a/pl/math/vn_erfcf_1u.c b/pl/math/vn_erfcf_1u.c index 0262c86..2248f79 100644 --- a/pl/math/vn_erfcf_1u.c +++ b/pl/math/vn_erfcf_1u.c @@ -7,6 +7,6 @@ #include "include/mathlib.h" #ifdef __vpcs #define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_erfcf, _ZGVnN4v_erfcf); +#define VPCS_ALIAS strong_alias (__vn_erfcf, _ZGVnN4v_erfcf) #include "v_erfcf_1u.c" #endif diff --git a/pl/math/vn_erff_1u5.c b/pl/math/vn_erff_1u5.c index 89126f9..5b48442 100644 --- a/pl/math/vn_erff_1u5.c +++ b/pl/math/vn_erff_1u5.c @@ -7,6 +7,6 @@ #include "include/mathlib.h" #ifdef __vpcs #define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_erff, _ZGVnN4v_erff); +#define VPCS_ALIAS strong_alias (__vn_erff, _ZGVnN4v_erff) #include "v_erff_1u5.c" #endif -- cgit v1.2.3 From ea3ad6c20ddd99aa1ee3f86e8f4bc5fe76cee35c Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Tue, 12 Jul 2022 09:13:09 +0100 Subject: pl/math: Add scalar asinhf asinhf depends on logf, which has been copied over from the main math directory. The only modification was to change the name logf to optr_aor_log_f32 to resolve any ambiguity with libm. Worst-case error is about 3.4 ULP, at very large input. There are 4 intervals with slightly different error behaviour, as follows: Interval Worst-case accuracy (ulp) |x| < 2^-12 0 |x| < 1 1.3 |x| < sqrt(FLT_MAX) 2.0 |x| < infinity 3.4 --- pl/math/asinhf_3u5.c | 77 ++++++++++++++++++++++++++++++ pl/math/asinhf_data.c | 14 ++++++ pl/math/include/mathlib.h | 1 + pl/math/logf.c | 75 +++++++++++++++++++++++++++++ pl/math/math_config.h | 6 +++ pl/math/test/mathbench_funcs.h | 1 + pl/math/test/runulp.sh | 6 +++ pl/math/test/testcases/directed/asinhf.tst | 18 +++++++ pl/math/test/ulp_funcs.h | 1 + pl/math/tools/asinhf.sollya | 29 +++++++++++ 10 files changed, 228 insertions(+) create mode 100644 pl/math/asinhf_3u5.c create mode 100644 pl/math/asinhf_data.c create mode 100644 pl/math/logf.c create mode 100644 pl/math/test/testcases/directed/asinhf.tst create mode 100644 pl/math/tools/asinhf.sollya diff --git a/pl/math/asinhf_3u5.c b/pl/math/asinhf_3u5.c new file mode 100644 index 0000000..10f9f31 --- /dev/null +++ b/pl/math/asinhf_3u5.c @@ -0,0 +1,77 @@ +/* + * Single-precision asinh(x) function. + * Copyright (c) 2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+#define AbsMask (0x7fffffff)
+#define SqrtFltMax (0x1.749e96p+10f)
+#define Ln2 (0x1.62e4p-1f)
+#define One (0x3f8)
+#define ExpM12 (0x398)
+#define QNaN (0x7fc)
+
+#define C(i) __asinhf_data.coeffs[i]
+
+float
+optr_aor_log_f32 (float);
+
+/* asinhf approximation using a variety of approaches on different intervals:
+
+   |x| < 2^-12: Return x. Function is exactly rounded in this region.
+
+   |x| < 1.0: Use custom order-8 polynomial. The largest observed
+   error in this region is 1.3ulps:
+   asinhf(0x1.f0f74cp-1) got 0x1.b88de4p-1 want 0x1.b88de2p-1.
+
+   |x| <= SqrtFltMax: Calculate the result directly using the
+   definition of asinh(x) = ln(x + sqrt(x*x + 1)). The largest
+   observed error in this region is 1.99ulps:
+   asinhf(0x1.00e358p+0) got 0x1.c4849ep-1 want 0x1.c484a2p-1.
+
+   |x| > SqrtFltMax: We cannot square x without overflow at a low
+   cost. At very large x, asinh(x) ~= ln(2x). At huge x we cannot
+   even double x without overflow, so calculate this as ln(x) +
+   ln(2). The largest observed error in this region is 3.39ulps:
+   asinhf(0x1.749e9ep+10) got 0x1.fffff8p+2 want 0x1.fffffep+2. */
+float
+asinhf (float x)
+{
+  uint32_t ix = asuint (x);
+  uint32_t ia = ix & AbsMask;
+  uint32_t ia12 = ia >> 20;
+  float ax = asfloat (ia);
+  uint32_t sign = ix & ~AbsMask;
+
+  if (ia12 < ExpM12 || ia12 == QNaN)
+    {
+      return x;
+    }
+
+  if (ia12 < One)
+    {
+      float x2 = ax * ax;
+      float x4 = x2 * x2;
+
+      float p_01 = fmaf (ax, C (1), C (0));
+      float p_23 = fmaf (ax, C (3), C (2));
+      float p_45 = fmaf (ax, C (5), C (4));
+      float p_67 = fmaf (ax, C (7), C (6));
+
+      float p_03 = fmaf (x2, p_23, p_01);
+      float p_47 = fmaf (x2, p_67, p_45);
+
+      float p = fmaf (x4, p_47, p_03);
+      float y = fmaf (x2, p, ax);
+      return asfloat (asuint (y) | sign);
+    }
+
+  if (unlikely (ax > SqrtFltMax))
+    {
+      return asfloat (asuint (optr_aor_log_f32 (ax) + Ln2) | sign);
+    }
+
+  return asfloat (asuint (optr_aor_log_f32 (ax + sqrtf (ax * ax + 1))) | sign);
+}
diff --git a/pl/math/asinhf_data.c b/pl/math/asinhf_data.c
new file mode 100644
index 0000000..ce9b632
--- /dev/null
+++ b/pl/math/asinhf_data.c
@@ -0,0 +1,14 @@
+/*
+ * Coefficients for single-precision asinh(x) function.
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Approximate asinhf(x) directly in [2^-12, 1]. See tools/asinhf.sollya for
+   details of how these coeffs were generated. */
+const struct asinhf_data __asinhf_data
+  = {.coeffs
+     = {-0x1.9b16fap-19f, -0x1.552baap-3f, -0x1.4e572ap-11f, 0x1.3a81dcp-4f,
+        0x1.65bbaap-10f, -0x1.057f1p-4f, 0x1.6c1d46p-5f, -0x1.4cafe8p-7f}};
diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h
index 0b7a745..c566d12 100644
--- a/pl/math/include/mathlib.h
+++ b/pl/math/include/mathlib.h
@@ -9,6 +9,7 @@
 #ifndef _MATHLIB_H
 #define _MATHLIB_H
 
+float asinhf (float);
 float atan2f (float, float);
 float erfcf (float);
 float erff (float);
diff --git a/pl/math/logf.c b/pl/math/logf.c
new file mode 100644
index 0000000..2962ee7
--- /dev/null
+++ b/pl/math/logf.c
@@ -0,0 +1,75 @@
+/*
+ * Single-precision log function.
+ *
+ * Copyright (c) 2017-2019, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include <math.h>
+#include <stdint.h>
+#include "math_config.h"
+
+/*
+LOGF_TABLE_BITS = 4
+LOGF_POLY_ORDER = 4
+
+ULP error: 0.818 (nearest rounding.)
+Relative error: 1.957 * 2^-26 (before rounding.)
+*/ + +#define T __logf_data.tab +#define A __logf_data.poly +#define Ln2 __logf_data.ln2 +#define N (1 << LOGF_TABLE_BITS) +#define OFF 0x3f330000 + +float +optr_aor_log_f32 (float x) +{ + /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ + double_t z, r, r2, y, y0, invc, logc; + uint32_t ix, iz, tmp; + int k, i; + + ix = asuint (x); +#if WANT_ROUNDING + /* Fix sign of zero with downward rounding when x==1. */ + if (unlikely (ix == 0x3f800000)) + return 0; +#endif + if (unlikely (ix - 0x00800000 >= 0x7f800000 - 0x00800000)) + { + /* x < 0x1p-126 or inf or nan. */ + if (ix * 2 == 0) + return __math_divzerof (1); + if (ix == 0x7f800000) /* log(inf) == inf. */ + return x; + if ((ix & 0x80000000) || ix * 2 >= 0xff000000) + return __math_invalidf (x); + /* x is subnormal, normalize it. */ + ix = asuint (x * 0x1p23f); + ix -= 23 << 23; + } + + /* x = 2^k z; where z is in range [OFF,2*OFF] and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = ix - OFF; + i = (tmp >> (23 - LOGF_TABLE_BITS)) % N; + k = (int32_t) tmp >> 23; /* arithmetic shift */ + iz = ix - (tmp & 0x1ff << 23); + invc = T[i].invc; + logc = T[i].logc; + z = (double_t) asfloat (iz); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2 */ + r = z * invc - 1; + y0 = logc + (double_t) k * Ln2; + + /* Pipelined polynomial evaluation to approximate log1p(r). */ + r2 = r * r; + y = A[1] * r + A[2]; + y = A[0] * r2 + y; + y = y * r2 + (y0 + r); + return eval_as_float (y); +} diff --git a/pl/math/math_config.h b/pl/math/math_config.h index 231d6bc..e77170e 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -419,4 +419,10 @@ extern const struct atanf_poly_data { float poly[ATANF_POLY_NCOEFFS]; } __atanf_poly_data HIDDEN; + +#define ASINHF_NCOEFFS 8 +extern const struct asinhf_data +{ + float coeffs[ASINHF_NCOEFFS]; +} __asinhf_data HIDDEN; #endif diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 0f8e0ca..45ee627 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -5,6 +5,7 @@ * Copyright (c) 2022, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +F (asinhf, -10.0, 10.0) F (atanf, -10.0, 10.0) {"atan2f", 'f', 0, -10.0, 10.0, {.f = atan2f_wrap}}, F (erfcf, -4.0, 10.0) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 17b13ae..746562d 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -85,6 +85,12 @@ t atan2f 0.0 1.0 40000 t atan2f 1.0 100.0 40000 t atan2f 1e6 1e32 40000 +L=3.0 +t asinhf 0 0x1p-12 5000 +t asinhf 0x1p-12 1.0 50000 +t asinhf 1.0 0x1p11 50000 +t asinhf 0x1p11 0x1p127 20000 + done # vector functions diff --git a/pl/math/test/testcases/directed/asinhf.tst b/pl/math/test/testcases/directed/asinhf.tst new file mode 100644 index 0000000..d832056 --- /dev/null +++ b/pl/math/test/testcases/directed/asinhf.tst @@ -0,0 +1,18 @@ +; asinhf.tst +; +; Copyright (c) 2007-2022, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=asinhf op1=7fc00001 result=7fc00001 errno=0
+func=asinhf op1=ffc00001 result=7fc00001 errno=0
+func=asinhf op1=7f800001 result=7fc00001 errno=0 status=i
+func=asinhf op1=ff800001 result=7fc00001 errno=0 status=i
+func=asinhf op1=7f800000 result=7f800000 errno=0
+func=asinhf op1=ff800000 result=ff800000 errno=0
+func=asinhf op1=00000000 result=00000000 errno=0
+func=asinhf op1=80000000 result=80000000 errno=0
+; No exception is raised with certain versions of glibc. Functions
+; approximated by x near zero may not generate/implement flops and
+; thus may not raise exceptions.
+func=asinhf op1=00000001 result=00000001 errno=0 maybestatus=ux
+func=asinhf op1=80000001 result=80000001 errno=0 maybestatus=ux
diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h
index 40fae51..28ef25c 100644
--- a/pl/math/test/ulp_funcs.h
+++ b/pl/math/test/ulp_funcs.h
@@ -4,6 +4,7 @@
  * Copyright (c) 2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
+F1 (asinh)
 F2 (atan2)
 F1 (erfc)
 F1 (erf)
diff --git a/pl/math/tools/asinhf.sollya b/pl/math/tools/asinhf.sollya
new file mode 100644
index 0000000..cbe7d62
--- /dev/null
+++ b/pl/math/tools/asinhf.sollya
@@ -0,0 +1,29 @@
+// polynomial for approximating asinh(x)
+//
+// Copyright (c) 2022, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+deg = 9;
+
+a = 0x1.0p-12;
+b = 1.0;
+
+f = proc(y) {
+  return asinh(x);
+};
+
+approx = proc(poly, d) {
+  return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10);
+};
+
+poly = x;
+for i from 2 to deg do {
+  p = roundcoefficients(approx(poly,i), [|SG ...|]);
+  poly = poly + x^i*coeff(p,0);
+};
+
+display = hexadecimal;
+print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30));
+print("in [",a,b,"]");
+print("coeffs:");
+for i from 2 to deg do coeff(poly,i);
--
cgit v1.2.3


From d082d55feea607f231358cc49a958da419fac537 Mon Sep 17 00:00:00 2001
From: Joe Ramsay
Date: Tue, 12 Jul 2022 12:22:06 +0100
Subject: pl/math: Add scalar asinh

The new routine uses a similar approach to asinhf, using a polynomial
only in the region where either returning x or calculating the result
directly is not sufficiently precise. Worst-case error is about 2 ULP,
occurring close to |x| = 1. There are 4 intervals with slightly
different error behaviour, as follows:

Interval              Worst-case accuracy (ulp)
|x| < 2^-26           0.0
|x| < 1               1.5
|x| < ~sqrt(DBL_MAX)  2.0
|x| < infinity        1.0

log has been copied from the main math directory so that it can be used
in asinh. The only modifications to the relevant files are to remove
aliases and rename log itself to an internal 'helper' name.
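In outline, the dispatch described above behaves like the following
reference sketch (editorial illustration only: asinh_ref is not part of
the patch, and the committed asinh_2u5.c works on the bit representation
and substitutes the order-17 polynomial for the |x| < 1 case):

  #include <math.h>

  static double
  asinh_ref (double x)
  {
    double ax = fabs (x);
    double ln2 = 0x1.62e42fefa39efp-1;
    /* |x| < 2^-26: asinh(x) ~= x - x^3/6, and the cubic term is far
       below half an ULP of x, so x is the correctly rounded result.  */
    if (ax < 0x1p-26)
      return x;
    /* |x| >= 2^511: sqrt(x*x + 1) would overflow, but here
       asinh(x) ~= ln(2|x|) = ln(|x|) + ln(2).  */
    if (ax >= 0x1p511)
      return copysign (log (ax) + ln2, x);
    /* Otherwise apply the defining identity directly; the real routine
       replaces this with a polynomial for |x| < 1, where the direct
       formula is least accurate.  */
    return copysign (log (ax + sqrt (ax * ax + 1)), x);
  }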
--- pl/math/asinh_2u5.c | 101 ++++++ pl/math/asinh_data.c | 22 ++ pl/math/include/mathlib.h | 1 + pl/math/log.c | 161 ++++++++++ pl/math/log_data.c | 511 ++++++++++++++++++++++++++++++ pl/math/math_config.h | 27 ++ pl/math/test/mathbench_funcs.h | 1 + pl/math/test/runulp.sh | 9 + pl/math/test/testcases/directed/asinh.tst | 18 ++ pl/math/test/ulp_funcs.h | 1 + pl/math/tools/asinh.sollya | 28 ++ 11 files changed, 880 insertions(+) create mode 100644 pl/math/asinh_2u5.c create mode 100644 pl/math/asinh_data.c create mode 100644 pl/math/log.c create mode 100644 pl/math/log_data.c create mode 100644 pl/math/test/testcases/directed/asinh.tst create mode 100644 pl/math/tools/asinh.sollya diff --git a/pl/math/asinh_2u5.c b/pl/math/asinh_2u5.c new file mode 100644 index 0000000..293626d --- /dev/null +++ b/pl/math/asinh_2u5.c @@ -0,0 +1,101 @@ +/* + * Double-precision asinh(x) function + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "math_config.h" + +#define AbsMask 0x7fffffffffffffff +#define ExpM26 0x3e50000000000000 /* asuint64(0x1.0p-26). */ +#define One 0x3ff0000000000000 /* asuint64(1.0). */ +#define Exp511 0x5fe0000000000000 /* asuint64(0x1.0p511). */ +#define Ln2 0x1.62e42fefa39efp-1 +#define C(i) __asinh_data.poly[i] + +double +optr_aor_log_f64 (double); + +static inline double +eval_poly (double z) +{ + /* Evaluate polynomial using Estrin scheme. */ + double p_01 = fma (z, C (1), C (0)); + double p_23 = fma (z, C (3), C (2)); + double p_45 = fma (z, C (5), C (4)); + double p_67 = fma (z, C (7), C (6)); + double p_89 = fma (z, C (9), C (8)); + double p_ab = fma (z, C (11), C (10)); + double p_cd = fma (z, C (13), C (12)); + double p_ef = fma (z, C (15), C (14)); + double p_gh = fma (z, C (17), C (16)); + + double z2 = z * z; + double p_03 = fma (z2, p_23, p_01); + double p_47 = fma (z2, p_67, p_45); + double p_8b = fma (z2, p_ab, p_89); + double p_cf = fma (z2, p_ef, p_cd); + + double z4 = z2 * z2; + double p_07 = fma (z4, p_47, p_03); + double p_8f = fma (z4, p_cf, p_8b); + + double z8 = z4 * z4; + double p_0f = fma (z8, p_8f, p_07); + + double z16 = z8 * z8; + return fma (z16, p_gh, p_0f); +} + +/* Scalar double-precision asinh implementation. This routine uses different + approaches on different intervals: + + |x| < 2^-26: Return x. Function is exact in this region. + + |x| < 1: Use custom order-17 polynomial. This is least accurate close to 1. + The largest observed error in this region is 1.47 ULPs: + asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1 + want 0x1.c1d6bf874019cp-1. + + |x| < 2^511: Upper bound of this region is close to sqrt(DBL_MAX). Calculate + the result directly using the definition asinh(x) = ln(x + sqrt(x*x + 1)). + The largest observed error in this region is 2.03 ULPs: + asinh(0x1.00441cdce7fd5p+0) got 0x1.c3a3b32255bf9p-1 + want 0x1.c3a3b32255bfbp-1. + + |x| >= 2^511: We cannot square x without overflow at a low + cost. At very large x, asinh(x) ~= ln(2x). At huge x we cannot + even double x without overflow, so calculate this as ln(x) + + ln(2). The largest observed error in this region is 0.98 ULPs at many + values, for instance: + asinh(0x1.5255a4cf10319p+975) got 0x1.52652f4cb26cbp+9 + want 0x1.52652f4cb26ccp+9. 
*/ +double +asinh (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t ia = ix & AbsMask; + double ax = asdouble (ia); + uint64_t sign = ix & ~AbsMask; + + if (ia < ExpM26) + { + return x; + } + + if (ia < One) + { + double x2 = x * x; + double p = eval_poly (x2); + double y = fma (p, x2 * ax, ax); + return asdouble (asuint64 (y) | sign); + } + + if (unlikely (ia >= Exp511)) + { + return asdouble (asuint64 (optr_aor_log_f64 (ax) + Ln2) | sign); + } + + return asdouble (asuint64 (optr_aor_log_f64 (ax + sqrt (ax * ax + 1))) + | sign); +} diff --git a/pl/math/asinh_data.c b/pl/math/asinh_data.c new file mode 100644 index 0000000..319c572 --- /dev/null +++ b/pl/math/asinh_data.c @@ -0,0 +1,22 @@ +/* + * Double-precision polynomial coefficients for scalar asinh(x) + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* asinh(x) is odd, and the first term of the Taylor expansion is x, so we can + approximate the function by x + x^3 * P(x^2), where P(z) has the form: + C0 + C1 * z + C2 * z^2 + C3 * z^3 + ... + Note P is evaluated on even powers of x only. See tools/asinh.sollya for the + algorithm used to generate these coefficients. */ +const struct asinh_data __asinh_data + = {.poly + = {-0x1.55555555554a7p-3, 0x1.3333333326c7p-4, -0x1.6db6db68332e6p-5, + 0x1.f1c71b26fb40dp-6, -0x1.6e8b8b654a621p-6, 0x1.1c4daa9e67871p-6, + -0x1.c9871d10885afp-7, 0x1.7a16e8d9d2ecfp-7, -0x1.3ddca533e9f54p-7, + 0x1.0becef748dafcp-7, -0x1.b90c7099dd397p-8, 0x1.541f2bb1ffe51p-8, + -0x1.d217026a669ecp-9, 0x1.0b5c7977aaf7p-9, -0x1.e0f37daef9127p-11, + 0x1.388b5fe542a6p-12, -0x1.021a48685e287p-14, 0x1.93d4ba83d34dap-18}}; diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index c566d12..bd2d8c2 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -15,6 +15,7 @@ float erfcf (float); float erff (float); float log10f (float); +double asinh (double); double atan2 (double, double); double log10 (double); diff --git a/pl/math/log.c b/pl/math/log.c new file mode 100644 index 0000000..418c715 --- /dev/null +++ b/pl/math/log.c @@ -0,0 +1,161 @@ +/* + * Double-precision log(x) function. + * + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include +#include +#include +#include "math_config.h" + +#define T __log_data.tab +#define T2 __log_data.tab2 +#define B __log_data.poly1 +#define A __log_data.poly +#define Ln2hi __log_data.ln2hi +#define Ln2lo __log_data.ln2lo +#define N (1 << LOG_TABLE_BITS) +#define OFF 0x3fe6000000000000 + +/* Top 16 bits of a double. */ +static inline uint32_t +top16 (double x) +{ + return asuint64 (x) >> 48; +} + +double +optr_aor_log_f64 (double x) +{ + /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ + double_t w, z, r, r2, r3, y, invc, logc, kd, hi, lo; + uint64_t ix, iz, tmp; + uint32_t top; + int k, i; + + ix = asuint64 (x); + top = top16 (x); + +#if LOG_POLY1_ORDER == 10 || LOG_POLY1_ORDER == 11 +#define LO asuint64 (1.0 - 0x1p-5) +#define HI asuint64 (1.0 + 0x1.1p-5) +#elif LOG_POLY1_ORDER == 12 +#define LO asuint64 (1.0 - 0x1p-4) +#define HI asuint64 (1.0 + 0x1.09p-4) +#endif + if (unlikely (ix - LO < HI - LO)) + { + /* Handle close to 1.0 inputs separately. */ + /* Fix sign of zero with downward rounding when x==1. 
*/ + if (WANT_ROUNDING && unlikely (ix == asuint64 (1.0))) + return 0; + r = x - 1.0; + r2 = r * r; + r3 = r * r2; +#if LOG_POLY1_ORDER == 10 + /* Worst-case error is around 0.516 ULP. */ + y = r3 + * (B[1] + r * B[2] + r2 * B[3] + + r3 * (B[4] + r * B[5] + r2 * B[6] + r3 * (B[7] + r * B[8]))); + w = B[0] * r2; /* B[0] == -0.5. */ + hi = r + w; + y += r - hi + w; + y += hi; +#elif LOG_POLY1_ORDER == 11 + /* Worst-case error is around 0.516 ULP. */ + y = r3 + * (B[1] + r * B[2] + + r2 + * (B[3] + r * B[4] + r2 * B[5] + + r3 * (B[6] + r * B[7] + r2 * B[8] + r3 * B[9]))); + w = B[0] * r2; /* B[0] == -0.5. */ + hi = r + w; + y += r - hi + w; + y += hi; +#elif LOG_POLY1_ORDER == 12 + y = r3 + * (B[1] + r * B[2] + r2 * B[3] + + r3 + * (B[4] + r * B[5] + r2 * B[6] + + r3 * (B[7] + r * B[8] + r2 * B[9] + r3 * B[10]))); +#if N <= 64 + /* Worst-case error is around 0.532 ULP. */ + w = B[0] * r2; /* B[0] == -0.5. */ + hi = r + w; + y += r - hi + w; + y += hi; +#else + /* Worst-case error is around 0.507 ULP. */ + w = r * 0x1p27; + double_t rhi = r + w - w; + double_t rlo = r - rhi; + w = rhi * rhi * B[0]; /* B[0] == -0.5. */ + hi = r + w; + lo = r - hi + w; + lo += B[0] * rlo * (rhi + r); + y += lo; + y += hi; +#endif +#endif + return eval_as_double (y); + } + if (unlikely (top - 0x0010 >= 0x7ff0 - 0x0010)) + { + /* x < 0x1p-1022 or inf or nan. */ + if (ix * 2 == 0) + return __math_divzero (1); + if (ix == asuint64 (INFINITY)) /* log(inf) == inf. */ + return x; + if ((top & 0x8000) || (top & 0x7ff0) == 0x7ff0) + return __math_invalid (x); + /* x is subnormal, normalize it. */ + ix = asuint64 (x * 0x1p52); + ix -= 52ULL << 52; + } + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = ix - OFF; + i = (tmp >> (52 - LOG_TABLE_BITS)) % N; + k = (int64_t) tmp >> 52; /* arithmetic shift */ + iz = ix - (tmp & 0xfffULL << 52); + invc = T[i].invc; + logc = T[i].logc; + z = asdouble (iz); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ + /* r ~= z/c - 1, |r| < 1/(2*N). */ +#if HAVE_FAST_FMA + /* rounding error: 0x1p-55/N. */ + r = fma (z, invc, -1.0); +#else + /* rounding error: 0x1p-55/N + 0x1p-66. */ + r = (z - T2[i].chi - T2[i].clo) * invc; +#endif + kd = (double_t) k; + + /* hi + lo = r + log(c) + k*Ln2. */ + w = kd * Ln2hi + logc; + hi = w + r; + lo = w - hi + r + kd * Ln2lo; + + /* log(x) = lo + (log1p(r) - r) + hi. */ + r2 = r * r; /* rounding error: 0x1p-54/N^2. */ + /* Worst case error if |y| > 0x1p-5: + 0.5 + 4.13/N + abs-poly-error*2^57 ULP (+ 0.002 ULP without fma) + Worst case error if |y| > 0x1p-4: + 0.5 + 2.06/N + abs-poly-error*2^56 ULP (+ 0.001 ULP without fma). */ +#if LOG_POLY_ORDER == 6 + y = lo + r2 * A[0] + r * r2 * (A[1] + r * A[2] + r2 * (A[3] + r * A[4])) + hi; +#elif LOG_POLY_ORDER == 7 + y = lo + + r2 + * (A[0] + r * A[1] + r2 * (A[2] + r * A[3]) + + r2 * r2 * (A[4] + r * A[5])) + + hi; +#endif + return eval_as_double (y); +} diff --git a/pl/math/log_data.c b/pl/math/log_data.c new file mode 100644 index 0000000..ef10d33 --- /dev/null +++ b/pl/math/log_data.c @@ -0,0 +1,511 @@ +/* + * Data for log. + * + * Copyright (c) 2018-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define N (1 << LOG_TABLE_BITS) + +const struct log_data __log_data = { +.ln2hi = 0x1.62e42fefa3800p-1, +.ln2lo = 0x1.ef35793c76730p-45, +.poly1 = { +#if LOG_POLY1_ORDER == 10 +// relative error: 0x1.32eccc6p-62 +// in -0x1p-5 0x1.1p-5 (|log(1+x)| > 0x1p-5 outside this interval) +-0x1p-1, +0x1.55555555554e5p-2, +-0x1.0000000000af2p-2, +0x1.9999999bbe436p-3, +-0x1.55555537f9cdep-3, +0x1.24922fc8127cfp-3, +-0x1.0000b7d6bb612p-3, +0x1.c806ee1ddbcafp-4, +-0x1.972335a9c2d6ep-4, +#elif LOG_POLY1_ORDER == 11 +// relative error: 0x1.52c8b708p-68 +// in -0x1p-5 0x1.1p-5 (|log(1+x)| > 0x1p-5 outside this interval) +-0x1p-1, +0x1.5555555555555p-2, +-0x1.ffffffffffea9p-3, +0x1.999999999c4d4p-3, +-0x1.55555557f5541p-3, +0x1.249248fbe33e4p-3, +-0x1.ffffc9a3c825bp-4, +0x1.c71e1f204435dp-4, +-0x1.9a7f26377d06ep-4, +0x1.71c30cf8f7364p-4, +#elif LOG_POLY1_ORDER == 12 +// relative error: 0x1.c04d76cp-63 +// in -0x1p-4 0x1.09p-4 (|log(1+x)| > 0x1p-4 outside the interval) +-0x1p-1, +0x1.5555555555577p-2, +-0x1.ffffffffffdcbp-3, +0x1.999999995dd0cp-3, +-0x1.55555556745a7p-3, +0x1.24924a344de3p-3, +-0x1.fffffa4423d65p-4, +0x1.c7184282ad6cap-4, +-0x1.999eb43b068ffp-4, +0x1.78182f7afd085p-4, +-0x1.5521375d145cdp-4, +#endif +}, +.poly = { +#if N == 64 && LOG_POLY_ORDER == 7 +// relative error: 0x1.906eb8ap-58 +// abs error: 0x1.d2cad5a8p-67 +// in -0x1.fp-8 0x1.fp-8 +-0x1.0000000000027p-1, +0x1.555555555556ap-2, +-0x1.fffffff0440bap-3, +0x1.99999991906c3p-3, +-0x1.555c8d7e8201ep-3, +0x1.24978c59151fap-3, +#elif N == 128 && LOG_POLY_ORDER == 6 +// relative error: 0x1.926199e8p-56 +// abs error: 0x1.882ff33p-65 +// in -0x1.fp-9 0x1.fp-9 +-0x1.0000000000001p-1, +0x1.555555551305bp-2, +-0x1.fffffffeb459p-3, +0x1.999b324f10111p-3, +-0x1.55575e506c89fp-3, +#elif N == 128 && LOG_POLY_ORDER == 7 +// relative error: 0x1.649fc4bp-64 +// abs error: 0x1.c3b5769p-74 +// in -0x1.fp-9 0x1.fp-9 +-0x1.0000000000001p-1, +0x1.5555555555556p-2, +-0x1.fffffffea1a8p-3, +0x1.99999998e9139p-3, +-0x1.555776801b968p-3, +0x1.2493c29331a5cp-3, +#endif +}, +/* Algorithm: + + x = 2^k z + log(x) = k ln2 + log(c) + log(z/c) + log(z/c) = poly(z/c - 1) + +where z is in [1.6p-1; 1.6p0] which is split into N subintervals and z falls +into the ith one, then table entries are computed as + + tab[i].invc = 1/c + tab[i].logc = (double)log(c) + tab2[i].chi = (double)c + tab2[i].clo = (double)(c - (double)c) + +where c is near the center of the subinterval and is chosen by trying +-2^29 +floating point invc candidates around 1/center and selecting one for which + + 1) the rounding error in 0x1.8p9 + logc is 0, + 2) the rounding error in z - chi - clo is < 0x1p-66 and + 3) the rounding error in (double)log(c) is minimized (< 0x1p-66). + +Note: 1) ensures that k*ln2hi + logc can be computed without rounding error, +2) ensures that z/c - 1 can be computed as (z - chi - clo)*invc with close to +a single rounding error when there is no fast fma for z*invc - 1, 3) ensures +that logc + poly(z/c - 1) has small error, however near x == 1 when +|log(x)| < 0x1p-4, this is not enough so that is special cased. 
*/ +.tab = { +#if N == 64 +{0x1.7242886495cd8p+0, -0x1.79e267bdfe000p-2}, +{0x1.6e1f769340dc9p+0, -0x1.6e60ee0ecb000p-2}, +{0x1.6a13ccc8f195cp+0, -0x1.63002fdbf6000p-2}, +{0x1.661ec72e86f3ap+0, -0x1.57bf76c597000p-2}, +{0x1.623fa6c447b16p+0, -0x1.4c9e07f0d2000p-2}, +{0x1.5e75bbca31702p+0, -0x1.419b42f027000p-2}, +{0x1.5ac05655adb10p+0, -0x1.36b67660e6000p-2}, +{0x1.571ed3e940191p+0, -0x1.2bef0839e4800p-2}, +{0x1.539094ac0fbbfp+0, -0x1.21445727cb000p-2}, +{0x1.5015007e7fc42p+0, -0x1.16b5ca3c3d000p-2}, +{0x1.4cab877c31cf9p+0, -0x1.0c42d3805f800p-2}, +{0x1.49539e76a88d3p+0, -0x1.01eae61b60800p-2}, +{0x1.460cbc12211dap+0, -0x1.ef5adb9fb0000p-3}, +{0x1.42d6624debe3ap+0, -0x1.db13daab99000p-3}, +{0x1.3fb0144f0d462p+0, -0x1.c6ffbe896e000p-3}, +{0x1.3c995a1f9a9b4p+0, -0x1.b31d84722d000p-3}, +{0x1.3991c23952500p+0, -0x1.9f6c3cf6eb000p-3}, +{0x1.3698df35eaa14p+0, -0x1.8beafe7f13000p-3}, +{0x1.33ae463091760p+0, -0x1.7898db878d000p-3}, +{0x1.30d190aae3d72p+0, -0x1.6574efe4ec000p-3}, +{0x1.2e025c9203c89p+0, -0x1.527e620845000p-3}, +{0x1.2b404a7244988p+0, -0x1.3fb457d798000p-3}, +{0x1.288b01dc19544p+0, -0x1.2d1615a077000p-3}, +{0x1.25e2268085f69p+0, -0x1.1aa2b431e5000p-3}, +{0x1.23456812abb74p+0, -0x1.08598f1d2b000p-3}, +{0x1.20b4703174157p+0, -0x1.ec738fee40000p-4}, +{0x1.1e2ef308b4e9bp+0, -0x1.c885768862000p-4}, +{0x1.1bb4a36b70a3fp+0, -0x1.a4e75b6a46000p-4}, +{0x1.194538e960658p+0, -0x1.8197efba9a000p-4}, +{0x1.16e0692a10ac8p+0, -0x1.5e95ad734e000p-4}, +{0x1.1485f1ba1568bp+0, -0x1.3bdf67117c000p-4}, +{0x1.12358e123ed6fp+0, -0x1.1973b744f0000p-4}, +{0x1.0fef01de37c8dp+0, -0x1.eea33446bc000p-5}, +{0x1.0db20b82be414p+0, -0x1.aaef4ab304000p-5}, +{0x1.0b7e6f67f69b3p+0, -0x1.67c962fd2c000p-5}, +{0x1.0953f342fc108p+0, -0x1.252f29acf8000p-5}, +{0x1.0732604ec956bp+0, -0x1.c63d19e9c0000p-6}, +{0x1.051980117f9b0p+0, -0x1.432ab6a388000p-6}, +{0x1.03091aa6810f1p+0, -0x1.8244357f50000p-7}, +{0x1.01010152cf066p+0, -0x1.0080a711c0000p-8}, +{0x1.fc07ef6b6e30bp-1, 0x1.fe03018e80000p-8}, +{0x1.f4465aa1024afp-1, 0x1.7b91986450000p-6}, +{0x1.ecc07a8fd3f5ep-1, 0x1.39e88608c8000p-5}, +{0x1.e573ad856b537p-1, 0x1.b42dc6e624000p-5}, +{0x1.de5d6dc7b8057p-1, 0x1.165372ec20000p-4}, +{0x1.d77b6498bddf7p-1, 0x1.51b07a0170000p-4}, +{0x1.d0cb580315c0fp-1, 0x1.8c3465c7ea000p-4}, +{0x1.ca4b30d1cf449p-1, 0x1.c5e544a290000p-4}, +{0x1.c3f8ef4810d8ep-1, 0x1.fec91aa0a6000p-4}, +{0x1.bdd2b8b311f44p-1, 0x1.1b72acdc5c000p-3}, +{0x1.b7d6c2eeac054p-1, 0x1.371fc65a98000p-3}, +{0x1.b20363474c8f5p-1, 0x1.526e61c1aa000p-3}, +{0x1.ac570165eeab1p-1, 0x1.6d60ffc240000p-3}, +{0x1.a6d019f331df4p-1, 0x1.87fa08a013000p-3}, +{0x1.a16d3ebc9e3c3p-1, 0x1.a23bc630c3000p-3}, +{0x1.9c2d14567ef45p-1, 0x1.bc286a3512000p-3}, +{0x1.970e4efae9169p-1, 0x1.d5c2195697000p-3}, +{0x1.920fb3bd0b802p-1, 0x1.ef0ae132d3000p-3}, +{0x1.8d3018b58699ap-1, 0x1.040259974e000p-2}, +{0x1.886e5ff170ee6p-1, 0x1.1058bd40e2000p-2}, +{0x1.83c977ad35d27p-1, 0x1.1c898c1137800p-2}, +{0x1.7f405ed16c520p-1, 0x1.2895a3e65b000p-2}, +{0x1.7ad220d0335c4p-1, 0x1.347dd8f6bd000p-2}, +{0x1.767dce53474fdp-1, 0x1.4043083cb3800p-2}, +#elif N == 128 +{0x1.734f0c3e0de9fp+0, -0x1.7cc7f79e69000p-2}, +{0x1.713786a2ce91fp+0, -0x1.76feec20d0000p-2}, +{0x1.6f26008fab5a0p+0, -0x1.713e31351e000p-2}, +{0x1.6d1a61f138c7dp+0, -0x1.6b85b38287800p-2}, +{0x1.6b1490bc5b4d1p+0, -0x1.65d5590807800p-2}, +{0x1.69147332f0cbap+0, -0x1.602d076180000p-2}, +{0x1.6719f18224223p+0, -0x1.5a8ca86909000p-2}, +{0x1.6524f99a51ed9p+0, -0x1.54f4356035000p-2}, +{0x1.63356aa8f24c4p+0, -0x1.4f637c36b4000p-2}, +{0x1.614b36b9ddc14p+0, 
-0x1.49da7fda85000p-2}, +{0x1.5f66452c65c4cp+0, -0x1.445923989a800p-2}, +{0x1.5d867b5912c4fp+0, -0x1.3edf439b0b800p-2}, +{0x1.5babccb5b90dep+0, -0x1.396ce448f7000p-2}, +{0x1.59d61f2d91a78p+0, -0x1.3401e17bda000p-2}, +{0x1.5805612465687p+0, -0x1.2e9e2ef468000p-2}, +{0x1.56397cee76bd3p+0, -0x1.2941b3830e000p-2}, +{0x1.54725e2a77f93p+0, -0x1.23ec58cda8800p-2}, +{0x1.52aff42064583p+0, -0x1.1e9e129279000p-2}, +{0x1.50f22dbb2bddfp+0, -0x1.1956d2b48f800p-2}, +{0x1.4f38f4734ded7p+0, -0x1.141679ab9f800p-2}, +{0x1.4d843cfde2840p+0, -0x1.0edd094ef9800p-2}, +{0x1.4bd3ec078a3c8p+0, -0x1.09aa518db1000p-2}, +{0x1.4a27fc3e0258ap+0, -0x1.047e65263b800p-2}, +{0x1.4880524d48434p+0, -0x1.feb224586f000p-3}, +{0x1.46dce1b192d0bp+0, -0x1.f474a7517b000p-3}, +{0x1.453d9d3391854p+0, -0x1.ea4443d103000p-3}, +{0x1.43a2744b4845ap+0, -0x1.e020d44e9b000p-3}, +{0x1.420b54115f8fbp+0, -0x1.d60a22977f000p-3}, +{0x1.40782da3ef4b1p+0, -0x1.cc00104959000p-3}, +{0x1.3ee8f5d57fe8fp+0, -0x1.c202956891000p-3}, +{0x1.3d5d9a00b4ce9p+0, -0x1.b81178d811000p-3}, +{0x1.3bd60c010c12bp+0, -0x1.ae2c9ccd3d000p-3}, +{0x1.3a5242b75dab8p+0, -0x1.a45402e129000p-3}, +{0x1.38d22cd9fd002p+0, -0x1.9a877681df000p-3}, +{0x1.3755bc5847a1cp+0, -0x1.90c6d69483000p-3}, +{0x1.35dce49ad36e2p+0, -0x1.87120a645c000p-3}, +{0x1.34679984dd440p+0, -0x1.7d68fb4143000p-3}, +{0x1.32f5cceffcb24p+0, -0x1.73cb83c627000p-3}, +{0x1.3187775a10d49p+0, -0x1.6a39a9b376000p-3}, +{0x1.301c8373e3990p+0, -0x1.60b3154b7a000p-3}, +{0x1.2eb4ebb95f841p+0, -0x1.5737d76243000p-3}, +{0x1.2d50a0219a9d1p+0, -0x1.4dc7b8fc23000p-3}, +{0x1.2bef9a8b7fd2ap+0, -0x1.4462c51d20000p-3}, +{0x1.2a91c7a0c1babp+0, -0x1.3b08abc830000p-3}, +{0x1.293726014b530p+0, -0x1.31b996b490000p-3}, +{0x1.27dfa5757a1f5p+0, -0x1.2875490a44000p-3}, +{0x1.268b39b1d3bbfp+0, -0x1.1f3b9f879a000p-3}, +{0x1.2539d838ff5bdp+0, -0x1.160c8252ca000p-3}, +{0x1.23eb7aac9083bp+0, -0x1.0ce7f57f72000p-3}, +{0x1.22a012ba940b6p+0, -0x1.03cdc49fea000p-3}, +{0x1.2157996cc4132p+0, -0x1.f57bdbc4b8000p-4}, +{0x1.201201dd2fc9bp+0, -0x1.e370896404000p-4}, +{0x1.1ecf4494d480bp+0, -0x1.d17983ef94000p-4}, +{0x1.1d8f5528f6569p+0, -0x1.bf9674ed8a000p-4}, +{0x1.1c52311577e7cp+0, -0x1.adc79202f6000p-4}, +{0x1.1b17c74cb26e9p+0, -0x1.9c0c3e7288000p-4}, +{0x1.19e010c2c1ab6p+0, -0x1.8a646b372c000p-4}, +{0x1.18ab07bb670bdp+0, -0x1.78d01b3ac0000p-4}, +{0x1.1778a25efbcb6p+0, -0x1.674f145380000p-4}, +{0x1.1648d354c31dap+0, -0x1.55e0e6d878000p-4}, +{0x1.151b990275fddp+0, -0x1.4485cdea1e000p-4}, +{0x1.13f0ea432d24cp+0, -0x1.333d94d6aa000p-4}, +{0x1.12c8b7210f9dap+0, -0x1.22079f8c56000p-4}, +{0x1.11a3028ecb531p+0, -0x1.10e4698622000p-4}, +{0x1.107fbda8434afp+0, -0x1.ffa6c6ad20000p-5}, +{0x1.0f5ee0f4e6bb3p+0, -0x1.dda8d4a774000p-5}, +{0x1.0e4065d2a9fcep+0, -0x1.bbcece4850000p-5}, +{0x1.0d244632ca521p+0, -0x1.9a1894012c000p-5}, +{0x1.0c0a77ce2981ap+0, -0x1.788583302c000p-5}, +{0x1.0af2f83c636d1p+0, -0x1.5715e67d68000p-5}, +{0x1.09ddb98a01339p+0, -0x1.35c8a49658000p-5}, +{0x1.08cabaf52e7dfp+0, -0x1.149e364154000p-5}, +{0x1.07b9f2f4e28fbp+0, -0x1.e72c082eb8000p-6}, +{0x1.06ab58c358f19p+0, -0x1.a55f152528000p-6}, +{0x1.059eea5ecf92cp+0, -0x1.63d62cf818000p-6}, +{0x1.04949cdd12c90p+0, -0x1.228fb8caa0000p-6}, +{0x1.038c6c6f0ada9p+0, -0x1.c317b20f90000p-7}, +{0x1.02865137932a9p+0, -0x1.419355daa0000p-7}, +{0x1.0182427ea7348p+0, -0x1.81203c2ec0000p-8}, +{0x1.008040614b195p+0, -0x1.0040979240000p-9}, +{0x1.fe01ff726fa1ap-1, 0x1.feff384900000p-9}, +{0x1.fa11cc261ea74p-1, 0x1.7dc41353d0000p-7}, +{0x1.f6310b081992ep-1, 0x1.3cea3c4c28000p-6}, +{0x1.f25f63ceeadcdp-1, 
0x1.b9fc114890000p-6}, +{0x1.ee9c8039113e7p-1, 0x1.1b0d8ce110000p-5}, +{0x1.eae8078cbb1abp-1, 0x1.58a5bd001c000p-5}, +{0x1.e741aa29d0c9bp-1, 0x1.95c8340d88000p-5}, +{0x1.e3a91830a99b5p-1, 0x1.d276aef578000p-5}, +{0x1.e01e009609a56p-1, 0x1.07598e598c000p-4}, +{0x1.dca01e577bb98p-1, 0x1.253f5e30d2000p-4}, +{0x1.d92f20b7c9103p-1, 0x1.42edd8b380000p-4}, +{0x1.d5cac66fb5ccep-1, 0x1.606598757c000p-4}, +{0x1.d272caa5ede9dp-1, 0x1.7da76356a0000p-4}, +{0x1.cf26e3e6b2ccdp-1, 0x1.9ab434e1c6000p-4}, +{0x1.cbe6da2a77902p-1, 0x1.b78c7bb0d6000p-4}, +{0x1.c8b266d37086dp-1, 0x1.d431332e72000p-4}, +{0x1.c5894bd5d5804p-1, 0x1.f0a3171de6000p-4}, +{0x1.c26b533bb9f8cp-1, 0x1.067152b914000p-3}, +{0x1.bf583eeece73fp-1, 0x1.147858292b000p-3}, +{0x1.bc4fd75db96c1p-1, 0x1.2266ecdca3000p-3}, +{0x1.b951e0c864a28p-1, 0x1.303d7a6c55000p-3}, +{0x1.b65e2c5ef3e2cp-1, 0x1.3dfc33c331000p-3}, +{0x1.b374867c9888bp-1, 0x1.4ba366b7a8000p-3}, +{0x1.b094b211d304ap-1, 0x1.5933928d1f000p-3}, +{0x1.adbe885f2ef7ep-1, 0x1.66acd2418f000p-3}, +{0x1.aaf1d31603da2p-1, 0x1.740f8ec669000p-3}, +{0x1.a82e63fd358a7p-1, 0x1.815c0f51af000p-3}, +{0x1.a5740ef09738bp-1, 0x1.8e92954f68000p-3}, +{0x1.a2c2a90ab4b27p-1, 0x1.9bb3602f84000p-3}, +{0x1.a01a01393f2d1p-1, 0x1.a8bed1c2c0000p-3}, +{0x1.9d79f24db3c1bp-1, 0x1.b5b515c01d000p-3}, +{0x1.9ae2505c7b190p-1, 0x1.c2967ccbcc000p-3}, +{0x1.9852ef297ce2fp-1, 0x1.cf635d5486000p-3}, +{0x1.95cbaeea44b75p-1, 0x1.dc1bd3446c000p-3}, +{0x1.934c69de74838p-1, 0x1.e8c01b8cfe000p-3}, +{0x1.90d4f2f6752e6p-1, 0x1.f5509c0179000p-3}, +{0x1.8e6528effd79dp-1, 0x1.00e6c121fb800p-2}, +{0x1.8bfce9fcc007cp-1, 0x1.071b80e93d000p-2}, +{0x1.899c0dabec30ep-1, 0x1.0d46b9e867000p-2}, +{0x1.87427aa2317fbp-1, 0x1.13687334bd000p-2}, +{0x1.84f00acb39a08p-1, 0x1.1980d67234800p-2}, +{0x1.82a49e8653e55p-1, 0x1.1f8ffe0cc8000p-2}, +{0x1.8060195f40260p-1, 0x1.2595fd7636800p-2}, +{0x1.7e22563e0a329p-1, 0x1.2b9300914a800p-2}, +{0x1.7beb377dcb5adp-1, 0x1.3187210436000p-2}, +{0x1.79baa679725c2p-1, 0x1.377266dec1800p-2}, +{0x1.77907f2170657p-1, 0x1.3d54ffbaf3000p-2}, +{0x1.756cadbd6130cp-1, 0x1.432eee32fe000p-2}, +#endif +}, +#if !HAVE_FAST_FMA +.tab2 = { +#if N == 64 +{0x1.61ffff94c4fecp-1, -0x1.9fe4fc998f325p-56}, +{0x1.66000020377ddp-1, 0x1.e804c7a9519f2p-55}, +{0x1.6a00004c41678p-1, 0x1.902c675d9ecfep-55}, +{0x1.6dffff7384f87p-1, -0x1.2fd6b95e55043p-56}, +{0x1.720000b37216ep-1, 0x1.802bc8d437043p-55}, +{0x1.75ffffbeb3c9dp-1, 0x1.6047ad0a0d4e4p-57}, +{0x1.7a0000628daep-1, -0x1.e00434b49313dp-56}, +{0x1.7dffffd7abd1ap-1, -0x1.6015f8a083576p-56}, +{0x1.81ffffdf40c54p-1, 0x1.7f54bf76a42c9p-57}, +{0x1.860000f334e11p-1, 0x1.60054cb5344d7p-56}, +{0x1.8a0001238aca7p-1, 0x1.c03c9bd132f55p-57}, +{0x1.8dffffb81d212p-1, -0x1.001e519f2764fp-55}, +{0x1.92000086adc7cp-1, 0x1.1fe40f88f49c6p-55}, +{0x1.960000135d8eap-1, -0x1.f832268dc3095p-55}, +{0x1.99ffff9435acp-1, 0x1.7031d8b835edcp-56}, +{0x1.9e00003478565p-1, -0x1.0030b221ce3eep-58}, +{0x1.a20000b592948p-1, 0x1.8fd2f1dbd4639p-55}, +{0x1.a600000ad0bcfp-1, 0x1.901d6a974e6bep-55}, +{0x1.a9ffff55953a5p-1, 0x1.a07556192db98p-57}, +{0x1.adffff29ce03dp-1, -0x1.fff0717ec71c2p-56}, +{0x1.b1ffff34f3ac8p-1, 0x1.8005573de89d1p-57}, +{0x1.b60000894c55bp-1, -0x1.ff2fb51b044c7p-57}, +{0x1.b9fffef45ec7dp-1, -0x1.9ff7c4e8730fp-56}, +{0x1.be0000cda7b2ap-1, 0x1.57d058dbf3c1dp-55}, +{0x1.c1ffff2c57917p-1, 0x1.7e66d7e48dbc9p-58}, +{0x1.c60000ea5b82ap-1, -0x1.47f5e132ed4bep-55}, +{0x1.ca0001121ae98p-1, -0x1.40958c8d5e00ap-58}, +{0x1.ce0000f9241cbp-1, -0x1.7da063caa81c8p-59}, +{0x1.d1fffe8be95a4p-1, -0x1.82e3a411afcd9p-59}, 
+{0x1.d5ffff035932bp-1, -0x1.00f901b3fe87dp-58}, +{0x1.d9fffe8b54ba7p-1, 0x1.ffef55d6e3a4p-55}, +{0x1.de0000ad95d19p-1, 0x1.5feb2efd4c7c7p-55}, +{0x1.e1fffe925ce47p-1, 0x1.c8085484eaf08p-55}, +{0x1.e5fffe3ddf853p-1, -0x1.fd5ed02c5cadp-60}, +{0x1.e9fffed0a0e5fp-1, -0x1.a80aaef411586p-55}, +{0x1.ee00008f82eep-1, -0x1.b000aeaf97276p-55}, +{0x1.f20000a22d2f4p-1, -0x1.8f8906e13eba3p-56}, +{0x1.f5fffee35b57dp-1, 0x1.1fdd33b2d3714p-57}, +{0x1.fa00014eec3a6p-1, -0x1.3ee0b7a18c1a5p-58}, +{0x1.fdffff5daa89fp-1, -0x1.c1e24c8e3b503p-58}, +{0x1.0200005b93349p+0, -0x1.50197fe6bedcap-54}, +{0x1.05ffff9d597acp+0, 0x1.20160d062d0dcp-55}, +{0x1.0a00005687a63p+0, -0x1.27f3f9307696ep-54}, +{0x1.0dffff779164ep+0, 0x1.b7eb40bb9c4f4p-54}, +{0x1.12000044a0aa8p+0, 0x1.efbc914d512c4p-55}, +{0x1.16000069685bcp+0, -0x1.c0bea3eb2d82cp-57}, +{0x1.1a000093f0d78p+0, 0x1.1fecbf1e8c52p-54}, +{0x1.1dffffb2b1457p+0, -0x1.3fc91365637d6p-55}, +{0x1.2200008824a1p+0, -0x1.dff7e9feb578ap-54}, +{0x1.25ffffeef953p+0, -0x1.b00a61ec912f7p-55}, +{0x1.2a0000a1e7783p+0, 0x1.60048318b0483p-56}, +{0x1.2e0000853d4c7p+0, -0x1.77fbedf2c8cf3p-54}, +{0x1.320000324c55bp+0, 0x1.f81983997354fp-54}, +{0x1.360000594f796p+0, -0x1.cfe4beff900a9p-54}, +{0x1.3a0000a4c1c0fp+0, 0x1.07dbb2e268d0ep-54}, +{0x1.3e0000751c61bp+0, 0x1.80583ed1c566ep-56}, +{0x1.42000069e8a9fp+0, 0x1.f01f1edf82045p-54}, +{0x1.460000b5a1e34p+0, -0x1.dfdf0cf45c14ap-55}, +{0x1.4a0000187e513p+0, 0x1.401306b83a98dp-55}, +{0x1.4dffff3ba420bp+0, 0x1.9fc6539a6454ep-56}, +{0x1.51fffffe391c9p+0, -0x1.601ef3353ac83p-54}, +{0x1.560000e342455p+0, 0x1.3fb7fac8ac151p-55}, +{0x1.59ffffc39676fp+0, 0x1.4fe7dd6659cc2p-55}, +{0x1.5dfffff10ef42p+0, -0x1.48154cb592bcbp-54}, +#elif N == 128 +{0x1.61000014fb66bp-1, 0x1.e026c91425b3cp-56}, +{0x1.63000034db495p-1, 0x1.dbfea48005d41p-55}, +{0x1.650000d94d478p-1, 0x1.e7fa786d6a5b7p-55}, +{0x1.67000074e6fadp-1, 0x1.1fcea6b54254cp-57}, +{0x1.68ffffedf0faep-1, -0x1.c7e274c590efdp-56}, +{0x1.6b0000763c5bcp-1, -0x1.ac16848dcda01p-55}, +{0x1.6d0001e5cc1f6p-1, 0x1.33f1c9d499311p-55}, +{0x1.6efffeb05f63ep-1, -0x1.e80041ae22d53p-56}, +{0x1.710000e86978p-1, 0x1.bff6671097952p-56}, +{0x1.72ffffc67e912p-1, 0x1.c00e226bd8724p-55}, +{0x1.74fffdf81116ap-1, -0x1.e02916ef101d2p-57}, +{0x1.770000f679c9p-1, -0x1.7fc71cd549c74p-57}, +{0x1.78ffffa7ec835p-1, 0x1.1bec19ef50483p-55}, +{0x1.7affffe20c2e6p-1, -0x1.07e1729cc6465p-56}, +{0x1.7cfffed3fc9p-1, -0x1.08072087b8b1cp-55}, +{0x1.7efffe9261a76p-1, 0x1.dc0286d9df9aep-55}, +{0x1.81000049ca3e8p-1, 0x1.97fd251e54c33p-55}, +{0x1.8300017932c8fp-1, -0x1.afee9b630f381p-55}, +{0x1.850000633739cp-1, 0x1.9bfbf6b6535bcp-55}, +{0x1.87000204289c6p-1, -0x1.bbf65f3117b75p-55}, +{0x1.88fffebf57904p-1, -0x1.9006ea23dcb57p-55}, +{0x1.8b00022bc04dfp-1, -0x1.d00df38e04b0ap-56}, +{0x1.8cfffe50c1b8ap-1, -0x1.8007146ff9f05p-55}, +{0x1.8effffc918e43p-1, 0x1.3817bd07a7038p-55}, +{0x1.910001efa5fc7p-1, 0x1.93e9176dfb403p-55}, +{0x1.9300013467bb9p-1, 0x1.f804e4b980276p-56}, +{0x1.94fffe6ee076fp-1, -0x1.f7ef0d9ff622ep-55}, +{0x1.96fffde3c12d1p-1, -0x1.082aa962638bap-56}, +{0x1.98ffff4458a0dp-1, -0x1.7801b9164a8efp-55}, +{0x1.9afffdd982e3ep-1, -0x1.740e08a5a9337p-55}, +{0x1.9cfffed49fb66p-1, 0x1.fce08c19bep-60}, +{0x1.9f00020f19c51p-1, -0x1.a3faa27885b0ap-55}, +{0x1.a10001145b006p-1, 0x1.4ff489958da56p-56}, +{0x1.a300007bbf6fap-1, 0x1.cbeab8a2b6d18p-55}, +{0x1.a500010971d79p-1, 0x1.8fecadd78793p-55}, +{0x1.a70001df52e48p-1, -0x1.f41763dd8abdbp-55}, +{0x1.a90001c593352p-1, -0x1.ebf0284c27612p-55}, +{0x1.ab0002a4f3e4bp-1, -0x1.9fd043cff3f5fp-57}, 
+{0x1.acfffd7ae1ed1p-1, -0x1.23ee7129070b4p-55}, +{0x1.aefffee510478p-1, 0x1.a063ee00edea3p-57}, +{0x1.b0fffdb650d5bp-1, 0x1.a06c8381f0ab9p-58}, +{0x1.b2ffffeaaca57p-1, -0x1.9011e74233c1dp-56}, +{0x1.b4fffd995badcp-1, -0x1.9ff1068862a9fp-56}, +{0x1.b7000249e659cp-1, 0x1.aff45d0864f3ep-55}, +{0x1.b8ffff987164p-1, 0x1.cfe7796c2c3f9p-56}, +{0x1.bafffd204cb4fp-1, -0x1.3ff27eef22bc4p-57}, +{0x1.bcfffd2415c45p-1, -0x1.cffb7ee3bea21p-57}, +{0x1.beffff86309dfp-1, -0x1.14103972e0b5cp-55}, +{0x1.c0fffe1b57653p-1, 0x1.bc16494b76a19p-55}, +{0x1.c2ffff1fa57e3p-1, -0x1.4feef8d30c6edp-57}, +{0x1.c4fffdcbfe424p-1, -0x1.43f68bcec4775p-55}, +{0x1.c6fffed54b9f7p-1, 0x1.47ea3f053e0ecp-55}, +{0x1.c8fffeb998fd5p-1, 0x1.383068df992f1p-56}, +{0x1.cb0002125219ap-1, -0x1.8fd8e64180e04p-57}, +{0x1.ccfffdd94469cp-1, 0x1.e7ebe1cc7ea72p-55}, +{0x1.cefffeafdc476p-1, 0x1.ebe39ad9f88fep-55}, +{0x1.d1000169af82bp-1, 0x1.57d91a8b95a71p-56}, +{0x1.d30000d0ff71dp-1, 0x1.9c1906970c7dap-55}, +{0x1.d4fffea790fc4p-1, -0x1.80e37c558fe0cp-58}, +{0x1.d70002edc87e5p-1, -0x1.f80d64dc10f44p-56}, +{0x1.d900021dc82aap-1, -0x1.47c8f94fd5c5cp-56}, +{0x1.dafffd86b0283p-1, 0x1.c7f1dc521617ep-55}, +{0x1.dd000296c4739p-1, 0x1.8019eb2ffb153p-55}, +{0x1.defffe54490f5p-1, 0x1.e00d2c652cc89p-57}, +{0x1.e0fffcdabf694p-1, -0x1.f8340202d69d2p-56}, +{0x1.e2fffdb52c8ddp-1, 0x1.b00c1ca1b0864p-56}, +{0x1.e4ffff24216efp-1, 0x1.2ffa8b094ab51p-56}, +{0x1.e6fffe88a5e11p-1, -0x1.7f673b1efbe59p-58}, +{0x1.e9000119eff0dp-1, -0x1.4808d5e0bc801p-55}, +{0x1.eafffdfa51744p-1, 0x1.80006d54320b5p-56}, +{0x1.ed0001a127fa1p-1, -0x1.002f860565c92p-58}, +{0x1.ef00007babcc4p-1, -0x1.540445d35e611p-55}, +{0x1.f0ffff57a8d02p-1, -0x1.ffb3139ef9105p-59}, +{0x1.f30001ee58ac7p-1, 0x1.a81acf2731155p-55}, +{0x1.f4ffff5823494p-1, 0x1.a3f41d4d7c743p-55}, +{0x1.f6ffffca94c6bp-1, -0x1.202f41c987875p-57}, +{0x1.f8fffe1f9c441p-1, 0x1.77dd1f477e74bp-56}, +{0x1.fafffd2e0e37ep-1, -0x1.f01199a7ca331p-57}, +{0x1.fd0001c77e49ep-1, 0x1.181ee4bceacb1p-56}, +{0x1.feffff7e0c331p-1, -0x1.e05370170875ap-57}, +{0x1.00ffff465606ep+0, -0x1.a7ead491c0adap-55}, +{0x1.02ffff3867a58p+0, -0x1.77f69c3fcb2ep-54}, +{0x1.04ffffdfc0d17p+0, 0x1.7bffe34cb945bp-54}, +{0x1.0700003cd4d82p+0, 0x1.20083c0e456cbp-55}, +{0x1.08ffff9f2cbe8p+0, -0x1.dffdfbe37751ap-57}, +{0x1.0b000010cda65p+0, -0x1.13f7faee626ebp-54}, +{0x1.0d00001a4d338p+0, 0x1.07dfa79489ff7p-55}, +{0x1.0effffadafdfdp+0, -0x1.7040570d66bcp-56}, +{0x1.110000bbafd96p+0, 0x1.e80d4846d0b62p-55}, +{0x1.12ffffae5f45dp+0, 0x1.dbffa64fd36efp-54}, +{0x1.150000dd59ad9p+0, 0x1.a0077701250aep-54}, +{0x1.170000f21559ap+0, 0x1.dfdf9e2e3deeep-55}, +{0x1.18ffffc275426p+0, 0x1.10030dc3b7273p-54}, +{0x1.1b000123d3c59p+0, 0x1.97f7980030188p-54}, +{0x1.1cffff8299eb7p+0, -0x1.5f932ab9f8c67p-57}, +{0x1.1effff48ad4p+0, 0x1.37fbf9da75bebp-54}, +{0x1.210000c8b86a4p+0, 0x1.f806b91fd5b22p-54}, +{0x1.2300003854303p+0, 0x1.3ffc2eb9fbf33p-54}, +{0x1.24fffffbcf684p+0, 0x1.601e77e2e2e72p-56}, +{0x1.26ffff52921d9p+0, 0x1.ffcbb767f0c61p-56}, +{0x1.2900014933a3cp+0, -0x1.202ca3c02412bp-56}, +{0x1.2b00014556313p+0, -0x1.2808233f21f02p-54}, +{0x1.2cfffebfe523bp+0, -0x1.8ff7e384fdcf2p-55}, +{0x1.2f0000bb8ad96p+0, -0x1.5ff51503041c5p-55}, +{0x1.30ffffb7ae2afp+0, -0x1.10071885e289dp-55}, +{0x1.32ffffeac5f7fp+0, -0x1.1ff5d3fb7b715p-54}, +{0x1.350000ca66756p+0, 0x1.57f82228b82bdp-54}, +{0x1.3700011fbf721p+0, 0x1.000bac40dd5ccp-55}, +{0x1.38ffff9592fb9p+0, -0x1.43f9d2db2a751p-54}, +{0x1.3b00004ddd242p+0, 0x1.57f6b707638e1p-55}, +{0x1.3cffff5b2c957p+0, 0x1.a023a10bf1231p-56}, 
+{0x1.3efffeab0b418p+0, 0x1.87f6d66b152bp-54}, +{0x1.410001532aff4p+0, 0x1.7f8375f198524p-57}, +{0x1.4300017478b29p+0, 0x1.301e672dc5143p-55}, +{0x1.44fffe795b463p+0, 0x1.9ff69b8b2895ap-55}, +{0x1.46fffe80475ep+0, -0x1.5c0b19bc2f254p-54}, +{0x1.48fffef6fc1e7p+0, 0x1.b4009f23a2a72p-54}, +{0x1.4afffe5bea704p+0, -0x1.4ffb7bf0d7d45p-54}, +{0x1.4d000171027dep+0, -0x1.9c06471dc6a3dp-54}, +{0x1.4f0000ff03ee2p+0, 0x1.77f890b85531cp-54}, +{0x1.5100012dc4bd1p+0, 0x1.004657166a436p-57}, +{0x1.530001605277ap+0, -0x1.6bfcece233209p-54}, +{0x1.54fffecdb704cp+0, -0x1.902720505a1d7p-55}, +{0x1.56fffef5f54a9p+0, 0x1.bbfe60ec96412p-54}, +{0x1.5900017e61012p+0, 0x1.87ec581afef9p-55}, +{0x1.5b00003c93e92p+0, -0x1.f41080abf0ccp-54}, +{0x1.5d0001d4919bcp+0, -0x1.8812afb254729p-54}, +{0x1.5efffe7b87a89p+0, -0x1.47eb780ed6904p-54}, +#endif +}, +#endif /* !HAVE_FAST_FMA */ +}; diff --git a/pl/math/math_config.h b/pl/math/math_config.h index e77170e..22654ad 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -425,4 +425,31 @@ extern const struct asinhf_data { float coeffs[ASINHF_NCOEFFS]; } __asinhf_data HIDDEN; + +#define LOG_TABLE_BITS 7 +#define LOG_POLY_ORDER 6 +#define LOG_POLY1_ORDER 12 +extern const struct log_data +{ + double ln2hi; + double ln2lo; + double poly[LOG_POLY_ORDER - 1]; /* First coefficient is 1. */ + double poly1[LOG_POLY1_ORDER - 1]; + struct + { + double invc, logc; + } tab[1 << LOG_TABLE_BITS]; +#if !HAVE_FAST_FMA + struct + { + double chi, clo; + } tab2[1 << LOG_TABLE_BITS]; +#endif +} __log_data HIDDEN; + +#define ASINH_NCOEFFS 18 +extern const struct asinh_data +{ + double poly[ASINH_NCOEFFS]; +} __asinh_data HIDDEN; #endif diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 45ee627..9fc0e32 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -12,6 +12,7 @@ F (erfcf, -4.0, 10.0) F (erff, -4.0, 4.0) F (log10f, 0.01, 11.1) +D (asinh, -10.0, 10.0) D (atan, -10.0, 10.0) {"atan2", 'd', 0, -10.0, 10.0, {.d = atan2_wrap}}, D (erf, -6,6) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 746562d..3ef2d0e 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -91,6 +91,15 @@ t asinhf 0x1p-12 1.0 50000 t asinhf 1.0 0x1p11 50000 t asinhf 0x1p11 0x1p127 20000 +L=2.0 +t asinh -0x1p-26 0x1p-26 50000 +t asinh 0x1p-26 1.0 40000 +t asinh -0x1p-26 -1.0 10000 +t asinh 1.0 100.0 40000 +t asinh -1.0 -100.0 10000 +t asinh 100.0 inf 50000 +t asinh -100.0 -inf 10000 + done # vector functions diff --git a/pl/math/test/testcases/directed/asinh.tst b/pl/math/test/testcases/directed/asinh.tst new file mode 100644 index 0000000..f0d50ac --- /dev/null +++ b/pl/math/test/testcases/directed/asinh.tst @@ -0,0 +1,18 @@ +; asinh.tst +; +; Copyright (c) 2022, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=asinh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=asinh op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=asinh op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=asinh op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=asinh op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 +func=asinh op1=fff00000.00000000 result=fff00000.00000000 errno=0 +func=asinh op1=00000000.00000000 result=00000000.00000000 errno=0 +func=asinh op1=80000000.00000000 result=80000000.00000000 errno=0 +; No exception is raised with certain versions of glibc. 
Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=asinh op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux +func=asinh op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 28ef25c..7e8145f 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -9,6 +9,7 @@ F2 (atan2) F1 (erfc) F1 (erf) F1 (log10) +D1 (asinh) D2 (atan2) D1 (erfc) D1 (log10) diff --git a/pl/math/tools/asinh.sollya b/pl/math/tools/asinh.sollya new file mode 100644 index 0000000..6ff217f --- /dev/null +++ b/pl/math/tools/asinh.sollya @@ -0,0 +1,28 @@ +// polynomial for approximating asinh(x) +// +// Copyright (c) 2022, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +// Polynomial is used in [2^-26, 1]. However it is least accurate close to 1, so +// we use 2^-6 as the lower bound for coeff generation, which yields sufficiently +// accurate results in [2^-26, 2^-6]. +a = 0x1p-6; +b = 1.0; + +f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2); + +approx = proc(poly, d) { + return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); +}; + +poly = 0; +for i from 0 to deg do { + i; + p = roundcoefficients(approx(poly,i), [|D ...|]); + poly = poly + x^i*coeff(p,0); +}; + + +display = hexadecimal; +print("coeffs:"); +for i from 0 to deg do coeff(poly,i); -- cgit v1.2.3 From c6e5af7d30eb825f53ec7596021f7237a0ca23e3 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Tue, 12 Jul 2022 12:25:09 +0100 Subject: pl/math: Add scalar log1p New routine uses a polynomial on reduced interval. Worst-case error is about 1.7 ULP. --- pl/math/include/mathlib.h | 1 + pl/math/log1p_2u.c | 144 ++++++++++++++++++++++++++++++ pl/math/log1p_data.c | 18 ++++ pl/math/math_config.h | 6 ++ pl/math/test/mathbench_funcs.h | 1 + pl/math/test/runulp.sh | 10 +++ pl/math/test/testcases/directed/log1p.tst | 22 +++++ pl/math/test/ulp_funcs.h | 1 + pl/math/tools/log1p.sollya | 30 +++++++ 9 files changed, 233 insertions(+) create mode 100644 pl/math/log1p_2u.c create mode 100644 pl/math/log1p_data.c create mode 100644 pl/math/test/testcases/directed/log1p.tst create mode 100644 pl/math/tools/log1p.sollya diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index bd2d8c2..8a92a47 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -18,6 +18,7 @@ float log10f (float); double asinh (double); double atan2 (double, double); double log10 (double); +double log1p (double); float __s_atanf (float); float __s_atan2f (float, float); diff --git a/pl/math/log1p_2u.c b/pl/math/log1p_2u.c new file mode 100644 index 0000000..c214954 --- /dev/null +++ b/pl/math/log1p_2u.c @@ -0,0 +1,144 @@ +/* + * Double-precision log(1+x) function. + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define Ln2Hi 0x1.62e42fefa3800p-1 +#define Ln2Lo 0x1.ef35793c76730p-45 +#define HfRt2Top 0x3fe6a09e /* top32(asuint64(sqrt(2)/2)). */ +#define OneMHfRt2Top \ + 0x00095f62 /* top32(asuint64(1)) - top32(asuint64(sqrt(2)/2)). */ +#define OneTop12 0x3ff +#define BottomMask 0xffffffff +#define OneMHfRt2 0x3fd2bec333018866 +#define Rt2MOne 0x3fda827999fcef32 +#define AbsMask 0x7fffffffffffffff +#define ExpM63 0x3c00 + +#define C(i) __log1p_data.coeffs[i] + +static inline double +eval_poly (double f) +{ + /* Evaluate polynomial using Estrin's method. 
*/ + double p_01 = fma (f, C (1), C (0)); + double p_23 = fma (f, C (3), C (2)); + double p_45 = fma (f, C (5), C (4)); + double p_67 = fma (f, C (7), C (6)); + double p_89 = fma (f, C (9), C (8)); + double p_ab = fma (f, C (11), C (10)); + double p_cd = fma (f, C (13), C (12)); + double p_ef = fma (f, C (15), C (14)); + double p_gh = fma (f, C (17), C (16)); + + double f2 = f * f; + double p_03 = fma (f2, p_23, p_01); + double p_47 = fma (f2, p_67, p_45); + double p_8b = fma (f2, p_ab, p_89); + double p_cf = fma (f2, p_ef, p_cd); + double p_gi = fma (f2, C (18), p_gh); + + double f4 = f2 * f2; + double p_07 = fma (f4, p_47, p_03); + double p_8f = fma (f4, p_cf, p_8b); + + double f8 = f4 * f4; + double p_0f = fma (f8, p_8f, p_07); + + return fma (f8 * f8, p_gi, p_0f); +} + +/* log1p approximation using polynomial on reduced interval. Largest + observed errors are near the lower boundary of the region where k + is 0. + Maximum measured error: 1.7ULP. + log1p(-0x1.2e515c0f31f8p-2) got -0x1.6648c36863fc2p-2 + want -0x1.6648c36863fc4p-2. */ +double +log1p (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t ia = ix & AbsMask; + uint32_t ia16 = ia >> 48; + + /* Handle special cases first. */ + if (unlikely (ia16 >= 0x7ff0 || ix >= 0xbff0000000000000 + || ix == 0x8000000000000000)) + { + if (ix == 0x8000000000000000 || ix == 0x7ff0000000000000) + { + /* x == -0 => log1p(x) = -0. + x == Inf => log1p(x) = Inf. */ + return x; + } + if (ix == 0xbff0000000000000) + { + /* x == -1 => log1p(x) = -Inf. */ + return __math_divzero (-1); + ; + } + if (ia16 >= 0x7ff0) + { + /* x == +/-NaN => log1p(x) = NaN. */ + return __math_invalid (asdouble (ia)); + } + /* x < -1 => log1p(x) = NaN. + x == -Inf => log1p(x) = NaN. */ + return __math_invalid (x); + } + + /* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f + is in [sqrt(2)/2, sqrt(2)]): + log1p(x) = k*log(2) + log1p(f). + + f may not be representable exactly, so we need a correction term: + let m = round(1 + x), c = (1 + x) - m. + c << m: at very small x, log1p(x) ~ x, hence: + log(1+x) - log(m) ~ c/m. + + We therefore calculate log1p(x) by k*log2 + log1p(f) + c/m. */ + + uint64_t sign = ix & ~AbsMask; + if (ia <= OneMHfRt2 || (!sign && ia <= Rt2MOne)) + { + if (unlikely (ia16 <= ExpM63)) + { + /* If exponent of x <= -63 then shortcut the polynomial and avoid + underflow by just returning x, which is exactly rounded in this + region. */ + return x; + } + /* If x is in [sqrt(2)/2 - 1, sqrt(2) - 1] then we can shortcut all the + logic below, as k = 0 and f = x and therefore representable exactly. + All we need is to return the polynomial. */ + return fma (x, eval_poly (x) * x, x); + } + + /* Obtain correctly scaled k by manipulation in the exponent. */ + double m = x + 1; + uint64_t mi = asuint64 (m); + uint32_t u = (mi >> 32) + OneMHfRt2Top; + int32_t k = (int32_t) (u >> 20) - OneTop12; + + /* Correction term c/m. */ + double cm = (x - (m - 1)) / m; + + /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */ + uint32_t utop = (u & 0x000fffff) + HfRt2Top; + uint64_t u_red = ((uint64_t) utop << 32) | (mi & BottomMask); + double f = asdouble (u_red) - 1; + + /* Approximate log1p(x) on the reduced input using a polynomial. Because + log1p(0)=0 we choose an approximation of the form: + x + C0*x^2 + C1*x^3 + C2x^4 + ... + Hence approximation has the form f + f^2 * P(f) + where P(x) = C0 + C1*x + C2x^2 + ... 
*/ + double p = fma (f, eval_poly (f) * f, f); + + double kd = k; + double y = fma (Ln2Lo, kd, cm); + return y + fma (Ln2Hi, kd, p); +} diff --git a/pl/math/log1p_data.c b/pl/math/log1p_data.c new file mode 100644 index 0000000..9380d13 --- /dev/null +++ b/pl/math/log1p_data.c @@ -0,0 +1,18 @@ +/* + * Data used in double-precision log(1+x) function. + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Polynomial coefficients generated using Remez algorithm, see + log1p.sollya for details. */ +const struct log1p_data __log1p_data = { + .coeffs = {-0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2, + 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3, + -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4, + 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4, + -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5, + 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4, + -0x1.cfa7385bdb37ep-6}}; diff --git a/pl/math/math_config.h b/pl/math/math_config.h index 22654ad..5e46e2d 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -452,4 +452,10 @@ extern const struct asinh_data { double poly[ASINH_NCOEFFS]; } __asinh_data HIDDEN; + +#define LOG1P_NCOEFFS 19 +extern const struct log1p_data +{ + double coeffs[LOG1P_NCOEFFS]; +} __log1p_data HIDDEN; #endif diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 9fc0e32..441c937 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -18,6 +18,7 @@ D (atan, -10.0, 10.0) D (erf, -6,6) D (erfc, -6.0, 28.0) D (log10, 0.01, 11.1) +D (log1p, -0.9, 10.0) #if WANT_VMATH F (__s_atanf, -10.0, 10.0) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 3ef2d0e..72e5378 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -100,6 +100,16 @@ t asinh -1.0 -100.0 10000 t asinh 100.0 inf 50000 t asinh -100.0 -inf 10000 +L=2.0 +t log1p -10.0 10.0 10000 +t log1p 0.0 0x1p-23 50000 +t log1p 0x1p-23 0.001 50000 +t log1p 0.001 1.0 50000 +t log1p 0.0 -0x1p-23 50000 +t log1p -0x1p-23 -0.001 50000 +t log1p -0.001 -1.0 50000 +t log1p -1.0 inf 5000 + done # vector functions diff --git a/pl/math/test/testcases/directed/log1p.tst b/pl/math/test/testcases/directed/log1p.tst new file mode 100644 index 0000000..41a1896 --- /dev/null +++ b/pl/math/test/testcases/directed/log1p.tst @@ -0,0 +1,22 @@ +; log1p.tst +; +; Copyright (c) 2009-2022, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=log1p op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=log1p op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=log1p op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=log1p op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=log1p op1=fff02000.00000000 result=7ff80000.00000001 errno=0 status=i +func=log1p op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 +; Cases 6, 9 , 10, 11, 12 fail with certain versions of GLIBC and not others. +; The main reason seems to be the handling of errno and exceptions. + +func=log1p op1=00000000.00000000 result=00000000.00000000 errno=0 +func=log1p op1=80000000.00000000 result=80000000.00000000 errno=0 + +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. 
+func=log1p op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux +func=log1p op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 7e8145f..739ecfd 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -13,6 +13,7 @@ D1 (asinh) D2 (atan2) D1 (erfc) D1 (log10) +D1 (log1p) #if WANT_VMATH F (__s_atanf, __s_atanf, atan, mpfr_atan, 1, 1, f1, 0) F (__s_atan, __s_atan, atanl, mpfr_atan, 1, 0, d1, 0) diff --git a/pl/math/tools/log1p.sollya b/pl/math/tools/log1p.sollya new file mode 100644 index 0000000..fb159b3 --- /dev/null +++ b/pl/math/tools/log1p.sollya @@ -0,0 +1,30 @@ +// polynomial for approximating log(1+x) in double precision +// +// Copyright (c) 2022, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 20; + +a = sqrt(2)/2-1; +b = sqrt(2)-1; + +f = proc(y) { + return log(1+y); +}; + +approx = proc(poly, d) { + return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); +}; + +poly = x; +for i from 2 to deg do { + p = roundcoefficients(approx(poly,i), [|D ...|]); + poly = poly + x^i*coeff(p,0); +}; + + +print("coeffs:"); +display = hexadecimal; +for i from 2 to deg do coeff(poly,i); +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); -- cgit v1.2.3 From 8c77ce82dc4920d660e6814367c553a02f14b408 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Thu, 14 Jul 2022 11:41:37 +0100 Subject: pl/math: Add scalar log1pf The new routine uses a polynomial on a reduced interval. Worst-case error is about 2.1 ULP, with an option in math_config.h to use Horner instead of Estrin for the polynomial, which gives worst-case error of 1.3 ULP. --- pl/math/include/mathlib.h | 1 + pl/math/log1pf_2u1.c | 158 +++++++++++++++++++++++++++++ pl/math/log1pf_data.c | 14 +++ pl/math/math_config.h | 7 ++ pl/math/test/mathbench_funcs.h | 1 + pl/math/test/runulp.sh | 10 ++ pl/math/test/testcases/directed/log1pf.tst | 130 ++++++++++++++++++++++++ pl/math/test/ulp_funcs.h | 1 + pl/math/tools/log1pf.sollya | 21 ++++ 9 files changed, 343 insertions(+) create mode 100644 pl/math/log1pf_2u1.c create mode 100644 pl/math/log1pf_data.c create mode 100644 pl/math/test/testcases/directed/log1pf.tst create mode 100644 pl/math/tools/log1pf.sollya diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 8a92a47..7775ec3 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -14,6 +14,7 @@ float atan2f (float, float); float erfcf (float); float erff (float); float log10f (float); +float log1pf (float); double asinh (double); double atan2 (double, double); diff --git a/pl/math/log1pf_2u1.c b/pl/math/log1pf_2u1.c new file mode 100644 index 0000000..5b0d542 --- /dev/null +++ b/pl/math/log1pf_2u1.c @@ -0,0 +1,158 @@ +/* + * Single-precision log(1+x) function. + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define Ln2 (0x1.62e43p-1f) +#define SignMask (0x80000000) + +/* Biased exponent of the largest float m for which m^8 underflows. */ +#define M8UFLOW_BOUND_BEXP 112 +/* Biased exponent of the largest float for which we just return x. */ +#define TINY_BOUND_BEXP 103 + +#define C(i) __log1pf_data.coeffs[i] + +static inline float +eval_poly (float m, uint32_t e) +{ +#ifdef LOG1PF_2U5 + + /* 2.5 ulp variant. Approximate log(1+m) on [-0.25, 0.5] using Estrin + scheme. 
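     (Selected because math_config.h defines LOG1PF_2U5 in this patch;
     a hypothetical local edit there, shown below, switches to the
     1.3 ULP Horner variant instead:

       #define LOG1PF_1U3

     at the cost of a longer chain of dependent fmaf operations.)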
*/
+  float p_12 = fmaf (m, C (1), C (0));
+  float p_34 = fmaf (m, C (3), C (2));
+  float p_56 = fmaf (m, C (5), C (4));
+  float p_78 = fmaf (m, C (7), C (6));
+
+  float m2 = m * m;
+  float p_02 = fmaf (m2, p_12, m);
+  float p_36 = fmaf (m2, p_56, p_34);
+  float p_79 = fmaf (m2, C (8), p_78);
+
+  float m4 = m2 * m2;
+  float p_06 = fmaf (m4, p_36, p_02);
+
+  if (unlikely (e < M8UFLOW_BOUND_BEXP))
+    return p_06;
+
+  float m8 = m4 * m4;
+  return fmaf (m8, p_79, p_06);
+
+#elif defined(LOG1PF_1U3)
+
+  /* 1.3 ulp variant. Approximate log(1+m) on [-0.25, 0.5] using Horner
+     scheme. Our polynomial approximation for log1p has the form
+     x + C1 * x^2 + C2 * x^3 + C3 * x^4 + ...
+     Hence approximation has the form m + m^2 * P(m)
+     where P(x) = C1 + C2 * x + C3 * x^2 + ... .  */
+  float p = fmaf (C (8), m, C (7));
+  p = fmaf (p, m, C (6));
+  p = fmaf (p, m, C (5));
+  p = fmaf (p, m, C (4));
+  p = fmaf (p, m, C (3));
+  p = fmaf (p, m, C (2));
+  p = fmaf (p, m, C (1));
+  p = fmaf (p, m, C (0));
+  return fmaf (m, m * p, m);
+
+#else
+#error No log1pf approximation exists with the requested precision. Define LOG1PF_1U3 or LOG1PF_2U5.
+#endif
+}
+
+static inline uint32_t
+biased_exponent (uint32_t ix)
+{
+  return (ix & 0x7f800000) >> 23;
+}
+
+/* log1pf approximation using polynomial on reduced interval. Worst-case error
+   when using Estrin is roughly 2.02 ULP:
+   log1pf(0x1.21e13ap-2) got 0x1.fe8028p-3 want 0x1.fe802cp-3.  */
+float
+log1pf (float x)
+{
+  uint32_t ix = asuint (x);
+  uint32_t ia = ix & ~SignMask;
+  uint32_t ia12 = ia >> 20;
+  uint32_t e = biased_exponent (ix);
+
+  /* Handle special cases first.  */
+  if (unlikely (ia12 >= 0x7f8 || ix >= 0xbf800000 || ix == 0x80000000
+		|| e <= TINY_BOUND_BEXP))
+    {
+      if (ix == 0xff800000)
+	{
+	  /* x == -Inf => log1pf(x) = NaN.  */
+	  return NAN;
+	}
+      if ((ix == 0x7f800000 || e <= TINY_BOUND_BEXP) && ia12 <= 0x7f8)
+	{
+	  /* |x| < TinyBound => log1p(x) = x.
+	     x == Inf => log1pf(x) = Inf.  */
+	  return x;
+	}
+      if (ix == 0xbf800000)
+	{
+	  /* x == -1.0 => log1pf(x) = -Inf.  */
+	  return __math_divzerof (-1);
+	}
+      if (ia12 >= 0x7f8)
+	{
+	  /* x == +/-NaN => log1pf(x) = NaN.  */
+	  return __math_invalidf (asfloat (ia));
+	}
+      /* x < -1.0 => log1pf(x) = NaN.  */
+      return __math_invalidf (x);
+    }
+
+  /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
+     is in [-0.25, 0.5]):
+       log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
+
+     We approximate log1p(m) with a polynomial, then scale by
+     k*log(2). Instead of doing this directly, we use an intermediate
+     scale factor s = 4*2^(-k) to ensure the scale is representable
+     as a normalised fp32 number.  */
+
+  if (ix <= 0x3f000000 || ia <= 0x3e800000)
+    {
+      /* If x is in [-0.25, 0.5] then we can shortcut all the logic
+	 below, as k = 0 and m = x. All we need is to return the
+	 polynomial.  */
+      return eval_poly (x, e);
+    }
+
+  float m = x + 1.0f;
+
+  /* k is used to scale the input. 0x3f400000 is chosen as we are trying to
+     reduce x to the range [-0.25, 0.5]. Inside this range, k is 0.
+     Outside this range, if k is reinterpreted (not converted) as a float:
+       let k = sign * 2^p where sign = -1 if x < 0
+				       1 otherwise
+     and p is a negative integer whose magnitude increases with the
+     magnitude of x.  */
+  int k = (asuint (m) - 0x3f400000) & 0xff800000;
+
+  /* By using integer arithmetic, we obtain the necessary scaling by
+     subtracting the unbiased exponent of k from the exponent of x.
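
     A worked example, easy to check by hand: for x = 4.0f we get
     m = 5.0f = asfloat (0x40a00000), so
       k = (0x40a00000 - 0x3f400000) & 0xff800000 = 0x01000000.
     m_scale below starts as asfloat (0x40800000 - 0x01000000) = 1.0f,
     the scale factor s is likewise 1.0f, and the correction step gives
       m_scale = 1.0f + fmaf (0.25f, 1.0f, -1.0f) = 0.25f,
     inside [-0.25, 0.5] as required. scale_back then becomes
     (float) 0x01000000 * 0x1p-23f = 2.0f, assembling the result as
     log1p(4.0) = log1p(0.25) + 2 * log(2).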
*/ + float m_scale = asfloat (asuint (x) - k); + + /* Scale up to ensure that the scale factor is representable as normalised + fp32 number (s in [2**-126,2**26]), and scale m down accordingly. */ + float s = asfloat (asuint (4.0f) - k); + m_scale = m_scale + fmaf (0.25f, s, -1.0f); + + float p = eval_poly (m_scale, biased_exponent (asuint (m_scale))); + + /* The scale factor to be applied back at the end - by multiplying float(k) + by 2^-23 we get the unbiased exponent of k. */ + float scale_back = (float) k * 0x1.0p-23f; + + /* Apply the scaling back. */ + return fmaf (scale_back, Ln2, p); +} diff --git a/pl/math/log1pf_data.c b/pl/math/log1pf_data.c new file mode 100644 index 0000000..d7bc95c --- /dev/null +++ b/pl/math/log1pf_data.c @@ -0,0 +1,14 @@ +/* + * Data used in single-precision log1p(x) function. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "math_config.h" + +/* Polynomial coefficients generated using floating-point minimax + algorithm, see tools/log1pf.sollya for details. */ +const struct log1pf_data __log1pf_data + = {.coeffs = {-0x1p-1f, 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f, + -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f, 0x1.abcb6p-4f, + -0x1.6f0d5ep-5f}}; diff --git a/pl/math/math_config.h b/pl/math/math_config.h index 5e46e2d..623ceff 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -458,4 +458,11 @@ extern const struct log1p_data { double coeffs[LOG1P_NCOEFFS]; } __log1p_data HIDDEN; + +#define LOG1PF_2U5 +#define LOG1PF_NCOEFFS 9 +extern const struct log1pf_data +{ + float coeffs[LOG1PF_NCOEFFS]; +} __log1pf_data HIDDEN; #endif diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 441c937..9eff7e4 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -11,6 +11,7 @@ F (atanf, -10.0, 10.0) F (erfcf, -4.0, 10.0) F (erff, -4.0, 4.0) F (log10f, 0.01, 11.1) +F (log1pf, -0.9, 10.0) D (asinh, -10.0, 10.0) D (atan, -10.0, 10.0) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 72e5378..ebe5b92 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -110,6 +110,16 @@ t log1p -0x1p-23 -0.001 50000 t log1p -0.001 -1.0 50000 t log1p -1.0 inf 5000 +L=2.0 +t log1pf -10.0 10.0 10000 +t log1pf 0.0 0x1p-23 50000 +t log1pf 0x1p-23 0.001 50000 +t log1pf 0.001 1.0 50000 +t log1pf 0.0 -0x1p-23 50000 +t log1pf -0x1p-23 -0.001 50000 +t log1pf -0.001 -1.0 50000 +t log1pf -1.0 inf 5000 + done # vector functions diff --git a/pl/math/test/testcases/directed/log1pf.tst b/pl/math/test/testcases/directed/log1pf.tst new file mode 100644 index 0000000..a543887 --- /dev/null +++ b/pl/math/test/testcases/directed/log1pf.tst @@ -0,0 +1,130 @@ +; log1pf.tst +; +; Copyright (c) 2009-2022, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=log1pf op1=7fc00001 result=7fc00001 errno=0 +func=log1pf op1=ffc00001 result=7fc00001 errno=0 +func=log1pf op1=7f800001 result=7fc00001 errno=0 status=i +func=log1pf op1=ff800001 result=7fc00001 errno=0 status=i +func=log1pf op1=ff810000 result=7fc00001 errno=0 status=i +func=log1pf op1=7f800000 result=7f800000 errno=0 + +; Cases 6, 9 , 10, 11, 12 fail with certain versions of GLIBC and not others. +; The main reason seems to be the handling of errno and exceptions. + +func=log1pf op1=00000000 result=00000000 errno=0 +func=log1pf op1=80000000 result=80000000 errno=0 + +; No exception is raised with certain versions of glibc. 
Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=log1pf op1=00000001 result=00000001 errno=0 maybestatus=ux +func=log1pf op1=80000001 result=80000001 errno=0 maybestatus=ux + +func=log1pf op1=3f1e91ee result=3ef6d127.fdb errno=0 +func=log1pf op1=3f201046 result=3ef8a881.fba errno=0 +func=log1pf op1=3f21b916 result=3efab23b.f9f errno=0 +func=log1pf op1=3f21bde6 result=3efab821.fee errno=0 +func=log1pf op1=3f22a5ee result=3efbd435.ff2 errno=0 +func=log1pf op1=3f231b56 result=3efc63b7.e26 errno=0 +func=log1pf op1=3f23ce96 result=3efd3e83.fc8 errno=0 +func=log1pf op1=3eee18c6 result=3ec38576.02e errno=0 +func=log1pf op1=3eee2f41 result=3ec394ce.057 errno=0 +func=log1pf op1=3eee770d result=3ec3c5cc.00c errno=0 +func=log1pf op1=3eee7fed result=3ec3cbda.065 errno=0 +func=log1pf op1=3eee8fb2 result=3ec3d69c.008 errno=0 +func=log1pf op1=3eeeb8eb result=3ec3f2ba.061 errno=0 +func=log1pf op1=3eeeccfd result=3ec4006a.01d errno=0 +func=log1pf op1=3eeef5f0 result=3ec41c56.020 errno=0 +func=log1pf op1=3eeeff12 result=3ec42290.00c errno=0 +func=log1pf op1=3eef05cf result=3ec42728.052 errno=0 +func=log1pf op1=3eef13d3 result=3ec430b6.00e errno=0 +func=log1pf op1=3eef2e70 result=3ec442da.04a errno=0 +func=log1pf op1=3eef3fbf result=3ec44ea6.055 errno=0 +func=log1pf op1=3eef3feb result=3ec44ec4.021 errno=0 +func=log1pf op1=3eef4399 result=3ec45146.011 errno=0 +func=log1pf op1=3eef452e result=3ec4525a.049 errno=0 +func=log1pf op1=3eef4ea9 result=3ec458d0.020 errno=0 +func=log1pf op1=3eef7365 result=3ec471d8.05e errno=0 +func=log1pf op1=3eefa38f result=3ec492a8.003 errno=0 +func=log1pf op1=3eefb1f1 result=3ec49c74.015 errno=0 +func=log1pf op1=3eefb334 result=3ec49d50.023 errno=0 +func=log1pf op1=3eefb3c1 result=3ec49db0.0bf errno=0 +func=log1pf op1=3eefb591 result=3ec49eec.15d errno=0 +func=log1pf op1=3eefd736 result=3ec4b5d6.02d errno=0 +func=log1pf op1=3eefd797 result=3ec4b618.114 errno=0 +func=log1pf op1=3eefee5d result=3ec4c59a.071 errno=0 +func=log1pf op1=3eeffff4 result=3ec4d194.0a7 errno=0 +func=log1pf op1=3ef00cd1 result=3ec4da56.025 errno=0 +func=log1pf op1=3ef0163a result=3ec4e0be.07a errno=0 +func=log1pf op1=3ef01e89 result=3ec4e666.007 errno=0 +func=log1pf op1=3ef02004 result=3ec4e768.00a errno=0 +func=log1pf op1=3ef02c40 result=3ec4efbc.017 errno=0 +func=log1pf op1=3ef05b50 result=3ec50fc4.031 errno=0 +func=log1pf op1=3ef05bb1 result=3ec51006.05f errno=0 +func=log1pf op1=3ef0651b result=3ec5166e.0d9 errno=0 +func=log1pf op1=3ef06609 result=3ec51710.02a errno=0 +func=log1pf op1=3ef0666a result=3ec51752.049 errno=0 +func=log1pf op1=3ef0791e result=3ec5240c.0a8 errno=0 +func=log1pf op1=3ef07d46 result=3ec526e0.00e errno=0 +func=log1pf op1=3ef091fd result=3ec534f8.03c errno=0 +func=log1pf op1=3ef09602 result=3ec537b4.128 errno=0 +func=log1pf op1=3ef09848 result=3ec53940.044 errno=0 +func=log1pf op1=3ef0a04f result=3ec53eb6.07d errno=0 +func=log1pf op1=3ef0ab6a result=3ec54644.062 errno=0 +func=log1pf op1=3ef0ae49 result=3ec54838.002 errno=0 +func=log1pf op1=3ef0c1b8 result=3ec55570.000 errno=0 +func=log1pf op1=3ef0ca06 result=3ec55b16.00d errno=0 +func=log1pf op1=3ef0cc29 result=3ec55c8a.095 errno=0 +func=log1pf op1=3ef0d228 result=3ec5609e.04f errno=0 +func=log1pf op1=3ef0d8c0 result=3ec5651a.05e errno=0 +func=log1pf op1=3ef0dc0c result=3ec56758.029 errno=0 +func=log1pf op1=3ef0e0e8 result=3ec56aa6.02e errno=0 +func=log1pf op1=3ef0e502 result=3ec56d70.102 errno=0 +func=log1pf op1=3ef0e754 result=3ec56f04.017 errno=0 +func=log1pf 
op1=3ef0efe9 result=3ec574da.01c errno=0 +func=log1pf op1=3ef0f309 result=3ec576fa.016 errno=0 +func=log1pf op1=3ef0f499 result=3ec5780a.005 errno=0 +func=log1pf op1=3ef0f6c2 result=3ec57982.083 errno=0 +func=log1pf op1=3ef0f852 result=3ec57a92.05d errno=0 +func=log1pf op1=3ef0f9e2 result=3ec57ba2.02e errno=0 +func=log1pf op1=3ef119ee result=3ec5916c.024 errno=0 +func=log1pf op1=3ef11edf result=3ec594c8.03d errno=0 +func=log1pf op1=3ef128c4 result=3ec59b82.001 errno=0 +func=log1pf op1=3ef12ac1 result=3ec59cdc.04b errno=0 +func=log1pf op1=3ef12fea result=3ec5a05e.045 errno=0 +func=log1pf op1=3ef131e7 result=3ec5a1b8.05a errno=0 +func=log1pf op1=3ef134e1 result=3ec5a3be.00e errno=0 +func=log1pf op1=3ef1397a result=3ec5a6de.127 errno=0 +func=log1pf op1=3ef13ade result=3ec5a7d0.0f6 errno=0 +func=log1pf op1=3ef13c0d result=3ec5a89e.054 errno=0 +func=log1pf op1=3ef13d71 result=3ec5a990.016 errno=0 +func=log1pf op1=3ef14074 result=3ec5ab9c.12c errno=0 +func=log1pf op1=3ef146a0 result=3ec5afce.035 errno=0 +func=log1pf op1=3ef14a39 result=3ec5b240.024 errno=0 +func=log1pf op1=3ef14d39 result=3ec5b44a.00c errno=0 +func=log1pf op1=3ef152a3 result=3ec5b7f8.04d errno=0 +func=log1pf op1=3ef170a1 result=3ec5cc5a.021 errno=0 +func=log1pf op1=3ef17855 result=3ec5d196.0dc errno=0 +func=log1pf op1=3ef17ece result=3ec5d5fc.010 errno=0 +func=log1pf op1=3ef1810c result=3ec5d782.08e errno=0 +func=log1pf op1=3ef18da9 result=3ec5e014.0ae errno=0 +func=log1pf op1=3ef19054 result=3ec5e1e4.1a2 errno=0 +func=log1pf op1=3ef190ea result=3ec5e24a.048 errno=0 +func=log1pf op1=3ef1a739 result=3ec5f172.0d8 errno=0 +func=log1pf op1=3ef1a83c result=3ec5f222.018 errno=0 +func=log1pf op1=3ef1bbcc result=3ec5ff6c.09d errno=0 +func=log1pf op1=3ef1bd3c result=3ec60066.03a errno=0 +func=log1pf op1=3ef1d6ee result=3ec611da.056 errno=0 +func=log1pf op1=3ef1de36 result=3ec616cc.01b errno=0 +func=log1pf op1=3ef1e623 result=3ec61c2e.008 errno=0 +func=log1pf op1=3ef1e9b1 result=3ec61e98.029 errno=0 +func=log1pf op1=3ef1ee19 result=3ec62196.0d8 errno=0 +func=log1pf op1=3ef1f13a result=3ec623b6.039 errno=0 +func=log1pf op1=3ef1f1a7 result=3ec62400.091 errno=0 +func=log1pf op1=3ef1f214 result=3ec6244a.0e8 errno=0 +func=log1pf op1=3ef206e1 result=3ec6326a.09b errno=0 +func=log1pf op1=3ef21245 result=3ec63a26.012 errno=0 +func=log1pf op1=3ef217fd result=3ec63e08.048 errno=0 +func=log1pf op1=3ef2186a result=3ec63e52.063 errno=0 diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 739ecfd..0825acd 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -9,6 +9,7 @@ F2 (atan2) F1 (erfc) F1 (erf) F1 (log10) +F1 (log1p) D1 (asinh) D2 (atan2) D1 (erfc) diff --git a/pl/math/tools/log1pf.sollya b/pl/math/tools/log1pf.sollya new file mode 100644 index 0000000..32b307b --- /dev/null +++ b/pl/math/tools/log1pf.sollya @@ -0,0 +1,21 @@ +// polynomial for approximating log(1+x) in single precision +// +// Copyright (c) 2022, Arm Limited. 
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 10; + +a = -0.25; +b = 0.5; + +f = proc(y) { + return log(1+y); +}; + +poly = fpminimax(f(x), deg, [|single ...|], [a;b]); + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 2 to deg do round(coeff(poly,i), SG, RN); -- cgit v1.2.3 From 241ac3945ce507f6d08f66fcf11596d785ba933f Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Thu, 14 Jul 2022 11:42:03 +0100 Subject: pl/math: Add vector/Neon log1pf The new routine is a Neon port of the scalar algorithm, using the same coefficients. The worst-case error is about 2.1 ulp when using Estrin, at the same value as the scalar algorithm. --- pl/math/include/mathlib.h | 4 ++ pl/math/math_config.h | 1 + pl/math/s_log1pf_2u1.c | 6 ++ pl/math/test/mathbench_funcs.h | 5 ++ pl/math/test/runulp.sh | 16 +++++ pl/math/test/ulp_funcs.h | 4 ++ pl/math/test/ulp_wrappers.h | 3 + pl/math/v_log1pf_2u1.c | 136 +++++++++++++++++++++++++++++++++++++++++ pl/math/v_math.h | 20 ++++++ pl/math/vn_log1pf_2u1.c | 12 ++++ 10 files changed, 207 insertions(+) create mode 100644 pl/math/s_log1pf_2u1.c create mode 100644 pl/math/v_log1pf_2u1.c create mode 100644 pl/math/vn_log1pf_2u1.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 7775ec3..e688a32 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -26,6 +26,7 @@ float __s_atan2f (float, float); float __s_erfcf (float); float __s_erff (float); float __s_log10f (float); +float __s_log1pf (float); double __s_atan (double); double __s_atan2 (double, double); @@ -55,6 +56,7 @@ __f32x4_t __v_erfcf (__f32x4_t); __f64x2_t __v_erfc (__f64x2_t); __f32x4_t __v_log10f (__f32x4_t); __f64x2_t __v_log10 (__f64x2_t); +__f32x4_t __v_log1pf (__f32x4_t); #if __GNUC__ >= 9 || __clang_major__ >= 8 #define __vpcs __attribute__((__aarch64_vector_pcs__)) @@ -70,6 +72,7 @@ __vpcs __f32x4_t __vn_erfcf (__f32x4_t); __vpcs __f64x2_t __vn_erfc (__f64x2_t); __vpcs __f32x4_t __vn_log10f (__f32x4_t); __vpcs __f64x2_t __vn_log10 (__f64x2_t); +__vpcs __f32x4_t __vn_log1pf (__f32x4_t); /* Vector functions following the vector PCS using ABI names. */ __vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t); @@ -82,6 +85,7 @@ __vpcs __f32x4_t _ZGVnN4v_erfcf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_erfc (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t); #endif #endif diff --git a/pl/math/math_config.h b/pl/math/math_config.h index 623ceff..f058753 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -460,6 +460,7 @@ extern const struct log1p_data } __log1p_data HIDDEN; #define LOG1PF_2U5 +#define V_LOG1PF_2U5 #define LOG1PF_NCOEFFS 9 extern const struct log1pf_data { diff --git a/pl/math/s_log1pf_2u1.c b/pl/math/s_log1pf_2u1.c new file mode 100644 index 0000000..fe01b05 --- /dev/null +++ b/pl/math/s_log1pf_2u1.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_log1pf_2u1.c" diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 9eff7e4..aa4a2eb 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -32,6 +32,7 @@ F (__s_erfcf, -6.0, 28.0) D (__s_erfc, -6.0, 28.0) F (__s_log10f, 0.01, 11.1) D (__s_log10, 0.01, 11.1) +F (__s_log1pf, -0.9, 10.0) #if __aarch64__ VF (__v_atanf, -10.0, 10.0) VD (__v_atan, -10.0, 10.0) @@ -43,6 +44,7 @@ VF (__v_erfcf, -6.0, 28.0) VD (__v_erfc, -6.0, 28.0) VD (__v_log10, 0.01, 11.1) VF (__v_log10f, 0.01, 11.1) +VF (__v_log1pf, -0.9, 10.0) #ifdef __vpcs VNF (__vn_atanf, -10.0, 10.0) VNF (_ZGVnN4v_atanf, -10.0, 10.0) @@ -73,6 +75,9 @@ VNF (_ZGVnN4v_log10f, 0.01, 11.1) VND (__vn_log10, 0.01, 11.1) VND (_ZGVnN2v_log10, 0.01, 11.1) + +VNF (__vn_log1pf, -0.9, 10.0) +VNF (_ZGVnN4v_log1pf, -0.9, 10.0) #endif #endif #endif diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index ebe5b92..9234f23 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -212,6 +212,17 @@ range_atanf=' 1e6 1e32 40000 ' +range_log1pf=' + -10.0 10.0 10000 + 0.0 0x1p-23 30000 + 0x1p-23 0.001 50000 + 0.001 1.0 50000 + 0.0 -0x1p-23 30000 + -0x1p-23 -0.001 30000 + -0.001 -1.0 50000 + -1.0 inf 1000 +' + # error limits L_erfc=3.7 L_erfcf=1.0 @@ -223,6 +234,7 @@ L_atan2=2.9 L_atan=3.0 L_atan2f=3.0 L_atanf=3.0 +L_log1pf=2.0 while read G F R do @@ -282,6 +294,10 @@ log10f __s_log10f $runs log10f __v_log10f $runv log10f __vn_log10f $runvn log10f _ZGVnN4v_log10f $runvn +log1pf __s_log1pf $runs +log1pf __v_log1pf $runv +log1pf __vn_log1pf $runvn +log1pf _ZGVnN4v_log1pf $runvn EOF [ 0 -eq $FAIL ] || { diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 0825acd..bd7026b 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -26,6 +26,7 @@ F (__s_erfcf, __s_erfcf, erfc, mpfr_erfc, 1, 1, f1, 0) F (__s_erfc, __s_erfc, erfcl, mpfr_erfc, 1, 0, d1, 0) F (__s_log10f, __s_log10f, log10, mpfr_log10, 1, 1, f1, 0) F (__s_log10, __s_log10, log10l, mpfr_log10, 1, 0, d1, 0) +F (__s_log1pf, __s_log1pf, log1p, mpfr_log1p, 1, 1, f1, 0) #if __aarch64__ F (__v_atanf, v_atanf, atan, mpfr_atan, 1, 1, f1, 1) F (__v_atan, v_atan, atanl, mpfr_atan, 1, 0, d1, 1) @@ -37,6 +38,7 @@ F (__v_erfcf, v_erfcf, erfc, mpfr_erfc, 1, 1, f1, 1) F (__v_erfc, v_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) F (__v_log10f, v_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (__v_log10, v_log10, log10l, mpfr_log10, 1, 0, d1, 1) +F (__v_log1pf, v_log1pf, log1p, mpfr_log1p, 1, 1, f1, 1) #ifdef __vpcs F (__vn_atanf, vn_atanf, atan, mpfr_atan, 1, 1, f1, 1) F (__vn_atan, vn_atan, atanl, mpfr_atan, 1, 0, d1, 1) @@ -48,6 +50,7 @@ F (__vn_erfcf, vn_erfcf, erfc, mpfr_erfc, 1, 1, f1, 1) F (__vn_erfc, vn_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) F (__vn_log10f, vn_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (__vn_log10, vn_log10, log10l, mpfr_log10, 1, 0, d1, 1) +F (__vn_log1pf, vn_log1pf, log1p, mpfr_log1p, 1, 1, f1, 1) F (_ZGVnN4v_atanf, Z_atanf, atan, mpfr_atan, 1, 1, f1, 1) F (_ZGVnN2v_atan, Z_atan, atanl, mpfr_atan, 1, 0, d1, 1) F (_ZGVnN4vv_atan2f, Z_atan2f, atan2, mpfr_atan2, 2, 1, f2, 1) @@ -58,6 +61,7 @@ F (_ZGVnN4v_erfcf, Z_erfcf, erfc, mpfr_erfc, 1, 1, f1, 1) F (_ZGVnN2v_erfc, Z_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) F (_ZGVnN4v_log10f, Z_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (_ZGVnN2v_log10, Z_log10, log10l, mpfr_log10, 1, 0, d1, 1) +F (_ZGVnN4v_log1pf, Z_log1pf, log1p, mpfr_log1p, 1, 1, f1, 1) #endif #endif 
#endif diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index fa4ba4c..d8baad7 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -19,6 +19,7 @@ static float v_atan2f(float x, float y) { return __v_atan2f(argf(x), argf(y))[0] static float v_erff(float x) { return __v_erff(argf(x))[0]; } static float v_erfcf(float x) { return __v_erfcf(argf(x))[0]; } static float v_log10f(float x) { return __v_log10f(argf(x))[0]; } +static float v_log1pf(float x) { return __v_log1pf(argf(x))[0]; } static double v_atan(double x) { return __v_atan(argd(x))[0]; } static double v_atan2(double x, double y) { return __v_atan2(argd(x), argd(y))[0]; } static double v_erf(double x) { return __v_erf(argd(x))[0]; } @@ -30,6 +31,7 @@ static float vn_atan2f(float x, float y) { return __vn_atan2f(argf(x), argf(y))[ static float vn_erff(float x) { return __vn_erff(argf(x))[0]; } static float vn_erfcf(float x) { return __vn_erfcf(argf(x))[0]; } static float vn_log10f(float x) { return __vn_log10f(argf(x))[0]; } +static float vn_log1pf(float x) { return __vn_log1pf(argf(x))[0]; } static double vn_atan(double x) { return __vn_atan(argd(x))[0]; } static double vn_atan2(double x, double y) { return __vn_atan2(argd(x), argd(y))[0]; } static double vn_erf(double x) { return __vn_erf(argd(x))[0]; } @@ -41,6 +43,7 @@ static float Z_atan2f(float x, float y) { return _ZGVnN4vv_atan2f(argf(x), argf( static float Z_erff(float x) { return _ZGVnN4v_erff(argf(x))[0]; } static float Z_erfcf(float x) { return _ZGVnN4v_erfcf(argf(x))[0]; } static float Z_log10f(float x) { return _ZGVnN4v_log10f(argf(x))[0]; } +static float Z_log1pf(float x) { return _ZGVnN4v_log1pf(argf(x))[0]; } static double Z_atan(double x) { return _ZGVnN2v_atan(argd(x))[0]; } static double Z_atan2(double x, double y) { return _ZGVnN2vv_atan2(argd(x), argd(y))[0]; } static double Z_erf(double x) { return _ZGVnN2v_erf(argd(x))[0]; } diff --git a/pl/math/v_log1pf_2u1.c b/pl/math/v_log1pf_2u1.c new file mode 100644 index 0000000..9e81ff4 --- /dev/null +++ b/pl/math/v_log1pf_2u1.c @@ -0,0 +1,136 @@ +/* + * Single-precision vector log(1+x) function. + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#if V_SUPPORTED + +#define AbsMask 0x7fffffff +#define TinyBound 0x340 /* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */ +#define MinusOne 0xbf800000 +#define Ln2 (0x1.62e43p-1f) +#define Four 0x40800000 +#define ThreeQuarters v_u32 (0x3f400000) + +#define C(i) v_f32 (__log1pf_data.coeffs[i]) + +static inline v_f32_t +eval_poly (v_f32_t m) +{ +#ifdef V_LOG1PF_1U3 + + /* Approximate log(1+m) on [-0.25, 0.5] using Horner scheme. */ + v_f32_t p = v_fma_f32 (C (8), m, C (7)); + p = v_fma_f32 (p, m, C (6)); + p = v_fma_f32 (p, m, C (5)); + p = v_fma_f32 (p, m, C (4)); + p = v_fma_f32 (p, m, C (3)); + p = v_fma_f32 (p, m, C (2)); + p = v_fma_f32 (p, m, C (1)); + p = v_fma_f32 (p, m, C (0)); + return v_fma_f32 (m, m * p, m); + +#elif defined(V_LOG1PF_2U5) + + /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme. 
*/
+  v_f32_t p_12 = v_fma_f32 (m, C (1), C (0));
+  v_f32_t p_34 = v_fma_f32 (m, C (3), C (2));
+  v_f32_t p_56 = v_fma_f32 (m, C (5), C (4));
+  v_f32_t p_78 = v_fma_f32 (m, C (7), C (6));
+
+  v_f32_t m2 = m * m;
+  v_f32_t p_02 = v_fma_f32 (m2, p_12, m);
+  v_f32_t p_36 = v_fma_f32 (m2, p_56, p_34);
+  v_f32_t p_79 = v_fma_f32 (m2, C (8), p_78);
+
+  v_f32_t m4 = m2 * m2;
+  v_f32_t p_06 = v_fma_f32 (m4, p_36, p_02);
+
+  v_f32_t m8 = m4 * m4;
+  return v_fma_f32 (m8, p_79, p_06);
+
+#else
+#error No precision specified for v_log1pf
+#endif
+}
+
+static inline float
+handle_special (float x)
+{
+  uint32_t ix = asuint (x);
+  if (ix == 0xff800000 || ix > 0xbf800000)
+    {
+      /* x == -Inf => log1pf(x) = NaN.
+	 x  < -1.0 => log1pf(x) = NaN.  */
+      return NAN;
+    }
+  if (ix == 0xbf800000)
+    {
+      /* x == -1.0 => log1pf(x) = -Inf.  */
+      return -INFINITY;
+    }
+  uint32_t ia = ix & AbsMask;
+  if (ia >= 0x7f800000)
+    {
+      /* x == +/-NaN => log1pf(x) = NaN, needs to be propagated.  */
+      return asfloat (ia);
+    }
+  /* |x| < TinyBound => log1p(x) = x.  */
+  return x;
+}
+
+/* Vector log1pf approximation using polynomial on reduced interval. Accuracy
+   is the same as for the scalar algorithm, i.e. worst-case error when using
+   Estrin is roughly 2.02 ULP:
+   log1pf(0x1.21e13ap-2) got 0x1.fe8028p-3 want 0x1.fe802cp-3.  */
+VPCS_ATTR v_f32_t V_NAME (log1pf) (v_f32_t x)
+{
+  v_u32_t ix = v_as_u32_f32 (x);
+  v_u32_t ia12 = (ix >> 20) & v_u32 (0x7f8);
+  v_u32_t special_cases
+    = v_cond_u32 (ia12 - v_u32 (TinyBound) >= (0x7f8 - TinyBound))
+      | v_cond_u32 (ix >= MinusOne);
+
+  /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
+     is in [-0.25, 0.5]):
+       log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
+
+     We approximate log1p(m) with a polynomial, then scale by
+     k*log(2). Instead of doing this directly, we use an intermediate
+     scale factor s = 4*2^(-k) to ensure the scale is representable
+     as a normalised fp32 number.  */
+
+  v_f32_t m = x + v_f32 (1.0f);
+
+  /* Choose k to scale x to the range [-1/4, 1/2].  */
+  v_s32_t k = (v_as_s32_f32 (m) - ThreeQuarters) & v_u32 (0xff800000);
+
+  /* Scale x by exponent manipulation.  */
+  v_f32_t m_scale = v_as_f32_u32 (ix - v_as_u32_s32 (k));
+
+  /* Scale up to ensure that the scale factor is representable as normalised
+     fp32 number, and scale m down accordingly.  */
+  v_f32_t s = v_as_f32_u32 (v_u32 (Four) - k);
+  m_scale = m_scale + v_fma_f32 (v_f32 (0.25f), s, v_f32 (-1.0f));
+
+  /* Evaluate polynomial on the reduced interval.  */
+  v_f32_t p = eval_poly (m_scale);
+
+  /* The scale factor to be applied back at the end - by multiplying float(k)
+     by 2^-23 we get the unbiased exponent of k.  */
+  v_f32_t scale_back = v_to_f32_s32 (k) * v_f32 (0x1p-23f);
+
+  /* Apply the scaling back.
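
     (A note on the special_cases mask computed at the top of this
     function: the single unsigned compare
       ia12 - TinyBound >= 0x7f8 - TinyBound
     relies on wraparound, so it fires both when ia12 < TinyBound and
     when ia12 >= 0x7f8. An equivalent, more explicit form, shown for
     illustration only, would be:

       v_u32_t special_cases = v_cond_u32 (ia12 < v_u32 (TinyBound))
			       | v_cond_u32 (ia12 >= v_u32 (0x7f8))
			       | v_cond_u32 (ix >= MinusOne);)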
*/ + v_f32_t y = v_fma_f32 (scale_back, v_f32 (Ln2), p); + + if (unlikely (v_any_u32 (special_cases))) + { + return v_call_f32 (handle_special, x, y, special_cases); + } + return y; +} +VPCS_ALIAS + +#endif diff --git a/pl/math/v_math.h b/pl/math/v_math.h index ddc5dab..4de3609 100644 --- a/pl/math/v_math.h +++ b/pl/math/v_math.h @@ -231,6 +231,16 @@ v_as_u32_f32 (v_f32_t x) union { v_f32_t f; v_u32_t u; } r = {x}; return r.u; } +static inline v_s32_t +v_as_s32_f32 (v_f32_t x) +{ + union + { + v_f32_t f; + v_s32_t u; + } r = {x}; + return r.u; +} static inline v_f32_t v_as_f32_u32 (v_u32_t x) { @@ -561,6 +571,16 @@ v_as_u32_f32 (v_f32_t x) union { v_f32_t f; v_u32_t u; } r = {x}; return r.u; } +static inline v_s32_t +v_as_s32_f32 (v_f32_t x) +{ + union + { + v_f32_t f; + v_s32_t u; + } r = {x}; + return r.u; +} static inline v_f32_t v_as_f32_u32 (v_u32_t x) { diff --git a/pl/math/vn_log1pf_2u1.c b/pl/math/vn_log1pf_2u1.c new file mode 100644 index 0000000..429d167 --- /dev/null +++ b/pl/math/vn_log1pf_2u1.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_log1pf. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_log1pf, _ZGVnN4v_log1pf) +#include "v_log1pf_2u1.c" +#endif -- cgit v1.2.3 From 83a43cd7afcbdca7e192704121acfad6fa24cb67 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Fri, 15 Jul 2022 08:56:20 +0100 Subject: pl/math: Add vector/Neon asinhf The new routine uses vector log1pf, and is accurate to 2.7 ulp. --- pl/math/include/mathlib.h | 4 ++++ pl/math/s_asinhf_2u7.c | 6 +++++ pl/math/test/mathbench_funcs.h | 5 +++++ pl/math/test/runulp.sh | 16 +++++++++++++ pl/math/test/ulp_funcs.h | 4 ++++ pl/math/test/ulp_wrappers.h | 3 +++ pl/math/v_asinhf_2u7.c | 51 ++++++++++++++++++++++++++++++++++++++++++ pl/math/v_math.h | 10 +++++++++ pl/math/vn_asinhf_2u7.c | 12 ++++++++++ 9 files changed, 111 insertions(+) create mode 100644 pl/math/s_asinhf_2u7.c create mode 100644 pl/math/v_asinhf_2u7.c create mode 100644 pl/math/vn_asinhf_2u7.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index e688a32..c6620c0 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -21,6 +21,7 @@ double atan2 (double, double); double log10 (double); double log1p (double); +float __s_asinhf (float); float __s_atanf (float); float __s_atan2f (float, float); float __s_erfcf (float); @@ -46,6 +47,7 @@ typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; #endif /* Vector functions following the base PCS. */ +__f32x4_t __v_asinhf (__f32x4_t); __f32x4_t __v_atanf (__f32x4_t); __f64x2_t __v_atan (__f64x2_t); __f32x4_t __v_atan2f (__f32x4_t, __f32x4_t); @@ -62,6 +64,7 @@ __f32x4_t __v_log1pf (__f32x4_t); #define __vpcs __attribute__((__aarch64_vector_pcs__)) /* Vector functions following the vector PCS. */ +__vpcs __f32x4_t __vn_asinhf (__f32x4_t); __vpcs __f32x4_t __vn_atanf (__f32x4_t); __vpcs __f64x2_t __vn_atan (__f64x2_t); __vpcs __f32x4_t __vn_atan2f (__f32x4_t, __f32x4_t); @@ -75,6 +78,7 @@ __vpcs __f64x2_t __vn_log10 (__f64x2_t); __vpcs __f32x4_t __vn_log1pf (__f32x4_t); /* Vector functions following the vector PCS using ABI names. 
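   The names are mangled per the AArch64 vector function ABI: in
   _ZGVnN4v_log1pf, 'n' selects AdvSIMD, 'N' means unmasked, '4' is
   the lane count and 'v' marks a vector argument; the SVE routines
   added later in this series use 's' (SVE), 'M' (masked) and 'x'
   (scalable length) in the same scheme. An illustrative direct call,
   not taken from this patch:

     __f32x4_t x = {0.5f, 1.0f, 2.0f, 4.0f};
     __f32x4_t y = _ZGVnN4v_log1pf (x);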
*/ +__vpcs __f32x4_t _ZGVnN4v_asinhf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t); __vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t); diff --git a/pl/math/s_asinhf_2u7.c b/pl/math/s_asinhf_2u7.c new file mode 100644 index 0000000..bce86a7 --- /dev/null +++ b/pl/math/s_asinhf_2u7.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_asinhf_2u7.c" diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index aa4a2eb..f681489 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -22,6 +22,7 @@ D (log10, 0.01, 11.1) D (log1p, -0.9, 10.0) #if WANT_VMATH +F (__s_asinhf, -10.0, 10.0) F (__s_atanf, -10.0, 10.0) D (__s_atan, -10.0, 10.0) {"__s_atan2f", 'f', 0, -10.0, 10.0, {.f = __s_atan2f_wrap}}, @@ -34,6 +35,7 @@ F (__s_log10f, 0.01, 11.1) D (__s_log10, 0.01, 11.1) F (__s_log1pf, -0.9, 10.0) #if __aarch64__ +VF (__v_asinhf, -10.0, 10.0) VF (__v_atanf, -10.0, 10.0) VD (__v_atan, -10.0, 10.0) {"__v_atan2f", 'f', 'v', -10.0, 10.0, {.vf = __v_atan2f_wrap}}, @@ -46,6 +48,9 @@ VD (__v_log10, 0.01, 11.1) VF (__v_log10f, 0.01, 11.1) VF (__v_log1pf, -0.9, 10.0) #ifdef __vpcs +VNF (__vn_asinhf, -10.0, 10.0) +VNF (_ZGVnN4v_asinhf, -10.0, 10.0) + VNF (__vn_atanf, -10.0, 10.0) VNF (_ZGVnN4v_atanf, -10.0, 10.0) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 9234f23..900d7d2 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -223,6 +223,17 @@ range_log1pf=' -1.0 inf 1000 ' +range_asinhf=' + 0 0x1p-12 40000 + 0x1p-12 1.0 40000 + 1.0 0x1p11 40000 + 0x1p11 inf 40000 + 0 -0x1p-12 20000 + -0x1p-12 -1.0 20000 + -1.0 -0x1p11 20000 + -0x1p11 -inf 20000 +' + # error limits L_erfc=3.7 L_erfcf=1.0 @@ -235,6 +246,7 @@ L_atan=3.0 L_atan2f=3.0 L_atanf=3.0 L_log1pf=2.0 +L_asinhf=2.2 while read G F R do @@ -298,6 +310,10 @@ log1pf __s_log1pf $runs log1pf __v_log1pf $runv log1pf __vn_log1pf $runvn log1pf _ZGVnN4v_log1pf $runvn +asinhf __s_asinhf $runs +asinhf __v_asinhf $runv +asinhf __vn_asinhf $runvn +asinhf _ZGVnN4v_asinhf $runvn EOF [ 0 -eq $FAIL ] || { diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index bd7026b..3a4e6b9 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -16,6 +16,7 @@ D1 (erfc) D1 (log10) D1 (log1p) #if WANT_VMATH +F (__s_asinhf, __s_asinhf, asinh, mpfr_asinh, 1, 1, f1, 0) F (__s_atanf, __s_atanf, atan, mpfr_atan, 1, 1, f1, 0) F (__s_atan, __s_atan, atanl, mpfr_atan, 1, 0, d1, 0) F (__s_atan2f, __s_atan2f, atan2, mpfr_atan2, 2, 1, f2, 0) @@ -28,6 +29,7 @@ F (__s_log10f, __s_log10f, log10, mpfr_log10, 1, 1, f1, 0) F (__s_log10, __s_log10, log10l, mpfr_log10, 1, 0, d1, 0) F (__s_log1pf, __s_log1pf, log1p, mpfr_log1p, 1, 1, f1, 0) #if __aarch64__ +F (__v_asinhf, v_asinhf, asinh, mpfr_asinh, 1, 1, f1, 1) F (__v_atanf, v_atanf, atan, mpfr_atan, 1, 1, f1, 1) F (__v_atan, v_atan, atanl, mpfr_atan, 1, 0, d1, 1) F (__v_atan2f, v_atan2f, atan2, mpfr_atan2, 2, 1, f2, 1) @@ -40,6 +42,7 @@ F (__v_log10f, v_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (__v_log10, v_log10, log10l, mpfr_log10, 1, 0, d1, 1) F (__v_log1pf, v_log1pf, log1p, mpfr_log1p, 1, 1, f1, 1) #ifdef __vpcs +F (__vn_asinhf, vn_asinhf, asinh, mpfr_asinh, 1, 1, f1, 1) F (__vn_atanf, vn_atanf, atan, mpfr_atan, 1, 1, f1, 1) F (__vn_atan, vn_atan, atanl, mpfr_atan, 1, 0, d1, 1) F (__vn_atan2f, vn_atan2f, atan2, mpfr_atan2, 2, 1, f2, 1) @@ -51,6 +54,7 @@ F (__vn_erfc, 
vn_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) F (__vn_log10f, vn_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (__vn_log10, vn_log10, log10l, mpfr_log10, 1, 0, d1, 1) F (__vn_log1pf, vn_log1pf, log1p, mpfr_log1p, 1, 1, f1, 1) +F (_ZGVnN4v_asinhf, Z_asinhf, asinh, mpfr_asinh, 1, 1, f1, 1) F (_ZGVnN4v_atanf, Z_atanf, atan, mpfr_atan, 1, 1, f1, 1) F (_ZGVnN2v_atan, Z_atan, atanl, mpfr_atan, 1, 0, d1, 1) F (_ZGVnN4vv_atan2f, Z_atan2f, atan2, mpfr_atan2, 2, 1, f2, 1) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index d8baad7..06f94e5 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -14,6 +14,7 @@ static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y, /* Wrappers for vector functions. */ #if __aarch64__ && WANT_VMATH +static float v_asinhf(float x) { return __v_asinhf(argf(x))[0]; } static float v_atanf(float x) { return __v_atanf(argf(x))[0]; } static float v_atan2f(float x, float y) { return __v_atan2f(argf(x), argf(y))[0]; } static float v_erff(float x) { return __v_erff(argf(x))[0]; } @@ -26,6 +27,7 @@ static double v_erf(double x) { return __v_erf(argd(x))[0]; } static double v_erfc(double x) { return __v_erfc(argd(x))[0]; } static double v_log10(double x) { return __v_log10(argd(x))[0]; } #ifdef __vpcs +static float vn_asinhf(float x) { return __vn_asinhf(argf(x))[0]; } static float vn_atanf(float x) { return __vn_atanf(argf(x))[0]; } static float vn_atan2f(float x, float y) { return __vn_atan2f(argf(x), argf(y))[0]; } static float vn_erff(float x) { return __vn_erff(argf(x))[0]; } @@ -38,6 +40,7 @@ static double vn_erf(double x) { return __vn_erf(argd(x))[0]; } static double vn_erfc(double x) { return __vn_erfc(argd(x))[0]; } static double vn_log10(double x) { return __vn_log10(argd(x))[0]; } +static float Z_asinhf(float x) { return _ZGVnN4v_asinhf(argf(x))[0]; } static float Z_atanf(float x) { return _ZGVnN4v_atanf(argf(x))[0]; } static float Z_atan2f(float x, float y) { return _ZGVnN4vv_atan2f(argf(x), argf(y))[0]; } static float Z_erff(float x) { return _ZGVnN4v_erff(argf(x))[0]; } diff --git a/pl/math/v_asinhf_2u7.c b/pl/math/v_asinhf_2u7.c new file mode 100644 index 0000000..39f7989 --- /dev/null +++ b/pl/math/v_asinhf_2u7.c @@ -0,0 +1,51 @@ +/* + * Single-precision vector asinh(x) function. + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "include/mathlib.h" + +#if V_SUPPORTED + +#define SignMask v_u32 (0x80000000) +#define One v_f32 (1.0f) +#define Ln2 v_f32 (0x1.62e43p-1f) +#define SpecialBound v_u32 (0x5f800000) /* asuint(0x1p64). */ + +static inline v_f32_t +handle_special (v_f32_t ax) +{ + return V_NAME (log1pf) (ax) + Ln2; +} + +/* Single-precision implementation of vector asinh(x), using vector log1p. + Worst-case error is 2.66 ULP, at roughly +/-0.25: + __v_asinhf(0x1.01b04p-2) got 0x1.fe163ep-3 want 0x1.fe1638p-3. */ +VPCS_ATTR v_f32_t V_NAME (asinhf) (v_f32_t x) +{ + v_f32_t ax = v_abs_f32 (x); + v_u32_t special = v_cond_u32 (v_as_u32_f32 (ax) >= SpecialBound); + v_u32_t sign = v_as_u32_f32 (x) & SignMask; + + /* asinh(x) = log(x + sqrt(x * x + 1)). + For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */ + v_f32_t d = One + v_sqrt_f32 (ax * ax + One); + v_f32_t y = V_NAME (log1pf) (ax + ax * ax / d); + + if (unlikely (v_any_u32 (special))) + { + /* If |x| is too large, we cannot square it at low cost without overflow. 
+ At very large x, asinh(x) ~= log(2x) and log(x) ~= log1p(x), so we + calculate asinh(x) as log1p(x) + log(2). */ + v_f32_t y_large = V_NAME (log1pf) (ax) + Ln2; + return v_as_f32_u32 (sign + | v_as_u32_f32 (v_sel_f32 (special, y_large, y))); + } + + return v_as_f32_u32 (sign | v_as_u32_f32 (y)); +} +VPCS_ALIAS + +#endif diff --git a/pl/math/v_math.h b/pl/math/v_math.h index 4de3609..ccdfd75 100644 --- a/pl/math/v_math.h +++ b/pl/math/v_math.h @@ -213,6 +213,11 @@ v_sel_u32 (v_u32_t p, v_u32_t x, v_u32_t y) { return p ? x : y; } +static inline v_f32_t +v_sqrt_f32 (v_f32_t x) +{ + return __builtin_sqrtf (x); +} /* convert to type1 from type2. */ static inline v_f32_t v_to_f32_s32 (v_s32_t x) @@ -553,6 +558,11 @@ v_sel_u32 (v_u32_t p, v_u32_t x, v_u32_t y) { return vbslq_u32 (p, x, y); } +static inline v_f32_t +v_sqrt_f32 (v_f32_t x) +{ + return vsqrtq_f32 (x); +} /* convert to type1 from type2. */ static inline v_f32_t v_to_f32_s32 (v_s32_t x) diff --git a/pl/math/vn_asinhf_2u7.c b/pl/math/vn_asinhf_2u7.c new file mode 100644 index 0000000..c42e37e --- /dev/null +++ b/pl/math/vn_asinhf_2u7.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_asinhf. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_asinhf, _ZGVnN4v_asinhf) +#include "v_asinhf_2u7.c" +#endif -- cgit v1.2.3 From 4d56ab71ea3a7a69eb3f1b1dabf3583e6cd84259 Mon Sep 17 00:00:00 2001 From: Andrew Walbran Date: Mon, 18 Jul 2022 13:14:10 +0000 Subject: Add library for core memory routines. The Rust compiler may emit calls to memcmp, memcpy, memmove and memset. Usually these come from libc, but in bare-metal binaries they must be provided somewhere. For now we only use bare-metal rust on aarch64, so the Arm optimized-routines implementation seems like the best choice. Bug: 223166344 Test: atest vmbase_example.integration_test Change-Id: Id2439208160411dcde12be76fbaa22c30c24b81d --- Android.bp | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/Android.bp b/Android.bp index 8fe656e..ea238fe 100755 --- a/Android.bp +++ b/Android.bp @@ -139,6 +139,33 @@ cc_library_static { }, } +// Memory intrinsics for bare-metal Rust binaries. +cc_library_static { + name: "libarm-optimized-routines-mem", + nocrt: true, + system_shared_libs: [], + stl: "none", + sanitize: { + hwaddress: false, + }, + arch: { + arm64: { + srcs: [ + "string/aarch64/memcmp.S", + "string/aarch64/memcpy.S", + "string/aarch64/memset.S", + ], + asflags: [ + "-D__memcmp_aarch64=memcmp", + "-D__memcpy_aarch64=memcpy", + "-D__memmove_aarch64=memmove", + "-D__memset_aarch64=memset", + ], + }, + }, + visibility: ["//packages/modules/Virtualization:__subpackages__"], +} + // adb shell "/data/nativetest64/mathtest/mathtest /data/nativetest64/mathtest/test/testcases/directed/*" // adb shell "/data/nativetest/mathtest/mathtest /data/nativetest/mathtest/test/testcases/directed/*" cc_test { -- cgit v1.2.3 From a790e502efe76dbad55a33655f6e3c9066e11325 Mon Sep 17 00:00:00 2001 From: Pierre Blanchard Date: Thu, 21 Jul 2022 13:50:32 +0100 Subject: Update config.mk example to define WANT_SVE_MATH. This is required when running a `make check`, in order to avoid running ulp tests on SVE routines when SVE is disabled. Keeping the definition of cflags for SVE in the config file to allow user control over `-march`. 
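With this change, enabling the SVE routines and their ulp tests is a
single switch. A minimal usage sketch (assuming GNU make's usual
command-line override semantics):

  # in config.mk:
  WANT_SVE_MATH = 1

  # or as a one-off override, without editing the file:
  make check WANT_SVE_MATH=1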
--- config.mk.dist | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/config.mk.dist b/config.mk.dist index b7fc243..b29a9b0 100644 --- a/config.mk.dist +++ b/config.mk.dist @@ -62,8 +62,12 @@ math-cflags += -ffp-contract=fast -fno-math-errno # Disable vector math code #math-cflags += -DWANT_VMATH=0 -# Enable SVE vector code -#math-cflags += -march=armv8.2-a+sve -DWANT_SVE_MATH=1 +# Disable/enable SVE vector math code and tests +WANT_SVE_MATH = 0 +ifeq ($(WANT_SVE_MATH), 1) + math-cflags += -march=armv8.2-a+sve +endif +math-cflags += -DWANT_SVE_MATH=$(WANT_SVE_MATH) # Disable fenv checks #math-ulpflags = -q -f -- cgit v1.2.3 From a40716ffedba4fa94c1a0c3e88857740d86a00b0 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Thu, 21 Jul 2022 13:55:41 +0100 Subject: pl/math: Add Vector/SVE cosf. An implementation based on SVE trigonometric instructions. It relies on the same range reduction as Vector/Neon cosf, with a slight modification of the shift. The maximum measured error is 2.06ULPs. --- pl/math/Dir.mk | 2 +- pl/math/include/mathlib.h | 8 +++ pl/math/sv_cosf_2u1.c | 75 +++++++++++++++++++ pl/math/sv_math.h | 160 +++++++++++++++++++++++++++++++++++++++++ pl/math/test/mathbench_funcs.h | 5 ++ pl/math/test/runulp.sh | 19 +++++ pl/math/test/ulp_funcs.h | 4 ++ pl/math/test/ulp_wrappers.h | 19 ++++- 8 files changed, 288 insertions(+), 4 deletions(-) create mode 100644 pl/math/sv_cosf_2u1.c create mode 100644 pl/math/sv_math.h diff --git a/pl/math/Dir.mk b/pl/math/Dir.mk index 4a96dc6..7909ea0 100644 --- a/pl/math/Dir.mk +++ b/pl/math/Dir.mk @@ -128,7 +128,7 @@ check-pl/math-rtest: $(math-host-tools) $(math-tools) cat $(math-rtests) | build/pl/bin/rtest | $(EMULATOR) build/pl/bin/mathtest $(math-testflags) check-pl/math-ulp: $(math-tools) - ULPFLAGS="$(math-ulpflags)" build/pl/bin/runulp.sh $(EMULATOR) + WANT_SVE_MATH=$(WANT_SVE_MATH) ULPFLAGS="$(math-ulpflags)" build/pl/bin/runulp.sh $(EMULATOR) check-pl/math: check-pl/math-test check-pl/math-rtest check-pl/math-ulp diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index c6620c0..5ed266a 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -92,6 +92,14 @@ __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t); #endif + +#if WANT_SVE_MATH +#include +svfloat32_t __sv_cosf_x (svfloat32_t, svbool_t); +/* SVE ABI names. */ +svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t); +#endif + #endif #endif diff --git a/pl/math/sv_cosf_2u1.c b/pl/math/sv_cosf_2u1.c new file mode 100644 index 0000000..70057ea --- /dev/null +++ b/pl/math/sv_cosf_2u1.c @@ -0,0 +1,75 @@ +/* + * Single-precision SVE cos(x) function. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#if SV_SUPPORTED + +#define NegPio2_1 (sv_f32 (-0x1.921fb6p+0f)) +#define NegPio2_2 (sv_f32 (0x1.777a5cp-25f)) +#define NegPio2_3 (sv_f32 (0x1.ee59dap-50f)) +#define RangeVal (sv_f32 (0x1p20f)) +#define InvPio2 (sv_f32 (0x1.45f306p-1f)) +/* Original shift used in Neon cosf, + plus a contribution to set the bit #0 of q + as expected by trigonometric instructions. */ +#define Shift (sv_f32 (0x1.800002p+23f)) +#define AbsMask (0x7fffffff) + +static NOINLINE sv_f32_t +__sv_cosf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp) +{ + return sv_call_f32 (cosf, x, y, cmp); +} + +/* A fast SVE implementation of cosf based on trigonometric + instructions (FTMAD, FTSSEL, FTSMUL). + Maximum measured error: 2.06 ULPs. 
+ __sv_cosf(0x1.dea2f2p+19) got 0x1.fffe7ap-6 + want 0x1.fffe76p-6. */ +sv_f32_t +__sv_cosf_x (sv_f32_t x, const svbool_t pg) +{ + sv_f32_t n, r, r2, y; + svbool_t cmp; + + r = sv_as_f32_u32 (svand_n_u32_x (pg, sv_as_u32_f32 (x), AbsMask)); + cmp = svcmpge_u32 (pg, sv_as_u32_f32 (r), sv_as_u32_f32 (RangeVal)); + + /* n = rint(|x|/(pi/2)). */ + sv_f32_t q = sv_fma_f32_x (pg, InvPio2, r, Shift); + n = svsub_f32_x (pg, q, Shift); + + /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). */ + r = sv_fma_f32_x (pg, NegPio2_1, n, r); + r = sv_fma_f32_x (pg, NegPio2_2, n, r); + r = sv_fma_f32_x (pg, NegPio2_3, n, r); + + /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */ + sv_f32_t f = svtssel_f32 (r, sv_as_u32_f32 (q)); + + /* cos(r) poly approx. */ + r2 = svtsmul_f32 (r, sv_as_u32_f32 (q)); + y = sv_f32 (0.0f); + y = svtmad_f32 (y, r2, 4); + y = svtmad_f32 (y, r2, 3); + y = svtmad_f32 (y, r2, 2); + y = svtmad_f32 (y, r2, 1); + y = svtmad_f32 (y, r2, 0); + + /* Apply factor. */ + y = svmul_f32_x (pg, f, y); + + /* No need to pass pg to specialcase here since cmp is a strict subset, + guaranteed by the cmpge above. */ + if (unlikely (svptest_any (pg, cmp))) + return __sv_cosf_specialcase (x, y, cmp); + return y; +} + +strong_alias (__sv_cosf_x, _ZGVsMxv_cosf) + +#endif diff --git a/pl/math/sv_math.h b/pl/math/sv_math.h new file mode 100644 index 0000000..14919be --- /dev/null +++ b/pl/math/sv_math.h @@ -0,0 +1,160 @@ +/* + * Wrapper functions for SVE ACLE. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef SV_MATH_H +#define SV_MATH_H + +#ifndef WANT_VMATH +/* Enable the build of vector math code. */ +#define WANT_VMATH 1 +#endif +#if WANT_VMATH + +#if WANT_SVE_MATH +#define SV_SUPPORTED 1 + +#include +#include + +#include "math_config.h" + +typedef float f32_t; +typedef uint32_t u32_t; +typedef int32_t s32_t; +typedef double f64_t; +typedef uint64_t u64_t; +typedef int64_t s64_t; + +typedef svfloat64_t sv_f64_t; +typedef svuint64_t sv_u64_t; +typedef svint64_t sv_s64_t; + +typedef svfloat32_t sv_f32_t; +typedef svuint32_t sv_u32_t; +typedef svint32_t sv_s32_t; + +/* Double precision. */ +static inline sv_s64_t +sv_s64 (s64_t x) +{ + return svdup_n_s64 (x); +} + +static inline sv_u64_t +sv_u64 (u64_t x) +{ + return svdup_n_u64 (x); +} + +static inline sv_f64_t +sv_f64 (f64_t x) +{ + return svdup_n_f64 (x); +} + +static inline sv_f64_t +sv_fma_f64_x (svbool_t pg, sv_f64_t x, sv_f64_t y, sv_f64_t z) +{ + return svmla_f64_x (pg, z, x, y); +} + +/* res = z + x * y with x scalar. */ +static inline sv_f64_t +sv_fma_n_f64_x (svbool_t pg, f64_t x, sv_f64_t y, sv_f64_t z) +{ + return svmla_n_f64_x (pg, z, y, x); +} + +static inline sv_u64_t +sv_as_u64_f64 (sv_f64_t x) +{ + return svreinterpret_u64_f64 (x); +} + +static inline sv_f64_t +sv_as_f64_u64 (sv_u64_t x) +{ + return svreinterpret_f64_u64 (x); +} + +static inline sv_f64_t +sv_call_f64 (f64_t (*f) (f64_t), sv_f64_t x, sv_f64_t y, svbool_t cmp) +{ + svbool_t p = svpfirst (cmp, svpfalse ()); + while (svptest_any (cmp, p)) + { + f64_t elem = svclastb_n_f64 (p, 0, x); + elem = (*f) (elem); + sv_f64_t y2 = svdup_n_f64 (elem); + y = svsel_f64 (p, y2, y); + p = svpnext_b64 (cmp, p); + } + return y; +} + +/* Single precision. 
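   (sv_call_f64 above is the scalar fallback shared by the SVE
   routines: svpfirst selects the first active lane of cmp, svclastb
   extracts that lane's element, the scalar callback is applied, and
   svsel merges the result back in before svpnext advances to the
   next active lane. Per lane, the effect is simply

     y[i] = cmp[i] ? f (x[i]) : y[i];

   sv_call_f32 below is the single-precision counterpart.)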
*/ +static inline sv_s32_t +sv_s32 (s32_t x) +{ + return svdup_n_s32 (x); +} + +static inline sv_u32_t +sv_u32 (u32_t x) +{ + return svdup_n_u32 (x); +} + +static inline sv_f32_t +sv_f32 (f32_t x) +{ + return svdup_n_f32 (x); +} + +static inline sv_f32_t +sv_fma_f32_x (svbool_t pg, sv_f32_t x, sv_f32_t y, sv_f32_t z) +{ + return svmla_f32_x (pg, z, x, y); +} + +/* res = z + x * y with x scalar. */ +static inline sv_f32_t +sv_fma_n_f32_x (svbool_t pg, f32_t x, sv_f32_t y, sv_f32_t z) +{ + return svmla_n_f32_x (pg, z, y, x); +} + +static inline sv_u32_t +sv_as_u32_f32 (sv_f32_t x) +{ + return svreinterpret_u32_f32 (x); +} + +static inline sv_f32_t +sv_as_f32_u32 (sv_u32_t x) +{ + return svreinterpret_f32_u32 (x); +} + +static inline sv_f32_t +sv_call_f32 (f32_t (*f) (f32_t), sv_f32_t x, sv_f32_t y, svbool_t cmp) +{ + svbool_t p = svpfirst (cmp, svpfalse ()); + while (svptest_any (cmp, p)) + { + f32_t elem = svclastb_n_f32 (p, 0, x); + elem = (*f) (elem); + sv_f32_t y2 = svdup_n_f32 (elem); + y = svsel_f32 (p, y2, y); + p = svpnext_b32 (cmp, p); + } + return y; +} + +#endif +#endif +#endif diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index f681489..5b6a806 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -8,6 +8,7 @@ F (asinhf, -10.0, 10.0) F (atanf, -10.0, 10.0) {"atan2f", 'f', 0, -10.0, 10.0, {.f = atan2f_wrap}}, +F (cosf, -3.1, 3.1) F (erfcf, -4.0, 10.0) F (erff, -4.0, 4.0) F (log10f, 0.01, 11.1) @@ -85,5 +86,9 @@ VNF (__vn_log1pf, -0.9, 10.0) VNF (_ZGVnN4v_log1pf, -0.9, 10.0) #endif #endif +#if WANT_SVE_MATH +SVF (__sv_cosf_x, -3.1, 3.1) +SVF (_ZGVsMxv_cosf, -3.1, 3.1) +#endif #endif // clang-format on diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 900d7d2..8af5d1c 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -15,6 +15,9 @@ rmodes='n' flags="${ULPFLAGS:--q}" emu="$@" +# Enable SVE testing +WANT_SVE_MATH=${WANT_SVE_MATH:-0} + FAIL=0 PASS=0 @@ -132,6 +135,10 @@ runv= check __v_log10f 1 && runv=1 runvn= check __vn_log10f 1 && runvn=1 +runsv= +if [ $WANT_SVE_MATH -eq 1 ]; then +check __sv_cosf 0 && runsv=1 +fi range_erfc=' 0 0xffff0000 10000 @@ -234,6 +241,11 @@ range_asinhf=' -0x1p11 -inf 20000 ' +range_sve_cosf=' + 0 0xffff0000 10000 + 0x1p-4 0x1p4 500000 +' + # error limits L_erfc=3.7 L_erfcf=1.0 @@ -248,6 +260,8 @@ L_atanf=3.0 L_log1pf=2.0 L_asinhf=2.2 +L_sve_cosf=1.6 + while read G F R do [ "$R" = 1 ] || continue @@ -314,6 +328,11 @@ asinhf __s_asinhf $runs asinhf __v_asinhf $runv asinhf __vn_asinhf $runvn asinhf _ZGVnN4v_asinhf $runvn + +if [ $WANT_SVE_MATH -eq 1 ]; then +sve_cosf __sv_cosf $runsv +sve_cosf _ZGVsMxv_cosf $runsv +fi EOF [ 0 -eq $FAIL ] || { diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 3a4e6b9..d92b3b5 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -68,4 +68,8 @@ F (_ZGVnN2v_log10, Z_log10, log10l, mpfr_log10, 1, 0, d1, 1) F (_ZGVnN4v_log1pf, Z_log1pf, log1p, mpfr_log1p, 1, 1, f1, 1) #endif #endif +#if WANT_SVE_MATH +SVF1 (cos) +ZSVF1 (cos) +#endif #endif diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 06f94e5..df25bf1 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -6,10 +6,15 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ - #if USE_MPFR -static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y,x,r); return mpfr_sin(y,x,r); } -static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { 
mpfr_sin(y,x,r); return mpfr_cos(y,x,r); } +static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { + mpfr_cos(y, x, r); + return mpfr_sin(y, x, r); +} +static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { + mpfr_sin(y, x, r); + return mpfr_cos(y, x, r); +} #endif /* Wrappers for vector functions. */ @@ -53,5 +58,13 @@ static double Z_erf(double x) { return _ZGVnN2v_erf(argd(x))[0]; } static double Z_erfc(double x) { return _ZGVnN2v_erfc(argd(x))[0]; } static double Z_log10(double x) { return _ZGVnN2v_log10(argd(x))[0]; } #endif +#if WANT_SVE_MATH +static float sv_cosf(float x) { + return svretf(__sv_cosf_x(svargf(x), svptrue_b32())); +} +static float Z_sv_cosf(float x) { + return svretf(_ZGVsMxv_cosf(svargf(x), svptrue_b32())); +} +#endif #endif // clang-format on -- cgit v1.2.3 From 0b4165a41d49927b57daa53350d9d33b25475fac Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Thu, 21 Jul 2022 13:56:01 +0100 Subject: pl/math: Add Vector/SVE cos. An implementation based on SVE trigonometric instructions. It relies on the same range reduction as Vector/Neon cos, with a slight modification of the shift. The maximum measured error is 2.11ULPs around x = 205.522. --- pl/math/include/mathlib.h | 2 ++ pl/math/sv_cos_2u5.c | 77 ++++++++++++++++++++++++++++++++++++++++++ pl/math/test/mathbench_funcs.h | 4 +++ pl/math/test/runulp.sh | 9 +++++ pl/math/test/ulp_funcs.h | 2 ++ pl/math/test/ulp_wrappers.h | 6 ++++ 6 files changed, 100 insertions(+) create mode 100644 pl/math/sv_cos_2u5.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 5ed266a..6f3b537 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -96,8 +96,10 @@ __vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t); #if WANT_SVE_MATH #include svfloat32_t __sv_cosf_x (svfloat32_t, svbool_t); +svfloat64_t __sv_cos_x (svfloat64_t, svbool_t); /* SVE ABI names. */ svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_cos (svfloat64_t, svbool_t); #endif #endif diff --git a/pl/math/sv_cos_2u5.c b/pl/math/sv_cos_2u5.c new file mode 100644 index 0000000..483c73f --- /dev/null +++ b/pl/math/sv_cos_2u5.c @@ -0,0 +1,77 @@ +/* + * Double-precision SVE cos(x) function. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#if SV_SUPPORTED + +#define InvPio2 (sv_f64 (0x1.45f306dc9c882p-1)) +#define NegPio2_1 (sv_f64 (-0x1.921fb50000000p+0)) +#define NegPio2_2 (sv_f64 (-0x1.110b460000000p-26)) +#define NegPio2_3 (sv_f64 (-0x1.1a62633145c07p-54)) +/* Original shift used in Neon cos, + plus a contribution to set the bit #0 of q + as expected by trigonometric instructions. */ +#define Shift (sv_f64 (0x1.8000000000001p52)) +#define RangeVal (sv_f64 (0x1p23)) +#define AbsMask (0x7fffffffffffffff) + +static NOINLINE sv_f64_t +__sv_cos_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp) +{ + return sv_call_f64 (cos, x, y, cmp); +} + +/* A fast SVE implementation of cos based on trigonometric + instructions (FTMAD, FTSSEL, FTSMUL). + Maximum measured error: 2.108 ULPs. + __sv_cos(0x1.9b0ba158c98f3p+7) got -0x1.fddd4c65c7f07p-3 + want -0x1.fddd4c65c7f05p-3. */ +sv_f64_t +__sv_cos_x (sv_f64_t x, const svbool_t pg) +{ + sv_f64_t n, r, r2, y; + svbool_t cmp; + + r = sv_as_f64_u64 (svand_n_u64_x (pg, sv_as_u64_f64 (x), AbsMask)); + cmp = svcmpge_u64 (pg, sv_as_u64_f64 (r), sv_as_u64_f64 (RangeVal)); + + /* n = rint(|x|/(pi/2)). 
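     The rounding uses the Shift constant rather than an explicit
     rint: adding 0x1.8p52 to a double of magnitude below 2^51 forces
     the value to be rounded to an integer in the low mantissa bits,
     and subtracting it again recovers that integer. A scalar sketch
     of the same idiom (illustrative only):

       double shift = 0x1.8p52;
       double n = (r * inv_pio2 + shift) - shift;   // n = rint(r/(pi/2))

     The extra ulp in Shift here (0x1.8000000000001p52) additionally
     leaves bit #0 of q set, as the FTSMUL/FTSSEL/FTMAD instructions
     expect.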
 */
+  sv_f64_t q = sv_fma_f64_x (pg, InvPio2, r, Shift);
+  n = svsub_f64_x (pg, q, Shift);
+
+  /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4).  */
+  r = sv_fma_f64_x (pg, NegPio2_1, n, r);
+  r = sv_fma_f64_x (pg, NegPio2_2, n, r);
+  r = sv_fma_f64_x (pg, NegPio2_3, n, r);
+
+  /* cos(r) poly approx.  */
+  r2 = svtsmul_f64 (r, sv_as_u64_f64 (q));
+  y = sv_f64 (0.0);
+  y = svtmad_f64 (y, r2, 7);
+  y = svtmad_f64 (y, r2, 6);
+  y = svtmad_f64 (y, r2, 5);
+  y = svtmad_f64 (y, r2, 4);
+  y = svtmad_f64 (y, r2, 3);
+  y = svtmad_f64 (y, r2, 2);
+  y = svtmad_f64 (y, r2, 1);
+  y = svtmad_f64 (y, r2, 0);
+
+  /* Final multiplicative factor: 1.0 or x depending on bit #0 of q.  */
+  sv_f64_t f = svtssel_f64 (r, sv_as_u64_f64 (q));
+  /* Apply factor.  */
+  y = svmul_f64_x (pg, f, y);
+
+  /* No need to pass pg to specialcase here since cmp is a strict subset,
+     guaranteed by the cmpge above.  */
+  if (unlikely (svptest_any (pg, cmp)))
+    return __sv_cos_specialcase (x, y, cmp);
+  return y;
+}
+
+strong_alias (__sv_cos_x, _ZGVsMxv_cos)
+
+#endif
diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h
index 5b6a806..85ec906 100644
--- a/pl/math/test/mathbench_funcs.h
+++ b/pl/math/test/mathbench_funcs.h
@@ -17,6 +17,7 @@ F (log1pf, -0.9, 10.0)
 D (asinh, -10.0, 10.0)
 D (atan, -10.0, 10.0)
 {"atan2", 'd', 0, -10.0, 10.0, {.d = atan2_wrap}},
+D (cos, -3.1, 3.1)
 D (erf, -6,6)
 D (erfc, -6.0, 28.0)
 D (log10, 0.01, 11.1)
@@ -89,6 +90,9 @@ VNF (_ZGVnN4v_log1pf, -0.9, 10.0)
 #if WANT_SVE_MATH
 SVF (__sv_cosf_x, -3.1, 3.1)
 SVF (_ZGVsMxv_cosf, -3.1, 3.1)
+
+SVD (__sv_cos_x, -3.1, 3.1)
+SVD (_ZGVsMxv_cos, -3.1, 3.1)
 #endif
 #endif
 // clang-format on
diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh
index 8af5d1c..d5ae82b 100755
--- a/pl/math/test/runulp.sh
+++ b/pl/math/test/runulp.sh
@@ -138,6 +138,7 @@ check __vn_log10f 1 && runvn=1
 runsv=
 if [ $WANT_SVE_MATH -eq 1 ]; then
 check __sv_cosf 0 && runsv=1
+check __sv_cos 0 && runsv=1
 fi

 range_erfc='
@@ -246,6 +247,11 @@ range_sve_cosf='
  0x1p-4 0x1p4 500000
 '

+range_sve_cos='
+ 0 0xffff0000 10000
+ 0x1p-4 0x1p4 500000
+'
+
 # error limits
 L_erfc=3.7
 L_erfcf=1.0
@@ -261,6 +267,7 @@ L_log1pf=2.0
 L_asinhf=2.2

 L_sve_cosf=1.6
+L_sve_cos=2.0

 while read G F R
 do
@@ -332,6 +339,8 @@ asinhf _ZGVnN4v_asinhf $runvn
 if [ $WANT_SVE_MATH -eq 1 ]; then
 sve_cosf __sv_cosf $runsv
 sve_cosf _ZGVsMxv_cosf $runsv
+sve_cos __sv_cos $runsv
+sve_cos _ZGVsMxv_cos $runsv
 fi

 EOF
diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h
index d92b3b5..d317352 100644
--- a/pl/math/test/ulp_funcs.h
+++ b/pl/math/test/ulp_funcs.h
@@ -71,5 +71,7 @@ F (_ZGVnN4v_log1pf, Z_log1pf, log1p, mpfr_log1p, 1, 1, f1, 1)
 #if WANT_SVE_MATH
 SVF1 (cos)
 ZSVF1 (cos)
+SVD1 (cos)
+ZSVD1 (cos)
 #endif
 #endif
diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h
index df25bf1..14a32b7 100644
--- a/pl/math/test/ulp_wrappers.h
+++ b/pl/math/test/ulp_wrappers.h
@@ -65,6 +65,12 @@ static float sv_cosf(float x) {
 static float Z_sv_cosf(float x) {
   return svretf(_ZGVsMxv_cosf(svargf(x), svptrue_b32()));
 }
+static double sv_cos(double x) {
+  return svretd(__sv_cos_x(svargd(x), svptrue_b64()));
+}
+static double Z_sv_cos(double x) {
+  return svretd(_ZGVsMxv_cos(svargd(x), svptrue_b64()));
+}
 #endif
 #endif
 // clang-format on
-- cgit v1.2.3


From c499115c17546ea8cb8cb7727264b6eedc7eb4d9 Mon Sep 17 00:00:00 2001
From: Victor Do Nascimento
Date: Wed, 3 Aug 2022 11:47:41 +0100
Subject: string: arm: Implement conditional leaf PAC signing

Adjust the criterion for M-profile PACBTI signing of
leaf functions to be contingent on the
+leaf option being passed to the -mbranch-protection compilation option.
---
 string/arm/memchr.S         | 12 ++++++------
 string/arm/strcmp.S         |  8 ++++----
 string/arm/strlen-armv6t2.S |  8 ++++----
 string/pacbti.h             | 14 ++++++++++----
 4 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/string/arm/memchr.S b/string/arm/memchr.S
index bc1608f..4e82ba3 100644
--- a/string/arm/memchr.S
+++ b/string/arm/memchr.S
@@ -75,7 +75,7 @@ __memchr_arm:
 	push {r4,r5,r6,r7}
 	.save {r4-r7}
 	.cfi_adjust_cfa_offset 16
-#ifdef __ARM_FEATURE_PAC_DEFAULT
+#if HAVE_PAC_LEAF
 	.cfi_offset 4, -20
 	.cfi_offset 5, -16
 	.cfi_offset 6, -12
@@ -85,7 +85,7 @@ __memchr_arm:
 	.cfi_offset 5, -12
 	.cfi_offset 6, -8
 	.cfi_offset 7, -4
-#endif /* __ARM_FEATURE_PAC_DEFAULT */
+#endif /* HAVE_PAC_LEAF */
 	orr r1, r1, r1, lsl #8 @ expand the match word across to all bytes
 	orr r1, r1, r1, lsl #16
 	bic r4, r2, #7 @ Number of double words to work with
@@ -153,20 +153,20 @@ __memchr_arm:

 61:
 	subs r0,r0,#1
-#if __ARM_FEATURE_PAC_DEFAULT
+#if HAVE_PAC_LEAF
 	pop {r4,r5,r6,r7,ip}
 	.cfi_restore 143
 #else
 	pop {r4,r5,r6,r7}
-#endif /* __ARM_FEATURE_PAC_DEFAULT */
+#endif /* HAVE_PAC_LEAF */
 	.cfi_restore 7
 	.cfi_restore 6
 	.cfi_restore 5
 	.cfi_restore 4
 	.cfi_def_cfa_offset 0
-#if __ARM_FEATURE_PAC_DEFAULT
+#if HAVE_PAC_LEAF
 	aut ip, lr, sp
-#endif /* __ARM_FEATURE_PAC_DEFAULT */
+#endif /* HAVE_PAC_LEAF */
 	bx lr
 	.cfi_endproc
 	.fnend
diff --git a/string/arm/strcmp.S b/string/arm/strcmp.S
index db96cc0..d891d33 100644
--- a/string/arm/strcmp.S
+++ b/string/arm/strcmp.S
@@ -154,23 +154,23 @@ __strcmp_arm:
 	strd r4, r5, [sp, #-16]!
 	.save {r4, r5}
 	.cfi_adjust_cfa_offset 16
-#ifdef __ARM_FEATURE_PAC_DEFAULT
+#if HAVE_PAC_LEAF
 	.cfi_offset 4, -20
 	.cfi_offset 5, -16
 #else
 	.cfi_offset 4, -16
 	.cfi_offset 5, -12
-#endif /* __ARM_FEATURE_PAC_DEFAULT */
+#endif /* HAVE_PAC_LEAF */
 	orr tmp1, src1, src2
 	strd r6, r7, [sp, #8]
 	.save {r6, r7}
-#ifdef __ARM_FEATURE_PAC_DEFAULT
+#if HAVE_PAC_LEAF
 	.cfi_offset 6, -12
 	.cfi_offset 7, -8
 #else
 	.cfi_offset 6, -8
 	.cfi_offset 7, -4
-#endif /* __ARM_FEATURE_PAC_DEFAULT */
+#endif /* HAVE_PAC_LEAF */
 	mvn const_m1, #0
 	lsl r2, tmp1, #29
 	cbz r2, L(loop_aligned8)
diff --git a/string/arm/strlen-armv6t2.S b/string/arm/strlen-armv6t2.S
index a981600..e30ddd7 100644
--- a/string/arm/strlen-armv6t2.S
+++ b/string/arm/strlen-armv6t2.S
@@ -44,7 +44,7 @@ ENTRY (__strlen_armv6t2)
 /* common pacbti_prologue macro from pacbti.h not used.
    handwritten prologue saves one push instruction. */
-#if __ARM_FEATURE_PAC_DEFAULT
+#if HAVE_PAC_LEAF
 #if __ARM_FEATURE_BTI_DEFAULT
 	pacbti ip, lr, sp
 #else
@@ -65,7 +65,7 @@ ENTRY (__strlen_armv6t2)
 	.cfi_def_cfa_offset 8
 	.cfi_offset 4, -8
 	.cfi_offset 5, -4
-#endif /* __ARM_FEATURE_PAC_DEFAULT */
+#endif /* HAVE_PAC_LEAF */
 	pld [srcin, #0]
 	bic src, srcin, #7
 	mvn const_m1, #0
@@ -125,7 +125,7 @@ L(null_found):
 #endif
 	clz data1a, data1a
 	add result, result, data1a, lsr #3 /* Bits -> Bytes. 
 */
-#if __ARM_FEATURE_PAC_DEFAULT
+#if HAVE_PAC_LEAF
 	pop {r4, r5, ip}
 	.cfi_restore 4
 	.cfi_restore 5
@@ -137,7 +137,7 @@ L(null_found):
 	.cfi_restore 4
 	.cfi_restore 5
 	.cfi_def_cfa_offset 0
-#endif /* __ARM_FEATURE_PAC_DEFAULT */
+#endif /* HAVE_PAC_LEAF */
 	bx lr

 L(misaligned8):
diff --git a/string/pacbti.h b/string/pacbti.h
index 4b6e7df..50d276a 100644
--- a/string/pacbti.h
+++ b/string/pacbti.h
@@ -5,10 +5,16 @@
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */

+/* Check whether leaf function PAC signing has been requested
+   in the -mbranch-protection compile-time option.  */
+#define LEAF_PROTECT_BIT 2
+#define HAVE_PAC_LEAF \
+	__ARM_FEATURE_PAC_DEFAULT & (1 << LEAF_PROTECT_BIT)
+
 /* Macro to handle function entry depending on branch-protection
    schemes */
 	.macro pacbti_prologue
-#if __ARM_FEATURE_PAC_DEFAULT
+#if HAVE_PAC_LEAF
 #if __ARM_FEATURE_BTI_DEFAULT
 	pacbti ip, lr, sp
 #else
@@ -20,17 +26,17 @@
 	.cfi_offset 143, -4
 #elif __ARM_FEATURE_BTI_DEFAULT
 	bti
-#endif /* __ARM_FEATURE_PAC_DEFAULT */
+#endif /* HAVE_PAC_LEAF */
 .endm

 /* Macro to handle different branch exchange cases depending on
    branch-protection schemes */
 	.macro pacbti_epilogue
-#if __ARM_FEATURE_PAC_DEFAULT
+#if HAVE_PAC_LEAF
 	ldr ip, [sp], #4
 	.cfi_restore 143
 	.cfi_def_cfa_offset 0
 	aut ip, lr, sp
-#endif /* __ARM_FEATURE_PAC_DEFAULT */
+#endif /* HAVE_PAC_LEAF */
 	bx lr
 .endm
-- cgit v1.2.3


From 3f5c5bc532c2d3781e8b080dfd1ec712a3978e8e Mon Sep 17 00:00:00 2001
From: Victor Do Nascimento
Date: Wed, 3 Aug 2022 11:48:38 +0100
Subject: string: arm: Update feature test macro use in .arch selection

Move away from the non-portable __ARM_ARCH_8M_MAIN__ feature test macro
in favour of __ARM_ARCH >= 8 when selecting the target architecture.
---
 string/arm/memchr.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/string/arm/memchr.S b/string/arm/memchr.S
index 4e82ba3..77fe569 100644
--- a/string/arm/memchr.S
+++ b/string/arm/memchr.S
@@ -23,7 +23,7 @@
 @ Removed unneeded cbz from align loop

 	.syntax unified
-#if __ARM_ARCH_8M_MAIN__
+#if __ARM_ARCH >= 8 && __ARM_ARCH_PROFILE == 'M'
 /* keep config inherited from -march= */
 #else
 	.arch armv7-a
-- cgit v1.2.3


From 250db3398da6774290a75e099a7beaa4c940f079 Mon Sep 17 00:00:00 2001
From: Victor Do Nascimento
Date: Wed, 3 Aug 2022 11:49:19 +0100
Subject: string: arm: Augment unwind information for PAC instructions

Add the `.cfi_register 143, 12' directive immediately after the pac
instruction is emitted. This ensures unwind info consumers know
immediately that, if they need the PAC for the function, they can find
it in the ip register.
---
 string/arm/strlen-armv6t2.S | 1 +
 string/pacbti.h             | 1 +
 2 files changed, 2 insertions(+)

diff --git a/string/arm/strlen-armv6t2.S b/string/arm/strlen-armv6t2.S
index e30ddd7..2601d03 100644
--- a/string/arm/strlen-armv6t2.S
+++ b/string/arm/strlen-armv6t2.S
@@ -50,6 +50,7 @@ ENTRY (__strlen_armv6t2)
 #else
 	pac ip, lr, sp
 #endif /* __ARM_FEATURE_BTI_DEFAULT */
+	.cfi_register 143, 12
 	push {r4, r5, ip}
 	.save {r4, r5, ra_auth_code}
 	.cfi_def_cfa_offset 12
diff --git a/string/pacbti.h b/string/pacbti.h
index 50d276a..9162b27 100644
--- a/string/pacbti.h
+++ b/string/pacbti.h
@@ -20,6 +20,7 @@
 #else
 	pac ip, lr, sp
 #endif /* __ARM_FEATURE_BTI_DEFAULT */
+	.cfi_register 143, 12
 	str ip, [sp, #-4]! 
.save {ra_auth_code} .cfi_def_cfa_offset 4 -- cgit v1.2.3 From 5d326aad40b2f804e672321d3412e67a14814190 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Mon, 15 Aug 2022 10:48:58 +0100 Subject: pl/math: Add vector/Neon log2f The new routine is based on the scalar algorithm in the main math directory, but with all arithmetic done in single precision. invc is represented with a high and a low part. The routine is accurate to 2.6 ULPs. --- pl/math/include/mathlib.h | 5 ++ pl/math/math_config.h | 12 ++++ pl/math/s_log2f_2u6.c | 6 ++ pl/math/test/mathbench_funcs.h | 6 ++ pl/math/test/runulp.sh | 14 ++++ pl/math/test/testcases/directed/log2f.tst | 27 ++++++++ pl/math/test/ulp_funcs.h | 4 ++ pl/math/test/ulp_wrappers.h | 3 + pl/math/v_log2f_2u6.c | 108 ++++++++++++++++++++++++++++++ pl/math/v_log2f_data.c | 37 ++++++++++ pl/math/vn_log2f_2u6.c | 12 ++++ 11 files changed, 234 insertions(+) create mode 100644 pl/math/s_log2f_2u6.c create mode 100644 pl/math/test/testcases/directed/log2f.tst create mode 100644 pl/math/v_log2f_2u6.c create mode 100644 pl/math/v_log2f_data.c create mode 100644 pl/math/vn_log2f_2u6.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 6f3b537..2ef8198 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -15,6 +15,7 @@ float erfcf (float); float erff (float); float log10f (float); float log1pf (float); +float log2f (float); double asinh (double); double atan2 (double, double); @@ -28,6 +29,7 @@ float __s_erfcf (float); float __s_erff (float); float __s_log10f (float); float __s_log1pf (float); +float __s_log2f (float); double __s_atan (double); double __s_atan2 (double, double); @@ -59,6 +61,7 @@ __f64x2_t __v_erfc (__f64x2_t); __f32x4_t __v_log10f (__f32x4_t); __f64x2_t __v_log10 (__f64x2_t); __f32x4_t __v_log1pf (__f32x4_t); +__f32x4_t __v_log2f (__f32x4_t); #if __GNUC__ >= 9 || __clang_major__ >= 8 #define __vpcs __attribute__((__aarch64_vector_pcs__)) @@ -76,6 +79,7 @@ __vpcs __f64x2_t __vn_erfc (__f64x2_t); __vpcs __f32x4_t __vn_log10f (__f32x4_t); __vpcs __f64x2_t __vn_log10 (__f64x2_t); __vpcs __f32x4_t __vn_log1pf (__f32x4_t); +__vpcs __f32x4_t __vn_log2f (__f32x4_t); /* Vector functions following the vector PCS using ABI names. */ __vpcs __f32x4_t _ZGVnN4v_asinhf (__f32x4_t); @@ -90,6 +94,7 @@ __vpcs __f64x2_t _ZGVnN2v_erfc (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t); #endif diff --git a/pl/math/math_config.h b/pl/math/math_config.h index f058753..a2719e0 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -466,4 +466,16 @@ extern const struct log1pf_data { float coeffs[LOG1PF_NCOEFFS]; } __log1pf_data HIDDEN; + +#define V_LOG2F_TABLE_BITS 4 +#define V_LOG2F_POLY_ORDER 4 +extern const struct v_log2f_data +{ + struct + { + /* Pad with dummy for quad-aligned memory access. */ + float invc_hi, invc_lo, logc, dummy; + } tab[1 << V_LOG2F_TABLE_BITS]; + float poly[V_LOG2F_POLY_ORDER]; +} __v_log2f_data HIDDEN; #endif diff --git a/pl/math/s_log2f_2u6.c b/pl/math/s_log2f_2u6.c new file mode 100644 index 0000000..8e5569d --- /dev/null +++ b/pl/math/s_log2f_2u6.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_log2f_2u6.c" diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 85ec906..338f050 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -13,6 +13,7 @@ F (erfcf, -4.0, 10.0) F (erff, -4.0, 4.0) F (log10f, 0.01, 11.1) F (log1pf, -0.9, 10.0) +F (log2f, 0.01, 11.1) D (asinh, -10.0, 10.0) D (atan, -10.0, 10.0) @@ -36,6 +37,7 @@ D (__s_erfc, -6.0, 28.0) F (__s_log10f, 0.01, 11.1) D (__s_log10, 0.01, 11.1) F (__s_log1pf, -0.9, 10.0) +F (__s_log2f, 0.01, 11.1) #if __aarch64__ VF (__v_asinhf, -10.0, 10.0) VF (__v_atanf, -10.0, 10.0) @@ -49,6 +51,7 @@ VD (__v_erfc, -6.0, 28.0) VD (__v_log10, 0.01, 11.1) VF (__v_log10f, 0.01, 11.1) VF (__v_log1pf, -0.9, 10.0) +VF (__v_log2f, 0.01, 11.1) #ifdef __vpcs VNF (__vn_asinhf, -10.0, 10.0) VNF (_ZGVnN4v_asinhf, -10.0, 10.0) @@ -85,6 +88,9 @@ VND (_ZGVnN2v_log10, 0.01, 11.1) VNF (__vn_log1pf, -0.9, 10.0) VNF (_ZGVnN4v_log1pf, -0.9, 10.0) + +VNF (__vn_log2f, 0.01, 11.1) +VNF (_ZGVnN4v_log2f, 0.01, 11.1) #endif #endif #if WANT_SVE_MATH diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index d5ae82b..2ef5d5b 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -242,6 +242,15 @@ range_asinhf=' -0x1p11 -inf 20000 ' +range_log2f=' + -0.0 -0x1p126 100 + 0x1p-149 0x1p-126 4000 + 0x1p-126 0x1p-23 50000 + 0x1p-23 1.0 50000 + 1.0 100 50000 + 100 inf 50000 +' + range_sve_cosf=' 0 0xffff0000 10000 0x1p-4 0x1p4 500000 @@ -265,6 +274,7 @@ L_atan2f=3.0 L_atanf=3.0 L_log1pf=2.0 L_asinhf=2.2 +L_log2f=2.6 L_sve_cosf=1.6 L_sve_cos=2.0 @@ -335,6 +345,10 @@ asinhf __s_asinhf $runs asinhf __v_asinhf $runv asinhf __vn_asinhf $runvn asinhf _ZGVnN4v_asinhf $runvn +log2f __s_log2f $runs +log2f __v_log2f $runv +log2f __vn_log2f $runvn +log2f _ZGVnN4v_log2f $runvn if [ $WANT_SVE_MATH -eq 1 ]; then sve_cosf __sv_cosf $runsv diff --git a/pl/math/test/testcases/directed/log2f.tst b/pl/math/test/testcases/directed/log2f.tst new file mode 100644 index 0000000..9e99c53 --- /dev/null +++ b/pl/math/test/testcases/directed/log2f.tst @@ -0,0 +1,27 @@ +; log2f.tst - Directed test cases for log2f +; +; Copyright (c) 2017-2022, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=log2f op1=7fc00001 result=7fc00001 errno=0 +func=log2f op1=ffc00001 result=7fc00001 errno=0 +func=log2f op1=7f800001 result=7fc00001 errno=0 status=i +func=log2f op1=ff800001 result=7fc00001 errno=0 status=i +func=log2f op1=ff810000 result=7fc00001 errno=0 status=i +func=log2f op1=7f800000 result=7f800000 errno=0 +func=log2f op1=ff800000 result=7fc00001 errno=EDOM status=i +func=log2f op1=3f800000 result=00000000 errno=0 +func=log2f op1=00000000 result=ff800000 errno=ERANGE status=z +func=log2f op1=80000000 result=ff800000 errno=ERANGE status=z +func=log2f op1=80000001 result=7fc00001 errno=EDOM status=i + +func=log2f op1=3f7d70a4 result=bc6d8f8b.7d4 error=0 +func=log2f op1=3f604189 result=be4394c8.395 error=0 +func=log2f op1=3f278034 result=bf1caa73.88e error=0 +func=log2f op1=3edd3c36 result=bf9af3b9.619 error=0 +func=log2f op1=3e61259a result=c00bdb95.650 error=0 +func=log2f op1=3f8147ae result=3c6b3267.d6a error=0 +func=log2f op1=3f8fbe77 result=3e2b5fe2.a1c error=0 +func=log2f op1=3fac3eea result=3edb4d5e.1fc error=0 +func=log2f op1=3fd6e632 result=3f3f5d3a.827 error=0 +func=log2f op1=40070838 result=3f89e055.a0a error=0 diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index d317352..f9f0233 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -28,6 +28,7 @@ F (__s_erfc, __s_erfc, erfcl, mpfr_erfc, 1, 0, d1, 0) F (__s_log10f, __s_log10f, log10, mpfr_log10, 1, 1, f1, 0) F (__s_log10, __s_log10, log10l, mpfr_log10, 1, 0, d1, 0) F (__s_log1pf, __s_log1pf, log1p, mpfr_log1p, 1, 1, f1, 0) +F (__s_log2f, __s_log2f, log2, mpfr_log2, 1, 1, f1, 0) #if __aarch64__ F (__v_asinhf, v_asinhf, asinh, mpfr_asinh, 1, 1, f1, 1) F (__v_atanf, v_atanf, atan, mpfr_atan, 1, 1, f1, 1) @@ -41,6 +42,7 @@ F (__v_erfc, v_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) F (__v_log10f, v_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (__v_log10, v_log10, log10l, mpfr_log10, 1, 0, d1, 1) F (__v_log1pf, v_log1pf, log1p, mpfr_log1p, 1, 1, f1, 1) +F (__v_log2f, v_log2f, log2, mpfr_log2, 1, 1, f1, 1) #ifdef __vpcs F (__vn_asinhf, vn_asinhf, asinh, mpfr_asinh, 1, 1, f1, 1) F (__vn_atanf, vn_atanf, atan, mpfr_atan, 1, 1, f1, 1) @@ -54,6 +56,7 @@ F (__vn_erfc, vn_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) F (__vn_log10f, vn_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (__vn_log10, vn_log10, log10l, mpfr_log10, 1, 0, d1, 1) F (__vn_log1pf, vn_log1pf, log1p, mpfr_log1p, 1, 1, f1, 1) +F (__vn_log2f, vn_log2f, log2, mpfr_log2, 1, 1, f1, 1) F (_ZGVnN4v_asinhf, Z_asinhf, asinh, mpfr_asinh, 1, 1, f1, 1) F (_ZGVnN4v_atanf, Z_atanf, atan, mpfr_atan, 1, 1, f1, 1) F (_ZGVnN2v_atan, Z_atan, atanl, mpfr_atan, 1, 0, d1, 1) @@ -66,6 +69,7 @@ F (_ZGVnN2v_erfc, Z_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) F (_ZGVnN4v_log10f, Z_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (_ZGVnN2v_log10, Z_log10, log10l, mpfr_log10, 1, 0, d1, 1) F (_ZGVnN4v_log1pf, Z_log1pf, log1p, mpfr_log1p, 1, 1, f1, 1) +F (_ZGVnN4v_log2f, Z_log2f, log2, mpfr_log2, 1, 1, f1, 1) #endif #endif #if WANT_SVE_MATH diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 14a32b7..274e119 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -26,6 +26,7 @@ static float v_erff(float x) { return __v_erff(argf(x))[0]; } static float v_erfcf(float x) { return __v_erfcf(argf(x))[0]; } static float v_log10f(float x) { return __v_log10f(argf(x))[0]; } static float v_log1pf(float x) { return __v_log1pf(argf(x))[0]; } +static float v_log2f(float x) { return 
__v_log2f(argf(x))[0]; }
 static double v_atan(double x) { return __v_atan(argd(x))[0]; }
 static double v_atan2(double x, double y) { return __v_atan2(argd(x), argd(y))[0]; }
 static double v_erf(double x) { return __v_erf(argd(x))[0]; }
@@ -39,6 +40,7 @@ static float vn_erff(float x) { return __vn_erff(argf(x))[0]; }
 static float vn_erfcf(float x) { return __vn_erfcf(argf(x))[0]; }
 static float vn_log10f(float x) { return __vn_log10f(argf(x))[0]; }
 static float vn_log1pf(float x) { return __vn_log1pf(argf(x))[0]; }
+static float vn_log2f(float x) { return __vn_log2f(argf(x))[0]; }
 static double vn_atan(double x) { return __vn_atan(argd(x))[0]; }
 static double vn_atan2(double x, double y) { return __vn_atan2(argd(x), argd(y))[0]; }
 static double vn_erf(double x) { return __vn_erf(argd(x))[0]; }
@@ -52,6 +54,7 @@ static float Z_erff(float x) { return _ZGVnN4v_erff(argf(x))[0]; }
 static float Z_erfcf(float x) { return _ZGVnN4v_erfcf(argf(x))[0]; }
 static float Z_log10f(float x) { return _ZGVnN4v_log10f(argf(x))[0]; }
 static float Z_log1pf(float x) { return _ZGVnN4v_log1pf(argf(x))[0]; }
+static float Z_log2f(float x) { return _ZGVnN4v_log2f(argf(x))[0]; }
 static double Z_atan(double x) { return _ZGVnN2v_atan(argd(x))[0]; }
 static double Z_atan2(double x, double y) { return _ZGVnN2vv_atan2(argd(x), argd(y))[0]; }
 static double Z_erf(double x) { return _ZGVnN2v_erf(argd(x))[0]; }
diff --git a/pl/math/v_log2f_2u6.c b/pl/math/v_log2f_2u6.c
new file mode 100644
index 0000000..ce46206
--- /dev/null
+++ b/pl/math/v_log2f_2u6.c
@@ -0,0 +1,108 @@
+/*
+ * Single-precision vector log2 function.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "v_math.h"
+#if V_SUPPORTED
+
+#define N (1 << V_LOG2F_TABLE_BITS)
+#define T __v_log2f_data.tab
+#define A __v_log2f_data.poly
+#define OFF 0x3f330000
+
+static float
+handle_special (float x)
+{
+  if (x < 0)
+    /* log2f(-anything) = NaN.  */
+    return NAN;
+  if (x == 0)
+    /* log2f(0) = -Inf.  */
+    return __math_divzerof (1);
+  /* log2f(Inf) = Inf
+     log2f(NaN) = NaN
+     log2f(-NaN) = -NaN.  */
+  return x;
+}
+
+static float
+normalise (float x)
+{
+  return asfloat (asuint (x * 0x1p23f) - (23 << 23));
+}
+
+#ifdef SCALAR
+
+#define DEFINE_LOOKUP_FUNC(p) \
+  static inline float lookup_##p (uint32_t i) { return T[i].p; }
+
+#else
+
+#define DEFINE_LOOKUP_FUNC(p) \
+  static inline v_f32_t lookup_##p (v_u32_t i) \
+  { \
+    return (v_f32_t){T[i[0]].p, T[i[1]].p, T[i[2]].p, T[i[3]].p}; \
+  }
+
+#endif
+
+DEFINE_LOOKUP_FUNC (invc_lo)
+DEFINE_LOOKUP_FUNC (invc_hi)
+DEFINE_LOOKUP_FUNC (logc)
+
+/* Single-precision vector log2 routine. Implements the same algorithm as
+   scalar log2f, but using only single-precision arithmetic, with invc
+   represented as a two-limb float. Accurate to 2.6 ulp. The maximum error is
+   near sqrt(2):
+   __v_log2f(0x1.6a0484p+0) got 0x1.ffea02p-2
+			    want 0x1.ffea08p-2.  */
+VPCS_ATTR v_f32_t V_NAME (log2f) (v_f32_t x)
+{
+  v_u32_t ix = v_as_u32_f32 (x);
+
+  /* x is +-Inf, +-NaN, 0 or -ve.  */
+  v_u32_t special = v_cond_u32 (ix >= 0x7f800000) | v_cond_u32 (ix == 0);
+  /* |x| < 2^-126 (i.e. x is subnormal).  */
+  v_u32_t subnorm = v_cond_u32 (v_calt_f32 (x, v_f32 (0x1p-126f)));
+
+  if (unlikely (v_any_u32 (subnorm)))
+    /* Normalize any subnormals.  */
+    ix = v_as_u32_f32 (v_call_f32 (normalise, x, x, subnorm));
+
+  /* x = 2^k z; where z is in range [OFF,2*OFF] and exact.
+     The range is split into N subintervals. 
+ The ith subinterval contains z and c is near its center. */ + v_u32_t tmp = ix - OFF; + v_u32_t i = (tmp >> (23 - V_LOG2F_TABLE_BITS)) % N; + v_u32_t top = tmp & 0xff800000; + v_u32_t iz = ix - top; + v_f32_t k = v_to_f32_s32 (v_as_s32_u32 (tmp) >> 23); /* Arithmetic shift. */ + v_f32_t z = v_as_f32_u32 (iz); + + v_f32_t invc_lo = lookup_invc_lo (i); + v_f32_t invc_hi = lookup_invc_hi (i); + v_f32_t logc = lookup_logc (i); + + /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */ + v_f32_t r = v_fma_f32 (z, invc_hi, v_f32 (-1)); + r = v_fma_f32 (z, invc_lo, r); + v_f32_t y0 = logc + k; + + /* Pipelined polynomial evaluation to approximate log1p(r)/ln2. */ + v_f32_t r2 = r * r; + v_f32_t y = v_fma_f32 (v_f32 (A[1]), r, v_f32 (A[2])); + y = v_fma_f32 (v_f32 (A[0]), r2, y); + v_f32_t p = v_fma_f32 (v_f32 (A[3]), r, y0); + y = v_fma_f32 (y, r2, p); + + if (unlikely (v_any_u32 (special))) + return v_call_f32 (handle_special, x, y, special); + + return y; +} +VPCS_ALIAS +#endif diff --git a/pl/math/v_log2f_data.c b/pl/math/v_log2f_data.c new file mode 100644 index 0000000..e6c1f71 --- /dev/null +++ b/pl/math/v_log2f_data.c @@ -0,0 +1,37 @@ +/* + * Coefficients and table entries for vector log2f + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +const struct v_log2f_data __v_log2f_data = { +/* All values here are derived from the values in math/log2f_data.c. + For all i: + tab[i].invc_hi = (float) log2f_data.invc + tab[i].invc_lo = log2f_data.invc - (double) tab[i].invc_hi + tab[i].logc = (float) log2f_data.logc + poly[i] = (float) log2f_data.poly[i]. */ + .tab = { + { 0x1.661ec8p+0, -0x1.81c31p-26, -0x1.efec66p-2, 0}, + { 0x1.571ed4p+0, 0x1.55f108p-25, -0x1.b0b684p-2, 0}, + { 0x1.4953ap+0, -0x1.e1fdeap-25, -0x1.7418bp-2, 0}, + { 0x1.3c995cp+0, -0x1.e8ff9p-25, -0x1.39de92p-2, 0}, + { 0x1.30d19p+0, 0x1.910c94p-25, -0x1.01d9cp-2, 0}, + { 0x1.25e228p+0, -0x1.3d1c58p-26, -0x1.97c1d2p-3, 0}, + { 0x1.1bb4a4p+0, 0x1.434688p-25, -0x1.2f9e3ap-3, 0}, + { 0x1.12359p+0, -0x1.eea348p-25, -0x1.960cbcp-4, 0}, + { 0x1.0953f4p+0, 0x1.9900a8p-28, -0x1.a6f9dcp-5, 0}, + { 0x1p+0, 0x0p+0, 0x0p+0, 0}, + { 0x1.e608dp-1, -0x1.32dc2ap-28, 0x1.338caap-4, 0}, + { 0x1.ca4b32p-1, -0x1.fb2acp-30, 0x1.476a96p-3, 0}, + { 0x1.b20366p-1, -0x1.12a064p-26, 0x1.e840b4p-3, 0}, + { 0x1.9c2d16p-1, 0x1.d0d516p-28, 0x1.40646p-2, 0}, + { 0x1.886e6p-1, 0x1.bc20f6p-28, 0x1.88e9c2p-2, 0}, + { 0x1.767ddp-1, -0x1.5596f4p-26, 0x1.ce0a44p-2, 0}, + }, + .poly = { -0x1.712b70p-2, 0x1.ecabf4p-2, + -0x1.71547ap-1, 0x1.715476p+0 } +}; diff --git a/pl/math/vn_log2f_2u6.c b/pl/math/vn_log2f_2u6.c new file mode 100644 index 0000000..dc5ab03 --- /dev/null +++ b/pl/math/vn_log2f_2u6.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_log2f. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_log2f, _ZGVnN4v_log2f) +#include "v_log2f_2u6.c" +#endif -- cgit v1.2.3 From 46ee91e02f58caf2a7db402bdcca2ede6bb7285c Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Mon, 15 Aug 2022 10:55:38 +0100 Subject: pl/math: Add vector/Neon log2 New routine uses the same algorithm as vector log10, with scaled coefficients. Accurate to 2.5 ulp. 
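
For reference, the shared reduction can be sketched in scalar C. This is an
illustrative sketch only, not the shipped code: sketch_log2 is a hypothetical
name, the table is collapsed to a single entry (invc = 1, log2c = 0) and a
short log1p polynomial stands in for the real 128-entry __v_log2_data table
and degree-6 polynomial, so its accuracy is far worse than 2.5 ulp, and
special cases (0, negative, Inf/NaN, subnormals) are ignored:

  #include <stdint.h>
  #include <string.h>

  static double
  sketch_log2 (double x)
  {
    /* x = 2^k z, with z in [OFF, 2*OFF) and exact (OFF ~= 0.70).  */
    uint64_t ix, tmp, iz;
    memcpy (&ix, &x, sizeof (ix));
    tmp = ix - 0x3fe6900900000000;
    int64_t k = (int64_t) tmp >> 52; /* arithmetic shift.  */
    iz = ix - (tmp & (0xfffULL << 52));
    double z;
    memcpy (&z, &iz, sizeof (z));
    /* The real routine looks up invc = 1/c and log2c so that |r| stays
       below about 0x1p-7; with the single-entry stand-in, r = z - 1.  */
    double invc = 1.0, log2c = 0.0;
    double r = z * invc - 1.0;
    /* log2(x) = log1p(r) * log2(e) + log2(c) + k, i.e. the log10 identity
       with the polynomial and table scaled by log2(e) instead of log10(e).  */
    double p = r - 0.5 * r * r + r * r * r / 3; /* crude log1p(r).  */
    return p * 0x1.71547652b82fep0 + log2c + (double) k;
  }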
--- pl/math/include/mathlib.h | 5 + pl/math/math_config.h | 12 +++ pl/math/s_log2_2u5.c | 6 ++ pl/math/test/mathbench_funcs.h | 6 ++ pl/math/test/runulp.sh | 15 +++ pl/math/test/testcases/directed/log2.tst | 21 +++++ pl/math/test/ulp_funcs.h | 5 + pl/math/test/ulp_wrappers.h | 3 + pl/math/v_log2_2u5.c | 89 ++++++++++++++++++ pl/math/v_log2_data.c | 155 +++++++++++++++++++++++++++++++ pl/math/vn_log2_2u5.c | 12 +++ 11 files changed, 329 insertions(+) create mode 100644 pl/math/s_log2_2u5.c create mode 100644 pl/math/test/testcases/directed/log2.tst create mode 100644 pl/math/v_log2_2u5.c create mode 100644 pl/math/v_log2_data.c create mode 100644 pl/math/vn_log2_2u5.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 2ef8198..d06b2ff 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -21,6 +21,7 @@ double asinh (double); double atan2 (double, double); double log10 (double); double log1p (double); +double log2 (double); float __s_asinhf (float); float __s_atanf (float); @@ -36,6 +37,7 @@ double __s_atan2 (double, double); double __s_erf (double); double __s_erfc (double); double __s_log10 (double); +double __s_log2 (double); #if __aarch64__ #if __GNUC__ >= 5 @@ -62,6 +64,7 @@ __f32x4_t __v_log10f (__f32x4_t); __f64x2_t __v_log10 (__f64x2_t); __f32x4_t __v_log1pf (__f32x4_t); __f32x4_t __v_log2f (__f32x4_t); +__f64x2_t __v_log2 (__f64x2_t); #if __GNUC__ >= 9 || __clang_major__ >= 8 #define __vpcs __attribute__((__aarch64_vector_pcs__)) @@ -80,6 +83,7 @@ __vpcs __f32x4_t __vn_log10f (__f32x4_t); __vpcs __f64x2_t __vn_log10 (__f64x2_t); __vpcs __f32x4_t __vn_log1pf (__f32x4_t); __vpcs __f32x4_t __vn_log2f (__f32x4_t); +__vpcs __f64x2_t __vn_log2 (__f64x2_t); /* Vector functions following the vector PCS using ABI names. */ __vpcs __f32x4_t _ZGVnN4v_asinhf (__f32x4_t); @@ -95,6 +99,7 @@ __vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t); #endif diff --git a/pl/math/math_config.h b/pl/math/math_config.h index a2719e0..e22a5b4 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -478,4 +478,16 @@ extern const struct v_log2f_data } tab[1 << V_LOG2F_TABLE_BITS]; float poly[V_LOG2F_POLY_ORDER]; } __v_log2f_data HIDDEN; + +#define V_LOG2_TABLE_BITS 7 +#define V_LOG2_POLY_ORDER 7 +extern const struct v_log2_data +{ + double poly[V_LOG2_POLY_ORDER - 1]; + struct + { + double invc, log2c; + } tab[1 << V_LOG2_TABLE_BITS]; +} __v_log2_data HIDDEN; + #endif diff --git a/pl/math/s_log2_2u5.c b/pl/math/s_log2_2u5.c new file mode 100644 index 0000000..f5e8e4d --- /dev/null +++ b/pl/math/s_log2_2u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_log2_2u5.c" diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 338f050..b33f369 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -23,6 +23,7 @@ D (erf, -6,6) D (erfc, -6.0, 28.0) D (log10, 0.01, 11.1) D (log1p, -0.9, 10.0) +D (log2, 0.01, 11.1) #if WANT_VMATH F (__s_asinhf, -10.0, 10.0) @@ -38,6 +39,7 @@ F (__s_log10f, 0.01, 11.1) D (__s_log10, 0.01, 11.1) F (__s_log1pf, -0.9, 10.0) F (__s_log2f, 0.01, 11.1) +D (__s_log2, 0.01, 11.1) #if __aarch64__ VF (__v_asinhf, -10.0, 10.0) VF (__v_atanf, -10.0, 10.0) @@ -52,6 +54,7 @@ VD (__v_log10, 0.01, 11.1) VF (__v_log10f, 0.01, 11.1) VF (__v_log1pf, -0.9, 10.0) VF (__v_log2f, 0.01, 11.1) +VD (__v_log2, 0.01, 11.1) #ifdef __vpcs VNF (__vn_asinhf, -10.0, 10.0) VNF (_ZGVnN4v_asinhf, -10.0, 10.0) @@ -91,6 +94,9 @@ VNF (_ZGVnN4v_log1pf, -0.9, 10.0) VNF (__vn_log2f, 0.01, 11.1) VNF (_ZGVnN4v_log2f, 0.01, 11.1) + +VND (__vn_log2, 0.01, 11.1) +VND (_ZGVnN2v_log2, 0.01, 11.1) #endif #endif #if WANT_SVE_MATH diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 2ef5d5b..fef96c1 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -251,6 +251,15 @@ range_log2f=' 100 inf 50000 ' +range_log2=' + -0.0 -0x1p126 100 + 0x1p-149 0x1p-126 4000 + 0x1p-126 0x1p-23 50000 + 0x1p-23 1.0 50000 + 1.0 100 50000 + 100 inf 50000 +' + range_sve_cosf=' 0 0xffff0000 10000 0x1p-4 0x1p4 500000 @@ -275,6 +284,8 @@ L_atanf=3.0 L_log1pf=2.0 L_asinhf=2.2 L_log2f=2.6 +# TODO tighten log2 bound +L_log2=3 L_sve_cosf=1.6 L_sve_cos=2.0 @@ -316,6 +327,10 @@ log10 __s_log10 $runs log10 __v_log10 $runv log10 __vn_log10 $runvn log10 _ZGVnN2v_log10 $runvn +log2 __s_log2 $runs +log2 __v_log2 $runv +log2 __vn_log2 $runvn +log2 _ZGVnN2v_log2 $runvn atanf __s_atanf $runs atanf __v_atanf $runv diff --git a/pl/math/test/testcases/directed/log2.tst b/pl/math/test/testcases/directed/log2.tst new file mode 100644 index 0000000..c84ff65 --- /dev/null +++ b/pl/math/test/testcases/directed/log2.tst @@ -0,0 +1,21 @@ +; Directed test cases for log2 +; +; Copyright (c) 2018-2022, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=log2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=log2 op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=log2 op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=log2 op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=log2 op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 +func=log2 op1=fff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=log2 op1=7fefffff.ffffffff result=408fffff.ffffffff.ffa errno=0 +func=log2 op1=ffefffff.ffffffff result=7ff80000.00000001 errno=EDOM status=i +func=log2 op1=3ff00000.00000000 result=00000000.00000000 errno=0 +func=log2 op1=bff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=log2 op1=00000000.00000000 result=fff00000.00000000 errno=ERANGE status=z +func=log2 op1=80000000.00000000 result=fff00000.00000000 errno=ERANGE status=z +func=log2 op1=00000000.00000001 result=c090c800.00000000 errno=0 +func=log2 op1=80000000.00000001 result=7ff80000.00000001 errno=EDOM status=i +func=log2 op1=40000000.00000000 result=3ff00000.00000000 errno=0 +func=log2 op1=3fe00000.00000000 result=bff00000.00000000 errno=0 diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index f9f0233..7325cfd 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -15,6 +15,7 @@ D2 (atan2) D1 (erfc) D1 (log10) D1 (log1p) +D1 (log2) #if WANT_VMATH F (__s_asinhf, __s_asinhf, asinh, mpfr_asinh, 1, 1, f1, 0) F (__s_atanf, __s_atanf, atan, mpfr_atan, 1, 1, f1, 0) @@ -29,6 +30,7 @@ F (__s_log10f, __s_log10f, log10, mpfr_log10, 1, 1, f1, 0) F (__s_log10, __s_log10, log10l, mpfr_log10, 1, 0, d1, 0) F (__s_log1pf, __s_log1pf, log1p, mpfr_log1p, 1, 1, f1, 0) F (__s_log2f, __s_log2f, log2, mpfr_log2, 1, 1, f1, 0) +F (__s_log2, __s_log2, log2l, mpfr_log2, 1, 0, d1, 0) #if __aarch64__ F (__v_asinhf, v_asinhf, asinh, mpfr_asinh, 1, 1, f1, 1) F (__v_atanf, v_atanf, atan, mpfr_atan, 1, 1, f1, 1) @@ -43,6 +45,7 @@ F (__v_log10f, v_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (__v_log10, v_log10, log10l, mpfr_log10, 1, 0, d1, 1) F (__v_log1pf, v_log1pf, log1p, mpfr_log1p, 1, 1, f1, 1) F (__v_log2f, v_log2f, log2, mpfr_log2, 1, 1, f1, 1) +F (__v_log2, v_log2, log2l, mpfr_log2, 1, 0, d1, 1) #ifdef __vpcs F (__vn_asinhf, vn_asinhf, asinh, mpfr_asinh, 1, 1, f1, 1) F (__vn_atanf, vn_atanf, atan, mpfr_atan, 1, 1, f1, 1) @@ -57,6 +60,7 @@ F (__vn_log10f, vn_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (__vn_log10, vn_log10, log10l, mpfr_log10, 1, 0, d1, 1) F (__vn_log1pf, vn_log1pf, log1p, mpfr_log1p, 1, 1, f1, 1) F (__vn_log2f, vn_log2f, log2, mpfr_log2, 1, 1, f1, 1) +F (__vn_log2, vn_log2, log2l, mpfr_log2, 1, 0, d1, 1) F (_ZGVnN4v_asinhf, Z_asinhf, asinh, mpfr_asinh, 1, 1, f1, 1) F (_ZGVnN4v_atanf, Z_atanf, atan, mpfr_atan, 1, 1, f1, 1) F (_ZGVnN2v_atan, Z_atan, atanl, mpfr_atan, 1, 0, d1, 1) @@ -70,6 +74,7 @@ F (_ZGVnN4v_log10f, Z_log10f, log10, mpfr_log10, 1, 1, f1, 1) F (_ZGVnN2v_log10, Z_log10, log10l, mpfr_log10, 1, 0, d1, 1) F (_ZGVnN4v_log1pf, Z_log1pf, log1p, mpfr_log1p, 1, 1, f1, 1) F (_ZGVnN4v_log2f, Z_log2f, log2, mpfr_log2, 1, 1, f1, 1) +F (_ZGVnN2v_log2, Z_log2, log2l, mpfr_log2, 1, 0, d1, 1) #endif #endif #if WANT_SVE_MATH diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 274e119..98edb99 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -32,6 +32,7 @@ static double v_atan2(double x, double y) { return __v_atan2(argd(x), argd(y))[0 static double 
v_erf(double x) { return __v_erf(argd(x))[0]; } static double v_erfc(double x) { return __v_erfc(argd(x))[0]; } static double v_log10(double x) { return __v_log10(argd(x))[0]; } +static double v_log2(double x) { return __v_log2(argd(x))[0]; } #ifdef __vpcs static float vn_asinhf(float x) { return __vn_asinhf(argf(x))[0]; } static float vn_atanf(float x) { return __vn_atanf(argf(x))[0]; } @@ -46,6 +47,7 @@ static double vn_atan2(double x, double y) { return __vn_atan2(argd(x), argd(y)) static double vn_erf(double x) { return __vn_erf(argd(x))[0]; } static double vn_erfc(double x) { return __vn_erfc(argd(x))[0]; } static double vn_log10(double x) { return __vn_log10(argd(x))[0]; } +static double vn_log2(double x) { return __vn_log2(argd(x))[0]; } static float Z_asinhf(float x) { return _ZGVnN4v_asinhf(argf(x))[0]; } static float Z_atanf(float x) { return _ZGVnN4v_atanf(argf(x))[0]; } @@ -60,6 +62,7 @@ static double Z_atan2(double x, double y) { return _ZGVnN2vv_atan2(argd(x), argd static double Z_erf(double x) { return _ZGVnN2v_erf(argd(x))[0]; } static double Z_erfc(double x) { return _ZGVnN2v_erfc(argd(x))[0]; } static double Z_log10(double x) { return _ZGVnN2v_log10(argd(x))[0]; } +static double Z_log2(double x) { return _ZGVnN2v_log2(argd(x))[0]; } #endif #if WANT_SVE_MATH static float sv_cosf(float x) { diff --git a/pl/math/v_log2_2u5.c b/pl/math/v_log2_2u5.c new file mode 100644 index 0000000..5b1014c --- /dev/null +++ b/pl/math/v_log2_2u5.c @@ -0,0 +1,89 @@ +/* + * Double-precision vector log2 function. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "include/mathlib.h" +#include "v_math.h" +#if V_SUPPORTED + +#define InvLn2 v_f64 (0x1.71547652b82fep0) +#define N (1 << V_LOG2_TABLE_BITS) +#define OFF v_u64 (0x3fe6900900000000) +#define P(i) v_f64 (__v_log2_data.poly[i]) + +struct entry +{ + v_f64_t invc; + v_f64_t log2c; +}; + +static inline struct entry +lookup (v_u64_t i) +{ + struct entry e; +#ifdef SCALAR + e.invc = __v_log2_data.tab[i].invc; + e.log2c = __v_log2_data.tab[i].log2c; +#else + e.invc[0] = __v_log2_data.tab[i[0]].invc; + e.log2c[0] = __v_log2_data.tab[i[0]].log2c; + e.invc[1] = __v_log2_data.tab[i[1]].invc; + e.log2c[1] = __v_log2_data.tab[i[1]].log2c; +#endif + return e; +} + +VPCS_ATTR +NOINLINE static v_f64_t +specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) +{ + return v_call_f64 (log2, x, y, cmp); +} + +/* Double-precision vector log2 routine. Implements the same algorithm as vector + log10, with coefficients and table entries scaled in extended precision. + The maximum observed error is 2.26 ULP, at roughly 0.84: + __v_log2(0x1.aee6cb4e12a19p-1) got -0x1.fd8348301747fp-3 + want -0x1.fd8348301747dp-3. */ +VPCS_ATTR +v_f64_t V_NAME (log2) (v_f64_t x) +{ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t top = ix >> 48; + v_u64_t special + = v_cond_u64 (top - v_u64 (0x0010) >= v_u64 (0x7ff0 - 0x0010)); + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + v_u64_t tmp = ix - OFF; + v_u64_t i = (tmp >> (52 - V_LOG2_TABLE_BITS)) % N; + v_s64_t k = v_as_s64_u64 (tmp) >> 52; /* arithmetic shift. */ + v_u64_t iz = ix - (tmp & v_u64 (0xfffULL << 52)); + v_f64_t z = v_as_f64_u64 (iz); + struct entry e = lookup (i); + + /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. 
*/ + + v_f64_t r = v_fma_f64 (z, e.invc, v_f64 (-1.0)); + v_f64_t kd = v_to_f64_s64 (k); + v_f64_t w = v_fma_f64 (r, InvLn2, e.log2c); + + v_f64_t r2 = r * r; + v_f64_t p_45 = v_fma_f64 (P (5), r, P (4)); + v_f64_t p_23 = v_fma_f64 (P (3), r, P (2)); + v_f64_t p_01 = v_fma_f64 (P (1), r, P (0)); + v_f64_t y = v_fma_f64 (r2, p_45, p_23); + y = v_fma_f64 (r2, y, p_01); + y = v_fma_f64 (r2, y, kd + w); + + if (unlikely (v_any_u64 (special))) + return specialcase (x, y, special); + return y; +} +VPCS_ALIAS + +#endif diff --git a/pl/math/v_log2_data.c b/pl/math/v_log2_data.c new file mode 100644 index 0000000..f926d7f --- /dev/null +++ b/pl/math/v_log2_data.c @@ -0,0 +1,155 @@ +/* + * Coefficients and table entries for vector log2 + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define N (1 << V_LOG2_TABLE_BITS) + +// clang-format off + +const struct v_log2_data __v_log2_data = { + +/* Derived from the coefficients in log_data.c for N == 128 && LOG_POLY_ORDER == 7. + Each coefficient was scaled by log2(e) in extended precision and rounded back to + double. */ +.poly = { -0x1.71547652b83p-1, 0x1.ec709dc3a03fep-2, -0x1.71547651bb77bp-2, + 0x1.2776c50e7a6a3p-2, -0x1.ec73b0462606bp-3, 0x1.a619832ca8615p-3 }, + +/* Derived from the table in v_log10_data.c. invc is unchanged. log2(c) was + calculated by scaling log10(c) by log2(10) in extended precision and rounding + back. */ +.tab = { +{ 0x1.6a133d0dec120p+0, -0x1.00130d57f5fadp-1 }, +{ 0x1.6815f2f3e42edp+0, -0x1.f802661bd725ep-2 }, +{ 0x1.661e39be1ac9ep+0, -0x1.efea1c6f73a5bp-2 }, +{ 0x1.642bfa30ac371p+0, -0x1.e7dd1dcd06f05p-2 }, +{ 0x1.623f1d916f323p+0, -0x1.dfdb4ae024809p-2 }, +{ 0x1.60578da220f65p+0, -0x1.d7e484d101958p-2 }, +{ 0x1.5e75349dea571p+0, -0x1.cff8ad452f6ep-2 }, +{ 0x1.5c97fd387a75ap+0, -0x1.c817a666c997fp-2 }, +{ 0x1.5abfd2981f200p+0, -0x1.c04152d640419p-2 }, +{ 0x1.58eca051dc99cp+0, -0x1.b87595a3f64b2p-2 }, +{ 0x1.571e526d9df12p+0, -0x1.b0b4526c44d07p-2 }, +{ 0x1.5554d555b3fcbp+0, -0x1.a8fd6d1a90f5ep-2 }, +{ 0x1.539015e2a20cdp+0, -0x1.a150ca2559fc6p-2 }, +{ 0x1.51d0014ee0164p+0, -0x1.99ae4e62cca29p-2 }, +{ 0x1.50148538cd9eep+0, -0x1.9215df1a1e842p-2 }, +{ 0x1.4e5d8f9f698a1p+0, -0x1.8a8761fe1f0d9p-2 }, +{ 0x1.4cab0edca66bep+0, -0x1.8302bd1cc9a54p-2 }, +{ 0x1.4afcf1a9db874p+0, -0x1.7b87d6fb437f6p-2 }, +{ 0x1.495327136e16fp+0, -0x1.741696673a86dp-2 }, +{ 0x1.47ad9e84af28fp+0, -0x1.6caee2b3c6fe4p-2 }, +{ 0x1.460c47b39ae15p+0, -0x1.6550a3666c27ap-2 }, +{ 0x1.446f12b278001p+0, -0x1.5dfbc08de02a4p-2 }, +{ 0x1.42d5efdd720ecp+0, -0x1.56b022766c84ap-2 }, +{ 0x1.4140cfe001a0fp+0, -0x1.4f6db1c955536p-2 }, +{ 0x1.3fafa3b421f69p+0, -0x1.4834579063054p-2 }, +{ 0x1.3e225c9c8ece5p+0, -0x1.4103fd2249a76p-2 }, +{ 0x1.3c98ec29a211ap+0, -0x1.39dc8c3fe6dabp-2 }, +{ 0x1.3b13442a413fep+0, -0x1.32bdeed4b5c8fp-2 }, +{ 0x1.399156baa3c54p+0, -0x1.2ba80f41e20ddp-2 }, +{ 0x1.38131639b4cdbp+0, -0x1.249ad8332f4a7p-2 }, +{ 0x1.36987540fbf53p+0, -0x1.1d96347e7f3ebp-2 }, +{ 0x1.352166b648f61p+0, -0x1.169a0f7d6604ap-2 }, +{ 0x1.33adddb3eb575p+0, -0x1.0fa654a221909p-2 }, +{ 0x1.323dcd99fc1d3p+0, -0x1.08baefcf8251ap-2 }, +{ 0x1.30d129fefc7d2p+0, -0x1.01d7cd14deecdp-2 }, +{ 0x1.2f67e6b72fe7dp+0, -0x1.f5f9b1ad55495p-3 }, +{ 0x1.2e01f7cf8b187p+0, -0x1.e853ff76a77afp-3 }, +{ 0x1.2c9f518ddc86ep+0, -0x1.dabe5d624cba1p-3 }, +{ 0x1.2b3fe86e5f413p+0, -0x1.cd38a5cef4822p-3 }, +{ 0x1.29e3b1211b25cp+0, -0x1.bfc2b38d315f9p-3 }, +{ 0x1.288aa08b373cfp+0, -0x1.b25c61f5edd0fp-3 }, +{ 
0x1.2734abcaa8467p+0, -0x1.a5058d18e9cacp-3 }, +{ 0x1.25e1c82459b81p+0, -0x1.97be1113e47a3p-3 }, +{ 0x1.2491eb1ad59c5p+0, -0x1.8a85cafdf5e27p-3 }, +{ 0x1.23450a54048b5p+0, -0x1.7d5c97e8fc45bp-3 }, +{ 0x1.21fb1bb09e578p+0, -0x1.704255d6486e4p-3 }, +{ 0x1.20b415346d8f7p+0, -0x1.6336e2cedd7bfp-3 }, +{ 0x1.1f6fed179a1acp+0, -0x1.563a1d9b0cc6ap-3 }, +{ 0x1.1e2e99b93c7b3p+0, -0x1.494be541aaa6fp-3 }, +{ 0x1.1cf011a7a882ap+0, -0x1.3c6c1964dd0f2p-3 }, +{ 0x1.1bb44b97dba5ap+0, -0x1.2f9a99f19a243p-3 }, +{ 0x1.1a7b3e66cdd4fp+0, -0x1.22d747344446p-3 }, +{ 0x1.1944e11dc56cdp+0, -0x1.1622020d4f7f5p-3 }, +{ 0x1.18112aebb1a6ep+0, -0x1.097aabb3553f3p-3 }, +{ 0x1.16e013231b7e9p+0, -0x1.f9c24b48014c5p-4 }, +{ 0x1.15b1913f156cfp+0, -0x1.e0aaa3bdc858ap-4 }, +{ 0x1.14859cdedde13p+0, -0x1.c7ae257c952d6p-4 }, +{ 0x1.135c2dc68cfa4p+0, -0x1.aecc960a03e58p-4 }, +{ 0x1.12353bdb01684p+0, -0x1.9605bb724d541p-4 }, +{ 0x1.1110bf25b85b4p+0, -0x1.7d595ca7147cep-4 }, +{ 0x1.0feeafd2f8577p+0, -0x1.64c74165002d9p-4 }, +{ 0x1.0ecf062c51c3bp+0, -0x1.4c4f31c86d344p-4 }, +{ 0x1.0db1baa076c8bp+0, -0x1.33f0f70388258p-4 }, +{ 0x1.0c96c5bb3048ep+0, -0x1.1bac5abb3037dp-4 }, +{ 0x1.0b7e20263e070p+0, -0x1.0381272495f21p-4 }, +{ 0x1.0a67c2acd0ce3p+0, -0x1.d6de4eba2de2ap-5 }, +{ 0x1.0953a6391e982p+0, -0x1.a6ec4e8156898p-5 }, +{ 0x1.0841c3caea380p+0, -0x1.772be542e3e1bp-5 }, +{ 0x1.07321489b13eap+0, -0x1.479cadcde852dp-5 }, +{ 0x1.062491aee9904p+0, -0x1.183e4265faa5p-5 }, +{ 0x1.05193497a7cc5p+0, -0x1.d2207fdaa1b85p-6 }, +{ 0x1.040ff6b5f5e9fp+0, -0x1.742486cb4a6a2p-6 }, +{ 0x1.0308d19aa6127p+0, -0x1.1687d77cfc299p-6 }, +{ 0x1.0203beedb0c67p+0, -0x1.7293623a6b5dep-7 }, +{ 0x1.010037d38bcc2p+0, -0x1.70ec80ec8f25dp-8 }, +{ 1.0, 0.0 }, +{ 0x1.fc06d493cca10p-1, 0x1.704c1ca6b6bc9p-7 }, +{ 0x1.f81e6ac3b918fp-1, 0x1.6eac8ba664beap-6 }, +{ 0x1.f44546ef18996p-1, 0x1.11e67d040772dp-5 }, +{ 0x1.f07b10382c84bp-1, 0x1.6bc665e2105dep-5 }, +{ 0x1.ecbf7070e59d4p-1, 0x1.c4f8a9772bf1dp-5 }, +{ 0x1.e91213f715939p-1, 0x1.0ebff10fbb951p-4 }, +{ 0x1.e572a9a75f7b7p-1, 0x1.3aaf4d7805d11p-4 }, +{ 0x1.e1e0e2c530207p-1, 0x1.664ba81a4d717p-4 }, +{ 0x1.de5c72d8a8be3p-1, 0x1.9196387da6de4p-4 }, +{ 0x1.dae50fa5658ccp-1, 0x1.bc902f2b7796p-4 }, +{ 0x1.d77a71145a2dap-1, 0x1.e73ab5f584f28p-4 }, +{ 0x1.d41c51166623ep-1, 0x1.08cb78510d232p-3 }, +{ 0x1.d0ca6ba0bb29fp-1, 0x1.1dd2fe2f0dcb5p-3 }, +{ 0x1.cd847e8e59681p-1, 0x1.32b4784400df4p-3 }, +{ 0x1.ca4a499693e00p-1, 0x1.47706f3d49942p-3 }, +{ 0x1.c71b8e399e821p-1, 0x1.5c0768ee4a4dcp-3 }, +{ 0x1.c3f80faf19077p-1, 0x1.7079e86fc7c6dp-3 }, +{ 0x1.c0df92dc2b0ecp-1, 0x1.84c86e1183467p-3 }, +{ 0x1.bdd1de3cbb542p-1, 0x1.98f377a34b499p-3 }, +{ 0x1.baceb9e1007a3p-1, 0x1.acfb803bc924bp-3 }, +{ 0x1.b7d5ef543e55ep-1, 0x1.c0e10098b025fp-3 }, +{ 0x1.b4e749977d953p-1, 0x1.d4a46efe103efp-3 }, +{ 0x1.b20295155478ep-1, 0x1.e8463f45b8d0bp-3 }, +{ 0x1.af279f8e82be2p-1, 0x1.fbc6e3228997fp-3 }, +{ 0x1.ac5638197fdf3p-1, 0x1.079364f2e5aa8p-2 }, +{ 0x1.a98e2f102e087p-1, 0x1.1133306010a63p-2 }, +{ 0x1.a6cf5606d05c1p-1, 0x1.1ac309631bd17p-2 }, +{ 0x1.a4197fc04d746p-1, 0x1.24432485370c1p-2 }, +{ 0x1.a16c80293dc01p-1, 0x1.2db3b5449132fp-2 }, +{ 0x1.9ec82c4dc5bc9p-1, 0x1.3714ee1d7a32p-2 }, +{ 0x1.9c2c5a491f534p-1, 0x1.406700ab52c94p-2 }, +{ 0x1.9998e1480b618p-1, 0x1.49aa1d87522b2p-2 }, +{ 0x1.970d9977c6c2dp-1, 0x1.52de746d7ecb2p-2 }, +{ 0x1.948a5c023d212p-1, 0x1.5c0434336b343p-2 }, +{ 0x1.920f0303d6809p-1, 0x1.651b8ad6c90d1p-2 }, +{ 0x1.8f9b698a98b45p-1, 0x1.6e24a56ab5831p-2 }, +{ 0x1.8d2f6b81726f6p-1, 0x1.771fb04ec29b1p-2 }, +{ 0x1.8acae5bb55badp-1, 
0x1.800cd6f19c25ep-2 }, +{ 0x1.886db5d9275b8p-1, 0x1.88ec441df11dfp-2 }, +{ 0x1.8617ba567c13cp-1, 0x1.91be21b7c93f5p-2 }, +{ 0x1.83c8d27487800p-1, 0x1.9a8298f8c7454p-2 }, +{ 0x1.8180de3c5dbe7p-1, 0x1.a339d255c04ddp-2 }, +{ 0x1.7f3fbe71cdb71p-1, 0x1.abe3f59f43db7p-2 }, +{ 0x1.7d055498071c1p-1, 0x1.b48129deca9efp-2 }, +{ 0x1.7ad182e54f65ap-1, 0x1.bd119575364c1p-2 }, +{ 0x1.78a42c3c90125p-1, 0x1.c5955e23ebcbcp-2 }, +{ 0x1.767d342f76944p-1, 0x1.ce0ca8f4e1557p-2 }, +{ 0x1.745c7ef26b00ap-1, 0x1.d6779a5a75774p-2 }, +{ 0x1.7241f15769d0fp-1, 0x1.ded6563550d27p-2 }, +{ 0x1.702d70d396e41p-1, 0x1.e728ffafd840ep-2 }, +{ 0x1.6e1ee3700cd11p-1, 0x1.ef6fb96c8d739p-2 }, +{ 0x1.6c162fc9cbe02p-1, 0x1.f7aaa57907219p-2 }} +}; +// clang-format on diff --git a/pl/math/vn_log2_2u5.c b/pl/math/vn_log2_2u5.c new file mode 100644 index 0000000..dba524e --- /dev/null +++ b/pl/math/vn_log2_2u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_log2. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_log2, _ZGVnN2v_log2) +#include "v_log2_2u5.c" +#endif -- cgit v1.2.3 From 3d1a87e2fe152dc52d4a624425f5b2349a4088b0 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Mon, 15 Aug 2022 11:19:25 +0100 Subject: pl/math: Audit Neon special-case handlers Prevent inlining in most cases - change to use AOR style (NOINLINE). --- pl/math/v_asinhf_2u7.c | 6 ------ pl/math/v_atan2_3u.c | 2 +- pl/math/v_atan2f_3u.c | 2 +- pl/math/v_erf_2u.c | 2 +- pl/math/v_erfc_3u7.c | 2 +- pl/math/v_erfcf_1u.c | 8 +++++++- pl/math/v_erff_1u5.c | 2 +- pl/math/v_log10_2u5.c | 2 +- pl/math/v_log10f_3u5.c | 2 +- 9 files changed, 14 insertions(+), 14 deletions(-) diff --git a/pl/math/v_asinhf_2u7.c b/pl/math/v_asinhf_2u7.c index 39f7989..675b8a8 100644 --- a/pl/math/v_asinhf_2u7.c +++ b/pl/math/v_asinhf_2u7.c @@ -14,12 +14,6 @@ #define Ln2 v_f32 (0x1.62e43p-1f) #define SpecialBound v_u32 (0x5f800000) /* asuint(0x1p64). */ -static inline v_f32_t -handle_special (v_f32_t ax) -{ - return V_NAME (log1pf) (ax) + Ln2; -} - /* Single-precision implementation of vector asinh(x), using vector log1p. Worst-case error is 2.66 ULP, at roughly +/-0.25: __v_asinhf(0x1.01b04p-2) got 0x1.fe163ep-3 want 0x1.fe1638p-3. */ diff --git a/pl/math/v_atan2_3u.c b/pl/math/v_atan2_3u.c index 184b220..d69d221 100644 --- a/pl/math/v_atan2_3u.c +++ b/pl/math/v_atan2_3u.c @@ -15,7 +15,7 @@ /* Special cases i.e. 0, infinity, NaN (fall back to scalar calls). */ VPCS_ATTR -__attribute__ ((noinline)) static v_f64_t +NOINLINE static v_f64_t specialcase (v_f64_t y, v_f64_t x, v_f64_t ret, v_u64_t cmp) { return v_call2_f64 (atan2, y, x, ret, cmp); diff --git a/pl/math/v_atan2f_3u.c b/pl/math/v_atan2f_3u.c index 4212351..dc0fbca 100644 --- a/pl/math/v_atan2f_3u.c +++ b/pl/math/v_atan2f_3u.c @@ -16,7 +16,7 @@ /* Special cases i.e. 0, infinity and nan (fall back to scalar calls). */ VPCS_ATTR -__attribute__ ((noinline)) static v_f32_t +NOINLINE static v_f32_t specialcase (v_f32_t y, v_f32_t x, v_f32_t ret, v_u32_t cmp) { return v_call2_f32 (atan2f, y, x, ret, cmp); diff --git a/pl/math/v_erf_2u.c b/pl/math/v_erf_2u.c index 7a08a2c..5a7403f 100644 --- a/pl/math/v_erf_2u.c +++ b/pl/math/v_erf_2u.c @@ -16,7 +16,7 @@ /* Special cases (fall back to scalar calls). 
 */
 VPCS_ATTR
-__attribute__ ((noinline)) static v_f64_t
+NOINLINE static v_f64_t
 specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
 {
   return v_call_f64 (erf, x, y, cmp);
diff --git a/pl/math/v_erfc_3u7.c b/pl/math/v_erfc_3u7.c
index d3e80ef..4caa9f1 100644
--- a/pl/math/v_erfc_3u7.c
+++ b/pl/math/v_erfc_3u7.c
@@ -22,7 +22,7 @@ v_f64_t V_NAME (exp_tail) (v_f64_t, v_f64_t);

 /* Special cases (fall back to scalar calls).  */
 VPCS_ATTR
-__attribute__ ((noinline)) static v_f64_t
+NOINLINE static v_f64_t
 specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
 {
   return v_call_f64 (erfc, x, y, cmp);
diff --git a/pl/math/v_erfcf_1u.c b/pl/math/v_erfcf_1u.c
index 057ef5c..fc9571d 100644
--- a/pl/math/v_erfcf_1u.c
+++ b/pl/math/v_erfcf_1u.c
@@ -14,6 +14,12 @@

 VPCS_ATTR v_f64_t V_NAME (exp_tail) (v_f64_t, v_f64_t);

+static VPCS_ATTR NOINLINE v_f32_t
+specialcase (v_f32_t x, v_f32_t y, v_u32_t special)
+{
+  return v_call_f32 (erfcf, x, y, special);
+}
+
 static inline uint32_t
 interval_index (uint32_t ia12)
 {
@@ -182,7 +188,7 @@ v_f32_t V_NAME (erfcf) (v_f32_t x)

   if (unlikely (v_any_u32 (special_cases)))
     {
-      y = v_call_f32 (erfcf, x, y, special_cases);
+      return specialcase (x, y, special_cases);
     }
 #endif

diff --git a/pl/math/v_erff_1u5.c b/pl/math/v_erff_1u5.c
index 7c910bd..4407cd1 100644
--- a/pl/math/v_erff_1u5.c
+++ b/pl/math/v_erff_1u5.c
@@ -16,7 +16,7 @@ VPCS_ATTR v_f32_t V_NAME (expf) (v_f32_t);

 /* Special cases (fall back to scalar calls).  */
 VPCS_ATTR
-__attribute__ ((noinline)) static v_f32_t
+NOINLINE static v_f32_t
 specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
 {
   return v_call_f32 (erff, x, y, cmp);
diff --git a/pl/math/v_log10_2u5.c b/pl/math/v_log10_2u5.c
index 64a2b50..6991d4f 100644
--- a/pl/math/v_log10_2u5.c
+++ b/pl/math/v_log10_2u5.c
@@ -53,7 +53,7 @@ lookup (v_u64_t i)
 }

 VPCS_ATTR
-__attribute__ ((noinline)) static v_f64_t
+NOINLINE static v_f64_t
 specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
 {
   return v_call_f64 (log10, x, y, cmp);
diff --git a/pl/math/v_log10f_3u5.c b/pl/math/v_log10f_3u5.c
index e105956..c956d0c 100644
--- a/pl/math/v_log10f_3u5.c
+++ b/pl/math/v_log10f_3u5.c
@@ -32,7 +32,7 @@ static const float Poly[] = {
 #define Off v_u32 (0x3f2aaaab) /* 0.666667.  */

 VPCS_ATTR
-__attribute__ ((noinline)) static v_f32_t
+NOINLINE static v_f32_t
 specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
 {
   /* Fall back to scalar code.  */
-- cgit v1.2.3


From 6393cf41930b5f4209ded61a83b020709509b182 Mon Sep 17 00:00:00 2001
From: Joe Ramsay
Date: Wed, 17 Aug 2022 08:56:36 +0100
Subject: pl/math: Add Vector/SVE sinf

An implementation based on the Taylor series expansion of sin. The
maximum measured error is 1.89 ULPs.
---
 pl/math/include/mathlib.h      |  2 ++
 pl/math/math_config.h          |  6 ++++
 pl/math/sv_sinf_1u9.c          | 77 ++++++++++++++++++++++++++++++++++++++++++
 pl/math/sv_sinf_poly_data.c    | 19 +++++++++++
 pl/math/test/mathbench_funcs.h |  3 ++
 pl/math/test/runulp.sh         | 10 ++++++
 pl/math/test/ulp_funcs.h       |  2 ++
 pl/math/test/ulp_wrappers.h    |  7 ++++
 8 files changed, 126 insertions(+)
 create mode 100644 pl/math/sv_sinf_1u9.c
 create mode 100644 pl/math/sv_sinf_poly_data.c

diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h
index d06b2ff..afaccaa 100644
--- a/pl/math/include/mathlib.h
+++ b/pl/math/include/mathlib.h
@@ -107,9 +107,11 @@ __vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t);
 #include <arm_sve.h>
 svfloat32_t __sv_cosf_x (svfloat32_t, svbool_t);
 svfloat64_t __sv_cos_x (svfloat64_t, svbool_t);
+svfloat32_t __sv_sinf_x (svfloat32_t, svbool_t);
 /* SVE ABI names. 
*/ svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_cos (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_sinf (svfloat32_t, svbool_t); #endif #endif diff --git a/pl/math/math_config.h b/pl/math/math_config.h index e22a5b4..1a4fc21 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -490,4 +490,10 @@ extern const struct v_log2_data } tab[1 << V_LOG2_TABLE_BITS]; } __v_log2_data HIDDEN; +#define V_SINF_NCOEFFS 4 +extern const struct sv_sinf_data +{ + float coeffs[V_SINF_NCOEFFS]; +} __sv_sinf_data HIDDEN; + #endif diff --git a/pl/math/sv_sinf_1u9.c b/pl/math/sv_sinf_1u9.c new file mode 100644 index 0000000..f7913ca --- /dev/null +++ b/pl/math/sv_sinf_1u9.c @@ -0,0 +1,77 @@ +/* + * Single-precision SVE sin(x) function. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#if SV_SUPPORTED + +#define A3 (sv_f32 (__sv_sinf_data.coeffs[3])) +#define A5 (sv_f32 (__sv_sinf_data.coeffs[2])) +#define A7 (sv_f32 (__sv_sinf_data.coeffs[1])) +#define A9 (sv_f32 (__sv_sinf_data.coeffs[0])) + +#define NegPi1 (sv_f32 (-0x1.921fb6p+1f)) +#define NegPi2 (sv_f32 (0x1.777a5cp-24f)) +#define NegPi3 (sv_f32 (0x1.ee59dap-49f)) +#define RangeVal (sv_f32 (0x1p20f)) +#define InvPi (sv_f32 (0x1.45f306p-2f)) +#define Shift (sv_f32 (0x1.8p+23f)) +#define AbsMask (0x7fffffff) + +static NOINLINE sv_f32_t +__sv_sinf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp) +{ + return sv_call_f32 (sinf, x, y, cmp); +} + +/* A fast SVE implementation of sinf. + Maximum error: 1.89 ULPs. + This maximum error is achieved at multiple values in [-2^18, 2^18] + but one example is: + __sv_sinf(0x1.9247a4p+0) got 0x1.fffff6p-1 want 0x1.fffffap-1. */ +sv_f32_t +__sv_sinf_x (sv_f32_t x, const svbool_t pg) +{ + sv_f32_t n, r, r2, y; + sv_u32_t sign, odd; + svbool_t cmp; + + r = sv_as_f32_u32 (svand_n_u32_x (pg, sv_as_u32_f32 (x), AbsMask)); + sign = svand_n_u32_x (pg, sv_as_u32_f32 (x), ~AbsMask); + cmp = svcmpge_u32 (pg, sv_as_u32_f32 (r), sv_as_u32_f32 (RangeVal)); + + /* n = rint(|x|/pi). */ + n = sv_fma_f32_x (pg, InvPi, r, Shift); + odd = svlsl_n_u32_x (pg, sv_as_u32_f32 (n), 31); + n = svsub_f32_x (pg, n, Shift); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = sv_fma_f32_x (pg, NegPi1, n, r); + r = sv_fma_f32_x (pg, NegPi2, n, r); + r = sv_fma_f32_x (pg, NegPi3, n, r); + + /* sin(r) approx using a degree 9 polynomial from the Taylor series + expansion. Note that only the odd terms of this are non-zero. */ + r2 = svmul_f32_x (pg, r, r); + y = sv_fma_f32_x (pg, A9, r2, A7); + y = sv_fma_f32_x (pg, y, r2, A5); + y = sv_fma_f32_x (pg, y, r2, A3); + y = sv_fma_f32_x (pg, svmul_f32_x (pg, y, r2), r, r); + + /* sign = y^sign^odd. */ + y = sv_as_f32_u32 ( + sveor_u32_x (pg, sv_as_u32_f32 (y), sveor_u32_x (pg, sign, odd))); + + /* No need to pass pg to specialcase here since cmp is a strict subset, + guaranteed by the cmpge above. */ + if (unlikely (svptest_any (pg, cmp))) + return __sv_sinf_specialcase (x, y, cmp); + return y; +} + +strong_alias (__sv_sinf_x, _ZGVsMxv_sinf) + +#endif diff --git a/pl/math/sv_sinf_poly_data.c b/pl/math/sv_sinf_poly_data.c new file mode 100644 index 0000000..109ed58 --- /dev/null +++ b/pl/math/sv_sinf_poly_data.c @@ -0,0 +1,19 @@ +/* + * Data used in single-precision sin(x) function. + * + * Copyright (c) 2019-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Polynomial coefficients for approximating sin(x) in single + precision. These are the non-zero coefficients from the + degree 9 Taylor series expansion of sin. */ + +const struct sv_sinf_data __sv_sinf_data = {.coeffs = { + 0x1.5b2e76p-19f, + -0x1.9f42eap-13f, + 0x1.110df4p-7f, + -0x1.555548p-3f, + }}; diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index b33f369..3baeaed 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -14,6 +14,7 @@ F (erff, -4.0, 4.0) F (log10f, 0.01, 11.1) F (log1pf, -0.9, 10.0) F (log2f, 0.01, 11.1) +F (sinf, -3.1, 3.1) D (asinh, -10.0, 10.0) D (atan, -10.0, 10.0) @@ -102,6 +103,8 @@ VND (_ZGVnN2v_log2, 0.01, 11.1) #if WANT_SVE_MATH SVF (__sv_cosf_x, -3.1, 3.1) SVF (_ZGVsMxv_cosf, -3.1, 3.1) +SVF (__sv_sinf_x, -3.1, 3.1) +SVF (_ZGVsMxv_sinf, -3.1, 3.1) SVD (__sv_cos_x, -3.1, 3.1) SVD (_ZGVsMxv_cos, -3.1, 3.1) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index fef96c1..732a61b 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -139,6 +139,7 @@ runsv= if [ $WANT_SVE_MATH -eq 1 ]; then check __sv_cosf 0 && runsv=1 check __sv_cos 0 && runsv=1 +check __sv_sinf 0 && runsv=1 fi range_erfc=' @@ -270,6 +271,11 @@ range_sve_cos=' 0x1p-4 0x1p4 500000 ' +range_sve_sinf=' + 0 0xffff0000 10000 + 0x1p-4 0x1p4 500000 +' + # error limits L_erfc=3.7 L_erfcf=1.0 @@ -289,6 +295,7 @@ L_log2=3 L_sve_cosf=1.6 L_sve_cos=2.0 +L_sve_sinf=1.9 while read G F R do @@ -368,6 +375,9 @@ log2f _ZGVnN4v_log2f $runvn if [ $WANT_SVE_MATH -eq 1 ]; then sve_cosf __sv_cosf $runsv sve_cosf _ZGVsMxv_cosf $runsv +sve_sinf __sv_sinf $runsv +sve_sinf _ZGVsMxv_sinf $runsv + sve_cos __sv_cos $runsv sve_cos _ZGVsMxv_cos $runsv fi diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 7325cfd..b81609d 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -82,5 +82,7 @@ SVF1 (cos) ZSVF1 (cos) SVD1 (cos) ZSVD1 (cos) +SVF1 (sin) +ZSVF1 (sin) #endif #endif diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 98edb99..d3e08a7 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -71,6 +71,13 @@ static float sv_cosf(float x) { static float Z_sv_cosf(float x) { return svretf(_ZGVsMxv_cosf(svargf(x), svptrue_b32())); } +static float sv_sinf(float x) { + return svretf(__sv_sinf_x(svargf(x), svptrue_b32())); +} +static float Z_sv_sinf(float x) { + return svretf(_ZGVsMxv_sinf(svargf(x), svptrue_b32())); +} + static double sv_cos(double x) { return svretd(__sv_cos_x(svargd(x), svptrue_b64())); } -- cgit v1.2.3 From a10dbf0355cae0a07761f2f96586fa47446ada95 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Wed, 17 Aug 2022 08:57:10 +0100 Subject: pl/math: Add Vector/SVE sin An implementation based on SVE trigonometric instructions. It relies on a similar range reduction as Vector/Neon sin, but to [-pi/4, pi/4] instead of [-pi/2, pi/2]. The estimated maximum error is 1.95ULPs. 
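
For reference, the reduction and quadrant selection can be sketched in scalar
C. This is an illustrative sketch only, not the shipped code: sketch_sin is a
hypothetical name, plain fma() and libm sin/cos of the reduced argument stand
in for the SVE FTSMUL/FTMAD polynomial and FTSSEL factor selection, and there
is no special-case path for large (|x| >= RangeVal) or non-finite inputs:

  #include <math.h>
  #include <stdint.h>
  #include <string.h>

  static double
  sketch_sin (double x)
  {
    double r = fabs (x);
    /* q = n + Shift, so the quadrant n = rint(|x|/(pi/2)) sits in the low
       mantissa bits of q (Shift = 1.5 * 2^52).  */
    double shift = 0x1.8p52;
    double q = fma (r, 0x1.45f306dc9c882p-1, shift); /* InvPio2.  */
    double n = q - shift;
    /* r = |x| - n*(pi/2), with pi/2 split into three parts so that the
       first two products are exact for all n in range (range reduction
       into -pi/4 .. pi/4).  */
    r = fma (n, -0x1.921fb50000000p+0, r);
    r = fma (n, -0x1.110b460000000p-26, r);
    r = fma (n, -0x1.1a62633145c07p-54, r);
    /* Bit #0 of q selects sin or cos of the reduced argument, bit #1 the
       sign; this is what FTSSEL and the FTMAD polynomial encode in the
       real routine.  */
    uint64_t qi;
    memcpy (&qi, &q, sizeof (qi));
    double s = (qi & 1) ? cos (r) : sin (r);
    if (qi & 2)
      s = -s;
    return x < 0 ? -s : s; /* sin is odd.  */
  }

Splitting pi/2 across three constants with trailing zero bits keeps
n*(pi/2) free of rounding until the final term, which is what bounds the
error of the reduced argument.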
--- pl/math/include/mathlib.h | 2 ++ pl/math/sv_sin_2u.c | 82 ++++++++++++++++++++++++++++++++++++++++++ pl/math/test/mathbench_funcs.h | 3 ++ pl/math/test/runulp.sh | 9 +++++ pl/math/test/ulp_funcs.h | 2 ++ pl/math/test/ulp_wrappers.h | 6 ++++ 6 files changed, 104 insertions(+) create mode 100644 pl/math/sv_sin_2u.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index afaccaa..0c5051c 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -108,10 +108,12 @@ __vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t); svfloat32_t __sv_cosf_x (svfloat32_t, svbool_t); svfloat64_t __sv_cos_x (svfloat64_t, svbool_t); svfloat32_t __sv_sinf_x (svfloat32_t, svbool_t); +svfloat64_t __sv_sin_x (svfloat64_t, svbool_t); /* SVE ABI names. */ svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_cos (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_sinf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_sin (svfloat64_t, svbool_t); #endif #endif diff --git a/pl/math/sv_sin_2u.c b/pl/math/sv_sin_2u.c new file mode 100644 index 0000000..9c5b747 --- /dev/null +++ b/pl/math/sv_sin_2u.c @@ -0,0 +1,82 @@ +/* + * Double-precision SVE sin(x) function. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#if SV_SUPPORTED + +#define InvPi (sv_f64 (0x1.45f306dc9c883p-2)) +#define HalfPi (sv_f64 (0x1.921fb54442d18p+0)) +#define InvPio2 (sv_f64 (0x1.45f306dc9c882p-1)) +#define NegPio2_1 (sv_f64 (-0x1.921fb50000000p+0)) +#define NegPio2_2 (sv_f64 (-0x1.110b460000000p-26)) +#define NegPio2_3 (sv_f64 (-0x1.1a62633145c07p-54)) +#define Shift (sv_f64 (0x1.8p52)) +#define RangeVal (sv_f64 (0x1p23)) +#define AbsMask (0x7fffffffffffffff) + +static NOINLINE sv_f64_t +__sv_sin_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp) +{ + return sv_call_f64 (sin, x, y, cmp); +} + +/* A fast SVE implementation of sin based on trigonometric + instructions (FTMAD, FTSSEL, FTSMUL). + Maximum measured error: 1.95 ULPs + __sv_sin(0x1.0abe696a98052p+19) got -0x1.ff302079d96a4p-3 + want -0x1.ff302079d96a2p-3. */ +sv_f64_t +__sv_sin_x (sv_f64_t x, const svbool_t pg) +{ + sv_f64_t n, r, r2, y; + sv_u64_t sign; + svbool_t cmp; + + r = sv_as_f64_u64 (svand_n_u64_x (pg, sv_as_u64_f64 (x), AbsMask)); + sign = svand_n_u64_x (pg, sv_as_u64_f64 (x), ~AbsMask); + cmp = svcmpge_u64 (pg, sv_as_u64_f64 (r), sv_as_u64_f64 (RangeVal)); + + /* n = rint(|x|/(pi/2)). */ + sv_f64_t q = sv_fma_f64_x (pg, InvPio2, r, Shift); + n = svsub_f64_x (pg, q, Shift); + + /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). */ + r = sv_fma_f64_x (pg, NegPio2_1, n, r); + r = sv_fma_f64_x (pg, NegPio2_2, n, r); + r = sv_fma_f64_x (pg, NegPio2_3, n, r); + + /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */ + sv_f64_t f = svtssel_f64 (r, sv_as_u64_f64 (q)); + + /* sin(r) poly approx. */ + r2 = svtsmul_f64 (r, sv_as_u64_f64 (q)); + y = sv_f64 (0.0); + y = svtmad_f64 (y, r2, 7); + y = svtmad_f64 (y, r2, 6); + y = svtmad_f64 (y, r2, 5); + y = svtmad_f64 (y, r2, 4); + y = svtmad_f64 (y, r2, 3); + y = svtmad_f64 (y, r2, 2); + y = svtmad_f64 (y, r2, 1); + y = svtmad_f64 (y, r2, 0); + + /* Apply factor. */ + y = svmul_f64_x (pg, f, y); + + /* sign = y^sign. */ + y = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (y), sign)); + + /* No need to pass pg to specialcase here since cmp is a strict subset, + guaranteed by the cmpge above. 
*/ + if (unlikely (svptest_any (pg, cmp))) + return __sv_sin_specialcase (x, y, cmp); + return y; +} + +strong_alias (__sv_sin_x, _ZGVsMxv_sin) + +#endif diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 3baeaed..689f4c9 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -25,6 +25,7 @@ D (erfc, -6.0, 28.0) D (log10, 0.01, 11.1) D (log1p, -0.9, 10.0) D (log2, 0.01, 11.1) +D (sin, -3.1, 3.1) #if WANT_VMATH F (__s_asinhf, -10.0, 10.0) @@ -108,6 +109,8 @@ SVF (_ZGVsMxv_sinf, -3.1, 3.1) SVD (__sv_cos_x, -3.1, 3.1) SVD (_ZGVsMxv_cos, -3.1, 3.1) +SVD (__sv_sin_x, -3.1, 3.1) +SVD (_ZGVsMxv_sin, -3.1, 3.1) #endif #endif // clang-format on diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 732a61b..c077c4f 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -140,6 +140,7 @@ if [ $WANT_SVE_MATH -eq 1 ]; then check __sv_cosf 0 && runsv=1 check __sv_cos 0 && runsv=1 check __sv_sinf 0 && runsv=1 +check __sv_sin 0 && runsv=1 fi range_erfc=' @@ -276,6 +277,11 @@ range_sve_sinf=' 0x1p-4 0x1p4 500000 ' +range_sve_sin=' + 0 0xffff0000 10000 + 0x1p-4 0x1p4 500000 +' + # error limits L_erfc=3.7 L_erfcf=1.0 @@ -296,6 +302,7 @@ L_log2=3 L_sve_cosf=1.6 L_sve_cos=2.0 L_sve_sinf=1.9 +L_sve_sin=2.0 while read G F R do @@ -380,6 +387,8 @@ sve_sinf _ZGVsMxv_sinf $runsv sve_cos __sv_cos $runsv sve_cos _ZGVsMxv_cos $runsv +sve_sin __sv_sin $runsv +sve_sin _ZGVsMxv_sin $runsv fi EOF diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index b81609d..5d67863 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -84,5 +84,7 @@ SVD1 (cos) ZSVD1 (cos) SVF1 (sin) ZSVF1 (sin) +SVD1 (sin) +ZSVD1 (sin) #endif #endif diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index d3e08a7..d3c8bd6 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -84,6 +84,12 @@ static double sv_cos(double x) { static double Z_sv_cos(double x) { return svretd(_ZGVsMxv_cos(svargd(x), svptrue_b64())); } +static double sv_sin(double x) { + return svretd(__sv_sin_x(svargd(x), svptrue_b64())); +} +static double Z_sv_sin(double x) { + return svretd(_ZGVsMxv_sin(svargd(x), svptrue_b64())); +} #endif #endif // clang-format on -- cgit v1.2.3 From 5a746bfe0f8fee6a266355c25b767bbd275e452f Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Wed, 17 Aug 2022 16:23:43 +0100 Subject: pl/math: Add vector/SVE atanf New routine uses polynomial on a reduced interval, and is accurate to 2.9 ulp. --- pl/math/include/mathlib.h | 2 ++ pl/math/sv_atanf_2u9.c | 49 ++++++++++++++++++++++++++++++++++++++++++ pl/math/sv_atanf_common.h | 47 ++++++++++++++++++++++++++++++++++++++++ pl/math/test/mathbench_funcs.h | 3 +++ pl/math/test/runulp.sh | 19 ++++++++++++---- pl/math/test/ulp_funcs.h | 2 ++ pl/math/test/ulp_wrappers.h | 6 ++++++ 7 files changed, 124 insertions(+), 4 deletions(-) create mode 100644 pl/math/sv_atanf_2u9.c create mode 100644 pl/math/sv_atanf_common.h diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 0c5051c..b8fea7f 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -105,11 +105,13 @@ __vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t); #if WANT_SVE_MATH #include +svfloat32_t __sv_atanf_x (svfloat32_t, svbool_t); svfloat32_t __sv_cosf_x (svfloat32_t, svbool_t); svfloat64_t __sv_cos_x (svfloat64_t, svbool_t); svfloat32_t __sv_sinf_x (svfloat32_t, svbool_t); svfloat64_t __sv_sin_x (svfloat64_t, svbool_t); /* SVE ABI names. 
*/ +svfloat32_t _ZGVsMxv_atanf (svfloat32_t, svbool_t); svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_cos (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_sinf (svfloat32_t, svbool_t); diff --git a/pl/math/sv_atanf_2u9.c b/pl/math/sv_atanf_2u9.c new file mode 100644 index 0000000..d195ca5 --- /dev/null +++ b/pl/math/sv_atanf_2u9.c @@ -0,0 +1,49 @@ +/* + * Single-precision vector atan(x) function. + * + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#if SV_SUPPORTED + +#include "sv_atanf_common.h" + +#define PiOver2 sv_f32 (0x1.921fb6p+0f) +#define AbsMask (0x7fffffff) + +/* Fast implementation of SVE atanf based on + atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using + z=-1/x and shift = pi/2. + Largest observed error is 2.9 ULP, close to +/-1.0: + __sv_atanf(0x1.0468f6p+0) got -0x1.967f06p-1 + want -0x1.967fp-1. */ +sv_f32_t +__sv_atanf_x (sv_f32_t x, const svbool_t pg) +{ + /* No need to trigger special case. Small cases, infs and nans + are supported by our approximation technique. */ + sv_u32_t ix = sv_as_u32_f32 (x); + sv_u32_t sign = svand_n_u32_x (pg, ix, ~AbsMask); + + /* Argument reduction: + y := arctan(x) for x < 1 + y := pi/2 + arctan(-1/x) for x > 1 + Hence, use z=-1/a if x>=1, otherwise z=a. */ + svbool_t red = svacgt_n_f32 (pg, x, 1.0f); + /* Avoid dependency in abs(x) in division (and comparison). */ + sv_f32_t z = svsel_f32 (red, svdiv_f32_x (pg, sv_f32 (-1.0f), x), x); + /* Use absolute value only when needed (odd powers of z). */ + sv_f32_t az = svabs_f32_x (pg, z); + az = svneg_f32_m (az, red, az); + + sv_f32_t y = __sv_atanf_common (pg, red, z, az, PiOver2); + + /* y = atan(x) if x>0, -atan(-x) otherwise. */ + return sv_as_f32_u32 (sveor_u32_x (pg, sv_as_u32_f32 (y), sign)); +} + +strong_alias (__sv_atanf_x, _ZGVsMxv_atanf) + +#endif diff --git a/pl/math/sv_atanf_common.h b/pl/math/sv_atanf_common.h new file mode 100644 index 0000000..869a257 --- /dev/null +++ b/pl/math/sv_atanf_common.h @@ -0,0 +1,47 @@ +/* + * Single-precision polynomial evaluation function for SVE atan(x) and + * atan2(y,x). + * + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_SV_ATANF_COMMON_H +#define PL_MATH_SV_ATANF_COMMON_H + +#include "math_config.h" +#include "sv_math.h" + +#define P(i) sv_f32 (__atanf_poly_data.poly[i]) + +/* Polynomial used in fast SVE atanf(x) and atan2f(y,x) implementations + The order 7 polynomial P approximates (f(sqrt(x))-sqrt(x))/x^(3/2). */ +static inline sv_f32_t +__sv_atanf_common (svbool_t pg, svbool_t red, sv_f32_t z, sv_f32_t az, + sv_f32_t shift) +{ + /* Use full Estrin scheme for P(z^2) with deg(P)=7. */ + + /* First compute square powers of z. */ + sv_f32_t z2 = svmul_f32_x (pg, z, z); + sv_f32_t z4 = svmul_f32_x (pg, z2, z2); + sv_f32_t z8 = svmul_f32_x (pg, z4, z4); + + /* Then assemble polynomial. */ + sv_f32_t p_4_7 = sv_fma_f32_x (pg, z4, (sv_fma_f32_x (pg, z2, P (7), P (6))), + (sv_fma_f32_x (pg, z2, P (5), P (4)))); + sv_f32_t p_0_3 = sv_fma_f32_x (pg, z4, (sv_fma_f32_x (pg, z2, P (3), P (2))), + (sv_fma_f32_x (pg, z2, P (1), P (0)))); + sv_f32_t y = sv_fma_f32_x (pg, z8, p_4_7, p_0_3); + + /* Finalize. y = shift + z + z^3 * P(z^2). */ + sv_f32_t z3 = svmul_f32_x (pg, z2, az); + y = sv_fma_f32_x (pg, y, z3, az); + + /* Apply shift as indicated by 'red' predicate. 
*/ + y = svadd_f32_m (red, y, shift); + + return y; +} + +#endif // PL_MATH_SV_ATANF_COMMON_H diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 689f4c9..13b6d5f 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -102,6 +102,9 @@ VND (_ZGVnN2v_log2, 0.01, 11.1) #endif #endif #if WANT_SVE_MATH +SVF (__sv_atanf_x, -3.1, 3.1) +SVF (_ZGVsMxv_atanf, -3.1, 3.1) + SVF (__sv_cosf_x, -3.1, 3.1) SVF (_ZGVsMxv_cosf, -3.1, 3.1) SVF (__sv_sinf_x, -3.1, 3.1) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index c077c4f..f34769e 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -282,6 +282,14 @@ range_sve_sin=' 0x1p-4 0x1p4 500000 ' +range_sve_atanf=' + -10.0 10.0 50000 + -1.0 1.0 40000 + 0.0 1.0 40000 + 1.0 100.0 40000 + 1e6 1e32 40000 +' + # error limits L_erfc=3.7 L_erfcf=1.0 @@ -303,6 +311,7 @@ L_sve_cosf=1.6 L_sve_cos=2.0 L_sve_sinf=1.9 L_sve_sin=2.0 +L_sve_atanf=2.9 while read G F R do @@ -380,10 +389,12 @@ log2f __vn_log2f $runvn log2f _ZGVnN4v_log2f $runvn if [ $WANT_SVE_MATH -eq 1 ]; then -sve_cosf __sv_cosf $runsv -sve_cosf _ZGVsMxv_cosf $runsv -sve_sinf __sv_sinf $runsv -sve_sinf _ZGVsMxv_sinf $runsv +sve_cosf __sv_cosf $runsv +sve_cosf _ZGVsMxv_cosf $runsv +sve_sinf __sv_sinf $runsv +sve_sinf _ZGVsMxv_sinf $runsv +sve_atanf __sv_atanf $runsv +sve_atanf _ZGVsMxv_atanf $runsv sve_cos __sv_cos $runsv sve_cos _ZGVsMxv_cos $runsv diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 5d67863..aa99657 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -78,6 +78,8 @@ F (_ZGVnN2v_log2, Z_log2, log2l, mpfr_log2, 1, 0, d1, 1) #endif #endif #if WANT_SVE_MATH +SVF1 (atan) +ZSVF1 (atan) SVF1 (cos) ZSVF1 (cos) SVD1 (cos) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index d3c8bd6..e962573 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -65,6 +65,12 @@ static double Z_log10(double x) { return _ZGVnN2v_log10(argd(x))[0]; } static double Z_log2(double x) { return _ZGVnN2v_log2(argd(x))[0]; } #endif #if WANT_SVE_MATH +static float sv_atanf(float x) { + return svretf(__sv_atanf_x(svargf(x), svptrue_b32())); +} +static float Z_sv_atanf(float x) { + return svretf(_ZGVsMxv_atanf(svargf(x), svptrue_b32())); +} static float sv_cosf(float x) { return svretf(__sv_cosf_x(svargf(x), svptrue_b32())); } -- cgit v1.2.3 From 15f280615aa2db05f47c6ce11b8dd9bef502b2e7 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Wed, 17 Aug 2022 16:25:20 +0100 Subject: pl/math: Add vector/SVE atan New routine uses polynomial on a reduced interval, and is accurate to 2.5 ulp. 
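The shared helper (sv_atan_common.h) evaluates the degree-19 polynomial with a full Estrin scheme rather than Horner's. As a scalar illustration of the idea, shown at degree 7 to keep it short (c[] is a placeholder array, not the real __atan_poly_data table):

    /* Estrin evaluation of c[0] + c[1]*z2 + ... + c[7]*z2^7: pair the
       terms with FMAs, then combine the pairs using z2^2 and z2^4.
       Unlike Horner's single serial chain, the four level-1 operations
       are independent, which suits wide out-of-order and vector
       pipelines.  */
    static double
    estrin_deg7 (double z2, const double c[8])
    {
      double z4 = z2 * z2;
      double z8 = z4 * z4;
      double p01 = c[0] + z2 * c[1];  /* level 1: independent pairs */
      double p23 = c[2] + z2 * c[3];
      double p45 = c[4] + z2 * c[5];
      double p67 = c[6] + z2 * c[7];
      double p03 = p01 + z4 * p23;    /* level 2 */
      double p47 = p45 + z4 * p67;
      return p03 + z8 * p47;          /* level 3 */
    }

The degree-19 version in sv_atan_common.h has the same shape with one more level (an x8 = z2^8 combine).
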
--- pl/math/include/mathlib.h | 2 ++ pl/math/sv_atan_2u5.c | 52 +++++++++++++++++++++++++++++++++++ pl/math/sv_atan_common.h | 61 ++++++++++++++++++++++++++++++++++++++++++ pl/math/test/mathbench_funcs.h | 2 ++ pl/math/test/runulp.sh | 19 ++++++++++--- pl/math/test/ulp_funcs.h | 2 ++ pl/math/test/ulp_wrappers.h | 6 +++++ 7 files changed, 140 insertions(+), 4 deletions(-) create mode 100644 pl/math/sv_atan_2u5.c create mode 100644 pl/math/sv_atan_common.h diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index b8fea7f..9a58fd6 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -106,12 +106,14 @@ __vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t); #if WANT_SVE_MATH #include svfloat32_t __sv_atanf_x (svfloat32_t, svbool_t); +svfloat64_t __sv_atan_x (svfloat64_t, svbool_t); svfloat32_t __sv_cosf_x (svfloat32_t, svbool_t); svfloat64_t __sv_cos_x (svfloat64_t, svbool_t); svfloat32_t __sv_sinf_x (svfloat32_t, svbool_t); svfloat64_t __sv_sin_x (svfloat64_t, svbool_t); /* SVE ABI names. */ svfloat32_t _ZGVsMxv_atanf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_atan (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_cos (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_sinf (svfloat32_t, svbool_t); diff --git a/pl/math/sv_atan_2u5.c b/pl/math/sv_atan_2u5.c new file mode 100644 index 0000000..aa741f7 --- /dev/null +++ b/pl/math/sv_atan_2u5.c @@ -0,0 +1,52 @@ +/* + * Double-precision vector atan(x) function. + * + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#if SV_SUPPORTED + +#include "sv_atan_common.h" + +/* Useful constants. */ +#define PiOver2 sv_f64 (0x1.921fb54442d18p+0) +#define AbsMask (0x7fffffffffffffff) + +/* Fast implementation of SVE atan. + Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using + z=1/x and shift = pi/2. Largest errors are close to 1. The maximum observed + is 2.2 ulps: + __sv_atan(0x1.00050804cdc8cp+0) got 0x1.9224bd3c68773p-1 + want 0x1.9224bd3c68775p-1. */ +sv_f64_t +__sv_atan_x (sv_f64_t x, const svbool_t pg) +{ + /* No need to trigger special case. Small cases, infs and nans + are supported by our approximation technique. */ + sv_u64_t ix = sv_as_u64_f64 (x); + sv_u64_t sign = svand_n_u64_x (pg, ix, ~AbsMask); + + /* Argument reduction: + y := arctan(x) for x < 1 + y := pi/2 + arctan(-1/x) for x > 1 + Hence, use z=-1/a if x>=1, otherwise z=a. */ + svbool_t red = svacgt_n_f64 (pg, x, 1.0); + /* Avoid dependency in abs(x) in division (and comparison). */ + sv_f64_t z = svsel_f64 (red, svdiv_f64_x (pg, sv_f64 (-1.0), x), x); + /* Use absolute value only when needed (odd powers of z). */ + sv_f64_t az = svabs_f64_x (pg, z); + az = svneg_f64_m (az, red, az); + + sv_f64_t y = __sv_atan_common (pg, red, z, az, PiOver2); + + /* y = atan(x) if x>0, -atan(-x) otherwise. */ + y = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (y), sign)); + + return y; +} + +strong_alias (__sv_atan_x, _ZGVsMxv_atan) + +#endif diff --git a/pl/math/sv_atan_common.h b/pl/math/sv_atan_common.h new file mode 100644 index 0000000..53cdbc7 --- /dev/null +++ b/pl/math/sv_atan_common.h @@ -0,0 +1,61 @@ +/* + * Double-precision polynomial evaluation function for SVE atan(x) and + * atan2(y,x). + * + * Copyright (c) 2021-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "sv_math.h" + +#define P(i) sv_f64 (__atan_poly_data.poly[i]) + +/* Polynomial used in fast SVE atan(x) and atan2(y,x) implementations + The order 19 polynomial P approximates (atan(sqrt(x))-sqrt(x))/x^(3/2). */ +static inline sv_f64_t +__sv_atan_common (svbool_t pg, svbool_t red, sv_f64_t z, sv_f64_t az, + sv_f64_t shift) +{ + /* Use full Estrin scheme for P(z^2) with deg(P)=19. */ + sv_f64_t z2 = svmul_f64_x (pg, z, z); + + /* Level 1. */ + sv_f64_t P_1_0 = sv_fma_f64_x (pg, P (1), z2, P (0)); + sv_f64_t P_3_2 = sv_fma_f64_x (pg, P (3), z2, P (2)); + sv_f64_t P_5_4 = sv_fma_f64_x (pg, P (5), z2, P (4)); + sv_f64_t P_7_6 = sv_fma_f64_x (pg, P (7), z2, P (6)); + sv_f64_t P_9_8 = sv_fma_f64_x (pg, P (9), z2, P (8)); + sv_f64_t P_11_10 = sv_fma_f64_x (pg, P (11), z2, P (10)); + sv_f64_t P_13_12 = sv_fma_f64_x (pg, P (13), z2, P (12)); + sv_f64_t P_15_14 = sv_fma_f64_x (pg, P (15), z2, P (14)); + sv_f64_t P_17_16 = sv_fma_f64_x (pg, P (17), z2, P (16)); + sv_f64_t P_19_18 = sv_fma_f64_x (pg, P (19), z2, P (18)); + + /* Level 2. */ + sv_f64_t x2 = svmul_f64_x (pg, z2, z2); + sv_f64_t P_3_0 = sv_fma_f64_x (pg, P_3_2, x2, P_1_0); + sv_f64_t P_7_4 = sv_fma_f64_x (pg, P_7_6, x2, P_5_4); + sv_f64_t P_11_8 = sv_fma_f64_x (pg, P_11_10, x2, P_9_8); + sv_f64_t P_15_12 = sv_fma_f64_x (pg, P_15_14, x2, P_13_12); + sv_f64_t P_19_16 = sv_fma_f64_x (pg, P_19_18, x2, P_17_16); + + /* Level 3. */ + sv_f64_t x4 = svmul_f64_x (pg, x2, x2); + sv_f64_t P_7_0 = sv_fma_f64_x (pg, P_7_4, x4, P_3_0); + sv_f64_t P_15_8 = sv_fma_f64_x (pg, P_15_12, x4, P_11_8); + + /* Level 4. */ + sv_f64_t x8 = svmul_f64_x (pg, x4, x4); + sv_f64_t y = sv_fma_f64_x (pg, P_19_16, x8, P_15_8); + y = sv_fma_f64_x (pg, y, x8, P_7_0); + + /* Finalize. y = shift + z + z^3 * P(z^2). */ + sv_f64_t z3 = svmul_f64_x (pg, z2, az); + y = sv_fma_f64_x (pg, y, z3, az); + + /* Apply shift as indicated by `red` predicate. 
*/ + y = svadd_f64_m (red, y, shift); + + return y; +} diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 13b6d5f..eb3cded 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -104,6 +104,8 @@ VND (_ZGVnN2v_log2, 0.01, 11.1) #if WANT_SVE_MATH SVF (__sv_atanf_x, -3.1, 3.1) SVF (_ZGVsMxv_atanf, -3.1, 3.1) +SVD (__sv_atan_x, -3.1, 3.1) +SVD (_ZGVsMxv_atan, -3.1, 3.1) SVF (__sv_cosf_x, -3.1, 3.1) SVF (_ZGVsMxv_cosf, -3.1, 3.1) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index f34769e..4ceac24 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -290,6 +290,14 @@ range_sve_atanf=' 1e6 1e32 40000 ' +range_sve_atan=' + -10.0 10.0 50000 + -1.0 1.0 40000 + 0.0 1.0 40000 + 1.0 100.0 40000 + 1e6 1e32 40000 +' + # error limits L_erfc=3.7 L_erfcf=1.0 @@ -312,6 +320,7 @@ L_sve_cos=2.0 L_sve_sinf=1.9 L_sve_sin=2.0 L_sve_atanf=2.9 +L_sve_atan=2.5 while read G F R do @@ -396,10 +405,12 @@ sve_sinf _ZGVsMxv_sinf $runsv sve_atanf __sv_atanf $runsv sve_atanf _ZGVsMxv_atanf $runsv -sve_cos __sv_cos $runsv -sve_cos _ZGVsMxv_cos $runsv -sve_sin __sv_sin $runsv -sve_sin _ZGVsMxv_sin $runsv +sve_cos __sv_cos $runsv +sve_cos _ZGVsMxv_cos $runsv +sve_sin __sv_sin $runsv +sve_sin _ZGVsMxv_sin $runsv +sve_atan __sv_atan $runsv +sve_atan _ZGVsMxv_atan $runsv fi EOF diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index aa99657..697d1cb 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -80,6 +80,8 @@ F (_ZGVnN2v_log2, Z_log2, log2l, mpfr_log2, 1, 0, d1, 1) #if WANT_SVE_MATH SVF1 (atan) ZSVF1 (atan) +SVD1 (atan) +ZSVD1 (atan) SVF1 (cos) ZSVF1 (cos) SVD1 (cos) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index e962573..4e02feb 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -84,6 +84,12 @@ static float Z_sv_sinf(float x) { return svretf(_ZGVsMxv_sinf(svargf(x), svptrue_b32())); } +static double sv_atan(double x) { + return svretd(__sv_atan_x(svargd(x), svptrue_b64())); +} +static double Z_sv_atan(double x) { + return svretd(_ZGVsMxv_atan(svargd(x), svptrue_b64())); +} static double sv_cos(double x) { return svretd(__sv_cos_x(svargd(x), svptrue_b64())); } -- cgit v1.2.3 From 8e0882f0bc93e7df04c44d77abf14af502159f52 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Wed, 17 Aug 2022 16:25:49 +0100 Subject: pl/math: Add vector/SVE atan2f New routine is accurate to 3 ulps. --- pl/math/include/mathlib.h | 2 + pl/math/sv_atan2f_3u.c | 83 +++++++++++++++++++++++++++++++++++++++ pl/math/sv_math.h | 17 ++++++++ pl/math/test/mathbench_funcs.h | 3 ++ pl/math/test/mathbench_wrappers.h | 16 ++++++++ pl/math/test/runulp.sh | 11 ++++++ pl/math/test/ulp_funcs.h | 2 + pl/math/test/ulp_wrappers.h | 6 +++ 8 files changed, 140 insertions(+) create mode 100644 pl/math/sv_atan2f_3u.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 9a58fd6..665d6fe 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -105,6 +105,7 @@ __vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t); #if WANT_SVE_MATH #include +svfloat32_t __sv_atan2f_x (svfloat32_t, svfloat32_t, svbool_t); svfloat32_t __sv_atanf_x (svfloat32_t, svbool_t); svfloat64_t __sv_atan_x (svfloat64_t, svbool_t); svfloat32_t __sv_cosf_x (svfloat32_t, svbool_t); @@ -112,6 +113,7 @@ svfloat64_t __sv_cos_x (svfloat64_t, svbool_t); svfloat32_t __sv_sinf_x (svfloat32_t, svbool_t); svfloat64_t __sv_sin_x (svfloat64_t, svbool_t); /* SVE ABI names. 
*/ +svfloat32_t _ZGVsMxvv_atan2f (svfloat32_t, svfloat32_t, svbool_t); svfloat32_t _ZGVsMxv_atanf (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_atan (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t); diff --git a/pl/math/sv_atan2f_3u.c b/pl/math/sv_atan2f_3u.c new file mode 100644 index 0000000..5f93c49 --- /dev/null +++ b/pl/math/sv_atan2f_3u.c @@ -0,0 +1,83 @@ +/* + * Single-precision vector atan2f(x) function. + * + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#if SV_SUPPORTED + +#include "sv_atanf_common.h" + +/* Useful constants. */ +#define PiOver2 sv_f32 (0x1.921fb6p+0f) +#define SignMask sv_u32 (0x80000000) + +/* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */ +static inline sv_f32_t +specialcase (sv_f32_t y, sv_f32_t x, sv_f32_t ret, const svbool_t cmp) +{ + return sv_call2_f32 (atan2f, y, x, ret, cmp); +} + +/* Returns a predicate indicating true if the input is the bit representation of + 0, infinity or nan. */ +static inline svbool_t +zeroinfnan (sv_u32_t i, const svbool_t pg) +{ + return svcmpge_u32 (pg, svsub_n_u32_x (pg, svlsl_n_u32_x (pg, i, 1), 1), + sv_u32 (2 * 0x7f800000lu - 1)); +} + +/* Fast implementation of SVE atan2f based on atan(x) ~ shift + z + z^3 * P(z^2) + with reduction to [0,1] using z=1/x and shift = pi/2. + Maximum observed error is 2.95 ULP: + __sv_atan2f(0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1 + want 0x1.967f00p-1. */ +sv_f32_t +__sv_atan2f_x (sv_f32_t y, sv_f32_t x, const svbool_t pg) +{ + sv_u32_t ix = sv_as_u32_f32 (x); + sv_u32_t iy = sv_as_u32_f32 (y); + + svbool_t cmp_x = zeroinfnan (ix, pg); + svbool_t cmp_y = zeroinfnan (iy, pg); + svbool_t cmp_xy = svorr_b_z (pg, cmp_x, cmp_y); + + sv_u32_t sign_x = svand_u32_x (pg, ix, SignMask); + sv_u32_t sign_y = svand_u32_x (pg, iy, SignMask); + sv_u32_t sign_xy = sveor_u32_x (pg, sign_x, sign_y); + + sv_f32_t ax = svabs_f32_x (pg, x); + sv_f32_t ay = svabs_f32_x (pg, y); + + svbool_t pred_xlt0 = svcmplt_f32 (pg, x, sv_f32 (0.0)); + svbool_t pred_aygtax = svcmpgt_f32 (pg, ay, ax); + + /* Set up z for call to atan. */ + sv_f32_t n = svsel_f32 (pred_aygtax, svneg_f32_x (pg, ax), ay); + sv_f32_t d = svsel_f32 (pred_aygtax, ay, ax); + sv_f32_t z = svdiv_f32_x (pg, n, d); + + /* Work out the correct shift. */ + sv_f32_t shift = svsel_f32 (pred_xlt0, sv_f32 (-2.0), sv_f32 (0.0)); + shift = svsel_f32 (pred_aygtax, svadd_n_f32_x (pg, shift, 1.0), shift); + shift = svmul_f32_x (pg, shift, PiOver2); + + sv_f32_t ret = __sv_atanf_common (pg, pg, z, z, shift); + + /* Account for the sign of x and y. 
*/ + ret = sv_as_f32_u32 (sveor_u32_x (pg, sv_as_u32_f32 (ret), sign_xy)); + + if (unlikely (svptest_any (pg, cmp_xy))) + { + return specialcase (y, x, ret, cmp_xy); + } + + return ret; +} + +strong_alias (__sv_atan2f_x, _ZGVsMxvv_atan2f) + +#endif diff --git a/pl/math/sv_math.h b/pl/math/sv_math.h index 14919be..4a7a3fb 100644 --- a/pl/math/sv_math.h +++ b/pl/math/sv_math.h @@ -155,6 +155,23 @@ sv_call_f32 (f32_t (*f) (f32_t), sv_f32_t x, sv_f32_t y, svbool_t cmp) return y; } +static inline sv_f32_t +sv_call2_f32 (f32_t (*f) (f32_t, f32_t), sv_f32_t x1, sv_f32_t x2, sv_f32_t y, + svbool_t cmp) +{ + svbool_t p = svpfirst (cmp, svpfalse ()); + while (svptest_any (cmp, p)) + { + f32_t elem1 = svclastb_n_f32 (p, 0, x1); + f32_t elem2 = svclastb_n_f32 (p, 0, x2); + f32_t ret = (*f) (elem1, elem2); + sv_f32_t y2 = svdup_n_f32 (ret); + y = svsel_f32 (p, y2, y); + p = svpnext_b32 (cmp, p); + } + return y; +} + #endif #endif #endif diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index eb3cded..eb8dc4d 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -107,6 +107,9 @@ SVF (_ZGVsMxv_atanf, -3.1, 3.1) SVD (__sv_atan_x, -3.1, 3.1) SVD (_ZGVsMxv_atan, -3.1, 3.1) +{"__sv_atan2f_x", 'f', 'n', -10.0, 10.0, {.svf = __sv_atan2f_wrap}}, +{"_ZGVsMxvv_atan2f", 'f', 'n', -10.0, 10.0, {.svf = _Z_sv_atan2f_wrap}}, + SVF (__sv_cosf_x, -3.1, 3.1) SVF (_ZGVsMxv_cosf, -3.1, 3.1) SVF (__sv_sinf_x, -3.1, 3.1) diff --git a/pl/math/test/mathbench_wrappers.h b/pl/math/test/mathbench_wrappers.h index 37edeae..af5fa2a 100644 --- a/pl/math/test/mathbench_wrappers.h +++ b/pl/math/test/mathbench_wrappers.h @@ -73,3 +73,19 @@ _Z_atan2f_wrap (v_float x) #endif // __vpcs #endif // __arch64__ #endif // WANT_VMATH + +#if WANT_SVE_MATH + +static sv_float +__sv_atan2f_wrap (sv_float x, sv_bool pg) +{ + return __sv_atan2f_x (x, svdup_n_f32 (5.0f), pg); +} + +static sv_float +_Z_sv_atan2f_wrap (sv_float x, sv_bool pg) +{ + return _ZGVsMxvv_atan2f (x, svdup_n_f32 (5.0f), pg); +} + +#endif // WANT_SVE_MATH diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 4ceac24..7473005 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -298,6 +298,14 @@ range_sve_atan=' 1e6 1e32 40000 ' +range_sve_atan2f=' + -10.0 10.0 50000 + -1.0 1.0 40000 + 0.0 1.0 40000 + 1.0 100.0 40000 + 1e6 1e32 40000 +' + # error limits L_erfc=3.7 L_erfcf=1.0 @@ -321,6 +329,7 @@ L_sve_sinf=1.9 L_sve_sin=2.0 L_sve_atanf=2.9 L_sve_atan=2.5 +L_sve_atan2f=3.0 while read G F R do @@ -402,6 +411,8 @@ sve_cosf __sv_cosf $runsv sve_cosf _ZGVsMxv_cosf $runsv sve_sinf __sv_sinf $runsv sve_sinf _ZGVsMxv_sinf $runsv +sve_atan2f __sv_atan2f $runsv +sve_atan2f _ZGVsMxvv_atan2f $runsv sve_atanf __sv_atanf $runsv sve_atanf _ZGVsMxv_atanf $runsv diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 697d1cb..3db230d 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -78,6 +78,8 @@ F (_ZGVnN2v_log2, Z_log2, log2l, mpfr_log2, 1, 0, d1, 1) #endif #endif #if WANT_SVE_MATH +SVF2 (atan2) +ZSVF2 (atan2) SVF1 (atan) ZSVF1 (atan) SVD1 (atan) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 4e02feb..0d72e51 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -65,6 +65,12 @@ static double Z_log10(double x) { return _ZGVnN2v_log10(argd(x))[0]; } static double Z_log2(double x) { return _ZGVnN2v_log2(argd(x))[0]; } #endif #if WANT_SVE_MATH +static float sv_atan2f(float x, float y) { + return svretf(__sv_atan2f_x(svargf(x), 
svargf(y), svptrue_b32())); +} +static float Z_sv_atan2f(float x, float y) { + return svretf(_ZGVsMxvv_atan2f(svargf(x), svargf(y), svptrue_b32())); +} static float sv_atanf(float x) { return svretf(__sv_atanf_x(svargf(x), svptrue_b32())); } -- cgit v1.2.3 From 1d89678cd83c42953467c1405bf596362a7c96c0 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Wed, 17 Aug 2022 16:25:55 +0100 Subject: pl/math: Add vector/SVE atan2 New routine is accurate to 2 ulps. --- pl/math/include/mathlib.h | 2 + pl/math/sv_atan2_2u.c | 82 +++++++++++++++++++++++++++++++++++++++ pl/math/sv_math.h | 17 ++++++++ pl/math/test/mathbench_funcs.h | 2 + pl/math/test/mathbench_wrappers.h | 12 ++++++ pl/math/test/runulp.sh | 11 ++++++ pl/math/test/ulp_funcs.h | 2 + pl/math/test/ulp_wrappers.h | 6 +++ 8 files changed, 134 insertions(+) create mode 100644 pl/math/sv_atan2_2u.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 665d6fe..266d2a1 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -108,6 +108,7 @@ __vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t); svfloat32_t __sv_atan2f_x (svfloat32_t, svfloat32_t, svbool_t); svfloat32_t __sv_atanf_x (svfloat32_t, svbool_t); svfloat64_t __sv_atan_x (svfloat64_t, svbool_t); +svfloat64_t __sv_atan2_x (svfloat64_t, svfloat64_t, svbool_t); svfloat32_t __sv_cosf_x (svfloat32_t, svbool_t); svfloat64_t __sv_cos_x (svfloat64_t, svbool_t); svfloat32_t __sv_sinf_x (svfloat32_t, svbool_t); @@ -116,6 +117,7 @@ svfloat64_t __sv_sin_x (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxvv_atan2f (svfloat32_t, svfloat32_t, svbool_t); svfloat32_t _ZGVsMxv_atanf (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_atan (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxvv_atan2 (svfloat64_t, svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_cos (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_sinf (svfloat32_t, svbool_t); diff --git a/pl/math/sv_atan2_2u.c b/pl/math/sv_atan2_2u.c new file mode 100644 index 0000000..82f7588 --- /dev/null +++ b/pl/math/sv_atan2_2u.c @@ -0,0 +1,82 @@ +/* + * Double-precision vector atan2(x) function. + * + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#if SV_SUPPORTED + +#include "sv_atan_common.h" + +/* Useful constants. */ +#define PiOver2 sv_f64 (0x1.921fb54442d18p+0) +#define SignMask sv_u64 (0x8000000000000000) + +/* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */ +__attribute__ ((noinline)) static sv_f64_t +specialcase (sv_f64_t y, sv_f64_t x, sv_f64_t ret, const svbool_t cmp) +{ + return sv_call2_f64 (atan2, y, x, ret, cmp); +} + +/* Returns a predicate indicating true if the input is the bit representation of + 0, infinity or nan. */ +static inline svbool_t +zeroinfnan (sv_u64_t i, const svbool_t pg) +{ + return svcmpge_u64 (pg, svsub_n_u64_x (pg, svlsl_n_u64_x (pg, i, 1), 1), + sv_u64 (2 * asuint64 (INFINITY) - 1)); +} + +/* Fast implementation of SVE atan2. Errors are greatest when y and + x are reasonably close together. Maximum observed error is 2.0 ulps: + sv_atan2(0x1.8d9621df2f329p+2, 0x1.884cf49437972p+2) + got 0x1.958cd0e8c618bp-1 want 0x1.958cd0e8c618dp-1. 
*/ +sv_f64_t +__sv_atan2_x (sv_f64_t y, sv_f64_t x, const svbool_t pg) +{ + sv_u64_t ix = sv_as_u64_f64 (x); + sv_u64_t iy = sv_as_u64_f64 (y); + + svbool_t cmp_x = zeroinfnan (ix, pg); + svbool_t cmp_y = zeroinfnan (iy, pg); + svbool_t cmp_xy = svorr_b_z (pg, cmp_x, cmp_y); + + sv_u64_t sign_x = svand_u64_x (pg, ix, SignMask); + sv_u64_t sign_y = svand_u64_x (pg, iy, SignMask); + sv_u64_t sign_xy = sveor_u64_x (pg, sign_x, sign_y); + + sv_f64_t ax = svabs_f64_x (pg, x); + sv_f64_t ay = svabs_f64_x (pg, y); + + svbool_t pred_xlt0 = svcmplt_f64 (pg, x, sv_f64 (0.0)); + svbool_t pred_aygtax = svcmpgt_f64 (pg, ay, ax); + + /* Set up z for call to atan. */ + sv_f64_t n = svsel_f64 (pred_aygtax, svneg_f64_x (pg, ax), ay); + sv_f64_t d = svsel_f64 (pred_aygtax, ay, ax); + sv_f64_t z = svdiv_f64_x (pg, n, d); + + /* Work out the correct shift. */ + sv_f64_t shift = svsel_f64 (pred_xlt0, sv_f64 (-2.0), sv_f64 (0.0)); + shift = svsel_f64 (pred_aygtax, svadd_n_f64_x (pg, shift, 1.0), shift); + shift = svmul_f64_x (pg, shift, PiOver2); + + sv_f64_t ret = __sv_atan_common (pg, pg, z, z, shift); + + /* Account for the sign of x and y. */ + ret = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (ret), sign_xy)); + + if (unlikely (svptest_any (pg, cmp_xy))) + { + return specialcase (y, x, ret, cmp_xy); + } + + return ret; +} + +strong_alias (__sv_atan2_x, _ZGVsMxvv_atan2) + +#endif diff --git a/pl/math/sv_math.h b/pl/math/sv_math.h index 4a7a3fb..c027a41 100644 --- a/pl/math/sv_math.h +++ b/pl/math/sv_math.h @@ -96,6 +96,23 @@ sv_call_f64 (f64_t (*f) (f64_t), sv_f64_t x, sv_f64_t y, svbool_t cmp) return y; } +static inline sv_f64_t +sv_call2_f64 (f64_t (*f) (f64_t, f64_t), sv_f64_t x1, sv_f64_t x2, sv_f64_t y, + svbool_t cmp) +{ + svbool_t p = svpfirst (cmp, svpfalse ()); + while (svptest_any (cmp, p)) + { + f64_t elem1 = svclastb_n_f64 (p, 0, x1); + f64_t elem2 = svclastb_n_f64 (p, 0, x2); + f64_t ret = (*f) (elem1, elem2); + sv_f64_t y2 = svdup_n_f64 (ret); + y = svsel_f64 (p, y2, y); + p = svpnext_b64 (cmp, p); + } + return y; +} + /* Single precision. 
*/ static inline sv_s32_t sv_s32 (s32_t x) diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index eb8dc4d..fb48097 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -109,6 +109,8 @@ SVD (_ZGVsMxv_atan, -3.1, 3.1) {"__sv_atan2f_x", 'f', 'n', -10.0, 10.0, {.svf = __sv_atan2f_wrap}}, {"_ZGVsMxvv_atan2f", 'f', 'n', -10.0, 10.0, {.svf = _Z_sv_atan2f_wrap}}, +{"__sv_atan2", 'd', 'n', -10.0, 10.0, {.svd = __sv_atan2_wrap}}, +{"_ZGVsM2vv_atan2", 'd', 'n', -10.0, 10.0, {.svd = _Z_sv_atan2_wrap}}, SVF (__sv_cosf_x, -3.1, 3.1) SVF (_ZGVsMxv_cosf, -3.1, 3.1) diff --git a/pl/math/test/mathbench_wrappers.h b/pl/math/test/mathbench_wrappers.h index af5fa2a..fa1b99a 100644 --- a/pl/math/test/mathbench_wrappers.h +++ b/pl/math/test/mathbench_wrappers.h @@ -88,4 +88,16 @@ _Z_sv_atan2f_wrap (sv_float x, sv_bool pg) return _ZGVsMxvv_atan2f (x, svdup_n_f32 (5.0f), pg); } +static sv_double +__sv_atan2_wrap (sv_double x, sv_bool pg) +{ + return __sv_atan2_x (x, svdup_n_f64 (5.0), pg); +} + +static sv_double +_Z_sv_atan2_wrap (sv_double x, sv_bool pg) +{ + return _ZGVsMxvv_atan2 (x, svdup_n_f64 (5.0), pg); +} + #endif // WANT_SVE_MATH diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 7473005..137b324 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -306,6 +306,14 @@ range_sve_atan2f=' 1e6 1e32 40000 ' +range_sve_atan2=' + -10.0 10.0 50000 + -1.0 1.0 40000 + 0.0 1.0 40000 + 1.0 100.0 40000 + 1e6 1e32 40000 +' + # error limits L_erfc=3.7 L_erfcf=1.0 @@ -330,6 +338,7 @@ L_sve_sin=2.0 L_sve_atanf=2.9 L_sve_atan=2.5 L_sve_atan2f=3.0 +L_sve_atan2=2.0 while read G F R do @@ -422,6 +431,8 @@ sve_sin __sv_sin $runsv sve_sin _ZGVsMxv_sin $runsv sve_atan __sv_atan $runsv sve_atan _ZGVsMxv_atan $runsv +sve_atan2 __sv_atan2 $runsv +sve_atan2 _ZGVsMxvv_atan2 $runsv fi EOF diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 3db230d..a53091b 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -80,6 +80,8 @@ F (_ZGVnN2v_log2, Z_log2, log2l, mpfr_log2, 1, 0, d1, 1) #if WANT_SVE_MATH SVF2 (atan2) ZSVF2 (atan2) +SVD2 (atan2) +ZSVD2 (atan2) SVF1 (atan) ZSVF1 (atan) SVD1 (atan) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 0d72e51..6a31f4a 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -90,6 +90,12 @@ static float Z_sv_sinf(float x) { return svretf(_ZGVsMxv_sinf(svargf(x), svptrue_b32())); } +static double sv_atan2(double x, double y) { + return svretd(__sv_atan2_x(svargd(x), svargd(y), svptrue_b64())); +} +static double Z_sv_atan2(double x, double y) { + return svretd(_ZGVsMxvv_atan2(svargd(x), svargd(y), svptrue_b64())); +} static double sv_atan(double x) { return svretd(__sv_atan_x(svargd(x), svptrue_b64())); } -- cgit v1.2.3 From ffb6461143e29d34687aca489d9fd8d297dc9920 Mon Sep 17 00:00:00 2001 From: Victor Do Nascimento Date: Mon, 22 Aug 2022 12:39:57 +0100 Subject: string: arm: Prevent leaf function unwinding As leaf functions cannot throw exceptions, with EHABI only supporting synchronous exceptions, add support for emitting a `.cantunwind' directive prior to `.fnend' in ARM_FNEND preprocessor macro. This ensures no personality routine or exception table data is generated. Existing `.save' directives used in leaf functions are also removed. Built w/ arm-none-linux-gnueabihf, ran make check-string w/ qemu-arm-static. 
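As a sketch of the opt-in pattern (the file and routine names below are hypothetical; IS_LEAF and the ENTRY/END/ARM_FNEND macros are the ones touched by this patch):

    /* my_leaf.S -- hypothetical leaf routine (sketch only).
       Defining IS_LEAF before including asmdefs.h makes ARM_FNEND
       expand to .cantunwind followed by .fnend, so END() closes the
       function with no personality routine or exception-table data.  */
    #define IS_LEAF
    #include "asmdefs.h"
    /*
    ENTRY (__my_leaf_routine)
            ...                     @ no calls, nothing can throw here
            bx      lr
    END (__my_leaf_routine)         @ emits .cantunwind, then .fnend
    */

On a built object, readelf --unwind should then show a cantunwind entry for such routines instead of a reference to an unwind table.
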
--- string/arm/memchr.S | 2 +- string/arm/strcmp.S | 7 +++++-- string/arm/strlen-armv6t2.S | 7 +++++-- string/asmdefs.h | 6 ++++++ 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/string/arm/memchr.S b/string/arm/memchr.S index 77fe569..ddc808b 100644 --- a/string/arm/memchr.S +++ b/string/arm/memchr.S @@ -73,7 +73,6 @@ __memchr_arm: 10: @ At this point, we are aligned, we know we have at least 8 bytes to work with push {r4,r5,r6,r7} - .save {r4-r7} .cfi_adjust_cfa_offset 16 #if HAVE_PAC_LEAF .cfi_offset 4, -20 @@ -169,6 +168,7 @@ __memchr_arm: #endif /* HAVE_PAC_LEAF */ bx lr .cfi_endproc + .cantunwind .fnend .size __memchr_arm, . - __memchr_arm diff --git a/string/arm/strcmp.S b/string/arm/strcmp.S index d891d33..2eb560f 100644 --- a/string/arm/strcmp.S +++ b/string/arm/strcmp.S @@ -27,6 +27,11 @@ #define STRCMP_NO_PRECHECK 0 +/* Ensure the .cantunwind directive is prepended to .fnend. + Leaf functions cannot throw exceptions - EHABI only supports + synchronous exceptions. */ +#define IS_LEAF + /* This version uses Thumb-2 code. */ .thumb .syntax unified @@ -152,7 +157,6 @@ __strcmp_arm: bne L(fastpath_exit) #endif strd r4, r5, [sp, #-16]! - .save {r4, r5} .cfi_adjust_cfa_offset 16 #if HAVE_PAC_LEAF .cfi_offset 4, -20 @@ -163,7 +167,6 @@ __strcmp_arm: #endif /* HAVE_PAC_LEAF */ orr tmp1, src1, src2 strd r6, r7, [sp, #8] - .save {r6, r7} #if HAVE_PAC_LEAF .cfi_offset 6, -12 .cfi_offset 7, -8 diff --git a/string/arm/strlen-armv6t2.S b/string/arm/strlen-armv6t2.S index 2601d03..49ba928 100644 --- a/string/arm/strlen-armv6t2.S +++ b/string/arm/strlen-armv6t2.S @@ -24,6 +24,11 @@ #define S2HI lsl #endif +/* Ensure the .cantunwind directive is prepended to .fnend. + Leaf functions cannot throw exceptions - EHABI only supports + synchronous exceptions. */ +#define IS_LEAF + /* This code requires Thumb. */ .thumb .syntax unified @@ -52,7 +57,6 @@ ENTRY (__strlen_armv6t2) #endif /* __ARM_FEATURE_BTI_DEFAULT */ .cfi_register 143, 12 push {r4, r5, ip} - .save {r4, r5, ra_auth_code} .cfi_def_cfa_offset 12 .cfi_offset 143, -4 .cfi_offset 5, -8 @@ -62,7 +66,6 @@ ENTRY (__strlen_armv6t2) bti #endif /* __ARM_FEATURE_BTI_DEFAULT */ push {r4, r5} - .save {r4, r5} .cfi_def_cfa_offset 8 .cfi_offset 4, -8 .cfi_offset 5, -4 diff --git a/string/asmdefs.h b/string/asmdefs.h index d122b26..eb43836 100644 --- a/string/asmdefs.h +++ b/string/asmdefs.h @@ -10,7 +10,13 @@ #if defined (__arm__) #define ARM_FNSTART .fnstart +#if defined (IS_LEAF) +#define ARM_FNEND \ + .cantunwind \ + .fnend +#else #define ARM_FNEND .fnend +# endif #else #define ARM_FNSTART #define ARM_FNEND -- cgit v1.2.3 From 5c72615c203f9a2a39c04b23640ebaac26294bcb Mon Sep 17 00:00:00 2001 From: Victor Do Nascimento Date: Mon, 22 Aug 2022 12:44:49 +0100 Subject: string: arm: Augment M-profile PACBTI-enablement macros Modify previously defined PACBTI macros to allow for more flexible push/pop expressions at function prologue/epilogues, allowing further simplification of code predicated on the use of M-profile PACBTI hardware features. This patch also allows for the specification of whether generated pac keys are pushed onto the stack for leaf functions where this may not be necessary. It defines the following preprocessor macros: * HAVE_PAC_LEAF: Indicates whether pac-signing has been requested for leaf functions. * PAC_LEAF_PUSH_IP: Whether leaf functions should push the pac code to the stack irrespective of whether the ip register is clobbered in the function or not. 
* PAC_CFI_ADJ: Given values for the above two parameters, this holds the calculated offset applied to default CFI address/offset values as a consequence of potentially pushing the pac-code to the stack. It also defines the following assembler macros: * prologue: In addition to pushing any callee-saved registers onto the stack, it generates any requested pacbti instructions. Pushed registers are specified via the optional `first', `last' and `savepac' macro argument parameters. when a single register number is provided, it pushes that register. When two register numbers are provided, they specify a rage to save. If savepac is non-zero, the ip register is also saved. For example: prologue savepac=1 -> push {sp} prologue 1 -> push {r1} prologue 1 savepac=1 -> push {r1, ip} prologue 1 4 -> push {r1-r4} prologue 1 4 savepac=1 -> push {r1-r4, ip} * epilogue: pops registes off the stack and emmits pac key signing instruction if requested. The optional `first', `last' and `savepac' function as per the prologue macro, generating a pop instead of push instruction. * cfisavelist - prologue macro helper function, generating necessary .cfi_offset directives associated with push instruction. Therefore, the net effect of calling `prologue 1 2 savepac=1' is to generate the following: push {r1-r2, ip} .cfi_adjust_cfa_offset 12 .cfi_offset 143, -12 .cfi_offset 2, -8 .cfi_offset 1, -4 * cfirestorelist - epilogue macro helper function, emitting .cfi_restore instructions prior to resetting the cfa offset. As such, calling `epilogue 1 2 savepac=1' will produce: pop {r1-r2, ip} .cfi_restore 143 .cfi_restore 2 .cfi_restore 1 .cfi_def_cfa_offset 0 --- string/arm/memchr.S | 37 +++---------- string/arm/strcmp.S | 25 ++++----- string/arm/strlen-armv6t2.S | 39 +------------ string/pacbti.h | 131 +++++++++++++++++++++++++++++++++++++------- 4 files changed, 131 insertions(+), 101 deletions(-) diff --git a/string/arm/memchr.S b/string/arm/memchr.S index ddc808b..83a96ca 100644 --- a/string/arm/memchr.S +++ b/string/arm/memchr.S @@ -52,7 +52,7 @@ __memchr_arm: @ r1 = character to look for @ r2 = length @ returns r0 = pointer to character or NULL if not found - pacbti_prologue + prologue and r1,r1,#0xff @ Don't think we can trust the caller to actually pass a char cmp r2,#16 @ If it's short don't bother with anything clever @@ -74,17 +74,10 @@ __memchr_arm: @ At this point, we are aligned, we know we have at least 8 bytes to work with push {r4,r5,r6,r7} .cfi_adjust_cfa_offset 16 -#if HAVE_PAC_LEAF - .cfi_offset 4, -20 - .cfi_offset 5, -16 - .cfi_offset 6, -12 - .cfi_offset 7, -8 -#else - .cfi_offset 4, -16 - .cfi_offset 5, -12 - .cfi_offset 6, -8 - .cfi_offset 7, -4 -#endif /* HAVE_PAC_LEAF */ + .cfi_offset 4, -(16+PAC_CFI_ADJ) + .cfi_offset 5, -(12+PAC_CFI_ADJ) + .cfi_offset 6, -(8+PAC_CFI_ADJ) + .cfi_offset 7, -(4+PAC_CFI_ADJ) orr r1, r1, r1, lsl #8 @ expand the match word across to all bytes orr r1, r1, r1, lsl #16 bic r4, r2, #7 @ Number of double words to work with @@ -124,11 +117,11 @@ __memchr_arm: 40: movs r0,#0 @ not found - pacbti_epilogue + epilogue 50: subs r0,r0,#1 @ found - pacbti_epilogue + epilogue 60: @ We're here because the fast path found a hit - now we have to track down exactly which word it was @ r0 points to the start of the double word after the one that was tested @@ -152,21 +145,7 @@ __memchr_arm: 61: subs r0,r0,#1 -#if HAVE_PAC_LEAF - pop {r4,r5,r6,r7,ip} - .cfi_restore 143 -#else - pop {r4,r5,r6,r7} -#endif /* HAVE_PAC_LEAF */ - .cfi_restore 7 - .cfi_restore 6 - .cfi_restore 5 - .cfi_restore 4 
- .cfi_def_cfa_offset 0 -#if HAVE_PAC_LEAF - aut ip, lr, sp -#endif /* HAVE_PAC_LEAF */ - bx lr + epilogue 4 7 .cfi_endproc .cantunwind .fnend diff --git a/string/arm/strcmp.S b/string/arm/strcmp.S index 2eb560f..a408f3f 100644 --- a/string/arm/strcmp.S +++ b/string/arm/strcmp.S @@ -106,7 +106,7 @@ .cfi_restore 5 .cfi_adjust_cfa_offset -16 sub result, result, r1, lsr #24 - pacbti_epilogue + epilogue savepac=HAVE_PAC_LEAF #else /* To use the big-endian trick we'd have to reverse all three words. that's slower than this approach. */ @@ -129,7 +129,7 @@ .cfi_adjust_cfa_offset -16 sub result, result, r1 - pacbti_epilogue + epilogue savepac=HAVE_PAC_LEAF #endif .endm @@ -140,14 +140,14 @@ L(strcmp_start_addr): #if STRCMP_NO_PRECHECK == 0 L(fastpath_exit): sub r0, r2, r3 - pacbti_epilogue + epilogue savepac=HAVE_PAC_LEAF nop #endif .global __strcmp_arm .type __strcmp_arm,%function .align 0 __strcmp_arm: - pacbti_prologue + prologue savepac=HAVE_PAC_LEAF #if STRCMP_NO_PRECHECK == 0 ldrb r2, [src1] ldrb r3, [src2] @@ -158,13 +158,8 @@ __strcmp_arm: #endif strd r4, r5, [sp, #-16]! .cfi_adjust_cfa_offset 16 -#if HAVE_PAC_LEAF - .cfi_offset 4, -20 - .cfi_offset 5, -16 -#else - .cfi_offset 4, -16 - .cfi_offset 5, -12 -#endif /* HAVE_PAC_LEAF */ + .cfi_offset 5, -(12+PAC_CFI_ADJ) + .cfi_offset 4, -(16+PAC_CFI_ADJ) orr tmp1, src1, src2 strd r6, r7, [sp, #8] #if HAVE_PAC_LEAF @@ -344,7 +339,7 @@ L(misaligned_exit): .cfi_restore 4 .cfi_adjust_cfa_offset -16 - pacbti_epilogue + epilogue savepac=HAVE_PAC_LEAF #if STRCMP_NO_PRECHECK == 0 L(aligned_m1): @@ -396,7 +391,7 @@ L(overlap3): .cfi_restore 7 .cfi_adjust_cfa_offset -16 neg result, result - pacbti_epilogue + epilogue savepac=HAVE_PAC_LEAF 6: .cfi_restore_state S2LO data1, data1, #24 @@ -472,7 +467,7 @@ L(strcmp_done_equal): .cfi_restore 6 .cfi_restore 7 .cfi_adjust_cfa_offset -16 - pacbti_epilogue + epilogue savepac=HAVE_PAC_LEAF L(strcmp_tail): .cfi_restore_state @@ -496,7 +491,7 @@ L(strcmp_tail): .cfi_restore 7 .cfi_adjust_cfa_offset -16 sub result, result, data2, lsr #24 - pacbti_epilogue + epilogue savepac=HAVE_PAC_LEAF END (__strcmp_arm) diff --git a/string/arm/strlen-armv6t2.S b/string/arm/strlen-armv6t2.S index 49ba928..6e0352d 100644 --- a/string/arm/strlen-armv6t2.S +++ b/string/arm/strlen-armv6t2.S @@ -47,29 +47,7 @@ #define tmp2 r5 ENTRY (__strlen_armv6t2) - /* common pacbti_prologue macro from pacbti.h not used. - handwritten prologue saves one push instruction. */ -#if HAVE_PAC_LEAF -#if __ARM_FEATURE_BTI_DEFAULT - pacbti ip, lr, sp -#else - pac ip, lr, sp -#endif /* __ARM_FEATURE_BTI_DEFAULT */ - .cfi_register 143, 12 - push {r4, r5, ip} - .cfi_def_cfa_offset 12 - .cfi_offset 143, -4 - .cfi_offset 5, -8 - .cfi_offset 4, -12 -#else -#if __ARM_FEATURE_BTI_DEFAULT - bti -#endif /* __ARM_FEATURE_BTI_DEFAULT */ - push {r4, r5} - .cfi_def_cfa_offset 8 - .cfi_offset 4, -8 - .cfi_offset 5, -4 -#endif /* HAVE_PAC_LEAF */ + prologue 4 5 savepac=HAVE_PAC_LEAF pld [srcin, #0] bic src, srcin, #7 mvn const_m1, #0 @@ -129,20 +107,7 @@ L(null_found): #endif clz data1a, data1a add result, result, data1a, lsr #3 /* Bits -> Bytes. 
*/ -#if HAVE_PAC_LEAF - pop {r4, r5, ip} - .cfi_restore 4 - .cfi_restore 5 - .cfi_restore 143 - .cfi_def_cfa_offset 0 - aut ip, lr, sp -#else - ldrd r4, r5, [sp], #8 - .cfi_restore 4 - .cfi_restore 5 - .cfi_def_cfa_offset 0 -#endif /* HAVE_PAC_LEAF */ - bx lr + epilogue 4 5 savepac=HAVE_PAC_LEAF L(misaligned8): ldrd data1a, data1b, [src] diff --git a/string/pacbti.h b/string/pacbti.h index 9162b27..0745233 100644 --- a/string/pacbti.h +++ b/string/pacbti.h @@ -5,39 +5,130 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -/* Checki whether leaf function PAC signing has been requested - in the -mbranch-protect compile-time option */ +/* Check whether leaf function PAC signing has been requested in the + -mbranch-protect compile-time option. */ #define LEAF_PROTECT_BIT 2 -#define HAVE_PAC_LEAF \ + +#ifdef __ARM_FEATURE_PAC_DEFAULT +# define HAVE_PAC_LEAF \ __ARM_FEATURE_PAC_DEFAULT & (1 << LEAF_PROTECT_BIT) +#else +# define HAVE_PAC_LEAF 0 +#endif + +/* Provide default parameters for PAC-code handling in leaf-functions. */ +#ifndef PAC_LEAF_PUSH_IP +# define PAC_LEAF_PUSH_IP 1 +#endif + +#if HAVE_PAC_LEAF +# if PAC_LEAF_PUSH_IP +# define PAC_CFI_ADJ 4 +# else +# define PAC_CFI_ADJ 0 +# endif /* PAC_LEAF_PUSH_IP*/ +#else +# undef PAC_LEAF_PUSH_IP +# define PAC_LEAF_PUSH_IP 0 +# define PAC_CFI_ADJ 0 +#endif /* HAVE_PAC_LEAF */ + +/* Emit .cfi_restore directives for a consecutive sequence of registers. */ + .macro cfirestorelist first, last + .cfi_restore \last + .if \last-\first + cfirestorelist \first, \last-1 + .endif + .endm -/* Macro to handle function entry depending on branch-protection - schemes */ - .macro pacbti_prologue +/* Emit .cfi_offset directives for a consecutive sequence of registers. */ + .macro cfisavelist first, last, index=1 + .cfi_offset \last, -4*(\index) - PAC_CFI_ADJ + .if \last-\first + cfisavelist \first, \last-1, \index+1 + .endif + .endm + +/* Create a prologue entry sequence handling PAC/BTI, if required and emitting + CFI directives for generated PAC code and any pushed registers. */ + .macro prologue first=-1, last=-1, savepac=PAC_LEAF_PUSH_IP #if HAVE_PAC_LEAF #if __ARM_FEATURE_BTI_DEFAULT - pacbti ip, lr, sp + pacbti ip, lr, sp #else - pac ip, lr, sp + pac ip, lr, sp #endif /* __ARM_FEATURE_BTI_DEFAULT */ .cfi_register 143, 12 - str ip, [sp, #-4]! 
- .save {ra_auth_code} - .cfi_def_cfa_offset 4 - .cfi_offset 143, -4 -#elif __ARM_FEATURE_BTI_DEFAULT +#else +#if __ARM_FEATURE_BTI_DEFAULT bti +#endif /* __ARM_FEATURE_BTI_DEFAULT */ #endif /* HAVE_PAC_LEAF */ + .if \first != -1 + .if \last != -1 + .if \savepac + push {r\first-r\last, ip} + .cfi_adjust_cfa_offset ((\last-\first)+1)*4 + PAC_CFI_ADJ + .cfi_offset 143, -PAC_CFI_ADJ + cfisavelist \first, \last + .else + push {r\first-r\last} + .cfi_adjust_cfa_offset ((\last-\first)+1)*4 + cfisavelist \first, \last + .endif + .else + .if \savepac + push {r\first, ip} + .cfi_adjust_cfa_offset 4 + PAC_CFI_ADJ + .cfi_offset 143, -PAC_CFI_ADJ + cfisavelist \first, \first + .else // !\savepac + push {r\first} + .cfi_adjust_cfa_offset PAC_CFI_ADJ + cfisavelist \first, \first + .endif + .endif + .else // \first == -1 + .if \savepac + push {ip} + .cfi_adjust_cfa_offset PAC_CFI_ADJ + .cfi_offset 143, -PAC_CFI_ADJ + .endif + .endif .endm -/* Macro to handle different branch exchange cases depending on - branch-protection schemes */ - .macro pacbti_epilogue -#if HAVE_PAC_LEAF - ldr ip, [sp], #4 +/* Create an epilogue exit sequence handling PAC/BTI, if required and emitting + CFI directives for all restored registers. */ + .macro epilogue first=-1, last=-1, savepac=PAC_LEAF_PUSH_IP + .if \first != -1 + .if \last != -1 + .if \savepac + pop {r\first-r\last, ip} + .cfi_restore 143 + cfirestorelist \first, \last + .else + pop {r\first-r\last} + cfirestorelist \first, \last + .endif + .else + .if \savepac + pop {r\first, ip} .cfi_restore 143 + cfirestorelist \first, \first + .else + pop {r\first} + cfirestorelist \first, \first + .endif + .endif + .else + .if \savepac + pop {ip} + .cfi_restore 143 + .endif + .endif .cfi_def_cfa_offset 0 - aut ip, lr, sp +#if HAVE_PAC_LEAF + aut ip, lr, sp #endif /* HAVE_PAC_LEAF */ - bx lr + bx lr .endm -- cgit v1.2.3 From d3166fa73ad1ef10c5c4a5e5e2403cc9d753bda4 Mon Sep 17 00:00:00 2001 From: Victor Do Nascimento Date: Tue, 23 Aug 2022 10:07:27 +0100 Subject: string: arm: Fix CFI auto-alignment issues. The use of the PAC_CFI_ADJ macro for calculating the effect of pushing the IP register onto the stack assumes that pushing the register is always optional and is always supressed when PAC_LEAF_PUSH_IP is set to 0. This leads to CFI alignment issues for functions where the IP register is clobbered and thus where IP is always pushed to the stack in the function prologue. This patch introduces a new macro PAC_CFI_ADJ_DEFAULT whose value is never zeroed when PAC signing is requested, irrespective of the PAC_LEAF_PUSH_IP settings. Example: * HAVE_PAC_LEAF == 1 && PAC_LEAF_PUSH_IP == 1: PAC_CFI_ADJ = 4 PAC_CFI_ADJ_DEFAULT = 4 * HAVE_PAC_LEAF == 1 && PAC_LEAF_PUSH_IP == 0: PAC_CFI_ADJ = 0 PAC_CFI_ADJ_DEFAULT = 4 Built w/ arm-none-linux-gnueabihf, ran make check-string w/ qemu-arm-static. --- string/arm/strcmp.S | 4 ++-- string/pacbti.h | 39 ++++++++++++++++++++++++++------------- 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/string/arm/strcmp.S b/string/arm/strcmp.S index a408f3f..eafb9f6 100644 --- a/string/arm/strcmp.S +++ b/string/arm/strcmp.S @@ -158,8 +158,8 @@ __strcmp_arm: #endif strd r4, r5, [sp, #-16]! 
.cfi_adjust_cfa_offset 16 - .cfi_offset 5, -(12+PAC_CFI_ADJ) - .cfi_offset 4, -(16+PAC_CFI_ADJ) + .cfi_offset 5, -(12+PAC_CFI_ADJ_DEFAULT) + .cfi_offset 4, -(16+PAC_CFI_ADJ_DEFAULT) orr tmp1, src1, src2 strd r6, r7, [sp, #8] #if HAVE_PAC_LEAF diff --git a/string/pacbti.h b/string/pacbti.h index 0745233..4e8a600 100644 --- a/string/pacbti.h +++ b/string/pacbti.h @@ -21,9 +21,21 @@ # define PAC_LEAF_PUSH_IP 1 #endif +/* Two distinct PAC_CFI adjustment values are needed at any given time. + If PAC-signing is requested for leaf functions but pushing the pac + code to the stack is not, PAC_CFI_ADJ defaults to 0, as most + functions will not overwrite the register holding pac (ip). This is not + appropriate for functions that clobber the ip register, where pushing + to the stack is non-optional. Wherever a generated pac code must be + unconditionally pushed to the stack, a CFI adjustment of + PAC_CFI_ADJ_DEFAULT is used instead. */ +#if HAVE_PAC_LEAF +# define PAC_CFI_ADJ_DEFAULT 4 +#endif + #if HAVE_PAC_LEAF # if PAC_LEAF_PUSH_IP -# define PAC_CFI_ADJ 4 +# define PAC_CFI_ADJ 4 # else # define PAC_CFI_ADJ 0 # endif /* PAC_LEAF_PUSH_IP*/ @@ -31,6 +43,7 @@ # undef PAC_LEAF_PUSH_IP # define PAC_LEAF_PUSH_IP 0 # define PAC_CFI_ADJ 0 +# define PAC_CFI_ADJ_DEFAULT PAC_CFI_ADJ #endif /* HAVE_PAC_LEAF */ /* Emit .cfi_restore directives for a consecutive sequence of registers. */ @@ -43,7 +56,7 @@ /* Emit .cfi_offset directives for a consecutive sequence of registers. */ .macro cfisavelist first, last, index=1 - .cfi_offset \last, -4*(\index) - PAC_CFI_ADJ + .cfi_offset \last, -4 * (\index) .if \last-\first cfisavelist \first, \last-1, \index+1 .endif @@ -68,31 +81,31 @@ .if \last != -1 .if \savepac push {r\first-r\last, ip} - .cfi_adjust_cfa_offset ((\last-\first)+1)*4 + PAC_CFI_ADJ - .cfi_offset 143, -PAC_CFI_ADJ - cfisavelist \first, \last + .cfi_adjust_cfa_offset ((\last-\first)+2)*4 + .cfi_offset 143, -4 + cfisavelist \first, \last, 2 .else push {r\first-r\last} .cfi_adjust_cfa_offset ((\last-\first)+1)*4 - cfisavelist \first, \last + cfisavelist \first, \last, 1 .endif .else .if \savepac push {r\first, ip} - .cfi_adjust_cfa_offset 4 + PAC_CFI_ADJ - .cfi_offset 143, -PAC_CFI_ADJ - cfisavelist \first, \first + .cfi_adjust_cfa_offset 8 + .cfi_offset 143, -4 + cfisavelist \first, \first, 2 .else // !\savepac push {r\first} - .cfi_adjust_cfa_offset PAC_CFI_ADJ - cfisavelist \first, \first + .cfi_adjust_cfa_offset 4 + cfisavelist \first, \first, 1 .endif .endif .else // \first == -1 .if \savepac push {ip} - .cfi_adjust_cfa_offset PAC_CFI_ADJ - .cfi_offset 143, -PAC_CFI_ADJ + .cfi_adjust_cfa_offset 4 + .cfi_offset 143, -4 .endif .endif .endm -- cgit v1.2.3 From 5de06730073d93487fdda678db08e9e2cafe93bb Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Mon, 22 Aug 2022 13:20:33 +0100 Subject: string: Optimize strnlen Optimize strnlen using the shrn instruction and improve the main loop. Small strings are 10% faster, large strings are 40% faster. 
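The shrn syndrome trick deserves a short illustration. In C with NEON intrinsics (a sketch; first_nul_index is a made-up helper, and it assumes a little-endian target with 16 readable bytes at p):

    #include <arm_neon.h>
    #include <stdint.h>

    /* Compare 16 bytes against 0, then narrow the 8x16-bit view of the
       0x00/0xff mask with a right shift by 4.  Each result byte keeps
       one nibble per input byte, so the 64-bit syndrome has 4 bits per
       byte in string order, and ctz/4 is the index of the first NUL.  */
    static inline int
    first_nul_index (const uint8_t *p)
    {
      uint8x16_t data = vld1q_u8 (p);
      uint8x16_t cmp = vceqq_u8 (data, vdupq_n_u8 (0));
      uint8x8_t nibbles = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4);
      uint64_t synd = vget_lane_u64 (vreinterpret_u64_u8 (nibbles), 0);
      if (synd == 0)
        return 16;                        /* no NUL in this chunk */
      return __builtin_ctzll (synd) >> 2; /* 4 syndrome bits per byte */
    }

This replaces the old two-instruction masking (an and with a repeated 0xf00f constant, then addp) with a single shrn, and frees the register that held the constant.
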
--- string/aarch64/strnlen.S | 54 ++++++++++++++++++++---------------------------- 1 file changed, 22 insertions(+), 32 deletions(-) diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S index e09dd1b..eecfad3 100644 --- a/string/aarch64/strnlen.S +++ b/string/aarch64/strnlen.S @@ -20,39 +20,30 @@ #define src x2 #define synd x3 #define shift x4 -#define wtmp w4 #define tmp x4 #define cntrem x5 #define qdata q0 #define vdata v0 #define vhas_chr v1 -#define vrepmask v2 -#define vend v3 -#define dend d3 +#define vend v2 +#define dend d2 /* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + Process the string in 16-byte aligned chunks. Compute a 64-bit mask with + four bits per byte using the shrn instruction. A count trailing zeros then + identifies the first zero byte. */ ENTRY (__strnlen_aarch64) PTR_ARG (0) SIZE_ARG (1) bic src, srcin, 15 - mov wtmp, 0xf00f cbz cntin, L(nomatch) - ld1 {vdata.16b}, [src], 16 - dup vrepmask.8h, wtmp + ld1 {vdata.16b}, [src] cmeq vhas_chr.16b, vdata.16b, 0 lsl shift, srcin, 2 - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov synd, dend lsr synd, synd, shift cbz synd, L(start_loop) @@ -64,37 +55,40 @@ L(finish): csel result, cntin, result, ls ret +L(nomatch): + mov result, cntin + ret + L(start_loop): sub tmp, src, srcin + add tmp, tmp, 17 subs cntrem, cntin, tmp - b.ls L(nomatch) + b.lo L(nomatch) /* Make sure that it won't overread by a 16-byte chunk */ - add tmp, cntrem, 15 - tbnz tmp, 4, L(loop32_2) - + tbz cntrem, 4, L(loop32_2) + sub src, src, 16 .p2align 5 L(loop32): - ldr qdata, [src], 16 + ldr qdata, [src, 32]! cmeq vhas_chr.16b, vdata.16b, 0 umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbnz synd, L(end) L(loop32_2): - ldr qdata, [src], 16 + ldr qdata, [src, 16] subs cntrem, cntrem, 32 cmeq vhas_chr.16b, vdata.16b, 0 - b.ls L(end) + b.lo L(end_2) umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbz synd, L(loop32) - +L(end_2): + add src, src, 16 L(end): - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ - sub src, src, 16 - mov synd, vend.d[0] + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ sub result, src, srcin + fmov synd, dend #ifndef __AARCH64EB__ rbit synd, synd #endif @@ -104,9 +98,5 @@ L(end): csel result, cntin, result, ls ret -L(nomatch): - mov result, cntin - ret - END (__strnlen_aarch64) -- cgit v1.2.3 From f890e426fd2607c78b39c04a795c0e486541a108 Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Mon, 22 Aug 2022 13:20:47 +0100 Subject: string: Optimize strlen-mte Optimize strlen by unrolling the main loop. Large strings are 64% faster. 
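The unrolled loop structure, as a C sketch (chunk_has_nul stands in for the cmeq/umaxp/fmov test in the assembly, and both helper names are made up):

    #include <string.h>
    #include <stdint.h>

    /* The first chunk at src has already been tested by the entry code;
       the loop then handles two 16-byte chunks per iteration, so each
       32 bytes cost one backward branch and one pointer update instead
       of two.  Assumes src is 16-byte aligned and a NUL exists.  */
    static int
    chunk_has_nul (const uint8_t *p)
    {
      return memchr (p, 0, 16) != NULL;
    }

    static const uint8_t *
    find_nul_chunk (const uint8_t *src)
    {
      for (;;)
        {
          if (chunk_has_nul (src + 16))
            return src + 16;
          src += 32;              /* single post-increment, as in the asm */
          if (chunk_has_nul (src))
            return src;
        }
    }
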
--- string/aarch64/strlen-mte.S | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/string/aarch64/strlen-mte.S b/string/aarch64/strlen-mte.S index 0d33ebb..fdb07ae 100644 --- a/string/aarch64/strlen-mte.S +++ b/string/aarch64/strlen-mte.S @@ -28,13 +28,9 @@ #define dend d2 /* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + Process the string in 16-byte aligned chunks. Compute a 64-bit mask with + four bits per byte using the shrn instruction. A count trailing zeros then + identifies the first zero byte. */ ENTRY (__strlen_aarch64_mte) PTR_ARG (0) @@ -54,18 +50,25 @@ ENTRY (__strlen_aarch64_mte) .p2align 5 L(loop): - ldr data, [src, 16]! + ldr data, [src, 16] + cmeq vhas_nul.16b, vdata.16b, 0 + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbnz synd, L(loop_end) + ldr data, [src, 32]! cmeq vhas_nul.16b, vdata.16b, 0 umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov synd, dend cbz synd, L(loop) - + sub src, src, 16 +L(loop_end): shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ sub result, src, srcin fmov synd, dend #ifndef __AARCH64EB__ rbit synd, synd #endif + add result, result, 16 clz tmp, synd add result, result, tmp, lsr 2 ret -- cgit v1.2.3 From 802438542e4dff98109027594be5603f13708ab2 Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Mon, 22 Aug 2022 13:21:03 +0100 Subject: string: Improve strlen Use shrn for the mask, merge tst+bne into cbnz, tweak code alignment. The random strlen test improves by 2%. --- string/aarch64/strlen.S | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S index 98145f9..103fac1 100644 --- a/string/aarch64/strlen.S +++ b/string/aarch64/strlen.S @@ -36,6 +36,7 @@ #define tmp x2 #define tmpw w2 #define synd x3 +#define syndw w3 #define shift x4 /* For the first 32 bytes, NUL detection works on the principle that @@ -110,7 +111,6 @@ ENTRY (__strlen_aarch64) add len, len, tmp1, lsr 3 ret - .p2align 3 /* Look for a NUL byte at offset 16..31 in the string. */ L(bytes16_31): ldp data1, data2, [srcin, 16] @@ -138,6 +138,7 @@ L(bytes16_31): add len, len, tmp1, lsr 3 ret + nop L(loop_entry): bic src, srcin, 31 @@ -153,18 +154,12 @@ L(loop): /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */ cmeq maskv.16b, datav1.16b, 0 sub len, src, srcin - tst synd, 0xffffffff - b.ne 1f + cbnz syndw, 1f cmeq maskv.16b, datav2.16b, 0 add len, len, 16 1: /* Generate a bitmask and compute correct byte offset. */ -#ifdef __AARCH64EB__ - bic maskv.8h, 0xf0 -#else - bic maskv.8h, 0x0f, lsl 8 -#endif - umaxp maskv.16b, maskv.16b, maskv.16b + shrn maskv.8b, maskv.8h, 4 fmov synd, maskd #ifndef __AARCH64EB__ rbit synd, synd @@ -173,8 +168,6 @@ L(loop): add len, len, tmp, lsr 2 ret - .p2align 4 - L(page_cross): bic src, srcin, 31 mov tmpw, 0x0c03 -- cgit v1.2.3 From 87e99142bf8273215f58e4770fd7247e4b6e401b Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Mon, 22 Aug 2022 13:21:30 +0100 Subject: string: Improve strchrnul-mte Unroll the main loop, which gives a small gain. 
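The unrolled body below keeps the compact "character or NUL" test: cmeq flags bytes equal to the search character, then an unsigned cmhs of that result against the data also flags NUL bytes. An intrinsics sketch of the two-instruction test (function name invented for illustration):

#include <arm_neon.h>
#include <stdint.h>

/* Lanes become 0xff where data[i] == c or data[i] == 0, else 0x00.  */
static inline uint8x16_t
match_c_or_nul (uint8x16_t data, uint8_t c)
{
  uint8x16_t eq = vceqq_u8 (data, vdupq_n_u8 (c));	/* 0xff on match */
  /* Unsigned >=: 0xff >= anything keeps matches, and 0x00 >= 0x00 flags
     NUL bytes; all other lanes give 0x00 >= nonzero, i.e. false.  */
  return vcgeq_u8 (eq, data);
}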
--- string/aarch64/strchrnul-mte.S | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/string/aarch64/strchrnul-mte.S b/string/aarch64/strchrnul-mte.S index 9be5cbc..b1ac4db 100644 --- a/string/aarch64/strchrnul-mte.S +++ b/string/aarch64/strchrnul-mte.S @@ -57,14 +57,22 @@ ENTRY (__strchrnul_aarch64_mte) .p2align 4 L(loop): - ldr qdata, [src, 16]! + ldr qdata, [src, 16] + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b + fmov tmp1, dend + cbnz tmp1, L(end) + ldr qdata, [src, 32]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b umaxp vend.16b, vhas_chr.16b, vhas_chr.16b fmov tmp1, dend cbz tmp1, L(loop) - + sub src, src, 16 +L(end): shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ + add src, src, 16 fmov tmp1, dend #ifndef __AARCH64EB__ rbit tmp1, tmp1 -- cgit v1.2.3 From 864fc0254172a47c9ea36e12cd27511108902813 Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Mon, 22 Aug 2022 13:21:41 +0100 Subject: string: Improve strchr-mte Simplify calculation of the mask using shrn. Unroll the main loop. Small strings are 20% faster. --- string/aarch64/strchr-mte.S | 52 +++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/string/aarch64/strchr-mte.S b/string/aarch64/strchr-mte.S index 4ed6cce..8840f0d 100644 --- a/string/aarch64/strchr-mte.S +++ b/string/aarch64/strchr-mte.S @@ -19,8 +19,7 @@ #define src x2 #define tmp1 x1 -#define wtmp2 w3 -#define tmp3 x3 +#define tmp2 x3 #define vrepchr v0 #define vdata v1 @@ -28,39 +27,30 @@ #define vhas_nul v2 #define vhas_chr v3 #define vrepmask v4 -#define vrepmask2 v5 -#define vend v6 -#define dend d6 +#define vend v5 +#define dend d5 /* Core algorithm. For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-1 are set if the relevant byte matched the - requested character, bits 2-3 are set if the byte is NUL (or matched), and - bits 4-7 are not used and must be zero if none of bits 0-3 are set). Odd - bytes set bits 4-7 so that adjacent bytes can be merged. Since the bits - in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + per byte. Bits 0-1 are set if the relevant byte matched the requested + character, bits 2-3 are set if the byte is NUL or matched. Count trailing + zeroes gives the position of the matching byte if it is a multiple of 4. + If it is not a multiple of 4, there was no match. */ ENTRY (__strchr_aarch64_mte) PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin ld1 {vdata.16b}, [src] - mov wtmp2, 0x3003 - dup vrepmask.8h, wtmp2 + movi vrepmask.16b, 0x33 cmeq vhas_nul.16b, vdata.16b, 0 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - mov wtmp2, 0xf00f - dup vrepmask2.8h, wtmp2 - bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b - lsl tmp3, srcin, 2 - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ - + lsl tmp2, srcin, 2 + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov tmp1, dend - lsr tmp1, tmp1, tmp3 + lsr tmp1, tmp1, tmp2 cbz tmp1, L(loop) rbit tmp1, tmp1 @@ -74,28 +64,34 @@ ENTRY (__strchr_aarch64_mte) .p2align 4 L(loop): - ldr qdata, [src, 16]! 
+ ldr qdata, [src, 16] + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov tmp1, dend + cbnz tmp1, L(end) + ldr qdata, [src, 32]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov tmp1, dend cbz tmp1, L(loop) + sub src, src, 16 +L(end): #ifdef __AARCH64EB__ bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b - and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov tmp1, dend #else bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov tmp1, dend rbit tmp1, tmp1 #endif + add src, src, 16 clz tmp1, tmp1 - /* Tmp1 is an even multiple of 2 if the target character was - found first. Otherwise we've found the end of string. */ + /* Tmp1 is a multiple of 4 if the target character was found. */ tst tmp1, 2 add result, src, tmp1, lsr 2 csel result, result, xzr, eq -- cgit v1.2.3 From a1547d148400deffeaab1cd484638ec03a519682 Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Mon, 22 Aug 2022 13:21:53 +0100 Subject: string: Optimize memrchr Optimize the main loop - large strings are 43% faster. --- string/aarch64/memrchr.S | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/string/aarch64/memrchr.S b/string/aarch64/memrchr.S index 47fbce2..4726618 100644 --- a/string/aarch64/memrchr.S +++ b/string/aarch64/memrchr.S @@ -51,7 +51,7 @@ ENTRY (__memrchr_aarch64) dup vrepchr.16b, chrin cmeq vhas_chr.16b, vdata.16b, vrepchr.16b neg shift, end, lsl 2 - shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov synd, dend lsl synd, synd, shift cbz synd, L(start_loop) @@ -62,31 +62,34 @@ ENTRY (__memrchr_aarch64) csel result, result, xzr, hi ret + nop L(start_loop): - sub tmp, end, src - subs cntrem, cntin, tmp + subs cntrem, src, srcin b.ls L(nomatch) /* Make sure that it won't overread by a 16-byte chunk */ - add tmp, cntrem, 15 - tbnz tmp, 4, L(loop32_2) + sub cntrem, cntrem, 1 + tbz cntrem, 4, L(loop32_2) + add src, src, 16 - .p2align 4 + .p2align 5 L(loop32): - ldr qdata, [src, -16]! + ldr qdata, [src, -32]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbnz synd, L(end) L(loop32_2): - ldr qdata, [src, -16]! + ldr qdata, [src, -16] subs cntrem, cntrem, 32 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - b.ls L(end) + b.lo L(end_2) umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbz synd, L(loop32) +L(end_2): + sub src, src, 16 L(end): shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov synd, dend -- cgit v1.2.3 From 43c24ad1c17de4b9d084f61ab8361d3736f2e527 Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Mon, 22 Aug 2022 13:22:05 +0100 Subject: string: Optimize memchr-mte Optimize the main loop - large strings are 40% faster. 
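Unlike the string loops, memchr must also track how many bytes it is still allowed to read. A simplified C model of the bounded 32-byte main loop (names are illustrative; the entry and tail adjustments of the real routine are omitted):

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

/* Nonzero syndrome iff the 16-byte chunk at p contains byte c.  */
static inline uint64_t
chunk_syndrome (const uint8_t *p, uint8_t c)
{
  uint8x16_t cmp = vceqq_u8 (vld1q_u8 (p), vdupq_n_u8 (c));
  uint8x8_t packed = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4);
  return vget_lane_u64 (vreinterpret_u64_u8 (packed), 0);
}

/* Two chunks per iteration; 'remaining' counts bytes that may still be
   inspected, so the loop never runs a full chunk past the buffer.  */
static const uint8_t *
search_chunks (const uint8_t *src, size_t remaining, uint8_t c)
{
  while (remaining >= 32)
    {
      if (chunk_syndrome (src, c))
	return src;
      if (chunk_syndrome (src + 16, c))
	return src + 16;
      src += 32;
      remaining -= 32;
    }
  return NULL;	/* any tail is handled separately in the real routine */
}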
---
 string/aarch64/memchr-mte.S | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/string/aarch64/memchr-mte.S b/string/aarch64/memchr-mte.S
index 0f434cf..d4673b3 100644
--- a/string/aarch64/memchr-mte.S
+++ b/string/aarch64/memchr-mte.S
@@ -48,49 +48,51 @@ ENTRY (__memchr_aarch64_mte)
 	dup	vrepchr.16b, chrin
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	lsl	shift, srcin, 2
-	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4	/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbz	synd, L(start_loop)

 	rbit	synd, synd
 	clz	synd, synd
-	add	result, srcin, synd, lsr 2
 	cmp	cntin, synd, lsr 2
+	add	result, srcin, synd, lsr 2
 	csel	result, result, xzr, hi
 	ret

+	.p2align 3
 L(start_loop):
 	sub	tmp, src, srcin
-	add	tmp, tmp, 16
+	add	tmp, tmp, 17
 	subs	cntrem, cntin, tmp
-	b.ls	L(nomatch)
+	b.lo	L(nomatch)

 	/* Make sure that it won't overread by a 16-byte chunk */
-	add	tmp, cntrem, 15
-	tbnz	tmp, 4, L(loop32_2)
-
+	tbz	cntrem, 4, L(loop32_2)
+	sub	src, src, 16
 	.p2align 4
 L(loop32):
-	ldr	qdata, [src, 16]!
+	ldr	qdata, [src, 32]!
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
 	fmov	synd, dend
 	cbnz	synd, L(end)

 L(loop32_2):
-	ldr	qdata, [src, 16]!
-	subs	cntrem, cntrem, 32
+	ldr	qdata, [src, 16]
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	subs	cntrem, cntrem, 32
-	b.ls	L(end)
+	b.lo	L(end_2)
 	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
 	fmov	synd, dend
 	cbz	synd, L(loop32)
+L(end_2):
+	add	src, src, 16
 L(end):
 	shrn	vend.8b, vhas_chr.8h, 4	/* 128->64 */
+	sub	cntrem, src, srcin
 	fmov	synd, dend
-	add	tmp, srcin, cntin
-	sub	cntrem, tmp, src
+	sub	cntrem, cntin, cntrem
 #ifndef __AARCH64EB__
 	rbit	synd, synd
 #endif
-- 
cgit v1.2.3


From 65e0232e36b784b09a72caf0df6845eec9e95057 Mon Sep 17 00:00:00 2001
From: Joe Ramsay
Date: Fri, 2 Sep 2022 10:41:55 +0100
Subject: pl/math: Add vector/SVE log10

New routine shares coefficients and table entries with the Neon variant,
and is accurate to 2.5 ULP. Coeffs and magic numbers have been rearranged
so as to be shared between Neon and SVE routines, and to allow better
memory access.
---
 pl/math/include/mathlib.h      |   2 +
 pl/math/math_config.h          |  12 ++
 pl/math/sv_log10_2u5.c         |  79 ++++++++++++
 pl/math/sv_math.h              |  26 ++++
 pl/math/test/mathbench_funcs.h |   3 +
 pl/math/test/runulp.sh         |  12 ++
 pl/math/test/ulp_funcs.h       |   2 +
 pl/math/test/ulp_wrappers.h    |   6 +
 pl/math/v_log10.h              |  19 ---
 pl/math/v_log10_2u5.c          |  41 ++-----
 pl/math/v_log10_data.c         | 270 +++++++++++++++++++++--------------------
 11 files changed, 295 insertions(+), 177 deletions(-)
 create mode 100644 pl/math/sv_log10_2u5.c
 delete mode 100644 pl/math/v_log10.h

diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h
index 266d2a1..0f004c0 100644
--- a/pl/math/include/mathlib.h
+++ b/pl/math/include/mathlib.h
@@ -111,6 +111,7 @@ svfloat64_t __sv_atan_x (svfloat64_t, svbool_t);
 svfloat64_t __sv_atan2_x (svfloat64_t, svfloat64_t, svbool_t);
 svfloat32_t __sv_cosf_x (svfloat32_t, svbool_t);
 svfloat64_t __sv_cos_x (svfloat64_t, svbool_t);
+svfloat64_t __sv_log10_x (svfloat64_t, svbool_t);
 svfloat32_t __sv_sinf_x (svfloat32_t, svbool_t);
 svfloat64_t __sv_sin_x (svfloat64_t, svbool_t);
 /* SVE ABI names.
*/ @@ -120,6 +121,7 @@ svfloat64_t _ZGVsMxv_atan (svfloat64_t, svbool_t); svfloat64_t _ZGVsMxvv_atan2 (svfloat64_t, svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_cos (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_log10 (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_sinf (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_sin (svfloat64_t, svbool_t); #endif diff --git a/pl/math/math_config.h b/pl/math/math_config.h index 1a4fc21..d70a38c 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -496,4 +496,16 @@ extern const struct sv_sinf_data float coeffs[V_SINF_NCOEFFS]; } __sv_sinf_data HIDDEN; +#define V_LOG10_TABLE_BITS 7 +#define V_LOG10_POLY_ORDER 6 +extern const struct v_log10_data +{ + struct + { + double invc, log10c; + } tab[1 << V_LOG10_TABLE_BITS]; + double poly[V_LOG10_POLY_ORDER - 1]; + double invln10, log10_2; +} __v_log10_data HIDDEN; + #endif diff --git a/pl/math/sv_log10_2u5.c b/pl/math/sv_log10_2u5.c new file mode 100644 index 0000000..92dbfa4 --- /dev/null +++ b/pl/math/sv_log10_2u5.c @@ -0,0 +1,79 @@ +/* + * Double-precision SVE log10(x) function. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "sv_math.h" + +#if SV_SUPPORTED + +#define OFF 0x3fe6900900000000 +#define N (1 << V_LOG10_TABLE_BITS) + +#define A(i) __v_log10_data.poly[i] + +static inline sv_f64_t +specialcase (sv_f64_t x, sv_f64_t y, svbool_t special) +{ + return sv_call_f64 (log10, x, y, special); +} + +/* SVE log10 algorithm. Maximum measured error is 2.46 ulps. + __sv_log10(0x1.131956cd4b627p+0) got 0x1.fffbdf6eaa669p-6 + want 0x1.fffbdf6eaa667p-6. */ +sv_f64_t +__sv_log10_x (sv_f64_t x, const svbool_t pg) +{ + sv_u64_t ix = sv_as_u64_f64 (x); + sv_u64_t top = svlsr_n_u64_x (pg, ix, 48); + + svbool_t is_special_case + = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, top, 0x0010), 0x07ff0 - 0x0010); + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + sv_u64_t tmp = svsub_n_u64_x (pg, ix, OFF); + sv_u64_t i + = sv_mod_n_u64_x (pg, svlsr_n_u64_x (pg, tmp, 52 - V_LOG10_TABLE_BITS), N); + sv_f64_t k + = sv_to_f64_s64_x (pg, svasr_n_s64_x (pg, sv_as_s64_u64 (tmp), 52)); + sv_f64_t z = sv_as_f64_u64 ( + svsub_u64_x (pg, ix, svand_n_u64_x (pg, tmp, 0xfffULL << 52))); + + /* log(x) = k*log(2) + log(c) + log(z/c). */ + + sv_u64_t idx = svmul_n_u64_x (pg, i, 2); + sv_f64_t invc = sv_lookup_f64_x (pg, &__v_log10_data.tab[0].invc, idx); + sv_f64_t logc = sv_lookup_f64_x (pg, &__v_log10_data.tab[0].log10c, idx); + + /* We approximate log(z/c) with a polynomial P(x) ~= log(x + 1): + r = z/c - 1 (we look up precomputed 1/c) + log(z/c) ~= P(r). */ + sv_f64_t r = sv_fma_f64_x (pg, z, invc, sv_f64 (-1.0)); + + /* hi = log(c) + k*log(2). */ + sv_f64_t w = sv_fma_n_f64_x (pg, __v_log10_data.invln10, r, logc); + sv_f64_t hi = sv_fma_n_f64_x (pg, __v_log10_data.log10_2, k, w); + + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. 
*/ + sv_f64_t r2 = svmul_f64_x (pg, r, r); + sv_f64_t y = sv_fma_n_f64_x (pg, A (3), r, sv_f64 (A (2))); + sv_f64_t p = sv_fma_n_f64_x (pg, A (1), r, sv_f64 (A (0))); + y = sv_fma_n_f64_x (pg, A (4), r2, y); + y = sv_fma_f64_x (pg, y, r2, p); + y = sv_fma_f64_x (pg, y, r2, hi); + + if (unlikely (svptest_any (pg, is_special_case))) + { + return specialcase (x, y, is_special_case); + } + return y; +} + +strong_alias (__sv_log10_x, _ZGVsMxv_log10) + +#endif diff --git a/pl/math/sv_math.h b/pl/math/sv_math.h index c027a41..4164faa 100644 --- a/pl/math/sv_math.h +++ b/pl/math/sv_math.h @@ -69,6 +69,12 @@ sv_fma_n_f64_x (svbool_t pg, f64_t x, sv_f64_t y, sv_f64_t z) return svmla_n_f64_x (pg, z, y, x); } +static inline sv_s64_t +sv_as_s64_u64 (sv_u64_t x) +{ + return svreinterpret_s64_u64 (x); +} + static inline sv_u64_t sv_as_u64_f64 (sv_f64_t x) { @@ -81,6 +87,12 @@ sv_as_f64_u64 (sv_u64_t x) return svreinterpret_f64_u64 (x); } +static inline sv_f64_t +sv_to_f64_s64_x (svbool_t pg, sv_s64_t s) +{ + return svcvt_f64_x (pg, s); +} + static inline sv_f64_t sv_call_f64 (f64_t (*f) (f64_t), sv_f64_t x, sv_f64_t y, svbool_t cmp) { @@ -113,6 +125,20 @@ sv_call2_f64 (f64_t (*f) (f64_t, f64_t), sv_f64_t x1, sv_f64_t x2, sv_f64_t y, return y; } +/* Load array of double into svfloat64_t. */ +static inline sv_f64_t +sv_lookup_f64_x (svbool_t pg, const f64_t *tab, sv_u64_t idx) +{ + return svld1_gather_u64index_f64 (pg, tab, idx); +} + +static inline sv_u64_t +sv_mod_n_u64_x (svbool_t pg, sv_u64_t x, u64_t y) +{ + sv_u64_t q = svdiv_n_u64_x (pg, x, y); + return svmls_n_u64_x (pg, x, q, y); +} + /* Single precision. */ static inline sv_s32_t sv_s32 (s32_t x) diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index fb48097..f89ee08 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -117,6 +117,9 @@ SVF (_ZGVsMxv_cosf, -3.1, 3.1) SVF (__sv_sinf_x, -3.1, 3.1) SVF (_ZGVsMxv_sinf, -3.1, 3.1) +SVD (__sv_log10_x, 0.01, 11.1) +SVD (_ZGVsMxv_log10, 0.01, 11.1) + SVD (__sv_cos_x, -3.1, 3.1) SVD (_ZGVsMxv_cos, -3.1, 3.1) SVD (__sv_sin_x, -3.1, 3.1) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 137b324..993b7a3 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -314,6 +314,15 @@ range_sve_atan2=' 1e6 1e32 40000 ' +range_sve_log10=' + -0.0 -0x1p126 100 + 0x1p-149 0x1p-126 4000 + 0x1p-126 0x1p-23 50000 + 0x1p-23 1.0 50000 + 1.0 100 50000 + 100 inf 50000 +' + # error limits L_erfc=3.7 L_erfcf=1.0 @@ -339,6 +348,7 @@ L_sve_atanf=2.9 L_sve_atan=2.5 L_sve_atan2f=3.0 L_sve_atan2=2.0 +L_sve_log10=2.5 while read G F R do @@ -433,6 +443,8 @@ sve_atan __sv_atan $runsv sve_atan _ZGVsMxv_atan $runsv sve_atan2 __sv_atan2 $runsv sve_atan2 _ZGVsMxvv_atan2 $runsv +sve_log10 __sv_log10 $runsv +sve_log10 _ZGVsMxv_log10 $runsv fi EOF diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index a53091b..695909f 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -90,6 +90,8 @@ SVF1 (cos) ZSVF1 (cos) SVD1 (cos) ZSVD1 (cos) +SVD1 (log10) +ZSVD1 (log10) SVF1 (sin) ZSVF1 (sin) SVD1 (sin) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 6a31f4a..886dfde 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -108,6 +108,12 @@ static double sv_cos(double x) { static double Z_sv_cos(double x) { return svretd(_ZGVsMxv_cos(svargd(x), svptrue_b64())); } +static double sv_log10(double x) { + return svretd(__sv_log10_x(svargd(x), svptrue_b64())); +} +static double 
Z_sv_log10(double x) { + return svretd(_ZGVsMxv_log10(svargd(x), svptrue_b64())); +} static double sv_sin(double x) { return svretd(__sv_sin_x(svargd(x), svptrue_b64())); } diff --git a/pl/math/v_log10.h b/pl/math/v_log10.h deleted file mode 100644 index 8564911..0000000 --- a/pl/math/v_log10.h +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Declarations for double-precision log10(x) vector function. - * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#if WANT_VMATH - -#define V_LOG10_TABLE_BITS 7 - -extern const struct v_log10_data -{ - f64_t invc; - f64_t log10c; -} __v_log10_data[1 << V_LOG10_TABLE_BITS] HIDDEN; - -#endif diff --git a/pl/math/v_log10_2u5.c b/pl/math/v_log10_2u5.c index 6991d4f..c34167f 100644 --- a/pl/math/v_log10_2u5.c +++ b/pl/math/v_log10_2u5.c @@ -5,27 +5,12 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "v_log10.h" #include "include/mathlib.h" #include "v_math.h" #if V_SUPPORTED -/* Constants used to switch from base e to base 10. */ -#define ivln10 v_f64 (0x1.bcb7b1526e50ep-2) -#define log10_2 v_f64 (0x1.34413509f79ffp-2) - -static const f64_t Poly[] = { - /* computed from log coeffs divided by log(10) in extended precision then - rounded to double precision. */ - -0x1.bcb7b1526e506p-3, 0x1.287a7636be1d1p-3, -0x1.bcb7b158af938p-4, - 0x1.63c78734e6d07p-4, -0x1.287461742fee4p-4, -}; - -#define A0 v_f64 (Poly[0]) -#define A1 v_f64 (Poly[1]) -#define A2 v_f64 (Poly[2]) -#define A3 v_f64 (Poly[3]) -#define A4 v_f64 (Poly[4]) +#define A(i) v_f64 (__v_log10_data.poly[i]) +#define T(s, i) __v_log10_data.tab[i].s #define Ln2 v_f64 (0x1.62e42fefa39efp-1) #define N (1 << V_LOG10_TABLE_BITS) #define OFF v_u64 (0x3fe6900900000000) @@ -41,13 +26,13 @@ lookup (v_u64_t i) { struct entry e; #ifdef SCALAR - e.invc = __v_log10_data[i].invc; - e.log10c = __v_log10_data[i].log10c; + e.invc = T (invc, i); + e.log10c = T (log10c, i); #else - e.invc[0] = __v_log10_data[i[0]].invc; - e.log10c[0] = __v_log10_data[i[0]].log10c; - e.invc[1] = __v_log10_data[i[1]].invc; - e.log10c[1] = __v_log10_data[i[1]].log10c; + e.invc[0] = T (invc, i[0]); + e.log10c[0] = T (log10c, i[0]); + e.invc[1] = T (invc, i[1]); + e.log10c[1] = T (log10c, i[1]); #endif return e; } @@ -94,16 +79,16 @@ v_f64_t V_NAME (log10) (v_f64_t x) /* hi = r / log(10) + log10(c) + k*log10(2). Constants in `v_log10_data.c` are computed (in extended precision) as e.log10c := e.logc * ivln10. */ - v_f64_t w = v_fma_f64 (r, ivln10, e.log10c); + v_f64_t w = v_fma_f64 (r, v_f64 (__v_log10_data.invln10), e.log10c); /* y = log10(1+r) + n * log10(2). */ - hi = v_fma_f64 (kd, log10_2, w); + hi = v_fma_f64 (kd, v_f64 (__v_log10_data.log10_2), w); /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. 
*/ r2 = r * r; - y = v_fma_f64 (A3, r, A2); - p = v_fma_f64 (A1, r, A0); - y = v_fma_f64 (A4, r2, y); + y = v_fma_f64 (A (3), r, A (2)); + p = v_fma_f64 (A (1), r, A (0)); + y = v_fma_f64 (A (4), r2, y); y = v_fma_f64 (y, r2, p); y = v_fma_f64 (y, r2, hi); diff --git a/pl/math/v_log10_data.c b/pl/math/v_log10_data.c index 7fdb519..d1db9a5 100644 --- a/pl/math/v_log10_data.c +++ b/pl/math/v_log10_data.c @@ -5,7 +5,7 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "v_log10.h" +#include "math_config.h" #define N (1 << V_LOG10_TABLE_BITS) @@ -25,133 +25,143 @@ floating point invc candidates around 1/center and selecting one for which the error in (double)log(c) is minimized (< 0x1p-74), except the subinterval that contains 1 and the previous one got tweaked to avoid cancellation. NB: invc should be optimized to minimize error in (double)log10(c) instead. */ -const struct v_log10_data __v_log10_data[N] = { - {0x1.6a133d0dec120p+0, -0x1.345825f221684p-3}, - {0x1.6815f2f3e42edp+0, -0x1.2f71a1f0c554ep-3}, - {0x1.661e39be1ac9ep+0, -0x1.2a91fdb30b1f4p-3}, - {0x1.642bfa30ac371p+0, -0x1.25b9260981a04p-3}, - {0x1.623f1d916f323p+0, -0x1.20e7081762193p-3}, - {0x1.60578da220f65p+0, -0x1.1c1b914aeefacp-3}, - {0x1.5e75349dea571p+0, -0x1.1756af5de404dp-3}, - {0x1.5c97fd387a75ap+0, -0x1.12985059c90bfp-3}, - {0x1.5abfd2981f200p+0, -0x1.0de0628f63df4p-3}, - {0x1.58eca051dc99cp+0, -0x1.092ed492e08eep-3}, - {0x1.571e526d9df12p+0, -0x1.0483954caf1dfp-3}, - {0x1.5554d555b3fcbp+0, -0x1.ffbd27a9adbcp-4}, - {0x1.539015e2a20cdp+0, -0x1.f67f7f2e3d1ap-4}, - {0x1.51d0014ee0164p+0, -0x1.ed4e1071ceebep-4}, - {0x1.50148538cd9eep+0, -0x1.e428bb47413c4p-4}, - {0x1.4e5d8f9f698a1p+0, -0x1.db0f6003028d6p-4}, - {0x1.4cab0edca66bep+0, -0x1.d201df6749831p-4}, - {0x1.4afcf1a9db874p+0, -0x1.c9001ac5c9672p-4}, - {0x1.495327136e16fp+0, -0x1.c009f3c78c79p-4}, - {0x1.47ad9e84af28fp+0, -0x1.b71f4cb642e53p-4}, - {0x1.460c47b39ae15p+0, -0x1.ae400818526b2p-4}, - {0x1.446f12b278001p+0, -0x1.a56c091954f87p-4}, - {0x1.42d5efdd720ecp+0, -0x1.9ca3332f096eep-4}, - {0x1.4140cfe001a0fp+0, -0x1.93e56a3f23e55p-4}, - {0x1.3fafa3b421f69p+0, -0x1.8b3292a3903bp-4}, - {0x1.3e225c9c8ece5p+0, -0x1.828a9112d9618p-4}, - {0x1.3c98ec29a211ap+0, -0x1.79ed4ac35f5acp-4}, - {0x1.3b13442a413fep+0, -0x1.715aa51ed28c4p-4}, - {0x1.399156baa3c54p+0, -0x1.68d2861c999e9p-4}, - {0x1.38131639b4cdbp+0, -0x1.6054d40ded21p-4}, - {0x1.36987540fbf53p+0, -0x1.57e17576bc9a2p-4}, - {0x1.352166b648f61p+0, -0x1.4f7851798bb0bp-4}, - {0x1.33adddb3eb575p+0, -0x1.47194f5690ae3p-4}, - {0x1.323dcd99fc1d3p+0, -0x1.3ec456d58ec47p-4}, - {0x1.30d129fefc7d2p+0, -0x1.36794ff3e5f55p-4}, - {0x1.2f67e6b72fe7dp+0, -0x1.2e382315725e4p-4}, - {0x1.2e01f7cf8b187p+0, -0x1.2600b8ed82e91p-4}, - {0x1.2c9f518ddc86ep+0, -0x1.1dd2fa85efc12p-4}, - {0x1.2b3fe86e5f413p+0, -0x1.15aed136e3961p-4}, - {0x1.29e3b1211b25cp+0, -0x1.0d94269d1a30dp-4}, - {0x1.288aa08b373cfp+0, -0x1.0582e4a7659f5p-4}, - {0x1.2734abcaa8467p+0, -0x1.faf5eb655742dp-5}, - {0x1.25e1c82459b81p+0, -0x1.eaf888487e8eep-5}, - {0x1.2491eb1ad59c5p+0, -0x1.db0d75ef25a82p-5}, - {0x1.23450a54048b5p+0, -0x1.cb348a49e6431p-5}, - {0x1.21fb1bb09e578p+0, -0x1.bb6d9c69acdd8p-5}, - {0x1.20b415346d8f7p+0, -0x1.abb88368aa7ap-5}, - {0x1.1f6fed179a1acp+0, -0x1.9c1517476af14p-5}, - {0x1.1e2e99b93c7b3p+0, -0x1.8c833051bfa4dp-5}, - {0x1.1cf011a7a882ap+0, -0x1.7d02a78e7fb31p-5}, - {0x1.1bb44b97dba5ap+0, -0x1.6d93565e97c5fp-5}, - {0x1.1a7b3e66cdd4fp+0, -0x1.5e351695db0c5p-5}, - {0x1.1944e11dc56cdp+0, -0x1.4ee7c2ba67adcp-5}, - 
{0x1.18112aebb1a6ep+0, -0x1.3fab35ba16c01p-5}, - {0x1.16e013231b7e9p+0, -0x1.307f4ad854bc9p-5}, - {0x1.15b1913f156cfp+0, -0x1.2163ddf4f988cp-5}, - {0x1.14859cdedde13p+0, -0x1.1258cb5d19e22p-5}, - {0x1.135c2dc68cfa4p+0, -0x1.035defdba3188p-5}, - {0x1.12353bdb01684p+0, -0x1.e8e651191bce4p-6}, - {0x1.1110bf25b85b4p+0, -0x1.cb30a62be444cp-6}, - {0x1.0feeafd2f8577p+0, -0x1.ad9a9b3043823p-6}, - {0x1.0ecf062c51c3bp+0, -0x1.9023ecda1ccdep-6}, - {0x1.0db1baa076c8bp+0, -0x1.72cc592bd82dp-6}, - {0x1.0c96c5bb3048ep+0, -0x1.55939eb1f9c6ep-6}, - {0x1.0b7e20263e070p+0, -0x1.38797ca6cc5ap-6}, - {0x1.0a67c2acd0ce3p+0, -0x1.1b7db35c2c072p-6}, - {0x1.0953a6391e982p+0, -0x1.fd400812ee9a2p-7}, - {0x1.0841c3caea380p+0, -0x1.c3c05fb4620f1p-7}, - {0x1.07321489b13eap+0, -0x1.8a7bf3c40e2e3p-7}, - {0x1.062491aee9904p+0, -0x1.517249c15a75cp-7}, - {0x1.05193497a7cc5p+0, -0x1.18a2ea5330c91p-7}, - {0x1.040ff6b5f5e9fp+0, -0x1.c01abc8cdc4e2p-8}, - {0x1.0308d19aa6127p+0, -0x1.4f6261750dec9p-8}, - {0x1.0203beedb0c67p+0, -0x1.be37b6612afa7p-9}, - {0x1.010037d38bcc2p+0, -0x1.bc3a8398ac26p-10}, - {1.0, 0.0}, - {0x1.fc06d493cca10p-1, 0x1.bb796219f30a5p-9}, - {0x1.f81e6ac3b918fp-1, 0x1.b984fdcba61cep-8}, - {0x1.f44546ef18996p-1, 0x1.49cf12adf8e8cp-7}, - {0x1.f07b10382c84bp-1, 0x1.b6075b5217083p-7}, - {0x1.ecbf7070e59d4p-1, 0x1.10b7466fc30ddp-6}, - {0x1.e91213f715939p-1, 0x1.4603e4db6a3a1p-6}, - {0x1.e572a9a75f7b7p-1, 0x1.7aeb10e99e105p-6}, - {0x1.e1e0e2c530207p-1, 0x1.af6e49b0f0e36p-6}, - {0x1.de5c72d8a8be3p-1, 0x1.e38f064f41179p-6}, - {0x1.dae50fa5658ccp-1, 0x1.0ba75abbb7623p-5}, - {0x1.d77a71145a2dap-1, 0x1.25575ee2dba86p-5}, - {0x1.d41c51166623ep-1, 0x1.3ed83f477f946p-5}, - {0x1.d0ca6ba0bb29fp-1, 0x1.582aa79af60efp-5}, - {0x1.cd847e8e59681p-1, 0x1.714f400fa83aep-5}, - {0x1.ca4a499693e00p-1, 0x1.8a46ad3901cb9p-5}, - {0x1.c71b8e399e821p-1, 0x1.a311903b6b87p-5}, - {0x1.c3f80faf19077p-1, 0x1.bbb086f216911p-5}, - {0x1.c0df92dc2b0ecp-1, 0x1.d4242bdda648ep-5}, - {0x1.bdd1de3cbb542p-1, 0x1.ec6d167c2af1p-5}, - {0x1.baceb9e1007a3p-1, 0x1.0245ed8221426p-4}, - {0x1.b7d5ef543e55ep-1, 0x1.0e40856c74f64p-4}, - {0x1.b4e749977d953p-1, 0x1.1a269a31120fep-4}, - {0x1.b20295155478ep-1, 0x1.25f8718fc076cp-4}, - {0x1.af279f8e82be2p-1, 0x1.31b64ffc95bfp-4}, - {0x1.ac5638197fdf3p-1, 0x1.3d60787ca5063p-4}, - {0x1.a98e2f102e087p-1, 0x1.48f72ccd187fdp-4}, - {0x1.a6cf5606d05c1p-1, 0x1.547aad6602f1cp-4}, - {0x1.a4197fc04d746p-1, 0x1.5feb3989d3acbp-4}, - {0x1.a16c80293dc01p-1, 0x1.6b490f3978c79p-4}, - {0x1.9ec82c4dc5bc9p-1, 0x1.76946b3f5e703p-4}, - {0x1.9c2c5a491f534p-1, 0x1.81cd895717c83p-4}, - {0x1.9998e1480b618p-1, 0x1.8cf4a4055c30ep-4}, - {0x1.970d9977c6c2dp-1, 0x1.9809f4c48c0ebp-4}, - {0x1.948a5c023d212p-1, 0x1.a30db3f9899efp-4}, - {0x1.920f0303d6809p-1, 0x1.ae001905458fcp-4}, - {0x1.8f9b698a98b45p-1, 0x1.b8e15a2e3a2cdp-4}, - {0x1.8d2f6b81726f6p-1, 0x1.c3b1ace2b0996p-4}, - {0x1.8acae5bb55badp-1, 0x1.ce71456edfa62p-4}, - {0x1.886db5d9275b8p-1, 0x1.d9205759882c4p-4}, - {0x1.8617ba567c13cp-1, 0x1.e3bf1513af0dfp-4}, - {0x1.83c8d27487800p-1, 0x1.ee4db0412c414p-4}, - {0x1.8180de3c5dbe7p-1, 0x1.f8cc5998de3a5p-4}, - {0x1.7f3fbe71cdb71p-1, 0x1.019da085eaeb1p-3}, - {0x1.7d055498071c1p-1, 0x1.06cd4acdb4e3dp-3}, - {0x1.7ad182e54f65ap-1, 0x1.0bf542bef813fp-3}, - {0x1.78a42c3c90125p-1, 0x1.11159f14da262p-3}, - {0x1.767d342f76944p-1, 0x1.162e761c10d1cp-3}, - {0x1.745c7ef26b00ap-1, 0x1.1b3fddc60d43ep-3}, - {0x1.7241f15769d0fp-1, 0x1.2049ebac86aa6p-3}, - {0x1.702d70d396e41p-1, 0x1.254cb4fb7836ap-3}, - {0x1.6e1ee3700cd11p-1, 0x1.2a484e8d0d252p-3}, - 
{0x1.6c162fc9cbe02p-1, 0x1.2f3ccce1c860bp-3}, +const struct v_log10_data __v_log10_data + = {.tab = {{0x1.6a133d0dec120p+0, -0x1.345825f221684p-3}, + {0x1.6815f2f3e42edp+0, -0x1.2f71a1f0c554ep-3}, + {0x1.661e39be1ac9ep+0, -0x1.2a91fdb30b1f4p-3}, + {0x1.642bfa30ac371p+0, -0x1.25b9260981a04p-3}, + {0x1.623f1d916f323p+0, -0x1.20e7081762193p-3}, + {0x1.60578da220f65p+0, -0x1.1c1b914aeefacp-3}, + {0x1.5e75349dea571p+0, -0x1.1756af5de404dp-3}, + {0x1.5c97fd387a75ap+0, -0x1.12985059c90bfp-3}, + {0x1.5abfd2981f200p+0, -0x1.0de0628f63df4p-3}, + {0x1.58eca051dc99cp+0, -0x1.092ed492e08eep-3}, + {0x1.571e526d9df12p+0, -0x1.0483954caf1dfp-3}, + {0x1.5554d555b3fcbp+0, -0x1.ffbd27a9adbcp-4}, + {0x1.539015e2a20cdp+0, -0x1.f67f7f2e3d1ap-4}, + {0x1.51d0014ee0164p+0, -0x1.ed4e1071ceebep-4}, + {0x1.50148538cd9eep+0, -0x1.e428bb47413c4p-4}, + {0x1.4e5d8f9f698a1p+0, -0x1.db0f6003028d6p-4}, + {0x1.4cab0edca66bep+0, -0x1.d201df6749831p-4}, + {0x1.4afcf1a9db874p+0, -0x1.c9001ac5c9672p-4}, + {0x1.495327136e16fp+0, -0x1.c009f3c78c79p-4}, + {0x1.47ad9e84af28fp+0, -0x1.b71f4cb642e53p-4}, + {0x1.460c47b39ae15p+0, -0x1.ae400818526b2p-4}, + {0x1.446f12b278001p+0, -0x1.a56c091954f87p-4}, + {0x1.42d5efdd720ecp+0, -0x1.9ca3332f096eep-4}, + {0x1.4140cfe001a0fp+0, -0x1.93e56a3f23e55p-4}, + {0x1.3fafa3b421f69p+0, -0x1.8b3292a3903bp-4}, + {0x1.3e225c9c8ece5p+0, -0x1.828a9112d9618p-4}, + {0x1.3c98ec29a211ap+0, -0x1.79ed4ac35f5acp-4}, + {0x1.3b13442a413fep+0, -0x1.715aa51ed28c4p-4}, + {0x1.399156baa3c54p+0, -0x1.68d2861c999e9p-4}, + {0x1.38131639b4cdbp+0, -0x1.6054d40ded21p-4}, + {0x1.36987540fbf53p+0, -0x1.57e17576bc9a2p-4}, + {0x1.352166b648f61p+0, -0x1.4f7851798bb0bp-4}, + {0x1.33adddb3eb575p+0, -0x1.47194f5690ae3p-4}, + {0x1.323dcd99fc1d3p+0, -0x1.3ec456d58ec47p-4}, + {0x1.30d129fefc7d2p+0, -0x1.36794ff3e5f55p-4}, + {0x1.2f67e6b72fe7dp+0, -0x1.2e382315725e4p-4}, + {0x1.2e01f7cf8b187p+0, -0x1.2600b8ed82e91p-4}, + {0x1.2c9f518ddc86ep+0, -0x1.1dd2fa85efc12p-4}, + {0x1.2b3fe86e5f413p+0, -0x1.15aed136e3961p-4}, + {0x1.29e3b1211b25cp+0, -0x1.0d94269d1a30dp-4}, + {0x1.288aa08b373cfp+0, -0x1.0582e4a7659f5p-4}, + {0x1.2734abcaa8467p+0, -0x1.faf5eb655742dp-5}, + {0x1.25e1c82459b81p+0, -0x1.eaf888487e8eep-5}, + {0x1.2491eb1ad59c5p+0, -0x1.db0d75ef25a82p-5}, + {0x1.23450a54048b5p+0, -0x1.cb348a49e6431p-5}, + {0x1.21fb1bb09e578p+0, -0x1.bb6d9c69acdd8p-5}, + {0x1.20b415346d8f7p+0, -0x1.abb88368aa7ap-5}, + {0x1.1f6fed179a1acp+0, -0x1.9c1517476af14p-5}, + {0x1.1e2e99b93c7b3p+0, -0x1.8c833051bfa4dp-5}, + {0x1.1cf011a7a882ap+0, -0x1.7d02a78e7fb31p-5}, + {0x1.1bb44b97dba5ap+0, -0x1.6d93565e97c5fp-5}, + {0x1.1a7b3e66cdd4fp+0, -0x1.5e351695db0c5p-5}, + {0x1.1944e11dc56cdp+0, -0x1.4ee7c2ba67adcp-5}, + {0x1.18112aebb1a6ep+0, -0x1.3fab35ba16c01p-5}, + {0x1.16e013231b7e9p+0, -0x1.307f4ad854bc9p-5}, + {0x1.15b1913f156cfp+0, -0x1.2163ddf4f988cp-5}, + {0x1.14859cdedde13p+0, -0x1.1258cb5d19e22p-5}, + {0x1.135c2dc68cfa4p+0, -0x1.035defdba3188p-5}, + {0x1.12353bdb01684p+0, -0x1.e8e651191bce4p-6}, + {0x1.1110bf25b85b4p+0, -0x1.cb30a62be444cp-6}, + {0x1.0feeafd2f8577p+0, -0x1.ad9a9b3043823p-6}, + {0x1.0ecf062c51c3bp+0, -0x1.9023ecda1ccdep-6}, + {0x1.0db1baa076c8bp+0, -0x1.72cc592bd82dp-6}, + {0x1.0c96c5bb3048ep+0, -0x1.55939eb1f9c6ep-6}, + {0x1.0b7e20263e070p+0, -0x1.38797ca6cc5ap-6}, + {0x1.0a67c2acd0ce3p+0, -0x1.1b7db35c2c072p-6}, + {0x1.0953a6391e982p+0, -0x1.fd400812ee9a2p-7}, + {0x1.0841c3caea380p+0, -0x1.c3c05fb4620f1p-7}, + {0x1.07321489b13eap+0, -0x1.8a7bf3c40e2e3p-7}, + {0x1.062491aee9904p+0, -0x1.517249c15a75cp-7}, + {0x1.05193497a7cc5p+0, 
-0x1.18a2ea5330c91p-7}, + {0x1.040ff6b5f5e9fp+0, -0x1.c01abc8cdc4e2p-8}, + {0x1.0308d19aa6127p+0, -0x1.4f6261750dec9p-8}, + {0x1.0203beedb0c67p+0, -0x1.be37b6612afa7p-9}, + {0x1.010037d38bcc2p+0, -0x1.bc3a8398ac26p-10}, + {1.0, 0.0}, + {0x1.fc06d493cca10p-1, 0x1.bb796219f30a5p-9}, + {0x1.f81e6ac3b918fp-1, 0x1.b984fdcba61cep-8}, + {0x1.f44546ef18996p-1, 0x1.49cf12adf8e8cp-7}, + {0x1.f07b10382c84bp-1, 0x1.b6075b5217083p-7}, + {0x1.ecbf7070e59d4p-1, 0x1.10b7466fc30ddp-6}, + {0x1.e91213f715939p-1, 0x1.4603e4db6a3a1p-6}, + {0x1.e572a9a75f7b7p-1, 0x1.7aeb10e99e105p-6}, + {0x1.e1e0e2c530207p-1, 0x1.af6e49b0f0e36p-6}, + {0x1.de5c72d8a8be3p-1, 0x1.e38f064f41179p-6}, + {0x1.dae50fa5658ccp-1, 0x1.0ba75abbb7623p-5}, + {0x1.d77a71145a2dap-1, 0x1.25575ee2dba86p-5}, + {0x1.d41c51166623ep-1, 0x1.3ed83f477f946p-5}, + {0x1.d0ca6ba0bb29fp-1, 0x1.582aa79af60efp-5}, + {0x1.cd847e8e59681p-1, 0x1.714f400fa83aep-5}, + {0x1.ca4a499693e00p-1, 0x1.8a46ad3901cb9p-5}, + {0x1.c71b8e399e821p-1, 0x1.a311903b6b87p-5}, + {0x1.c3f80faf19077p-1, 0x1.bbb086f216911p-5}, + {0x1.c0df92dc2b0ecp-1, 0x1.d4242bdda648ep-5}, + {0x1.bdd1de3cbb542p-1, 0x1.ec6d167c2af1p-5}, + {0x1.baceb9e1007a3p-1, 0x1.0245ed8221426p-4}, + {0x1.b7d5ef543e55ep-1, 0x1.0e40856c74f64p-4}, + {0x1.b4e749977d953p-1, 0x1.1a269a31120fep-4}, + {0x1.b20295155478ep-1, 0x1.25f8718fc076cp-4}, + {0x1.af279f8e82be2p-1, 0x1.31b64ffc95bfp-4}, + {0x1.ac5638197fdf3p-1, 0x1.3d60787ca5063p-4}, + {0x1.a98e2f102e087p-1, 0x1.48f72ccd187fdp-4}, + {0x1.a6cf5606d05c1p-1, 0x1.547aad6602f1cp-4}, + {0x1.a4197fc04d746p-1, 0x1.5feb3989d3acbp-4}, + {0x1.a16c80293dc01p-1, 0x1.6b490f3978c79p-4}, + {0x1.9ec82c4dc5bc9p-1, 0x1.76946b3f5e703p-4}, + {0x1.9c2c5a491f534p-1, 0x1.81cd895717c83p-4}, + {0x1.9998e1480b618p-1, 0x1.8cf4a4055c30ep-4}, + {0x1.970d9977c6c2dp-1, 0x1.9809f4c48c0ebp-4}, + {0x1.948a5c023d212p-1, 0x1.a30db3f9899efp-4}, + {0x1.920f0303d6809p-1, 0x1.ae001905458fcp-4}, + {0x1.8f9b698a98b45p-1, 0x1.b8e15a2e3a2cdp-4}, + {0x1.8d2f6b81726f6p-1, 0x1.c3b1ace2b0996p-4}, + {0x1.8acae5bb55badp-1, 0x1.ce71456edfa62p-4}, + {0x1.886db5d9275b8p-1, 0x1.d9205759882c4p-4}, + {0x1.8617ba567c13cp-1, 0x1.e3bf1513af0dfp-4}, + {0x1.83c8d27487800p-1, 0x1.ee4db0412c414p-4}, + {0x1.8180de3c5dbe7p-1, 0x1.f8cc5998de3a5p-4}, + {0x1.7f3fbe71cdb71p-1, 0x1.019da085eaeb1p-3}, + {0x1.7d055498071c1p-1, 0x1.06cd4acdb4e3dp-3}, + {0x1.7ad182e54f65ap-1, 0x1.0bf542bef813fp-3}, + {0x1.78a42c3c90125p-1, 0x1.11159f14da262p-3}, + {0x1.767d342f76944p-1, 0x1.162e761c10d1cp-3}, + {0x1.745c7ef26b00ap-1, 0x1.1b3fddc60d43ep-3}, + {0x1.7241f15769d0fp-1, 0x1.2049ebac86aa6p-3}, + {0x1.702d70d396e41p-1, 0x1.254cb4fb7836ap-3}, + {0x1.6e1ee3700cd11p-1, 0x1.2a484e8d0d252p-3}, + {0x1.6c162fc9cbe02p-1, 0x1.2f3ccce1c860bp-3}}, + + /* Computed from log coeffs div by log(10) then rounded to double + precision. */ + .poly + = {-0x1.bcb7b1526e506p-3, 0x1.287a7636be1d1p-3, -0x1.bcb7b158af938p-4, + 0x1.63c78734e6d07p-4, -0x1.287461742fee4p-4}, + + .invln10 = 0x1.bcb7b1526e50ep-2, + .log10_2 = 0x1.34413509f79ffp-2 + }; -- cgit v1.2.3 From 6e0ada9b7edd52693fe8a684863796b7a309d6fe Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Fri, 2 Sep 2022 10:42:05 +0100 Subject: pl/math: Add vector/SVE log10f New routine shares polynomial with Neon variant and is accurate to 3.5 ULP. 
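Both variants start from the same integer range reduction: subtracting the bit pattern of 2/3 (the Offset constant below) splits x into 2^n * (1+r) with 1+r in [2/3, 4/3), so the polynomial argument r stays in [-1/3, 1/3). A scalar C model of that reduction, valid for positive normal inputs only (special cases are filtered separately, as in the diff; the function name is invented for illustration):

#include <stdint.h>
#include <string.h>

/* Split positive normal x into x = 2^n * (1 + r), 1 + r in [2/3, 4/3).  */
static void
log_reduce (float x, int *n, float *r)
{
  uint32_t ix;
  memcpy (&ix, &x, sizeof ix);	/* bits of x */
  ix -= 0x3f2aaaab;		/* subtract the bits of ~2/3 (Offset) */
  *n = (int32_t) ix >> 23;	/* arithmetic shift sign-extends n */
  ix = (ix & 0x007fffff) + 0x3f2aaaab;
  float m;
  memcpy (&m, &ix, sizeof m);	/* m = 1 + r */
  *r = m - 1.0f;
  /* log10(x) is then approximated as (n*Ln2 + r)*InvLn10 + P(r).  */
}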
--- pl/math/include/mathlib.h | 2 ++ pl/math/math_config.h | 3 ++ pl/math/sv_log10f_3u5.c | 78 ++++++++++++++++++++++++++++++++++++++++++ pl/math/sv_math.h | 12 +++++++ pl/math/test/mathbench_funcs.h | 2 ++ pl/math/test/runulp.sh | 12 +++++++ pl/math/test/ulp_funcs.h | 2 ++ pl/math/test/ulp_wrappers.h | 6 ++++ pl/math/v_log10f_3u5.c | 27 ++++----------- pl/math/v_log10f_data.c | 13 +++++++ 10 files changed, 137 insertions(+), 20 deletions(-) create mode 100644 pl/math/sv_log10f_3u5.c create mode 100644 pl/math/v_log10f_data.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 0f004c0..4f3eba3 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -111,6 +111,7 @@ svfloat64_t __sv_atan_x (svfloat64_t, svbool_t); svfloat64_t __sv_atan2_x (svfloat64_t, svfloat64_t, svbool_t); svfloat32_t __sv_cosf_x (svfloat32_t, svbool_t); svfloat64_t __sv_cos_x (svfloat64_t, svbool_t); +svfloat32_t __sv_log10f_x (svfloat32_t, svbool_t); svfloat64_t __sv_log10_x (svfloat64_t, svbool_t); svfloat32_t __sv_sinf_x (svfloat32_t, svbool_t); svfloat64_t __sv_sin_x (svfloat64_t, svbool_t); @@ -121,6 +122,7 @@ svfloat64_t _ZGVsMxv_atan (svfloat64_t, svbool_t); svfloat64_t _ZGVsMxvv_atan2 (svfloat64_t, svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_cos (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_log10f (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_log10 (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_sinf (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_sin (svfloat64_t, svbool_t); diff --git a/pl/math/math_config.h b/pl/math/math_config.h index d70a38c..10bbd9b 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -508,4 +508,7 @@ extern const struct v_log10_data double invln10, log10_2; } __v_log10_data HIDDEN; +#define V_LOG10F_POLY_ORDER 9 +extern const float __v_log10f_poly[V_LOG10F_POLY_ORDER - 1] HIDDEN; + #endif diff --git a/pl/math/sv_log10f_3u5.c b/pl/math/sv_log10f_3u5.c new file mode 100644 index 0000000..fe8ecfd --- /dev/null +++ b/pl/math/sv_log10f_3u5.c @@ -0,0 +1,78 @@ +/* + * Single-precision SVE log10 function. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" + +#if SV_SUPPORTED + +#define SpecialCaseMin 0x00800000 +#define SpecialCaseMax 0x7f800000 +#define Offset 0x3f2aaaab /* 0.666667. */ +#define Mask 0x007fffff +#define Ln2 0x1.62e43p-1f /* 0x3f317218. */ +#define InvLn10 0x1.bcb7b2p-2f + +#define P(i) __v_log10f_poly[i] + +static NOINLINE sv_f32_t +special_case (sv_f32_t x, sv_f32_t y, svbool_t special) +{ + return sv_call_f32 (log10f, x, y, special); +} + +/* Optimised implementation of SVE log10f using the same algorithm and + polynomial as v_log10f. Maximum error is 3.31ulps: + __sv_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4 + want 0x1.ffe2f4p-4. */ +sv_f32_t +__sv_log10f_x (sv_f32_t x, const svbool_t pg) +{ + sv_u32_t ix = sv_as_u32_f32 (x); + svbool_t special_cases + = svcmpge_n_u32 (pg, svsub_n_u32_x (pg, ix, SpecialCaseMin), + SpecialCaseMax - SpecialCaseMin); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + ix = svsub_n_u32_x (pg, ix, Offset); + sv_f32_t n = sv_to_f32_s32_x (pg, svasr_n_s32_x (pg, sv_as_s32_u32 (ix), + 23)); /* signextend. */ + ix = svand_n_u32_x (pg, ix, Mask); + ix = svadd_n_u32_x (pg, ix, Offset); + sv_f32_t r = svsub_n_f32_x (pg, sv_as_f32_u32 (ix), 1.0f); + + /* y = log10(1+r) + n*log10(2) + log10(1+r) ~ r * InvLn(10) + P(r) + where P(r) is a polynomial. 
Use order 9 for log10(1+x), i.e. order 8 for + log10(1+x)/x, with x in [-1/3, 1/3] (offset=2/3) + + P(r) = r2 * (Q01 + r2 * (Q23 + r2 * (Q45 + r2 * Q67))) + and Qij = Pi + r * Pj. */ + sv_f32_t q12 = sv_fma_n_f32_x (pg, P (1), r, sv_f32 (P (0))); + sv_f32_t q34 = sv_fma_n_f32_x (pg, P (3), r, sv_f32 (P (2))); + sv_f32_t q56 = sv_fma_n_f32_x (pg, P (5), r, sv_f32 (P (4))); + sv_f32_t q78 = sv_fma_n_f32_x (pg, P (7), r, sv_f32 (P (6))); + + sv_f32_t r2 = svmul_f32_x (pg, r, r); + sv_f32_t y = sv_fma_f32_x (pg, q78, r2, q56); + y = sv_fma_f32_x (pg, y, r2, q34); + y = sv_fma_f32_x (pg, y, r2, q12); + + /* Using p = Log10(2)*n + r*InvLn(10) is slightly faster but less + accurate. */ + sv_f32_t p = sv_fma_n_f32_x (pg, Ln2, n, r); + y = sv_fma_f32_x (pg, y, r2, svmul_n_f32_x (pg, p, InvLn10)); + + if (unlikely (svptest_any (pg, special_cases))) + { + return special_case (x, y, special_cases); + } + return y; +} + +strong_alias (__sv_log10f_x, _ZGVsMxv_log10f) + +#endif diff --git a/pl/math/sv_math.h b/pl/math/sv_math.h index 4164faa..3b318f1 100644 --- a/pl/math/sv_math.h +++ b/pl/math/sv_math.h @@ -183,6 +183,18 @@ sv_as_f32_u32 (sv_u32_t x) return svreinterpret_f32_u32 (x); } +static inline sv_s32_t +sv_as_s32_u32 (sv_u32_t x) +{ + return svreinterpret_s32_u32 (x); +} + +static inline sv_f32_t +sv_to_f32_s32_x (svbool_t pg, sv_s32_t s) +{ + return svcvt_f32_x (pg, s); +} + static inline sv_f32_t sv_call_f32 (f32_t (*f) (f32_t), sv_f32_t x, sv_f32_t y, svbool_t cmp) { diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index f89ee08..076340e 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -117,6 +117,8 @@ SVF (_ZGVsMxv_cosf, -3.1, 3.1) SVF (__sv_sinf_x, -3.1, 3.1) SVF (_ZGVsMxv_sinf, -3.1, 3.1) +SVF (__sv_log10f_x, 0.01, 11.1) +SVF (_ZGVsMxv_log10f, 0.01, 11.1) SVD (__sv_log10_x, 0.01, 11.1) SVD (_ZGVsMxv_log10, 0.01, 11.1) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 993b7a3..3475adb 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -323,6 +323,15 @@ range_sve_log10=' 100 inf 50000 ' +range_sve_log10f=' + -0.0 -0x1p126 100 + 0x1p-149 0x1p-126 4000 + 0x1p-126 0x1p-23 50000 + 0x1p-23 1.0 50000 + 1.0 100 50000 + 100 inf 50000 +' + # error limits L_erfc=3.7 L_erfcf=1.0 @@ -349,6 +358,7 @@ L_sve_atan=2.5 L_sve_atan2f=3.0 L_sve_atan2=2.0 L_sve_log10=2.5 +L_sve_log10f=3.5 while read G F R do @@ -434,6 +444,8 @@ sve_atan2f __sv_atan2f $runsv sve_atan2f _ZGVsMxvv_atan2f $runsv sve_atanf __sv_atanf $runsv sve_atanf _ZGVsMxv_atanf $runsv +sve_log10f __sv_log10f $runsv +sve_log10f _ZGVsMxv_log10f $runsv sve_cos __sv_cos $runsv sve_cos _ZGVsMxv_cos $runsv diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 695909f..4b12a1a 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -90,6 +90,8 @@ SVF1 (cos) ZSVF1 (cos) SVD1 (cos) ZSVD1 (cos) +SVF1 (log10) +ZSVF1 (log10) SVD1 (log10) ZSVD1 (log10) SVF1 (sin) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 886dfde..4f11aec 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -83,6 +83,12 @@ static float sv_cosf(float x) { static float Z_sv_cosf(float x) { return svretf(_ZGVsMxv_cosf(svargf(x), svptrue_b32())); } +static float sv_log10f(float x) { + return svretf(__sv_log10f_x(svargf(x), svptrue_b32())); +} +static float Z_sv_log10f(float x) { + return svretf(_ZGVsMxv_log10f(svargf(x), svptrue_b32())); +} static float sv_sinf(float x) { return svretf(__sv_sinf_x(svargf(x), 
svptrue_b32())); } diff --git a/pl/math/v_log10f_3u5.c b/pl/math/v_log10f_3u5.c index c956d0c..4dede3d 100644 --- a/pl/math/v_log10f_3u5.c +++ b/pl/math/v_log10f_3u5.c @@ -9,22 +9,9 @@ #include "v_math.h" #if V_SUPPORTED -static const float Poly[] = { - /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in - [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */ - -0x1.bcb79cp-3f, 0x1.2879c8p-3f, -0x1.bcd472p-4f, 0x1.6408f8p-4f, - -0x1.246f8p-4f, 0x1.f0e514p-5f, -0x1.0fc92cp-4f, 0x1.f5f76ap-5f}; -#define P8 v_f32 (Poly[7]) -#define P7 v_f32 (Poly[6]) -#define P6 v_f32 (Poly[5]) -#define P5 v_f32 (Poly[4]) -#define P4 v_f32 (Poly[3]) -#define P3 v_f32 (Poly[2]) -#define P2 v_f32 (Poly[1]) -#define P1 v_f32 (Poly[0]) +#define P(i) v_f32 (__v_log10f_poly[i]) #define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218. */ -#define Log10_2 v_f32 (0x1.344136p-2f) #define InvLn10 v_f32 (0x1.bcb7b2p-2f) #define Min v_u32 (0x00800000) #define Max v_u32 (0x7f800000) @@ -64,12 +51,12 @@ v_f32_t V_NAME (log10f) (v_f32_t x) /* y = log10(1+r) + n*log10(2). */ r2 = r * r; - /* (n*ln2 + r)*InvLn10 + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + - r2*(P7+r*P8))). */ - o = v_fma_f32 (P8, r, P7); - p = v_fma_f32 (P6, r, P5); - q = v_fma_f32 (P4, r, P3); - y = v_fma_f32 (P2, r, P1); + /* (n*ln2 + r)*InvLn10 + r2*(P0 + r*P1 + r2*(P2 + r*P3 + r2*(P4 + r*P5 + + r2*(P6+r*P7))). */ + o = v_fma_f32 (P (7), r, P (6)); + p = v_fma_f32 (P (5), r, P (4)); + q = v_fma_f32 (P (3), r, P (2)); + y = v_fma_f32 (P (1), r, P (0)); p = v_fma_f32 (o, r2, p); q = v_fma_f32 (p, r2, q); y = v_fma_f32 (q, r2, y); diff --git a/pl/math/v_log10f_data.c b/pl/math/v_log10f_data.c new file mode 100644 index 0000000..c95f38b --- /dev/null +++ b/pl/math/v_log10f_data.c @@ -0,0 +1,13 @@ +/* + * Coefficients for single-precision vector log10 function. + * + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "math_config.h" + +const float __v_log10f_poly[] = { + /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in + [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */ + -0x1.bcb79cp-3f, 0x1.2879c8p-3f, -0x1.bcd472p-4f, 0x1.6408f8p-4f, + -0x1.246f8p-4f, 0x1.f0e514p-5f, -0x1.0fc92cp-4f, 0x1.f5f76ap-5f}; -- cgit v1.2.3 From 8c44c9bc2b4295ebe20b3b86b76eb0f4af996ef8 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Fri, 2 Sep 2022 11:52:49 +0100 Subject: Add more ulp helper macros All ulp_funcs entries use the same level of macro expansion. Also add some missing routines in mathlib.h and remove funcs which we do not supply from ulp_funcs. --- math/test/ulp.c | 21 ++++++++++ pl/math/include/mathlib.h | 3 +- pl/math/test/ulp_funcs.h | 99 ++++++++++++++++++++--------------------------- 3 files changed, 64 insertions(+), 59 deletions(-) diff --git a/math/test/ulp.c b/math/test/ulp.c index 6fdc395..24185a2 100644 --- a/math/test/ulp.c +++ b/math/test/ulp.c @@ -340,6 +340,27 @@ static const struct fun fun[] = { #define F2(x) F (x##f, x##f, x, mpfr_##x, 2, 1, f2, 0) #define D1(x) F (x, x, x##l, mpfr_##x, 1, 0, d1, 0) #define D2(x) F (x, x, x##l, mpfr_##x, 2, 0, d2, 0) +/* Neon routines. 
*/ +#define VF1(x) F (__v_##x##f, v_##x##f, x, mpfr_##x, 1, 1, f1, 0) +#define VF2(x) F (__v_##x##f, v_##x##f, x, mpfr_##x, 2, 1, f2, 0) +#define VD1(x) F (__v_##x, v_##x, x##l, mpfr_##x, 1, 0, d1, 0) +#define VD2(x) F (__v_##x, v_##x, x##l, mpfr_##x, 2, 0, d2, 0) +#define VNF1(x) F (__vn_##x##f, vn_##x##f, x, mpfr_##x, 1, 1, f1, 0) +#define VNF2(x) F (__vn_##x##f, vn_##x##f, x, mpfr_##x, 2, 1, f2, 0) +#define VND1(x) F (__vn_##x, vn_##x, x##l, mpfr_##x, 1, 0, d1, 0) +#define VND2(x) F (__vn_##x, vn_##x, x##l, mpfr_##x, 2, 0, d2, 0) +#define ZVF1(x) F (_ZGVnN4v_##x##f, Z_##x##f, x, mpfr_##x, 1, 1, f1, 0) +#define ZVF2(x) F (_ZGVnN4vv_##x##f, Z_##x##f, x, mpfr_##x, 2, 1, f2, 0) +#define ZVD1(x) F (_ZGVnN2v_##x, Z_##x, x##l, mpfr_##x, 1, 0, d1, 0) +#define ZVD2(x) F (_ZGVnN2vv_##x, Z_##x, x##l, mpfr_##x, 2, 0, d2, 0) +#define ZVNF1(x) VNF1 (x) ZVF1 (x) +#define ZVNF2(x) VNF2 (x) ZVF2 (x) +#define ZVND1(x) VND1 (x) ZVD1 (x) +#define ZVND2(x) VND2 (x) ZVD2 (x) +#define SF1(x) F (__s_##x##f, __s_##x##f, x, mpfr_##x, 1, 1, f1, 0) +#define SF2(x) F (__s_##x##f, __s_##x##f, x, mpfr_##x, 2, 1, f2, 0) +#define SD1(x) F (__s_##x, __s_##x, x##l, mpfr_##x, 1, 0, d1, 0) +#define SD2(x) F (__s_##x, __s_##x, x##l, mpfr_##x, 2, 0, d2, 0) /* SVE routines. */ #define SVF1(x) F (__sv_##x##f, sv_##x##f, x, mpfr_##x, 1, 1, f1, 0) #define SVF2(x) F (__sv_##x##f, sv_##x##f, x, mpfr_##x, 2, 1, f2, 0) diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 4f3eba3..0d6077d 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -15,13 +15,12 @@ float erfcf (float); float erff (float); float log10f (float); float log1pf (float); -float log2f (float); double asinh (double); double atan2 (double, double); +double erfc (double); double log10 (double); double log1p (double); -double log2 (double); float __s_asinhf (float); float __s_atanf (float); diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 4b12a1a..78fddac 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -15,66 +15,51 @@ D2 (atan2) D1 (erfc) D1 (log10) D1 (log1p) -D1 (log2) #if WANT_VMATH -F (__s_asinhf, __s_asinhf, asinh, mpfr_asinh, 1, 1, f1, 0) -F (__s_atanf, __s_atanf, atan, mpfr_atan, 1, 1, f1, 0) -F (__s_atan, __s_atan, atanl, mpfr_atan, 1, 0, d1, 0) -F (__s_atan2f, __s_atan2f, atan2, mpfr_atan2, 2, 1, f2, 0) -F (__s_atan2, __s_atan2, atan2l, mpfr_atan2, 2, 0, d2, 0) -F (__s_erff, __s_erff, erf, mpfr_erf, 1, 1, f1, 0) -F (__s_erf, __s_erf, erfl, mpfr_erf, 1, 0, d1, 0) -F (__s_erfcf, __s_erfcf, erfc, mpfr_erfc, 1, 1, f1, 0) -F (__s_erfc, __s_erfc, erfcl, mpfr_erfc, 1, 0, d1, 0) -F (__s_log10f, __s_log10f, log10, mpfr_log10, 1, 1, f1, 0) -F (__s_log10, __s_log10, log10l, mpfr_log10, 1, 0, d1, 0) -F (__s_log1pf, __s_log1pf, log1p, mpfr_log1p, 1, 1, f1, 0) -F (__s_log2f, __s_log2f, log2, mpfr_log2, 1, 1, f1, 0) -F (__s_log2, __s_log2, log2l, mpfr_log2, 1, 0, d1, 0) +SF1 (asinh) +SF1 (atan) +SD1 (atan) +SF2 (atan2) +SD2 (atan2) +SF1 (erf) +SD1 (erf) +SF1 (erfc) +SD1 (erfc) +SF1 (log10) +SD1 (log10) +SF1 (log1p) +SF1 (log2) +SD1 (log2) #if __aarch64__ -F (__v_asinhf, v_asinhf, asinh, mpfr_asinh, 1, 1, f1, 1) -F (__v_atanf, v_atanf, atan, mpfr_atan, 1, 1, f1, 1) -F (__v_atan, v_atan, atanl, mpfr_atan, 1, 0, d1, 1) -F (__v_atan2f, v_atan2f, atan2, mpfr_atan2, 2, 1, f2, 1) -F (__v_atan2, v_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1) -F (__v_erff, v_erff, erf, mpfr_erf, 1, 1, f1, 1) -F (__v_erf, v_erf, erfl, mpfr_erf, 1, 0, d1, 1) -F (__v_erfcf, v_erfcf, erfc, mpfr_erfc, 1, 1, f1, 1) -F 
(__v_erfc, v_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) -F (__v_log10f, v_log10f, log10, mpfr_log10, 1, 1, f1, 1) -F (__v_log10, v_log10, log10l, mpfr_log10, 1, 0, d1, 1) -F (__v_log1pf, v_log1pf, log1p, mpfr_log1p, 1, 1, f1, 1) -F (__v_log2f, v_log2f, log2, mpfr_log2, 1, 1, f1, 1) -F (__v_log2, v_log2, log2l, mpfr_log2, 1, 0, d1, 1) +VF1 (asinh) +VF1 (atan) +VD1 (atan) +VF2 (atan2) +VD2 (atan2) +VF1 (erf) +VD1 (erf) +VF1 (erfc) +VD1 (erfc) +VF1 (log10) +VD1 (log10) +VF1 (log1p) +VF1 (log2) +VD1 (log2) #ifdef __vpcs -F (__vn_asinhf, vn_asinhf, asinh, mpfr_asinh, 1, 1, f1, 1) -F (__vn_atanf, vn_atanf, atan, mpfr_atan, 1, 1, f1, 1) -F (__vn_atan, vn_atan, atanl, mpfr_atan, 1, 0, d1, 1) -F (__vn_atan2f, vn_atan2f, atan2, mpfr_atan2, 2, 1, f2, 1) -F (__vn_atan2, vn_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1) -F (__vn_erff, vn_erff, erf, mpfr_erf, 1, 1, f1, 1) -F (__vn_erf, vn_erf, erfl, mpfr_erf, 1, 0, d1, 1) -F (__vn_erfcf, vn_erfcf, erfc, mpfr_erfc, 1, 1, f1, 1) -F (__vn_erfc, vn_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) -F (__vn_log10f, vn_log10f, log10, mpfr_log10, 1, 1, f1, 1) -F (__vn_log10, vn_log10, log10l, mpfr_log10, 1, 0, d1, 1) -F (__vn_log1pf, vn_log1pf, log1p, mpfr_log1p, 1, 1, f1, 1) -F (__vn_log2f, vn_log2f, log2, mpfr_log2, 1, 1, f1, 1) -F (__vn_log2, vn_log2, log2l, mpfr_log2, 1, 0, d1, 1) -F (_ZGVnN4v_asinhf, Z_asinhf, asinh, mpfr_asinh, 1, 1, f1, 1) -F (_ZGVnN4v_atanf, Z_atanf, atan, mpfr_atan, 1, 1, f1, 1) -F (_ZGVnN2v_atan, Z_atan, atanl, mpfr_atan, 1, 0, d1, 1) -F (_ZGVnN4vv_atan2f, Z_atan2f, atan2, mpfr_atan2, 2, 1, f2, 1) -F (_ZGVnN2vv_atan2, Z_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1) -F (_ZGVnN4v_erff, Z_erff, erf, mpfr_erf, 1, 1, f1, 1) -F (_ZGVnN2v_erf, Z_erf, erfl, mpfr_erf, 1, 0, d1, 1) -F (_ZGVnN4v_erfcf, Z_erfcf, erfc, mpfr_erfc, 1, 1, f1, 1) -F (_ZGVnN2v_erfc, Z_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1) -F (_ZGVnN4v_log10f, Z_log10f, log10, mpfr_log10, 1, 1, f1, 1) -F (_ZGVnN2v_log10, Z_log10, log10l, mpfr_log10, 1, 0, d1, 1) -F (_ZGVnN4v_log1pf, Z_log1pf, log1p, mpfr_log1p, 1, 1, f1, 1) -F (_ZGVnN4v_log2f, Z_log2f, log2, mpfr_log2, 1, 1, f1, 1) -F (_ZGVnN2v_log2, Z_log2, log2l, mpfr_log2, 1, 0, d1, 1) +ZVNF1 (asinh) +ZVNF1 (atan) +ZVND1 (atan) +ZVNF2 (atan2) +ZVND2 (atan2) +ZVNF1 (erf) +ZVND1 (erf) +ZVNF1 (erfc) +ZVND1 (erfc) +ZVNF1 (log10) +ZVND1 (log10) +ZVNF1 (log1p) +ZVNF1 (log2) +ZVND1 (log2) #endif #endif #if WANT_SVE_MATH -- cgit v1.2.3 From 375526b26e3535091ae4bdba0a5703fa0d4a4bb7 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Tue, 6 Sep 2022 15:00:13 +0100 Subject: pl/math: Add helpers for ULP wrappers Makes adding new routines slightly easier. 
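The new entries rely on token-pasting macros, so each routine costs one invocation instead of a hand-written line per variant. A self-contained toy of the pattern (names here are invented for the demonstration, not taken from the test harness):

#include <math.h>
#include <stdio.h>

/* One invocation per routine generates a uniformly named shim.  */
#define WRAP_D1(func) \
  static double wrap_##func (double x) { return func (x); }

WRAP_D1 (log10)
WRAP_D1 (sin)

int
main (void)
{
  printf ("%g %g\n", wrap_log10 (100.0), wrap_sin (0.0));
  return 0;
}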
--- pl/math/test/ulp_wrappers.h | 176 ++++++++++++++++++-------------------------- 1 file changed, 73 insertions(+), 103 deletions(-) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 4f11aec..d275271 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -17,115 +17,85 @@ static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { } #endif +#define VF1_WRAP(func) static float v_##func##f(float x) { return __v_##func##f(argf(x))[0]; } +#define VF2_WRAP(func) static float v_##func##f(float x, float y) { return __v_##func##f(argf(x), argf(y))[0]; } +#define VD1_WRAP(func) static double v_##func(double x) { return __v_##func(argd(x))[0]; } +#define VD2_WRAP(func) static double v_##func(double x, double y) { return __v_##func(argd(x), argd(y))[0]; } + +#define VNF1_WRAP(func) static float vn_##func##f(float x) { return __vn_##func##f(argf(x))[0]; } +#define VNF2_WRAP(func) static float vn_##func##f(float x, float y) { return __vn_##func##f(argf(x), argf(y))[0]; } +#define VND1_WRAP(func) static double vn_##func(double x) { return __vn_##func(argd(x))[0]; } +#define VND2_WRAP(func) static double vn_##func(double x, double y) { return __vn_##func(argd(x), argd(y))[0]; } + +#define ZVF1_WRAP(func) static float Z_##func##f(float x) { return _ZGVnN4v_##func##f(argf(x))[0]; } +#define ZVF2_WRAP(func) static float Z_##func##f(float x, float y) { return _ZGVnN4vv_##func##f(argf(x), argf(y))[0]; } +#define ZVD1_WRAP(func) static double Z_##func(double x) { return _ZGVnN2v_##func(argd(x))[0]; } +#define ZVD2_WRAP(func) static double Z_##func(double x, double y) { return _ZGVnN2vv_##func(argd(x), argd(y))[0]; } + +#define ZVNF1_WRAP(func) VNF1_WRAP(func) ZVF1_WRAP(func) +#define ZVNF2_WRAP(func) VNF2_WRAP(func) ZVF2_WRAP(func) +#define ZVND1_WRAP(func) VND1_WRAP(func) ZVD1_WRAP(func) +#define ZVND2_WRAP(func) VND2_WRAP(func) ZVD2_WRAP(func) + +#define SVF1_WRAP(func) static float sv_##func##f(float x) { return svretf(__sv_##func##f_x(svargf(x), svptrue_b32())); } +#define SVF2_WRAP(func) static float sv_##func##f(float x, float y) { return svretf(__sv_##func##f_x(svargf(x), svargf(y), svptrue_b32())); } +#define SVD1_WRAP(func) static double sv_##func(double x) { return svretd(__sv_##func##_x(svargd(x), svptrue_b64())); } +#define SVD2_WRAP(func) static double sv_##func(double x, double y) { return svretd(__sv_##func##_x(svargd(x), svargd(y), svptrue_b64())); } + +#define ZSVF1_WRAP(func) static float Z_sv_##func##f(float x) { return svretf(_ZGVsMxv_##func##f(svargf(x), svptrue_b32())); } +#define ZSVF2_WRAP(func) static float Z_sv_##func##f(float x, float y) { return svretf(_ZGVsMxvv_##func##f(svargf(x), svargf(y), svptrue_b32())); } +#define ZSVD1_WRAP(func) static double Z_sv_##func(double x) { return svretd(_ZGVsMxv_##func(svargd(x), svptrue_b64())); } +#define ZSVD2_WRAP(func) static double Z_sv_##func(double x, double y) { return svretd(_ZGVsMxvv_##func(svargd(x), svargd(y), svptrue_b64())); } + +#define ZSVNF1_WRAP(func) SVF1_WRAP(func) ZSVF1_WRAP(func) +#define ZSVNF2_WRAP(func) SVF2_WRAP(func) ZSVF2_WRAP(func) +#define ZSVND1_WRAP(func) SVD1_WRAP(func) ZSVD1_WRAP(func) +#define ZSVND2_WRAP(func) SVD2_WRAP(func) ZSVD2_WRAP(func) + /* Wrappers for vector functions. 
*/ #if __aarch64__ && WANT_VMATH -static float v_asinhf(float x) { return __v_asinhf(argf(x))[0]; } -static float v_atanf(float x) { return __v_atanf(argf(x))[0]; } -static float v_atan2f(float x, float y) { return __v_atan2f(argf(x), argf(y))[0]; } -static float v_erff(float x) { return __v_erff(argf(x))[0]; } -static float v_erfcf(float x) { return __v_erfcf(argf(x))[0]; } -static float v_log10f(float x) { return __v_log10f(argf(x))[0]; } -static float v_log1pf(float x) { return __v_log1pf(argf(x))[0]; } -static float v_log2f(float x) { return __v_log2f(argf(x))[0]; } -static double v_atan(double x) { return __v_atan(argd(x))[0]; } -static double v_atan2(double x, double y) { return __v_atan2(argd(x), argd(y))[0]; } -static double v_erf(double x) { return __v_erf(argd(x))[0]; } -static double v_erfc(double x) { return __v_erfc(argd(x))[0]; } -static double v_log10(double x) { return __v_log10(argd(x))[0]; } -static double v_log2(double x) { return __v_log2(argd(x))[0]; } +VF1_WRAP(asinh) +VF1_WRAP(atan) +VF2_WRAP(atan2) +VF1_WRAP(erf) +VF1_WRAP(erfc) +VF1_WRAP(log10) +VF1_WRAP(log1p) +VF1_WRAP(log2) +VD1_WRAP(atan) +VD2_WRAP(atan2) +VD1_WRAP(erf) +VD1_WRAP(erfc) +VD1_WRAP(log10) +VD1_WRAP(log2) #ifdef __vpcs -static float vn_asinhf(float x) { return __vn_asinhf(argf(x))[0]; } -static float vn_atanf(float x) { return __vn_atanf(argf(x))[0]; } -static float vn_atan2f(float x, float y) { return __vn_atan2f(argf(x), argf(y))[0]; } -static float vn_erff(float x) { return __vn_erff(argf(x))[0]; } -static float vn_erfcf(float x) { return __vn_erfcf(argf(x))[0]; } -static float vn_log10f(float x) { return __vn_log10f(argf(x))[0]; } -static float vn_log1pf(float x) { return __vn_log1pf(argf(x))[0]; } -static float vn_log2f(float x) { return __vn_log2f(argf(x))[0]; } -static double vn_atan(double x) { return __vn_atan(argd(x))[0]; } -static double vn_atan2(double x, double y) { return __vn_atan2(argd(x), argd(y))[0]; } -static double vn_erf(double x) { return __vn_erf(argd(x))[0]; } -static double vn_erfc(double x) { return __vn_erfc(argd(x))[0]; } -static double vn_log10(double x) { return __vn_log10(argd(x))[0]; } -static double vn_log2(double x) { return __vn_log2(argd(x))[0]; } - -static float Z_asinhf(float x) { return _ZGVnN4v_asinhf(argf(x))[0]; } -static float Z_atanf(float x) { return _ZGVnN4v_atanf(argf(x))[0]; } -static float Z_atan2f(float x, float y) { return _ZGVnN4vv_atan2f(argf(x), argf(y))[0]; } -static float Z_erff(float x) { return _ZGVnN4v_erff(argf(x))[0]; } -static float Z_erfcf(float x) { return _ZGVnN4v_erfcf(argf(x))[0]; } -static float Z_log10f(float x) { return _ZGVnN4v_log10f(argf(x))[0]; } -static float Z_log1pf(float x) { return _ZGVnN4v_log1pf(argf(x))[0]; } -static float Z_log2f(float x) { return _ZGVnN4v_log2f(argf(x))[0]; } -static double Z_atan(double x) { return _ZGVnN2v_atan(argd(x))[0]; } -static double Z_atan2(double x, double y) { return _ZGVnN2vv_atan2(argd(x), argd(y))[0]; } -static double Z_erf(double x) { return _ZGVnN2v_erf(argd(x))[0]; } -static double Z_erfc(double x) { return _ZGVnN2v_erfc(argd(x))[0]; } -static double Z_log10(double x) { return _ZGVnN2v_log10(argd(x))[0]; } -static double Z_log2(double x) { return _ZGVnN2v_log2(argd(x))[0]; } +ZVNF1_WRAP(asinh) +ZVNF1_WRAP(atan) +ZVNF2_WRAP(atan2) +ZVNF1_WRAP(erf) +ZVNF1_WRAP(erfc) +ZVNF1_WRAP(log10) +ZVNF1_WRAP(log1p) +ZVNF1_WRAP(log2) +ZVND1_WRAP(atan) +ZVND2_WRAP(atan2) +ZVND1_WRAP(erf) +ZVND1_WRAP(erfc) +ZVND1_WRAP(log10) +ZVND1_WRAP(log2) #endif #if WANT_SVE_MATH -static float sv_atan2f(float 
x, float y) { - return svretf(__sv_atan2f_x(svargf(x), svargf(y), svptrue_b32())); -} -static float Z_sv_atan2f(float x, float y) { - return svretf(_ZGVsMxvv_atan2f(svargf(x), svargf(y), svptrue_b32())); -} -static float sv_atanf(float x) { - return svretf(__sv_atanf_x(svargf(x), svptrue_b32())); -} -static float Z_sv_atanf(float x) { - return svretf(_ZGVsMxv_atanf(svargf(x), svptrue_b32())); -} -static float sv_cosf(float x) { - return svretf(__sv_cosf_x(svargf(x), svptrue_b32())); -} -static float Z_sv_cosf(float x) { - return svretf(_ZGVsMxv_cosf(svargf(x), svptrue_b32())); -} -static float sv_log10f(float x) { - return svretf(__sv_log10f_x(svargf(x), svptrue_b32())); -} -static float Z_sv_log10f(float x) { - return svretf(_ZGVsMxv_log10f(svargf(x), svptrue_b32())); -} -static float sv_sinf(float x) { - return svretf(__sv_sinf_x(svargf(x), svptrue_b32())); -} -static float Z_sv_sinf(float x) { - return svretf(_ZGVsMxv_sinf(svargf(x), svptrue_b32())); -} +ZSVNF2_WRAP(atan2) +ZSVNF1_WRAP(atan) +ZSVNF1_WRAP(cos) +ZSVNF1_WRAP(log10) +ZSVNF1_WRAP(sin) -static double sv_atan2(double x, double y) { - return svretd(__sv_atan2_x(svargd(x), svargd(y), svptrue_b64())); -} -static double Z_sv_atan2(double x, double y) { - return svretd(_ZGVsMxvv_atan2(svargd(x), svargd(y), svptrue_b64())); -} -static double sv_atan(double x) { - return svretd(__sv_atan_x(svargd(x), svptrue_b64())); -} -static double Z_sv_atan(double x) { - return svretd(_ZGVsMxv_atan(svargd(x), svptrue_b64())); -} -static double sv_cos(double x) { - return svretd(__sv_cos_x(svargd(x), svptrue_b64())); -} -static double Z_sv_cos(double x) { - return svretd(_ZGVsMxv_cos(svargd(x), svptrue_b64())); -} -static double sv_log10(double x) { - return svretd(__sv_log10_x(svargd(x), svptrue_b64())); -} -static double Z_sv_log10(double x) { - return svretd(_ZGVsMxv_log10(svargd(x), svptrue_b64())); -} -static double sv_sin(double x) { - return svretd(__sv_sin_x(svargd(x), svptrue_b64())); -} -static double Z_sv_sin(double x) { - return svretd(_ZGVsMxv_sin(svargd(x), svptrue_b64())); -} +ZSVND2_WRAP(atan2) +ZSVND1_WRAP(atan) +ZSVND1_WRAP(cos) +ZSVND1_WRAP(log10) +ZSVND1_WRAP(sin) #endif #endif // clang-format on -- cgit v1.2.3 From 251a040bd28ee232a63480b4036346b81995047e Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Tue, 6 Sep 2022 15:00:22 +0100 Subject: pl/math: Add vector/SVE logf New routine is SVE port of the Neon algorithm in math/, and is accurate to 3.4 ULP. 
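For illustration, the reduction and evaluation scheme shared with the Neon
routine can be sketched in scalar C as below. This is a sketch only: the
special-case dispatch is omitted, the function name is invented, and the
coefficients are the ones added in sv_logf_data.c.

#include <stdint.h>
#include <string.h>

static const float P[] = { -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, -0x1.4f9934p-3f,
			   0x1.961348p-3f,  -0x1.00187cp-2f, 0x1.555d7cp-2f,
			   -0x1.ffffc8p-2f };

static float
sv_logf_sketch (float x)
{
  uint32_t u;
  memcpy (&u, &x, sizeof (u));
  /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3: subtracting Off = asuint (2/3)
     leaves n in the exponent field.  */
  u -= 0x3f2aaaab;
  float n = (int32_t) u >> 23; /* Sign-extend.  */
  u = (u & 0x007fffff) + 0x3f2aaaab;
  float r;
  memcpy (&r, &u, sizeof (r));
  r -= 1.0f;
  /* log(x) = n*ln2 + r + r2*(P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1
     + r2*P0))).  */
  float r2 = r * r;
  float y = P[6] + r * P[5]
	    + r2 * (P[4] + r * P[3] + r2 * (P[2] + r * P[1] + r2 * P[0]));
  return n * 0x1.62e43p-1f + (r + r2 * y);
}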
--- pl/math/include/mathlib.h | 2 ++ pl/math/math_config.h | 3 ++ pl/math/sv_logf_3u4.c | 66 ++++++++++++++++++++++++++++++++++++++++++ pl/math/sv_logf_data.c | 12 ++++++++ pl/math/test/mathbench_funcs.h | 3 ++ pl/math/test/runulp.sh | 12 ++++++++ pl/math/test/ulp_funcs.h | 2 ++ pl/math/test/ulp_wrappers.h | 1 + 8 files changed, 101 insertions(+) create mode 100644 pl/math/sv_logf_3u4.c create mode 100644 pl/math/sv_logf_data.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 0d6077d..1765c9d 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -110,6 +110,7 @@ svfloat64_t __sv_atan_x (svfloat64_t, svbool_t); svfloat64_t __sv_atan2_x (svfloat64_t, svfloat64_t, svbool_t); svfloat32_t __sv_cosf_x (svfloat32_t, svbool_t); svfloat64_t __sv_cos_x (svfloat64_t, svbool_t); +svfloat32_t __sv_logf_x (svfloat32_t, svbool_t); svfloat32_t __sv_log10f_x (svfloat32_t, svbool_t); svfloat64_t __sv_log10_x (svfloat64_t, svbool_t); svfloat32_t __sv_sinf_x (svfloat32_t, svbool_t); @@ -121,6 +122,7 @@ svfloat64_t _ZGVsMxv_atan (svfloat64_t, svbool_t); svfloat64_t _ZGVsMxvv_atan2 (svfloat64_t, svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_cos (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_logf (svfloat32_t, svbool_t); svfloat32_t _ZGVsMxv_log10f (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_log10 (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_sinf (svfloat32_t, svbool_t); diff --git a/pl/math/math_config.h b/pl/math/math_config.h index 10bbd9b..5e22349 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -511,4 +511,7 @@ extern const struct v_log10_data #define V_LOG10F_POLY_ORDER 9 extern const float __v_log10f_poly[V_LOG10F_POLY_ORDER - 1] HIDDEN; +#define SV_LOGF_POLY_ORDER 8 +extern const float __sv_logf_poly[SV_LOGF_POLY_ORDER - 1] HIDDEN; + #endif diff --git a/pl/math/sv_logf_3u4.c b/pl/math/sv_logf_3u4.c new file mode 100644 index 0000000..125f806 --- /dev/null +++ b/pl/math/sv_logf_3u4.c @@ -0,0 +1,66 @@ +/* + * Single-precision vector log function. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#if SV_SUPPORTED + +#define P(i) __sv_logf_poly[i] + +#define Ln2 (0x1.62e43p-1f) /* 0x3f317218 */ +#define Min (0x00800000) +#define Max (0x7f800000) +#define Mask (0x007fffff) +#define Off (0x3f2aaaab) /* 0.666667 */ + +float +optr_aor_log_f32 (float); + +static NOINLINE sv_f32_t +__sv_logf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp) +{ + return sv_call_f32 (optr_aor_log_f32, x, y, cmp); +} + +/* Optimised implementation of SVE logf, using the same algorithm and polynomial + as the Neon routine in math/. Maximum error is 3.34 ULPs: + __sv_logf(0x1.557298p+0) got 0x1.26edecp-2 + want 0x1.26ede6p-2. */ +sv_f32_t +__sv_logf_x (sv_f32_t x, const svbool_t pg) +{ + sv_u32_t u = sv_as_u32_f32 (x); + svbool_t cmp + = svcmpge_u32 (pg, svsub_n_u32_x (pg, u, Min), sv_u32 (Max - Min)); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + u = svsub_n_u32_x (pg, u, Off); + sv_f32_t n = sv_to_f32_s32_x (pg, svasr_n_s32_x (pg, sv_as_s32_u32 (u), + 23)); /* Sign-extend. */ + u = svand_n_u32_x (pg, u, Mask); + u = svadd_n_u32_x (pg, u, Off); + sv_f32_t r = svsub_n_f32_x (pg, sv_as_f32_u32 (u), 1.0f); + + /* y = log(1+r) + n*ln2. */ + sv_f32_t r2 = svmul_f32_x (pg, r, r); + /* n*ln2 + r + r2*(P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1 + r2*P0))). 
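   Evaluated below as three short FMA chains (p, q, y) that are combined at
   the end, which shortens the dependency chain compared with Horner.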
*/ + sv_f32_t p = sv_fma_n_f32_x (pg, P (1), r, sv_f32 (P (2))); + sv_f32_t q = sv_fma_n_f32_x (pg, P (3), r, sv_f32 (P (4))); + sv_f32_t y = sv_fma_n_f32_x (pg, P (5), r, sv_f32 (P (6))); + p = sv_fma_n_f32_x (pg, P (0), r2, p); + q = sv_fma_f32_x (pg, p, r2, q); + y = sv_fma_f32_x (pg, q, r2, y); + p = sv_fma_n_f32_x (pg, Ln2, n, r); + y = sv_fma_f32_x (pg, y, r2, p); + + if (unlikely (svptest_any (pg, cmp))) + return __sv_logf_specialcase (x, y, cmp); + return y; +} + +strong_alias (__sv_logf_x, _ZGVsMxv_logf) + +#endif // SV_SUPPORTED diff --git a/pl/math/sv_logf_data.c b/pl/math/sv_logf_data.c new file mode 100644 index 0000000..0082ee3 --- /dev/null +++ b/pl/math/sv_logf_data.c @@ -0,0 +1,12 @@ +/* + * Coefficients for single-precision SVE log function. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +const float __sv_logf_poly[] = { + /* Copied from coeffs for the Neon routine in math/. */ + -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, -0x1.4f9934p-3f, 0x1.961348p-3f, + -0x1.00187cp-2f, 0x1.555d7cp-2f, -0x1.ffffc8p-2f, +}; diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 076340e..1ecd1d4 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -117,6 +117,9 @@ SVF (_ZGVsMxv_cosf, -3.1, 3.1) SVF (__sv_sinf_x, -3.1, 3.1) SVF (_ZGVsMxv_sinf, -3.1, 3.1) +SVF (__sv_logf_x, 0.01, 11.1) +SVF (_ZGVsMxv_logf, 0.01, 11.1) + SVF (__sv_log10f_x, 0.01, 11.1) SVF (_ZGVsMxv_log10f, 0.01, 11.1) SVD (__sv_log10_x, 0.01, 11.1) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 3475adb..656d7a7 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -332,6 +332,15 @@ range_sve_log10f=' 100 inf 50000 ' +range_sve_logf=' + -0.0 -0x1p126 100 + 0x1p-149 0x1p-126 4000 + 0x1p-126 0x1p-23 50000 + 0x1p-23 1.0 50000 + 1.0 100 50000 + 100 inf 50000 +' + # error limits L_erfc=3.7 L_erfcf=1.0 @@ -359,6 +368,7 @@ L_sve_atan2f=3.0 L_sve_atan2=2.0 L_sve_log10=2.5 L_sve_log10f=3.5 +L_sve_logf=3.5 while read G F R do @@ -446,6 +456,8 @@ sve_atanf __sv_atanf $runsv sve_atanf _ZGVsMxv_atanf $runsv sve_log10f __sv_log10f $runsv sve_log10f _ZGVsMxv_log10f $runsv +sve_logf __sv_logf $runsv +sve_logf _ZGVsMxv_logf $runsv sve_cos __sv_cos $runsv sve_cos _ZGVsMxv_cos $runsv diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 78fddac..01d7bd1 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -75,6 +75,8 @@ SVF1 (cos) ZSVF1 (cos) SVD1 (cos) ZSVD1 (cos) +SVF1 (log) +ZSVF1 (log) SVF1 (log10) ZSVF1 (log10) SVD1 (log10) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index d275271..07fa077 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -88,6 +88,7 @@ ZVND1_WRAP(log2) ZSVNF2_WRAP(atan2) ZSVNF1_WRAP(atan) ZSVNF1_WRAP(cos) +ZSVNF1_WRAP(log) ZSVNF1_WRAP(log10) ZSVNF1_WRAP(sin) -- cgit v1.2.3 From f37547083381c221bb454d7c70f435be8bd2d3bc Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Tue, 6 Sep 2022 15:00:36 +0100 Subject: pl/math: Add vector/SVE log New routine is an SVE port of the Neon algorithm, and is accurate to 2.5 ulp. 
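The table-driven reduction at the core of the routine can be sketched in
scalar C as below. Illustration only: invc and logc stand for the 128-entry
tables added in sv_log_data.c, the polynomial correction and special-case
handling are omitted, and the function name is invented.

#include <stdint.h>
#include <string.h>

extern const double invc[128], logc[128]; /* Tables from sv_log_data.c.  */

static double
sv_log_hi_sketch (double x)
{
  /* x = 2^k * z, z in [OFF, 2*OFF) and exact; the subinterval index i
     comes from the top SV_LOG_TABLE_BITS bits of the mantissa.  */
  uint64_t ix;
  memcpy (&ix, &x, sizeof (ix));
  uint64_t tmp = ix - 0x3fe6900900000000; /* OFF.  */
  uint64_t i = (tmp >> (52 - 7)) & 127;   /* N = 128.  */
  int64_t k = (int64_t) tmp >> 52;	  /* Arithmetic shift.  */
  uint64_t iz = ix - (tmp & (0xfffULL << 52));
  double z;
  memcpy (&z, &iz, sizeof (z));
  /* log(x) ~ (z/c - 1) + log(c) + k*ln2; the routine refines the first
     term with a short polynomial in r = z/c - 1.  */
  double r = z * invc[i] - 1.0;
  return r + logc[i] + (double) k * 0x1.62e42fefa39efp-1;
}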
--- pl/math/include/mathlib.h | 2 + pl/math/math_config.h | 9 +++ pl/math/sv_log_2u5.c | 74 +++++++++++++++++++++ pl/math/sv_log_data.c | 146 +++++++++++++++++++++++++++++++++++++++++ pl/math/test/mathbench_funcs.h | 2 + pl/math/test/runulp.sh | 12 ++++ pl/math/test/ulp_funcs.h | 2 + pl/math/test/ulp_wrappers.h | 1 + 8 files changed, 248 insertions(+) create mode 100644 pl/math/sv_log_2u5.c create mode 100644 pl/math/sv_log_data.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 1765c9d..e2d36c6 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -111,6 +111,7 @@ svfloat64_t __sv_atan2_x (svfloat64_t, svfloat64_t, svbool_t); svfloat32_t __sv_cosf_x (svfloat32_t, svbool_t); svfloat64_t __sv_cos_x (svfloat64_t, svbool_t); svfloat32_t __sv_logf_x (svfloat32_t, svbool_t); +svfloat64_t __sv_log_x (svfloat64_t, svbool_t); svfloat32_t __sv_log10f_x (svfloat32_t, svbool_t); svfloat64_t __sv_log10_x (svfloat64_t, svbool_t); svfloat32_t __sv_sinf_x (svfloat32_t, svbool_t); @@ -123,6 +124,7 @@ svfloat64_t _ZGVsMxvv_atan2 (svfloat64_t, svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_cos (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_logf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_log (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_log10f (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_log10 (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_sinf (svfloat32_t, svbool_t); diff --git a/pl/math/math_config.h b/pl/math/math_config.h index 5e22349..9f902a7 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -514,4 +514,13 @@ extern const float __v_log10f_poly[V_LOG10F_POLY_ORDER - 1] HIDDEN; #define SV_LOGF_POLY_ORDER 8 extern const float __sv_logf_poly[SV_LOGF_POLY_ORDER - 1] HIDDEN; +#define SV_LOG_POLY_ORDER 6 +#define SV_LOG_TABLE_BITS 7 +extern const struct sv_log_data +{ + double invc[1 << SV_LOG_TABLE_BITS]; + double logc[1 << SV_LOG_TABLE_BITS]; + double poly[SV_LOG_POLY_ORDER - 1]; +} __sv_log_data HIDDEN; + #endif diff --git a/pl/math/sv_log_2u5.c b/pl/math/sv_log_2u5.c new file mode 100644 index 0000000..c10299c --- /dev/null +++ b/pl/math/sv_log_2u5.c @@ -0,0 +1,74 @@ +/* + * Double-precision SVE log(x) function. + * + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#if SV_SUPPORTED + +#define A(i) __sv_log_data.poly[i] +#define Ln2 (0x1.62e42fefa39efp-1) +#define N (1 << SV_LOG_TABLE_BITS) +#define OFF (0x3fe6900900000000) + +double +optr_aor_log_f64 (double); + +static NOINLINE sv_f64_t +__sv_log_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp) +{ + return sv_call_f64 (optr_aor_log_f64, x, y, cmp); +} + +/* SVE port of Neon log algorithm from math/. + Maximum measured error is 2.17 ulp: + __sv_log(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2 + want 0x1.ffffff1cca045p-2. */ +sv_f64_t +__sv_log_x (sv_f64_t x, const svbool_t pg) +{ + sv_u64_t ix = sv_as_u64_f64 (x); + sv_u64_t top = svlsr_n_u64_x (pg, ix, 48); + svbool_t cmp = svcmpge_u64 (pg, svsub_n_u64_x (pg, top, 0x0010), + sv_u64 (0x7ff0 - 0x0010)); + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + sv_u64_t tmp = svsub_n_u64_x (pg, ix, OFF); + /* Equivalent to (tmp >> (52 - SV_LOG_TABLE_BITS)) % N, since N is a power + of 2. 
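     The AND with N - 1 below therefore selects the top SV_LOG_TABLE_BITS
     bits of the mantissa as the table index.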
*/ + sv_u64_t i + = svand_n_u64_x (pg, svlsr_n_u64_x (pg, tmp, (52 - SV_LOG_TABLE_BITS)), + N - 1); + sv_s64_t k + = svasr_n_s64_x (pg, sv_as_s64_u64 (tmp), 52); /* Arithmetic shift. */ + sv_u64_t iz = svsub_u64_x (pg, ix, svand_n_u64_x (pg, tmp, 0xfffULL << 52)); + sv_f64_t z = sv_as_f64_u64 (iz); + /* Lookup in 2 global lists (length N). */ + sv_f64_t invc = sv_lookup_f64_x (pg, __sv_log_data.invc, i); + sv_f64_t logc = sv_lookup_f64_x (pg, __sv_log_data.logc, i); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ + sv_f64_t r = sv_fma_f64_x (pg, z, invc, sv_f64 (-1.0)); + sv_f64_t kd = sv_to_f64_s64_x (pg, k); + /* hi = r + log(c) + k*Ln2. */ + sv_f64_t hi = sv_fma_n_f64_x (pg, Ln2, kd, svadd_f64_x (pg, logc, r)); + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ + sv_f64_t r2 = svmul_f64_x (pg, r, r); + sv_f64_t y = sv_fma_n_f64_x (pg, A (3), r, sv_f64 (A (2))); + sv_f64_t p = sv_fma_n_f64_x (pg, A (1), r, sv_f64 (A (0))); + y = sv_fma_n_f64_x (pg, A (4), r2, y); + y = sv_fma_f64_x (pg, y, r2, p); + y = sv_fma_f64_x (pg, y, r2, hi); + + if (unlikely (svptest_any (pg, cmp))) + return __sv_log_specialcase (x, y, cmp); + return y; +} + +strong_alias (__sv_log_x, _ZGVsMxv_log); + +#endif // SV_SUPPORTED diff --git a/pl/math/sv_log_data.c b/pl/math/sv_log_data.c new file mode 100644 index 0000000..a544a69 --- /dev/null +++ b/pl/math/sv_log_data.c @@ -0,0 +1,146 @@ +/* + * Coefficients for double-precision SVE log(x) function. + * + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +const struct sv_log_data __sv_log_data = { + /* All coefficients and table entries are copied from the Neon routine in + math/. See math/v_log_data.c for an explanation of the algorithm. 
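     For each of the N = 128 subintervals, invc[i] approximates 1/c and
     logc[i] approximates log(c), where c is near the subinterval's center.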
*/ + + .invc = {0x1.6a133d0dec120p+0, 0x1.6815f2f3e42edp+0, + 0x1.661e39be1ac9ep+0, 0x1.642bfa30ac371p+0, + 0x1.623f1d916f323p+0, 0x1.60578da220f65p+0, + 0x1.5e75349dea571p+0, 0x1.5c97fd387a75ap+0, + 0x1.5abfd2981f200p+0, 0x1.58eca051dc99cp+0, + 0x1.571e526d9df12p+0, 0x1.5554d555b3fcbp+0, + 0x1.539015e2a20cdp+0, 0x1.51d0014ee0164p+0, + 0x1.50148538cd9eep+0, 0x1.4e5d8f9f698a1p+0, + 0x1.4cab0edca66bep+0, 0x1.4afcf1a9db874p+0, + 0x1.495327136e16fp+0, 0x1.47ad9e84af28fp+0, + 0x1.460c47b39ae15p+0, 0x1.446f12b278001p+0, + 0x1.42d5efdd720ecp+0, 0x1.4140cfe001a0fp+0, + 0x1.3fafa3b421f69p+0, 0x1.3e225c9c8ece5p+0, + 0x1.3c98ec29a211ap+0, 0x1.3b13442a413fep+0, + 0x1.399156baa3c54p+0, 0x1.38131639b4cdbp+0, + 0x1.36987540fbf53p+0, 0x1.352166b648f61p+0, + 0x1.33adddb3eb575p+0, 0x1.323dcd99fc1d3p+0, + 0x1.30d129fefc7d2p+0, 0x1.2f67e6b72fe7dp+0, + 0x1.2e01f7cf8b187p+0, 0x1.2c9f518ddc86ep+0, + 0x1.2b3fe86e5f413p+0, 0x1.29e3b1211b25cp+0, + 0x1.288aa08b373cfp+0, 0x1.2734abcaa8467p+0, + 0x1.25e1c82459b81p+0, 0x1.2491eb1ad59c5p+0, + 0x1.23450a54048b5p+0, 0x1.21fb1bb09e578p+0, + 0x1.20b415346d8f7p+0, 0x1.1f6fed179a1acp+0, + 0x1.1e2e99b93c7b3p+0, 0x1.1cf011a7a882ap+0, + 0x1.1bb44b97dba5ap+0, 0x1.1a7b3e66cdd4fp+0, + 0x1.1944e11dc56cdp+0, 0x1.18112aebb1a6ep+0, + 0x1.16e013231b7e9p+0, 0x1.15b1913f156cfp+0, + 0x1.14859cdedde13p+0, 0x1.135c2dc68cfa4p+0, + 0x1.12353bdb01684p+0, 0x1.1110bf25b85b4p+0, + 0x1.0feeafd2f8577p+0, 0x1.0ecf062c51c3bp+0, + 0x1.0db1baa076c8bp+0, 0x1.0c96c5bb3048ep+0, + 0x1.0b7e20263e070p+0, 0x1.0a67c2acd0ce3p+0, + 0x1.0953a6391e982p+0, 0x1.0841c3caea380p+0, + 0x1.07321489b13eap+0, 0x1.062491aee9904p+0, + 0x1.05193497a7cc5p+0, 0x1.040ff6b5f5e9fp+0, + 0x1.0308d19aa6127p+0, 0x1.0203beedb0c67p+0, + 0x1.010037d38bcc2p+0, 1.0, + 0x1.fc06d493cca10p-1, 0x1.f81e6ac3b918fp-1, + 0x1.f44546ef18996p-1, 0x1.f07b10382c84bp-1, + 0x1.ecbf7070e59d4p-1, 0x1.e91213f715939p-1, + 0x1.e572a9a75f7b7p-1, 0x1.e1e0e2c530207p-1, + 0x1.de5c72d8a8be3p-1, 0x1.dae50fa5658ccp-1, + 0x1.d77a71145a2dap-1, 0x1.d41c51166623ep-1, + 0x1.d0ca6ba0bb29fp-1, 0x1.cd847e8e59681p-1, + 0x1.ca4a499693e00p-1, 0x1.c71b8e399e821p-1, + 0x1.c3f80faf19077p-1, 0x1.c0df92dc2b0ecp-1, + 0x1.bdd1de3cbb542p-1, 0x1.baceb9e1007a3p-1, + 0x1.b7d5ef543e55ep-1, 0x1.b4e749977d953p-1, + 0x1.b20295155478ep-1, 0x1.af279f8e82be2p-1, + 0x1.ac5638197fdf3p-1, 0x1.a98e2f102e087p-1, + 0x1.a6cf5606d05c1p-1, 0x1.a4197fc04d746p-1, + 0x1.a16c80293dc01p-1, 0x1.9ec82c4dc5bc9p-1, + 0x1.9c2c5a491f534p-1, 0x1.9998e1480b618p-1, + 0x1.970d9977c6c2dp-1, 0x1.948a5c023d212p-1, + 0x1.920f0303d6809p-1, 0x1.8f9b698a98b45p-1, + 0x1.8d2f6b81726f6p-1, 0x1.8acae5bb55badp-1, + 0x1.886db5d9275b8p-1, 0x1.8617ba567c13cp-1, + 0x1.83c8d27487800p-1, 0x1.8180de3c5dbe7p-1, + 0x1.7f3fbe71cdb71p-1, 0x1.7d055498071c1p-1, + 0x1.7ad182e54f65ap-1, 0x1.78a42c3c90125p-1, + 0x1.767d342f76944p-1, 0x1.745c7ef26b00ap-1, + 0x1.7241f15769d0fp-1, 0x1.702d70d396e41p-1, + 0x1.6e1ee3700cd11p-1, 0x1.6c162fc9cbe02p-1}, + + .logc = {-0x1.62fe995eb963ap-2, -0x1.5d5a48dad6b67p-2, + -0x1.57bde257d2769p-2, -0x1.52294fbf2af55p-2, + -0x1.4c9c7b598aa38p-2, -0x1.47174fc5ff560p-2, + -0x1.4199b7fa7b5cap-2, -0x1.3c239f48cfb99p-2, + -0x1.36b4f154d2aebp-2, -0x1.314d9a0ff32fbp-2, + -0x1.2bed85cca3cffp-2, -0x1.2694a11421af9p-2, + -0x1.2142d8d014fb2p-2, -0x1.1bf81a2c77776p-2, + -0x1.16b452a39c6a4p-2, -0x1.11776ffa6c67ep-2, + -0x1.0c416035020e0p-2, -0x1.071211aa10fdap-2, + -0x1.01e972e293b1bp-2, -0x1.f98ee587fd434p-3, + -0x1.ef5800ad716fbp-3, -0x1.e52e160484698p-3, + -0x1.db1104b19352ep-3, -0x1.d100ac59e0bd6p-3, + -0x1.c6fced287c3bdp-3, 
-0x1.bd05a7b317c29p-3, + -0x1.b31abd229164fp-3, -0x1.a93c0edadb0a3p-3, + -0x1.9f697ee30d7ddp-3, -0x1.95a2efa9aa40ap-3, + -0x1.8be843d796044p-3, -0x1.82395ecc477edp-3, + -0x1.7896240966422p-3, -0x1.6efe77aca8c55p-3, + -0x1.65723e117ec5cp-3, -0x1.5bf15c0955706p-3, + -0x1.527bb6c111da1p-3, -0x1.491133c939f8fp-3, + -0x1.3fb1b90c7fc58p-3, -0x1.365d2cc485f8dp-3, + -0x1.2d13758970de7p-3, -0x1.23d47a721fd47p-3, + -0x1.1aa0229f25ec2p-3, -0x1.117655ddebc3bp-3, + -0x1.0856fbf83ab6bp-3, -0x1.fe83fabbaa106p-4, + -0x1.ec6e8507a56cdp-4, -0x1.da6d68c7cc2eap-4, + -0x1.c88078462be0cp-4, -0x1.b6a786a423565p-4, + -0x1.a4e2676ac7f85p-4, -0x1.9330eea777e76p-4, + -0x1.8192f134d5ad9p-4, -0x1.70084464f0538p-4, + -0x1.5e90bdec5cb1fp-4, -0x1.4d2c3433c5536p-4, + -0x1.3bda7e219879ap-4, -0x1.2a9b732d27194p-4, + -0x1.196eeb2b10807p-4, -0x1.0854be8ef8a7ep-4, + -0x1.ee998cb277432p-5, -0x1.ccadb79919fb9p-5, + -0x1.aae5b1d8618b0p-5, -0x1.89413015d7442p-5, + -0x1.67bfe7bf158dep-5, -0x1.46618f83941bep-5, + -0x1.2525df1b0618ap-5, -0x1.040c8e2f77c6ap-5, + -0x1.c62aad39f738ap-6, -0x1.847fe3bdead9cp-6, + -0x1.43183683400acp-6, -0x1.01f31c4e1d544p-6, + -0x1.82201d1e6b69ap-7, -0x1.00dd0f3e1bfd6p-7, + -0x1.ff6fe1feb4e53p-9, 0.0, + 0x1.fe91885ec8e20p-8, 0x1.fc516f716296dp-7, + 0x1.7bb4dd70a015bp-6, 0x1.f84c99b34b674p-6, + 0x1.39f9ce4fb2d71p-5, 0x1.7756c0fd22e78p-5, + 0x1.b43ee82db8f3ap-5, 0x1.f0b3fced60034p-5, + 0x1.165bd78d4878ep-4, 0x1.3425d2715ebe6p-4, + 0x1.51b8bd91b7915p-4, 0x1.6f15632c76a47p-4, + 0x1.8c3c88ecbe503p-4, 0x1.a92ef077625dap-4, + 0x1.c5ed5745fa006p-4, 0x1.e27876de1c993p-4, + 0x1.fed104fce4cdcp-4, 0x1.0d7bd9c17d78bp-3, + 0x1.1b76986cef97bp-3, 0x1.295913d24f750p-3, + 0x1.37239fa295d17p-3, 0x1.44d68dd78714bp-3, + 0x1.52722ebe5d780p-3, 0x1.5ff6d12671f98p-3, + 0x1.6d64c2389484bp-3, 0x1.7abc4da40fddap-3, + 0x1.87fdbda1e8452p-3, 0x1.95295b06a5f37p-3, + 0x1.a23f6d34abbc5p-3, 0x1.af403a28e04f2p-3, + 0x1.bc2c06a85721ap-3, 0x1.c903161240163p-3, + 0x1.d5c5aa93287ebp-3, 0x1.e274051823fa9p-3, + 0x1.ef0e656300c16p-3, 0x1.fb9509f05aa2ap-3, + 0x1.04041821f37afp-2, 0x1.0a340a49b3029p-2, + 0x1.105a7918a126dp-2, 0x1.1677819812b84p-2, + 0x1.1c8b405b40c0ep-2, 0x1.2295d16cfa6b1p-2, + 0x1.28975066318a2p-2, 0x1.2e8fd855d86fcp-2, + 0x1.347f83d605e59p-2, 0x1.3a666d1244588p-2, + 0x1.4044adb6f8ec4p-2, 0x1.461a5f077558cp-2, + 0x1.4be799e20b9c8p-2, 0x1.51ac76a6b79dfp-2, + 0x1.57690d5744a45p-2, 0x1.5d1d758e45217p-2}, + + .poly = {-0x1.ffffffffffff7p-2, 0x1.55555555170d4p-2, -0x1.0000000399c27p-2, + 0x1.999b2e90e94cap-3, -0x1.554e550bd501ep-3}, +}; diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 1ecd1d4..b625555 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -119,6 +119,8 @@ SVF (_ZGVsMxv_sinf, -3.1, 3.1) SVF (__sv_logf_x, 0.01, 11.1) SVF (_ZGVsMxv_logf, 0.01, 11.1) +SVD (__sv_log_x, 0.01, 11.1) +SVD (_ZGVsMxv_log, 0.01, 11.1) SVF (__sv_log10f_x, 0.01, 11.1) SVF (_ZGVsMxv_log10f, 0.01, 11.1) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 656d7a7..ebcdb36 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -341,6 +341,15 @@ range_sve_logf=' 100 inf 50000 ' +range_sve_log=' + -0.0 -0x1p126 100 + 0x1p-149 0x1p-126 4000 + 0x1p-126 0x1p-23 50000 + 0x1p-23 1.0 50000 + 1.0 100 50000 + 100 inf 50000 +' + # error limits L_erfc=3.7 L_erfcf=1.0 @@ -369,6 +378,7 @@ L_sve_atan2=2.0 L_sve_log10=2.5 L_sve_log10f=3.5 L_sve_logf=3.5 +L_sve_log=2.5 while read G F R do @@ -469,6 +479,8 @@ sve_atan2 __sv_atan2 $runsv sve_atan2 _ZGVsMxvv_atan2 $runsv sve_log10 
__sv_log10 $runsv sve_log10 _ZGVsMxv_log10 $runsv +sve_log __sv_log $runsv +sve_log _ZGVsMxv_log $runsv fi EOF diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 01d7bd1..7ab804f 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -77,6 +77,8 @@ SVD1 (cos) ZSVD1 (cos) SVF1 (log) ZSVF1 (log) +SVD1 (log) +ZSVD1 (log) SVF1 (log10) ZSVF1 (log10) SVD1 (log10) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 07fa077..0ceef13 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -95,6 +95,7 @@ ZSVNF1_WRAP(sin) ZSVND2_WRAP(atan2) ZSVND1_WRAP(atan) ZSVND1_WRAP(cos) +ZSVND1_WRAP(log) ZSVND1_WRAP(log10) ZSVND1_WRAP(sin) #endif -- cgit v1.2.3 From 9bd7555a40657155a36f012ef08eb094d7549091 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Wed, 14 Sep 2022 16:27:12 +0100 Subject: Audit ULP limits in runulp.sh We have not been consistent in using the pre- or post-rounding ULP estimate here. We now use the pre-rounding value. There are also a handful of inaccuracies which are probably due to running ULP against different versions of GLIBC. A few routines have been updated because of this - going forward we will compare against GLIBC 2.31. The bounds in runulp have mostly been tightened. --- pl/math/erfc_4u5.c | 7 ++-- pl/math/s_atan_2u5.c | 6 ++++ pl/math/s_atan_3u.c | 6 ---- pl/math/s_log2_2u5.c | 6 ---- pl/math/s_log2_3u.c | 6 ++++ pl/math/test/runulp.sh | 67 ++++++++++++++++++------------------- pl/math/v_atan_2u5.c | 50 ++++++++++++++++++++++++++++ pl/math/v_atan_3u.c | 51 ----------------------------- pl/math/v_log2_2u5.c | 89 -------------------------------------------------- pl/math/v_log2_3u.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++ pl/math/vn_atan_2u5.c | 12 +++++++ pl/math/vn_atan_3u.c | 12 ------- pl/math/vn_log2_2u5.c | 12 ------- pl/math/vn_log2_3u.c | 12 +++++++ 14 files changed, 212 insertions(+), 213 deletions(-) create mode 100644 pl/math/s_atan_2u5.c delete mode 100644 pl/math/s_atan_3u.c delete mode 100644 pl/math/s_log2_2u5.c create mode 100644 pl/math/s_log2_3u.c create mode 100644 pl/math/v_atan_2u5.c delete mode 100644 pl/math/v_atan_3u.c delete mode 100644 pl/math/v_log2_2u5.c create mode 100644 pl/math/v_log2_3u.c create mode 100644 pl/math/vn_atan_2u5.c delete mode 100644 pl/math/vn_atan_3u.c delete mode 100644 pl/math/vn_log2_2u5.c create mode 100644 pl/math/vn_log2_3u.c diff --git a/pl/math/erfc_4u5.c b/pl/math/erfc_4u5.c index 810da82..7e0e813 100644 --- a/pl/math/erfc_4u5.c +++ b/pl/math/erfc_4u5.c @@ -117,10 +117,9 @@ top32 (double x) /* Fast erfc implementation. The approximation uses polynomial approximation of exp(x^2) * erfc(x) with fixed orders on 20 intervals. - Maximum measured error is 4.37 ULPs in [1.2281, 1.2282]. - erfc(0x1.3a64c308e7789p+0) got 0x1.519b08721640cp-4 - want 0x1.519b087216408p-4 - -0.367612 ulp err 3.86761. */ + Maximum measured error is 4.05 ULPs:. + erfc(0x1.e8ee8c87064ap-2) got 0x1.ff81b0d2dc2e6p-2 + want 0x1.ff81b0d2dc2eap-2. */ double erfc (double x) { diff --git a/pl/math/s_atan_2u5.c b/pl/math/s_atan_2u5.c new file mode 100644 index 0000000..b6b746a --- /dev/null +++ b/pl/math/s_atan_2u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2021-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_atan_2u5.c" diff --git a/pl/math/s_atan_3u.c b/pl/math/s_atan_3u.c deleted file mode 100644 index 1cdc4ed..0000000 --- a/pl/math/s_atan_3u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2021-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_atan_3u.c" diff --git a/pl/math/s_log2_2u5.c b/pl/math/s_log2_2u5.c deleted file mode 100644 index f5e8e4d..0000000 --- a/pl/math/s_log2_2u5.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_log2_2u5.c" diff --git a/pl/math/s_log2_3u.c b/pl/math/s_log2_3u.c new file mode 100644 index 0000000..913c825 --- /dev/null +++ b/pl/math/s_log2_3u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_log2_3u.c" diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index ebcdb36..8a9a5ec 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -51,13 +51,13 @@ t log10f 0x1p-26 0x1p3 50000 t log10f 0x1p-4 0x1p4 50000 t log10f 0 inf 50000 -L=1.15 +L=1.11 Ldir= t log10 0 0xffff000000000000 10000 t log10 0x1p-4 0x1p4 40000 t log10 0 inf 40000 -L=3.5 +L=3.55 t erfc 0 0xffff0000 10000 t erfc 0x1p-1022 0x1p-26 40000 t erfc -0x1p-1022 -0x1p-26 40000 @@ -66,7 +66,7 @@ t erfc -0x1p-26 -0x1p3 40000 t erfc 0 inf 40000 Ldir=0.5 -L=1.45 +L=1.5 t erfcf 0 0xffff0000 10000 t erfcf 0x1p-127 0x1p-26 40000 t erfcf -0x1p-127 -0x1p-26 40000 @@ -74,27 +74,27 @@ t erfcf 0x1p-26 0x1p5 40000 t erfcf -0x1p-26 -0x1p3 40000 t erfcf 0 inf 40000 -L=2.0 +L=1.5 t atan2 -10.0 10.0 50000 t atan2 -1.0 1.0 40000 t atan2 0.0 1.0 40000 t atan2 1.0 100.0 40000 t atan2 1e6 1e32 40000 -L=3.0 +L=2.4 t atan2f -10.0 10.0 50000 t atan2f -1.0 1.0 40000 t atan2f 0.0 1.0 40000 t atan2f 1.0 100.0 40000 t atan2f 1e6 1e32 40000 -L=3.0 +L=2.9 t asinhf 0 0x1p-12 5000 t asinhf 0x1p-12 1.0 50000 t asinhf 1.0 0x1p11 50000 t asinhf 0x1p11 0x1p127 20000 -L=2.0 +L=1.51 t asinh -0x1p-26 0x1p-26 50000 t asinh 0x1p-26 1.0 40000 t asinh -0x1p-26 -1.0 10000 @@ -103,7 +103,7 @@ t asinh -1.0 -100.0 10000 t asinh 100.0 inf 50000 t asinh -100.0 -inf 10000 -L=2.0 +L=1.18 t log1p -10.0 10.0 10000 t log1p 0.0 0x1p-23 50000 t log1p 0x1p-23 0.001 50000 @@ -113,7 +113,7 @@ t log1p -0x1p-23 -0.001 50000 t log1p -0.001 -1.0 50000 t log1p -1.0 inf 5000 -L=2.0 +L=1.52 t log1pf -10.0 10.0 10000 t log1pf 0.0 0x1p-23 50000 t log1pf 0x1p-23 0.001 50000 @@ -351,34 +351,35 @@ range_sve_log=' ' # error limits -L_erfc=3.7 -L_erfcf=1.0 -L_log10=1.16 +L_erfc=3.11 +L_erfcf=0.26 +L_log10=1.97 L_log10f=2.81 -L_erf=1.76 -L_erff=1.5 +L_erf=1.26 +L_erff=0.76 +# TODO tighten this once __v_atan2 is fixed L_atan2=2.9 -L_atan=3.0 -L_atan2f=3.0 -L_atanf=3.0 -L_log1pf=2.0 -L_asinhf=2.2 -L_log2f=2.6 -# TODO tighten log2 bound -L_log2=3 - -L_sve_cosf=1.6 -L_sve_cos=2.0 -L_sve_sinf=1.9 -L_sve_sin=2.0 +L_atan=2.15 +L_atan2f=2.46 +L_atanf=2.5 +L_log1pf=1.53 +L_asinhf=2.17 +L_log2f=2.10 +L_log2=2.09 + +L_sve_cosf=1.57 +L_sve_cos=1.61 +L_sve_sinf=1.40 +L_sve_sin=1.46 L_sve_atanf=2.9 -L_sve_atan=2.5 -L_sve_atan2f=3.0 +L_sve_atan=1.7 +L_sve_atan2f=2.45 +# TODO tighten this once __sv_atan2 is fixed L_sve_atan2=2.0 -L_sve_log10=2.5 -L_sve_log10f=3.5 -L_sve_logf=3.5 -L_sve_log=2.5 +L_sve_log10=1.97 +L_sve_log10f=2.82 +L_sve_logf=2.85 +L_sve_log=1.68 
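+# Limits are pre-rounding ULP estimates, audited against GLIBC 2.31.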
while read G F R do diff --git a/pl/math/v_atan_2u5.c b/pl/math/v_atan_2u5.c new file mode 100644 index 0000000..619bbb9 --- /dev/null +++ b/pl/math/v_atan_2u5.c @@ -0,0 +1,50 @@ +/* + * Double-precision vector atan(x) function. + * + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#if V_SUPPORTED + +#include "atan_common.h" + +#define PiOver2 v_f64 (0x1.921fb54442d18p+0) +#define AbsMask v_u64 (0x7fffffffffffffff) + +/* Fast implementation of vector atan. + Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using + z=1/x and shift = pi/2. Maximum observed error is 2.14 ulps: + __v_atan(-0x1.02eac6432cb9ap+0) got -0x1.95063e76724c1p-1 + want -0x1.95063e76724c3p-1. */ +VPCS_ATTR +v_f64_t V_NAME (atan) (v_f64_t x) +{ + /* No need to trigger special case. Small cases, infs and nans + are supported by our approximation technique. */ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t sign = ix & ~AbsMask; + + /* Argument reduction: + y := arctan(x) for x < 1 + y := pi/2 + arctan(-1/x) for x > 1 + Hence, use z=-1/a if x>=1, otherwise z=a. */ + v_u64_t red = v_cagt_f64 (x, v_f64 (1.0)); + /* Avoid dependency in abs(x) in division (and comparison). */ + v_f64_t z = v_sel_f64 (red, v_div_f64 (v_f64 (-1.0), x), x); + v_f64_t shift = v_sel_f64 (red, PiOver2, v_f64 (0.0)); + /* Use absolute value only when needed (odd powers of z). */ + v_f64_t az = v_abs_f64 (z); + az = v_sel_f64 (red, -az, az); + + /* Calculate the polynomial approximation. */ + v_f64_t y = eval_poly (z, az, shift); + + /* y = atan(x) if x>0, -atan(-x) otherwise. */ + y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign); + + return y; +} +VPCS_ALIAS +#endif diff --git a/pl/math/v_atan_3u.c b/pl/math/v_atan_3u.c deleted file mode 100644 index bf11399..0000000 --- a/pl/math/v_atan_3u.c +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Double-precision vector atan(x) function. - * - * Copyright (c) 2021-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#if V_SUPPORTED - -#include "atan_common.h" - -#define PiOver2 v_f64 (0x1.921fb54442d18p+0) -#define AbsMask v_u64 (0x7fffffffffffffff) - -/* Fast implementation of vector atan. - Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using - z=1/x and shift = pi/2. Maximum observed error is 3.0 ulps, in - [0x1.00e766b50e9f2p+0, 0x1.00e78cab70984p+0]: - v_atan(0x1.00e76c0e723e4p+0) got 0x1.9306b8d822418p-1 - want 0x1.9306b8d82241bp-1. */ -VPCS_ATTR -v_f64_t V_NAME (atan) (v_f64_t x) -{ - /* No need to trigger special case. Small cases, infs and nans - are supported by our approximation technique. */ - v_u64_t ix = v_as_u64_f64 (x); - v_u64_t sign = ix & ~AbsMask; - - /* Argument reduction: - y := arctan(x) for x < 1 - y := pi/2 + arctan(-1/x) for x > 1 - Hence, use z=-1/a if x>=1, otherwise z=a. */ - v_u64_t red = v_cagt_f64 (x, v_f64 (1.0)); - /* Avoid dependency in abs(x) in division (and comparison). */ - v_f64_t z = v_sel_f64 (red, v_div_f64 (v_f64 (-1.0), x), x); - v_f64_t shift = v_sel_f64 (red, PiOver2, v_f64 (0.0)); - /* Use absolute value only when needed (odd powers of z). */ - v_f64_t az = v_abs_f64 (z); - az = v_sel_f64 (red, -az, az); - - /* Calculate the polynomial approximation. */ - v_f64_t y = eval_poly (z, az, shift); - - /* y = atan(x) if x>0, -atan(-x) otherwise. 
*/ - y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign); - - return y; -} -VPCS_ALIAS -#endif diff --git a/pl/math/v_log2_2u5.c b/pl/math/v_log2_2u5.c deleted file mode 100644 index 5b1014c..0000000 --- a/pl/math/v_log2_2u5.c +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Double-precision vector log2 function. - * - * Copyright (c) 2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "include/mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -#define InvLn2 v_f64 (0x1.71547652b82fep0) -#define N (1 << V_LOG2_TABLE_BITS) -#define OFF v_u64 (0x3fe6900900000000) -#define P(i) v_f64 (__v_log2_data.poly[i]) - -struct entry -{ - v_f64_t invc; - v_f64_t log2c; -}; - -static inline struct entry -lookup (v_u64_t i) -{ - struct entry e; -#ifdef SCALAR - e.invc = __v_log2_data.tab[i].invc; - e.log2c = __v_log2_data.tab[i].log2c; -#else - e.invc[0] = __v_log2_data.tab[i[0]].invc; - e.log2c[0] = __v_log2_data.tab[i[0]].log2c; - e.invc[1] = __v_log2_data.tab[i[1]].invc; - e.log2c[1] = __v_log2_data.tab[i[1]].log2c; -#endif - return e; -} - -VPCS_ATTR -NOINLINE static v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) -{ - return v_call_f64 (log2, x, y, cmp); -} - -/* Double-precision vector log2 routine. Implements the same algorithm as vector - log10, with coefficients and table entries scaled in extended precision. - The maximum observed error is 2.26 ULP, at roughly 0.84: - __v_log2(0x1.aee6cb4e12a19p-1) got -0x1.fd8348301747fp-3 - want -0x1.fd8348301747dp-3. */ -VPCS_ATTR -v_f64_t V_NAME (log2) (v_f64_t x) -{ - v_u64_t ix = v_as_u64_f64 (x); - v_u64_t top = ix >> 48; - v_u64_t special - = v_cond_u64 (top - v_u64 (0x0010) >= v_u64 (0x7ff0 - 0x0010)); - - /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. - The range is split into N subintervals. - The ith subinterval contains z and c is near its center. */ - v_u64_t tmp = ix - OFF; - v_u64_t i = (tmp >> (52 - V_LOG2_TABLE_BITS)) % N; - v_s64_t k = v_as_s64_u64 (tmp) >> 52; /* arithmetic shift. */ - v_u64_t iz = ix - (tmp & v_u64 (0xfffULL << 52)); - v_f64_t z = v_as_f64_u64 (iz); - struct entry e = lookup (i); - - /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */ - - v_f64_t r = v_fma_f64 (z, e.invc, v_f64 (-1.0)); - v_f64_t kd = v_to_f64_s64 (k); - v_f64_t w = v_fma_f64 (r, InvLn2, e.log2c); - - v_f64_t r2 = r * r; - v_f64_t p_45 = v_fma_f64 (P (5), r, P (4)); - v_f64_t p_23 = v_fma_f64 (P (3), r, P (2)); - v_f64_t p_01 = v_fma_f64 (P (1), r, P (0)); - v_f64_t y = v_fma_f64 (r2, p_45, p_23); - y = v_fma_f64 (r2, y, p_01); - y = v_fma_f64 (r2, y, kd + w); - - if (unlikely (v_any_u64 (special))) - return specialcase (x, y, special); - return y; -} -VPCS_ALIAS - -#endif diff --git a/pl/math/v_log2_3u.c b/pl/math/v_log2_3u.c new file mode 100644 index 0000000..d34d0e8 --- /dev/null +++ b/pl/math/v_log2_3u.c @@ -0,0 +1,89 @@ +/* + * Double-precision vector log2 function. + * + * Copyright (c) 2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "include/mathlib.h" +#include "v_math.h" +#if V_SUPPORTED + +#define InvLn2 v_f64 (0x1.71547652b82fep0) +#define N (1 << V_LOG2_TABLE_BITS) +#define OFF v_u64 (0x3fe6900900000000) +#define P(i) v_f64 (__v_log2_data.poly[i]) + +struct entry +{ + v_f64_t invc; + v_f64_t log2c; +}; + +static inline struct entry +lookup (v_u64_t i) +{ + struct entry e; +#ifdef SCALAR + e.invc = __v_log2_data.tab[i].invc; + e.log2c = __v_log2_data.tab[i].log2c; +#else + e.invc[0] = __v_log2_data.tab[i[0]].invc; + e.log2c[0] = __v_log2_data.tab[i[0]].log2c; + e.invc[1] = __v_log2_data.tab[i[1]].invc; + e.log2c[1] = __v_log2_data.tab[i[1]].log2c; +#endif + return e; +} + +VPCS_ATTR +NOINLINE static v_f64_t +specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) +{ + return v_call_f64 (log2, x, y, cmp); +} + +/* Double-precision vector log2 routine. Implements the same algorithm as vector + log10, with coefficients and table entries scaled in extended precision. + The maximum observed error is 2.59 ULP: + __v_log2(0x1.0b5572f05bc9dp+0) got 0x1.fffc917a7a52dp-5 + want 0x1.fffc917a7a53p-5. */ +VPCS_ATTR +v_f64_t V_NAME (log2) (v_f64_t x) +{ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t top = ix >> 48; + v_u64_t special + = v_cond_u64 (top - v_u64 (0x0010) >= v_u64 (0x7ff0 - 0x0010)); + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + v_u64_t tmp = ix - OFF; + v_u64_t i = (tmp >> (52 - V_LOG2_TABLE_BITS)) % N; + v_s64_t k = v_as_s64_u64 (tmp) >> 52; /* arithmetic shift. */ + v_u64_t iz = ix - (tmp & v_u64 (0xfffULL << 52)); + v_f64_t z = v_as_f64_u64 (iz); + struct entry e = lookup (i); + + /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */ + + v_f64_t r = v_fma_f64 (z, e.invc, v_f64 (-1.0)); + v_f64_t kd = v_to_f64_s64 (k); + v_f64_t w = v_fma_f64 (r, InvLn2, e.log2c); + + v_f64_t r2 = r * r; + v_f64_t p_45 = v_fma_f64 (P (5), r, P (4)); + v_f64_t p_23 = v_fma_f64 (P (3), r, P (2)); + v_f64_t p_01 = v_fma_f64 (P (1), r, P (0)); + v_f64_t y = v_fma_f64 (r2, p_45, p_23); + y = v_fma_f64 (r2, y, p_01); + y = v_fma_f64 (r2, y, kd + w); + + if (unlikely (v_any_u64 (special))) + return specialcase (x, y, special); + return y; +} +VPCS_ALIAS + +#endif diff --git a/pl/math/vn_atan_2u5.c b/pl/math/vn_atan_2u5.c new file mode 100644 index 0000000..22baab9 --- /dev/null +++ b/pl/math/vn_atan_2u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_atan. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_atan, _ZGVnN2v_atan) +#include "v_atan_2u5.c" +#endif diff --git a/pl/math/vn_atan_3u.c b/pl/math/vn_atan_3u.c deleted file mode 100644 index 93bd7cf..0000000 --- a/pl/math/vn_atan_3u.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_atan. - * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_atan, _ZGVnN2v_atan) -#include "v_atan_3u.c" -#endif diff --git a/pl/math/vn_log2_2u5.c b/pl/math/vn_log2_2u5.c deleted file mode 100644 index dba524e..0000000 --- a/pl/math/vn_log2_2u5.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_log2. 
- * - * Copyright (c) 2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_log2, _ZGVnN2v_log2) -#include "v_log2_2u5.c" -#endif diff --git a/pl/math/vn_log2_3u.c b/pl/math/vn_log2_3u.c new file mode 100644 index 0000000..d74f9ca --- /dev/null +++ b/pl/math/vn_log2_3u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_log2. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_log2, _ZGVnN2v_log2) +#include "v_log2_3u.c" +#endif -- cgit v1.2.3 From b56e103d1afaa625557f1f275fa2b6a0606f6456 Mon Sep 17 00:00:00 2001 From: Pierre Blanchard Date: Wed, 21 Sep 2022 10:27:18 +0100 Subject: pl/math: Add scalar tanf Fast implementation using reduction to [0, pi/4] using approximation of cotan to allow for early reciprocal computation. The maximum error is 3.293ulps and is observed for a large input equal to 0x1.c849eap+16, about 116809.91. --- pl/math/include/mathlib.h | 1 + pl/math/math_config.h | 9 ++ pl/math/tanf_3u3.c | 192 +++++++++++++++++++++++++++++++ pl/math/tanf_data.c | 48 ++++++++ pl/math/test/mathbench_funcs.h | 1 + pl/math/test/runulp.sh | 16 +++ pl/math/test/testcases/directed/tanf.tst | 25 ++++ pl/math/test/testcases/random/float.tst | 1 + pl/math/test/ulp_funcs.h | 1 + pl/math/tools/tanf.sollya | 88 ++++++++++++++ 10 files changed, 382 insertions(+) create mode 100644 pl/math/tanf_3u3.c create mode 100644 pl/math/tanf_data.c create mode 100644 pl/math/test/testcases/directed/tanf.tst create mode 100644 pl/math/tools/tanf.sollya diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index e2d36c6..52b02cf 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -15,6 +15,7 @@ float erfcf (float); float erff (float); float log10f (float); float log1pf (float); +float tanf (float); double asinh (double); double atan2 (double, double); diff --git a/pl/math/math_config.h b/pl/math/math_config.h index 9f902a7..d693743 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -467,6 +467,15 @@ extern const struct log1pf_data float coeffs[LOG1PF_NCOEFFS]; } __log1pf_data HIDDEN; +#define TANF_P_POLY_NCOEFFS 7 +/* cotan approach needs order 3 on [0, pi/4] to reach <3.5ulps. */ +#define TANF_Q_POLY_NCOEFFS 4 +extern const struct tanf_poly_data +{ + float poly_tan[TANF_P_POLY_NCOEFFS]; + float poly_cotan[TANF_Q_POLY_NCOEFFS]; +} __tanf_poly_data HIDDEN; + #define V_LOG2F_TABLE_BITS 4 #define V_LOG2F_POLY_ORDER 4 extern const struct v_log2f_data diff --git a/pl/math/tanf_3u3.c b/pl/math/tanf_3u3.c new file mode 100644 index 0000000..e6f899f --- /dev/null +++ b/pl/math/tanf_3u3.c @@ -0,0 +1,192 @@ +/* + * Single-precision scalar tan(x) function. + * + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "math_config.h" + +/* Useful constants. */ +#define NegPio2_1 (-0x1.921fb6p+0f) +#define NegPio2_2 (0x1.777a5cp-25f) +#define NegPio2_3 (0x1.ee59dap-50f) +/* Reduced from 0x1p20 to 0x1p17 to ensure 3.5ulps. */ +#define RangeVal (0x1p17f) +#define InvPio2 ((0x1.45f306p-1f)) +#define Shift (0x1.8p+23f) +#define AbsMask (0x7fffffff) +#define Pio4 (0x1.921fb6p-1) +/* 2PI * 2^-64. 
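   That is (pi/2) * 2^-62: the scale that converts the 2.62 fixed-point
   result of reduce_large below back into radians.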
*/ +#define Pio2p63 (0x1.921FB54442D18p-62) + +#define P __tanf_poly_data.poly_tan +#define Q __tanf_poly_data.poly_cotan + +static inline float +eval_P (float z) +{ + float z2 = z * z; + float y_10 = fmaf (z, P[1], P[0]); + float y_32 = fmaf (z, P[3], P[2]); + float y_54 = fmaf (z, P[5], P[4]); + float y_6_54 = fmaf (z2, P[6], y_54); + float y_32_10 = fmaf (z2, y_32, y_10); + float y = fmaf (z2, z2 * y_6_54, y_32_10); + return y; +} + +static inline float +eval_Q (float z) +{ + float z2 = z * z; + float y = fmaf (z2, fmaf (z, Q[3], Q[2]), fmaf (z, Q[1], Q[0])); + return y; +} + +/* Reduction of the input argument x using Cody-Waite approach, such that x = r + + n * pi/2 with r lives in [-pi/4, pi/4] and n is a signed integer. */ +static inline float +reduce (float x, int32_t *in) +{ + /* n = rint(x/(pi/2)). */ + float r = x; + float q = fmaf (InvPio2, r, Shift); + float n = q - Shift; + /* There is no rounding here, n is representable by a signed integer. */ + *in = (int32_t) n; + /* r = x - n * (pi/2) (range reduction into -pi/4 .. pi/4). */ + r = fmaf (NegPio2_1, n, r); + r = fmaf (NegPio2_2, n, r); + r = fmaf (NegPio2_3, n, r); + return r; +} + +/* Table with 4/PI to 192 bit precision. To avoid unaligned accesses + only 8 new bits are added per entry, making the table 4 times larger. */ +static const uint32_t __inv_pio4[24] + = {0x000000a2, 0x0000a2f9, 0x00a2f983, 0xa2f9836e, 0xf9836e4e, 0x836e4e44, + 0x6e4e4415, 0x4e441529, 0x441529fc, 0x1529fc27, 0x29fc2757, 0xfc2757d1, + 0x2757d1f5, 0x57d1f534, 0xd1f534dd, 0xf534ddc0, 0x34ddc0db, 0xddc0db62, + 0xc0db6295, 0xdb629599, 0x6295993c, 0x95993c43, 0x993c4390, 0x3c439041}; + +/* Reduce the range of XI to a multiple of PI/2 using fast integer arithmetic. + XI is a reinterpreted float and must be >= 2.0f (the sign bit is ignored). + Return the modulo between -PI/4 and PI/4 and store the quadrant in NP. + Reduction uses a table of 4/PI with 192 bits of precision. A 32x96->128 bit + multiply computes the exact 2.62-bit fixed-point modulo. Since the result + can have at most 29 leading zeros after the binary point, the double + precision result is accurate to 33 bits. */ +static inline double +reduce_large (uint32_t xi, int *np) +{ + const uint32_t *arr = &__inv_pio4[(xi >> 26) & 15]; + int shift = (xi >> 23) & 7; + uint64_t n, res0, res1, res2; + + xi = (xi & 0xffffff) | 0x800000; + xi <<= shift; + + res0 = xi * arr[0]; + res1 = (uint64_t) xi * arr[4]; + res2 = (uint64_t) xi * arr[8]; + res0 = (res2 >> 32) | (res0 << 32); + res0 += res1; + + n = (res0 + (1ULL << 61)) >> 62; + res0 -= n << 62; + double x = (int64_t) res0; + *np = n; + return x * Pio2p63; +} + +/* Top 12 bits of the float representation with the sign bit cleared. */ +static inline uint32_t +top12 (float x) +{ + return (asuint (x) >> 20); +} + +/* Fast single-precision tan implementation. + Maximum ULP error: 3.293ulps. + tanf(0x1.c849eap+16) got -0x1.fe8d98p-1 want -0x1.fe8d9ep-1. */ +float +tanf (float x) +{ + /* Get top words. */ + uint32_t ix = asuint (x); + uint32_t ia = ix & AbsMask; + uint32_t ia12 = ia >> 20; + + /* Dispatch between no reduction (small numbers), fast reduction and + slow large numbers reduction. The reduction step determines r float + (|r| < pi/4) and n signed integer such that x = r + n * pi/2. */ + int32_t n; + float r; + if (ia12 < top12 (Pio4)) + { + /* Optimize small values. */ + if (unlikely (ia12 < top12 (0x1p-12f))) + { + if (unlikely (ia12 < top12 (0x1p-126f))) + /* Force underflow for tiny x. 
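	   The product x * x is evaluated only for its side effect of
	   raising the underflow exception.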
*/ + force_eval_float (x * x); + return x; + } + + /* tan (x) ~= x + x^3 * P(x^2). */ + float x2 = x * x; + float y = eval_P (x2); + return fmaf (x2, x * y, x); + } + /* Similar to other trigonometric routines, fast inaccurate reduction is + performed for values of x from pi/4 up to RangeVal. In order to keep errors + below 3.5ulps, we set the value of RangeVal to 2^17. This might differ for + other trigonometric routines. Above this value more advanced but slower + reduction techniques need to be implemented to reach a similar accuracy. + */ + else if (ia12 < top12 (RangeVal)) + { + /* Fast inaccurate reduction. */ + r = reduce (x, &n); + } + else if (ia12 < 0x7f8) + { + /* Slow accurate reduction. */ + uint32_t sign = ix & ~AbsMask; + double dar = reduce_large (ia, &n); + float ar = (float) dar; + r = asfloat (asuint (ar) ^ sign); + } + else + { + /* tan(Inf or NaN) is NaN. */ + return __math_invalidf (x); + } + + /* If x lives in an interval where |tan(x)| + - is finite then use an approximation of tangent in the form + tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2). + - grows to infinity then use an approximation of cotangent in the form + cotan(z) ~ 1/z + z * Q(z^2), where the reciprocal can be computed early. + Using symmetries of tangent and the identity tan(r) = cotan(pi/2 - r), + we only need to change the sign of r to obtain tan(x) from cotan(r). + This 2-interval approach requires 2 different sets of coefficients P and + Q, where Q is a lower order polynomial than P. */ + + /* Determine if x lives in an interval where |tan(x)| grows to infinity. */ + uint32_t alt = (uint32_t) n & 1; + + /* Perform additional reduction if required. */ + float z = alt ? -r : r; + + /* Prepare backward transformation. */ + float z2 = r * r; + float offset = alt ? 1.0f / z : z; + float scale = alt ? z : z * z2; + + /* Evaluate polynomial approximation of tan or cotan. */ + float p = alt ? eval_Q (z2) : eval_P (z2); + + /* A unified way of assembling the result on both interval types. */ + return fmaf (scale, p, offset); +} diff --git a/pl/math/tanf_data.c b/pl/math/tanf_data.c new file mode 100644 index 0000000..386b911 --- /dev/null +++ b/pl/math/tanf_data.c @@ -0,0 +1,48 @@ +/* + * Data used in single-precision tan(x) function. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +const struct tanf_poly_data __tanf_poly_data = { +.poly_tan = { +/* Coefficients generated using: + remez(f(x) = (tan(sqrt(x)) - sqrt(x)) / (x * sqrt(x)), deg, [a;b], 1, 1e-16, [|dtype ...|]) + optimize each coefficient + optimize relative error + final prec : 23 bits + working prec : 128 bits + deg : 6 + a : 0x1p-126 + b : (pi) / 0x1p2 + dirty rel error : 0x1.df324p-26 + dirty abs error : 0x1.df3244p-26. */ +0x1.555558p-2, /* 0.3333334. */ +0x1.110e1cp-3, /* 0.1333277. */ +0x1.bb0e7p-5, /* 5.408403e-2. */ +0x1.5826c8p-6, /* 2.100534e-2. */ +0x1.8426a6p-7, /* 1.1845428e-2. */ +-0x1.7a5adcp-10, /* -1.4433095e-3. */ +0x1.5574dap-8, /* 5.210212e-3. */ +}, +.poly_cotan = { +/* Coefficients generated using: + fpminimax(f(x) = (0x1p0 / tan(sqrt(x)) - 0x1p0 / sqrt(x)) / sqrt(x), deg, [|dtype ...|], [a;b]) + optimize a single polynomial + optimize absolute error + final prec : 23 bits + working prec : 128 bits + deg : 3 + a : 0x1p-126 + b : (pi) / 0x1p2 + dirty rel error : 0x1.81298cp-25 + dirty abs error : 0x1.a8acf4p-25. */ +-0x1.55555p-2, /* -0.33333325. */ +-0x1.6c23e4p-6, /* -2.2225354e-2. */ +-0x1.12dbap-9, /* -2.0969994e-3. 
*/ +-0x1.05a1c2p-12, /* -2.495116e-4. */ +} +}; diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index b625555..e4b819b 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -15,6 +15,7 @@ F (log10f, 0.01, 11.1) F (log1pf, -0.9, 10.0) F (log2f, 0.01, 11.1) F (sinf, -3.1, 3.1) +F (tanf, -3.1, 3.1) D (asinh, -10.0, 10.0) D (atan, -10.0, 10.0) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 8a9a5ec..21712bf 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -123,6 +123,22 @@ t log1pf -0x1p-23 -0.001 50000 t log1pf -0.001 -1.0 50000 t log1pf -1.0 inf 5000 +L=2.80 +Ldir= +t tanf 0 0xffff0000 10000 +t tanf 0x1p-127 0x1p-14 50000 +t tanf -0x1p-127 -0x1p-14 50000 +t tanf 0x1p-14 0.7 50000 +t tanf -0x1p-14 -0.7 50000 +t tanf 0.7 1.5 50000 +t tanf -0.7 -1.5 50000 +t tanf 1.5 0x1p17 50000 +t tanf -1.5 -0x1p17 50000 +t tanf 0x1p17 0x1p54 50000 +t tanf -0x1p17 -0x1p54 50000 +t tanf 0x1p54 inf 50000 +t tanf -0x1p54 -inf 50000 + done # vector functions diff --git a/pl/math/test/testcases/directed/tanf.tst b/pl/math/test/testcases/directed/tanf.tst new file mode 100644 index 0000000..99aacc4 --- /dev/null +++ b/pl/math/test/testcases/directed/tanf.tst @@ -0,0 +1,25 @@ +; tanf.tst +; +; Copyright (c) 2022, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=tanf op1=7fc00001 result=7fc00001 errno=0 +func=tanf op1=ffc00001 result=7fc00001 errno=0 +func=tanf op1=7f800001 result=7fc00001 errno=0 status=i +func=tanf op1=ff800001 result=7fc00001 errno=0 status=i +func=tanf op1=7f800000 result=7fc00001 errno=EDOM status=i +func=tanf op1=ff800000 result=7fc00001 errno=EDOM status=i +func=tanf op1=00000000 result=00000000 errno=0 +func=tanf op1=80000000 result=80000000 errno=0 +; SDCOMP-26094: check tanf in the cases for which the range reducer +; returns values furthest beyond its nominal upper bound of pi/4. +func=tanf op1=46427f1b result=3f80396d.599 error=0 +func=tanf op1=4647e568 result=3f8039a6.c9f error=0 +func=tanf op1=46428bac result=3f803a03.148 error=0 +func=tanf op1=4647f1f9 result=3f803a3c.852 error=0 +func=tanf op1=4647fe8a result=3f803ad2.410 error=0 +func=tanf op1=45d8d7f1 result=bf800669.901 error=0 +func=tanf op1=45d371a4 result=bf800686.3cd error=0 +func=tanf op1=45ce0b57 result=bf8006a2.e9a error=0 +func=tanf op1=45d35882 result=bf80071b.bc4 error=0 +func=tanf op1=45cdf235 result=bf800738.693 error=0 diff --git a/pl/math/test/testcases/random/float.tst b/pl/math/test/testcases/random/float.tst index 468896b..68afbfb 100644 --- a/pl/math/test/testcases/random/float.tst +++ b/pl/math/test/testcases/random/float.tst @@ -5,3 +5,4 @@ test erff 10000 test log10f 10000 +test tanf 10000 diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 7ab804f..447d529 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -12,6 +12,7 @@ F1 (log10) F1 (log1p) D1 (asinh) D2 (atan2) +F1 (tan) D1 (erfc) D1 (log10) D1 (log1p) diff --git a/pl/math/tools/tanf.sollya b/pl/math/tools/tanf.sollya new file mode 100644 index 0000000..e8ff1e2 --- /dev/null +++ b/pl/math/tools/tanf.sollya @@ -0,0 +1,88 @@ +// polynomial for approximating single precision tan(x) +// +// Copyright (c) 2022, Arm Limited. 
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +dtype = single; + +mthd = 0; // approximate tan +deg = 6; // poly degree + +// // Uncomment for cotan +// mthd = 1; // approximate cotan +// deg = 3; // poly degree + +// interval bounds +a = 0x1.0p-126; +b = pi / 4; + +print("Print some useful constants"); +display = hexadecimal!; +if (dtype==double) then { prec = 53!; } +else if (dtype==single) then { prec = 23!; }; + +print("pi/4"); +pi/4; + +// Setup precisions (dislay and computation) +display = decimal!; +prec=128!; +save_prec=prec; + +// +// Select function to approximate with Sollya +// +if(mthd==0) then { + s = "x + x^3 * P(x^2)"; + g = tan(x); + F = proc(P) { return x + x^3 * P(x^2); }; + f = (g(sqrt(x))-sqrt(x))/(x*sqrt(x)); + init_poly = 0; + deg_init_poly = -1; // a value such that we actually start by building constant coefficient + // Display info + print("Approximate g(x) =", g, "as F(x)=", s, "."); + // Remez applied to minimise relative error + approx_remez = proc(func, poly, d) { + return remez(1 - poly / func, deg - d, [a;b], x^d/func(x), 1e-10); + }; + // Iteratively find optimal coeffs + poly = init_poly; + for i from deg_init_poly+1 to deg do { + p = roundcoefficients(approx_remez(f, poly, i), [|dtype ...|]); + poly = poly + x^i * coeff(p,0); + }; +} +else if (mthd==1) then { + s = "1/x + x * P(x^2)"; + g = 1 / tan(x); + F = proc(P) { return 1/x + x * P(x^2); }; + f = (g(sqrt(x))-1/sqrt(x))/(sqrt(x)); + init_poly = 0; + deg_init_poly = -1; // a value such that we actually start by building constant coefficient + // Display info + print("Approximate g(x) =", g, "as F(x)=", s, "."); + // Fpminimax used to minimise absolute error + approx_fpminimax = proc(func, poly, d) { + return fpminimax(func - poly / x^-(deg-d), 0, [|dtype|], [a;b], absolute, floating); + }; + // Optimise all coefficients at once + poly = fpminimax(f, [|0,...,deg|], [|dtype ...|], [a;b], absolute, floating); +}; + + +// +// Display coefficients in Sollya +// +display = hexadecimal!; +if (dtype==double) then { prec = 53!; } +else if (dtype==single) then { prec = 23!; }; +print("_coeffs :_ hex"); +for i from 0 to deg do coeff(poly, i); + +// Compute errors +display = hexadecimal!; +d_rel_err = dirtyinfnorm(1-F(poly)/g(x), [a;b]); +d_abs_err = dirtyinfnorm(g(x)-F(poly), [a;b]); +print("dirty rel error:", d_rel_err); +print("dirty abs error:", d_abs_err); +print("in [",a,b,"]"); -- cgit v1.2.3 From 52558f2f6ac3a22b2e0382f10caa87f10fb0b23d Mon Sep 17 00:00:00 2001 From: Pierre Blanchard Date: Wed, 21 Sep 2022 10:29:29 +0100 Subject: pl/math: Add Vector/Neon tanf The Neon implementation is accurate up to 3.2ulps. It relies on range reduction to [0, pi/4] and late reciprocal depending on quadrant, as opposed to the early reciprocal used in the scalar routine. 
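The quadrant handling with a late reciprocal can be sketched in scalar C as
below. A sketch only: tanf stands in for the polynomial evaluation on
[-pi/4, pi/4], the single-step reduction is the naive form of the Cody-Waite
reduction used by the routine, and the function name is invented.

#include <math.h>

static float
tanf_quadrant_sketch (float x)
{
  /* n = rint(x/(pi/2)); constants as in the scalar tanf above.  */
  float n = nearbyintf (x * 0x1.45f306p-1f);
  float r = x - n * 0x1.921fb6p+0f;
  int alt = (int) n & 1;
  /* For odd n, tan(x) = -cotan(r) = 1/tan(-r): negate the reduced argument
     and take one reciprocal after the polynomial, instead of computing the
     reciprocal early as the scalar cotan path does.  */
  float z = alt ? -r : r;
  float p = tanf (z);
  return alt ? 1.0f / p : p;
}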
--- pl/math/include/mathlib.h | 4 ++ pl/math/s_tanf_3u2.c | 6 +++ pl/math/test/mathbench_funcs.h | 5 +++ pl/math/test/runulp.sh | 16 +++++++ pl/math/test/ulp_funcs.h | 3 ++ pl/math/test/ulp_wrappers.h | 2 + pl/math/v_math.h | 12 +++++ pl/math/v_tanf_3u2.c | 99 ++++++++++++++++++++++++++++++++++++++++++ pl/math/vn_tanf_3u2.c | 12 +++++ 9 files changed, 159 insertions(+) create mode 100644 pl/math/s_tanf_3u2.c create mode 100644 pl/math/v_tanf_3u2.c create mode 100644 pl/math/vn_tanf_3u2.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 52b02cf..01ea958 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -31,6 +31,7 @@ float __s_erff (float); float __s_log10f (float); float __s_log1pf (float); float __s_log2f (float); +float __s_tanf (float); double __s_atan (double); double __s_atan2 (double, double); @@ -65,6 +66,7 @@ __f64x2_t __v_log10 (__f64x2_t); __f32x4_t __v_log1pf (__f32x4_t); __f32x4_t __v_log2f (__f32x4_t); __f64x2_t __v_log2 (__f64x2_t); +__f32x4_t __v_tanf (__f32x4_t); #if __GNUC__ >= 9 || __clang_major__ >= 8 #define __vpcs __attribute__((__aarch64_vector_pcs__)) @@ -84,6 +86,7 @@ __vpcs __f64x2_t __vn_log10 (__f64x2_t); __vpcs __f32x4_t __vn_log1pf (__f32x4_t); __vpcs __f32x4_t __vn_log2f (__f32x4_t); __vpcs __f64x2_t __vn_log2 (__f64x2_t); +__vpcs __f32x4_t __vn_tanf (__f32x4_t); /* Vector functions following the vector PCS using ABI names. */ __vpcs __f32x4_t _ZGVnN4v_asinhf (__f32x4_t); @@ -100,6 +103,7 @@ __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t); #endif diff --git a/pl/math/s_tanf_3u2.c b/pl/math/s_tanf_3u2.c new file mode 100644 index 0000000..a47a7c0 --- /dev/null +++ b/pl/math/s_tanf_3u2.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_tanf_3u2.c" diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index e4b819b..4a09e61 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -43,6 +43,7 @@ D (__s_log10, 0.01, 11.1) F (__s_log1pf, -0.9, 10.0) F (__s_log2f, 0.01, 11.1) D (__s_log2, 0.01, 11.1) +F (__s_tanf, -3.1, 3.1) #if __aarch64__ VF (__v_asinhf, -10.0, 10.0) VF (__v_atanf, -10.0, 10.0) @@ -58,6 +59,7 @@ VF (__v_log10f, 0.01, 11.1) VF (__v_log1pf, -0.9, 10.0) VF (__v_log2f, 0.01, 11.1) VD (__v_log2, 0.01, 11.1) +VF (__v_tanf, -3.1, 3.1) #ifdef __vpcs VNF (__vn_asinhf, -10.0, 10.0) VNF (_ZGVnN4v_asinhf, -10.0, 10.0) @@ -100,6 +102,9 @@ VNF (_ZGVnN4v_log2f, 0.01, 11.1) VND (__vn_log2, 0.01, 11.1) VND (_ZGVnN2v_log2, 0.01, 11.1) + +VNF (__vn_tanf, -3.1, 3.1) +VNF (_ZGVnN4v_tanf, -3.1, 3.1) #endif #endif #if WANT_SVE_MATH diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 21712bf..03895d1 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -278,6 +278,17 @@ range_log2=' 100 inf 50000 ' +range_tanf=' + -0.0 -0x1p126 100 + 0x1p-149 0x1p-126 4000 + 0x1p-126 0x1p-23 50000 + 0x1p-23 0.7 50000 + 0.7 1.5 50000 + 1.5 100 50000 + 100 0x1p17 50000 + 0x1p17 inf 50000 +' + range_sve_cosf=' 0 0xffff0000 10000 0x1p-4 0x1p4 500000 @@ -382,6 +393,7 @@ L_log1pf=1.53 L_asinhf=2.17 L_log2f=2.10 L_log2=2.09 +L_tanf=2.7 L_sve_cosf=1.57 L_sve_cos=1.61 @@ -471,6 +483,10 @@ log2f __s_log2f $runs log2f __v_log2f $runv log2f __vn_log2f $runvn log2f _ZGVnN4v_log2f $runvn +tanf __s_tanf $runs +tanf __v_tanf $runv +tanf __vn_tanf $runvn +tanf _ZGVnN4v_tanf $runvn if [ $WANT_SVE_MATH -eq 1 ]; then sve_cosf __sv_cosf $runsv diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 447d529..6048b5c 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -31,6 +31,7 @@ SD1 (log10) SF1 (log1p) SF1 (log2) SD1 (log2) +SF1 (tan) #if __aarch64__ VF1 (asinh) VF1 (atan) @@ -46,6 +47,7 @@ VD1 (log10) VF1 (log1p) VF1 (log2) VD1 (log2) +VF1 (tan) #ifdef __vpcs ZVNF1 (asinh) ZVNF1 (atan) @@ -61,6 +63,7 @@ ZVND1 (log10) ZVNF1 (log1p) ZVNF1 (log2) ZVND1 (log2) +ZVNF1 (tan) #endif #endif #if WANT_SVE_MATH diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 0ceef13..2ab1af2 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -62,6 +62,7 @@ VF1_WRAP(erfc) VF1_WRAP(log10) VF1_WRAP(log1p) VF1_WRAP(log2) +VF1_WRAP(tan) VD1_WRAP(atan) VD2_WRAP(atan2) VD1_WRAP(erf) @@ -77,6 +78,7 @@ ZVNF1_WRAP(erfc) ZVNF1_WRAP(log10) ZVNF1_WRAP(log1p) ZVNF1_WRAP(log2) +ZVNF1_WRAP(tan) ZVND1_WRAP(atan) ZVND2_WRAP(atan2) ZVND1_WRAP(erf) diff --git a/pl/math/v_math.h b/pl/math/v_math.h index ccdfd75..e98824f 100644 --- a/pl/math/v_math.h +++ b/pl/math/v_math.h @@ -178,6 +178,12 @@ v_cagt_f32 (v_f32_t x, v_f32_t y) { return fabsf (x) > fabsf (y); } +/* to wrap |x| >= |y|. */ +static inline v_u32_t +v_cage_f32 (v_f32_t x, v_f32_t y) +{ + return fabsf (x) >= fabsf (y); +} static inline v_u32_t v_calt_f32 (v_f32_t x, v_f32_t y) { @@ -523,6 +529,12 @@ v_cagt_f32 (v_f32_t x, v_f32_t y) { return vcagtq_f32 (x, y); } +/* to wrap |x| >= |y|. 
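   (vcageq_f32 does the absolute-value compare in a single instruction).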
 */
+static inline v_u32_t
+v_cage_f32 (v_f32_t x, v_f32_t y)
+{
+  return vcageq_f32 (x, y);
+}
 static inline v_u32_t
 v_calt_f32 (v_f32_t x, v_f32_t y)
 {
diff --git a/pl/math/v_tanf_3u2.c b/pl/math/v_tanf_3u2.c
new file mode 100644
index 0000000..a6d9dd1
--- /dev/null
+++ b/pl/math/v_tanf_3u2.c
@@ -0,0 +1,99 @@
+/*
+ * Single-precision vector tan(x) function.
+ *
+ * Copyright (c) 2021-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#if V_SUPPORTED
+
+/* Constants. */
+#define NegPio2_1 (v_f32 (-0x1.921fb6p+0f))
+#define NegPio2_2 (v_f32 (0x1.777a5cp-25f))
+#define NegPio2_3 (v_f32 (0x1.ee59dap-50f))
+#define InvPio2 (v_f32 (0x1.45f306p-1f))
+#define RangeVal (v_f32 (0x1p17f))
+#define Shift (v_f32 (0x1.8p+23f))
+
+#define poly(i) v_f32 (__tanf_poly_data.poly_tan[i])
+
+/* Special cases (fall back to scalar calls). */
+VPCS_ATTR
+NOINLINE static v_f32_t
+specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
+{
+  return v_call_f32 (tanf, x, y, cmp);
+}
+
+/* Use a full Estrin scheme to evaluate the polynomial. */
+static inline v_f32_t
+eval_poly (v_f32_t z)
+{
+  v_f32_t z2 = z * z;
+  v_f32_t z4 = z2 * z2;
+  v_f32_t y_10 = v_fma_f32 (z, poly (1), poly (0));
+  v_f32_t y_32 = v_fma_f32 (z, poly (3), poly (2));
+  v_f32_t y_54 = v_fma_f32 (z, poly (5), poly (4));
+  v_f32_t y_6_54 = v_fma_f32 (z2, poly (6), y_54);
+  v_f32_t y_32_10 = v_fma_f32 (z2, y_32, y_10);
+  v_f32_t y = v_fma_f32 (z4, y_6_54, y_32_10);
+  return y;
+}
+
+/* Fast implementation of Neon tanf.
+   Maximum measured error: 3.121 ULP:
+   vtanq_f32(0x1.ff3df8p+16) got -0x1.fbb7b8p-1
+                            want -0x1.fbb7b2p-1. */
+VPCS_ATTR
+v_f32_t V_NAME (tanf) (v_f32_t x)
+{
+  /* Determine whether input is too large to perform fast range reduction. */
+  v_u32_t cmp = v_cage_f32 (x, RangeVal);
+
+  /* n = rint(x/(pi/2)). */
+  v_f32_t q = v_fma_f32 (InvPio2, x, Shift);
+  v_f32_t n = q - Shift;
+  /* n is representable as a signed integer, simply convert it. */
+  v_s32_t in = v_round_s32 (n);
+  /* Determine if x lives in an interval where |tan(x)| grows to infinity. */
+  v_s32_t alt = in & 1;
+  v_u32_t pred_alt = (alt != 0);
+
+  /* r = x - n * (pi/2) (range reduction into -pi/4 .. pi/4). */
+  v_f32_t r;
+  r = v_fma_f32 (NegPio2_1, n, x);
+  r = v_fma_f32 (NegPio2_2, n, r);
+  r = v_fma_f32 (NegPio2_3, n, r);
+
+  /* If x lives in an interval where |tan(x)|
+     - is finite, then use a polynomial approximation of the form
+       tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2).
+     - grows to infinity, then use symmetries of tangent and the identity
+       tan(r) = cotan(pi/2 - r) to express tan(x) as 1/tan(-r). Finally, use
+       the same polynomial approximation of tan as above. */
+
+  /* Perform additional reduction if required. */
+  v_f32_t z = v_sel_f32 (pred_alt, -r, r);
+
+  /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4]. */
+  v_f32_t z2 = r * r;
+  v_f32_t p = eval_poly (z2);
+  v_f32_t y = v_fma_f32 (z * z2, p, z);
+
+  /* Compute reciprocal and apply if required. */
+  v_f32_t inv_y = v_div_f32 (v_f32 (1.0f), y);
+  y = v_sel_f32 (pred_alt, inv_y, y);
+
+  /* Fast reduction does not handle the x = -0.0 case well,
+     therefore it is fixed here. */
+  y = v_sel_f32 (x == v_f32 (-0.0), x, y);
+
+  /* No need to pass pg to specialcase here since cmp is a strict subset,
+     guaranteed by the cmpge above.
*/ + if (unlikely (v_any_u32 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS +#endif diff --git a/pl/math/vn_tanf_3u2.c b/pl/math/vn_tanf_3u2.c new file mode 100644 index 0000000..a086cc9 --- /dev/null +++ b/pl/math/vn_tanf_3u2.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_tanf. + * + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_tanf, _ZGVnN4v_tanf) +#include "v_tanf_3u2.c" +#endif -- cgit v1.2.3 From 1931794b71006f0c5137b89f237ec090a77197a0 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Wed, 21 Sep 2022 15:05:25 +0100 Subject: pl/math: Add vector/SVE expf Routine is an SVE port of the Neon algorithm from math/, and is accurate to 2 ULP. --- pl/math/include/mathlib.h | 2 + pl/math/math_config.h | 3 ++ pl/math/sv_expf_2u.c | 107 +++++++++++++++++++++++++++++++++++++++++ pl/math/sv_expf_data.c | 12 +++++ pl/math/test/mathbench_funcs.h | 3 ++ pl/math/test/runulp.sh | 14 ++++++ pl/math/test/ulp_funcs.h | 2 + pl/math/test/ulp_wrappers.h | 1 + 8 files changed, 144 insertions(+) create mode 100644 pl/math/sv_expf_2u.c create mode 100644 pl/math/sv_expf_data.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 01ea958..4594aa1 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -115,6 +115,7 @@ svfloat64_t __sv_atan_x (svfloat64_t, svbool_t); svfloat64_t __sv_atan2_x (svfloat64_t, svfloat64_t, svbool_t); svfloat32_t __sv_cosf_x (svfloat32_t, svbool_t); svfloat64_t __sv_cos_x (svfloat64_t, svbool_t); +svfloat32_t __sv_expf_x (svfloat32_t, svbool_t); svfloat32_t __sv_logf_x (svfloat32_t, svbool_t); svfloat64_t __sv_log_x (svfloat64_t, svbool_t); svfloat32_t __sv_log10f_x (svfloat32_t, svbool_t); @@ -128,6 +129,7 @@ svfloat64_t _ZGVsMxv_atan (svfloat64_t, svbool_t); svfloat64_t _ZGVsMxvv_atan2 (svfloat64_t, svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_cos (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_expf (svfloat32_t, svbool_t); svfloat32_t _ZGVsMxv_logf (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_log (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_log10f (svfloat32_t, svbool_t); diff --git a/pl/math/math_config.h b/pl/math/math_config.h index d693743..22ba9a9 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -532,4 +532,7 @@ extern const struct sv_log_data double poly[SV_LOG_POLY_ORDER - 1]; } __sv_log_data HIDDEN; +#define SV_EXPF_POLY_ORDER 6 +extern const float __sv_expf_poly[SV_EXPF_POLY_ORDER - 1] HIDDEN; + #endif diff --git a/pl/math/sv_expf_2u.c b/pl/math/sv_expf_2u.c new file mode 100644 index 0000000..4b20fee --- /dev/null +++ b/pl/math/sv_expf_2u.c @@ -0,0 +1,107 @@ +/* + * Single-precision vector e^x function. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#if SV_SUPPORTED + +#define C(i) __sv_expf_poly[i] + +#define Shift (0x1.8p23f) /* 1.5 * 2^23. */ +#define InvLn2 (0x1.715476p+0f) +#define Ln2hi (0x1.62e4p-1f) +#define Ln2lo (0x1.7f7d1cp-20f) +#define Thres (126.0f) + +/* Update of both special and non-special cases, if any special case is + detected. 
*/ +static inline sv_f32_t +specialcase (svbool_t pg, sv_f32_t poly, sv_f32_t n, sv_u32_t e, sv_f32_t absn, + svbool_t p_cmp1, sv_f32_t scale) +{ + /* s=2^(n/N) may overflow, break it up into s=s1*s2, + such that exp = s + s*y can be computed as s1*(s2+s2*y) + and s1*s1 overflows only if n>0. */ + + /* If n<=0 then set b to 0x820...0, 0 otherwise. */ + svbool_t p_sign = svcmple_n_f32 (pg, n, 0.0f); /* n <= 0. */ + sv_u32_t b + = svdup_n_u32_z (p_sign, 0x82000000); /* Inactive lanes set to 0. */ + + /* Set s1 to generate overflow depending on sign of exponent n. */ + sv_f32_t s1 + = sv_as_f32_u32 (svadd_n_u32_x (pg, b, 0x7f000000)); /* b + 0x7f000000. */ + /* Offset s to avoid overflow in final result if n is below threshold. */ + sv_f32_t s2 = sv_as_f32_u32 ( + svsub_u32_x (pg, e, b)); /* as_u32 (s) - 0x3010...0 + b. */ + + /* |n| > 192 => 2^(n/N) overflows. */ + svbool_t p_cmp2 = svcmpgt_n_f32 (pg, absn, 192.0f); + + sv_f32_t r2 = svmul_f32_x (pg, s1, s1); + sv_f32_t r1 = sv_fma_f32_x (pg, poly, s2, s2); + r1 = svmul_f32_x (pg, r1, s1); + sv_f32_t r0 = sv_fma_f32_x (pg, poly, scale, scale); + + /* Apply condition 1 then 2. + Returns r2 if cond2 is true, otherwise + if cond1 is true then return r1, otherwise return r0. */ + sv_f32_t r = svsel_f32 (p_cmp1, r1, r0); + + return svsel_f32 (p_cmp2, r2, r); +} + +/* SVE port of single-precision vector exp routine from math/. + Worst-case error is 1.95 ulp: + __sv_expf(-0x1.4cb74ap+2) got 0x1.6a022cp-8 + want 0x1.6a023p-8. */ +sv_f32_t +__sv_expf_x (sv_f32_t x, const svbool_t pg) +{ + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ + + /* n = round(x/(ln2/N)). */ + sv_f32_t z = sv_fma_n_f32_x (pg, InvLn2, x, sv_f32 (Shift)); + sv_f32_t n = svsub_n_f32_x (pg, z, Shift); + + /* r = x - n*ln2/N. */ + sv_f32_t r = sv_fma_n_f32_x (pg, -Ln2hi, n, x); + r = sv_fma_n_f32_x (pg, -Ln2lo, n, r); + + /* u << 23. */ + sv_u32_t e = svlsl_n_u32_x (pg, sv_as_u32_f32 (z), 23); + + /* s = 2^(n/N). */ + sv_f32_t scale = sv_as_f32_u32 (svadd_n_u32_x (pg, e, 0x3f800000)); + sv_f32_t absn = svabs_f32_x (pg, n); + + svbool_t is_special_case = svcmpgt_n_f32 (pg, absn, Thres); + + /* y = exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */ + sv_f32_t r2 = svmul_f32_x (pg, r, r); + sv_f32_t p = sv_fma_n_f32_x (pg, C (0), r, sv_f32 (C (1))); + sv_f32_t q = sv_fma_n_f32_x (pg, C (2), r, sv_f32 (C (3))); + q = sv_fma_f32_x (pg, p, r2, q); + p = svmul_n_f32_x (pg, r, C (4)); + sv_f32_t poly = sv_fma_f32_x (pg, q, r2, p); + + /* The special case uses s, y and n to produce the final result (normal cases + included). It performs an update of all lanes! Therefore: + - all previous computation need to be done on all lanes indicated by input + pg + - we cannot simply apply the special case to the special-case-activated + lanes. Besides it is likely that this would not increase performance (no + scatter/gather). */ + if (unlikely (svptest_any (pg, is_special_case))) + return specialcase (pg, poly, n, e, absn, is_special_case, scale); + + return sv_fma_f32_x (pg, poly, scale, scale); +} + +strong_alias (__sv_expf_x, _ZGVsMxv_expf) + +#endif // SV_SUPPORTED diff --git a/pl/math/sv_expf_data.c b/pl/math/sv_expf_data.c new file mode 100644 index 0000000..d0d85df --- /dev/null +++ b/pl/math/sv_expf_data.c @@ -0,0 +1,12 @@ +/* + * Coefficients for single-precision vector e^x function. + * + * Copyright (c) 2019-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Coefficients copied from the polynomial in math/v_expf.c. */
+const float __sv_expf_poly[] = {0x1.0e4020p-7f, 0x1.573e2ep-5f, 0x1.555e66p-3f,
+                                0x1.fffdb6p-2f, 0x1.ffffecp-1f};
diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h
index 4a09e61..ecb5647 100644
--- a/pl/math/test/mathbench_funcs.h
+++ b/pl/math/test/mathbench_funcs.h
@@ -118,6 +118,9 @@ SVD (_ZGVsMxv_atan, -3.1, 3.1)
 {"__sv_atan2", 'd', 'n', -10.0, 10.0, {.svd = __sv_atan2_wrap}},
 {"_ZGVsM2vv_atan2", 'd', 'n', -10.0, 10.0, {.svd = _Z_sv_atan2_wrap}},
 
+SVF (__sv_expf_x, -9.9, 9.9)
+SVF (_ZGVsMxv_expf, -9.9, 9.9)
+
 SVF (__sv_cosf_x, -3.1, 3.1)
 SVF (_ZGVsMxv_cosf, -3.1, 3.1)
 SVF (__sv_sinf_x, -3.1, 3.1)
diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh
index 03895d1..9bfeef0 100755
--- a/pl/math/test/runulp.sh
+++ b/pl/math/test/runulp.sh
@@ -377,6 +377,17 @@ range_sve_log='
 100 inf 50000
 '
 
+range_sve_expf='
+ 0 0x1p-23 40000
+ 0x1p-23 1 50000
+ 1 0x1p23 50000
+ 0x1p23 inf 50000
+ -0 -0x1p-23 40000
+ -0x1p-23 -1 50000
+ -1 -0x1p23 50000
+ -0x1p23 -inf 50000
+'
+
 # error limits
 L_erfc=3.11
 L_erfcf=0.26
@@ -408,6 +419,7 @@ L_sve_log10=1.97
 L_sve_log10f=2.82
 L_sve_logf=2.85
 L_sve_log=1.68
+L_sve_expf=1.46
 
 while read G F R
 do
@@ -501,6 +513,8 @@ sve_log10f __sv_log10f $runsv
 sve_log10f _ZGVsMxv_log10f $runsv
 sve_logf __sv_logf $runsv
 sve_logf _ZGVsMxv_logf $runsv
+sve_expf __sv_expf $runsv
+sve_expf _ZGVsMxv_expf $runsv
 
 sve_cos __sv_cos $runsv
 sve_cos _ZGVsMxv_cos $runsv
diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h
index 6048b5c..b073cc2 100644
--- a/pl/math/test/ulp_funcs.h
+++ b/pl/math/test/ulp_funcs.h
@@ -79,6 +79,8 @@ SVF1 (cos)
 ZSVF1 (cos)
 SVD1 (cos)
 ZSVD1 (cos)
+SVF1 (exp)
+ZSVF1 (exp)
 SVF1 (log)
 ZSVF1 (log)
 SVD1 (log)
diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h
index 2ab1af2..623f945 100644
--- a/pl/math/test/ulp_wrappers.h
+++ b/pl/math/test/ulp_wrappers.h
@@ -90,6 +90,7 @@ ZVND1_WRAP(log2)
 ZSVNF2_WRAP(atan2)
 ZSVNF1_WRAP(atan)
 ZSVNF1_WRAP(cos)
+ZSVNF1_WRAP(exp)
 ZSVNF1_WRAP(log)
 ZSVNF1_WRAP(log10)
 ZSVNF1_WRAP(sin)
-- 
cgit v1.2.3


From bf04306967d1ed5cd3904e84a8bdcf6d44c09a3c Mon Sep 17 00:00:00 2001
From: Joe Ramsay
Date: Tue, 6 Sep 2022 08:59:15 +0000
Subject: pl/math: Add more accurate SVE expf variant

New variant is based on the SVE FEXPA instruction, and improves
worst-case error from 1.95 ULP to 1.04 ULP. It is disabled by default.
---
 pl/math/math_config.h |  3 ++
 pl/math/sv_expf_2u.c  | 80 +++++++++++++++++++++++++++++++++++++--------------
 2 files changed, 61 insertions(+), 22 deletions(-)

diff --git a/pl/math/math_config.h b/pl/math/math_config.h
index 22ba9a9..9a17159 100644
--- a/pl/math/math_config.h
+++ b/pl/math/math_config.h
@@ -532,6 +532,9 @@ extern const struct sv_log_data
   double poly[SV_LOG_POLY_ORDER - 1];
 } __sv_log_data HIDDEN;
 
+#ifndef SV_EXPF_USE_FEXPA
+#define SV_EXPF_USE_FEXPA 0
+#endif
 #define SV_EXPF_POLY_ORDER 6
 extern const float __sv_expf_poly[SV_EXPF_POLY_ORDER - 1] HIDDEN;
 
diff --git a/pl/math/sv_expf_2u.c b/pl/math/sv_expf_2u.c
index 4b20fee..9ae9d60 100644
--- a/pl/math/sv_expf_2u.c
+++ b/pl/math/sv_expf_2u.c
@@ -10,16 +10,41 @@
 
 #define C(i) __sv_expf_poly[i]
 
-#define Shift (0x1.8p23f) /* 1.5 * 2^23. */
 #define InvLn2 (0x1.715476p+0f)
 #define Ln2hi (0x1.62e4p-1f)
 #define Ln2lo (0x1.7f7d1cp-20f)
+
+#if SV_EXPF_USE_FEXPA
+
+#define Shift (0x1.903f8p17f) /* 1.5*2^17 + 127.
*/ +#define Thres \ + (0x1.5d5e2ap+6f) /* Roughly 87.3. For x < -Thres, the result is subnormal \ + and not handled correctly by FEXPA. */ + +static NOINLINE sv_f32_t +special_case (sv_f32_t x, sv_f32_t y, svbool_t special) +{ + /* The special-case handler from the Neon routine does not handle subnormals + in a way that is compatible with FEXPA. For the FEXPA variant we just fall + back to scalar expf. */ + return sv_call_f32 (expf, x, y, special); +} + +#else + +#define Shift (0x1.8p23f) /* 1.5 * 2^23. */ #define Thres (126.0f) -/* Update of both special and non-special cases, if any special case is - detected. */ +/* Special-case handler adapted from Neon variant. Uses s, y and n to produce + the final result (normal cases included). It performs an update of all lanes! + Therefore: + - all previous computation need to be done on all lanes indicated by input + pg + - we cannot simply apply the special case to the special-case-activated + lanes. Besides it is likely that this would not increase performance (no + scatter/gather). */ static inline sv_f32_t -specialcase (svbool_t pg, sv_f32_t poly, sv_f32_t n, sv_u32_t e, sv_f32_t absn, +specialcase (svbool_t pg, sv_f32_t poly, sv_f32_t n, sv_u32_t e, svbool_t p_cmp1, sv_f32_t scale) { /* s=2^(n/N) may overflow, break it up into s=s1*s2, @@ -39,7 +64,7 @@ specialcase (svbool_t pg, sv_f32_t poly, sv_f32_t n, sv_u32_t e, sv_f32_t absn, svsub_u32_x (pg, e, b)); /* as_u32 (s) - 0x3010...0 + b. */ /* |n| > 192 => 2^(n/N) overflows. */ - svbool_t p_cmp2 = svcmpgt_n_f32 (pg, absn, 192.0f); + svbool_t p_cmp2 = svacgt_n_f32 (pg, n, 192.0f); sv_f32_t r2 = svmul_f32_x (pg, s1, s1); sv_f32_t r1 = sv_fma_f32_x (pg, poly, s2, s2); @@ -54,10 +79,20 @@ specialcase (svbool_t pg, sv_f32_t poly, sv_f32_t n, sv_u32_t e, sv_f32_t absn, return svsel_f32 (p_cmp2, r2, r); } -/* SVE port of single-precision vector exp routine from math/. - Worst-case error is 1.95 ulp: +#endif + +/* Optimised single-precision SVE exp function. By default this is an SVE port + of the Neon algorithm from math/. Alternatively, enable a modification of + that algorithm that looks up scale using SVE FEXPA instruction with + SV_EXPF_USE_FEXPA. + + Worst-case error of the default algorithm is 1.95 ulp: __sv_expf(-0x1.4cb74ap+2) got 0x1.6a022cp-8 - want 0x1.6a023p-8. */ + want 0x1.6a023p-8. + + Worst-case error when using FEXPA is 1.04 ulp: + __sv_expf(0x1.a8eda4p+1) got 0x1.ba74bcp+4 + want 0x1.ba74bap+4. */ sv_f32_t __sv_expf_x (sv_f32_t x, const svbool_t pg) { @@ -72,14 +107,17 @@ __sv_expf_x (sv_f32_t x, const svbool_t pg) sv_f32_t r = sv_fma_n_f32_x (pg, -Ln2hi, n, x); r = sv_fma_n_f32_x (pg, -Ln2lo, n, r); - /* u << 23. */ +/* scale = 2^(n/N). */ +#if SV_EXPF_USE_FEXPA + /* NaNs also need special handling with FEXPA. */ + svbool_t is_special_case + = svorr_b_z (pg, svacgt_n_f32 (pg, x, Thres), svcmpne_f32 (pg, x, x)); + sv_f32_t scale = svexpa_f32 (sv_as_u32_f32 (z)); +#else sv_u32_t e = svlsl_n_u32_x (pg, sv_as_u32_f32 (z), 23); - - /* s = 2^(n/N). */ + svbool_t is_special_case = svacgt_n_f32 (pg, n, Thres); sv_f32_t scale = sv_as_f32_u32 (svadd_n_u32_x (pg, e, 0x3f800000)); - sv_f32_t absn = svabs_f32_x (pg, n); - - svbool_t is_special_case = svcmpgt_n_f32 (pg, absn, Thres); +#endif /* y = exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. 
 */
+  sv_f32_t r2 = svmul_f32_x (pg, r, r);
@@ -89,15 +127,13 @@ __sv_expf_x (sv_f32_t x, const svbool_t pg)
   p = svmul_n_f32_x (pg, r, C (4));
   sv_f32_t poly = sv_fma_f32_x (pg, q, r2, p);
 
-  /* The special case uses s, y and n to produce the final result (normal cases
-     included). It performs an update of all lanes! Therefore:
-     - all previous computation need to be done on all lanes indicated by input
-       pg
-     - we cannot simply apply the special case to the special-case-activated
-       lanes. Besides it is likely that this would not increase performance (no
-       scatter/gather). */
   if (unlikely (svptest_any (pg, is_special_case)))
-    return specialcase (pg, poly, n, e, absn, is_special_case, scale);
+#if SV_EXPF_USE_FEXPA
+    return special_case (x, sv_fma_f32_x (pg, poly, scale, scale),
+                         is_special_case);
+#else
+    return specialcase (pg, poly, n, e, is_special_case, scale);
+#endif
 
   return sv_fma_f32_x (pg, poly, scale, scale);
 }
-- 
cgit v1.2.3


From 2ed10b59cde1d025140c9dc338bd1b387b72e3a4 Mon Sep 17 00:00:00 2001
From: Joe Ramsay
Date: Wed, 28 Sep 2022 13:32:39 +0100
Subject: pl/math: Add scalar acoshf

New routine uses AOR logf and log1pf. It is accurate to 2.8 ULP.
---
 pl/math/acoshf_2u8.c                       | 52 ++++++++++++++++++++++++++++++
 pl/math/include/mathlib.h                  |  1 +
 pl/math/test/mathbench_funcs.h             |  1 +
 pl/math/test/runulp.sh                     |  7 ++++
 pl/math/test/testcases/directed/acoshf.tst | 19 +++++++++++
 pl/math/test/ulp_funcs.h                   |  1 +
 6 files changed, 81 insertions(+)
 create mode 100644 pl/math/acoshf_2u8.c
 create mode 100644 pl/math/test/testcases/directed/acoshf.tst

diff --git a/pl/math/acoshf_2u8.c b/pl/math/acoshf_2u8.c
new file mode 100644
index 0000000..fb8d12d
--- /dev/null
+++ b/pl/math/acoshf_2u8.c
@@ -0,0 +1,52 @@
+/*
+ * Single-precision acosh(x) function.
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+#define Ln2 (0x1.62e4p-1f)
+#define MinusZero 0x80000000
+#define SquareLim 0x5f800000 /* asuint(0x1p64). */
+#define Two 0x40000000
+
+/* Single-precision log from math/. */
+float
+optr_aor_log_f32 (float);
+
+/* Single-precision log(1+x) from pl/math. */
+float
+log1pf (float);
+
+/* acoshf approximation using a variety of approaches on different intervals:
+
+   x >= 2^64: We cannot square x without overflow. For huge x, sqrt(x*x - 1) is
+   close enough to x that we can calculate the result by ln(2x) == ln(x) +
+   ln(2). The greatest error in the region is 0.94 ULP:
+   acoshf(0x1.15f706p+92) got 0x1.022e14p+6 want 0x1.022e16p+6.
+
+   x > 2: Calculate the result directly using the definition of
+   acosh(x) = ln(x + sqrt(x*x - 1)). Greatest error in this region is 1.30 ULP:
+   acoshf(0x1.249d8p+1) got 0x1.77e1aep+0 want 0x1.77e1bp+0.
+
+   0 <= x <= 2: Calculate the result using log1p. For x < 1, acosh(x) is
+   undefined. For 1 <= x <= 2, the greatest error is 2.78 ULP:
+   acoshf(0x1.07887p+0) got 0x1.ef9e9cp-3 want 0x1.ef9ea2p-3.
*/ +float +acoshf (float x) +{ + uint32_t ix = asuint (x); + + if (unlikely (ix >= MinusZero)) + return __math_invalidf (x); + + if (unlikely (ix >= SquareLim)) + return optr_aor_log_f32 (x) + Ln2; + + if (ix > Two) + return optr_aor_log_f32 (x + sqrtf (x * x - 1)); + + float xm1 = x - 1; + return log1pf (xm1 + sqrtf (2 * xm1 + xm1 * xm1)); +} diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 4594aa1..4c906b4 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -9,6 +9,7 @@ #ifndef _MATHLIB_H #define _MATHLIB_H +float acoshf (float); float asinhf (float); float atan2f (float, float); float erfcf (float); diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index ecb5647..134ae01 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -5,6 +5,7 @@ * Copyright (c) 2022, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +F (acoshf, 1.0, 10.0) F (asinhf, -10.0, 10.0) F (atanf, -10.0, 10.0) {"atan2f", 'f', 0, -10.0, 10.0, {.f = atan2f_wrap}}, diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 9bfeef0..857996e 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -139,6 +139,13 @@ t tanf -0x1p17 -0x1p54 50000 t tanf 0x1p54 inf 50000 t tanf -0x1p54 -inf 50000 +L=2.30 +t acoshf 0 1 100 +t acoshf 1 2 10000 +t acoshf 2 0x1p64 100000 +t acoshf 0x1p64 inf 100000 +t acoshf -0 -inf 10000 + done # vector functions diff --git a/pl/math/test/testcases/directed/acoshf.tst b/pl/math/test/testcases/directed/acoshf.tst new file mode 100644 index 0000000..ffa6208 --- /dev/null +++ b/pl/math/test/testcases/directed/acoshf.tst @@ -0,0 +1,19 @@ +; acoshf.tst +; +; Copyright 2009-2022, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=acoshf op1=7fc00001 result=7fc00001 errno=0 +func=acoshf op1=ffc00001 result=7fc00001 errno=0 +func=acoshf op1=7f800001 result=7fc00001 errno=0 status=i +func=acoshf op1=ff800001 result=7fc00001 errno=0 status=i +func=acoshf op1=7f800000 result=7f800000 errno=0 +func=acoshf op1=3f800000 result=00000000 errno=0 +func=acoshf op1=3f7fffff result=7fc00001 errno=EDOM status=i +func=acoshf op1=00000000 result=7fc00001 errno=EDOM status=i +func=acoshf op1=80000000 result=7fc00001 errno=EDOM status=i +func=acoshf op1=bf7fffff result=7fc00001 errno=EDOM status=i +func=acoshf op1=bf800000 result=7fc00001 errno=EDOM status=i +func=acoshf op1=bf800001 result=7fc00001 errno=EDOM status=i +func=acoshf op1=ff800000 result=7fc00001 errno=EDOM status=i +func=acoshf op1=7f767efe result=42b2c19d.83e error=0 diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index b073cc2..3aa9d3a 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -4,6 +4,7 @@ * Copyright (c) 2022, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +F1 (acosh) F1 (asinh) F2 (atan2) F1 (erfc) -- cgit v1.2.3 From 28fa116dbf12bd4cc36188d278e70980655030f9 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Fri, 30 Sep 2022 14:55:02 +0100 Subject: pl/math: Update worst-case error for Neon erfc Also update formatting of some comments. 
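
For reference, worst-case figures like the one quoted in this patch count units
in the last place between the routine's result and the nearest double to the
true result; the fractional part comes from measuring against a
higher-precision reference. A minimal integer ULP-distance check (an
illustrative sketch only; the helper name to_ordered is ours, and the project's
ulp tool in math/test is the real harness):

  #include <stdint.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>

  /* Map a double to an integer such that consecutive doubles map to
     consecutive integers. No NaN handling; both zeros map to 0. */
  static int64_t
  to_ordered (double x)
  {
    int64_t i;
    memcpy (&i, &x, sizeof i);
    return i >= 0 ? i : INT64_MIN - i;
  }

  int
  main (void)
  {
    /* Worst case quoted below for __v_erfc. */
    double got = 0x1.ff341c664edc5p-42;
    double want = 0x1.ff341c664edc9p-42;
    printf ("%lld representable doubles apart\n",
            (long long) llabs (to_ordered (got) - to_ordered (want)));
    return 0;
  }

This prints 4; the quoted 3.63 ULP is the distance to the true result, which
lies between want and the next representable values toward got.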
--- pl/math/test/runulp.sh | 2 +- pl/math/v_erfc_3u7.c | 16 +++++----------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 857996e..32fa32f 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -396,7 +396,7 @@ range_sve_expf=' ' # error limits -L_erfc=3.11 +L_erfc=3.14 L_erfcf=0.26 L_log10=1.97 L_log10f=2.81 diff --git a/pl/math/v_erfc_3u7.c b/pl/math/v_erfc_3u7.c index 4caa9f1..d65a9d7 100644 --- a/pl/math/v_erfc_3u7.c +++ b/pl/math/v_erfc_3u7.c @@ -87,12 +87,7 @@ v_eval_gauss (v_f64_t a) v_f64_t e2; v_f64_t a2 = a * a; - /* Dekker's algorithm. - tmp = a - Scale * a. - a_hi = high bits of a. - = Scale * a - tmp. - a_lo = low bits of a. - = a - a_hi. */ + /* TwoProduct (Dekker) applied to a * a. */ v_f64_t a_hi = -v_fma_f64 (Scale, a, -a); a_hi = v_fma_f64 (Scale, a, a_hi); v_f64_t a_lo = a - a_hi; @@ -108,10 +103,9 @@ v_eval_gauss (v_f64_t a) } /* Optimized double precision vector complementary error function erfc. - Max ULP: 3.7ulps. - Max measured: 3.610 on [5.1183, 5.1184] (at 0x1.47923afd09313p+2). - __v_erfc(0x1.47923afd09313p+2) got 0x1.ff487ddd86457p-42 want - 0x1.ff487ddd8645bp-42 -0.390493 ulp err -3.10951. */ + Maximum measured error is 3.63 ULP: + __v_erfc(0x1.479279a3bbc74p+2) got 0x1.ff341c664edc5p-42 + want 0x1.ff341c664edc9p-42. */ VPCS_ATTR v_f64_t V_NAME (erfc) (v_f64_t x) { @@ -128,7 +122,7 @@ v_f64_t V_NAME (erfc) (v_f64_t x) struct entry dat; /* All entries of the vector are out of bounds, take a short path. - Use smallest possible number above 28 representable in 12 bits. */ + Use smallest possible number above 28 representable in 12 bits. */ v_u64_t out_of_bounds = v_cond_u64 (atop >= v_u64 (0x404)); /* Use sign to produce either 0 if x > 0, 2 otherwise. */ -- cgit v1.2.3 From 34be6cf382be21e7f8c099bbd6d8ac5ccbf8ffbc Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Mon, 3 Oct 2022 09:49:56 +0100 Subject: pl/math: Add vector/SVE erff New routine is an SVE port of the Neon algorithm and is accurate to 1.3 ULP. 
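
Callers reach the new code through the SVE entry points declared in mathlib.h,
passing a governing predicate alongside the input vector. A sketch of a
minimal driver (assumptions: an SVE-capable toolchain, e.g. compiling with
-march=armv8-a+sve, and linking against a build of this library with SVE math
enabled):

  #include <arm_sve.h>
  #include <stdio.h>

  /* Declared in pl/math/include/mathlib.h by this patch. */
  svfloat32_t _ZGVsMxv_erff (svfloat32_t, svbool_t);

  int
  main (void)
  {
    float in[] = { -2.0f, -0.5f, 0.5f, 2.0f };
    float out[4];
    /* Predicate enabling exactly the four lanes used here. */
    svbool_t pg = svwhilelt_b32 (0, 4);
    svfloat32_t y = _ZGVsMxv_erff (svld1 (pg, in), pg);
    svst1 (pg, out, y);
    for (int i = 0; i < 4; i++)
      printf ("erff(%g) ~ %g\n", in[i], out[i]);
    return 0;
  }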
---
 pl/math/include/mathlib.h      |  2 +
 pl/math/sv_erff_1u3.c          | 91 ++++++++++++++++++++++++++++++++++++++++++
 pl/math/test/mathbench_funcs.h |  3 ++
 pl/math/test/runulp.sh         | 14 +++++++
 pl/math/test/ulp_funcs.h       |  2 +
 pl/math/test/ulp_wrappers.h    |  1 +
 6 files changed, 113 insertions(+)
 create mode 100644 pl/math/sv_erff_1u3.c

diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h
index 4c906b4..6979b96 100644
--- a/pl/math/include/mathlib.h
+++ b/pl/math/include/mathlib.h
@@ -116,6 +116,7 @@ svfloat64_t __sv_atan_x (svfloat64_t, svbool_t);
 svfloat64_t __sv_atan2_x (svfloat64_t, svfloat64_t, svbool_t);
 svfloat32_t __sv_cosf_x (svfloat32_t, svbool_t);
 svfloat64_t __sv_cos_x (svfloat64_t, svbool_t);
+svfloat32_t __sv_erff_x (svfloat32_t, svbool_t);
 svfloat32_t __sv_expf_x (svfloat32_t, svbool_t);
 svfloat32_t __sv_logf_x (svfloat32_t, svbool_t);
 svfloat64_t __sv_log_x (svfloat64_t, svbool_t);
@@ -130,6 +131,7 @@ svfloat64_t _ZGVsMxvv_atan2 (svfloat64_t, svfloat64_t, svbool_t);
 svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t);
 svfloat64_t _ZGVsMxv_cos (svfloat64_t, svbool_t);
+svfloat32_t _ZGVsMxv_erff (svfloat32_t, svbool_t);
 svfloat32_t _ZGVsMxv_expf (svfloat32_t, svbool_t);
 svfloat32_t _ZGVsMxv_logf (svfloat32_t, svbool_t);
 svfloat64_t _ZGVsMxv_log (svfloat64_t, svbool_t);
diff --git a/pl/math/sv_erff_1u3.c b/pl/math/sv_erff_1u3.c
new file mode 100644
index 0000000..f0af98e
--- /dev/null
+++ b/pl/math/sv_erff_1u3.c
@@ -0,0 +1,91 @@
+/*
+ * Single-precision vector erf(x) function.
+ *
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#if SV_SUPPORTED
+
+#define AbsMask (0x7fffffff)
+
+static NOINLINE sv_f32_t
+__sv_erff_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp)
+{
+  return sv_call_f32 (erff, x, y, cmp);
+}
+
+sv_f32_t __sv_expf_x (sv_f32_t, svbool_t);
+
+/* Optimized single precision vector erf. Worst-case error is 1.25 ULP:
+   __sv_erff(0x1.dc59fap-1) got 0x1.9f9c88p-1
+			   want 0x1.9f9c8ap-1. */
+sv_f32_t
+__sv_erff_x (sv_f32_t x, const svbool_t pg)
+{
+  sv_u32_t ix = sv_as_u32_f32 (x);
+  sv_u32_t atop = svand_n_u32_x (pg, svlsr_n_u32_x (pg, ix, 16), 0x7fff);
+  /* Handle both inf/nan as well as small values (|x|<2^-28). */
+  svbool_t cmp
+    = svcmpge_n_u32 (pg, svsub_n_u32_x (pg, atop, 0x3180), 0x7ff0 - 0x3180);
+
+  sv_u32_t sign = svand_n_u32_x (pg, ix, ~AbsMask);
+  /* |x| < 0.921875. */
+  svbool_t red = svaclt_n_f32 (pg, x, 0.921875f);
+  /* |x| > 4.0. */
+  svbool_t bor = svacgt_n_f32 (pg, x, 4.0f);
+
+  /* Load polynomial coefficients. */
+  sv_u32_t idx_lo = svsel (red, sv_u32 (0), sv_u32 (1));
+  sv_u32_t idx_hi = svadd_n_u32_x (pg, idx_lo, 2);
+
+  const float *base = (float *) __v_erff_data.coeffs;
+  sv_f32_t c_2_5 = svld1rq (svptrue_b32 (), base + 2);
+  sv_f32_t c_6_9 = svld1rq (svptrue_b32 (), base + 6);
+  sv_f32_t c_10_13 = svld1rq (svptrue_b32 (), base + 10);
+
+  /* Do not need to store elem 0 of __v_erff_data as it is not used. */
+  sv_f32_t p1 = svtbl (c_2_5, idx_lo);
+  sv_f32_t p2 = svtbl (c_2_5, idx_hi);
+  sv_f32_t p3 = svtbl (c_6_9, idx_lo);
+  sv_f32_t p4 = svtbl (c_6_9, idx_hi);
+  sv_f32_t p5 = svtbl (c_10_13, idx_lo);
+  sv_f32_t p6 = svtbl (c_10_13, idx_hi);
+
+  sv_f32_t a = svabs_f32_x (pg, x);
+  /* Square with merging mul - z is x^2 for reduced, |x| otherwise. */
+  sv_f32_t z = svmul_f32_m (red, a, a);
+
+  /* Evaluate polynomial on |x| or x^2.
 */
+  sv_f32_t r = sv_fma_f32_x (pg, z, p6, p5);
+  r = sv_fma_f32_x (pg, z, r, p4);
+  r = sv_fma_f32_x (pg, z, r, p3);
+  r = sv_fma_f32_x (pg, z, r, p2);
+  r = sv_fma_f32_x (pg, z, r, p1);
+  /* Use merging svmad for last operation - apply first coefficient if not
+     reduced, otherwise r is propagated unchanged. This is because the reduced
+     polynomial has lower order than the non-reduced. */
+  r = svmad_n_f32_m (svnot_b_z (pg, red), r, z, base[1]);
+  r = sv_fma_f32_x (pg, a, r, a);
+
+  /* y = |x| + |x| * P(x^2)               if |x| < 0.921875
+     y = 1 - exp (-(|x| + |x| * P(|x|)))  otherwise. */
+  sv_f32_t y = __sv_expf_x (svneg_f32_x (pg, r), pg);
+  y = svsel_f32 (red, r, svsubr_n_f32_x (pg, y, 1.0));
+
+  /* Boring domain (absolute value is required to get the sign of erf(-nan)
+     right). */
+  y = svsel_f32 (bor, sv_f32 (1.0f), svabs_f32_x (pg, y));
+
+  /* y = erf(x) if x>0, -erf(-x) otherwise. */
+  y = sv_as_f32_u32 (sveor_u32_x (pg, sv_as_u32_f32 (y), sign));
+
+  if (unlikely (svptest_any (pg, cmp)))
+    return __sv_erff_specialcase (x, y, cmp);
+  return y;
+}
+
+strong_alias (__sv_erff_x, _ZGVsMxv_erff)
+
+#endif
diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h
index 134ae01..dad6ad6 100644
--- a/pl/math/test/mathbench_funcs.h
+++ b/pl/math/test/mathbench_funcs.h
@@ -119,6 +119,9 @@ SVD (_ZGVsMxv_atan, -3.1, 3.1)
 {"__sv_atan2", 'd', 'n', -10.0, 10.0, {.svd = __sv_atan2_wrap}},
 {"_ZGVsM2vv_atan2", 'd', 'n', -10.0, 10.0, {.svd = _Z_sv_atan2_wrap}},
 
+SVF (__sv_erff_x, -4.0, 4.0)
+SVF (_ZGVsMxv_erff, -4.0, 4.0)
+
 SVF (__sv_expf_x, -9.9, 9.9)
 SVF (_ZGVsMxv_expf, -9.9, 9.9)
 
diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh
index 32fa32f..caa49d7 100755
--- a/pl/math/test/runulp.sh
+++ b/pl/math/test/runulp.sh
@@ -395,6 +395,17 @@ range_sve_expf='
 -0x1p23 -inf 50000
 '
 
+range_sve_erff='
+ 0 0x1p-28 20000
+ 0x1p-28 1 60000
+ 1 0x1p28 60000
+ 0x1p28 inf 20000
+ -0 -0x1p-28 20000
+ -0x1p-28 -1 60000
+ -1 -0x1p28 60000
+ -0x1p28 -inf 20000
+'
+
 # error limits
 L_erfc=3.14
 L_erfcf=0.26
@@ -427,6 +438,7 @@ L_sve_log10=1.97
 L_sve_log10f=2.82
 L_sve_logf=2.85
 L_sve_log=1.68
 L_sve_expf=1.46
+L_sve_erff=0.76
 
 while read G F R
 do
@@ -522,6 +534,8 @@ sve_log10f __sv_log10f $runsv
 sve_log10f _ZGVsMxv_log10f $runsv
 sve_logf __sv_logf $runsv
 sve_logf _ZGVsMxv_logf $runsv
 sve_expf __sv_expf $runsv
 sve_expf _ZGVsMxv_expf $runsv
+sve_erff __sv_erff $runsv
+sve_erff _ZGVsMxv_erff $runsv
 
 sve_cos __sv_cos $runsv
 sve_cos _ZGVsMxv_cos $runsv
diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h
index 3aa9d3a..452950f 100644
--- a/pl/math/test/ulp_funcs.h
+++ b/pl/math/test/ulp_funcs.h
@@ -80,6 +80,8 @@ SVF1 (cos)
 ZSVF1 (cos)
 SVD1 (cos)
 ZSVD1 (cos)
+SVF1 (erf)
+ZSVF1 (erf)
 SVF1 (exp)
 ZSVF1 (exp)
 SVF1 (log)
diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h
index 623f945..16647e4 100644
--- a/pl/math/test/ulp_wrappers.h
+++ b/pl/math/test/ulp_wrappers.h
@@ -90,6 +90,7 @@ ZVND1_WRAP(log2)
 ZSVNF2_WRAP(atan2)
 ZSVNF1_WRAP(atan)
 ZSVNF1_WRAP(cos)
+ZSVNF1_WRAP(erf)
 ZSVNF1_WRAP(exp)
 ZSVNF1_WRAP(log)
 ZSVNF1_WRAP(log10)
 ZSVNF1_WRAP(sin)
-- 
cgit v1.2.3


From 075fa2ecae86588cfdde41b008d5d0be70b9b863 Mon Sep 17 00:00:00 2001
From: Joe Ramsay
Date: Mon, 3 Oct 2022 11:49:29 +0100
Subject: pl/math: Update worst-case error for scalar asinh

A larger error was found, threshold updated in runulp.
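
The new worst case sits in the branch of asinh_2u5.c that evaluates the
defining identity directly. A plain-C cross-check of that identity only (a
naive sketch; asinh_ref is our name, not the library routine, and it is only
adequate at moderate |x|, which is exactly why the routine splits its domain):

  #include <math.h>
  #include <stdio.h>

  /* asinh via its defining identity, asinh(x) = ln(x + sqrt(x*x + 1)). */
  static double
  asinh_ref (double x)
  {
    return log (x + sqrt (x * x + 1.0));
  }

  int
  main (void)
  {
    double x = -0x1.00094e0f39574p+0; /* New worst case quoted below. */
    printf ("ref  = %a\n", asinh_ref (x));
    printf ("want = %a\n", -0x1.c3508eb6a682p-1);
    return 0;
  }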
--- pl/math/asinh_2u5.c | 4 ++-- pl/math/test/runulp.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pl/math/asinh_2u5.c b/pl/math/asinh_2u5.c index 293626d..f22b342 100644 --- a/pl/math/asinh_2u5.c +++ b/pl/math/asinh_2u5.c @@ -60,8 +60,8 @@ eval_poly (double z) |x| < 2^511: Upper bound of this region is close to sqrt(DBL_MAX). Calculate the result directly using the definition asinh(x) = ln(x + sqrt(x*x + 1)). The largest observed error in this region is 2.03 ULPs: - asinh(0x1.00441cdce7fd5p+0) got 0x1.c3a3b32255bf9p-1 - want 0x1.c3a3b32255bfbp-1. + asinh(-0x1.00094e0f39574p+0) got -0x1.c3508eb6a681ep-1 + want -0x1.c3508eb6a682p-1. |x| >= 2^511: We cannot square x without overflow at a low cost. At very large x, asinh(x) ~= ln(2x). At huge x we cannot diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index caa49d7..f0ff787 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -94,7 +94,7 @@ t asinhf 0x1p-12 1.0 50000 t asinhf 1.0 0x1p11 50000 t asinhf 0x1p11 0x1p127 20000 -L=1.51 +L=1.54 t asinh -0x1p-26 0x1p-26 50000 t asinh 0x1p-26 1.0 40000 t asinh -0x1p-26 -1.0 10000 -- cgit v1.2.3 From 3ef86d7f7b4a43a528dd5b0477950ac54ca1aff0 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Mon, 3 Oct 2022 11:49:41 +0100 Subject: pl/math: Add vector/SVE erf New routine is based on the Neon routine and is accurate to 2.5 ULP. --- pl/math/include/mathlib.h | 2 + pl/math/sv_erf_2u5.c | 92 ++++++++++++++++++++++++++++++++++++++++++ pl/math/test/mathbench_funcs.h | 2 + pl/math/test/runulp.sh | 14 +++++++ pl/math/test/ulp_funcs.h | 2 + pl/math/test/ulp_wrappers.h | 1 + 6 files changed, 113 insertions(+) create mode 100644 pl/math/sv_erf_2u5.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 6979b96..b45721b 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -117,6 +117,7 @@ svfloat64_t __sv_atan2_x (svfloat64_t, svfloat64_t, svbool_t); svfloat32_t __sv_cosf_x (svfloat32_t, svbool_t); svfloat64_t __sv_cos_x (svfloat64_t, svbool_t); svfloat32_t __sv_erff_x (svfloat32_t, svbool_t); +svfloat64_t __sv_erf_x (svfloat64_t, svbool_t); svfloat32_t __sv_expf_x (svfloat32_t, svbool_t); svfloat32_t __sv_logf_x (svfloat32_t, svbool_t); svfloat64_t __sv_log_x (svfloat64_t, svbool_t); @@ -132,6 +133,7 @@ svfloat64_t _ZGVsMxvv_atan2 (svfloat64_t, svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_cos (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_erff (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_erf (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_expf (svfloat32_t, svbool_t); svfloat32_t _ZGVsMxv_logf (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_log (svfloat64_t, svbool_t); diff --git a/pl/math/sv_erf_2u5.c b/pl/math/sv_erf_2u5.c new file mode 100644 index 0000000..a68b9e3 --- /dev/null +++ b/pl/math/sv_erf_2u5.c @@ -0,0 +1,92 @@ +/* + * Double-precision SVE erf(x) function. + * + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#if SV_SUPPORTED + +#define Scale (8.0f) +#define AbsMask (0x7fffffffffffffff) + +static NOINLINE sv_f64_t +__sv_erf_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp) +{ + return sv_call_f64 (erf, x, y, cmp); +} + +/* Optimized double precision SVE error function erf. Maximum + observed error is 2.46 ULP: + __sv_erf(0x1.5644782ddd668p+2) got 0x1.ffffffffffeap-1 + want 0x1.ffffffffffe9ep-1. 
*/ +sv_f64_t +__sv_erf_x (sv_f64_t x, const svbool_t pg) +{ + /* Use top 16 bits to test for special cases and small values. */ + sv_u64_t ix = sv_as_u64_f64 (x); + sv_u64_t atop = svand_n_u64_x (pg, svlsr_n_u64_x (pg, ix, 48), 0x7fff); + + /* Handle both inf/nan as well as small values (|x|<2^-28). */ + svbool_t cmp + = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, atop, 0x3e30), 0x7ff0 - 0x3e30); + + /* Get sign and absolute value. */ + sv_f64_t a = svabs_f64_x (pg, x); + sv_u64_t sign = svand_n_u64_x (pg, ix, ~AbsMask); + + /* i = trunc(Scale*x). */ + sv_u64_t i = svcvt_u64_f64_x (pg, svmul_n_f64_x (pg, a, Scale)); + /* Saturate index of intervals. */ + i = svmin_u64_x (pg, i, sv_u64 (V_ERF_NINTS)); + + /* Load polynomial coefficients. */ + sv_f64_t P_0 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[0], i); + sv_f64_t P_1 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[1], i); + sv_f64_t P_2 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[2], i); + sv_f64_t P_3 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[3], i); + sv_f64_t P_4 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[4], i); + sv_f64_t P_5 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[5], i); + sv_f64_t P_6 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[6], i); + sv_f64_t P_7 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[7], i); + sv_f64_t P_8 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[8], i); + sv_f64_t P_9 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[9], i); + + /* Get shift and scale. */ + sv_f64_t shift = sv_lookup_f64_x (pg, __v_erf_data.shifts, i); + + /* Transform polynomial variable. */ + sv_f64_t z = sv_fma_n_f64_x (pg, Scale, a, shift); + + /* Evaluate polynomial P(z) using level-2 Estrin. */ + sv_f64_t r1 = sv_fma_f64_x (pg, z, P_1, P_0); + sv_f64_t r2 = sv_fma_f64_x (pg, z, P_3, P_2); + sv_f64_t r3 = sv_fma_f64_x (pg, z, P_5, P_4); + sv_f64_t r4 = sv_fma_f64_x (pg, z, P_7, P_6); + sv_f64_t r5 = sv_fma_f64_x (pg, z, P_9, P_8); + + sv_f64_t z2 = svmul_f64_x (pg, z, z); + sv_f64_t z4 = svmul_f64_x (pg, z2, z2); + + sv_f64_t q2 = sv_fma_f64_x (pg, r4, z2, r3); + sv_f64_t q1 = sv_fma_f64_x (pg, r2, z2, r1); + + sv_f64_t y = sv_fma_f64_x (pg, z4, r5, q2); + y = sv_fma_f64_x (pg, z4, y, q1); + + /* Saturate y. This works because using the last interval on the boring domain + produces y > 1. */ + y = svmin_n_f64_x (pg, y, 1.0); + + /* y = erf(x) if x > 0, -erf(-x) otherwise. 
*/ + y = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (y), sign)); + + if (unlikely (svptest_any (pg, cmp))) + return __sv_erf_specialcase (x, y, cmp); + return y; +} + +strong_alias (__sv_erf_x, _ZGVsMxv_erf) + +#endif diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index dad6ad6..6b68c4a 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -121,6 +121,8 @@ SVD (_ZGVsMxv_atan, -3.1, 3.1) SVF (__sv_erff_x, -4.0, 4.0) SVF (_ZGVsMxv_erff, -4.0, 4.0) +SVD (__sv_erf_x, -4.0, 4.0) +SVD (_ZGVsMxv_erf, -4.0, 4.0) SVF (__sv_expf_x, -9.9, 9.9) SVF (_ZGVsMxv_expf, -9.9, 9.9) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index f0ff787..26349fd 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -406,6 +406,17 @@ range_sve_erff=' -0x1p28 -inf 20000 ' +range_sve_erf=' + 0 0x1p-28 20000 + 0x1p-28 1 60000 + 1 0x1p28 60000 + 0x1p28 inf 20000 + -0 -0x1p-28 20000 + -0x1p-28 -1 60000 + -1 -0x1p28 60000 + -0x1p28 -inf 20000 +' + # error limits L_erfc=3.14 L_erfcf=0.26 @@ -439,6 +450,7 @@ L_sve_logf=2.85 L_sve_log=1.68 L_sve_expf=1.46 L_sve_erff=0.76 +L_sve_erf=1.97 while read G F R do @@ -549,6 +561,8 @@ sve_log10 __sv_log10 $runsv sve_log10 _ZGVsMxv_log10 $runsv sve_log __sv_log $runsv sve_log _ZGVsMxv_log $runsv +sve_erf __sv_erf $runsv +sve_erf _ZGVsMxv_erf $runsv fi EOF diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 452950f..37ddc6d 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -82,6 +82,8 @@ SVD1 (cos) ZSVD1 (cos) SVF1 (erf) ZSVF1 (erf) +SVD1 (erf) +ZSVD1 (erf) SVF1 (exp) ZSVF1 (exp) SVF1 (log) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 16647e4..4ff8e8a 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -99,6 +99,7 @@ ZSVNF1_WRAP(sin) ZSVND2_WRAP(atan2) ZSVND1_WRAP(atan) ZSVND1_WRAP(cos) +ZSVND1_WRAP(erf) ZSVND1_WRAP(log) ZSVND1_WRAP(log10) ZSVND1_WRAP(sin) -- cgit v1.2.3 From 1de88de352067e5f8af82a8df00ed76aae7c7e54 Mon Sep 17 00:00:00 2001 From: Pierre Blanchard Date: Wed, 28 Sep 2022 08:49:47 +0000 Subject: pl/math: Add Vector/SVE tanf New SVE routine is an SVE port of the Neon algorithm and is accurate to 3.2 ULP. --- pl/math/include/mathlib.h | 2 + pl/math/sv_math.h | 6 +++ pl/math/sv_tanf_3u2.c | 101 +++++++++++++++++++++++++++++++++++++++++ pl/math/test/mathbench_funcs.h | 3 ++ pl/math/test/runulp.sh | 14 ++++++ pl/math/test/ulp_funcs.h | 2 + pl/math/test/ulp_wrappers.h | 1 + 7 files changed, 129 insertions(+) create mode 100644 pl/math/sv_tanf_3u2.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index b45721b..b53847f 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -125,6 +125,7 @@ svfloat32_t __sv_log10f_x (svfloat32_t, svbool_t); svfloat64_t __sv_log10_x (svfloat64_t, svbool_t); svfloat32_t __sv_sinf_x (svfloat32_t, svbool_t); svfloat64_t __sv_sin_x (svfloat64_t, svbool_t); +svfloat32_t __sv_tanf_x (svfloat32_t, svbool_t); /* SVE ABI names. 
 */
 svfloat32_t _ZGVsMxvv_atan2f (svfloat32_t, svfloat32_t, svbool_t);
 svfloat32_t _ZGVsMxv_atanf (svfloat32_t, svbool_t);
@@ -141,6 +142,7 @@ svfloat32_t _ZGVsMxv_log10f (svfloat32_t, svbool_t);
 svfloat64_t _ZGVsMxv_log10 (svfloat64_t, svbool_t);
 svfloat32_t _ZGVsMxv_sinf (svfloat32_t, svbool_t);
 svfloat64_t _ZGVsMxv_sin (svfloat64_t, svbool_t);
+svfloat32_t _ZGVsMxv_tanf (svfloat32_t, svbool_t);
 #endif
 #endif
diff --git a/pl/math/sv_math.h b/pl/math/sv_math.h
index 3b318f1..b574f10 100644
--- a/pl/math/sv_math.h
+++ b/pl/math/sv_math.h
@@ -195,6 +195,12 @@ sv_to_f32_s32_x (svbool_t pg, sv_s32_t s)
   return svcvt_f32_x (pg, s);
 }
 
+static inline sv_s32_t
+sv_to_s32_f32_x (svbool_t pg, sv_f32_t x)
+{
+  return svcvt_s32_f32_x (pg, x);
+}
+
 static inline sv_f32_t
 sv_call_f32 (f32_t (*f) (f32_t), sv_f32_t x, sv_f32_t y, svbool_t cmp)
 {
diff --git a/pl/math/sv_tanf_3u2.c b/pl/math/sv_tanf_3u2.c
new file mode 100644
index 0000000..e1d3757
--- /dev/null
+++ b/pl/math/sv_tanf_3u2.c
@@ -0,0 +1,101 @@
+/*
+ * Single-precision vector tan(x) function.
+ *
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#if SV_SUPPORTED
+
+/* Constants. */
+#define NegPio2_1 (sv_f32 (-0x1.921fb6p+0f))
+#define NegPio2_2 (sv_f32 (0x1.777a5cp-25f))
+#define NegPio2_3 (sv_f32 (0x1.ee59dap-50f))
+#define InvPio2 (sv_f32 (0x1.45f306p-1f))
+#define RangeVal (sv_f32 (0x1p17f))
+#define Shift (sv_f32 (0x1.8p+23f))
+
+#define poly(i) sv_f32 (__tanf_poly_data.poly_tan[i])
+
+/* Use a full Estrin scheme to evaluate the polynomial. */
+static inline sv_f32_t
+eval_poly (svbool_t pg, sv_f32_t z)
+{
+  sv_f32_t z2 = svmul_f32_x (pg, z, z);
+  sv_f32_t z4 = svmul_f32_x (pg, z2, z2);
+  sv_f32_t y_10 = sv_fma_f32_x (pg, z, poly (1), poly (0));
+  sv_f32_t y_32 = sv_fma_f32_x (pg, z, poly (3), poly (2));
+  sv_f32_t y_54 = sv_fma_f32_x (pg, z, poly (5), poly (4));
+  sv_f32_t y_6_54 = sv_fma_f32_x (pg, z2, poly (6), y_54);
+  sv_f32_t y_32_10 = sv_fma_f32_x (pg, z2, y_32, y_10);
+  sv_f32_t y = sv_fma_f32_x (pg, z4, y_6_54, y_32_10);
+  return y;
+}
+
+static NOINLINE sv_f32_t
+__sv_tanf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp)
+{
+  return sv_call_f32 (tanf, x, y, cmp);
+}
+
+/* Fast implementation of SVE tanf.
+   The maximum measured errors were located near RangeVal.
+   Maximum error: 3.121 ULP:
+   svtan_f32(0x1.ff3df8p+16) got -0x1.fbb7b8p-1
+			    want -0x1.fbb7b2p-1. */
+sv_f32_t
+__sv_tanf_x (sv_f32_t x, const svbool_t pg)
+{
+  /* Determine whether input is too large to perform fast range reduction. */
+  svbool_t cmp = svacge_f32 (pg, x, RangeVal);
+  svbool_t pred_minuszero = svcmpeq_f32 (pg, x, sv_f32 (-0.0));
+
+  /* n = rint(x/(pi/2)). */
+  sv_f32_t q = sv_fma_f32_x (pg, InvPio2, x, Shift);
+  sv_f32_t n = svsub_f32_x (pg, q, Shift);
+  /* n is representable as a signed integer, simply convert it. */
+  sv_s32_t in = sv_to_s32_f32_x (pg, n);
+  /* Determine if x lives in an interval where |tan(x)| grows to infinity. */
+  sv_s32_t alt = svand_s32_x (pg, in, sv_s32 (1));
+  svbool_t pred_alt = svcmpne_s32 (pg, alt, sv_s32 (0));
+
+  /* r = x - n * (pi/2) (range reduction into -pi/4 .. pi/4). */
+  sv_f32_t r;
+  r = sv_fma_f32_x (pg, NegPio2_1, n, x);
+  r = sv_fma_f32_x (pg, NegPio2_2, n, r);
+  r = sv_fma_f32_x (pg, NegPio2_3, n, r);
+
+  /* If x lives in an interval where |tan(x)|
+     - is finite, then use a polynomial approximation of the form
+       tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2).
+ - grows to infinity then use symmetries of tangent and the identity + tan(r) = cotan(pi/2 - r) to express tan(x) as 1/tan(-r). Finally, use + the same polynomial approximation of tan as above. */ + + /* Perform additional reduction if required. */ + sv_f32_t z = svneg_f32_m (r, pred_alt, r); + + /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4]. */ + sv_f32_t z2 = svmul_f32_x (pg, z, z); + sv_f32_t p = eval_poly (pg, z2); + sv_f32_t y = sv_fma_f32_x (pg, svmul_f32_x (pg, z, z2), p, z); + + /* Transform result back, if necessary. */ + sv_f32_t inv_y = svdiv_f32_x (pg, sv_f32 (1.0f), y); + y = svsel_f32 (pred_alt, inv_y, y); + + /* Fast reduction does not handle the x = -0.0 case well, + therefore it is fixed here. */ + y = svsel_f32 (pred_minuszero, x, y); + + /* No need to pass pg to specialcase here since cmp is a strict subset, + guaranteed by the cmpge above. */ + if (unlikely (svptest_any (pg, cmp))) + return __sv_tanf_specialcase (x, y, cmp); + return y; +} + +strong_alias (__sv_tanf_x, _ZGVsMxv_tanf) + +#endif diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 6b68c4a..23ee406 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -146,6 +146,9 @@ SVD (__sv_cos_x, -3.1, 3.1) SVD (_ZGVsMxv_cos, -3.1, 3.1) SVD (__sv_sin_x, -3.1, 3.1) SVD (_ZGVsMxv_sin, -3.1, 3.1) + +SVF (__sv_tanf_x, -3.1, 3.1) +SVF (_ZGVsMxv_tanf, -3.1, 3.1) #endif #endif // clang-format on diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 26349fd..25fa731 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -417,6 +417,17 @@ range_sve_erf=' -0x1p28 -inf 20000 ' +range_sve_tanf=' + -0.0 -0x1p126 100 + 0x1p-149 0x1p-126 4000 + 0x1p-126 0x1p-23 50000 + 0x1p-23 0.7 50000 + 0.7 1.5 50000 + 1.5 100 50000 + 100 0x1p17 50000 + 0x1p17 inf 50000 +' + # error limits L_erfc=3.14 L_erfcf=0.26 @@ -451,6 +462,7 @@ L_sve_log=1.68 L_sve_expf=1.46 L_sve_erff=0.76 L_sve_erf=1.97 +L_sve_tanf=2.7 while read G F R do @@ -548,6 +560,8 @@ sve_expf __sv_expf $runsv sve_expf _ZGVsMxv_expf $runsv sve_erff __sv_erff $runsv sve_erff _ZGVsMxv_erff $runsv +sve_tanf __sv_tanf $runsv +sve_tanf _ZGVsMxv_tanf $runsv sve_cos __sv_cos $runsv sve_cos _ZGVsMxv_cos $runsv diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 37ddc6d..42b089d 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -98,5 +98,7 @@ SVF1 (sin) ZSVF1 (sin) SVD1 (sin) ZSVD1 (sin) +SVF1 (tan) +ZSVF1 (tan) #endif #endif diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 4ff8e8a..93e2e7d 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -95,6 +95,7 @@ ZSVNF1_WRAP(exp) ZSVNF1_WRAP(log) ZSVNF1_WRAP(log10) ZSVNF1_WRAP(sin) +ZSVNF1_WRAP(tan) ZSVND2_WRAP(atan2) ZSVND1_WRAP(atan) -- cgit v1.2.3 From 05fe0091dece6bedc5ed807cc8aa49dd67d418c1 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Mon, 17 Oct 2022 09:54:04 +0100 Subject: pl/math: Add scalar acosh New routine uses AOR log and log1p. It is accurate to 3 ULP. 
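
The log1p-based branch described below exists because the direct form loses
precision close to x = 1: after forming sqrt(x*x - 1), adding it to x (which is
nearly 1) discards the low-order bits of the small square root before log ever
sees them. Writing t = x - 1 and using acosh(1 + t) = log1p(t + sqrt(2t + t*t))
keeps those bits. A small sketch contrasting the two forms (standard libm only;
the difference shows up in the low bits of the %a output):

  #include <math.h>
  #include <stdio.h>

  int
  main (void)
  {
    double x = 1.0 + 0x1p-30;
    double t = x - 1.0;
    /* Direct definition: the sum x + sqrt(...) rounds away the bits of the
       roughly 2^-14.5 square root that lie below 2^-52. */
    double direct = log (x + sqrt (x * x - 1.0));
    /* log1p form: the small argument is kept at full precision. */
    double via_log1p = log1p (t + sqrt (2.0 * t + t * t));
    printf ("direct    = %a\n", direct);
    printf ("via log1p = %a\n", via_log1p);
    return 0;
  }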
--- pl/math/acosh_3u.c | 55 +++++++++++++++++++++++++++++++ pl/math/include/mathlib.h | 1 + pl/math/test/mathbench_funcs.h | 1 + pl/math/test/runulp.sh | 7 ++++ pl/math/test/testcases/directed/acosh.tst | 19 +++++++++++ pl/math/test/ulp_funcs.h | 1 + 6 files changed, 84 insertions(+) create mode 100644 pl/math/acosh_3u.c create mode 100644 pl/math/test/testcases/directed/acosh.tst diff --git a/pl/math/acosh_3u.c b/pl/math/acosh_3u.c new file mode 100644 index 0000000..b946dcb --- /dev/null +++ b/pl/math/acosh_3u.c @@ -0,0 +1,55 @@ +/* + * Double-precision acosh(x) function. + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define Ln2 (0x1.62e42fefa39efp-1) +#define MinusZero (0x8000000000000000) +#define SquareLim (0x5fe0000000000000) /* asuint64(0x1.0p511). */ +#define Two (0x4000000000000000) /* asuint64(2.0). */ + +double +optr_aor_log_f64 (double); + +double +log1p (double); + +/* acosh approximation using a variety of approaches on different intervals: + + acosh(x) = ln(x + sqrt(x * x - 1)). + + x >= 2^511: We cannot square x without overflow. For huge x, sqrt(x*x - 1) is + close enough to x that we can calculate the result by ln(2x) == ln(x) + + ln(2). The greatest observed error in this region is 0.98 ULP: + acosh(0x1.1b9bf42923d1dp+853) got 0x1.28066a11a7c7fp+9 + want 0x1.28066a11a7c8p+9. + + x > 2: Calculate the result directly using definition of acosh(x). Greatest + observed error in this region is 1.33 ULP: + acosh(0x1.1e45d14bfcfa2p+1) got 0x1.71a06f50c34b5p+0 + want 0x1.71a06f50c34b6p+0. + + 0 <= x <= 2: Calculate the result using log1p. For x < 1, acosh(x) is + undefined. For 1 <= x <= 2, the largest observed error is 2.63 ULP: + acosh(0x1.072462f3df186p+0) got 0x1.e2a700043edabp-3 + want 0x1.e2a700043edaep-3. 
*/ +double +acosh (double x) +{ + uint64_t ix = asuint64 (x); + + if (unlikely (ix >= MinusZero)) + return __math_invalid (x); + + if (unlikely (ix >= SquareLim)) + return optr_aor_log_f64 (x) + Ln2; + + if (ix >= Two) + return optr_aor_log_f64 (x + sqrt (x * x - 1)); + + double xm1 = x - 1; + return log1p (xm1 + sqrt (2 * xm1 + xm1 * xm1)); +} diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index b53847f..662b756 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -18,6 +18,7 @@ float log10f (float); float log1pf (float); float tanf (float); +double acosh (double); double asinh (double); double atan2 (double, double); double erfc (double); diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 23ee406..9c98934 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -18,6 +18,7 @@ F (log2f, 0.01, 11.1) F (sinf, -3.1, 3.1) F (tanf, -3.1, 3.1) +D (acosh, 1.0, 10.0) D (asinh, -10.0, 10.0) D (atan, -10.0, 10.0) {"atan2", 'd', 0, -10.0, 10.0, {.d = atan2_wrap}}, diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 25fa731..d4aa69b 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -146,6 +146,13 @@ t acoshf 2 0x1p64 100000 t acoshf 0x1p64 inf 100000 t acoshf -0 -inf 10000 +L=2.14 +t acosh 0 1 10000 +t acosh 1 2 100000 +t acosh 2 0x1p511 100000 +t acosh 0x1p511 inf 100000 +t acosh -0 -inf 10000 + done # vector functions diff --git a/pl/math/test/testcases/directed/acosh.tst b/pl/math/test/testcases/directed/acosh.tst new file mode 100644 index 0000000..bbc1551 --- /dev/null +++ b/pl/math/test/testcases/directed/acosh.tst @@ -0,0 +1,19 @@ +; acosh.tst +; +; Copyright 2009-2022, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=acosh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=acosh op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=acosh op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=acosh op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=acosh op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 +func=acosh op1=3ff00000.00000000 result=00000000.00000000 errno=0 +func=acosh op1=3fefffff.ffffffff result=7ff80000.00000001 errno=EDOM status=i +func=acosh op1=00000000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=acosh op1=80000000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=acosh op1=bfefffff.ffffffff result=7ff80000.00000001 errno=EDOM status=i +func=acosh op1=bff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=acosh op1=bff00000.00000001 result=7ff80000.00000001 errno=EDOM status=i +func=acosh op1=fff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=acosh op1=7fe01ac0.7f03a83e result=40862e50.541778f1.8cc error=0 diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 42b089d..deb840b 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -11,6 +11,7 @@ F1 (erfc) F1 (erf) F1 (log10) F1 (log1p) +D1 (acosh) D1 (asinh) D2 (atan2) F1 (tan) -- cgit v1.2.3 From 6345ddad9f31dfedd9dd4e96e2571f42f2210545 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Wed, 19 Oct 2022 11:57:43 +0100 Subject: pl/math: Update ULP threshold for log1p A larger error was found, threshold updated in runulp. 
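
The new worst case can be spot-checked against a wider format as a stand-in
reference (illustrative; this exercises whichever log1p is linked in, on
AArch64 Linux long double is binary128, and the project's ulp tool is the
proper harness):

  #include <math.h>
  #include <stdio.h>

  int
  main (void)
  {
    double x = -0x1.2e49eddc007d4p-2;
    printf ("got = %a\n", log1p (x));
    printf ("ref = %La\n", log1pl ((long double) x));
    return 0;
  }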
---
 pl/math/log1p_2u.c     | 6 +++---
 pl/math/test/runulp.sh | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pl/math/log1p_2u.c b/pl/math/log1p_2u.c
index c214954..161475f 100644
--- a/pl/math/log1p_2u.c
+++ b/pl/math/log1p_2u.c
@@ -54,9 +54,9 @@ eval_poly (double f)
 
 /* log1p approximation using polynomial on reduced interval. Largest
    observed errors are near the lower boundary of the region where k is 0.
-   Maximum measured error: 1.7ULP.
-   log1p(-0x1.2e515c0f31f8p-2) got -0x1.6648c36863fc2p-2
-			       want -0x1.6648c36863fc4p-2. */
+   Maximum measured error: 1.72 ULP.
+   log1p(-0x1.2e49eddc007d4p-2) got -0x1.663e386abd899p-2
+				want -0x1.663e386abd89bp-2. */
 double
 log1p (double x)
 {
diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh
index d4aa69b..60e719f 100755
--- a/pl/math/test/runulp.sh
+++ b/pl/math/test/runulp.sh
@@ -103,7 +103,7 @@ t asinh -1.0 -100.0 10000
 t asinh 100.0 inf 50000
 t asinh -100.0 -inf 10000
 
-L=1.18
+L=1.24
 t log1p -10.0 10.0 10000
 t log1p 0.0 0x1p-23 50000
 t log1p 0x1p-23 0.001 50000
-- 
cgit v1.2.3


From cda6bb41a2030689dfae885d06c042111e5b07f3 Mon Sep 17 00:00:00 2001
From: Joe Ramsay
Date: Wed, 19 Oct 2022 14:21:00 +0100
Subject: pl/math: Add vector/SVE erfc

New routine is an SVE port of the Neon routine, including the exp_tail
helper. It is accurate to 4 ULP.
---
 pl/math/include/mathlib.h      |   2 +
 pl/math/sv_erfc_4u.c           | 136 +++++++++++++++++++++++++++++++++++++++++
 pl/math/sv_exp_tail.h          |  79 ++++++++++++++++++++++++
 pl/math/sv_math.h              |   7 +++
 pl/math/test/mathbench_funcs.h |   3 +
 pl/math/test/runulp.sh         |  12 ++++
 pl/math/test/ulp_funcs.h       |   2 +
 pl/math/test/ulp_wrappers.h    |   1 +
 8 files changed, 242 insertions(+)
 create mode 100644 pl/math/sv_erfc_4u.c
 create mode 100644 pl/math/sv_exp_tail.h

diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h
index 662b756..0b88e76 100644
--- a/pl/math/include/mathlib.h
+++ b/pl/math/include/mathlib.h
@@ -119,6 +119,7 @@ svfloat32_t __sv_cosf_x (svfloat32_t, svbool_t);
 svfloat64_t __sv_cos_x (svfloat64_t, svbool_t);
 svfloat32_t __sv_erff_x (svfloat32_t, svbool_t);
 svfloat64_t __sv_erf_x (svfloat64_t, svbool_t);
+svfloat64_t __sv_erfc_x (svfloat64_t, svbool_t);
 svfloat32_t __sv_expf_x (svfloat32_t, svbool_t);
 svfloat32_t __sv_logf_x (svfloat32_t, svbool_t);
 svfloat64_t __sv_log_x (svfloat64_t, svbool_t);
@@ -136,6 +137,7 @@ svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t);
 svfloat64_t _ZGVsMxv_cos (svfloat64_t, svbool_t);
 svfloat32_t _ZGVsMxv_erff (svfloat32_t, svbool_t);
 svfloat64_t _ZGVsMxv_erf (svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxv_erfc (svfloat64_t, svbool_t);
 svfloat32_t _ZGVsMxv_expf (svfloat32_t, svbool_t);
 svfloat32_t _ZGVsMxv_logf (svfloat32_t, svbool_t);
 svfloat64_t _ZGVsMxv_log (svfloat64_t, svbool_t);
diff --git a/pl/math/sv_erfc_4u.c b/pl/math/sv_erfc_4u.c
new file mode 100644
index 0000000..b4f2ff0
--- /dev/null
+++ b/pl/math/sv_erfc_4u.c
@@ -0,0 +1,136 @@
+/*
+ * Double-precision SVE erfc(x) function.
+ *
+ * Copyright (c) 2021-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+
+#if SV_SUPPORTED
+#include "sv_exp_tail.h"
+
+sv_f64_t __sv_exp_x (sv_f64_t, svbool_t);
+
+static NOINLINE sv_f64_t
+specialcase (sv_f64_t x, sv_f64_t y, svbool_t special)
+{
+  return sv_call_f64 (erfc, x, y, special);
+}
+
+static inline sv_u64_t
+lookup_interval_idx (const svbool_t pg, sv_f64_t abs_x)
+{
+  /* Interval index is calculated by (((abs(x) + 1)^4) >> 52) - 1023, bounded by
+     the number of polynomials.
*/ + sv_f64_t xp1 = svadd_n_f64_x (pg, abs_x, 1); + xp1 = svmul_f64_x (pg, xp1, xp1); + xp1 = svmul_f64_x (pg, xp1, xp1); + sv_u64_t interval_idx + = svsub_n_u64_x (pg, svlsr_n_u64_x (pg, sv_as_u64_f64 (xp1), 52), 1023); + return svsel_u64 (svcmple_n_u64 (pg, interval_idx, ERFC_NUM_INTERVALS), + interval_idx, sv_u64 (ERFC_NUM_INTERVALS)); +} + +static inline sv_f64_t +sv_eval_poly (const svbool_t pg, sv_f64_t z, sv_u64_t idx) +{ + sv_u64_t offset = svmul_n_u64_x (pg, idx, ERFC_POLY_ORDER + 1); + const double *base = &__v_erfc_data.poly[0][12]; + sv_f64_t r = sv_lookup_f64_x (pg, base, offset); + for (int i = 0; i < ERFC_POLY_ORDER; i++) + { + base--; + sv_f64_t c = sv_lookup_f64_x (pg, base, offset); + r = sv_fma_f64_x (pg, z, r, c); + } + return r; +} + +static inline sv_f64_t +sv_eval_gauss (const svbool_t pg, sv_f64_t abs_x) +{ + /* Accurate evaluation of exp(-x^2). This operation is sensitive to rounding + errors in x^2, so we compute an estimate for the error and use a custom exp + helper which corrects for the calculated error estimate. */ + sv_f64_t a2 = svmul_f64_x (pg, abs_x, abs_x); + + /* Split abs_x into (a_hi + a_lo), where a_hi is the 'large' component and + a_lo is the 'small' component. */ + const sv_f64_t scale = sv_f64 (0x1.0000002p27); + sv_f64_t a_hi = svneg_f64_x (pg, sv_fma_f64_x (pg, scale, abs_x, + svneg_f64_x (pg, abs_x))); + a_hi = sv_fma_f64_x (pg, scale, abs_x, a_hi); + sv_f64_t a_lo = svsub_f64_x (pg, abs_x, a_hi); + + sv_f64_t a_hi_neg = svneg_f64_x (pg, a_hi); + sv_f64_t a_lo_neg = svneg_f64_x (pg, a_lo); + + /* We can then estimate the error in abs_x^2 by computing (abs_x * abs_x) - + (a_hi + a_lo) * (a_hi + a_lo). */ + sv_f64_t e2 = sv_fma_f64_x (pg, a_hi_neg, a_hi, a2); + e2 = sv_fma_f64_x (pg, a_hi_neg, a_lo, e2); + e2 = sv_fma_f64_x (pg, a_lo_neg, a_hi, e2); + e2 = sv_fma_f64_x (pg, a_lo_neg, a_lo, e2); + + return sv_exp_tail (pg, svneg_f64_x (pg, a2), e2); +} + +/* Optimized double precision vector complementary error function erfc. + Maximum measured error is 3.63 ULP: + __sv_erfc(0x1.479279a3bbc74p+2) got 0x1.ff341c664edc5p-42 + want 0x1.ff341c664edc9p-42. */ +sv_f64_t +__sv_erfc_x (sv_f64_t x, const svbool_t pg) +{ + sv_u64_t ix = sv_as_u64_f64 (x); + sv_f64_t abs_x = svabs_f64_x (pg, x); + sv_u64_t atop = svlsr_n_u64_x (pg, sv_as_u64_f64 (abs_x), 52); + + /* Outside of the 'interesting' bounds, [-6, 28], +ve goes to 0, -ve goes + to 2. As long as the polynomial is 0 in the boring zone, we can assemble + the result correctly. This is dealt with in two ways: + + The 'coarse approach' is that the approximation algorithm is + zero-predicated on in_bounds = |x| < 32, which saves the need to do + coefficient lookup etc for |x| >= 32. + + The coarse approach misses [-32, -6] and [28, 32], which are dealt with in + the polynomial and index calculation, such that the polynomial evaluates to + 0 in these regions. */ + /* in_bounds is true for lanes where |x| < 32. */ + svbool_t in_bounds = svcmplt_n_u64 (pg, atop, 0x404); + /* boring_zone = 2 for x < 0, 0 otherwise. */ + sv_f64_t boring_zone + = sv_as_f64_u64 (svlsl_n_u64_x (pg, svlsr_n_u64_x (pg, ix, 63), 62)); + /* Very small, nan and inf. */ + svbool_t special_cases + = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, atop, 0x3cd), 0x432); + + /* erfc(|x|) ~= P_i(|x|-x_i)*exp(-x^2) + + Where P_i is a polynomial and x_i is an offset, both defined in + v_erfc_data.c. i is chosen based on which interval x falls in. 
*/ + sv_u64_t i = lookup_interval_idx (in_bounds, abs_x); + sv_f64_t x_i = sv_lookup_f64_x (in_bounds, __v_erfc_data.interval_bounds, i); + sv_f64_t p = sv_eval_poly (in_bounds, svsub_f64_x (pg, abs_x, x_i), i); + /* 'copy' sign of x to p, i.e. negate p if x is negative. */ + sv_u64_t sign = svbic_n_u64_z (in_bounds, ix, 0x7fffffffffffffff); + p = sv_as_f64_u64 (sveor_u64_z (in_bounds, sv_as_u64_f64 (p), sign)); + + sv_f64_t e = sv_eval_gauss (in_bounds, abs_x); + + /* Assemble result: 2-p*e if x<0, p*e otherwise. No need to conditionally + select boring_zone because P[V_ERFC_NINTS-1]=0. */ + sv_f64_t y = sv_fma_f64_x (pg, p, e, boring_zone); + + if (unlikely (svptest_any (pg, special_cases))) + { + return specialcase (x, y, special_cases); + } + return y; +} + +strong_alias (__sv_erfc_x, _ZGVsMxv_erfc) + +#endif diff --git a/pl/math/sv_exp_tail.h b/pl/math/sv_exp_tail.h new file mode 100644 index 0000000..846fe97 --- /dev/null +++ b/pl/math/sv_exp_tail.h @@ -0,0 +1,79 @@ +/* + * Double-precision SVE e^(x+tail) function. + * + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef SV_EXP_TAIL_H +#define SV_EXP_TAIL_H + +#include "sv_math.h" +#if SV_SUPPORTED + +#include "v_exp_tail.h" + +#define C1 sv_f64 (C1_scal) +#define C2 sv_f64 (C2_scal) +#define C3 sv_f64 (C3_scal) +#define MinusLn2hi (-Ln2hi_scal) +#define MinusLn2lo (-Ln2lo_scal) + +#define N (1 << V_EXP_TAIL_TABLE_BITS) +#define Tab __v_exp_tail_data +#define IndexMask (N - 1) +#define Shift sv_f64 (0x1.8p+52) +#define Thres 704.0 + +static inline sv_f64_t +sv_exp_tail_special_case (svbool_t pg, sv_f64_t s, sv_f64_t y, sv_f64_t n) +{ + sv_f64_t absn = svabs_f64_x (pg, n); + + /* 2^(n/N) may overflow, break it up into s1*s2. */ + sv_u64_t b = svsel_u64 (svcmple_n_f64 (pg, n, 0), sv_u64 (0x6000000000000000), + sv_u64 (0)); + sv_f64_t s1 = sv_as_f64_u64 (svsubr_n_u64_x (pg, b, 0x7000000000000000)); + sv_f64_t s2 = sv_as_f64_u64 ( + svadd_u64_x (pg, svsub_n_u64_x (pg, sv_as_u64_f64 (s), 0x3010000000000000), + b)); + + svbool_t cmp = svcmpgt_n_f64 (pg, absn, 1280.0 * N); + sv_f64_t r1 = svmul_f64_x (pg, s1, s1); + sv_f64_t r0 = svmul_f64_x (pg, sv_fma_f64_x (pg, y, s2, s2), s1); + return svsel_f64 (cmp, r1, r0); +} + +static inline sv_f64_t +sv_exp_tail (const svbool_t pg, sv_f64_t x, sv_f64_t xtail) +{ + /* Calculate exp(x + xtail). */ + sv_f64_t z = sv_fma_n_f64_x (pg, InvLn2_scal, x, Shift); + sv_f64_t n = svsub_f64_x (pg, z, Shift); + + sv_f64_t r = sv_fma_n_f64_x (pg, MinusLn2hi, n, x); + r = sv_fma_n_f64_x (pg, MinusLn2lo, n, r); + + sv_u64_t u = sv_as_u64_f64 (z); + sv_u64_t e = svlsl_n_u64_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS); + sv_u64_t i = svand_n_u64_x (pg, u, IndexMask); + + sv_f64_t y = sv_fma_f64_x (pg, C3, r, C2); + y = sv_fma_f64_x (pg, y, r, C1); + y = sv_fma_f64_x (pg, y, r, sv_f64 (1.0)); + y = sv_fma_f64_x (pg, y, r, xtail); + + /* s = 2^(n/N). */ + u = sv_lookup_u64_x (pg, Tab, i); + sv_f64_t s = sv_as_f64_u64 (svadd_u64_x (pg, u, e)); + + svbool_t cmp = svcmpgt_n_f64 (pg, svabs_f64_x (pg, x), Thres); + if (unlikely (svptest_any (pg, cmp))) + { + return sv_exp_tail_special_case (pg, s, y, n); + } + return sv_fma_f64_x (pg, y, s, s); +} + +#endif +#endif diff --git a/pl/math/sv_math.h b/pl/math/sv_math.h index b574f10..7f06a11 100644 --- a/pl/math/sv_math.h +++ b/pl/math/sv_math.h @@ -125,6 +125,13 @@ sv_call2_f64 (f64_t (*f) (f64_t, f64_t), sv_f64_t x1, sv_f64_t x2, sv_f64_t y, return y; } +/* Load array of uint64_t into svuint64_t. 
*/ +static inline sv_u64_t +sv_lookup_u64_x (svbool_t pg, const u64_t *tab, sv_u64_t idx) +{ + return svld1_gather_u64index_u64 (pg, tab, idx); +} + /* Load array of double into svfloat64_t. */ static inline sv_f64_t sv_lookup_f64_x (svbool_t pg, const f64_t *tab, sv_u64_t idx) diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 9c98934..edd5c0d 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -125,6 +125,9 @@ SVF (_ZGVsMxv_erff, -4.0, 4.0) SVD (__sv_erf_x, -4.0, 4.0) SVD (_ZGVsMxv_erf, -4.0, 4.0) +SVD (__sv_erfc_x, -4, 10) +SVD (_ZGVsMxv_erfc, -4, 10) + SVF (__sv_expf_x, -9.9, 9.9) SVF (_ZGVsMxv_expf, -9.9, 9.9) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 60e719f..0f5d70d 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -435,6 +435,15 @@ range_sve_tanf=' 0x1p17 inf 50000 ' +range_sve_erfc=' + 0 0xffff0000 10000 + 0x1p-127 0x1p-26 40000 + -0x1p-127 -0x1p-26 40000 + 0x1p-26 0x1p5 40000 + -0x1p-26 -0x1p3 40000 + 0 inf 40000 +' + # error limits L_erfc=3.14 L_erfcf=0.26 @@ -470,6 +479,7 @@ L_sve_expf=1.46 L_sve_erff=0.76 L_sve_erf=1.97 L_sve_tanf=2.7 +L_sve_erfc=3.14 while read G F R do @@ -584,6 +594,8 @@ sve_log __sv_log $runsv sve_log _ZGVsMxv_log $runsv sve_erf __sv_erf $runsv sve_erf _ZGVsMxv_erf $runsv +sve_erfc __sv_erfc $runsv +sve_erfc _ZGVsMxv_erfc $runsv fi EOF diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index deb840b..9870bf0 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -85,6 +85,8 @@ SVF1 (erf) ZSVF1 (erf) SVD1 (erf) ZSVD1 (erf) +SVD1 (erfc) +ZSVD1 (erfc) SVF1 (exp) ZSVF1 (exp) SVF1 (log) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 93e2e7d..cbe281b 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -101,6 +101,7 @@ ZSVND2_WRAP(atan2) ZSVND1_WRAP(atan) ZSVND1_WRAP(cos) ZSVND1_WRAP(erf) +ZSVND1_WRAP(erfc) ZSVND1_WRAP(log) ZSVND1_WRAP(log10) ZSVND1_WRAP(sin) -- cgit v1.2.3 From 0e150e494d04bd501652acc34bf9de275d3c02b4 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Wed, 19 Oct 2022 14:21:20 +0100 Subject: Fix NaN behaviour in atan2 NaN handling in double-precision atan2 was not consistent with the reference. This is now updated. Since atan2 is used as reference for atan2f, we also have to update the NaN handling there. An explicit test for correct -NaN handling has been added in runulp.sh atan2 is used as a scalar fallback for vector variants of atan2 where an input is NaN - the same problem was observed for those and is now fixed. --- pl/math/atan2_2u.c | 17 +++++++---------- pl/math/atan2f_3u.c | 9 +-------- pl/math/test/runulp.sh | 3 +++ 3 files changed, 11 insertions(+), 18 deletions(-) diff --git a/pl/math/atan2_2u.c b/pl/math/atan2_2u.c index 9bd88ef..7638c30 100644 --- a/pl/math/atan2_2u.c +++ b/pl/math/atan2_2u.c @@ -5,6 +5,8 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +#include + #include "math_config.h" #include "atan_common.h" @@ -51,16 +53,11 @@ atan2 (double y, double x) uint64_t iax = ix & ~SignMask; uint64_t iay = iy & ~SignMask; - /* x or y is NaN. */ - if ((iax > 0x7ff0000000000000) || (iay > 0x7ff0000000000000)) - { - if (unlikely ((iax > 0x7f80000000000000) && (iay > 0x7f80000000000000))) - { - /* Both are NaN. Force sign to be +ve. 
*/ - return (asdouble (iax) + asdouble (iay)); - } - return x + y; - } + bool xisnan = isnan (x); + if (unlikely (isnan (y) && !xisnan)) + return __math_invalid (y); + if (unlikely (xisnan)) + return __math_invalid (x); /* m = 2 * sign(x) + sign(y). */ uint32_t m = ((iy >> 63) & 1) | ((ix >> 62) & 2); diff --git a/pl/math/atan2f_3u.c b/pl/math/atan2f_3u.c index d2f1749..3fa6296 100644 --- a/pl/math/atan2f_3u.c +++ b/pl/math/atan2f_3u.c @@ -53,14 +53,7 @@ atan2f (float y, float x) /* x or y is NaN. */ if ((iax > 0x7f800000) || (iay > 0x7f800000)) - { - if (unlikely ((iax > 0x7f800000) && (iay > 0x7f800000))) - { - /* Both are NaN. Force sign to be +ve. */ - return (asfloat (iax) + asfloat (iay)); - } - return x + y; - } + return x + y; /* m = 2 * sign(x) + sign(y). */ uint32_t m = ((iy >> 31) & 1) | ((ix >> 30) & 2); diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 0f5d70d..fd1ad5f 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -80,6 +80,9 @@ t atan2 -1.0 1.0 40000 t atan2 0.0 1.0 40000 t atan2 1.0 100.0 40000 t atan2 1e6 1e32 40000 +# Regression-test for correct NaN handling +check atan2 0x1.7887a0a717aefp+1017 0x1.7887a0a717aefp+1017 x -nan -nan +check atan2 nan nan x -nan -nan L=2.4 t atan2f -10.0 10.0 50000 -- cgit v1.2.3 From bf4e49b34ab462d84dcbf34d1f307fe2992869b1 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Wed, 19 Oct 2022 14:25:36 +0100 Subject: pl/math: Add vector/Neon log1p New routine is based on the scalar algorithm, and accurate to 2.5 ULP. --- pl/math/include/mathlib.h | 4 ++ pl/math/log1p_2u.c | 35 +--------------- pl/math/log1p_common.h | 61 ++++++++++++++++++++++++++++ pl/math/s_log1p_2u5.c | 6 +++ pl/math/test/mathbench_funcs.h | 5 +++ pl/math/test/runulp.sh | 16 ++++++++ pl/math/test/ulp_funcs.h | 3 ++ pl/math/test/ulp_wrappers.h | 2 + pl/math/v_log1p_2u5.c | 92 ++++++++++++++++++++++++++++++++++++++++++ pl/math/vn_log1p_2u5.c | 12 ++++++ 10 files changed, 203 insertions(+), 33 deletions(-) create mode 100644 pl/math/log1p_common.h create mode 100644 pl/math/s_log1p_2u5.c create mode 100644 pl/math/v_log1p_2u5.c create mode 100644 pl/math/vn_log1p_2u5.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 0b88e76..fd3702a 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -40,6 +40,7 @@ double __s_atan2 (double, double); double __s_erf (double); double __s_erfc (double); double __s_log10 (double); +double __s_log1p (double); double __s_log2 (double); #if __aarch64__ @@ -66,6 +67,7 @@ __f64x2_t __v_erfc (__f64x2_t); __f32x4_t __v_log10f (__f32x4_t); __f64x2_t __v_log10 (__f64x2_t); __f32x4_t __v_log1pf (__f32x4_t); +__f64x2_t __v_log1p (__f64x2_t); __f32x4_t __v_log2f (__f32x4_t); __f64x2_t __v_log2 (__f64x2_t); __f32x4_t __v_tanf (__f32x4_t); @@ -86,6 +88,7 @@ __vpcs __f64x2_t __vn_erfc (__f64x2_t); __vpcs __f32x4_t __vn_log10f (__f32x4_t); __vpcs __f64x2_t __vn_log10 (__f64x2_t); __vpcs __f32x4_t __vn_log1pf (__f32x4_t); +__vpcs __f64x2_t __vn_log1p (__f64x2_t); __vpcs __f32x4_t __vn_log2f (__f32x4_t); __vpcs __f64x2_t __vn_log2 (__f64x2_t); __vpcs __f32x4_t __vn_tanf (__f32x4_t); @@ -103,6 +106,7 @@ __vpcs __f64x2_t _ZGVnN2v_erfc (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t); diff --git a/pl/math/log1p_2u.c 
b/pl/math/log1p_2u.c index 161475f..ddef7c5 100644 --- a/pl/math/log1p_2u.c +++ b/pl/math/log1p_2u.c @@ -6,6 +6,8 @@ #include "math_config.h" +#include "log1p_common.h" + #define Ln2Hi 0x1.62e42fefa3800p-1 #define Ln2Lo 0x1.ef35793c76730p-45 #define HfRt2Top 0x3fe6a09e /* top32(asuint64(sqrt(2)/2)). */ @@ -18,39 +20,6 @@ #define AbsMask 0x7fffffffffffffff #define ExpM63 0x3c00 -#define C(i) __log1p_data.coeffs[i] - -static inline double -eval_poly (double f) -{ - /* Evaluate polynomial using Estrin's method. */ - double p_01 = fma (f, C (1), C (0)); - double p_23 = fma (f, C (3), C (2)); - double p_45 = fma (f, C (5), C (4)); - double p_67 = fma (f, C (7), C (6)); - double p_89 = fma (f, C (9), C (8)); - double p_ab = fma (f, C (11), C (10)); - double p_cd = fma (f, C (13), C (12)); - double p_ef = fma (f, C (15), C (14)); - double p_gh = fma (f, C (17), C (16)); - - double f2 = f * f; - double p_03 = fma (f2, p_23, p_01); - double p_47 = fma (f2, p_67, p_45); - double p_8b = fma (f2, p_ab, p_89); - double p_cf = fma (f2, p_ef, p_cd); - double p_gi = fma (f2, C (18), p_gh); - - double f4 = f2 * f2; - double p_07 = fma (f4, p_47, p_03); - double p_8f = fma (f4, p_cf, p_8b); - - double f8 = f4 * f4; - double p_0f = fma (f8, p_8f, p_07); - - return fma (f8 * f8, p_gi, p_0f); -} - /* log1p approximation using polynomial on reduced interval. Largest observed errors are near the lower boundary of the region where k is 0. diff --git a/pl/math/log1p_common.h b/pl/math/log1p_common.h new file mode 100644 index 0000000..24e6f20 --- /dev/null +++ b/pl/math/log1p_common.h @@ -0,0 +1,61 @@ +/* + * Double-precision polynomial evaluation function for scalar and vector + * log1p(x) + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_LOG1P_COMMON_H +#define PL_MATH_LOG1P_COMMON_H + +#include "math_config.h" + +#if V_SUPPORTED + +#include "v_math.h" + +#define DBL_T v_f64_t +#define FMA v_fma_f64 +#define C(i) v_f64 (__log1p_data.coeffs[i]) + +#else + +#define DBL_T double +#define FMA fma +#define C(i) __log1p_data.coeffs[i] + +#endif + +static inline DBL_T +eval_poly (DBL_T f) +{ + /* Evaluate polynomial using Estrin's method. */ + DBL_T p_01 = FMA (f, C (1), C (0)); + DBL_T p_23 = FMA (f, C (3), C (2)); + DBL_T p_45 = FMA (f, C (5), C (4)); + DBL_T p_67 = FMA (f, C (7), C (6)); + DBL_T p_89 = FMA (f, C (9), C (8)); + DBL_T p_ab = FMA (f, C (11), C (10)); + DBL_T p_cd = FMA (f, C (13), C (12)); + DBL_T p_ef = FMA (f, C (15), C (14)); + DBL_T p_gh = FMA (f, C (17), C (16)); + + DBL_T f2 = f * f; + DBL_T p_03 = FMA (f2, p_23, p_01); + DBL_T p_47 = FMA (f2, p_67, p_45); + DBL_T p_8b = FMA (f2, p_ab, p_89); + DBL_T p_cf = FMA (f2, p_ef, p_cd); + DBL_T p_gi = FMA (f2, C (18), p_gh); + + DBL_T f4 = f2 * f2; + DBL_T p_07 = FMA (f4, p_47, p_03); + DBL_T p_8f = FMA (f4, p_cf, p_8b); + + DBL_T f8 = f4 * f4; + DBL_T p_0f = FMA (f8, p_8f, p_07); + + return FMA (f8 * f8, p_gi, p_0f); +} + +#endif // PL_MATH_LOG1P_COMMON_H diff --git a/pl/math/s_log1p_2u5.c b/pl/math/s_log1p_2u5.c new file mode 100644 index 0000000..1d96025 --- /dev/null +++ b/pl/math/s_log1p_2u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_log1p_2u5.c" diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index edd5c0d..6774ebb 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -43,6 +43,7 @@ D (__s_erfc, -6.0, 28.0) F (__s_log10f, 0.01, 11.1) D (__s_log10, 0.01, 11.1) F (__s_log1pf, -0.9, 10.0) +D (__s_log1p, -0.9, 10.0) F (__s_log2f, 0.01, 11.1) D (__s_log2, 0.01, 11.1) F (__s_tanf, -3.1, 3.1) @@ -59,6 +60,7 @@ VD (__v_erfc, -6.0, 28.0) VD (__v_log10, 0.01, 11.1) VF (__v_log10f, 0.01, 11.1) VF (__v_log1pf, -0.9, 10.0) +VD (__v_log1p, -0.9, 10.0) VF (__v_log2f, 0.01, 11.1) VD (__v_log2, 0.01, 11.1) VF (__v_tanf, -3.1, 3.1) @@ -99,6 +101,9 @@ VND (_ZGVnN2v_log10, 0.01, 11.1) VNF (__vn_log1pf, -0.9, 10.0) VNF (_ZGVnN4v_log1pf, -0.9, 10.0) +VND (__vn_log1p, -0.9, 10.0) +VND (_ZGVnN2v_log1p, -0.9, 10.0) + VNF (__vn_log2f, 0.01, 11.1) VNF (_ZGVnN4v_log2f, 0.01, 11.1) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index fd1ad5f..fac5cfa 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -306,6 +306,17 @@ range_tanf=' 0x1p17 inf 50000 ' +range_log1p=' + -10.0 10.0 10000 + 0.0 0x1p-23 50000 + 0x1p-23 0.001 50000 + 0.001 1.0 50000 + 0.0 -0x1p-23 50000 + -0x1p-23 -0.001 50000 + -0.001 -1.0 50000 + -1.0 inf 5000 +' + range_sve_cosf=' 0 0xffff0000 10000 0x1p-4 0x1p4 500000 @@ -464,6 +475,7 @@ L_asinhf=2.17 L_log2f=2.10 L_log2=2.09 L_tanf=2.7 +L_log1p=1.97 L_sve_cosf=1.57 L_sve_cos=1.61 @@ -562,6 +574,10 @@ tanf __s_tanf $runs tanf __v_tanf $runv tanf __vn_tanf $runvn tanf _ZGVnN4v_tanf $runvn +log1p __s_log1p $runs +log1p __v_log1p $runv +log1p __vn_log1p $runvn +log1p _ZGVnN2v_log1p $runvn if [ $WANT_SVE_MATH -eq 1 ]; then sve_cosf __sv_cosf $runsv diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 9870bf0..efa92f5 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -31,6 +31,7 @@ SD1 (erfc) SF1 (log10) SD1 (log10) SF1 (log1p) +SD1 (log1p) SF1 (log2) SD1 (log2) SF1 (tan) @@ -47,6 +48,7 @@ VD1 (erfc) VF1 (log10) VD1 (log10) VF1 (log1p) +VD1 (log1p) VF1 (log2) VD1 (log2) VF1 (tan) @@ -63,6 +65,7 @@ ZVND1 (erfc) ZVNF1 (log10) ZVND1 (log10) ZVNF1 (log1p) +ZVND1 (log1p) ZVNF1 (log2) ZVND1 (log2) ZVNF1 (tan) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index cbe281b..7693578 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -68,6 +68,7 @@ VD2_WRAP(atan2) VD1_WRAP(erf) VD1_WRAP(erfc) VD1_WRAP(log10) +VD1_WRAP(log1p) VD1_WRAP(log2) #ifdef __vpcs ZVNF1_WRAP(asinh) @@ -84,6 +85,7 @@ ZVND2_WRAP(atan2) ZVND1_WRAP(erf) ZVND1_WRAP(erfc) ZVND1_WRAP(log10) +ZVND1_WRAP(log1p) ZVND1_WRAP(log2) #endif #if WANT_SVE_MATH diff --git a/pl/math/v_log1p_2u5.c b/pl/math/v_log1p_2u5.c new file mode 100644 index 0000000..51d0d51 --- /dev/null +++ b/pl/math/v_log1p_2u5.c @@ -0,0 +1,92 @@ +/* + * Double-precision vector log(1+x) function. + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#if V_SUPPORTED + +#include "log1p_common.h" + +#define Ln2Hi v_f64 (0x1.62e42fefa3800p-1) +#define Ln2Lo v_f64 (0x1.ef35793c76730p-45) +#define HfRt2Top 0x3fe6a09e00000000 /* top32(asuint64(sqrt(2)/2)) << 32. */ +#define OneMHfRt2Top \ + 0x00095f6200000000 /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) \ + << 32. 
*/ +#define OneTop12 0x3ff +#define BottomMask 0xffffffff +#define AbsMask 0x7fffffffffffffff + +VPCS_ATTR +NOINLINE static v_f64_t +specialcase (v_f64_t x, v_f64_t y, v_u64_t special) +{ + return v_call_f64 (log1p, x, y, special); +} + +/* Vector log1p approximation using polynomial on reduced interval. Routine is a + modification of the algorithm used in scalar log1p, with no shortcut for k=0 + and no narrowing for f and k. Maximum observed error is 2.46 ULP: + __v_log1p(0x1.654a1307242a4p+11) got 0x1.fd5565fb590f4p+2 + want 0x1.fd5565fb590f6p+2 . */ +VPCS_ATTR v_f64_t V_NAME (log1p) (v_f64_t x) +{ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t ia = ix & AbsMask; + v_u64_t special + = v_cond_u64 ((ia >= v_u64 (0x7ff0000000000000)) + | (ix >= 0xbff0000000000000) | (ix == 0x8000000000000000)); + + /* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f + is in [sqrt(2)/2, sqrt(2)]): + log1p(x) = k*log(2) + log1p(f). + + f may not be representable exactly, so we need a correction term: + let m = round(1 + x), c = (1 + x) - m. + c << m: at very small x, log1p(x) ~ x, hence: + log(1+x) - log(m) ~ c/m. + + We therefore calculate log1p(x) by k*log2 + log1p(f) + c/m. */ + + /* Obtain correctly scaled k by manipulation in the exponent. + The scalar algorithm casts down to 32-bit at this point to calculate k and + u_red. We stay in double-width to obtain f and k, using the same constants + as the scalar algorithm but shifted left by 32. */ + v_f64_t m = x + 1; + v_u64_t mi = v_as_u64_f64 (m); + v_u64_t u = mi + OneMHfRt2Top; + + v_s64_t ki = v_as_s64_u64 (u >> 52) - OneTop12; + v_f64_t k = v_to_f64_s64 (ki); + + /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */ + v_u64_t utop = (u & 0x000fffff00000000) + HfRt2Top; + v_u64_t u_red = utop | (mi & BottomMask); + v_f64_t f = v_as_f64_u64 (u_red) - 1; + + /* Correction term c/m. */ + v_f64_t cm = (x - (m - 1)) / m; + + /* Approximate log1p(x) on the reduced input using a polynomial. Because + log1p(0)=0 we choose an approximation of the form: + x + C0*x^2 + C1*x^3 + C2x^4 + ... + Hence approximation has the form f + f^2 * P(f) + where P(x) = C0 + C1*x + C2x^2 + ... + Assembling this all correctly is dealt with at the final step. */ + v_f64_t p = eval_poly (f); + + v_f64_t ylo = v_fma_f64 (k, Ln2Lo, cm); + v_f64_t yhi = v_fma_f64 (k, Ln2Hi, f); + v_f64_t y = v_fma_f64 (f * f, p, ylo + yhi); + + if (unlikely (v_any_u64 (special))) + return specialcase (x, y, special); + + return y; +} + +VPCS_ALIAS + +#endif diff --git a/pl/math/vn_log1p_2u5.c b/pl/math/vn_log1p_2u5.c new file mode 100644 index 0000000..4fed0b3 --- /dev/null +++ b/pl/math/vn_log1p_2u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_log1p. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_log1p, _ZGVnN2v_log1p) +#include "v_log1p_2u5.c" +#endif -- cgit v1.2.3 From bb96da1a7eedb002278cf09413f2faac852c4f24 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Wed, 19 Oct 2022 14:34:49 +0100 Subject: pl/math: Add vector/SVE powif Add single-precision SVE powi. powi is not required to meet any particular accuracy requirements, so we do not test in the usual way. Instead we check for bitwise reproducibility with a scalar reference. Vector powif does not follow the usual naming convention. There is no trailing f in the ZGV... alias, instead the i denotes 32-bit integer, which distinguishes it from vector powk. 
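
For reference, the squaring scheme reduces to the scalar sketch below.
This is an illustrative model only (powif_ref is a made-up name, not
part of the library); the SVE routine in sv_powif.c vectorises the same
loop and bounds the iteration count with a maxv across lanes:

  /* Scalar model of powif: successive squaring, right to left.
     Negative powers are handled by computing the abs(n) version and
     then taking the reciprocal.  */
  #include <stdbool.h>
  #include <stdint.h>

  static float
  powif_ref (float x, int32_t n)
  {
    bool want_recip = n < 0;
    /* Negate in unsigned arithmetic so n == INT32_MIN is well defined.  */
    uint32_t abs_n = n < 0 ? -(uint32_t) n : (uint32_t) n;
    float acc = 1.0f;
    for (float c = x; abs_n != 0; abs_n >>= 1, c = c * c)
      /* Multiply in the current square whenever the low bit is set.  */
      if (abs_n & 1)
        acc *= c;
    return want_recip ? 1.0f / acc : acc;
  }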
--- pl/math/include/mathlib.h | 2 ++ pl/math/sv_powif.c | 54 +++++++++++++++++++++++++++++++++++++++ pl/math/test/mathbench_funcs.h | 5 ++++ pl/math/test/mathbench_wrappers.h | 18 +++++++++++++ pl/math/test/runulp.sh | 6 +++++ pl/math/test/ulp_funcs.h | 2 ++ pl/math/test/ulp_wrappers.h | 44 +++++++++++++++++++++++++++++++ 7 files changed, 131 insertions(+) create mode 100644 pl/math/sv_powif.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index fd3702a..c280c7e 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -129,6 +129,7 @@ svfloat32_t __sv_logf_x (svfloat32_t, svbool_t); svfloat64_t __sv_log_x (svfloat64_t, svbool_t); svfloat32_t __sv_log10f_x (svfloat32_t, svbool_t); svfloat64_t __sv_log10_x (svfloat64_t, svbool_t); +svfloat32_t __sv_powif_x (svfloat32_t, svint32_t, svbool_t); svfloat32_t __sv_sinf_x (svfloat32_t, svbool_t); svfloat64_t __sv_sin_x (svfloat64_t, svbool_t); svfloat32_t __sv_tanf_x (svfloat32_t, svbool_t); @@ -147,6 +148,7 @@ svfloat32_t _ZGVsMxv_logf (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_log (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_log10f (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_log10 (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxvv_powi(svfloat32_t, svint32_t, svbool_t); svfloat32_t _ZGVsMxv_sinf (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_sin (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_tanf (svfloat32_t, svbool_t); diff --git a/pl/math/sv_powif.c b/pl/math/sv_powif.c new file mode 100644 index 0000000..819c318 --- /dev/null +++ b/pl/math/sv_powif.c @@ -0,0 +1,54 @@ +/* + * Single-precision SVE powi(x, n) function. + * + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#if SV_SUPPORTED + +/* Optimized single-precision vector powi (float base, integer power). + powi is developed for environments in which accuracy is of much less + importance than performance, hence we provide no estimate for worst-case + error. */ +svfloat32_t +__sv_powif_x (svfloat32_t as, svint32_t ns, svbool_t p) +{ + /* Compute powi by successive squaring, right to left. */ + svfloat32_t acc = svdup_n_f32 (1.f); + svbool_t want_recip = svcmplt_n_s32 (p, ns, 0); + svuint32_t ns_abs = svreinterpret_u32_s32 (svabs_s32_x (p, ns)); + + /* We use a max to avoid needing to check whether any lane != 0 on each + iteration. */ + uint32_t max_n = svmaxv_u32 (p, ns_abs); + + svfloat32_t c = as; + /* Successively square c, and use merging predication (_m) to determine + whether or not to perform the multiplication or keep the previous + iteration. */ + while (true) + { + svbool_t px = svcmpeq_n_u32 (p, svand_n_u32_x (p, ns_abs, 1), 1); + acc = svmul_f32_m (px, acc, c); + max_n >>= 1; + if (max_n == 0) + break; + + ns_abs = svlsr_n_u32_x (p, ns_abs, 1); + c = svmul_f32_x (p, c, c); + } + + /* Negative powers are handled by computing the abs(n) version and then + taking the reciprocal. */ + if (svptest_any (want_recip, want_recip)) + acc = svdivr_n_f32_m (want_recip, acc, 1.0f); + + return acc; +} + +/* Note no trailing f for ZGV... name - 64-bit integer version is powk. 
*/ +strong_alias (__sv_powif_x, _ZGVsMxvv_powi) + +#endif // SV_SUPPORTED diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 6774ebb..35adc54 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -28,6 +28,7 @@ D (erfc, -6.0, 28.0) D (log10, 0.01, 11.1) D (log1p, -0.9, 10.0) D (log2, 0.01, 11.1) +{"powi", 'd', 0, 0.01, 11.1, {.d = powi_wrap}}, D (sin, -3.1, 3.1) #if WANT_VMATH @@ -158,6 +159,10 @@ SVD (_ZGVsMxv_sin, -3.1, 3.1) SVF (__sv_tanf_x, -3.1, 3.1) SVF (_ZGVsMxv_tanf, -3.1, 3.1) + +{"__sv_powif_x", 'f', 'n', -10.0, 10.0, {.svf = __sv_powif_wrap}}, +{"_ZGVsMxvv_powi", 'f', 'n', -10.0, 10.0, {.svf = _Z_sv_powi_wrap}}, + #endif #endif // clang-format on diff --git a/pl/math/test/mathbench_wrappers.h b/pl/math/test/mathbench_wrappers.h index fa1b99a..92e4454 100644 --- a/pl/math/test/mathbench_wrappers.h +++ b/pl/math/test/mathbench_wrappers.h @@ -17,6 +17,12 @@ atan2f_wrap (float x) return atan2f (5.0f, x); } +static double +powi_wrap (double x) +{ + return __builtin_powi (x, (int) round (x)); +} + #if WANT_VMATH #if __aarch64__ @@ -100,4 +106,16 @@ _Z_sv_atan2_wrap (sv_double x, sv_bool pg) return _ZGVsMxvv_atan2 (x, svdup_n_f64 (5.0), pg); } +static sv_float +_Z_sv_powi_wrap (sv_float x, sv_bool pg) +{ + return _ZGVsMxvv_powi (x, svcvt_s32_f32_x (pg, x), pg); +} + +static sv_float +__sv_powif_wrap (sv_float x, sv_bool pg) +{ + return __sv_powif_x (x, svcvt_s32_f32_x (pg, x), pg); +} + #endif // WANT_SVE_MATH diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index fac5cfa..50256bf 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -174,6 +174,12 @@ check __sv_cosf 0 && runsv=1 check __sv_cos 0 && runsv=1 check __sv_sinf 0 && runsv=1 check __sv_sin 0 && runsv=1 +# No guarantees about powi accuracy, so regression-test for exactness +# w.r.t. the custom reference impl in ulp_wrappers.h +check -q -f -e 0 __sv_powif 0 inf x 0 1000 100000 && runsv=1 +check -q -f -e 0 __sv_powif -0 -inf x 0 1000 100000 && runsv=1 +check -q -f -e 0 __sv_powif 0 inf x -0 -1000 100000 && runsv=1 +check -q -f -e 0 __sv_powif -0 -inf x -0 -1000 100000 && runsv=1 fi range_erfc=' diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index efa92f5..89d8181 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -100,6 +100,8 @@ SVF1 (log10) ZSVF1 (log10) SVD1 (log10) ZSVD1 (log10) +F (__sv_powif, sv_powif, ref_powif, mpfr_powi, 2, 1, f2, 0) +F (_ZGVsMxvv_powi, Z_sv_powi, ref_powif, mpfr_powi, 2, 1, f2, 0) SVF1 (sin) ZSVF1 (sin) SVD1 (sin) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 7693578..d19b97a 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -6,6 +6,8 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +#include + #if USE_MPFR static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y, x, r); @@ -15,8 +17,48 @@ static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y, x, r); return mpfr_cos(y, x, r); } +static int wrap_mpfr_powi(mpfr_t ret, const mpfr_t x, const mpfr_t y, mpfr_rnd_t rnd) { + mpfr_t y2; + mpfr_init(y2); + mpfr_trunc(y2, y); + return mpfr_pow(ret, x, y2, rnd); +} #endif +/* Our implementations of powi/powk are too imprecise to verify + against any established pow implementation. Instead we have the + following simple implementation, against which it is enough to + maintain bitwise reproducibility. 
Note the test framework expects + the reference impl to be of higher precision than the function + under test. For instance this means that the reference for + double-precision powi will be passed a long double, so to check + bitwise reproducibility we have to cast it back down to + double. This is fine since a round-trip to higher precision and + back down is correctly rounded. */ +#define DECL_POW_INT_REF(NAME, DBL_T, FLT_T, INT_T) \ + static DBL_T NAME (DBL_T in_val, DBL_T y) \ + { \ + INT_T n = (INT_T) round (y); \ + FLT_T acc = 1.0; \ + bool want_recip = n < 0; \ + n = n < 0 ? -n : n; \ + \ + for (FLT_T c = in_val; n; c *= c, n >>= 1) \ + { \ + if (n & 0x1) \ + { \ + acc *= c; \ + } \ + } \ + if (want_recip) \ + { \ + acc = 1.0 / acc; \ + } \ + return acc; \ + } + +DECL_POW_INT_REF(ref_powif, double, float, int) + #define VF1_WRAP(func) static float v_##func##f(float x) { return __v_##func##f(argf(x))[0]; } #define VF2_WRAP(func) static float v_##func##f(float x, float y) { return __v_##func##f(argf(x), argf(y))[0]; } #define VD1_WRAP(func) static double v_##func(double x) { return __v_##func(argd(x))[0]; } @@ -98,6 +140,8 @@ ZSVNF1_WRAP(log) ZSVNF1_WRAP(log10) ZSVNF1_WRAP(sin) ZSVNF1_WRAP(tan) +static float Z_sv_powi(float x, float y) { return svretf(_ZGVsMxvv_powi(svargf(x), svdup_n_s32((int)round(y)), svptrue_b32())); } +static float sv_powif(float x, float y) { return svretf(__sv_powif_x(svargf(x), svdup_n_s32((int)round(y)), svptrue_b32())); } ZSVND2_WRAP(atan2) ZSVND1_WRAP(atan) -- cgit v1.2.3 From 222183caf938d3b9626623b0a4949dc77abd5599 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Wed, 19 Oct 2022 14:34:55 +0100 Subject: pl/math: Add vector/SVE powi Add double-precision SVE powi. powi is not required to meet any particular accuracy requirements, so we do not test in the usual way. Instead we check for bitwise reproducibility with a scalar reference. Vector powi does not follow the usual naming convention, instead the trailing k denotes 64-bit integer, which distinguishes it from vector powif. 
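
The shape of that check is sketched below. The harness is illustrative
only: powk_under_test stands for any powi/powk variant, and ref_powi
for a higher-precision scalar reference in the spirit of the
DECL_POW_INT_REF macro in ulp_wrappers.h:

  /* Compare a powk-style routine bit-for-bit against a reference
     evaluated in long double.  The round trip to higher precision and
     back down is correctly rounded, so got and want must be
     identical.  */
  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>

  extern double powk_under_test (double x, int64_t n);
  extern long double ref_powi (long double x, long double n);

  static int
  check_powk_bitexact (double x, int64_t n)
  {
    double want = (double) ref_powi (x, (long double) n);
    double got = powk_under_test (x, n);
    uint64_t iw, ig;
    /* Compare bit patterns rather than values, so signed zeros and NaN
       payloads must match too.  */
    memcpy (&iw, &want, sizeof iw);
    memcpy (&ig, &got, sizeof ig);
    if (iw != ig)
      printf ("powk(%a, %lld): got %a want %a\n", x, (long long) n, got,
              want);
    return iw == ig;
  }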
--- pl/math/include/mathlib.h | 2 ++ pl/math/sv_powi.c | 53 +++++++++++++++++++++++++++++++++++++++ pl/math/test/mathbench_funcs.h | 2 ++ pl/math/test/mathbench_wrappers.h | 12 +++++++++ pl/math/test/runulp.sh | 4 +++ pl/math/test/ulp_funcs.h | 2 ++ pl/math/test/ulp_wrappers.h | 3 +++ 7 files changed, 78 insertions(+) create mode 100644 pl/math/sv_powi.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index c280c7e..cba3205 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -130,6 +130,7 @@ svfloat64_t __sv_log_x (svfloat64_t, svbool_t); svfloat32_t __sv_log10f_x (svfloat32_t, svbool_t); svfloat64_t __sv_log10_x (svfloat64_t, svbool_t); svfloat32_t __sv_powif_x (svfloat32_t, svint32_t, svbool_t); +svfloat64_t __sv_powi_x (svfloat64_t, svint64_t, svbool_t); svfloat32_t __sv_sinf_x (svfloat32_t, svbool_t); svfloat64_t __sv_sin_x (svfloat64_t, svbool_t); svfloat32_t __sv_tanf_x (svfloat32_t, svbool_t); @@ -149,6 +150,7 @@ svfloat64_t _ZGVsMxv_log (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_log10f (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_log10 (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxvv_powi(svfloat32_t, svint32_t, svbool_t); +svfloat64_t _ZGVsMxvv_powk(svfloat64_t, svint64_t, svbool_t); svfloat32_t _ZGVsMxv_sinf (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_sin (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_tanf (svfloat32_t, svbool_t); diff --git a/pl/math/sv_powi.c b/pl/math/sv_powi.c new file mode 100644 index 0000000..4e653dc --- /dev/null +++ b/pl/math/sv_powi.c @@ -0,0 +1,53 @@ +/* + * Double-precision SVE powi(x, n) function. + * + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#if SV_SUPPORTED + +/* Optimized double-precision vector powi (double base, long integer power). + powi is developed for environments in which accuracy is of much less + importance than performance, hence we provide no estimate for worst-case + error. */ +svfloat64_t +__sv_powi_x (svfloat64_t as, svint64_t ns, svbool_t p) +{ + /* Compute powi by successive squaring, right to left. */ + svfloat64_t acc = svdup_n_f64 (1.0); + svbool_t want_recip = svcmplt_n_s64 (p, ns, 0); + svuint64_t ns_abs = svreinterpret_u64_s64 (svabs_s64_x (p, ns)); + + /* We use a max to avoid needing to check whether any lane != 0 on each + iteration. */ + uint64_t max_n = svmaxv_u64 (p, ns_abs); + + svfloat64_t c = as; + /* Successively square c, and use merging predication (_m) to determine + whether or not to perform the multiplication or keep the previous + iteration. */ + while (true) + { + svbool_t px = svcmpeq_n_u64 (p, svand_n_u64_x (p, ns_abs, 1ull), 1ull); + acc = svmul_f64_m (px, acc, c); + max_n >>= 1; + if (max_n == 0) + break; + + ns_abs = svlsr_n_u64_x (p, ns_abs, 1); + c = svmul_f64_x (p, c, c); + } + + /* Negative powers are handled by computing the abs(n) version and then + taking the reciprocal. 
*/ + if (svptest_any (want_recip, want_recip)) + acc = svdivr_n_f64_m (want_recip, acc, 1.0); + + return acc; +} + +strong_alias (__sv_powi_x, _ZGVsMxvv_powk) + +#endif // SV_SUPPORTED diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 35adc54..43e3439 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -162,6 +162,8 @@ SVF (_ZGVsMxv_tanf, -3.1, 3.1) {"__sv_powif_x", 'f', 'n', -10.0, 10.0, {.svf = __sv_powif_wrap}}, {"_ZGVsMxvv_powi", 'f', 'n', -10.0, 10.0, {.svf = _Z_sv_powi_wrap}}, +{"__sv_powi_x", 'd', 'n', -10.0, 10.0, {.svd = __sv_powi_wrap}}, +{"_ZGVsMxvv_powk", 'd', 'n', -10.0, 10.0, {.svd = _Z_sv_powk_wrap}}, #endif #endif diff --git a/pl/math/test/mathbench_wrappers.h b/pl/math/test/mathbench_wrappers.h index 92e4454..7c990ba 100644 --- a/pl/math/test/mathbench_wrappers.h +++ b/pl/math/test/mathbench_wrappers.h @@ -118,4 +118,16 @@ __sv_powif_wrap (sv_float x, sv_bool pg) return __sv_powif_x (x, svcvt_s32_f32_x (pg, x), pg); } +static sv_double +_Z_sv_powk_wrap (sv_double x, sv_bool pg) +{ + return _ZGVsMxvv_powk (x, svcvt_s64_f64_x (pg, x), pg); +} + +static sv_double +__sv_powi_wrap (sv_double x, sv_bool pg) +{ + return __sv_powi_x (x, svcvt_s64_f64_x (pg, x), pg); +} + #endif // WANT_SVE_MATH diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 50256bf..2a79a27 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -180,6 +180,10 @@ check -q -f -e 0 __sv_powif 0 inf x 0 1000 100000 && runsv=1 check -q -f -e 0 __sv_powif -0 -inf x 0 1000 100000 && runsv=1 check -q -f -e 0 __sv_powif 0 inf x -0 -1000 100000 && runsv=1 check -q -f -e 0 __sv_powif -0 -inf x -0 -1000 100000 && runsv=1 +check -q -f -e 0 __sv_powi 0 inf x 0 1000 100000 && runsv=1 +check -q -f -e 0 __sv_powi -0 -inf x 0 1000 100000 && runsv=1 +check -q -f -e 0 __sv_powi 0 inf x -0 -1000 100000 && runsv=1 +check -q -f -e 0 __sv_powi -0 -inf x -0 -1000 100000 && runsv=1 fi range_erfc=' diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 89d8181..f306f0b 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -100,6 +100,8 @@ SVF1 (log10) ZSVF1 (log10) SVD1 (log10) ZSVD1 (log10) +F (__sv_powi, sv_powi, ref_powi, mpfr_powi, 2, 0, d2, 0) +F (_ZGVsMxvv_powk, Z_sv_powk, ref_powi, mpfr_powi, 2, 0, d2, 0) F (__sv_powif, sv_powif, ref_powif, mpfr_powi, 2, 1, f2, 0) F (_ZGVsMxvv_powi, Z_sv_powi, ref_powif, mpfr_powi, 2, 1, f2, 0) SVF1 (sin) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index d19b97a..3547cf2 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -58,6 +58,7 @@ static int wrap_mpfr_powi(mpfr_t ret, const mpfr_t x, const mpfr_t y, mpfr_rnd_t } DECL_POW_INT_REF(ref_powif, double, float, int) +DECL_POW_INT_REF(ref_powi, long double, double, int) #define VF1_WRAP(func) static float v_##func##f(float x) { return __v_##func##f(argf(x))[0]; } #define VF2_WRAP(func) static float v_##func##f(float x, float y) { return __v_##func##f(argf(x), argf(y))[0]; } @@ -151,6 +152,8 @@ ZSVND1_WRAP(erfc) ZSVND1_WRAP(log) ZSVND1_WRAP(log10) ZSVND1_WRAP(sin) +static double Z_sv_powk(double x, double y) { return svretd(_ZGVsMxvv_powk(svargd(x), svdup_n_s64((long)round(y)), svptrue_b64())); } +static double sv_powi(double x, double y) { return svretd(__sv_powi_x(svargd(x), svdup_n_s64((long)round(y)), svptrue_b64())); } #endif #endif // clang-format on -- cgit v1.2.3 From 23f1039a7a4d43c0b45991374e99f1cf757fc1fe Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Wed, 19 
Oct 2022 14:51:12 +0100
Subject: pl/math: Fix subnormal handling in atan2

Previously subnormals were handled specially when determining from the
exponent whether to shortcut calculating x^8. This was not the correct
approach - to compare with the threshold in such a way that matches the
reference, they should be treated the same as normal exponents. Larger
errors than previously thought were found in the subnormal region - a
new test has been added in runulp and the threshold has been updated.
---
 pl/math/atan2_2u.c     | 155 ------------------------------------------------
 pl/math/atan2_2u5.c    | 156 +++++++++++++++++++++++++++++++++++++++++++++++++
 pl/math/test/runulp.sh |  13 +++--
 3 files changed, 163 insertions(+), 161 deletions(-)
 delete mode 100644 pl/math/atan2_2u.c
 create mode 100644 pl/math/atan2_2u5.c

diff --git a/pl/math/atan2_2u.c b/pl/math/atan2_2u.c
deleted file mode 100644
index 7638c30..0000000
--- a/pl/math/atan2_2u.c
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Double-precision scalar atan2(x) function.
- *
- * Copyright (c) 2021-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include
-
-#include "math_config.h"
-#include "atan_common.h"
-
-#define Pi (0x1.921fb54442d18p+1)
-#define PiOver2 (0x1.921fb54442d18p+0)
-#define PiOver4 (0x1.921fb54442d18p-1)
-#define SignMask (0x8000000000000000)
-#define ExpMask (0x7ff0000000000000)
-
-/* We calculate atan2 by P(n/d), where n and d are similar to the input
-   arguments, and P is a polynomial. Evaluating P(x) requires calculating x^8,
-   which may underflow if n and d have very different magnitude.
-   POW8_EXP_UFLOW_BOUND is the lower bound of the difference in exponents of n
-   and d for which P underflows, and is used to special-case such inputs. */
-#define POW8_EXP_UFLOW_BOUND 62
-
-static inline int64_t
-biased_exponent (double f)
-{
-  uint64_t fi = asuint64 (f);
-  int64_t ex = (fi & ExpMask) >> 52;
-  if (unlikely (ex == 0))
-    {
-      /* Subnormal case - we still need to get the exponent right for subnormal
-         numbers as division may take us back inside the normal range. */
-      return ex - __builtin_clz (fi << 12);
-    }
-  return ex;
-}
-
-/* Fast implementation of scalar atan2. Errors are greatest when y and
-   x are reasonably close together. Maximum observed error is 2.0 ulps:
-   atan2(0x1.8d9621df2f329p+2, 0x1.884cf49437972p+2)
-   got 0x1.958cd0e8c618bp-1 want 0x1.958cd0e8c618dp-1. */
-double
-atan2 (double y, double x)
-{
-  uint64_t ix = asuint64 (x);
-  uint64_t iy = asuint64 (y);
-
-  uint64_t sign_x = ix & SignMask;
-  uint64_t sign_y = iy & SignMask;
-
-  uint64_t iax = ix & ~SignMask;
-  uint64_t iay = iy & ~SignMask;
-
-  bool xisnan = isnan (x);
-  if (unlikely (isnan (y) && !xisnan))
-    return __math_invalid (y);
-  if (unlikely (xisnan))
-    return __math_invalid (x);
-
-  /* m = 2 * sign(x) + sign(y). */
-  uint32_t m = ((iy >> 63) & 1) | ((ix >> 62) & 2);
-
-  int64_t exp_diff = biased_exponent (x) - biased_exponent (y);
-
-  /* y = 0. */
-  if (iay == 0)
-    {
-      switch (m)
-        {
-        case 0:
-        case 1:
-          return y; /* atan(+-0,+anything)=+-0. */
-        case 2:
-          return Pi; /* atan(+0,-anything) = pi. */
-        case 3:
-          return -Pi; /* atan(-0,-anything) =-pi. */
-        }
-    }
-  /* Special case for (x, y) either on or very close to the y axis. Either x =
-     0, or y is much larger than x (difference in exponents >=
-     POW8_EXP_UFLOW_BOUND). */
-  if (unlikely (iax == 0 || exp_diff <= -POW8_EXP_UFLOW_BOUND))
-    return sign_y ?
-PiOver2 : PiOver2; - - /* Special case for either x is INF or (x, y) is very close to x axis and x is - negative. */ - if (unlikely (iax == 0x7ff0000000000000 - || (exp_diff >= POW8_EXP_UFLOW_BOUND && m >= 2))) - { - if (iay == 0x7ff0000000000000) - { - switch (m) - { - case 0: - return PiOver4; /* atan(+INF,+INF). */ - case 1: - return -PiOver4; /* atan(-INF,+INF). */ - case 2: - return 3.0 * PiOver4; /* atan(+INF,-INF). */ - case 3: - return -3.0 * PiOver4; /* atan(-INF,-INF). */ - } - } - else - { - switch (m) - { - case 0: - return 0.0; /* atan(+...,+INF). */ - case 1: - return -0.0; /* atan(-...,+INF). */ - case 2: - return Pi; /* atan(+...,-INF). */ - case 3: - return -Pi; /* atan(-...,-INF). */ - } - } - } - /* y is INF. */ - if (iay == 0x7ff0000000000000) - return sign_y ? -PiOver2 : PiOver2; - - uint64_t sign_xy = sign_x ^ sign_y; - - double ax = asdouble (iax); - double ay = asdouble (iay); - uint64_t pred_aygtax = (ay > ax); - - /* Set up z for call to atan. */ - double n = pred_aygtax ? -ax : ay; - double d = pred_aygtax ? ay : ax; - double z = n / d; - - double ret; - if (unlikely (m < 2 && exp_diff >= POW8_EXP_UFLOW_BOUND)) - { - /* If (x, y) is very close to x axis and x is positive, the polynomial - will underflow and evaluate to z. */ - ret = z; - } - else - { - /* Work out the correct shift. */ - double shift = sign_x ? -2.0 : 0.0; - shift = pred_aygtax ? shift + 1.0 : shift; - shift *= PiOver2; - - ret = eval_poly (z, z, shift); - } - - /* Account for the sign of x and y. */ - return asdouble (asuint64 (ret) ^ sign_xy); -} diff --git a/pl/math/atan2_2u5.c b/pl/math/atan2_2u5.c new file mode 100644 index 0000000..572d171 --- /dev/null +++ b/pl/math/atan2_2u5.c @@ -0,0 +1,156 @@ +/* + * Double-precision scalar atan2(x) function. + * + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include + +#include "math_config.h" +#include "atan_common.h" + +#define Pi (0x1.921fb54442d18p+1) +#define PiOver2 (0x1.921fb54442d18p+0) +#define PiOver4 (0x1.921fb54442d18p-1) +#define SignMask (0x8000000000000000) +#define ExpMask (0x7ff0000000000000) + +/* We calculate atan2 by P(n/d), where n and d are similar to the input + arguments, and P is a polynomial. Evaluating P(x) requires calculating x^8, + which may underflow if n and d have very different magnitude. + POW8_EXP_UFLOW_BOUND is the lower bound of the difference in exponents of n + and d for which P underflows, and is used to special-case such inputs. */ +#define POW8_EXP_UFLOW_BOUND 62 + +static inline int64_t +biased_exponent (double f) +{ + uint64_t fi = asuint64 (f); + return (fi & ExpMask) >> 52; +} + +/* Fast implementation of scalar atan2. + + For normal input, there are large errors when y and x are + reasonably close together. The maximum such observed error is 2.0 + ulps: + atan2(0x1.8d9621df2f329p+2, 0x1.884cf49437972p+2) + got 0x1.958cd0e8c618bp-1 want 0x1.958cd0e8c618dp-1. + + There are larger errors when y is very small, but normal, and x is + subnormal. The greatest observed error is 2.23 ulps: + atan2(0x1.01dc020fc8e2cp-1022, 0x0.fea20ed5c5a23p-1022) + got 0x1.9558da87cabaap-1 want 0x1.9558da87cabacp-1. 
*/ +double +atan2 (double y, double x) +{ + uint64_t ix = asuint64 (x); + uint64_t iy = asuint64 (y); + + uint64_t sign_x = ix & SignMask; + uint64_t sign_y = iy & SignMask; + + uint64_t iax = ix & ~SignMask; + uint64_t iay = iy & ~SignMask; + + bool xisnan = isnan (x); + if (unlikely (isnan (y) && !xisnan)) + return __math_invalid (y); + if (unlikely (xisnan)) + return __math_invalid (x); + + /* m = 2 * sign(x) + sign(y). */ + uint32_t m = ((iy >> 63) & 1) | ((ix >> 62) & 2); + + int64_t exp_diff = biased_exponent (x) - biased_exponent (y); + + /* y = 0. */ + if (iay == 0) + { + switch (m) + { + case 0: + case 1: + return y; /* atan(+-0,+anything)=+-0. */ + case 2: + return Pi; /* atan(+0,-anything) = pi. */ + case 3: + return -Pi; /* atan(-0,-anything) =-pi. */ + } + } + /* Special case for (x, y) either on or very close to the y axis. Either x = + 0, or y is much larger than x (difference in exponents >= + POW8_EXP_UFLOW_BOUND). */ + if (unlikely (iax == 0 || exp_diff <= -POW8_EXP_UFLOW_BOUND)) + return sign_y ? -PiOver2 : PiOver2; + + /* Special case for either x is INF or (x, y) is very close to x axis and x is + negative. */ + if (unlikely (iax == 0x7ff0000000000000 + || (exp_diff >= POW8_EXP_UFLOW_BOUND && m >= 2))) + { + if (iay == 0x7ff0000000000000) + { + switch (m) + { + case 0: + return PiOver4; /* atan(+INF,+INF). */ + case 1: + return -PiOver4; /* atan(-INF,+INF). */ + case 2: + return 3.0 * PiOver4; /* atan(+INF,-INF). */ + case 3: + return -3.0 * PiOver4; /* atan(-INF,-INF). */ + } + } + else + { + switch (m) + { + case 0: + return 0.0; /* atan(+...,+INF). */ + case 1: + return -0.0; /* atan(-...,+INF). */ + case 2: + return Pi; /* atan(+...,-INF). */ + case 3: + return -Pi; /* atan(-...,-INF). */ + } + } + } + /* y is INF. */ + if (iay == 0x7ff0000000000000) + return sign_y ? -PiOver2 : PiOver2; + + uint64_t sign_xy = sign_x ^ sign_y; + + double ax = asdouble (iax); + double ay = asdouble (iay); + uint64_t pred_aygtax = (ay > ax); + + /* Set up z for call to atan. */ + double n = pred_aygtax ? -ax : ay; + double d = pred_aygtax ? ay : ax; + double z = n / d; + + double ret; + if (unlikely (m < 2 && exp_diff >= POW8_EXP_UFLOW_BOUND)) + { + /* If (x, y) is very close to x axis and x is positive, the polynomial + will underflow and evaluate to z. */ + ret = z; + } + else + { + /* Work out the correct shift. */ + double shift = sign_x ? -2.0 : 0.0; + shift = pred_aygtax ? shift + 1.0 : shift; + shift *= PiOver2; + + ret = eval_poly (z, z, shift); + } + + /* Account for the sign of x and y. 
*/ + return asdouble (asuint64 (ret) ^ sign_xy); +} diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 2a79a27..059be60 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -74,12 +74,13 @@ t erfcf 0x1p-26 0x1p5 40000 t erfcf -0x1p-26 -0x1p3 40000 t erfcf 0 inf 40000 -L=1.5 -t atan2 -10.0 10.0 50000 -t atan2 -1.0 1.0 40000 -t atan2 0.0 1.0 40000 -t atan2 1.0 100.0 40000 -t atan2 1e6 1e32 40000 +L=1.74 +t atan2 -10.0 10.0 50000 +t atan2 -1.0 1.0 40000 +t atan2 0.0 1.0 40000 +t atan2 1.0 100.0 40000 +t atan2 1e6 1e32 40000 +t atan2 0x1p-1022 0x1p-1000 x 0 0x1p-1022 40000 # Regression-test for correct NaN handling check atan2 0x1.7887a0a717aefp+1017 0x1.7887a0a717aefp+1017 x -nan -nan check atan2 nan nan x -nan -nan -- cgit v1.2.3 From 84c28b9ac99a48a310fa459df534c09f33e304f7 Mon Sep 17 00:00:00 2001 From: Nicholas Dingle Date: Wed, 19 Oct 2022 15:02:46 +0100 Subject: pl/math: Correct spelling mistakes in comments Correct a couple of spelling mistakes in the comments in two separate files. --- pl/math/sv_expf_data.c | 2 +- pl/math/tools/tanf.sollya | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pl/math/sv_expf_data.c b/pl/math/sv_expf_data.c index d0d85df..22c8b2f 100644 --- a/pl/math/sv_expf_data.c +++ b/pl/math/sv_expf_data.c @@ -7,6 +7,6 @@ #include "math_config.h" -/* Cofficients copied from the polynomial in math/v_expf.c. */ +/* Coefficients copied from the polynomial in math/v_expf.c. */ const float __sv_expf_poly[] = {0x1.0e4020p-7f, 0x1.573e2ep-5f, 0x1.555e66p-3f, 0x1.fffdb6p-2f, 0x1.ffffecp-1f}; diff --git a/pl/math/tools/tanf.sollya b/pl/math/tools/tanf.sollya index e8ff1e2..73ca0f9 100644 --- a/pl/math/tools/tanf.sollya +++ b/pl/math/tools/tanf.sollya @@ -24,7 +24,7 @@ else if (dtype==single) then { prec = 23!; }; print("pi/4"); pi/4; -// Setup precisions (dislay and computation) +// Setup precisions (display and computation) display = decimal!; prec=128!; save_prec=prec; -- cgit v1.2.3 From 7780a64c543c43360f603ce81173ac530389fac6 Mon Sep 17 00:00:00 2001 From: Victor Do Nascimento Date: Fri, 21 Oct 2022 17:21:11 +0100 Subject: string: arm: Add new functionality to prologue/epilogue assembler macros. This patch adds options for automatic alignment enforcement and for pushing/popping the lr register to prologue and epilogue assembler macros, while making the pushing of the ip register optional for PACBTI. Furthermore, as the use of these macros is independent of PACBTI and may be used on architectures without the feature, the macros are moved to a common header. Improvements are also made to cfi handling. Where absolute cfi offset calculation is complicated by optional function prologue parameters (e.g. the pushing of pac-codes to the stack for M-profile pacbti on function entry and pushing of dummy register when alignment required), replace the use of .cfi_offset for .cfi_rel_offset, simplifying cfi calculations by basing offsets on SP rather than the cfa. Finally, extensive in-source documentation is added to these macros to facilitate their use and further development. Built w/ arm-none-linux-gnueabihf, ran make check-string w/ qemu-arm-static. 
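
The resulting usage pattern is sketched below with a hypothetical leaf
routine (the function itself is made up; the macro arguments mirror the
real call sites updated by this patch, e.g. strlen-armv6t2.S):

  /* Save r4-r5, optionally pushing a PAC code in ip when leaf signing
     is enabled.  The epilogue restores the registers, authenticates
     the PAC code if present and returns via bx lr.  */
  #include "../asmdefs.h"

  ENTRY (__example_leaf)
	prologue 4 5 push_ip=HAVE_PAC_LEAF
	/* ... function body, free to clobber r4 and r5 ... */
	epilogue 4 5 push_ip=HAVE_PAC_LEAF
  END (__example_leaf)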
--- string/arm/memchr.S | 22 ++- string/arm/strcmp.S | 30 ++- string/arm/strlen-armv6t2.S | 7 +- string/asmdefs.h | 439 ++++++++++++++++++++++++++++++++++++++++++++ string/pacbti.h | 147 --------------- 5 files changed, 471 insertions(+), 174 deletions(-) delete mode 100644 string/pacbti.h diff --git a/string/arm/memchr.S b/string/arm/memchr.S index 83a96ca..125618d 100644 --- a/string/arm/memchr.S +++ b/string/arm/memchr.S @@ -36,8 +36,8 @@ #define CHARTSTMASK(c) 1<<(c*8) #endif .thumb +#include "../asmdefs.h" -#include "../pacbti.h" @ --------------------------------------------------------------------------- .thumb_func @@ -74,10 +74,10 @@ __memchr_arm: @ At this point, we are aligned, we know we have at least 8 bytes to work with push {r4,r5,r6,r7} .cfi_adjust_cfa_offset 16 - .cfi_offset 4, -(16+PAC_CFI_ADJ) - .cfi_offset 5, -(12+PAC_CFI_ADJ) - .cfi_offset 6, -(8+PAC_CFI_ADJ) - .cfi_offset 7, -(4+PAC_CFI_ADJ) + .cfi_rel_offset 4, 0 + .cfi_rel_offset 5, 4 + .cfi_rel_offset 6, 8 + .cfi_rel_offset 7, 12 orr r1, r1, r1, lsl #8 @ expand the match word across to all bytes orr r1, r1, r1, lsl #16 bic r4, r2, #7 @ Number of double words to work with @@ -116,16 +116,20 @@ __memchr_arm: bne 21b @ on r2 flags 40: + .cfi_remember_state movs r0,#0 @ not found epilogue 50: + .cfi_restore_state + .cfi_remember_state subs r0,r0,#1 @ found epilogue 60: @ We're here because the fast path found a hit - now we have to track down exactly which word it was @ r0 points to the start of the double word after the one that was tested @ r5 has the 00/ff pattern for the first word, r6 has the chained value + .cfi_restore_state cmp r5, #0 itte eq moveq r5, r6 @ the end is in the 2nd word @@ -144,8 +148,14 @@ __memchr_arm: addeq r0,r0,#1 61: + pop {r4,r5,r6,r7} + .cfi_restore 7 + .cfi_restore 6 + .cfi_restore 5 + .cfi_restore 4 + .cfi_adjust_cfa_offset -16 subs r0,r0,#1 - epilogue 4 7 + epilogue .cfi_endproc .cantunwind .fnend diff --git a/string/arm/strcmp.S b/string/arm/strcmp.S index eafb9f6..b01c02e 100644 --- a/string/arm/strcmp.S +++ b/string/arm/strcmp.S @@ -13,7 +13,6 @@ the compares. */ #include "../asmdefs.h" -#include "../pacbti.h" /* Build Options: STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first @@ -106,7 +105,7 @@ .cfi_restore 5 .cfi_adjust_cfa_offset -16 sub result, result, r1, lsr #24 - epilogue savepac=HAVE_PAC_LEAF + epilogue push_ip=HAVE_PAC_LEAF #else /* To use the big-endian trick we'd have to reverse all three words. that's slower than this approach. */ @@ -129,7 +128,7 @@ .cfi_adjust_cfa_offset -16 sub result, result, r1 - epilogue savepac=HAVE_PAC_LEAF + epilogue push_ip=HAVE_PAC_LEAF #endif .endm @@ -140,14 +139,14 @@ L(strcmp_start_addr): #if STRCMP_NO_PRECHECK == 0 L(fastpath_exit): sub r0, r2, r3 - epilogue savepac=HAVE_PAC_LEAF + epilogue push_ip=HAVE_PAC_LEAF nop #endif .global __strcmp_arm .type __strcmp_arm,%function .align 0 __strcmp_arm: - prologue savepac=HAVE_PAC_LEAF + prologue push_ip=HAVE_PAC_LEAF #if STRCMP_NO_PRECHECK == 0 ldrb r2, [src1] ldrb r3, [src2] @@ -158,17 +157,12 @@ __strcmp_arm: #endif strd r4, r5, [sp, #-16]! 
.cfi_adjust_cfa_offset 16 - .cfi_offset 5, -(12+PAC_CFI_ADJ_DEFAULT) - .cfi_offset 4, -(16+PAC_CFI_ADJ_DEFAULT) + .cfi_rel_offset 4, 0 + .cfi_rel_offset 5, 4 orr tmp1, src1, src2 strd r6, r7, [sp, #8] -#if HAVE_PAC_LEAF - .cfi_offset 6, -12 - .cfi_offset 7, -8 -#else - .cfi_offset 6, -8 - .cfi_offset 7, -4 -#endif /* HAVE_PAC_LEAF */ + .cfi_rel_offset 6, 8 + .cfi_rel_offset 7, 12 mvn const_m1, #0 lsl r2, tmp1, #29 cbz r2, L(loop_aligned8) @@ -339,7 +333,7 @@ L(misaligned_exit): .cfi_restore 4 .cfi_adjust_cfa_offset -16 - epilogue savepac=HAVE_PAC_LEAF + epilogue push_ip=HAVE_PAC_LEAF #if STRCMP_NO_PRECHECK == 0 L(aligned_m1): @@ -391,7 +385,7 @@ L(overlap3): .cfi_restore 7 .cfi_adjust_cfa_offset -16 neg result, result - epilogue savepac=HAVE_PAC_LEAF + epilogue push_ip=HAVE_PAC_LEAF 6: .cfi_restore_state S2LO data1, data1, #24 @@ -467,7 +461,7 @@ L(strcmp_done_equal): .cfi_restore 6 .cfi_restore 7 .cfi_adjust_cfa_offset -16 - epilogue savepac=HAVE_PAC_LEAF + epilogue push_ip=HAVE_PAC_LEAF L(strcmp_tail): .cfi_restore_state @@ -491,7 +485,7 @@ L(strcmp_tail): .cfi_restore 7 .cfi_adjust_cfa_offset -16 sub result, result, data2, lsr #24 - epilogue savepac=HAVE_PAC_LEAF + epilogue push_ip=HAVE_PAC_LEAF END (__strcmp_arm) diff --git a/string/arm/strlen-armv6t2.S b/string/arm/strlen-armv6t2.S index 6e0352d..f06b238 100644 --- a/string/arm/strlen-armv6t2.S +++ b/string/arm/strlen-armv6t2.S @@ -14,7 +14,6 @@ */ #include "../asmdefs.h" -#include "../pacbti.h" #ifdef __ARMEB__ #define S2LO lsl @@ -47,7 +46,7 @@ #define tmp2 r5 ENTRY (__strlen_armv6t2) - prologue 4 5 savepac=HAVE_PAC_LEAF + prologue 4 5 push_ip=HAVE_PAC_LEAF pld [srcin, #0] bic src, srcin, #7 mvn const_m1, #0 @@ -98,6 +97,7 @@ L(start_realigned): beq L(loop_aligned) L(null_found): + .cfi_remember_state cmp data1a, #0 itt eq addeq result, result, #4 @@ -107,9 +107,10 @@ L(null_found): #endif clz data1a, data1a add result, result, data1a, lsr #3 /* Bits -> Bytes. */ - epilogue 4 5 savepac=HAVE_PAC_LEAF + epilogue 4 5 push_ip=HAVE_PAC_LEAF L(misaligned8): + .cfi_restore_state ldrd data1a, data1b, [src] and tmp2, tmp1, #3 rsb result, tmp1, #0 diff --git a/string/asmdefs.h b/string/asmdefs.h index eb43836..a814190 100644 --- a/string/asmdefs.h +++ b/string/asmdefs.h @@ -9,6 +9,7 @@ #define _ASMDEFS_H #if defined (__arm__) + #define ARM_FNSTART .fnstart #if defined (IS_LEAF) #define ARM_FNEND \ @@ -20,6 +21,444 @@ #else #define ARM_FNSTART #define ARM_FNEND + +/* Check whether leaf function PAC signing has been requested in the + -mbranch-protect compile-time option. */ +#define LEAF_PROTECT_BIT 2 + +#ifdef __ARM_FEATURE_PAC_DEFAULT +# define HAVE_PAC_LEAF \ + ((__ARM_FEATURE_PAC_DEFAULT & (1 << LEAF_PROTECT_BIT)) && 1) +#else +# define HAVE_PAC_LEAF 0 +#endif + +/* Provide default parameters for PAC-code handling in leaf-functions. */ +#if HAVE_PAC_LEAF +# ifndef PAC_LEAF_PUSH_IP +# define PAC_LEAF_PUSH_IP 1 +# endif +#else /* !HAVE_PAC_LEAF */ +# undef PAC_LEAF_PUSH_IP +# define PAC_LEAF_PUSH_IP 0 +#endif /* HAVE_PAC_LEAF */ + +#define STACK_ALIGN_ENFORCE 0 + +/****************************************************************************** +* Implementation of the prologue and epilogue assembler macros and their +* associated helper functions. +* +* These functions add support for the following: +* +* - M-profile branch target identification (BTI) landing-pads when compiled +* with `-mbranch-protection=bti'. 
+* - PAC-signing and verification instructions, depending on hardware support +* and whether the PAC-signing of leaf functions has been requested via the +* `-mbranch-protection=pac-ret+leaf' compiler argument. +* - 8-byte stack alignment preservation at function entry, defaulting to the +* value of STACK_ALIGN_ENFORCE. +* +* Notes: +* - Prologue stack alignment is implemented by detecting a push with an odd +* number of registers and prepending a dummy register to the list. +* - If alignment is attempted on a list containing r0, compilation will result +* in an error. +* - If alignment is attempted in a list containing r1, r0 will be prepended to +* the register list and r0 will be restored prior to function return. for +* functions with non-void return types, this will result in the corruption of +* the result register. +* - Stack alignment is enforced via the following helper macro call-chain: +* +* {prologue|epilogue} ->_align8 -> _preprocess_reglist -> +* _preprocess_reglist1 -> {_prologue|_epilogue} +* +* - Debug CFI directives are automatically added to prologues and epilogues, +* assisted by `cfisavelist' and `cfirestorelist', respectively. +* +* Arguments: +* prologue +* -------- +* - first - If `last' specified, this serves as start of general-purpose +* register (GPR) range to push onto stack, otherwise represents +* single GPR to push onto stack. If omitted, no GPRs pushed +* onto stack at prologue. +* - last - If given, specifies inclusive upper-bound of GPR range. +* - push_ip - Determines whether IP register is to be pushed to stack at +* prologue. When pac-signing is requested, this holds the +* the pac-key. Either 1 or 0 to push or not push, respectively. +* Default behavior: Set to value of PAC_LEAF_PUSH_IP macro. +* - push_lr - Determines whether to push lr to the stack on function entry. +* Either 1 or 0 to push or not push, respectively. +* - align8 - Whether to enforce alignment. Either 1 or 0, with 1 requesting +* alignment. +* +* epilogue +* -------- +* The epilogue should be called passing the same arguments as those passed to +* the prologue to ensure the stack is not corrupted on function return. +* +* Usage examples: +* +* prologue push_ip=1 -> push {ip} +* epilogue push_ip=1, align8=1 -> pop {r2, ip} +* prologue push_ip=1, push_lr=1 -> push {ip, lr} +* epilogue 1 -> pop {r1} +* prologue 1, align8=1 -> push {r0, r1} +* epilogue 1, push_ip=1 -> pop {r1, ip} +* prologue 1, 4 -> push {r1-r4} +* epilogue 1, 4 push_ip=1 -> pop {r1-r4, ip} +* +******************************************************************************/ + +/* Emit .cfi_restore directives for a consecutive sequence of registers. */ + .macro cfirestorelist first, last + .cfi_restore \last + .if \last-\first + cfirestorelist \first, \last-1 + .endif + .endm + +/* Emit .cfi_offset directives for a consecutive sequence of registers. */ + .macro cfisavelist first, last, index=1 + .cfi_offset \last, -4*(\index) + .if \last-\first + cfisavelist \first, \last-1, \index+1 + .endif + .endm + +.macro _prologue first=-1, last=-1, push_ip=PAC_LEAF_PUSH_IP, push_lr=0 + .if \push_ip & 1 != \push_ip + .error "push_ip may be either 0 or 1" + .endif + .if \push_lr & 1 != \push_lr + .error "push_lr may be either 0 or 1" + .endif + .if \first != -1 + .if \last == -1 + /* Upper-bound not provided: Set upper = lower. 
*/ + _prologue \first, \first, \push_ip, \push_lr + .exitm + .endif + .endif +#if HAVE_PAC_LEAF +#if __ARM_FEATURE_BTI_DEFAULT + pacbti ip, lr, sp +#else + pac ip, lr, sp +#endif /* __ARM_FEATURE_BTI_DEFAULT */ + .cfi_register 143, 12 +#else +#if __ARM_FEATURE_BTI_DEFAULT + bti +#endif /* __ARM_FEATURE_BTI_DEFAULT */ +#endif /* HAVE_PAC_LEAF */ + .if \first != -1 + .if \last != \first + .if \last >= 13 + .error "SP cannot be in the save list" + .endif + .if \push_ip + .if \push_lr + /* Case 1: push register range, ip and lr registers. */ + push {r\first-r\last, ip, lr} + .cfi_adjust_cfa_offset ((\last-\first)+3)*4 + .cfi_offset 14, -4 + .cfi_offset 143, -8 + cfisavelist \first, \last, 3 + .else // !\push_lr + /* Case 2: push register range and ip register. */ + push {r\first-r\last, ip} + .cfi_adjust_cfa_offset ((\last-\first)+2)*4 + .cfi_offset 143, -4 + cfisavelist \first, \last, 2 + .endif + .else // !\push_ip + .if \push_lr + /* Case 3: push register range and lr register. */ + push {r\first-r\last, lr} + .cfi_adjust_cfa_offset ((\last-\first)+2)*4 + .cfi_offset 14, -4 + cfisavelist \first, \last, 2 + .else // !\push_lr + /* Case 4: push register range. */ + push {r\first-r\last} + .cfi_adjust_cfa_offset ((\last-\first)+1)*4 + cfisavelist \first, \last, 1 + .endif + .endif + .else // \last == \first + .if \push_ip + .if \push_lr + /* Case 5: push single GP register plus ip and lr registers. */ + push {r\first, ip, lr} + .cfi_adjust_cfa_offset 12 + .cfi_offset 14, -4 + .cfi_offset 143, -8 + cfisavelist \first, \first, 3 + .else // !\push_lr + /* Case 6: push single GP register plus ip register. */ + push {r\first, ip} + .cfi_adjust_cfa_offset 8 + .cfi_offset 143, -4 + cfisavelist \first, \first, 2 + .endif + .else // !\push_ip + .if \push_lr + /* Case 7: push single GP register plus lr register. */ + push {r\first, lr} + .cfi_adjust_cfa_offset 8 + .cfi_offset 14, -4 + cfisavelist \first, \first, 2 + .else // !\push_lr + /* Case 8: push single GP register. */ + push {r\first} + .cfi_adjust_cfa_offset 4 + cfisavelist \first, \first, 1 + .endif + .endif + .endif + .else // \first == -1 + .if \push_ip + .if \push_lr + /* Case 9: push ip and lr registers. */ + push {ip, lr} + .cfi_adjust_cfa_offset 8 + .cfi_offset 14, -4 + .cfi_offset 143, -8 + .else // !\push_lr + /* Case 10: push ip register. */ + push {ip} + .cfi_adjust_cfa_offset 4 + .cfi_offset 143, -4 + .endif + .else // !\push_ip + .if \push_lr + /* Case 11: push lr register. */ + push {lr} + .cfi_adjust_cfa_offset 4 + .cfi_offset 14, -4 + .endif + .endif + .endif +.endm + +.macro _epilogue first=-1, last=-1, push_ip=PAC_LEAF_PUSH_IP, push_lr=0 + .if \push_ip & 1 != \push_ip + .error "push_ip may be either 0 or 1" + .endif + .if \push_lr & 1 != \push_lr + .error "push_lr may be either 0 or 1" + .endif + .if \first != -1 + .if \last == -1 + /* Upper-bound not provided: Set upper = lower. */ + _epilogue \first, \first, \push_ip, \push_lr + .exitm + .endif + .if \last != \first + .if \last >= 13 + .error "SP cannot be in the save list" + .endif + .if \push_ip + .if \push_lr + /* Case 1: pop register range, ip and lr registers. */ + pop {r\first-r\last, ip, lr} + .cfi_restore 14 + .cfi_register 143, 12 + cfirestorelist \first, \last + .else // !\push_lr + /* Case 2: pop register range and ip register. */ + pop {r\first-r\last, ip} + .cfi_register 143, 12 + cfirestorelist \first, \last + .endif + .else // !\push_ip + .if \push_lr + /* Case 3: pop register range and lr register. 
*/ + pop {r\first-r\last, lr} + .cfi_restore 14 + cfirestorelist \first, \last + .else // !\push_lr + /* Case 4: pop register range. */ + pop {r\first-r\last} + cfirestorelist \first, \last + .endif + .endif + .else // \last == \first + .if \push_ip + .if \push_lr + /* Case 5: pop single GP register plus ip and lr registers. */ + pop {r\first, ip, lr} + .cfi_restore 14 + .cfi_register 143, 12 + cfirestorelist \first, \first + .else // !\push_lr + /* Case 6: pop single GP register plus ip register. */ + pop {r\first, ip} + .cfi_register 143, 12 + cfirestorelist \first, \first + .endif + .else // !\push_ip + .if \push_lr + /* Case 7: pop single GP register plus lr register. */ + pop {r\first, lr} + .cfi_restore 14 + cfirestorelist \first, \first + .else // !\push_lr + /* Case 8: pop single GP register. */ + pop {r\first} + cfirestorelist \first, \first + .endif + .endif + .endif + .else // \first == -1 + .if \push_ip + .if \push_lr + /* Case 9: pop ip and lr registers. */ + pop {ip, lr} + .cfi_restore 14 + .cfi_register 143, 12 + .else // !\push_lr + /* Case 10: pop ip register. */ + pop {ip} + .cfi_register 143, 12 + .endif + .else // !\push_ip + .if \push_lr + /* Case 11: pop lr register. */ + pop {lr} + .cfi_restore 14 + .endif + .endif + .endif +#if HAVE_PAC_LEAF + aut ip, lr, sp +#endif /* HAVE_PAC_LEAF */ + bx lr +.endm + +# clean up expressions in 'last' +.macro _preprocess_reglist1 first:req, last:req, push_ip:req, push_lr:req, reglist_op:req + .if \last == 0 + \reglist_op \first, 0, \push_ip, \push_lr + .elseif \last == 1 + \reglist_op \first, 1, \push_ip, \push_lr + .elseif \last == 2 + \reglist_op \first, 2, \push_ip, \push_lr + .elseif \last == 3 + \reglist_op \first, 3, \push_ip, \push_lr + .elseif \last == 4 + \reglist_op \first, 4, \push_ip, \push_lr + .elseif \last == 5 + \reglist_op \first, 5, \push_ip, \push_lr + .elseif \last == 6 + \reglist_op \first, 6, \push_ip, \push_lr + .elseif \last == 7 + \reglist_op \first, 7, \push_ip, \push_lr + .elseif \last == 8 + \reglist_op \first, 8, \push_ip, \push_lr + .elseif \last == 9 + \reglist_op \first, 9, \push_ip, \push_lr + .elseif \last == 10 + \reglist_op \first, 10, \push_ip, \push_lr + .elseif \last == 11 + \reglist_op \first, 11, \push_ip, \push_lr + .else + .error "last (\last) out of range" + .endif +.endm + +# clean up expressions in 'first' +.macro _preprocess_reglist first:req, last, push_ip=0, push_lr=0, reglist_op:req + .ifb \last + _preprocess_reglist \first \first \push_ip \push_lr + .else + .if \first > \last + .error "last (\last) must be at least as great as first (\first)" + .endif + .if \first == 0 + _preprocess_reglist1 0, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 1 + _preprocess_reglist1 1, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 2 + _preprocess_reglist1 2, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 3 + _preprocess_reglist1 3, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 4 + _preprocess_reglist1 4, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 5 + _preprocess_reglist1 5, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 6 + _preprocess_reglist1 6, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 7 + _preprocess_reglist1 7, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 8 + _preprocess_reglist1 8, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 9 + _preprocess_reglist1 9, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 10 + _preprocess_reglist1 10, \last, \push_ip, \push_lr, 
\reglist_op + .elseif \first == 11 + _preprocess_reglist1 11, \last, \push_ip, \push_lr, \reglist_op + .else + .error "first (\first) out of range" + .endif + .endif +.endm + +.macro _align8 first, last, push_ip=0, push_lr=0, reglist_op=_prologue + .ifb \first + .ifnb \last + .error "can't have last (\last) without specifying first" + .else // \last not blank + .if ((\push_ip + \push_lr) % 2) == 0 + \reglist_op first=-1, last=-1, push_ip=\push_ip, push_lr=\push_lr + .exitm + .else // ((\push_ip + \push_lr) % 2) odd + _align8 2, 2, \push_ip, \push_lr, \reglist_op + .exitm + .endif // ((\push_ip + \push_lr) % 2) == 0 + .endif // .ifnb \last + .endif // .ifb \first + + .ifb \last + _align8 \first, \first, \push_ip, \push_lr, \reglist_op + .else + .if \push_ip & 1 <> \push_ip + .error "push_ip may be 0 or 1" + .endif + .if \push_lr & 1 <> \push_lr + .error "push_lr may be 0 or 1" + .endif + .ifeq (\last - \first + \push_ip + \push_lr) % 2 + .if \first == 0 + .error "Alignment required and first register is r0" + .exitm + .endif + _preprocess_reglist \first-1, \last, \push_ip, \push_lr, \reglist_op + .else + _preprocess_reglist \first \last, \push_ip, \push_lr, \reglist_op + .endif + .endif +.endm + +.macro prologue first, last, push_ip=PAC_LEAF_PUSH_IP, push_lr=0, align8=STACK_ALIGN_ENFORCE + .if \align8 + _align8 \first, \last, \push_ip, \push_lr, _prologue + .else + _prologue first=\first, last=\last, push_ip=\push_ip, push_lr=\push_lr + .endif +.endm + +.macro epilogue first, last, push_ip=PAC_LEAF_PUSH_IP, push_lr=0, align8=STACK_ALIGN_ENFORCE + .if \align8 + _align8 \first, \last, \push_ip, \push_lr, reglist_op=_epilogue + .else + _epilogue first=\first, last=\last, push_ip=\push_ip, push_lr=\push_lr + .endif +.endm + #endif #if defined(__aarch64__) diff --git a/string/pacbti.h b/string/pacbti.h deleted file mode 100644 index 4e8a600..0000000 --- a/string/pacbti.h +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Macros for pacbti asm code. - * - * Copyright (c) 2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -/* Check whether leaf function PAC signing has been requested in the - -mbranch-protect compile-time option. */ -#define LEAF_PROTECT_BIT 2 - -#ifdef __ARM_FEATURE_PAC_DEFAULT -# define HAVE_PAC_LEAF \ - __ARM_FEATURE_PAC_DEFAULT & (1 << LEAF_PROTECT_BIT) -#else -# define HAVE_PAC_LEAF 0 -#endif - -/* Provide default parameters for PAC-code handling in leaf-functions. */ -#ifndef PAC_LEAF_PUSH_IP -# define PAC_LEAF_PUSH_IP 1 -#endif - -/* Two distinct PAC_CFI adjustment values are needed at any given time. - If PAC-signing is requested for leaf functions but pushing the pac - code to the stack is not, PAC_CFI_ADJ defaults to 0, as most - functions will not overwrite the register holding pac (ip). This is not - appropriate for functions that clobber the ip register, where pushing - to the stack is non-optional. Wherever a generated pac code must be - unconditionally pushed to the stack, a CFI adjustment of - PAC_CFI_ADJ_DEFAULT is used instead. */ -#if HAVE_PAC_LEAF -# define PAC_CFI_ADJ_DEFAULT 4 -#endif - -#if HAVE_PAC_LEAF -# if PAC_LEAF_PUSH_IP -# define PAC_CFI_ADJ 4 -# else -# define PAC_CFI_ADJ 0 -# endif /* PAC_LEAF_PUSH_IP*/ -#else -# undef PAC_LEAF_PUSH_IP -# define PAC_LEAF_PUSH_IP 0 -# define PAC_CFI_ADJ 0 -# define PAC_CFI_ADJ_DEFAULT PAC_CFI_ADJ -#endif /* HAVE_PAC_LEAF */ - -/* Emit .cfi_restore directives for a consecutive sequence of registers. 
*/ - .macro cfirestorelist first, last - .cfi_restore \last - .if \last-\first - cfirestorelist \first, \last-1 - .endif - .endm - -/* Emit .cfi_offset directives for a consecutive sequence of registers. */ - .macro cfisavelist first, last, index=1 - .cfi_offset \last, -4 * (\index) - .if \last-\first - cfisavelist \first, \last-1, \index+1 - .endif - .endm - -/* Create a prologue entry sequence handling PAC/BTI, if required and emitting - CFI directives for generated PAC code and any pushed registers. */ - .macro prologue first=-1, last=-1, savepac=PAC_LEAF_PUSH_IP -#if HAVE_PAC_LEAF -#if __ARM_FEATURE_BTI_DEFAULT - pacbti ip, lr, sp -#else - pac ip, lr, sp -#endif /* __ARM_FEATURE_BTI_DEFAULT */ - .cfi_register 143, 12 -#else -#if __ARM_FEATURE_BTI_DEFAULT - bti -#endif /* __ARM_FEATURE_BTI_DEFAULT */ -#endif /* HAVE_PAC_LEAF */ - .if \first != -1 - .if \last != -1 - .if \savepac - push {r\first-r\last, ip} - .cfi_adjust_cfa_offset ((\last-\first)+2)*4 - .cfi_offset 143, -4 - cfisavelist \first, \last, 2 - .else - push {r\first-r\last} - .cfi_adjust_cfa_offset ((\last-\first)+1)*4 - cfisavelist \first, \last, 1 - .endif - .else - .if \savepac - push {r\first, ip} - .cfi_adjust_cfa_offset 8 - .cfi_offset 143, -4 - cfisavelist \first, \first, 2 - .else // !\savepac - push {r\first} - .cfi_adjust_cfa_offset 4 - cfisavelist \first, \first, 1 - .endif - .endif - .else // \first == -1 - .if \savepac - push {ip} - .cfi_adjust_cfa_offset 4 - .cfi_offset 143, -4 - .endif - .endif - .endm - -/* Create an epilogue exit sequence handling PAC/BTI, if required and emitting - CFI directives for all restored registers. */ - .macro epilogue first=-1, last=-1, savepac=PAC_LEAF_PUSH_IP - .if \first != -1 - .if \last != -1 - .if \savepac - pop {r\first-r\last, ip} - .cfi_restore 143 - cfirestorelist \first, \last - .else - pop {r\first-r\last} - cfirestorelist \first, \last - .endif - .else - .if \savepac - pop {r\first, ip} - .cfi_restore 143 - cfirestorelist \first, \first - .else - pop {r\first} - cfirestorelist \first, \first - .endif - .endif - .else - .if \savepac - pop {ip} - .cfi_restore 143 - .endif - .endif - .cfi_def_cfa_offset 0 -#if HAVE_PAC_LEAF - aut ip, lr, sp -#endif /* HAVE_PAC_LEAF */ - bx lr - .endm -- cgit v1.2.3 From dcbe6e3da93b405d22feb331aef7c316cdf4f606 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Mon, 24 Oct 2022 16:45:25 +0100 Subject: pl/math: Update maximum ULP estimate for SVE sin Filename, comments and test threshold updated. --- pl/math/sv_sin_2u.c | 82 -------------------------------------------------- pl/math/sv_sin_3u.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++ pl/math/test/runulp.sh | 2 +- 3 files changed, 83 insertions(+), 83 deletions(-) delete mode 100644 pl/math/sv_sin_2u.c create mode 100644 pl/math/sv_sin_3u.c diff --git a/pl/math/sv_sin_2u.c b/pl/math/sv_sin_2u.c deleted file mode 100644 index 9c5b747..0000000 --- a/pl/math/sv_sin_2u.c +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Double-precision SVE sin(x) function. - * - * Copyright (c) 2019-2022, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "sv_math.h" -#if SV_SUPPORTED - -#define InvPi (sv_f64 (0x1.45f306dc9c883p-2)) -#define HalfPi (sv_f64 (0x1.921fb54442d18p+0)) -#define InvPio2 (sv_f64 (0x1.45f306dc9c882p-1)) -#define NegPio2_1 (sv_f64 (-0x1.921fb50000000p+0)) -#define NegPio2_2 (sv_f64 (-0x1.110b460000000p-26)) -#define NegPio2_3 (sv_f64 (-0x1.1a62633145c07p-54)) -#define Shift (sv_f64 (0x1.8p52)) -#define RangeVal (sv_f64 (0x1p23)) -#define AbsMask (0x7fffffffffffffff) - -static NOINLINE sv_f64_t -__sv_sin_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp) -{ - return sv_call_f64 (sin, x, y, cmp); -} - -/* A fast SVE implementation of sin based on trigonometric - instructions (FTMAD, FTSSEL, FTSMUL). - Maximum measured error: 1.95 ULPs - __sv_sin(0x1.0abe696a98052p+19) got -0x1.ff302079d96a4p-3 - want -0x1.ff302079d96a2p-3. */ -sv_f64_t -__sv_sin_x (sv_f64_t x, const svbool_t pg) -{ - sv_f64_t n, r, r2, y; - sv_u64_t sign; - svbool_t cmp; - - r = sv_as_f64_u64 (svand_n_u64_x (pg, sv_as_u64_f64 (x), AbsMask)); - sign = svand_n_u64_x (pg, sv_as_u64_f64 (x), ~AbsMask); - cmp = svcmpge_u64 (pg, sv_as_u64_f64 (r), sv_as_u64_f64 (RangeVal)); - - /* n = rint(|x|/(pi/2)). */ - sv_f64_t q = sv_fma_f64_x (pg, InvPio2, r, Shift); - n = svsub_f64_x (pg, q, Shift); - - /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). */ - r = sv_fma_f64_x (pg, NegPio2_1, n, r); - r = sv_fma_f64_x (pg, NegPio2_2, n, r); - r = sv_fma_f64_x (pg, NegPio2_3, n, r); - - /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */ - sv_f64_t f = svtssel_f64 (r, sv_as_u64_f64 (q)); - - /* sin(r) poly approx. */ - r2 = svtsmul_f64 (r, sv_as_u64_f64 (q)); - y = sv_f64 (0.0); - y = svtmad_f64 (y, r2, 7); - y = svtmad_f64 (y, r2, 6); - y = svtmad_f64 (y, r2, 5); - y = svtmad_f64 (y, r2, 4); - y = svtmad_f64 (y, r2, 3); - y = svtmad_f64 (y, r2, 2); - y = svtmad_f64 (y, r2, 1); - y = svtmad_f64 (y, r2, 0); - - /* Apply factor. */ - y = svmul_f64_x (pg, f, y); - - /* sign = y^sign. */ - y = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (y), sign)); - - /* No need to pass pg to specialcase here since cmp is a strict subset, - guaranteed by the cmpge above. */ - if (unlikely (svptest_any (pg, cmp))) - return __sv_sin_specialcase (x, y, cmp); - return y; -} - -strong_alias (__sv_sin_x, _ZGVsMxv_sin) - -#endif diff --git a/pl/math/sv_sin_3u.c b/pl/math/sv_sin_3u.c new file mode 100644 index 0000000..be873a2 --- /dev/null +++ b/pl/math/sv_sin_3u.c @@ -0,0 +1,82 @@ +/* + * Double-precision SVE sin(x) function. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#if SV_SUPPORTED + +#define InvPi (sv_f64 (0x1.45f306dc9c883p-2)) +#define HalfPi (sv_f64 (0x1.921fb54442d18p+0)) +#define InvPio2 (sv_f64 (0x1.45f306dc9c882p-1)) +#define NegPio2_1 (sv_f64 (-0x1.921fb50000000p+0)) +#define NegPio2_2 (sv_f64 (-0x1.110b460000000p-26)) +#define NegPio2_3 (sv_f64 (-0x1.1a62633145c07p-54)) +#define Shift (sv_f64 (0x1.8p52)) +#define RangeVal (sv_f64 (0x1p23)) +#define AbsMask (0x7fffffffffffffff) + +static NOINLINE sv_f64_t +__sv_sin_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp) +{ + return sv_call_f64 (sin, x, y, cmp); +} + +/* A fast SVE implementation of sin based on trigonometric + instructions (FTMAD, FTSSEL, FTSMUL). + Maximum observed error in 2.52 ULP: + __sv_sin(0x1.2d2b00df69661p+19) got 0x1.10ace8f3e786bp-40 + want 0x1.10ace8f3e7868p-40. 
*/ +sv_f64_t +__sv_sin_x (sv_f64_t x, const svbool_t pg) +{ + sv_f64_t n, r, r2, y; + sv_u64_t sign; + svbool_t cmp; + + r = sv_as_f64_u64 (svand_n_u64_x (pg, sv_as_u64_f64 (x), AbsMask)); + sign = svand_n_u64_x (pg, sv_as_u64_f64 (x), ~AbsMask); + cmp = svcmpge_u64 (pg, sv_as_u64_f64 (r), sv_as_u64_f64 (RangeVal)); + + /* n = rint(|x|/(pi/2)). */ + sv_f64_t q = sv_fma_f64_x (pg, InvPio2, r, Shift); + n = svsub_f64_x (pg, q, Shift); + + /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). */ + r = sv_fma_f64_x (pg, NegPio2_1, n, r); + r = sv_fma_f64_x (pg, NegPio2_2, n, r); + r = sv_fma_f64_x (pg, NegPio2_3, n, r); + + /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */ + sv_f64_t f = svtssel_f64 (r, sv_as_u64_f64 (q)); + + /* sin(r) poly approx. */ + r2 = svtsmul_f64 (r, sv_as_u64_f64 (q)); + y = sv_f64 (0.0); + y = svtmad_f64 (y, r2, 7); + y = svtmad_f64 (y, r2, 6); + y = svtmad_f64 (y, r2, 5); + y = svtmad_f64 (y, r2, 4); + y = svtmad_f64 (y, r2, 3); + y = svtmad_f64 (y, r2, 2); + y = svtmad_f64 (y, r2, 1); + y = svtmad_f64 (y, r2, 0); + + /* Apply factor. */ + y = svmul_f64_x (pg, f, y); + + /* sign = y^sign. */ + y = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (y), sign)); + + /* No need to pass pg to specialcase here since cmp is a strict subset, + guaranteed by the cmpge above. */ + if (unlikely (svptest_any (pg, cmp))) + return __sv_sin_specialcase (x, y, cmp); + return y; +} + +strong_alias (__sv_sin_x, _ZGVsMxv_sin) + +#endif diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 059be60..55b7747 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -491,7 +491,7 @@ L_log1p=1.97 L_sve_cosf=1.57 L_sve_cos=1.61 L_sve_sinf=1.40 -L_sve_sin=1.46 +L_sve_sin=2.03 L_sve_atanf=2.9 L_sve_atan=1.7 L_sve_atan2f=2.45 -- cgit v1.2.3 From 4705cc8a8333f5eb37687e9920dddfa93b91198b Mon Sep 17 00:00:00 2001 From: Pierre Blanchard Date: Tue, 25 Oct 2022 17:41:35 +0100 Subject: pl/math: Update ULP threshold for Neon log2 Filename, comments and test threshold updated. --- pl/math/test/runulp.sh | 2 +- pl/math/v_log2_3u.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 55b7747..53dc7f5 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -484,7 +484,7 @@ L_atanf=2.5 L_log1pf=1.53 L_asinhf=2.17 L_log2f=2.10 -L_log2=2.09 +L_log2=2.10 L_tanf=2.7 L_log1p=1.97 diff --git a/pl/math/v_log2_3u.c b/pl/math/v_log2_3u.c index d34d0e8..b076874 100644 --- a/pl/math/v_log2_3u.c +++ b/pl/math/v_log2_3u.c @@ -46,8 +46,8 @@ specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) /* Double-precision vector log2 routine. Implements the same algorithm as vector log10, with coefficients and table entries scaled in extended precision. The maximum observed error is 2.59 ULP: - __v_log2(0x1.0b5572f05bc9dp+0) got 0x1.fffc917a7a52dp-5 - want 0x1.fffc917a7a53p-5. */ + __v_log2(0x1.0b556e53a80b6p+0) got 0x1.fffbc594d146bp-5 + want 0x1.fffbc594d146ep-5. */ VPCS_ATTR v_f64_t V_NAME (log2) (v_f64_t x) { -- cgit v1.2.3 From e873df5c1f5925f9cd82458e068d697fe3a8c18d Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Mon, 31 Oct 2022 14:02:58 +0000 Subject: pl/math: Update ULP threshold for erfc (all variants) and scalar log1p Filenames, comments and threshold in runulp updated. 
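A note on the ULP figures quoted throughout these patches: the integer part
of such an error can be recovered from the quoted got/want pairs by
comparing order-preserving integer encodings of the two doubles. A minimal
sketch in C, assuming only the standard headers (the project's ulp tool is
more elaborate and also measures the fractional error against a
higher-precision reference):

  #include <stdint.h>
  #include <string.h>

  /* Map a double to an integer such that consecutive finite doubles map to
     consecutive integers; negative values are reflected so the map stays
     monotonic.  The difference of two mapped values is then their distance
     in units in the last place (ULP).  */
  static int64_t
  ulp_index (double x)
  {
    int64_t i;
    memcpy (&i, &x, sizeof i);
    return i < 0 ? INT64_MIN - i : i;
  }

For instance, the erfc worst case quoted in the hunk below (got
0x1.ff84036f8f0b3p-2, want 0x1.ff84036f8f0b7p-2) sits 4 units below the
reference in the last place, matching the integer part of the 4.05 ULP
figure in the comment.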
--- pl/math/erfc_4u5.c | 4 +- pl/math/log1p_2u.c | 6 +- pl/math/s_erfc_3u7.c | 6 -- pl/math/s_erfc_4u.c | 6 ++ pl/math/sv_erfc_4u.c | 6 +- pl/math/test/runulp.sh | 8 +-- pl/math/v_erfc_3u7.c | 176 ------------------------------------------------- pl/math/v_erfc_4u.c | 176 +++++++++++++++++++++++++++++++++++++++++++++++++ pl/math/vn_erfc_3u7.c | 12 ---- pl/math/vn_erfc_4u.c | 12 ++++ 10 files changed, 206 insertions(+), 206 deletions(-) delete mode 100644 pl/math/s_erfc_3u7.c create mode 100644 pl/math/s_erfc_4u.c delete mode 100644 pl/math/v_erfc_3u7.c create mode 100644 pl/math/v_erfc_4u.c delete mode 100644 pl/math/vn_erfc_3u7.c create mode 100644 pl/math/vn_erfc_4u.c diff --git a/pl/math/erfc_4u5.c b/pl/math/erfc_4u5.c index 7e0e813..003a0cc 100644 --- a/pl/math/erfc_4u5.c +++ b/pl/math/erfc_4u5.c @@ -118,8 +118,8 @@ top32 (double x) The approximation uses polynomial approximation of exp(x^2) * erfc(x) with fixed orders on 20 intervals. Maximum measured error is 4.05 ULPs:. - erfc(0x1.e8ee8c87064ap-2) got 0x1.ff81b0d2dc2e6p-2 - want 0x1.ff81b0d2dc2eap-2. */ + erfc(0x1.e8ebf6a2b0801p-2) got 0x1.ff84036f8f0b3p-2 + want 0x1.ff84036f8f0b7p-2. */ double erfc (double x) { diff --git a/pl/math/log1p_2u.c b/pl/math/log1p_2u.c index ddef7c5..b9c7e9e 100644 --- a/pl/math/log1p_2u.c +++ b/pl/math/log1p_2u.c @@ -23,9 +23,9 @@ /* log1p approximation using polynomial on reduced interval. Largest observed errors are near the lower boundary of the region where k is 0. - Maximum measured error: 1.72ULP. - log1p(-0x1.2e49eddc007d4p-2) got -0x1.663e386abd899p-2 - want -0x1.663e386abd89bp-2. */ + Maximum measured error: 1.75ULP. + log1p(-0x1.2e1aea97b3e5cp-2) got -0x1.65fb8659a2f9p-2 + want -0x1.65fb8659a2f92p-2. */ double log1p (double x) { diff --git a/pl/math/s_erfc_3u7.c b/pl/math/s_erfc_3u7.c deleted file mode 100644 index 880d7a7..0000000 --- a/pl/math/s_erfc_3u7.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_erfc_3u7.c" diff --git a/pl/math/s_erfc_4u.c b/pl/math/s_erfc_4u.c new file mode 100644 index 0000000..6d80574 --- /dev/null +++ b/pl/math/s_erfc_4u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_erfc_4u.c" diff --git a/pl/math/sv_erfc_4u.c b/pl/math/sv_erfc_4u.c index b4f2ff0..33c1c62 100644 --- a/pl/math/sv_erfc_4u.c +++ b/pl/math/sv_erfc_4u.c @@ -77,9 +77,9 @@ sv_eval_gauss (const svbool_t pg, sv_f64_t abs_x) } /* Optimized double precision vector complementary error function erfc. - Maximum measured error is 3.63 ULP: - __sv_erfc(0x1.479279a3bbc74p+2) got 0x1.ff341c664edc5p-42 - want 0x1.ff341c664edc9p-42. */ + Maximum measured error is 3.64 ULP: + __sv_erfc(0x1.4792573ee6cc7p+2) got 0x1.ff3f4c8e200d5p-42 + want 0x1.ff3f4c8e200d9p-42. 
*/ sv_f64_t __sv_erfc_x (sv_f64_t x, const svbool_t pg) { diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 53dc7f5..1573979 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -57,7 +57,7 @@ t log10 0 0xffff000000000000 10000 t log10 0x1p-4 0x1p4 40000 t log10 0 inf 40000 -L=3.55 +L=3.56 t erfc 0 0xffff0000 10000 t erfc 0x1p-1022 0x1p-26 40000 t erfc -0x1p-1022 -0x1p-26 40000 @@ -107,7 +107,7 @@ t asinh -1.0 -100.0 10000 t asinh 100.0 inf 50000 t asinh -100.0 -inf 10000 -L=1.24 +L=1.26 t log1p -10.0 10.0 10000 t log1p 0.0 0x1p-23 50000 t log1p 0x1p-23 0.001 50000 @@ -470,7 +470,7 @@ range_sve_erfc=' ' # error limits -L_erfc=3.14 +L_erfc=3.15 L_erfcf=0.26 L_log10=1.97 L_log10f=2.81 @@ -505,7 +505,7 @@ L_sve_expf=1.46 L_sve_erff=0.76 L_sve_erf=1.97 L_sve_tanf=2.7 -L_sve_erfc=3.14 +L_sve_erfc=3.15 while read G F R do diff --git a/pl/math/v_erfc_3u7.c b/pl/math/v_erfc_3u7.c deleted file mode 100644 index d65a9d7..0000000 --- a/pl/math/v_erfc_3u7.c +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Double-precision vector erfc(x) function. - * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" -#include "v_math.h" -#if V_SUPPORTED - -/* Accurate exponential (vector variant of exp_dd). */ -v_f64_t V_NAME (exp_tail) (v_f64_t, v_f64_t); - -#define One v_f64 (1.0) -#define AbsMask v_u64 (0x7fffffffffffffff) -#define Scale v_f64 (0x1.0000002p27) - -/* Coeffs for polynomial approximation on [0x1.0p-28., 31.]. */ -#define PX __v_erfc_data.poly -#define xint __v_erfc_data.interval_bounds - -/* Special cases (fall back to scalar calls). */ -VPCS_ATTR -NOINLINE static v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) -{ - return v_call_f64 (erfc, x, y, cmp); -} - -/* A structure to perform look-up in coeffs and other parameter - tables. */ -struct entry -{ - v_f64_t P[ERFC_POLY_ORDER + 1]; - v_f64_t xi; -}; - -static inline struct entry -lookup (v_u64_t i) -{ - struct entry e; -#ifdef SCALAR - for (int j = 0; j <= ERFC_POLY_ORDER; ++j) - e.P[j] = PX[i][j]; - e.xi = xint[i]; -#else - for (int j = 0; j <= ERFC_POLY_ORDER; ++j) - { - e.P[j][0] = PX[i[0]][j]; - e.P[j][1] = PX[i[1]][j]; - } - e.xi[0] = xint[i[0]]; - e.xi[1] = xint[i[1]]; -#endif - return e; -} - -/* Evaluate order-12 polynomials using pairwise summation and Horner - scheme. */ -static inline v_f64_t -v_eval_poly (v_f64_t z, struct entry e) -{ - v_f64_t r = e.P[12]; - r = v_fma_f64 (z, r, e.P[11]); - r = v_fma_f64 (z, r, e.P[10]); - r = v_fma_f64 (z, r, e.P[9]); - r = v_fma_f64 (z, r, e.P[8]); - r = v_fma_f64 (z, r, e.P[7]); - r = v_fma_f64 (z, r, e.P[6]); - r = v_fma_f64 (z, r, e.P[5]); - r = v_fma_f64 (z, r, e.P[4]); - r = v_fma_f64 (z, r, e.P[3]); - r = v_fma_f64 (z, r, e.P[2]); - r = v_fma_f64 (z, r, e.P[1]); - r = v_fma_f64 (z, r, e.P[0]); - - return r; -} - -/* Accurate evaluation of exp(x^2) using compensated product - (x^2 ~ x*x + e2) and custom exp(y+d) routine for small - corrections d<> 63) << 62); - /* Use 12-bit for small, nan and inf case detection. */ - atop = (ix >> 52) & 0x7ff; - cmp = v_cond_u64 (atop - v_u64 (0x3cd) >= v_u64 (0x7ff - 0x3cd)); - - struct entry dat; - - /* All entries of the vector are out of bounds, take a short path. - Use smallest possible number above 28 representable in 12 bits. */ - v_u64_t out_of_bounds = v_cond_u64 (atop >= v_u64 (0x404)); - - /* Use sign to produce either 0 if x > 0, 2 otherwise. 
*/ - if (v_all_u64 (out_of_bounds) && likely (v_any_u64 (~cmp))) - return fac; - - /* erfc(|x|) = P(|x|-x_i)*exp(-x^2). */ - - v_f64_t a = v_abs_f64 (x); - - /* Interval bounds are a logarithmic scale, i.e. interval n has - lower bound 2^(n/4) - 1. Use the exponent of (|x|+1)^4 to obtain - the interval index. */ - v_f64_t xp1 = a + v_f64 (1.0); - xp1 = xp1 * xp1; - xp1 = xp1 * xp1; - v_u64_t ixp1 = v_as_u64_f64 (xp1); - i = (ixp1 >> 52) - v_u64 (1023); - - /* Index cannot exceed number of polynomials. */ -#ifdef SCALAR - i = i <= (ERFC_NUM_INTERVALS) ? i : ERFC_NUM_INTERVALS; -#else - i = (v_u64_t){i[0] <= ERFC_NUM_INTERVALS ? i[0] : ERFC_NUM_INTERVALS, - i[1] <= ERFC_NUM_INTERVALS ? i[1] : ERFC_NUM_INTERVALS}; -#endif - /* Get coeffs of i-th polynomial. */ - dat = lookup (i); - - /* Evaluate Polynomial: P(|x|-x_i). */ - z = a - dat.xi; - p = v_eval_poly (z, dat); - - /* Evaluate Gaussian: exp(-x^2). */ - v_f64_t e = v_eval_gauss (a); - - /* Copy sign. */ - sign = v_as_u64_f64 (x) & ~AbsMask; - p = v_as_f64_u64 (v_as_u64_f64 (p) ^ sign); - - /* Assemble result as 2.0 - p * e if x < 0, p * e otherwise. */ - y = v_fma_f64 (p, e, fac); - - /* No need to fix value of y if x is out of bound, as - P[ERFC_NUM_INTERVALS]=0. */ - if (unlikely (v_any_u64 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS -#endif diff --git a/pl/math/v_erfc_4u.c b/pl/math/v_erfc_4u.c new file mode 100644 index 0000000..603d8f5 --- /dev/null +++ b/pl/math/v_erfc_4u.c @@ -0,0 +1,176 @@ +/* + * Double-precision vector erfc(x) function. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "v_math.h" +#if V_SUPPORTED + +/* Accurate exponential (vector variant of exp_dd). */ +v_f64_t V_NAME (exp_tail) (v_f64_t, v_f64_t); + +#define One v_f64 (1.0) +#define AbsMask v_u64 (0x7fffffffffffffff) +#define Scale v_f64 (0x1.0000002p27) + +/* Coeffs for polynomial approximation on [0x1.0p-28., 31.]. */ +#define PX __v_erfc_data.poly +#define xint __v_erfc_data.interval_bounds + +/* Special cases (fall back to scalar calls). */ +VPCS_ATTR +NOINLINE static v_f64_t +specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) +{ + return v_call_f64 (erfc, x, y, cmp); +} + +/* A structure to perform look-up in coeffs and other parameter + tables. */ +struct entry +{ + v_f64_t P[ERFC_POLY_ORDER + 1]; + v_f64_t xi; +}; + +static inline struct entry +lookup (v_u64_t i) +{ + struct entry e; +#ifdef SCALAR + for (int j = 0; j <= ERFC_POLY_ORDER; ++j) + e.P[j] = PX[i][j]; + e.xi = xint[i]; +#else + for (int j = 0; j <= ERFC_POLY_ORDER; ++j) + { + e.P[j][0] = PX[i[0]][j]; + e.P[j][1] = PX[i[1]][j]; + } + e.xi[0] = xint[i[0]]; + e.xi[1] = xint[i[1]]; +#endif + return e; +} + +/* Evaluate order-12 polynomials using pairwise summation and Horner + scheme. 
*/ +static inline v_f64_t +v_eval_poly (v_f64_t z, struct entry e) +{ + v_f64_t r = e.P[12]; + r = v_fma_f64 (z, r, e.P[11]); + r = v_fma_f64 (z, r, e.P[10]); + r = v_fma_f64 (z, r, e.P[9]); + r = v_fma_f64 (z, r, e.P[8]); + r = v_fma_f64 (z, r, e.P[7]); + r = v_fma_f64 (z, r, e.P[6]); + r = v_fma_f64 (z, r, e.P[5]); + r = v_fma_f64 (z, r, e.P[4]); + r = v_fma_f64 (z, r, e.P[3]); + r = v_fma_f64 (z, r, e.P[2]); + r = v_fma_f64 (z, r, e.P[1]); + r = v_fma_f64 (z, r, e.P[0]); + + return r; +} + +/* Accurate evaluation of exp(x^2) using compensated product + (x^2 ~ x*x + e2) and custom exp(y+d) routine for small + corrections d<> 63) << 62); + /* Use 12-bit for small, nan and inf case detection. */ + atop = (ix >> 52) & 0x7ff; + cmp = v_cond_u64 (atop - v_u64 (0x3cd) >= v_u64 (0x7ff - 0x3cd)); + + struct entry dat; + + /* All entries of the vector are out of bounds, take a short path. + Use smallest possible number above 28 representable in 12 bits. */ + v_u64_t out_of_bounds = v_cond_u64 (atop >= v_u64 (0x404)); + + /* Use sign to produce either 0 if x > 0, 2 otherwise. */ + if (v_all_u64 (out_of_bounds) && likely (v_any_u64 (~cmp))) + return fac; + + /* erfc(|x|) = P(|x|-x_i)*exp(-x^2). */ + + v_f64_t a = v_abs_f64 (x); + + /* Interval bounds are a logarithmic scale, i.e. interval n has + lower bound 2^(n/4) - 1. Use the exponent of (|x|+1)^4 to obtain + the interval index. */ + v_f64_t xp1 = a + v_f64 (1.0); + xp1 = xp1 * xp1; + xp1 = xp1 * xp1; + v_u64_t ixp1 = v_as_u64_f64 (xp1); + i = (ixp1 >> 52) - v_u64 (1023); + + /* Index cannot exceed number of polynomials. */ +#ifdef SCALAR + i = i <= (ERFC_NUM_INTERVALS) ? i : ERFC_NUM_INTERVALS; +#else + i = (v_u64_t){i[0] <= ERFC_NUM_INTERVALS ? i[0] : ERFC_NUM_INTERVALS, + i[1] <= ERFC_NUM_INTERVALS ? i[1] : ERFC_NUM_INTERVALS}; +#endif + /* Get coeffs of i-th polynomial. */ + dat = lookup (i); + + /* Evaluate Polynomial: P(|x|-x_i). */ + z = a - dat.xi; + p = v_eval_poly (z, dat); + + /* Evaluate Gaussian: exp(-x^2). */ + v_f64_t e = v_eval_gauss (a); + + /* Copy sign. */ + sign = v_as_u64_f64 (x) & ~AbsMask; + p = v_as_f64_u64 (v_as_u64_f64 (p) ^ sign); + + /* Assemble result as 2.0 - p * e if x < 0, p * e otherwise. */ + y = v_fma_f64 (p, e, fac); + + /* No need to fix value of y if x is out of bound, as + P[ERFC_NUM_INTERVALS]=0. */ + if (unlikely (v_any_u64 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS +#endif diff --git a/pl/math/vn_erfc_3u7.c b/pl/math/vn_erfc_3u7.c deleted file mode 100644 index db06bc3..0000000 --- a/pl/math/vn_erfc_3u7.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_erfc. - * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_erfc, _ZGVnN2v_erfc) -#include "v_erfc_3u7.c" -#endif diff --git a/pl/math/vn_erfc_4u.c b/pl/math/vn_erfc_4u.c new file mode 100644 index 0000000..678e316 --- /dev/null +++ b/pl/math/vn_erfc_4u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_erfc. + * + * Copyright (c) 2019-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_erfc, _ZGVnN2v_erfc) +#include "v_erfc_4u.c" +#endif -- cgit v1.2.3 From bc55630c0dabb01d22dfbf894689d116a284b68b Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Tue, 1 Nov 2022 16:18:25 +0000 Subject: pl/math: Add scalar expm1f New routine uses polynomial on a reduced interval, and is accurate to 1.6 ULP. --- pl/math/expm1f_1u6.c | 75 ++++++++++++++++++++++++++++++ pl/math/expm1f_data.c | 12 +++++ pl/math/include/mathlib.h | 1 + pl/math/math_config.h | 3 ++ pl/math/test/mathbench_funcs.h | 1 + pl/math/test/runulp.sh | 6 +++ pl/math/test/testcases/directed/expm1f.tst | 57 +++++++++++++++++++++++ pl/math/test/ulp_funcs.h | 1 + pl/math/tools/expm1f.sollya | 21 +++++++++ 9 files changed, 177 insertions(+) create mode 100644 pl/math/expm1f_1u6.c create mode 100644 pl/math/expm1f_data.c create mode 100644 pl/math/test/testcases/directed/expm1f.tst create mode 100644 pl/math/tools/expm1f.sollya diff --git a/pl/math/expm1f_1u6.c b/pl/math/expm1f_1u6.c new file mode 100644 index 0000000..44981ca --- /dev/null +++ b/pl/math/expm1f_1u6.c @@ -0,0 +1,75 @@ +/* + * Single-precision e^x - 1 function. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define Shift (0x1.8p23f) +#define InvLn2 (0x1.715476p+0f) +#define Ln2hi (0x1.62e4p-1f) +#define Ln2lo (0x1.7f7d1cp-20f) +#define AbsMask (0x7fffffff) +#define InfLimit \ + (0x1.644716p6) /* Smallest value of x for which expm1(x) overflows. */ +#define NegLimit \ + (-0x1.9bbabcp+6) /* Largest value of x for which expm1(x) rounds to 1. */ + +#define C(i) __expm1f_poly[i] + +/* Approximation for exp(x) - 1 using polynomial on a reduced interval. + The maximum error is 1.51 ULP: + expm1f(0x1.8baa96p-2) got 0x1.e2fb9p-2 + want 0x1.e2fb94p-2. */ +float +expm1f (float x) +{ + uint32_t ix = asuint (x); + uint32_t ax = ix & AbsMask; + + /* Tiny: |x| < 0x1p-23. expm1(x) is closely approximated by x. + Inf: x == +Inf => expm1(x) = x. */ + if (ax <= 0x34000000 || (ix == 0x7f800000)) + return x; + + /* +/-NaN. */ + if (ax > 0x7f800000) + return __math_invalidf (x); + + if (x >= InfLimit) + return __math_oflowf (0); + + if (x <= NegLimit || ix == 0xff800000) + return -1; + + /* Reduce argument to smaller range: + Let i = round(x / ln2) + and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where 2^i is exact because i is an integer. */ + float j = fmaf (InvLn2, x, Shift) - Shift; + int32_t i = j; + float f = fmaf (j, -Ln2hi, x); + f = fmaf (j, -Ln2lo, f); + + /* Approximate expm1(f) using polynomial. + Taylor expansion for expm1(x) has the form: + x + ax^2 + bx^3 + cx^4 .... + So we calculate the polynomial P(f) = a + bf + cf^2 + ... + and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ + float p = fmaf (C (4), f, C (3)); + p = fmaf (p, f, C (2)); + p = fmaf (p, f, C (1)); + p = fmaf (p, f, C (0)); + p = fmaf (f * f, p, f); + + /* Assemble the result, using a slight rearrangement to achieve acceptable + accuracy. + expm1(x) ~= 2^i * (p + 1) - 1 + Let t = 2^(i - 1). */ + float t = ldexpf (0.5f, i); + /* expm1(x) ~= 2 * (p * t + (t - 1/2)). 
*/ + return 2 * fmaf (p, t, t - 0.5f); +} diff --git a/pl/math/expm1f_data.c b/pl/math/expm1f_data.c new file mode 100644 index 0000000..fc0bd41 --- /dev/null +++ b/pl/math/expm1f_data.c @@ -0,0 +1,12 @@ +/* + * Coefficients for single-precision e^x - 1 function. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Generated using fpminimax, see tools/expm1f.sollya for details. */ +const float __expm1f_poly[] = {0x1.fffffep-2, 0x1.5554aep-3, 0x1.555736p-5, + 0x1.12287cp-7, 0x1.6b55a2p-10}; diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index cba3205..afa845c 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -14,6 +14,7 @@ float asinhf (float); float atan2f (float, float); float erfcf (float); float erff (float); +float expm1f (float); float log10f (float); float log1pf (float); float tanf (float); diff --git a/pl/math/math_config.h b/pl/math/math_config.h index 9a17159..2d39e91 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -538,4 +538,7 @@ extern const struct sv_log_data #define SV_EXPF_POLY_ORDER 6 extern const float __sv_expf_poly[SV_EXPF_POLY_ORDER - 1] HIDDEN; +#define EXPM1F_POLY_ORDER 5 +extern const float __expm1f_poly[EXPM1F_POLY_ORDER] HIDDEN; + #endif diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 43e3439..76a64da 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -12,6 +12,7 @@ F (atanf, -10.0, 10.0) F (cosf, -3.1, 3.1) F (erfcf, -4.0, 10.0) F (erff, -4.0, 4.0) +F (expm1f, -9.9, 9.9) F (log10f, 0.01, 11.1) F (log1pf, -0.9, 10.0) F (log2f, 0.01, 11.1) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 1573979..190a294 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -157,6 +157,12 @@ t acosh 2 0x1p511 100000 t acosh 0x1p511 inf 100000 t acosh -0 -inf 10000 +L=1.02 +t expm1f 0 0x1p-23 1000 +t expm1f -0 -0x1p-23 1000 +t expm1f 0x1p-23 0x1.644716p6 100000 +t expm1f -0x1p-23 -0x1.9bbabcp+6 100000 + done # vector functions diff --git a/pl/math/test/testcases/directed/expm1f.tst b/pl/math/test/testcases/directed/expm1f.tst new file mode 100644 index 0000000..dcf3d06 --- /dev/null +++ b/pl/math/test/testcases/directed/expm1f.tst @@ -0,0 +1,57 @@ +; expm1f.tst +; +; Copyright 2009-2022, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=expm1f op1=7fc00001 result=7fc00001 errno=0 +func=expm1f op1=ffc00001 result=7fc00001 errno=0 +func=expm1f op1=7f800001 result=7fc00001 errno=0 status=i +func=expm1f op1=ff800001 result=7fc00001 errno=0 status=i +func=expm1f op1=7f800000 result=7f800000 errno=0 +func=expm1f op1=7f7fffff result=7f800000 errno=ERANGE status=ox +func=expm1f op1=ff800000 result=bf800000 errno=0 +func=expm1f op1=ff7fffff result=bf800000 errno=0 +func=expm1f op1=00000000 result=00000000 errno=0 +func=expm1f op1=80000000 result=80000000 errno=0 + +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. + +func=expm1f op1=00000001 result=00000001 errno=0 maybestatus=ux +func=expm1f op1=80000001 result=80000001 errno=0 maybestatus=ux + +func=expm1f op1=42b145c0 result=7f6ac2dd.9b8 errno=0 + +; Check both sides of the over/underflow thresholds in the code. 
+func=expm1f op1=c2000000 result=bf7fffff.fff error=0 +func=expm1f op1=c2000001 result=bf7fffff.fff error=0 +func=expm1f op1=43000000 result=7f800000 error=overflow +func=expm1f op1=43000001 result=7f800000 error=overflow +func=expm1f op1=c2a80000 result=bf800000.000 error=0 +func=expm1f op1=c2a80001 result=bf800000.000 error=0 + +; Check values for which exp goes denormal. expm1f should not report +; spurious overflow. +func=expm1f op1=c2b00f34 result=bf800000.000 error=0 +func=expm1f op1=c2ce8ed0 result=bf800000.000 error=0 +func=expm1f op1=c2dc6bba result=bf800000.000 error=0 + +; Regression tests for significance loss when the two components of +; the result have opposite sign but similar magnitude +func=expm1f op1=be8516c1 result=be6a652b.0dc error=0 +func=expm1f op1=be851714 result=be6a65ab.0e5 error=0 +func=expm1f op1=be851cc7 result=be6a6e75.111 error=0 +func=expm1f op1=be851d1a result=be6a6ef5.102 error=0 +func=expm1f op1=be851d6d result=be6a6f75.0f2 error=0 +func=expm1f op1=be852065 result=be6a7409.0e4 error=0 +func=expm1f op1=be8520b8 result=be6a7489.0c7 error=0 +func=expm1f op1=be85210b result=be6a7509.0a8 error=0 +func=expm1f op1=be855401 result=be6ac39b.0d5 error=0 +func=expm1f op1=be933307 result=be7fdbf0.d8d error=0 +func=expm1f op1=be92ed6b result=be7f737a.d81 error=0 +func=expm1f op1=be933b90 result=be7fe8be.d76 error=0 +func=expm1f op1=3eb11364 result=3ed38deb.0c0 error=0 +func=expm1f op1=3f28e830 result=3f6f344b.0da error=0 +func=expm1f op1=3eb1578f result=3ed3ee47.13b error=0 +func=expm1f op1=3f50176a result=3fa08e36.fea error=0 diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index f306f0b..446db00 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -9,6 +9,7 @@ F1 (asinh) F2 (atan2) F1 (erfc) F1 (erf) +F1 (expm1) F1 (log10) F1 (log1p) D1 (acosh) diff --git a/pl/math/tools/expm1f.sollya b/pl/math/tools/expm1f.sollya new file mode 100644 index 0000000..f5d769c --- /dev/null +++ b/pl/math/tools/expm1f.sollya @@ -0,0 +1,21 @@ +// polynomial for approximating exp(x)-1 in single precision +// +// Copyright (c) 2022, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 5; + +a = -log(2)/2; +b = log(2)/2; + +f = proc(y) { + return exp(y)-1; +}; + +poly = fpminimax(f(x), deg, [|single ...|], [a;b]); + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 2 to deg do round(coeff(poly,i), SG, RN); -- cgit v1.2.3 From 83ed7a4b6cf347615e89d24e90d7894ad803bf87 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Wed, 2 Nov 2022 09:14:54 +0000 Subject: pl/math: Update ULP thresholds for atan/atan2 Larger errors observed for several variants. --- pl/math/atan2_2u5.c | 16 +++------- pl/math/sv_atan2_2u.c | 82 -------------------------------------------------- pl/math/sv_atan2_2u5.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++ pl/math/sv_atan_2u5.c | 6 ++-- pl/math/test/runulp.sh | 9 +++--- pl/math/v_atan_2u5.c | 6 ++-- 6 files changed, 96 insertions(+), 105 deletions(-) delete mode 100644 pl/math/sv_atan2_2u.c create mode 100644 pl/math/sv_atan2_2u5.c diff --git a/pl/math/atan2_2u5.c b/pl/math/atan2_2u5.c index 572d171..471c5c9 100644 --- a/pl/math/atan2_2u5.c +++ b/pl/math/atan2_2u5.c @@ -30,18 +30,10 @@ biased_exponent (double f) return (fi & ExpMask) >> 52; } -/* Fast implementation of scalar atan2. - - For normal input, there are large errors when y and x are - reasonably close together. 
The maximum such observed error is 2.0 - ulps: - atan2(0x1.8d9621df2f329p+2, 0x1.884cf49437972p+2) - got 0x1.958cd0e8c618bp-1 want 0x1.958cd0e8c618dp-1. - - There are larger errors when y is very small, but normal, and x is - subnormal. The greatest observed error is 2.23 ulps: - atan2(0x1.01dc020fc8e2cp-1022, 0x0.fea20ed5c5a23p-1022) - got 0x1.9558da87cabaap-1 want 0x1.9558da87cabacp-1. */ +/* Fast implementation of scalar atan2. Largest errors are when y and x are + close together. The greatest observed error is 2.28 ULP: + atan2(-0x1.5915b1498e82fp+732, 0x1.54d11ef838826p+732) + got -0x1.954f42f1fa841p-1 want -0x1.954f42f1fa843p-1. */ double atan2 (double y, double x) { diff --git a/pl/math/sv_atan2_2u.c b/pl/math/sv_atan2_2u.c deleted file mode 100644 index 82f7588..0000000 --- a/pl/math/sv_atan2_2u.c +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Double-precision vector atan2(x) function. - * - * Copyright (c) 2021-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "sv_math.h" -#if SV_SUPPORTED - -#include "sv_atan_common.h" - -/* Useful constants. */ -#define PiOver2 sv_f64 (0x1.921fb54442d18p+0) -#define SignMask sv_u64 (0x8000000000000000) - -/* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */ -__attribute__ ((noinline)) static sv_f64_t -specialcase (sv_f64_t y, sv_f64_t x, sv_f64_t ret, const svbool_t cmp) -{ - return sv_call2_f64 (atan2, y, x, ret, cmp); -} - -/* Returns a predicate indicating true if the input is the bit representation of - 0, infinity or nan. */ -static inline svbool_t -zeroinfnan (sv_u64_t i, const svbool_t pg) -{ - return svcmpge_u64 (pg, svsub_n_u64_x (pg, svlsl_n_u64_x (pg, i, 1), 1), - sv_u64 (2 * asuint64 (INFINITY) - 1)); -} - -/* Fast implementation of SVE atan2. Errors are greatest when y and - x are reasonably close together. Maximum observed error is 2.0 ulps: - sv_atan2(0x1.8d9621df2f329p+2, 0x1.884cf49437972p+2) - got 0x1.958cd0e8c618bp-1 want 0x1.958cd0e8c618dp-1. */ -sv_f64_t -__sv_atan2_x (sv_f64_t y, sv_f64_t x, const svbool_t pg) -{ - sv_u64_t ix = sv_as_u64_f64 (x); - sv_u64_t iy = sv_as_u64_f64 (y); - - svbool_t cmp_x = zeroinfnan (ix, pg); - svbool_t cmp_y = zeroinfnan (iy, pg); - svbool_t cmp_xy = svorr_b_z (pg, cmp_x, cmp_y); - - sv_u64_t sign_x = svand_u64_x (pg, ix, SignMask); - sv_u64_t sign_y = svand_u64_x (pg, iy, SignMask); - sv_u64_t sign_xy = sveor_u64_x (pg, sign_x, sign_y); - - sv_f64_t ax = svabs_f64_x (pg, x); - sv_f64_t ay = svabs_f64_x (pg, y); - - svbool_t pred_xlt0 = svcmplt_f64 (pg, x, sv_f64 (0.0)); - svbool_t pred_aygtax = svcmpgt_f64 (pg, ay, ax); - - /* Set up z for call to atan. */ - sv_f64_t n = svsel_f64 (pred_aygtax, svneg_f64_x (pg, ax), ay); - sv_f64_t d = svsel_f64 (pred_aygtax, ay, ax); - sv_f64_t z = svdiv_f64_x (pg, n, d); - - /* Work out the correct shift. */ - sv_f64_t shift = svsel_f64 (pred_xlt0, sv_f64 (-2.0), sv_f64 (0.0)); - shift = svsel_f64 (pred_aygtax, svadd_n_f64_x (pg, shift, 1.0), shift); - shift = svmul_f64_x (pg, shift, PiOver2); - - sv_f64_t ret = __sv_atan_common (pg, pg, z, z, shift); - - /* Account for the sign of x and y. 
*/ - ret = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (ret), sign_xy)); - - if (unlikely (svptest_any (pg, cmp_xy))) - { - return specialcase (y, x, ret, cmp_xy); - } - - return ret; -} - -strong_alias (__sv_atan2_x, _ZGVsMxvv_atan2) - -#endif diff --git a/pl/math/sv_atan2_2u5.c b/pl/math/sv_atan2_2u5.c new file mode 100644 index 0000000..bc98ccd --- /dev/null +++ b/pl/math/sv_atan2_2u5.c @@ -0,0 +1,82 @@ +/* + * Double-precision vector atan2(x) function. + * + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#if SV_SUPPORTED + +#include "sv_atan_common.h" + +/* Useful constants. */ +#define PiOver2 sv_f64 (0x1.921fb54442d18p+0) +#define SignMask sv_u64 (0x8000000000000000) + +/* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */ +__attribute__ ((noinline)) static sv_f64_t +specialcase (sv_f64_t y, sv_f64_t x, sv_f64_t ret, const svbool_t cmp) +{ + return sv_call2_f64 (atan2, y, x, ret, cmp); +} + +/* Returns a predicate indicating true if the input is the bit representation of + 0, infinity or nan. */ +static inline svbool_t +zeroinfnan (sv_u64_t i, const svbool_t pg) +{ + return svcmpge_u64 (pg, svsub_n_u64_x (pg, svlsl_n_u64_x (pg, i, 1), 1), + sv_u64 (2 * asuint64 (INFINITY) - 1)); +} + +/* Fast implementation of SVE atan2. Errors are greatest when y and + x are reasonably close together. The greatest observed error is 2.28 ULP: + sv_atan2(-0x1.5915b1498e82fp+732, 0x1.54d11ef838826p+732) + got -0x1.954f42f1fa841p-1 want -0x1.954f42f1fa843p-1. */ +sv_f64_t +__sv_atan2_x (sv_f64_t y, sv_f64_t x, const svbool_t pg) +{ + sv_u64_t ix = sv_as_u64_f64 (x); + sv_u64_t iy = sv_as_u64_f64 (y); + + svbool_t cmp_x = zeroinfnan (ix, pg); + svbool_t cmp_y = zeroinfnan (iy, pg); + svbool_t cmp_xy = svorr_b_z (pg, cmp_x, cmp_y); + + sv_u64_t sign_x = svand_u64_x (pg, ix, SignMask); + sv_u64_t sign_y = svand_u64_x (pg, iy, SignMask); + sv_u64_t sign_xy = sveor_u64_x (pg, sign_x, sign_y); + + sv_f64_t ax = svabs_f64_x (pg, x); + sv_f64_t ay = svabs_f64_x (pg, y); + + svbool_t pred_xlt0 = svcmplt_f64 (pg, x, sv_f64 (0.0)); + svbool_t pred_aygtax = svcmpgt_f64 (pg, ay, ax); + + /* Set up z for call to atan. */ + sv_f64_t n = svsel_f64 (pred_aygtax, svneg_f64_x (pg, ax), ay); + sv_f64_t d = svsel_f64 (pred_aygtax, ay, ax); + sv_f64_t z = svdiv_f64_x (pg, n, d); + + /* Work out the correct shift. */ + sv_f64_t shift = svsel_f64 (pred_xlt0, sv_f64 (-2.0), sv_f64 (0.0)); + shift = svsel_f64 (pred_aygtax, svadd_n_f64_x (pg, shift, 1.0), shift); + shift = svmul_f64_x (pg, shift, PiOver2); + + sv_f64_t ret = __sv_atan_common (pg, pg, z, z, shift); + + /* Account for the sign of x and y. */ + ret = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (ret), sign_xy)); + + if (unlikely (svptest_any (pg, cmp_xy))) + { + return specialcase (y, x, ret, cmp_xy); + } + + return ret; +} + +strong_alias (__sv_atan2_x, _ZGVsMxvv_atan2) + +#endif diff --git a/pl/math/sv_atan_2u5.c b/pl/math/sv_atan_2u5.c index aa741f7..e0b621f 100644 --- a/pl/math/sv_atan_2u5.c +++ b/pl/math/sv_atan_2u5.c @@ -17,9 +17,9 @@ /* Fast implementation of SVE atan. Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using z=1/x and shift = pi/2. Largest errors are close to 1. The maximum observed - is 2.2 ulps: - __sv_atan(0x1.00050804cdc8cp+0) got 0x1.9224bd3c68773p-1 - want 0x1.9224bd3c68775p-1. */ + error is 2.22 ulps: + __sv_atan(0x1.0005fd947bf57p+0) got 0x1.9225b2c6cd6cdp-1 + want 0x1.9225b2c6cd6cfp-1. 
*/ sv_f64_t __sv_atan_x (sv_f64_t x, const svbool_t pg) { diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 190a294..ebc6c30 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -74,7 +74,7 @@ t erfcf 0x1p-26 0x1p5 40000 t erfcf -0x1p-26 -0x1p3 40000 t erfcf 0 inf 40000 -L=1.74 +L=1.78 t atan2 -10.0 10.0 50000 t atan2 -1.0 1.0 40000 t atan2 0.0 1.0 40000 @@ -484,7 +484,7 @@ L_erf=1.26 L_erff=0.76 # TODO tighten this once __v_atan2 is fixed L_atan2=2.9 -L_atan=2.15 +L_atan=1.73 L_atan2f=2.46 L_atanf=2.5 L_log1pf=1.53 @@ -499,10 +499,9 @@ L_sve_cos=1.61 L_sve_sinf=1.40 L_sve_sin=2.03 L_sve_atanf=2.9 -L_sve_atan=1.7 +L_sve_atan=1.73 L_sve_atan2f=2.45 -# TODO tighten this once __sv_atan2 is fixed -L_sve_atan2=2.0 +L_sve_atan2=1.73 L_sve_log10=1.97 L_sve_log10f=2.82 L_sve_logf=2.85 diff --git a/pl/math/v_atan_2u5.c b/pl/math/v_atan_2u5.c index 619bbb9..a0223ed 100644 --- a/pl/math/v_atan_2u5.c +++ b/pl/math/v_atan_2u5.c @@ -15,9 +15,9 @@ /* Fast implementation of vector atan. Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using - z=1/x and shift = pi/2. Maximum observed error is 2.14 ulps: - __v_atan(-0x1.02eac6432cb9ap+0) got -0x1.95063e76724c1p-1 - want -0x1.95063e76724c3p-1. */ + z=1/x and shift = pi/2. Maximum observed error is 2.22 ulps: + __v_atan(0x1.0005fd947bf57p+0) got 0x1.9225b2c6cd6cdp-1 + want 0x1.9225b2c6cd6cfp-1. */ VPCS_ATTR v_f64_t V_NAME (atan) (v_f64_t x) { -- cgit v1.2.3 From baef8cd322ffe43fd0b93c73e19a9b5974794de4 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Wed, 2 Nov 2022 15:14:34 +0000 Subject: pl/math: Add vector/Neon expm1f New routine is a vector port of the scalar algorithm, with fallback to the scalar variant for large and special input. This enables us to simplify elements of the algorithm which were necessary for large input. It also means that, as long as we fall back to the scalar for tiny input as well (dependent on the value of WANT_ERRNO), the routine sets fenv flags correctly. Some changes were needed in runulp.sh to test this, as previously we have not been interested in fenv behaviour of vector routines. 
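To illustrate the fallback idiom described above: lanes that need special
handling are routed through the scalar routine, which raises fenv
exceptions and sets errno exactly as the scalar code does. A simplified
stand-in for the library's v_call_f32 helper might look like the sketch
below (plain Neon intrinsics; the function name and shape are illustrative
only, not the library's actual implementation):

  #include <arm_neon.h>

  /* Apply the scalar fallback f to the lanes of x selected by cmp, keeping
     the vector fast-path result y in the remaining lanes.  */
  static float32x4_t
  fallback_lanes (float (*f) (float), float32x4_t x, float32x4_t y,
                  uint32x4_t cmp)
  {
    float xs[4], ys[4];
    uint32_t cs[4];
    vst1q_f32 (xs, x);
    vst1q_f32 (ys, y);
    vst1q_u32 (cs, cmp);
    for (int i = 0; i < 4; i++)
      if (cs[i])
        ys[i] = f (xs[i]);
    return vld1q_f32 (ys);
  }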
--- pl/math/include/mathlib.h | 4 ++ pl/math/s_expm1f_1u6.c | 6 +++ pl/math/test/mathbench_funcs.h | 5 +++ pl/math/test/runulp.sh | 28 ++++++++++++-- pl/math/test/ulp_funcs.h | 3 ++ pl/math/test/ulp_wrappers.h | 2 + pl/math/v_expm1f_1u6.c | 84 ++++++++++++++++++++++++++++++++++++++++++ pl/math/v_math.h | 10 +++++ pl/math/vn_expm1f_1u6.c | 12 ++++++ 9 files changed, 151 insertions(+), 3 deletions(-) create mode 100644 pl/math/s_expm1f_1u6.c create mode 100644 pl/math/v_expm1f_1u6.c create mode 100644 pl/math/vn_expm1f_1u6.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index afa845c..47d5dce 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -31,6 +31,7 @@ float __s_atanf (float); float __s_atan2f (float, float); float __s_erfcf (float); float __s_erff (float); +float __s_expm1f (float); float __s_log10f (float); float __s_log1pf (float); float __s_log2f (float); @@ -65,6 +66,7 @@ __f32x4_t __v_erff (__f32x4_t); __f64x2_t __v_erf (__f64x2_t); __f32x4_t __v_erfcf (__f32x4_t); __f64x2_t __v_erfc (__f64x2_t); +__f32x4_t __v_expm1f (__f32x4_t); __f32x4_t __v_log10f (__f32x4_t); __f64x2_t __v_log10 (__f64x2_t); __f32x4_t __v_log1pf (__f32x4_t); @@ -86,6 +88,7 @@ __vpcs __f32x4_t __vn_erff (__f32x4_t); __vpcs __f64x2_t __vn_erf (__f64x2_t); __vpcs __f32x4_t __vn_erfcf (__f32x4_t); __vpcs __f64x2_t __vn_erfc (__f64x2_t); +__vpcs __f32x4_t __vn_expm1f (__f32x4_t); __vpcs __f32x4_t __vn_log10f (__f32x4_t); __vpcs __f64x2_t __vn_log10 (__f64x2_t); __vpcs __f32x4_t __vn_log1pf (__f32x4_t); @@ -104,6 +107,7 @@ __vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_erfcf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_erfc (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_expm1f (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t); diff --git a/pl/math/s_expm1f_1u6.c b/pl/math/s_expm1f_1u6.c new file mode 100644 index 0000000..83385df --- /dev/null +++ b/pl/math/s_expm1f_1u6.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_expm1f_1u6.c" diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 76a64da..902088e 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -42,6 +42,7 @@ F (__s_erff, -4.0, 4.0) D (__s_erf, -6.0, 6.0) F (__s_erfcf, -6.0, 28.0) D (__s_erfc, -6.0, 28.0) +F (__s_expm1f, -9.9, 9.9) F (__s_log10f, 0.01, 11.1) D (__s_log10, 0.01, 11.1) F (__s_log1pf, -0.9, 10.0) @@ -59,6 +60,7 @@ VF (__v_erff, -4.0, 4.0) VD (__v_erf, -6.0, 6.0) VF (__v_erfcf, -6.0, 28.0) VD (__v_erfc, -6.0, 28.0) +VF (__v_expm1f, -9.9, 9.9) VD (__v_log10, 0.01, 11.1) VF (__v_log10f, 0.01, 11.1) VF (__v_log1pf, -0.9, 10.0) @@ -94,6 +96,9 @@ VNF (_ZGVnN4v_erfcf, -6.0, 28.0) VND (__vn_erfc, -6.0, 28.0) VND (_ZGVnN2v_erfc, -6.0, 28.0) +VNF (__vn_expm1f, -9.9, 9.9) +VNF (_ZGVnN4v_expm1f, -9.9, 9.9) + VNF (__vn_log10f, 0.01, 11.1) VNF (_ZGVnN4v_log10f, 0.01, 11.1) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index ebc6c30..009fde2 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -168,7 +168,7 @@ done # vector functions Ldir=0.5 r='n' -flags="${ULPFLAGS:--q} -f" +flags="${ULPFLAGS:--q}" runs= check __s_log10f 1 && runs=1 runv= @@ -334,6 +334,13 @@ range_log1p=' -1.0 inf 5000 ' +range_expm1f=' + 0 0x1p-23 1000 + -0 -0x1p-23 1000 + 0x1p-23 0x1.644716p6 1000000 + -0x1p-23 -0x1.9bbabcp+6 1000000 +' + range_sve_cosf=' 0 0xffff0000 10000 0x1p-4 0x1p4 500000 @@ -493,6 +500,7 @@ L_log2f=2.10 L_log2=2.10 L_tanf=2.7 L_log1p=1.97 +L_expm1f=1.02 L_sve_cosf=1.57 L_sve_cos=1.61 @@ -512,7 +520,7 @@ L_sve_erf=1.97 L_sve_tanf=2.7 L_sve_erfc=3.15 -while read G F R +while read G F R D do [ "$R" = 1 ] || continue case "$G" in \#*) continue ;; esac @@ -521,8 +529,18 @@ do while read X do [ -n "$X" ] || continue + # fenv checking is enabled by default, but we almost + # always want to disable it for vector routines, so a + # hack is needed. Pass a fourth argument to prevent -f + # being added to the run line. 
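+ # (The fourth field is read into "$D" above; a non-empty marker such
+ # as "EF" on the expm1f entries below keeps -f off the run line, so
+ # fenv behaviour is checked for those routines.)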
+ if [ -z "$D" ] + then + f="-f" + else + f="" + fi case "$X" in \#*) continue ;; esac - t $F $X + t $f $F $X done << EOF $range EOF @@ -594,6 +612,10 @@ log1p __s_log1p $runs log1p __v_log1p $runv log1p __vn_log1p $runvn log1p _ZGVnN2v_log1p $runvn +expm1f __s_expm1f $runs EF +expm1f __v_expm1f $runv EF +expm1f __vn_expm1f $runvn EF +expm1f _ZGVnN4v_expm1f $runvn EF if [ $WANT_SVE_MATH -eq 1 ]; then sve_cosf __sv_cosf $runsv diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 446db00..ad2719d 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -29,6 +29,7 @@ SF1 (erf) SD1 (erf) SF1 (erfc) SD1 (erfc) +SF1 (expm1) SF1 (log10) SD1 (log10) SF1 (log1p) @@ -46,6 +47,7 @@ VF1 (erf) VD1 (erf) VF1 (erfc) VD1 (erfc) +VF1 (expm1) VF1 (log10) VD1 (log10) VF1 (log1p) @@ -63,6 +65,7 @@ ZVNF1 (erf) ZVND1 (erf) ZVNF1 (erfc) ZVND1 (erfc) +ZVNF1 (expm1) ZVNF1 (log10) ZVND1 (log10) ZVNF1 (log1p) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 3547cf2..8087692 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -102,6 +102,7 @@ VF1_WRAP(atan) VF2_WRAP(atan2) VF1_WRAP(erf) VF1_WRAP(erfc) +VF1_WRAP(expm1) VF1_WRAP(log10) VF1_WRAP(log1p) VF1_WRAP(log2) @@ -119,6 +120,7 @@ ZVNF1_WRAP(atan) ZVNF2_WRAP(atan2) ZVNF1_WRAP(erf) ZVNF1_WRAP(erfc) +ZVNF1_WRAP(expm1) ZVNF1_WRAP(log10) ZVNF1_WRAP(log1p) ZVNF1_WRAP(log2) diff --git a/pl/math/v_expm1f_1u6.c b/pl/math/v_expm1f_1u6.c new file mode 100644 index 0000000..e18814e --- /dev/null +++ b/pl/math/v_expm1f_1u6.c @@ -0,0 +1,84 @@ +/* + * Single-precision vector exp(x) - 1 function. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" + +#if V_SUPPORTED + +#define Shift v_f32 (0x1.8p23f) +#define InvLn2 v_f32 (0x1.715476p+0f) +#define MLn2hi v_f32 (-0x1.62e4p-1f) +#define MLn2lo v_f32 (-0x1.7f7d1cp-20f) +#define AbsMask (0x7fffffff) +#define One (0x3f800000) +#define SpecialBound \ + (0x42af5e20) /* asuint(0x1.5ebc4p+6). Largest value of x for which expm1(x) \ + should round to -1. */ +#define TinyBound (0x34000000) /* asuint(0x1p-23). */ + +#define C(i) v_f32 (__expm1f_poly[i]) + +/* Single-precision vector exp(x) - 1 function. + The maximum error is 1.51 ULP: + expm1f(0x1.8baa96p-2) got 0x1.e2fb9p-2 + want 0x1.e2fb94p-2. */ +VPCS_ATTR +v_f32_t V_NAME (expm1f) (v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t ax = ix & AbsMask; + +#if WANT_ERRNO + /* If errno is to be set correctly, fall back to the scalar variant for all + lanes if any of them should trigger an exception. */ + v_u32_t special + = v_cond_u32 ((ax >= SpecialBound) | (ix == 0x80000000) | (ax < TinyBound)); + if (unlikely (v_any_u32 (special))) + return v_call_f32 (expm1f, x, x, v_u32 (0xffffffff)); +#else + /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf and -0. */ + v_u32_t special = v_cond_u32 ((ax >= SpecialBound) | (ix == 0x80000000)); +#endif + + /* Reduce argument to smaller range: + Let i = round(x / ln2) + and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where 2^i is exact because i is an integer. */ + v_f32_t j = v_fma_f32 (InvLn2, x, Shift) - Shift; + v_s32_t i = v_to_s32_f32 (j); + v_f32_t f = v_fma_f32 (j, MLn2hi, x); + f = v_fma_f32 (j, MLn2lo, f); + + /* Approximate expm1(f) using polynomial. + Taylor expansion for expm1(x) has the form: + x + ax^2 + bx^3 + cx^4 .... + So we calculate the polynomial P(f) = a + bf + cf^2 + ... 
+ and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ + + v_f32_t p = v_fma_f32 (C (4), f, C (3)); + p = v_fma_f32 (p, f, C (2)); + p = v_fma_f32 (p, f, C (1)); + p = v_fma_f32 (p, f, C (0)); + p = v_fma_f32 (f * f, p, f); + + /* Assemble the result. + expm1(x) ~= 2^i * (p + 1) - 1 + Let t = 2^i. */ + v_f32_t t = v_as_f32_u32 (v_as_u32_s32 (i << 23) + One); + /* expm1(x) ~= p * t + (t - 1). */ + v_f32_t y = v_fma_f32 (p, t, t - 1); + +#if !WANT_ERRNO + if (unlikely (v_any_u32 (special))) + return v_call_f32 (expm1f, x, y, special); +#endif + + return y; +} +VPCS_ALIAS +#endif diff --git a/pl/math/v_math.h b/pl/math/v_math.h index e98824f..a3f9c57 100644 --- a/pl/math/v_math.h +++ b/pl/math/v_math.h @@ -230,6 +230,11 @@ v_to_f32_s32 (v_s32_t x) { return x; } +static inline v_s32_t +v_to_s32_f32 (v_f32_t x) +{ + return x; +} static inline v_f32_t v_to_f32_u32 (v_u32_t x) { @@ -581,6 +586,11 @@ v_to_f32_s32 (v_s32_t x) { return (v_f32_t){x[0], x[1], x[2], x[3]}; } +static inline v_s32_t +v_to_s32_f32 (v_f32_t x) +{ + return vcvtq_s32_f32 (x); +} static inline v_f32_t v_to_f32_u32 (v_u32_t x) { diff --git a/pl/math/vn_expm1f_1u6.c b/pl/math/vn_expm1f_1u6.c new file mode 100644 index 0000000..5cbb929 --- /dev/null +++ b/pl/math/vn_expm1f_1u6.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_expm1f. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_expm1f, _ZGVnN4v_expm1f) +#include "v_expm1f_1u6.c" +#endif -- cgit v1.2.3 From 1e3d1055c5e8341ac6c9ee568867ef78ac4048a8 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Thu, 3 Nov 2022 14:12:19 +0000 Subject: pl/math: Add scalar sinhf New routine uses expm1f, and the __exp_dd helper for special cases. It is accurate to 2.3 ULP. --- pl/math/include/mathlib.h | 1 + pl/math/sinhf_2u3.c | 65 +++++++++++++++++++++++++++++++ pl/math/test/mathbench_funcs.h | 1 + pl/math/test/runulp.sh | 8 ++++ pl/math/test/testcases/directed/sinhf.tst | 21 ++++++++++ pl/math/test/ulp_funcs.h | 1 + 6 files changed, 97 insertions(+) create mode 100644 pl/math/sinhf_2u3.c create mode 100644 pl/math/test/testcases/directed/sinhf.tst diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 47d5dce..4a26092 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -17,6 +17,7 @@ float erff (float); float expm1f (float); float log10f (float); float log1pf (float); +float sinhf (float); float tanf (float); double acosh (double); diff --git a/pl/math/sinhf_2u3.c b/pl/math/sinhf_2u3.c new file mode 100644 index 0000000..a0459ca --- /dev/null +++ b/pl/math/sinhf_2u3.c @@ -0,0 +1,65 @@ +/* + * Single-precision sinh(x) function. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define AbsMask 0x7fffffff +#define Half 0x3f000000 +#define Expm1OFlowLimit \ + 0x42b17218 /* 0x1.62e43p+6, 2^7*ln2, minimum value for which expm1f \ + overflows. */ +#define OFlowLimit \ + 0x42b2d4fd /* 0x1.65a9fap+6, minimum positive value for which sinhf should \ + overflow. */ + +double +__exp_dd (double, double); + +/* Approximation for single-precision sinh(x) using expm1. + sinh(x) = (exp(x) - exp(-x)) / 2. + The maximum error is 2.26 ULP: + sinhf(0x1.e34a9ep-4) got 0x1.e469ep-4 want 0x1.e469e4p-4. 
*/
+float
+sinhf (float x)
+{
+  uint32_t ix = asuint (x);
+  uint32_t iax = ix & AbsMask;
+  float ax = asfloat (iax);
+  uint32_t sign = ix & ~AbsMask;
+  float halfsign = asfloat (Half | sign);
+
+  if (unlikely (iax >= Expm1OFlowLimit))
+    {
+      /* Special values and overflow. */
+      if (iax >= 0x7fc00001 || iax == 0x7f800000)
+	return x;
+      if (iax >= 0x7f800000)
+	return __math_invalidf (x);
+      if (iax >= OFlowLimit)
+	return __math_oflowf (sign);
+
+      /* expm1f overflows a little before sinhf (~88.7 vs ~89.4). We have to
+	 fill this gap by using a different algorithm; in this case we use a
+	 double-precision exp helper. For large x, sinh(x) is dominated by
+	 exp(x), however we cannot compute exp without overflow either. We use
+	 the identity
+	 exp(a) = (exp(a / 2)) ^ 2
+	 to compute sinh(x) ~= (exp(|x| / 2)) ^ 2 / 2    for x > 0
+			    ~= (exp(|x| / 2)) ^ 2 / -2   for x < 0.
+	 Greatest error in this region is 1.89 ULP:
+	 sinhf(0x1.65898cp+6) got 0x1.f00aep+127 want 0x1.f00adcp+127. */
+      float e = __exp_dd (ax / 2, 0);
+      return (e * halfsign) * e;
+    }
+
+  /* Use expm1f to retain acceptable precision for small numbers.
+     Let t = e^(|x|) - 1. */
+  float t = expm1f (ax);
+  /* Then sinh(x) = (t + t / (t + 1)) / 2   for x > 0
+		    (t + t / (t + 1)) / -2  for x < 0. */
+  return (t + t / (t + 1)) * halfsign;
+}
diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h
index 902088e..ee8120c 100644
--- a/pl/math/test/mathbench_funcs.h
+++ b/pl/math/test/mathbench_funcs.h
@@ -17,6 +17,7 @@ F (log10f, 0.01, 11.1)
 F (log1pf, -0.9, 10.0)
 F (log2f, 0.01, 11.1)
 F (sinf, -3.1, 3.1)
+F (sinhf, -10.0, 10.0)
 F (tanf, -3.1, 3.1)
 
 D (acosh, 1.0, 10.0)
diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh
index 009fde2..86b0c69 100755
--- a/pl/math/test/runulp.sh
+++ b/pl/math/test/runulp.sh
@@ -163,6 +163,14 @@ t expm1f -0 -0x1p-23 1000
 t expm1f 0x1p-23 0x1.644716p6 100000
 t expm1f -0x1p-23 -0x1.9bbabcp+6 100000
 
+L=1.76
+t sinhf 0 0x1.62e43p+6 100000
+t sinhf -0 -0x1.62e43p+6 100000
+t sinhf 0x1.62e43p+6 0x1.65a9fap+6 100
+t sinhf -0x1.62e43p+6 -0x1.65a9fap+6 100
+t sinhf 0x1.65a9fap+6 inf 100
+t sinhf -0x1.65a9fap+6 -inf 100
+
 done
 
 # vector functions
diff --git a/pl/math/test/testcases/directed/sinhf.tst b/pl/math/test/testcases/directed/sinhf.tst
new file mode 100644
index 0000000..9a5ee56
--- /dev/null
+++ b/pl/math/test/testcases/directed/sinhf.tst
@@ -0,0 +1,21 @@
+; sinhf.tst
+;
+; Copyright 2009-2022, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=sinhf op1=7fc00001 result=7fc00001 errno=0
+func=sinhf op1=ffc00001 result=7fc00001 errno=0
+func=sinhf op1=7f800001 result=7fc00001 errno=0 status=i
+func=sinhf op1=ff800001 result=7fc00001 errno=0 status=i
+func=sinhf op1=7f800000 result=7f800000 errno=0
+func=sinhf op1=7f7fffff result=7f800000 errno=ERANGE status=ox
+func=sinhf op1=ff800000 result=ff800000 errno=0
+func=sinhf op1=ff7fffff result=ff800000 errno=ERANGE status=ox
+func=sinhf op1=00000000 result=00000000 errno=0
+func=sinhf op1=80000000 result=80000000 errno=0
+
+; No exception is raised with certain versions of glibc. Functions
+; approximated by x near zero may not generate/implement flops and
+; thus may not raise exceptions.
+func=sinhf op1=00000001 result=00000001 errno=0 maybestatus=ux +func=sinhf op1=80000001 result=80000001 errno=0 maybestatus=ux diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index ad2719d..ecda07c 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -12,6 +12,7 @@ F1 (erf) F1 (expm1) F1 (log10) F1 (log1p) +F1 (sinh) D1 (acosh) D1 (asinh) D2 (atan2) -- cgit v1.2.3 From 5086aa1065bc0d6d2f32051fc849d2804415b812 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Fri, 4 Nov 2022 10:47:06 +0000 Subject: pl/math: Update ULP threshold for acosh Larger max was observed, updated comments and runulp.sh. --- pl/math/acosh_3u.c | 6 +++--- pl/math/test/runulp.sh | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pl/math/acosh_3u.c b/pl/math/acosh_3u.c index b946dcb..6ac64f6 100644 --- a/pl/math/acosh_3u.c +++ b/pl/math/acosh_3u.c @@ -33,9 +33,9 @@ log1p (double); want 0x1.71a06f50c34b6p+0. 0 <= x <= 2: Calculate the result using log1p. For x < 1, acosh(x) is - undefined. For 1 <= x <= 2, the largest observed error is 2.63 ULP: - acosh(0x1.072462f3df186p+0) got 0x1.e2a700043edabp-3 - want 0x1.e2a700043edaep-3. */ + undefined. For 1 <= x <= 2, the largest observed error is 2.69 ULP: + acosh(0x1.073528248093p+0) got 0x1.e4d9bd20684f3p-3 + want 0x1.e4d9bd20684f6p-3. */ double acosh (double x) { diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 86b0c69..8f04684 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -150,7 +150,7 @@ t acoshf 2 0x1p64 100000 t acoshf 0x1p64 inf 100000 t acoshf -0 -inf 10000 -L=2.14 +L=2.19 t acosh 0 1 10000 t acosh 1 2 100000 t acosh 2 0x1p511 100000 -- cgit v1.2.3 From 617d26f6fdbeb1f2a5567869ac94642ad0c6e6fa Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Fri, 4 Nov 2022 13:57:18 +0000 Subject: Add WANT_ERRNO config option This makes it easier for users to toggle errno off and on, and also makes it possible to toggle the behaviour of our tests depending on whether we expect errno to be set properly or not. --- config.mk.dist | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/config.mk.dist b/config.mk.dist index b29a9b0..25cfdca 100644 --- a/config.mk.dist +++ b/config.mk.dist @@ -69,6 +69,12 @@ ifeq ($(WANT_SVE_MATH), 1) endif math-cflags += -DWANT_SVE_MATH=$(WANT_SVE_MATH) +# If defined to 1, set errno in math functions according to ISO C. Many math +# libraries do not set errno, so this is 0 by default. It may need to be +# set to 1 if math.h has (math_errhandling & MATH_ERRNO) != 0. +WANT_ERRNO = 0 +math-cflags += -DWANT_ERRNO=$(WANT_ERRNO) + # Disable fenv checks #math-ulpflags = -q -f #math-testflags = -nostatus -- cgit v1.2.3 From 41d08c65d36a09038458299dcbf7cd807187ede6 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Wed, 9 Nov 2022 14:48:51 +0000 Subject: pl/math: Add vector/Neon sinhf New routine uses vector expm1f in the same way as the scalar variant. Fall back to the scalar for all special and large cases. Accurate to 2.3 ULP at the same point as the scalar. Also tidied up the flag to check fenv for vector routines. 
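
As an aside for readers, not part of the patch: both the scalar and vector
sinhf above hinge on rewriting sinh in terms of expm1, which behaves well
near zero, where computing (exp(x) - exp(-x)) / 2 directly would cancel
catastrophically. A minimal scalar model of that identity, with libm's
expm1f standing in for the routines' own expm1, a made-up name
sinhf_sketch, and all overflow/special-case handling omitted:

    #include <math.h>

    /* With t = expm1(|x|) = e^|x| - 1 we have e^|x| = t + 1 and
       e^-|x| = 1 / (t + 1), so
       sinh(|x|) = ((t + 1) - 1 / (t + 1)) / 2 = (t + t / (t + 1)) / 2.
       t -> 0 smoothly as x -> 0, so there is no 1 - 1 cancellation.  */
    static float
    sinhf_sketch (float x)
    {
      float t = expm1f (fabsf (x)); /* assumes |x| below the expm1f
				       overflow bound, ~0x1.62e43p+6  */
      return (t + t / (t + 1)) * copysignf (0.5f, x);
    }
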
--- pl/math/include/mathlib.h | 4 ++++ pl/math/s_sinhf_2u3.c | 6 ++++++ pl/math/test/mathbench_funcs.h | 5 +++++ pl/math/test/runulp.sh | 35 +++++++++++++++++++++++---------- pl/math/test/ulp_funcs.h | 3 +++ pl/math/test/ulp_wrappers.h | 2 ++ pl/math/v_sinhf_2u3.c | 44 ++++++++++++++++++++++++++++++++++++++++++ pl/math/vn_sinhf_2u3.c | 12 ++++++++++++ 8 files changed, 101 insertions(+), 10 deletions(-) create mode 100644 pl/math/s_sinhf_2u3.c create mode 100644 pl/math/v_sinhf_2u3.c create mode 100644 pl/math/vn_sinhf_2u3.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 4a26092..e99b8eb 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -36,6 +36,7 @@ float __s_expm1f (float); float __s_log10f (float); float __s_log1pf (float); float __s_log2f (float); +float __s_sinhf (float); float __s_tanf (float); double __s_atan (double); @@ -74,6 +75,7 @@ __f32x4_t __v_log1pf (__f32x4_t); __f64x2_t __v_log1p (__f64x2_t); __f32x4_t __v_log2f (__f32x4_t); __f64x2_t __v_log2 (__f64x2_t); +__f32x4_t __v_sinhf (__f32x4_t); __f32x4_t __v_tanf (__f32x4_t); #if __GNUC__ >= 9 || __clang_major__ >= 8 @@ -96,6 +98,7 @@ __vpcs __f32x4_t __vn_log1pf (__f32x4_t); __vpcs __f64x2_t __vn_log1p (__f64x2_t); __vpcs __f32x4_t __vn_log2f (__f32x4_t); __vpcs __f64x2_t __vn_log2 (__f64x2_t); +__vpcs __f32x4_t __vn_sinhf (__f32x4_t); __vpcs __f32x4_t __vn_tanf (__f32x4_t); /* Vector functions following the vector PCS using ABI names. */ @@ -115,6 +118,7 @@ __vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t); #endif diff --git a/pl/math/s_sinhf_2u3.c b/pl/math/s_sinhf_2u3.c new file mode 100644 index 0000000..ac6a269 --- /dev/null +++ b/pl/math/s_sinhf_2u3.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_sinhf_2u3.c" diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index ee8120c..ee4440d 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -50,6 +50,7 @@ F (__s_log1pf, -0.9, 10.0) D (__s_log1p, -0.9, 10.0) F (__s_log2f, 0.01, 11.1) D (__s_log2, 0.01, 11.1) +F (__s_sinhf, -10.0, 10.0) F (__s_tanf, -3.1, 3.1) #if __aarch64__ VF (__v_asinhf, -10.0, 10.0) @@ -68,6 +69,7 @@ VF (__v_log1pf, -0.9, 10.0) VD (__v_log1p, -0.9, 10.0) VF (__v_log2f, 0.01, 11.1) VD (__v_log2, 0.01, 11.1) +VF (__v_sinhf, -10.0, 10.0) VF (__v_tanf, -3.1, 3.1) #ifdef __vpcs VNF (__vn_asinhf, -10.0, 10.0) @@ -118,6 +120,9 @@ VNF (_ZGVnN4v_log2f, 0.01, 11.1) VND (__vn_log2, 0.01, 11.1) VND (_ZGVnN2v_log2, 0.01, 11.1) +VNF (__vn_sinhf, -10.0, 10.0) +VNF (_ZGVnN4v_sinhf, -10.0, 10.0) + VNF (__vn_tanf, -3.1, 3.1) VNF (_ZGVnN4v_tanf, -3.1, 3.1) #endif diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 8f04684..6470a89 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -349,6 +349,15 @@ range_expm1f=' -0x1p-23 -0x1.9bbabcp+6 1000000 ' +range_sinhf=' + 0 0x1.62e43p+6 100000 + -0 -0x1.62e43p+6 100000 + 0x1.62e43p+6 0x1.65a9fap+6 100 + -0x1.62e43p+6 -0x1.65a9fap+6 100 + 0x1.65a9fap+6 inf 100 + -0x1.65a9fap+6 -inf 100 +' + range_sve_cosf=' 0 0xffff0000 10000 0x1p-4 0x1p4 500000 @@ -509,6 +518,7 @@ L_log2=2.10 L_tanf=2.7 L_log1p=1.97 L_expm1f=1.02 +L_sinhf=1.76 L_sve_cosf=1.57 L_sve_cos=1.61 @@ -539,13 +549,14 @@ do [ -n "$X" ] || continue # fenv checking is enabled by default, but we almost # always want to disable it for vector routines, so a - # hack is needed. Pass a fourth argument to prevent -f - # being added to the run line. - if [ -z "$D" ] - then - f="-f" - else + # hack is needed. Pass "fenv" as fourth argument to + # prevent -f being added to the run line. + f="-f" + if [ "$D" = "fenv" ]; then f="" + elif [ ! 
-z "$D" ]; then + echo "Unrecognised 4th argument: $D" + exit 1 fi case "$X" in \#*) continue ;; esac t $f $F $X @@ -620,10 +631,14 @@ log1p __s_log1p $runs log1p __v_log1p $runv log1p __vn_log1p $runvn log1p _ZGVnN2v_log1p $runvn -expm1f __s_expm1f $runs EF -expm1f __v_expm1f $runv EF -expm1f __vn_expm1f $runvn EF -expm1f _ZGVnN4v_expm1f $runvn EF +expm1f __s_expm1f $runs fenv +expm1f __v_expm1f $runv fenv +expm1f __vn_expm1f $runvn fenv +expm1f _ZGVnN4v_expm1f $runvn fenv +sinhf __s_sinhf $runs fenv +sinhf __v_sinhf $runv fenv +sinhf __vn_sinhf $runvn fenv +sinhf _ZGVnN4v_sinhf $runvn fenv if [ $WANT_SVE_MATH -eq 1 ]; then sve_cosf __sv_cosf $runsv diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index ecda07c..ab5bcd6 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -37,6 +37,7 @@ SF1 (log1p) SD1 (log1p) SF1 (log2) SD1 (log2) +SF1 (sinh) SF1 (tan) #if __aarch64__ VF1 (asinh) @@ -55,6 +56,7 @@ VF1 (log1p) VD1 (log1p) VF1 (log2) VD1 (log2) +VF1 (sinh) VF1 (tan) #ifdef __vpcs ZVNF1 (asinh) @@ -73,6 +75,7 @@ ZVNF1 (log1p) ZVND1 (log1p) ZVNF1 (log2) ZVND1 (log2) +ZVNF1 (sinh) ZVNF1 (tan) #endif #endif diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 8087692..210b738 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -106,6 +106,7 @@ VF1_WRAP(expm1) VF1_WRAP(log10) VF1_WRAP(log1p) VF1_WRAP(log2) +VF1_WRAP(sinh) VF1_WRAP(tan) VD1_WRAP(atan) VD2_WRAP(atan2) @@ -124,6 +125,7 @@ ZVNF1_WRAP(expm1) ZVNF1_WRAP(log10) ZVNF1_WRAP(log1p) ZVNF1_WRAP(log2) +ZVNF1_WRAP(sinh) ZVNF1_WRAP(tan) ZVND1_WRAP(atan) ZVND2_WRAP(atan2) diff --git a/pl/math/v_sinhf_2u3.c b/pl/math/v_sinhf_2u3.c new file mode 100644 index 0000000..4397bca --- /dev/null +++ b/pl/math/v_sinhf_2u3.c @@ -0,0 +1,44 @@ +/* + * Single-precision vector sinh(x) function. + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "mathlib.h" + +#if V_SUPPORTED + +#define AbsMask 0x7fffffff +#define Half 0x3f000000 +#define Expm1OFlowLimit \ + 0x42b17218 /* 0x1.62e43p+6, 2^7*ln2, minimum value for which expm1f \ + overflows. */ + +/* Approximation for vector single-precision sinh(x) using expm1. + sinh(x) = (exp(x) - exp(-x)) / 2. + The maximum error is 2.26 ULP: + __v_sinhf(0x1.e34a9ep-4) got 0x1.e469ep-4 want 0x1.e469e4p-4. */ +VPCS_ATTR v_f32_t V_NAME (sinhf) (v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t iax = ix & AbsMask; + v_f32_t ax = v_as_f32_u32 (iax); + v_u32_t sign = ix & ~AbsMask; + v_f32_t halfsign = v_as_f32_u32 (sign | Half); + + v_u32_t special = v_cond_u32 (iax >= Expm1OFlowLimit); + /* Fall back to the scalar variant for all lanes if any of them should trigger + an exception. */ + if (unlikely (v_any_u32 (special))) + return v_call_f32 (sinhf, x, x, v_u32 (-1)); + + /* Up to the point that expm1f overflows, we can use it to calculate sinhf + using a slight rearrangement of the definition of asinh. This allows us to + retain acceptable accuracy for very small inputs. */ + v_f32_t t = V_NAME (expm1f) (ax); + return (t + t / (t + 1)) * halfsign; +} +VPCS_ALIAS + +#endif diff --git a/pl/math/vn_sinhf_2u3.c b/pl/math/vn_sinhf_2u3.c new file mode 100644 index 0000000..fcedb6d --- /dev/null +++ b/pl/math/vn_sinhf_2u3.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_sinhf. + * + * Copyright (c) 2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_sinhf, _ZGVnN4v_sinhf) +#include "v_sinhf_2u3.c" +#endif -- cgit v1.2.3 From 4ed9b4953967fe8356e7680b488d567d298570fb Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Wed, 9 Nov 2022 14:49:09 +0000 Subject: Fix tests for WANT_SVE_MATH=1 Also skips the test line when D is not "fenv" or empty, i.e. when it is the SVE if statement. This used to work but was broken by adding the D variable, so the tests did not run properly when WANT_SVE_MATH was enabled. Now fixed. --- pl/math/test/runulp.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 6470a89..53043d3 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -540,7 +540,7 @@ L_sve_erfc=3.15 while read G F R D do - [ "$R" = 1 ] || continue + [ "$R" = 1 ] && { [[ $G != sve_* ]] || [ $WANT_SVE_MATH -eq 1 ]; } || continue case "$G" in \#*) continue ;; esac eval range="\${range_$G}" eval L="\${L_$G}" @@ -640,7 +640,6 @@ sinhf __v_sinhf $runv fenv sinhf __vn_sinhf $runvn fenv sinhf _ZGVnN4v_sinhf $runvn fenv -if [ $WANT_SVE_MATH -eq 1 ]; then sve_cosf __sv_cosf $runsv sve_cosf _ZGVsMxv_cosf $runsv sve_sinf __sv_sinf $runsv @@ -676,7 +675,6 @@ sve_erf __sv_erf $runsv sve_erf _ZGVsMxv_erf $runsv sve_erfc __sv_erfc $runsv sve_erfc _ZGVsMxv_erfc $runsv -fi EOF [ 0 -eq $FAIL ] || { -- cgit v1.2.3 From 78a876ad601673d07fe4600d7ebfcf46587d819d Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Wed, 9 Nov 2022 14:49:23 +0000 Subject: Make fenv checking dependent on WANT_ERRNO We want these tests to pass regardless of whether the user has enabled or disabled WANT_ERRNO - this is now supported by a WANT_ERRNO config option, which will be added to config.mk.dist in a follow-on. --- pl/math/Dir.mk | 2 +- pl/math/test/runulp.sh | 23 +++++++++++++++-------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/pl/math/Dir.mk b/pl/math/Dir.mk index 7909ea0..bb81052 100644 --- a/pl/math/Dir.mk +++ b/pl/math/Dir.mk @@ -128,7 +128,7 @@ check-pl/math-rtest: $(math-host-tools) $(math-tools) cat $(math-rtests) | build/pl/bin/rtest | $(EMULATOR) build/pl/bin/mathtest $(math-testflags) check-pl/math-ulp: $(math-tools) - WANT_SVE_MATH=$(WANT_SVE_MATH) ULPFLAGS="$(math-ulpflags)" build/pl/bin/runulp.sh $(EMULATOR) + WANT_ERRNO=$(WANT_ERRNO) WANT_SVE_MATH=$(WANT_SVE_MATH) ULPFLAGS="$(math-ulpflags)" build/pl/bin/runulp.sh $(EMULATOR) check-pl/math: check-pl/math-test check-pl/math-rtest check-pl/math-ulp diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 53043d3..1690d5f 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -548,15 +548,22 @@ do do [ -n "$X" ] || continue # fenv checking is enabled by default, but we almost - # always want to disable it for vector routines, so a - # hack is needed. Pass "fenv" as fourth argument to - # prevent -f being added to the run line. + # always want to disable it for vector routines. There + # are, however, a small number of vector routines in + # pl/math which are supposed to set fenv correctly + # when WANT_ERRNO is enabled. A hack is needed to + # ensure fenv checking is enabled for routines where + # this is the case. Pass "fenv" as fourth argument to + # prevent -f being added to the run line when + # WANT_ERRNO is enabled. f="-f" - if [ "$D" = "fenv" ]; then - f="" - elif [ ! 
-z "$D" ]; then - echo "Unrecognised 4th argument: $D" - exit 1 + if [ $WANT_ERRNO -eq 1 ]; then + if [ "$D" = "fenv" ]; then + f="" + elif [ ! -z "$D" ]; then + echo "Unrecognised 4th argument: $D" + exit 1 + fi fi case "$X" in \#*) continue ;; esac t $f $F $X -- cgit v1.2.3 From 43140a886fa8f0907ec8545d57534cc4343b8b9c Mon Sep 17 00:00:00 2001 From: Add joeram01 Date: Mon, 31 Oct 2022 12:23:43 +0000 Subject: pl/math: Add scalar and vector/Neon coshf New routines use single-precision exp, which has been copied from math/. Scalar is accurate to 1.9 ULP, Neon to 2.4 ULP. Also use the new expf helper in scalar sinhf. --- pl/math/coshf_1u9.c | 60 ++++++++++++++++++++++++ pl/math/expf.c | 76 +++++++++++++++++++++++++++++++ pl/math/expf_data.c | 31 +++++++++++++ pl/math/include/mathlib.h | 5 ++ pl/math/math_config.h | 9 ++++ pl/math/s_coshf_2u4.c | 6 +++ pl/math/sinhf_2u3.c | 6 +-- pl/math/test/mathbench_funcs.h | 6 +++ pl/math/test/runulp.sh | 22 +++++++++ pl/math/test/testcases/directed/coshf.tst | 15 ++++++ pl/math/test/ulp_funcs.h | 4 ++ pl/math/test/ulp_wrappers.h | 2 + pl/math/v_coshf_2u4.c | 62 +++++++++++++++++++++++++ pl/math/vn_coshf_2u4.c | 12 +++++ 14 files changed, 313 insertions(+), 3 deletions(-) create mode 100644 pl/math/coshf_1u9.c create mode 100644 pl/math/expf.c create mode 100644 pl/math/expf_data.c create mode 100644 pl/math/s_coshf_2u4.c create mode 100644 pl/math/test/testcases/directed/coshf.tst create mode 100644 pl/math/v_coshf_2u4.c create mode 100644 pl/math/vn_coshf_2u4.c diff --git a/pl/math/coshf_1u9.c b/pl/math/coshf_1u9.c new file mode 100644 index 0000000..ca3f767 --- /dev/null +++ b/pl/math/coshf_1u9.c @@ -0,0 +1,60 @@ +/* + * Single-precision cosh(x) function. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define AbsMask 0x7fffffff +#define TinyBound 0x20000000 /* 0x1p-63: Round to 1 below this. */ +#define SpecialBound \ + 0x42ad496c /* 0x1.5a92d8p+6: expf overflows above this, so have to use \ + special case. */ + +float +optr_aor_exp_f32 (float); + +static NOINLINE float +specialcase (float x, uint32_t iax) +{ + if (iax == 0x7f800000) + return INFINITY; + if (iax > 0x7f800000) + return __math_invalidf (x); + if (iax <= TinyBound) + /* For tiny x, avoid underflow by just returning 1. */ + return 1; + /* Otherwise SpecialBound <= |x| < Inf. x is too large to calculate exp(x) + without overflow, so use exp(|x|/2) instead. For large x cosh(x) is + dominated by exp(x), so return: + cosh(x) ~= (exp(|x|/2))^2 / 2. */ + float t = optr_aor_exp_f32 (asfloat (iax) / 2); + return (0.5 * t) * t; +} + +/* Approximation for single-precision cosh(x) using exp. + cosh(x) = (exp(x) + exp(-x)) / 2. + The maximum error is 1.89 ULP, observed for |x| > SpecialBound: + coshf(0x1.65898cp+6) got 0x1.f00aep+127 want 0x1.f00adcp+127. + The maximum error observed for TinyBound < |x| < SpecialBound is 1.02 ULP: + coshf(0x1.50a3cp+0) got 0x1.ff21dcp+0 want 0x1.ff21dap+0. */ +float +coshf (float x) +{ + uint32_t ix = asuint (x); + uint32_t iax = ix & AbsMask; + float ax = asfloat (iax); + + if (unlikely (iax <= TinyBound || iax >= SpecialBound)) + { + /* x is tiny, large or special. */ + return specialcase (x, iax); + } + + /* Compute cosh using the definition: + coshf(x) = exp(x) / 2 + exp(-x) / 2. 
*/ + float t = optr_aor_exp_f32 (ax); + return 0.5f * t + 0.5f / t; +} diff --git a/pl/math/expf.c b/pl/math/expf.c new file mode 100644 index 0000000..fa03b05 --- /dev/null +++ b/pl/math/expf.c @@ -0,0 +1,76 @@ +/* + * Single-precision e^x function. + * + * Copyright (c) 2017-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include +#include +#include "math_config.h" + +/* +EXPF_TABLE_BITS = 5 +EXPF_POLY_ORDER = 3 + +ULP error: 0.502 (nearest rounding.) +Relative error: 1.69 * 2^-34 in [-ln2/64, ln2/64] (before rounding.) +Wrong count: 170635 (all nearest rounding wrong results with fma.) +Non-nearest ULP error: 1 (rounded ULP error) +*/ + +#define N (1 << EXPF_TABLE_BITS) +#define InvLn2N __expf_data.invln2_scaled +#define T __expf_data.tab +#define C __expf_data.poly_scaled + +static inline uint32_t +top12 (float x) +{ + return asuint (x) >> 20; +} + +float +optr_aor_exp_f32 (float x) +{ + uint32_t abstop; + uint64_t ki, t; + /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ + double_t kd, xd, z, r, r2, y, s; + + xd = (double_t) x; + abstop = top12 (x) & 0x7ff; + if (unlikely (abstop >= top12 (88.0f))) + { + /* |x| >= 88 or x is nan. */ + if (asuint (x) == asuint (-INFINITY)) + return 0.0f; + if (abstop >= top12 (INFINITY)) + return x + x; + if (x > 0x1.62e42ep6f) /* x > log(0x1p128) ~= 88.72 */ + return __math_oflowf (0); + if (x < -0x1.9fe368p6f) /* x < log(0x1p-150) ~= -103.97 */ + return __math_uflowf (0); + } + + /* x*N/Ln2 = k + r with r in [-1/2, 1/2] and int k. */ + z = InvLn2N * xd; + + /* Round and convert z to int, the result is in [-150*N, 128*N] and + ideally nearest int is used, otherwise the magnitude of r can be + bigger which gives larger approximation error. */ + kd = roundtoint (z); + ki = converttoint (z); + r = z - kd; + + /* exp(x) = 2^(k/N) * 2^(r/N) ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */ + t = T[ki % N]; + t += ki << (52 - EXPF_TABLE_BITS); + s = asdouble (t); + z = C[0] * r + C[1]; + r2 = r * r; + y = C[2] * r + 1; + y = z * r2 + y; + y = y * s; + return eval_as_float (y); +} diff --git a/pl/math/expf_data.c b/pl/math/expf_data.c new file mode 100644 index 0000000..1525fcc --- /dev/null +++ b/pl/math/expf_data.c @@ -0,0 +1,31 @@ +/* + * Coeffs and table entries for single-precision exp. Copied from + * math/exp2f_data.c, with EXP2F_TABLE_BITS == 32. + * + * Copyright (c) 2017-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define N (1 << EXPF_TABLE_BITS) + +const struct expf_data __expf_data = { + /* tab[i] = uint(2^(i/N)) - (i << 52-BITS) + used for computing 2^(k/N) for an int |k| < 150 N as + double(tab[k%N] + (k << 52-BITS)) */ + .tab = { +0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51, +0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1, +0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, +0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585, +0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13, +0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, +0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069, +0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540, + }, + .invln2_scaled = 0x1.71547652b82fep+0 * N, + .poly_scaled = { + 0x1.c6af84b912394p-5/N/N/N, 0x1.ebfce50fac4f3p-3/N/N, 0x1.62e42ff0c52d6p-1/N, + }, +}; diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index e99b8eb..55223a3 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -12,6 +12,7 @@ float acoshf (float); float asinhf (float); float atan2f (float, float); +float coshf (float); float erfcf (float); float erff (float); float expm1f (float); @@ -30,6 +31,7 @@ double log1p (double); float __s_asinhf (float); float __s_atanf (float); float __s_atan2f (float, float); +float __s_coshf (float); float __s_erfcf (float); float __s_erff (float); float __s_expm1f (float); @@ -64,6 +66,7 @@ __f32x4_t __v_atanf (__f32x4_t); __f64x2_t __v_atan (__f64x2_t); __f32x4_t __v_atan2f (__f32x4_t, __f32x4_t); __f64x2_t __v_atan2 (__f64x2_t, __f64x2_t); +__f32x4_t __v_coshf (__f32x4_t); __f32x4_t __v_erff (__f32x4_t); __f64x2_t __v_erf (__f64x2_t); __f32x4_t __v_erfcf (__f32x4_t); @@ -87,6 +90,7 @@ __vpcs __f32x4_t __vn_atanf (__f32x4_t); __vpcs __f64x2_t __vn_atan (__f64x2_t); __vpcs __f32x4_t __vn_atan2f (__f32x4_t, __f32x4_t); __vpcs __f64x2_t __vn_atan2 (__f64x2_t, __f64x2_t); +__vpcs __f32x4_t __vn_coshf (__f32x4_t); __vpcs __f32x4_t __vn_erff (__f32x4_t); __vpcs __f64x2_t __vn_erf (__f64x2_t); __vpcs __f32x4_t __vn_erfcf (__f32x4_t); @@ -107,6 +111,7 @@ __vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t); __vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t); __vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_erfcf (__f32x4_t); diff --git a/pl/math/math_config.h b/pl/math/math_config.h index 2d39e91..dc660f1 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -541,4 +541,13 @@ extern const float __sv_expf_poly[SV_EXPF_POLY_ORDER - 1] HIDDEN; #define EXPM1F_POLY_ORDER 5 extern const float __expm1f_poly[EXPM1F_POLY_ORDER] HIDDEN; +#define EXPF_TABLE_BITS 5 +#define EXPF_POLY_ORDER 3 +extern const struct expf_data +{ + uint64_t tab[1 << EXPF_TABLE_BITS]; + double invln2_scaled; + double poly_scaled[EXPF_POLY_ORDER]; +} __expf_data HIDDEN; + #endif diff --git a/pl/math/s_coshf_2u4.c b/pl/math/s_coshf_2u4.c new file mode 100644 index 0000000..1b7091b --- /dev/null +++ b/pl/math/s_coshf_2u4.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_coshf_2u4.c" diff --git a/pl/math/sinhf_2u3.c b/pl/math/sinhf_2u3.c index a0459ca..c616dac 100644 --- a/pl/math/sinhf_2u3.c +++ b/pl/math/sinhf_2u3.c @@ -16,8 +16,8 @@ 0x42b2d4fd /* 0x1.65a9fap+6, minimum positive value for which sinhf should \ overflow. */ -double -__exp_dd (double, double); +float +optr_aor_exp_f32 (float); /* Approximation for single-precision sinh(x) using expm1. sinh(x) = (exp(x) - exp(-x)) / 2. @@ -52,7 +52,7 @@ sinhf (float x) ~= (exp(|x| / 2)) ^ 2 / -2 for x < 0. Greatest error in this region is 1.89 ULP: sinhf(0x1.65898cp+6) got 0x1.f00aep+127 want 0x1.f00adcp+127. */ - float e = __exp_dd (ax / 2, 0); + float e = optr_aor_exp_f32 (ax / 2); return (e * halfsign) * e; } diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index ee4440d..544a7d1 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -10,6 +10,7 @@ F (asinhf, -10.0, 10.0) F (atanf, -10.0, 10.0) {"atan2f", 'f', 0, -10.0, 10.0, {.f = atan2f_wrap}}, F (cosf, -3.1, 3.1) +F (coshf, -10.0, 10.0) F (erfcf, -4.0, 10.0) F (erff, -4.0, 4.0) F (expm1f, -9.9, 9.9) @@ -39,6 +40,7 @@ F (__s_atanf, -10.0, 10.0) D (__s_atan, -10.0, 10.0) {"__s_atan2f", 'f', 0, -10.0, 10.0, {.f = __s_atan2f_wrap}}, {"__s_atan2", 'd', 0, -10.0, 10.0, {.d = __s_atan2_wrap}}, +F (__s_coshf, -10.0, 10.0) F (__s_erff, -4.0, 4.0) D (__s_erf, -6.0, 6.0) F (__s_erfcf, -6.0, 28.0) @@ -58,6 +60,7 @@ VF (__v_atanf, -10.0, 10.0) VD (__v_atan, -10.0, 10.0) {"__v_atan2f", 'f', 'v', -10.0, 10.0, {.vf = __v_atan2f_wrap}}, {"__v_atan2", 'd', 'v', -10.0, 10.0, {.vd = __v_atan2_wrap}}, +VF (__v_coshf, -10.0, 10.0) VF (__v_erff, -4.0, 4.0) VD (__v_erf, -6.0, 6.0) VF (__v_erfcf, -6.0, 28.0) @@ -87,6 +90,9 @@ VND (_ZGVnN2v_atan, -10.0, 10.0) {"__vn_atan2", 'd', 'n', -10.0, 10.0, {.vnd = __vn_atan2_wrap}}, {"_ZGVnN2vv_atan2", 'd', 'n', -10.0, 10.0, {.vnd = _Z_atan2_wrap}}, +VNF (__vn_coshf, -10.0, 10.0) +VNF (_ZGVnN4v_coshf, -10.0, 10.0) + VNF (__vn_erff, -4.0, 4.0) VNF (_ZGVnN4v_erff, -4.0, 4.0) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 1690d5f..d30e707 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -171,6 +171,14 @@ t sinhf -0x1.62e43p+6 -0x1.65a9fap+6 100 t sinhf 0x1.65a9fap+6 inf 100 t sinhf -0x1.65a9fap+6 -inf 100 +L=1.89 +t coshf 0 0x1p-63 100 +t coshf 0 0x1.5a92d8p+6 80000 +t coshf 0x1.5a92d8p+6 inf 2000 +t coshf -0 -0x1p-63 100 +t coshf -0 -0x1.5a92d8p+6 80000 +t coshf -0x1.5a92d8p+6 -inf 2000 + done # vector functions @@ -358,6 +366,15 @@ range_sinhf=' -0x1.65a9fap+6 -inf 100 ' +range_coshf=' + 0 0x1p-63 100 + 0 0x1.5a92d8p+6 80000 + 0x1.5a92d8p+6 inf 2000 + -0 -0x1p-63 100 + -0 -0x1.5a92d8p+6 80000 + -0x1.5a92d8p+6 -inf 2000 +' + range_sve_cosf=' 0 0xffff0000 10000 0x1p-4 0x1p4 500000 @@ -519,6 +536,7 @@ L_tanf=2.7 L_log1p=1.97 L_expm1f=1.02 L_sinhf=1.76 +L_coshf=1.89 L_sve_cosf=1.57 L_sve_cos=1.61 @@ -646,6 +664,10 @@ sinhf __s_sinhf $runs fenv sinhf __v_sinhf $runv fenv sinhf __vn_sinhf $runvn fenv sinhf _ZGVnN4v_sinhf $runvn fenv +coshf __s_coshf $runs fenv +coshf __v_coshf $runv fenv +coshf __vn_coshf $runvn fenv +coshf _ZGVnN4v_coshf $runvn fenv sve_cosf __sv_cosf $runsv sve_cosf _ZGVsMxv_cosf $runsv diff --git a/pl/math/test/testcases/directed/coshf.tst b/pl/math/test/testcases/directed/coshf.tst new file mode 100644 index 0000000..cdc1d8d --- /dev/null +++ b/pl/math/test/testcases/directed/coshf.tst @@ -0,0 +1,15 @@ +; coshf.tst +; +; 
Copyright (c) 2007-2022, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=coshf op1=7fc00001 result=7fc00001 errno=0 +func=coshf op1=ffc00001 result=7fc00001 errno=0 +func=coshf op1=7f800001 result=7fc00001 errno=0 status=i +func=coshf op1=ff800001 result=7fc00001 errno=0 status=i +func=coshf op1=7f800000 result=7f800000 errno=0 +func=coshf op1=7f7fffff result=7f800000 errno=ERANGE status=ox +func=coshf op1=ff800000 result=7f800000 errno=0 +func=coshf op1=ff7fffff result=7f800000 errno=ERANGE status=ox +func=coshf op1=00000000 result=3f800000 errno=0 +func=coshf op1=80000000 result=3f800000 errno=0 diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index ab5bcd6..ef7c7be 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -7,6 +7,7 @@ F1 (acosh) F1 (asinh) F2 (atan2) +F1 (cosh) F1 (erfc) F1 (erf) F1 (expm1) @@ -26,6 +27,7 @@ SF1 (atan) SD1 (atan) SF2 (atan2) SD2 (atan2) +SF1 (cosh) SF1 (erf) SD1 (erf) SF1 (erfc) @@ -45,6 +47,7 @@ VF1 (atan) VD1 (atan) VF2 (atan2) VD2 (atan2) +VF1 (cosh) VF1 (erf) VD1 (erf) VF1 (erfc) @@ -64,6 +67,7 @@ ZVNF1 (atan) ZVND1 (atan) ZVNF2 (atan2) ZVND2 (atan2) +ZVNF1 (cosh) ZVNF1 (erf) ZVND1 (erf) ZVNF1 (erfc) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 210b738..93cf75e 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -100,6 +100,7 @@ DECL_POW_INT_REF(ref_powi, long double, double, int) VF1_WRAP(asinh) VF1_WRAP(atan) VF2_WRAP(atan2) +VF1_WRAP(cosh) VF1_WRAP(erf) VF1_WRAP(erfc) VF1_WRAP(expm1) @@ -119,6 +120,7 @@ VD1_WRAP(log2) ZVNF1_WRAP(asinh) ZVNF1_WRAP(atan) ZVNF2_WRAP(atan2) +ZVNF1_WRAP(cosh) ZVNF1_WRAP(erf) ZVNF1_WRAP(erfc) ZVNF1_WRAP(expm1) diff --git a/pl/math/v_coshf_2u4.c b/pl/math/v_coshf_2u4.c new file mode 100644 index 0000000..7d7a228 --- /dev/null +++ b/pl/math/v_coshf_2u4.c @@ -0,0 +1,62 @@ +/* + * Single-precision vector cosh(x) function. + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "mathlib.h" + +#define AbsMask 0x7fffffff +#define TinyBound 0x20000000 /* 0x1p-63: Round to 1 below this. */ +#define SpecialBound \ + 0x42ad496c /* 0x1.5a92d8p+6: expf overflows above this, so have to use \ + special case. */ +#define Half v_f32 (0.5) + +#if V_SUPPORTED + +v_f32_t V_NAME (expf) (v_f32_t); + +/* Single-precision vector cosh, using vector expf. + Maximum error is 2.38 ULP: + __v_coshf(0x1.e8001ep+1) got 0x1.6a491ep+4 want 0x1.6a4922p+4. */ +VPCS_ATTR v_f32_t V_NAME (coshf) (v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t iax = ix & AbsMask; + v_f32_t ax = v_as_f32_u32 (iax); + v_u32_t special = v_cond_u32 (iax >= SpecialBound); + +#if WANT_ERRNO + /* If errno is to be set correctly, fall back to the scalar variant for all + inputs if any input is a special value or above the bound at which expf + overflows. */ + if (unlikely (v_any_u32 (special))) + return v_call_f32 (coshf, x, x, v_u32 (-1)); + + v_u32_t tiny = v_cond_u32 (iax <= TinyBound); + /* If any input is tiny, avoid underflow exception by fixing tiny lanes of + input to 1, which will generate no exceptions, and then also fixing tiny + lanes of output to 1 just before return. */ + if (unlikely (v_any_u32 (tiny))) + ax = v_sel_f32 (tiny, v_f32 (1), ax); +#endif + + /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. 
*/
+  v_f32_t t = V_NAME (expf) (ax);
+  v_f32_t y = t * Half + Half / t;
+
+#if WANT_ERRNO
+  if (unlikely (v_any_u32 (tiny)))
+    return v_sel_f32 (tiny, v_f32 (1), y);
+#else
+  if (unlikely (v_any_u32 (special)))
+    return v_call_f32 (coshf, x, y, special);
+#endif
+
+  return y;
+}
+VPCS_ALIAS
+
+#endif
diff --git a/pl/math/vn_coshf_2u4.c b/pl/math/vn_coshf_2u4.c
new file mode 100644
index 0000000..6bc4635
--- /dev/null
+++ b/pl/math/vn_coshf_2u4.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_coshf.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_coshf, _ZGVnN4v_coshf)
+#include "v_coshf_2u4.c"
+#endif
-- cgit v1.2.3


From 151020a369757cd33e13ca3dd9dfadfc2a15a905 Mon Sep 17 00:00:00 2001
From: Joe Ramsay
Date: Wed, 9 Nov 2022 14:52:26 +0000
Subject: pl/math: Add scalar expm1

New routine uses the same algorithm as the single-precision routine,
and is accurate to 2.5 ULP.
---
 pl/math/expm1_2u5.c                       | 92 +++++++++++++++++++++++++++++++
 pl/math/expm1_data.c                      | 21 +++++++
 pl/math/include/mathlib.h                 |  1 +
 pl/math/math_config.h                     |  3 +
 pl/math/test/mathbench_funcs.h            |  1 +
 pl/math/test/runulp.sh                    |  9 +++
 pl/math/test/testcases/directed/expm1.tst | 21 +++++++
 pl/math/test/ulp_funcs.h                  |  1 +
 pl/math/tools/expm1.sollya                | 21 +++++++
 9 files changed, 170 insertions(+)
 create mode 100644 pl/math/expm1_2u5.c
 create mode 100644 pl/math/expm1_data.c
 create mode 100644 pl/math/test/testcases/directed/expm1.tst
 create mode 100644 pl/math/tools/expm1.sollya

diff --git a/pl/math/expm1_2u5.c b/pl/math/expm1_2u5.c
new file mode 100644
index 0000000..c701d7e
--- /dev/null
+++ b/pl/math/expm1_2u5.c
@@ -0,0 +1,92 @@
+/*
+ * Double-precision e^x - 1 function.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+#define InvLn2 0x1.71547652b82fep0
+#define Ln2hi 0x1.62e42fefa39efp-1
+#define Ln2lo 0x1.abc9e3b39803fp-56
+#define Shift 0x1.8p52
+#define TinyBound \
+  0x3cc0000000000000 /* 0x1p-51, below which expm1(x) is within 2 ULP of x. */
+#define BigBound 0x1.63108c75a1937p+9 /* Above which expm1(x) overflows. */
+#define NegBound -0x1.740bf7c0d927dp+9 /* Below which expm1(x) rounds to -1. */
+#define AbsMask 0x7fffffffffffffff
+
+#define C(i) __expm1_poly[i]
+
+static inline double
+eval_poly (double f, double f2)
+{
+  /* Evaluate custom polynomial using Estrin scheme. */
+  double p_01 = fma (f, C (1), C (0));
+  double p_23 = fma (f, C (3), C (2));
+  double p_45 = fma (f, C (5), C (4));
+  double p_67 = fma (f, C (7), C (6));
+  double p_89 = fma (f, C (9), C (8));
+
+  double p_03 = fma (f2, p_23, p_01);
+  double p_47 = fma (f2, p_67, p_45);
+  double p_8a = fma (f2, C (10), p_89);
+
+  double f4 = f2 * f2;
+  double p_07 = fma (f4, p_47, p_03);
+  return fma (f4 * f4, p_8a, p_07);
+}
+
+/* Approximation for exp(x) - 1 using polynomial on a reduced interval.
+   The maximum observed error is 2.17 ULP:
+   expm1(0x1.63f90a866748dp-2) got 0x1.a9af56603878ap-2
+			       want 0x1.a9af566038788p-2. */
+double
+expm1 (double x)
+{
+  uint64_t ix = asuint64 (x);
+  uint64_t ax = ix & AbsMask;
+
+  /* Tiny, +Infinity. */
+  if (ax <= TinyBound || ix == 0x7ff0000000000000)
+    return x;
+
+  /* +/-NaN. */
+  if (ax > 0x7ff0000000000000)
+    return __math_invalid (x);
+
+  /* Result is too large to be represented as a double. 
*/ + if (x >= 0x1.63108c75a1937p+9) + return __math_oflow (0); + + /* Result rounds to -1 in double precision. */ + if (x <= NegBound) + return -1; + + /* Reduce argument to smaller range: + Let i = round(x / ln2) + and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where 2^i is exact because i is an integer. */ + double j = fma (InvLn2, x, Shift) - Shift; + int64_t i = j; + double f = fma (j, -Ln2hi, x); + f = fma (j, -Ln2lo, f); + + /* Approximate expm1(f) using polynomial. + Taylor expansion for expm1(x) has the form: + x + ax^2 + bx^3 + cx^4 .... + So we calculate the polynomial P(f) = a + bf + cf^2 + ... + and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ + double f2 = f * f; + double p = fma (f2, eval_poly (f, f2), f); + + /* Assemble the result, using a slight rearrangement to achieve acceptable + accuracy. + expm1(x) ~= 2^i * (p + 1) - 1 + Let t = 2^(i - 1). */ + double t = ldexp (0.5, i); + /* expm1(x) ~= 2 * (p * t + (t - 1/2)). */ + return 2 * fma (p, t, t - 0.5); +} diff --git a/pl/math/expm1_data.c b/pl/math/expm1_data.c new file mode 100644 index 0000000..93aaa47 --- /dev/null +++ b/pl/math/expm1_data.c @@ -0,0 +1,21 @@ +/* + * Coefficients for double-precision e^x - 1 function. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Generated using fpminimax, see tools/expm1.sollya for details. */ +const double __expm1_poly[] = {0x1p-1, + 0x1.5555555555559p-3, + 0x1.555555555554bp-5, + 0x1.111111110f663p-7, + 0x1.6c16c16c1b5f3p-10, + 0x1.a01a01affa35dp-13, + 0x1.a01a018b4ecbbp-16, + 0x1.71ddf82db5bb4p-19, + 0x1.27e517fc0d54bp-22, + 0x1.af5eedae67435p-26, + 0x1.1f143d060a28ap-29}; diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 55223a3..e0ad61a 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -25,6 +25,7 @@ double acosh (double); double asinh (double); double atan2 (double, double); double erfc (double); +double expm1 (double); double log10 (double); double log1p (double); diff --git a/pl/math/math_config.h b/pl/math/math_config.h index dc660f1..68b6ee7 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -550,4 +550,7 @@ extern const struct expf_data double poly_scaled[EXPF_POLY_ORDER]; } __expf_data HIDDEN; +#define EXPM1_POLY_ORDER 11 +extern const double __expm1_poly[EXPM1_POLY_ORDER] HIDDEN; + #endif diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 544a7d1..751d5fc 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -28,6 +28,7 @@ D (atan, -10.0, 10.0) D (cos, -3.1, 3.1) D (erf, -6,6) D (erfc, -6.0, 28.0) +D (expm1, -9.9, 9.9) D (log10, 0.01, 11.1) D (log1p, -0.9, 10.0) D (log2, 0.01, 11.1) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index d30e707..87911a7 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -179,6 +179,15 @@ t coshf -0 -0x1p-63 100 t coshf -0 -0x1.5a92d8p+6 80000 t coshf -0x1.5a92d8p+6 -inf 2000 + +L=1.68 +t expm1 0 0x1p-51 1000 +t expm1 -0 -0x1p-51 1000 +t expm1 0x1p-51 0x1.63108c75a1937p+9 100000 +t expm1 -0x1p-51 -0x1.740bf7c0d927dp+9 100000 +t expm1 0x1.63108c75a1937p+9 inf 100 +t expm1 -0x1.740bf7c0d927dp+9 -inf 100 + done # vector functions diff --git a/pl/math/test/testcases/directed/expm1.tst b/pl/math/test/testcases/directed/expm1.tst new file mode 100644 index 0000000..d382c18 --- /dev/null +++ b/pl/math/test/testcases/directed/expm1.tst @@ 
-0,0 +1,21 @@ +; expm1.tst +; +; Copyright 2009-2022, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=expm1 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=expm1 op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=expm1 op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=expm1 op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=expm1 op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 +func=expm1 op1=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=ox +func=expm1 op1=fff00000.00000000 result=bff00000.00000000 errno=0 +func=expm1 op1=ffefffff.ffffffff result=bff00000.00000000 errno=0 +func=expm1 op1=00000000.00000000 result=00000000.00000000 errno=0 +func=expm1 op1=80000000.00000000 result=80000000.00000000 errno=0 +; Inconsistent behavior was detected for the following 2 cases. +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=expm1 op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux +func=expm1 op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index ef7c7be..ff94e1d 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -19,6 +19,7 @@ D1 (asinh) D2 (atan2) F1 (tan) D1 (erfc) +D1 (expm1) D1 (log10) D1 (log1p) #if WANT_VMATH diff --git a/pl/math/tools/expm1.sollya b/pl/math/tools/expm1.sollya new file mode 100644 index 0000000..587db46 --- /dev/null +++ b/pl/math/tools/expm1.sollya @@ -0,0 +1,21 @@ +// polynomial for approximating exp(x)-1 in double precision +// +// Copyright (c) 2022, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 12; + +a = -log(2)/2; +b = log(2)/2; + +f = proc(y) { + return exp(y)-1; +}; + +poly = fpminimax(f(x), deg, [|double ...|], [a;b]); + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 2 to deg do round(coeff(poly,i), D, RN); -- cgit v1.2.3 From 1721f53563004249849968a4f78a3ed162b5e8e1 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Wed, 9 Nov 2022 14:52:37 +0000 Subject: pl/math: Add vector/Neon expm1 New routine is a vector port of the scalar algorithm, with fallback to the scalar variant for large and special input. This enables us to simplify elements of the algorithm which were necessary for large input. It also means that, as long as we fall back to the scalar for tiny input as well (dependent on the value of WANT_ERRNO), the routine sets fenv flags correctly. 
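
For illustration, a minimal scalar sketch of the reduction and
reconstruction scheme that the scalar and vector routines share
(expm1_sketch is a hypothetical name; the tuned 11-term polynomial is
truncated to two terms and all special-case handling is omitted, so the
accuracy here is illustrative only):

    #include <math.h>

    static double
    expm1_sketch (double x)
    {
      /* j = round(x / ln2) via the shift trick: adding 0x1.8p52 forces
         rounding to the nearest integer in the double format.  */
      double j = fma (0x1.71547652b82fep0, x, 0x1.8p52) - 0x1.8p52;
      /* f = x - j * ln2, with ln2 split into high and low parts, so
         f lands in [-ln2/2, ln2/2].  */
      double f = fma (j, -0x1.62e42fefa39efp-1, x);
      f = fma (j, -0x1.abc9e3b39803fp-56, f);
      /* expm1(f) ~= f + f^2 * P(f); P truncated to 1/2 + f/6 here.  */
      double p = fma (f * f, 0.5 + f / 6.0, f);
      /* expm1(x) = 2^j * (expm1(f) + 1) - 1; with t = 2^(j-1) this is
         2 * (p * t + (t - 0.5)), deferring the overflow-prone step.  */
      double t = ldexp (0.5, (int) j);
      return 2 * fma (p, t, t - 0.5);
    }
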
--- pl/math/include/mathlib.h | 4 ++ pl/math/s_expm1_2u5.c | 6 +++ pl/math/test/mathbench_funcs.h | 5 +++ pl/math/test/runulp.sh | 14 ++++++ pl/math/test/ulp_funcs.h | 3 ++ pl/math/test/ulp_wrappers.h | 2 + pl/math/v_expm1_2u5.c | 100 +++++++++++++++++++++++++++++++++++++++++ pl/math/v_math.h | 11 +++++ pl/math/vn_expm1_2u5.c | 12 +++++ 9 files changed, 157 insertions(+) create mode 100644 pl/math/s_expm1_2u5.c create mode 100644 pl/math/v_expm1_2u5.c create mode 100644 pl/math/vn_expm1_2u5.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index e0ad61a..9ebe539 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -46,6 +46,7 @@ double __s_atan (double); double __s_atan2 (double, double); double __s_erf (double); double __s_erfc (double); +double __s_expm1 (double); double __s_log10 (double); double __s_log1p (double); double __s_log2 (double); @@ -73,6 +74,7 @@ __f64x2_t __v_erf (__f64x2_t); __f32x4_t __v_erfcf (__f32x4_t); __f64x2_t __v_erfc (__f64x2_t); __f32x4_t __v_expm1f (__f32x4_t); +__f64x2_t __v_expm1 (__f64x2_t); __f32x4_t __v_log10f (__f32x4_t); __f64x2_t __v_log10 (__f64x2_t); __f32x4_t __v_log1pf (__f32x4_t); @@ -97,6 +99,7 @@ __vpcs __f64x2_t __vn_erf (__f64x2_t); __vpcs __f32x4_t __vn_erfcf (__f32x4_t); __vpcs __f64x2_t __vn_erfc (__f64x2_t); __vpcs __f32x4_t __vn_expm1f (__f32x4_t); +__vpcs __f64x2_t __vn_expm1 (__f64x2_t); __vpcs __f32x4_t __vn_log10f (__f32x4_t); __vpcs __f64x2_t __vn_log10 (__f64x2_t); __vpcs __f32x4_t __vn_log1pf (__f32x4_t); @@ -118,6 +121,7 @@ __vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_erfcf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_erfc (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_expm1f (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_expm1 (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t); diff --git a/pl/math/s_expm1_2u5.c b/pl/math/s_expm1_2u5.c new file mode 100644 index 0000000..00827da --- /dev/null +++ b/pl/math/s_expm1_2u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_expm1_2u5.c" diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 751d5fc..e73206e 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -47,6 +47,7 @@ D (__s_erf, -6.0, 6.0) F (__s_erfcf, -6.0, 28.0) D (__s_erfc, -6.0, 28.0) F (__s_expm1f, -9.9, 9.9) +D (__s_expm1, -9.9, 9.9) F (__s_log10f, 0.01, 11.1) D (__s_log10, 0.01, 11.1) F (__s_log1pf, -0.9, 10.0) @@ -67,6 +68,7 @@ VD (__v_erf, -6.0, 6.0) VF (__v_erfcf, -6.0, 28.0) VD (__v_erfc, -6.0, 28.0) VF (__v_expm1f, -9.9, 9.9) +VD (__v_expm1, -9.9, 9.9) VD (__v_log10, 0.01, 11.1) VF (__v_log10f, 0.01, 11.1) VF (__v_log1pf, -0.9, 10.0) @@ -109,6 +111,9 @@ VND (_ZGVnN2v_erfc, -6.0, 28.0) VNF (__vn_expm1f, -9.9, 9.9) VNF (_ZGVnN4v_expm1f, -9.9, 9.9) +VND (__vn_expm1, -9.9, 9.9) +VND (_ZGVnN2v_expm1, -9.9, 9.9) + VNF (__vn_log10f, 0.01, 11.1) VNF (_ZGVnN4v_log10f, 0.01, 11.1) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 87911a7..c92892b 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -384,6 +384,15 @@ range_coshf=' -0x1.5a92d8p+6 -inf 2000 ' +range_expm1=' + 0 0x1p-51 1000 + -0 -0x1p-51 1000 + 0x1p-51 0x1.63108c75a1937p+9 100000 + -0x1p-51 -0x1.740bf7c0d927dp+9 100000 + 0x1.63108c75a1937p+9 inf 100 + -0x1.740bf7c0d927dp+9 -inf 100 +' + range_sve_cosf=' 0 0xffff0000 10000 0x1p-4 0x1p4 500000 @@ -546,6 +555,7 @@ L_log1p=1.97 L_expm1f=1.02 L_sinhf=1.76 L_coshf=1.89 +L_expm1=1.68 L_sve_cosf=1.57 L_sve_cos=1.61 @@ -624,6 +634,10 @@ log2 __s_log2 $runs log2 __v_log2 $runv log2 __vn_log2 $runvn log2 _ZGVnN2v_log2 $runvn +expm1 __s_expm1 $runs fenv +expm1 __v_expm1 $runv fenv +expm1 __vn_expm1 $runvn fenv +expm1 _ZGVnN2v_expm1 $runvn fenv atanf __s_atanf $runs atanf __v_atanf $runv diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index ff94e1d..98b63c8 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -34,6 +34,7 @@ SD1 (erf) SF1 (erfc) SD1 (erfc) SF1 (expm1) +SD1 (expm1) SF1 (log10) SD1 (log10) SF1 (log1p) @@ -54,6 +55,7 @@ VD1 (erf) VF1 (erfc) VD1 (erfc) VF1 (expm1) +VD1 (expm1) VF1 (log10) VD1 (log10) VF1 (log1p) @@ -74,6 +76,7 @@ ZVND1 (erf) ZVNF1 (erfc) ZVND1 (erfc) ZVNF1 (expm1) +ZVND1 (expm1) ZVNF1 (log10) ZVND1 (log10) ZVNF1 (log1p) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 93cf75e..9c639c1 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -113,6 +113,7 @@ VD1_WRAP(atan) VD2_WRAP(atan2) VD1_WRAP(erf) VD1_WRAP(erfc) +VD1_WRAP(expm1) VD1_WRAP(log10) VD1_WRAP(log1p) VD1_WRAP(log2) @@ -133,6 +134,7 @@ ZVND1_WRAP(atan) ZVND2_WRAP(atan2) ZVND1_WRAP(erf) ZVND1_WRAP(erfc) +ZVND1_WRAP(expm1) ZVND1_WRAP(log10) ZVND1_WRAP(log1p) ZVND1_WRAP(log2) diff --git a/pl/math/v_expm1_2u5.c b/pl/math/v_expm1_2u5.c new file mode 100644 index 0000000..425ad88 --- /dev/null +++ b/pl/math/v_expm1_2u5.c @@ -0,0 +1,100 @@ +/* + * Double-precision vector exp(x) - 1 function. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" + +#if V_SUPPORTED + +#define InvLn2 v_f64 (0x1.71547652b82fep0) +#define MLn2hi v_f64 (-0x1.62e42fefa39efp-1) +#define MLn2lo v_f64 (-0x1.abc9e3b39803fp-56) +#define Shift v_f64 (0x1.8p52) +#define TinyBound \ + 0x3cc0000000000000 /* 0x1p-51, below which expm1(x) is within 2 ULP of x. */ +#define SpecialBound \ + 0x40862b7d369a5aa9 /* 0x1.62b7d369a5aa9p+9. 
For |x| > SpecialBound, the \
+		      final stage of the algorithm overflows so fall back to \
+		      scalar. */
+#define AbsMask 0x7fffffffffffffff
+#define One 0x3ff0000000000000
+
+#define C(i) v_f64 (__expm1_poly[i])
+
+static inline v_f64_t
+eval_poly (v_f64_t f, v_f64_t f2)
+{
+  /* Evaluate custom polynomial using Estrin scheme. */
+  v_f64_t p_01 = v_fma_f64 (f, C (1), C (0));
+  v_f64_t p_23 = v_fma_f64 (f, C (3), C (2));
+  v_f64_t p_45 = v_fma_f64 (f, C (5), C (4));
+  v_f64_t p_67 = v_fma_f64 (f, C (7), C (6));
+  v_f64_t p_89 = v_fma_f64 (f, C (9), C (8));
+
+  v_f64_t p_03 = v_fma_f64 (f2, p_23, p_01);
+  v_f64_t p_47 = v_fma_f64 (f2, p_67, p_45);
+  v_f64_t p_8a = v_fma_f64 (f2, C (10), p_89);
+
+  v_f64_t f4 = f2 * f2;
+  v_f64_t p_07 = v_fma_f64 (f4, p_47, p_03);
+  return v_fma_f64 (f4 * f4, p_8a, p_07);
+}
+
+/* Double-precision vector exp(x) - 1 function.
+   The maximum observed error is 2.18 ULP:
+   __v_expm1(0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2
+				   want 0x1.a8b9ea8d66e2p-2. */
+VPCS_ATTR
+v_f64_t V_NAME (expm1) (v_f64_t x)
+{
+  v_u64_t ix = v_as_u64_f64 (x);
+  v_u64_t ax = ix & AbsMask;
+
+#if WANT_ERRNO
+  /* If errno is to be set correctly, fall back to the scalar variant for all
+     lanes if any of them should trigger an exception. */
+  v_u64_t special = v_cond_u64 ((ax >= SpecialBound) | (ax <= TinyBound));
+  if (unlikely (v_any_u64 (special)))
+    return v_call_f64 (expm1, x, x, v_u64 (-1));
+#else
+  /* Large input, NaNs and Infs. */
+  v_u64_t special = v_cond_u64 (ax >= SpecialBound);
+#endif
+
+  /* Reduce argument to smaller range:
+     Let i = round(x / ln2)
+     and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
+     exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+     where 2^i is exact because i is an integer. */
+  v_f64_t j = v_fma_f64 (InvLn2, x, Shift) - Shift;
+  v_s64_t i = v_to_s64_f64 (j);
+  v_f64_t f = v_fma_f64 (j, MLn2hi, x);
+  f = v_fma_f64 (j, MLn2lo, f);
+
+  /* Approximate expm1(f) using polynomial.
+     Taylor expansion for expm1(x) has the form:
+     x + ax^2 + bx^3 + cx^4 ....
+     So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+     and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
+  v_f64_t f2 = f * f;
+  v_f64_t p = v_fma_f64 (f2, eval_poly (f, f2), f);
+
+  /* Assemble the result.
+     expm1(x) ~= 2^i * (p + 1) - 1
+     Let t = 2^i. */
+  v_f64_t t = v_as_f64_u64 (v_as_u64_s64 (i << 52) + One);
+  /* expm1(x) ~= p * t + (t - 1). */
+  v_f64_t y = v_fma_f64 (p, t, t - 1);
+
+#if !WANT_ERRNO
+  if (unlikely (v_any_u64 (special)))
+    return v_call_f64 (expm1, x, y, special);
+#endif
+
+  return y;
+}
+VPCS_ALIAS
+#endif
diff --git a/pl/math/v_math.h b/pl/math/v_math.h
index a3f9c57..d4597c8 100644
--- a/pl/math/v_math.h
+++ b/pl/math/v_math.h
@@ -400,6 +400,12 @@ v_to_f64_u64 (v_u64_t x)
 {
   return x;
 }
+
+static inline v_s64_t
+v_to_s64_f64 (v_f64_t x)
+{
+  return x;
+}
 /* reinterpret as type1 from type2. */
 static inline v_u64_t
 v_as_u64_f64 (v_f64_t x)
@@ -761,6 +767,11 @@ v_to_f64_u64 (v_u64_t x)
 {
   return (v_f64_t){x[0], x[1]};
 }
+static inline v_s64_t
+v_to_s64_f64 (v_f64_t x)
+{
+  return vcvtq_s64_f64 (x);
+}
 /* reinterpret as type1 from type2. */
 static inline v_u64_t
 v_as_u64_f64 (v_f64_t x)
diff --git a/pl/math/vn_expm1_2u5.c b/pl/math/vn_expm1_2u5.c
new file mode 100644
index 0000000..fc88b06
--- /dev/null
+++ b/pl/math/vn_expm1_2u5.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_expm1.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_expm1, _ZGVnN2v_expm1)
+#include "v_expm1_2u5.c"
+#endif
-- cgit v1.2.3


From cf69308ea773524eec9365108d608e4c9e4036af Mon Sep 17 00:00:00 2001
From: Joe Ramsay
Date: Wed, 9 Nov 2022 14:52:49 +0000
Subject: pl/math/test: Simplify ulp and bench macros

Reduces the amount of boilerplate developers need to write for new
routines.
---
 pl/math/test/mathbench_funcs.h | 190 ++++++++++++-----------------------------
 pl/math/test/ulp_funcs.h       | 164 ++++++++++++++---------------------
 pl/math/test/ulp_wrappers.h    |  50 +++++------
 3 files changed, 144 insertions(+), 260 deletions(-)

diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h
index e73206e..d09cdb0 100644
--- a/pl/math/test/mathbench_funcs.h
+++ b/pl/math/test/mathbench_funcs.h
@@ -5,6 +5,27 @@
  * Copyright (c) 2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
+
+#ifdef __vpcs
+
+#define ZVNF(f, a, b) F(__s_##f, a, b) VF(__v_##f, a, b) VNF(__vn_##f, a, b) VNF(_ZGVnN4v_##f, a, b)
+#define ZVND(f, a, b) D(__s_##f, a, b) VD(__v_##f, a, b) VND(__vn_##f, a, b) VND(_ZGVnN2v_##f, a, b)
+
+#elif __aarch64__
+
+#define ZVNF(f, a, b) F(__s_##f, a, b) VF(__v_##f, a, b)
+#define ZVND(f, a, b) D(__s_##f, a, b) VD(__v_##f, a, b)
+
+#else
+
+#define ZVNF(f, a, b) F(__s_##f, a, b)
+#define ZVND(f, a, b) D(__s_##f, a, b)
+
+#endif
+
+#define ZSVF(f, a, b) SVF(__sv_##f##_x, a, b) SVF(_ZGVsMxv_##f, a, b)
+#define ZSVD(f, a, b) SVD(__sv_##f##_x, a, b) SVD(_ZGVsMxv_##f, a, b)
+
 F (acoshf, 1.0, 10.0)
 F (asinhf, -10.0, 10.0)
 F (atanf, -10.0, 10.0)
@@ -36,154 +57,53 @@ D (log2, 0.01, 11.1)
 D (sin, -3.1, 3.1)
 
 #if WANT_VMATH
-F (__s_asinhf, -10.0, 10.0)
-F (__s_atanf, -10.0, 10.0)
-D (__s_atan, -10.0, 10.0)
+ZVNF (asinhf, -10.0, 10.0)
+ZVNF (atanf, -10.0, 10.0)
+ZVND (atan, -10.0, 10.0)
+ZVNF (coshf, -10.0, 10.0)
+ZVNF (erff, -4.0, 4.0)
+ZVND (erf, -6.0, 6.0)
+ZVNF (erfcf, -6.0, 28.0)
+ZVND (erfc, -6.0, 28.0)
+ZVNF (expm1f, -9.9, 9.9)
+ZVND (expm1, -9.9, 9.9)
+ZVNF (log10f, 0.01, 11.1)
+ZVND (log10, 0.01, 11.1)
+ZVNF (log1pf, -0.9, 10.0)
+ZVND (log1p, -0.9, 10.0)
+ZVNF (log2f, 0.01, 11.1)
+ZVND (log2, 0.01, 11.1)
+ZVNF (sinhf, -10.0, 10.0)
+ZVNF (tanf, -3.1, 3.1)
 {"__s_atan2f", 'f', 0, -10.0, 10.0, {.f = __s_atan2f_wrap}},
 {"__s_atan2", 'd', 0, -10.0, 10.0, {.d = __s_atan2_wrap}},
-F (__s_coshf, -10.0, 10.0)
-F (__s_erff, -4.0, 4.0)
-D (__s_erf, -6.0, 6.0)
-F (__s_erfcf, -6.0, 28.0)
-D (__s_erfc, -6.0, 28.0)
-F (__s_expm1f, -9.9, 9.9)
-D (__s_expm1, -9.9, 9.9)
-F (__s_log10f, 0.01, 11.1)
-D (__s_log10, 0.01, 11.1)
-F (__s_log1pf, -0.9, 10.0)
-D (__s_log1p, -0.9, 10.0)
-F (__s_log2f, 0.01, 11.1)
-D (__s_log2, 0.01, 11.1)
-F (__s_sinhf, -10.0, 10.0)
-F (__s_tanf, -3.1, 3.1)
-#if __aarch64__
-VF (__v_asinhf, -10.0, 10.0)
-VF (__v_atanf, -10.0, 10.0)
-VD (__v_atan, -10.0, 10.0)
 {"__v_atan2f", 'f', 'v', -10.0, 10.0, {.vf = __v_atan2f_wrap}},
 {"__v_atan2", 'd', 'v', -10.0, 10.0, {.vd = __v_atan2_wrap}},
-VF (__v_coshf, -10.0, 10.0)
-VF (__v_erff, -4.0, 4.0)
-VD (__v_erf, -6.0, 6.0)
-VF (__v_erfcf, -6.0, 28.0)
-VD (__v_erfc, -6.0, 28.0)
-VF (__v_expm1f, -9.9, 9.9)
-VD (__v_expm1, -9.9, 9.9)
-VD (__v_log10, 0.01, 11.1)
-VF (__v_log10f, 0.01, 11.1)
-VF (__v_log1pf, -0.9, 10.0)
-VD (__v_log1p, -0.9, 10.0)
-VF (__v_log2f, 0.01, 11.1)
-VD (__v_log2, 0.01, 11.1)
-VF (__v_sinhf, -10.0, 10.0)
-VF (__v_tanf, -3.1, 3.1)
-#ifdef __vpcs
-VNF (__vn_asinhf, 
-10.0, 10.0) -VNF (_ZGVnN4v_asinhf, -10.0, 10.0) - -VNF (__vn_atanf, -10.0, 10.0) -VNF (_ZGVnN4v_atanf, -10.0, 10.0) - -VND (__vn_atan, -10.0, 10.0) -VND (_ZGVnN2v_atan, -10.0, 10.0) - {"__vn_atan2f", 'f', 'n', -10.0, 10.0, {.vnf = __vn_atan2f_wrap}}, {"_ZGVnN4vv_atan2f", 'f', 'n', -10.0, 10.0, {.vnf = _Z_atan2f_wrap}}, - {"__vn_atan2", 'd', 'n', -10.0, 10.0, {.vnd = __vn_atan2_wrap}}, {"_ZGVnN2vv_atan2", 'd', 'n', -10.0, 10.0, {.vnd = _Z_atan2_wrap}}, -VNF (__vn_coshf, -10.0, 10.0) -VNF (_ZGVnN4v_coshf, -10.0, 10.0) - -VNF (__vn_erff, -4.0, 4.0) -VNF (_ZGVnN4v_erff, -4.0, 4.0) - -VND (__vn_erf, -6.0, 6.0) -VND (_ZGVnN2v_erf, -6.0, 6.0) - -VNF (__vn_erfcf, -6.0, 28.0) -VNF (_ZGVnN4v_erfcf, -6.0, 28.0) - -VND (__vn_erfc, -6.0, 28.0) -VND (_ZGVnN2v_erfc, -6.0, 28.0) - -VNF (__vn_expm1f, -9.9, 9.9) -VNF (_ZGVnN4v_expm1f, -9.9, 9.9) - -VND (__vn_expm1, -9.9, 9.9) -VND (_ZGVnN2v_expm1, -9.9, 9.9) - -VNF (__vn_log10f, 0.01, 11.1) -VNF (_ZGVnN4v_log10f, 0.01, 11.1) - -VND (__vn_log10, 0.01, 11.1) -VND (_ZGVnN2v_log10, 0.01, 11.1) - -VNF (__vn_log1pf, -0.9, 10.0) -VNF (_ZGVnN4v_log1pf, -0.9, 10.0) - -VND (__vn_log1p, -0.9, 10.0) -VND (_ZGVnN2v_log1p, -0.9, 10.0) - -VNF (__vn_log2f, 0.01, 11.1) -VNF (_ZGVnN4v_log2f, 0.01, 11.1) - -VND (__vn_log2, 0.01, 11.1) -VND (_ZGVnN2v_log2, 0.01, 11.1) - -VNF (__vn_sinhf, -10.0, 10.0) -VNF (_ZGVnN4v_sinhf, -10.0, 10.0) - -VNF (__vn_tanf, -3.1, 3.1) -VNF (_ZGVnN4v_tanf, -3.1, 3.1) -#endif -#endif #if WANT_SVE_MATH -SVF (__sv_atanf_x, -3.1, 3.1) -SVF (_ZGVsMxv_atanf, -3.1, 3.1) -SVD (__sv_atan_x, -3.1, 3.1) -SVD (_ZGVsMxv_atan, -3.1, 3.1) - +ZSVF (atanf, -3.1, 3.1) +ZSVD (atan, -3.1, 3.1) {"__sv_atan2f_x", 'f', 'n', -10.0, 10.0, {.svf = __sv_atan2f_wrap}}, {"_ZGVsMxvv_atan2f", 'f', 'n', -10.0, 10.0, {.svf = _Z_sv_atan2f_wrap}}, -{"__sv_atan2", 'd', 'n', -10.0, 10.0, {.svd = __sv_atan2_wrap}}, +{"__sv_atan2_x", 'd', 'n', -10.0, 10.0, {.svd = __sv_atan2_wrap}}, {"_ZGVsM2vv_atan2", 'd', 'n', -10.0, 10.0, {.svd = _Z_sv_atan2_wrap}}, - -SVF (__sv_erff_x, -4.0, 4.0) -SVF (_ZGVsMxv_erff, -4.0, 4.0) -SVD (__sv_erf_x, -4.0, 4.0) -SVD (_ZGVsMxv_erf, -4.0, 4.0) - -SVD (__sv_erfc_x, -4, 10) -SVD (_ZGVsMxv_erfc, -4, 10) - -SVF (__sv_expf_x, -9.9, 9.9) -SVF (_ZGVsMxv_expf, -9.9, 9.9) - -SVF (__sv_cosf_x, -3.1, 3.1) -SVF (_ZGVsMxv_cosf, -3.1, 3.1) -SVF (__sv_sinf_x, -3.1, 3.1) -SVF (_ZGVsMxv_sinf, -3.1, 3.1) - -SVF (__sv_logf_x, 0.01, 11.1) -SVF (_ZGVsMxv_logf, 0.01, 11.1) -SVD (__sv_log_x, 0.01, 11.1) -SVD (_ZGVsMxv_log, 0.01, 11.1) - -SVF (__sv_log10f_x, 0.01, 11.1) -SVF (_ZGVsMxv_log10f, 0.01, 11.1) -SVD (__sv_log10_x, 0.01, 11.1) -SVD (_ZGVsMxv_log10, 0.01, 11.1) - -SVD (__sv_cos_x, -3.1, 3.1) -SVD (_ZGVsMxv_cos, -3.1, 3.1) -SVD (__sv_sin_x, -3.1, 3.1) -SVD (_ZGVsMxv_sin, -3.1, 3.1) - -SVF (__sv_tanf_x, -3.1, 3.1) -SVF (_ZGVsMxv_tanf, -3.1, 3.1) - +ZSVF (erff, -4.0, 4.0) +ZSVD (erf, -4.0, 4.0) +ZSVD (erfc, -4, 10) +ZSVF (expf, -9.9, 9.9) +ZSVF (cosf, -3.1, 3.1) +ZSVD (cos, -3.1, 3.1) +ZSVF (sinf, -3.1, 3.1) +ZSVD (sin, -3.1, 3.1) +ZSVF (logf, 0.01, 11.1) +ZSVD (log, 0.01, 11.1) +ZSVF (log10f, 0.01, 11.1) +ZSVD (log10, 0.01, 11.1) +ZSVF (tanf, -3.1, 3.1) {"__sv_powif_x", 'f', 'n', -10.0, 10.0, {.svf = __sv_powif_wrap}}, {"_ZGVsMxvv_powi", 'f', 'n', -10.0, 10.0, {.svf = _Z_sv_powi_wrap}}, {"__sv_powi_x", 'd', 'n', -10.0, 10.0, {.svd = __sv_powi_wrap}}, diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 98b63c8..a6c3866 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -4,6 +4,35 @@ * Copyright (c) 2022, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ + +#ifdef __vpcs + +#define _ZVNF1(f) SF1 (f) VF1 (f) ZVNF1 (f) +#define _ZVND1(f) SD1 (f) VD1 (f) ZVND1 (f) +#define _ZVNF2(f) SF2 (f) VF2 (f) ZVNF2 (f) +#define _ZVND2(f) SD2 (f) VD2 (f) ZVND2 (f) + +#elif __aarch64 + +#define _ZVNF1(f) SF1 (f) VF1 (f) +#define _ZVND1(f) SD1 (f) VD1 (f) +#define _ZVNF2(f) SF2 (f) VF2 (f) +#define _ZVND2(f) SD2 (f) VD2 (f) + +#else + +#define _ZVNF1(f) SF1 (f) +#define _ZVND1(f) SD1 (f) +#define _ZVNF2(f) SF2 (f) +#define _ZVND2(f) SD2 (f) + +#endif + +#define _ZSVF1(f) SVF1 (f) ZSVF1 (f) +#define _ZSVF2(f) SVF2 (f) ZSVF2 (f) +#define _ZSVD1(f) SVD1 (f) ZSVD1 (f) +#define _ZSVD2(f) SVD2 (f) ZSVD2 (f) + F1 (acosh) F1 (asinh) F2 (atan2) @@ -23,108 +52,47 @@ D1 (expm1) D1 (log10) D1 (log1p) #if WANT_VMATH -SF1 (asinh) -SF1 (atan) -SD1 (atan) -SF2 (atan2) -SD2 (atan2) -SF1 (cosh) -SF1 (erf) -SD1 (erf) -SF1 (erfc) -SD1 (erfc) -SF1 (expm1) -SD1 (expm1) -SF1 (log10) -SD1 (log10) -SF1 (log1p) -SD1 (log1p) -SF1 (log2) -SD1 (log2) -SF1 (sinh) -SF1 (tan) -#if __aarch64__ -VF1 (asinh) -VF1 (atan) -VD1 (atan) -VF2 (atan2) -VD2 (atan2) -VF1 (cosh) -VF1 (erf) -VD1 (erf) -VF1 (erfc) -VD1 (erfc) -VF1 (expm1) -VD1 (expm1) -VF1 (log10) -VD1 (log10) -VF1 (log1p) -VD1 (log1p) -VF1 (log2) -VD1 (log2) -VF1 (sinh) -VF1 (tan) -#ifdef __vpcs -ZVNF1 (asinh) -ZVNF1 (atan) -ZVND1 (atan) -ZVNF2 (atan2) -ZVND2 (atan2) -ZVNF1 (cosh) -ZVNF1 (erf) -ZVND1 (erf) -ZVNF1 (erfc) -ZVND1 (erfc) -ZVNF1 (expm1) -ZVND1 (expm1) -ZVNF1 (log10) -ZVND1 (log10) -ZVNF1 (log1p) -ZVND1 (log1p) -ZVNF1 (log2) -ZVND1 (log2) -ZVNF1 (sinh) -ZVNF1 (tan) -#endif -#endif +_ZVNF1 (asinh) +_ZVNF1 (atan) +_ZVND1 (atan) +_ZVNF2 (atan2) +_ZVND2 (atan2) +_ZVNF1 (cosh) +_ZVNF1 (erf) +_ZVND1 (erf) +_ZVNF1 (erfc) +_ZVND1 (erfc) +_ZVNF1 (expm1) +_ZVND1 (expm1) +_ZVNF1 (log10) +_ZVND1 (log10) +_ZVNF1 (log1p) +_ZVND1 (log1p) +_ZVNF1 (log2) +_ZVND1 (log2) +_ZVNF1 (sinh) +_ZVNF1 (tan) #if WANT_SVE_MATH -SVF2 (atan2) -ZSVF2 (atan2) -SVD2 (atan2) -ZSVD2 (atan2) -SVF1 (atan) -ZSVF1 (atan) -SVD1 (atan) -ZSVD1 (atan) -SVF1 (cos) -ZSVF1 (cos) -SVD1 (cos) -ZSVD1 (cos) -SVF1 (erf) -ZSVF1 (erf) -SVD1 (erf) -ZSVD1 (erf) -SVD1 (erfc) -ZSVD1 (erfc) -SVF1 (exp) -ZSVF1 (exp) -SVF1 (log) -ZSVF1 (log) -SVD1 (log) -ZSVD1 (log) -SVF1 (log10) -ZSVF1 (log10) -SVD1 (log10) -ZSVD1 (log10) +_ZSVF2 (atan2) +_ZSVD2 (atan2) +_ZSVF1 (atan) +_ZSVD1 (atan) +_ZSVF1 (cos) +_ZSVD1 (cos) +_ZSVF1 (erf) +_ZSVD1 (erf) +_ZSVD1 (erfc) +_ZSVF1 (exp) +_ZSVF1 (log) +_ZSVD1 (log) +_ZSVF1 (log10) +_ZSVD1 (log10) F (__sv_powi, sv_powi, ref_powi, mpfr_powi, 2, 0, d2, 0) F (_ZGVsMxvv_powk, Z_sv_powk, ref_powi, mpfr_powi, 2, 0, d2, 0) F (__sv_powif, sv_powif, ref_powif, mpfr_powi, 2, 1, f2, 0) F (_ZGVsMxvv_powi, Z_sv_powi, ref_powif, mpfr_powi, 2, 1, f2, 0) -SVF1 (sin) -ZSVF1 (sin) -SVD1 (sin) -ZSVD1 (sin) -SVF1 (tan) -ZSVF1 (tan) +_ZSVF1 (sin) +_ZSVD1 (sin) +_ZSVF1 (tan) #endif #endif diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 9c639c1..18dfe13 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -75,10 +75,28 @@ DECL_POW_INT_REF(ref_powi, long double, double, int) #define ZVD1_WRAP(func) static double Z_##func(double x) { return _ZGVnN2v_##func(argd(x))[0]; } #define ZVD2_WRAP(func) static double Z_##func(double x, double y) { return _ZGVnN2vv_##func(argd(x), argd(y))[0]; } -#define ZVNF1_WRAP(func) VNF1_WRAP(func) ZVF1_WRAP(func) -#define ZVNF2_WRAP(func) VNF2_WRAP(func) ZVF2_WRAP(func) -#define ZVND1_WRAP(func) VND1_WRAP(func) ZVD1_WRAP(func) 
-#define ZVND2_WRAP(func) VND2_WRAP(func) ZVD2_WRAP(func) +#ifdef __vpcs + +#define ZVNF1_WRAP(func) VF1_WRAP(func) VNF1_WRAP(func) ZVF1_WRAP(func) +#define ZVNF2_WRAP(func) VF2_WRAP(func) VNF2_WRAP(func) ZVF2_WRAP(func) +#define ZVND1_WRAP(func) VD1_WRAP(func) VND1_WRAP(func) ZVD1_WRAP(func) +#define ZVND2_WRAP(func) VD2_WRAP(func) VND2_WRAP(func) ZVD2_WRAP(func) + +#elif __aarch64__ + +#define ZVNF1_WRAP(func) VF1_WRAP(func) VNF1_WRAP(func) +#define ZVNF2_WRAP(func) VF2_WRAP(func) VNF2_WRAP(func) +#define ZVND1_WRAP(func) VD1_WRAP(func) VND1_WRAP(func) +#define ZVND2_WRAP(func) VD2_WRAP(func) VND2_WRAP(func) + +#else + +#define ZVNF1_WRAP(func) VF1_WRAP(func) +#define ZVNF2_WRAP(func) VF2_WRAP(func) +#define ZVND1_WRAP(func) VD1_WRAP(func) +#define ZVND2_WRAP(func) VD2_WRAP(func) + +#endif #define SVF1_WRAP(func) static float sv_##func##f(float x) { return svretf(__sv_##func##f_x(svargf(x), svptrue_b32())); } #define SVF2_WRAP(func) static float sv_##func##f(float x, float y) { return svretf(__sv_##func##f_x(svargf(x), svargf(y), svptrue_b32())); } @@ -96,28 +114,7 @@ DECL_POW_INT_REF(ref_powi, long double, double, int) #define ZSVND2_WRAP(func) SVD2_WRAP(func) ZSVD2_WRAP(func) /* Wrappers for vector functions. */ -#if __aarch64__ && WANT_VMATH -VF1_WRAP(asinh) -VF1_WRAP(atan) -VF2_WRAP(atan2) -VF1_WRAP(cosh) -VF1_WRAP(erf) -VF1_WRAP(erfc) -VF1_WRAP(expm1) -VF1_WRAP(log10) -VF1_WRAP(log1p) -VF1_WRAP(log2) -VF1_WRAP(sinh) -VF1_WRAP(tan) -VD1_WRAP(atan) -VD2_WRAP(atan2) -VD1_WRAP(erf) -VD1_WRAP(erfc) -VD1_WRAP(expm1) -VD1_WRAP(log10) -VD1_WRAP(log1p) -VD1_WRAP(log2) -#ifdef __vpcs +#if WANT_VMATH ZVNF1_WRAP(asinh) ZVNF1_WRAP(atan) ZVNF2_WRAP(atan2) @@ -138,7 +135,6 @@ ZVND1_WRAP(expm1) ZVND1_WRAP(log10) ZVND1_WRAP(log1p) ZVND1_WRAP(log2) -#endif #if WANT_SVE_MATH ZSVNF2_WRAP(atan2) ZSVNF1_WRAP(atan) -- cgit v1.2.3 From 5b8ac95ba99ec2f903886e88624ca4f0137c274d Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Fri, 11 Nov 2022 09:24:44 +0000 Subject: pl/math: Fix SVE mathbench wrappers These were broken in the previous patch, now fixed. 
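The bug was in the vector ABI names the VZSVF/VZSVD macros emitted
(_ZGVsMsv_ rather than _ZGVsMxv_). For illustration, a minimal compilable
sketch of the expansion as fixed below; SVF here is a hypothetical printf
stand-in (in the real harness it declares a benchmark table entry), but the
token pasting matches the corrected macro:

  #include <stdio.h>

  /* Hypothetical stand-in for the mathbench SVF entry macro. */
  #define SVF(f, a, b) printf ("%s on [%g, %g]\n", #f, (double) (a), (double) (b));
  /* One entry for the internal symbol, one for the vector ABI name
     (_ZGVsMxv_, not the previously emitted _ZGVsMsv_). */
  #define VZSVF(f, a, b) SVF (__sv_##f##_x, a, b) SVF (_ZGVsMxv_##f, a, b)

  int
  main (void)
  {
    VZSVF (atanf, -3.1, 3.1) /* emits __sv_atanf_x and _ZGVsMxv_atanf */
    return 0;
  }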
--- pl/math/test/mathbench_funcs.h | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index d09cdb0..9289f45 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -23,8 +23,8 @@ #endif -#define VZSVF(f, a, b) SVF(__sv_##f##_x, a, b) SVF(_ZGVsMsv_##f, a, b) -#define VZSVD(f, a, b) SVD(__sv_##f##_x, a, b) SVD(_ZGVsMsv_##f, a, b) +#define VZSVF(f, a, b) SVF(__sv_##f##_x, a, b) SVF(_ZGVsMxv_##f, a, b) +#define VZSVD(f, a, b) SVD(__sv_##f##_x, a, b) SVD(_ZGVsMxv_##f, a, b) F (acoshf, 1.0, 10.0) F (asinhf, -10.0, 10.0) @@ -85,25 +85,25 @@ ZVNF (tanf, -3.1, 3.1) {"_ZGVnN2vv_atan2", 'd', 'n', -10.0, 10.0, {.vnd = _Z_atan2_wrap}}, #if WANT_SVE_MATH -ZSVF (atanf, -3.1, 3.1) -ZSVD (atan, -3.1, 3.1) +VZSVF (atanf, -3.1, 3.1) +VZSVD (atan, -3.1, 3.1) {"__sv_atan2f_x", 'f', 'n', -10.0, 10.0, {.svf = __sv_atan2f_wrap}}, {"_ZGVsMxvv_atan2f", 'f', 'n', -10.0, 10.0, {.svf = _Z_sv_atan2f_wrap}}, {"__sv_atan2_x", 'd', 'n', -10.0, 10.0, {.svd = __sv_atan2_wrap}}, {"_ZGVsM2vv_atan2", 'd', 'n', -10.0, 10.0, {.svd = _Z_sv_atan2_wrap}}, -ZSVF (erff, -4.0, 4.0) -ZSVD (erf, -4.0, 4.0) -ZSVD (erfc, -4, 10) -ZSVF (expf, -9.9, 9.9) -ZSVF (cosf, -3.1, 3.1) -ZSVD (cos, -3.1, 3.1) -ZSVF (sinf, -3.1, 3.1) -ZSVD (sin, -3.1, 3.1) -ZSVF (logf, 0.01, 11.1) -ZSVD (log, 0.01, 11.1) -ZSVF (log10f, 0.01, 11.1) -ZSVD (log10, 0.01, 11.1) -ZSVF (tanf, -3.1, 3.1) +VZSVF (erff, -4.0, 4.0) +VZSVD (erf, -4.0, 4.0) +VZSVD (erfc, -4, 10) +VZSVF (expf, -9.9, 9.9) +VZSVF (cosf, -3.1, 3.1) +VZSVD (cos, -3.1, 3.1) +VZSVF (sinf, -3.1, 3.1) +VZSVD (sin, -3.1, 3.1) +VZSVF (logf, 0.01, 11.1) +VZSVD (log, 0.01, 11.1) +VZSVF (log10f, 0.01, 11.1) +VZSVD (log10, 0.01, 11.1) +VZSVF (tanf, -3.1, 3.1) {"__sv_powif_x", 'f', 'n', -10.0, 10.0, {.svf = __sv_powif_wrap}}, {"_ZGVsMxvv_powi", 'f', 'n', -10.0, 10.0, {.svf = _Z_sv_powi_wrap}}, {"__sv_powi_x", 'd', 'n', -10.0, 10.0, {.svd = __sv_powi_wrap}}, -- cgit v1.2.3 From 2348c662121f3d04e48664638ae76628348a5b7e Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Fri, 11 Nov 2022 16:31:27 +0000 Subject: pl/math: Fix minus zero in vector expm1 Extra special-case check. --- pl/math/v_expm1_2u5.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pl/math/v_expm1_2u5.c b/pl/math/v_expm1_2u5.c index 425ad88..7a5818b 100644 --- a/pl/math/v_expm1_2u5.c +++ b/pl/math/v_expm1_2u5.c @@ -61,7 +61,8 @@ v_f64_t V_NAME (expm1) (v_f64_t x) return v_call_f64 (expm1, x, x, v_u64 (-1)); #else /* Large input, NaNs and Infs. */ - v_u64_t special = v_cond_u64 (ax >= SpecialBound); + v_u64_t special + = v_cond_u64 ((ax >= SpecialBound) | (ix == 0x8000000000000000)); #endif /* Reduce argument to smaller range: -- cgit v1.2.3 From 3031a49fc90924206242cca12acfeeada1a98488 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Tue, 15 Nov 2022 09:07:32 +0000 Subject: pl/math: Change conflicting variable names There is collision for math-tests and math-rtests between math/ and pl/math, which can lead to failures if running both concurrently. We rename the pl-specific lists to avoid this. 
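Both Dir.mk fragments previously assigned math-tests and math-rtests with :=,
so whichever fragment make parsed last silently replaced the other's test
list.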
--- pl/math/Dir.mk | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pl/math/Dir.mk b/pl/math/Dir.mk index bb81052..0fe1e67 100644 --- a/pl/math/Dir.mk +++ b/pl/math/Dir.mk @@ -118,14 +118,14 @@ build/pl/include/test/%.h: $(PLM)/test/%.h build/pl/bin/%.sh: $(PLM)/test/%.sh cp $< $@ -math-tests := $(wildcard $(PLM)/test/testcases/directed/*.tst) -math-rtests := $(wildcard $(PLM)/test/testcases/random/*.tst) +pl-math-tests := $(wildcard $(PLM)/test/testcases/directed/*.tst) +pl-math-rtests := $(wildcard $(PLM)/test/testcases/random/*.tst) check-pl/math-test: $(math-tools) - cat $(math-tests) | $(EMULATOR) build/pl/bin/mathtest $(math-testflags) + cat $(pl-math-tests) | $(EMULATOR) build/pl/bin/mathtest $(math-testflags) check-pl/math-rtest: $(math-host-tools) $(math-tools) - cat $(math-rtests) | build/pl/bin/rtest | $(EMULATOR) build/pl/bin/mathtest $(math-testflags) + cat $(pl-math-rtests) | build/pl/bin/rtest | $(EMULATOR) build/pl/bin/mathtest $(math-testflags) check-pl/math-ulp: $(math-tools) WANT_ERRNO=$(WANT_ERRNO) WANT_SVE_MATH=$(WANT_SVE_MATH) ULPFLAGS="$(math-ulpflags)" build/pl/bin/runulp.sh $(EMULATOR) -- cgit v1.2.3 From 538697eeaaba4640dbe85ba439e2de60ff7b2695 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pierre-Cl=C3=A9ment=20Tosi?= Date: Wed, 28 Sep 2022 14:21:53 +0100 Subject: Add mem*, str* functions to baremetal static lib Add more libc helper functions, to be used by baremetal Rust targets. Bug: 255521657 Test: atest vmbase_example.integration_test # used by aosp/2138640 Change-Id: I6ec50bc37d0851c5fd47902f34a25b6178e36ed3 --- Android.bp | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/Android.bp b/Android.bp index ea238fe..4050312 100755 --- a/Android.bp +++ b/Android.bp @@ -151,15 +151,37 @@ cc_library_static { arch: { arm64: { srcs: [ + "string/aarch64/memchr.S", "string/aarch64/memcmp.S", "string/aarch64/memcpy.S", + "string/aarch64/memrchr.S", "string/aarch64/memset.S", + "string/aarch64/stpcpy.S", + "string/aarch64/strchr.S", + "string/aarch64/strchrnul.S", + "string/aarch64/strcmp.S", + "string/aarch64/strcpy.S", + "string/aarch64/strlen.S", + "string/aarch64/strncmp.S", + "string/aarch64/strnlen.S", + "string/aarch64/strrchr.S", ], asflags: [ + "-D__memchr_aarch64=memchr", "-D__memcmp_aarch64=memcmp", "-D__memcpy_aarch64=memcpy", "-D__memmove_aarch64=memmove", + "-D__memrchr_aarch64=memrchr", "-D__memset_aarch64=memset", + "-D__stpcpy_aarch64=stpcpy", + "-D__strchr_aarch64=strchr", + "-D__strchrnul_aarch64=strchrnul", + "-D__strcmp_aarch64=strcmp", + "-D__strcpy_aarch64=strcpy", + "-D__strlen_aarch64=strlen", + "-D__strncmp_aarch64=strncmp", + "-D__strnlen_aarch64=strnlen", + "-D__strrchr_aarch64=strrchr", ], }, }, -- cgit v1.2.3 From 3d7d7fab1c9ea03d23dc8eb1b546c30a9b2ef7fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pierre-Cl=C3=A9ment=20Tosi?= Date: Tue, 15 Nov 2022 11:12:27 +0000 Subject: Android.bp: Change file mode to non-executable The file is a build configuration and shouldn't be executable. 
Test: TH # No change intended Change-Id: I76ab86cd2971160b7f376bdda0d59da36c50a59b --- Android.bp | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 Android.bp diff --git a/Android.bp b/Android.bp old mode 100755 new mode 100644 -- cgit v1.2.3 From 3560d66f2efb0164821681e5006280b2f6dadd07 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Tue, 15 Nov 2022 12:31:04 +0000 Subject: math/test: Allow user to set control element of input vector argf and argd have been designed such that non-special input is tested, optionally followed by a vector with one special lane. To be able to test that vector functions have correct behaviour w.r.t. fenv exceptions, we need to be able to choose a different value for the last lane, as using 1 leads to false negatives when testing a function for which 1 is a special value. We add an option, -c, for the user to provide a different control value. --- math/test/ulp.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/math/test/ulp.c b/math/test/ulp.c index 24185a2..e011ae3 100644 --- a/math/test/ulp.c +++ b/math/test/ulp.c @@ -223,8 +223,9 @@ static int secondcall; #if __aarch64__ && WANT_VMATH typedef __f32x4_t v_float; typedef __f64x2_t v_double; -static const float fv[2] = {1.0f, -INFINITY}; -static const double dv[2] = {1.0, -INFINITY}; +/* First element of fv and dv may be changed by -c argument. */ +static float fv[2] = {1.0f, -INFINITY}; +static double dv[2] = {1.0, -INFINITY}; static inline v_float argf(float x) { return (v_float){x,x,x,fv[secondcall]}; } static inline v_double argd(double x) { return (v_double){x,dv[secondcall]}; } #if WANT_SVE_MATH @@ -633,6 +634,9 @@ usage (void) puts ("-q: quiet."); puts ("-m: use mpfr even if faster method is available."); puts ("-f: disable fenv testing (rounding modes and exceptions)."); + puts ("-c: neutral 'control value' to test behaviour when one lane can affect another. \n" + " This should be different from tested input in other lanes, and non-special \n" + " (i.e. should not trigger fenv exceptions). Default is 1."); puts ("Supported func:"); for (const struct fun *f = fun; f->name; f++) printf ("\t%s\n", f->name); @@ -800,6 +804,12 @@ main (int argc, char *argv[]) conf.rc = argv[0][0]; } break; + case 'c': + argc--; + argv++; + fv[0] = strtof(argv[0], 0); + dv[0] = strtod(argv[0], 0); + break; default: usage (); } -- cgit v1.2.3 From c1cf1eb0ad5fb98c4c14e8e83e00b779d1e646a2 Mon Sep 17 00:00:00 2001 From: Nicholas Dingle Date: Tue, 15 Nov 2022 14:55:06 +0000 Subject: pl/math: Use order-6 polynomial in Vector/Neon log2 Reduce the order of the polynomial used in Neon log2 by one (from 7 to 6). In order to calculate the new coefficients required we rescale the coefficients from log_data.c by log2(e) in extended precision and round back. The maximum observed error is unchanged (2.59 ULPs) but the point at which it is observed has changed slightly. 
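The rescaling itself is mechanical. A minimal sketch, assuming long double
carries enough extra precision (the real coefficients were derived offline,
and the input value below is a placeholder rather than an actual log_data.c
table entry):

  #include <math.h>
  #include <stdio.h>

  int
  main (void)
  {
    /* Placeholder standing in for one natural-log poly coefficient. */
    double ln_coeff = -0x1.0000000000001p-1;
    /* Scale by log2(e) = 1/ln2 in extended precision, round back to double. */
    long double log2e = 1.0L / logl (2.0L);
    printf ("%a\n", (double) (ln_coeff * log2e));
    return 0;
  }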
--- pl/math/math_config.h | 2 +- pl/math/v_log2_3u.c | 7 +++---- pl/math/v_log2_data.c | 6 +++--- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/pl/math/math_config.h b/pl/math/math_config.h index 68b6ee7..7472395 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -489,7 +489,7 @@ extern const struct v_log2f_data } __v_log2f_data HIDDEN; #define V_LOG2_TABLE_BITS 7 -#define V_LOG2_POLY_ORDER 7 +#define V_LOG2_POLY_ORDER 6 extern const struct v_log2_data { double poly[V_LOG2_POLY_ORDER - 1]; diff --git a/pl/math/v_log2_3u.c b/pl/math/v_log2_3u.c index b076874..af7340d 100644 --- a/pl/math/v_log2_3u.c +++ b/pl/math/v_log2_3u.c @@ -46,8 +46,8 @@ specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) /* Double-precision vector log2 routine. Implements the same algorithm as vector log10, with coefficients and table entries scaled in extended precision. The maximum observed error is 2.59 ULP: - __v_log2(0x1.0b556e53a80b6p+0) got 0x1.fffbc594d146bp-5 - want 0x1.fffbc594d146ep-5. */ + __v_log2(0x1.0b555054a9bd1p+0) got 0x1.fff6977bdced3p-5 + want 0x1.fff6977bdced6p-5. */ VPCS_ATTR v_f64_t V_NAME (log2) (v_f64_t x) { @@ -73,10 +73,9 @@ v_f64_t V_NAME (log2) (v_f64_t x) v_f64_t w = v_fma_f64 (r, InvLn2, e.log2c); v_f64_t r2 = r * r; - v_f64_t p_45 = v_fma_f64 (P (5), r, P (4)); v_f64_t p_23 = v_fma_f64 (P (3), r, P (2)); v_f64_t p_01 = v_fma_f64 (P (1), r, P (0)); - v_f64_t y = v_fma_f64 (r2, p_45, p_23); + v_f64_t y = v_fma_f64 (P (4), r2, p_23); y = v_fma_f64 (r2, y, p_01); y = v_fma_f64 (r2, y, kd + w); diff --git a/pl/math/v_log2_data.c b/pl/math/v_log2_data.c index f926d7f..e3c56c1 100644 --- a/pl/math/v_log2_data.c +++ b/pl/math/v_log2_data.c @@ -13,11 +13,11 @@ const struct v_log2_data __v_log2_data = { -/* Derived from the coefficients in log_data.c for N == 128 && LOG_POLY_ORDER == 7. +/* Derived from the coefficients in log_data.c for N == 128 && LOG_POLY_ORDER == 6. Each coefficient was scaled by log2(e) in extended precision and rounded back to double. */ -.poly = { -0x1.71547652b83p-1, 0x1.ec709dc3a03fep-2, -0x1.71547651bb77bp-2, - 0x1.2776c50e7a6a3p-2, -0x1.ec73b0462606bp-3, 0x1.a619832ca8615p-3 }, +.poly = { -0x1.71547652b83p-1, 0x1.ec709dc340953p-2, -0x1.71547651c8f35p-2, + 0x1.2777ebe12dda5p-2, -0x1.ec738d616fe26p-3 }, /* Derived from the table in v_log10_data.c. invc is unchanged. log2(c) was calculated by scaling log10(c) by log2(10) in extended precision and rounding -- cgit v1.2.3 From 0184f5179e0d4e16ff8dc8e2b4c225592f67c935 Mon Sep 17 00:00:00 2001 From: Elliott Hughes Date: Thu, 17 Nov 2022 00:38:06 +0000 Subject: Build the optimized memcpy() and memmove(). Test: treehugger Change-Id: I545117b6b8283f0b5eca2f4591579393720c7960 --- Android.bp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Android.bp b/Android.bp index 4050312..f79e6d6 100644 --- a/Android.bp +++ b/Android.bp @@ -111,6 +111,8 @@ cc_library_static { "string/aarch64/memchr-mte.S", "string/aarch64/memchr.S", "string/aarch64/memcmp.S", + "string/aarch64/memcpy-advsimd.S", + "string/aarch64/memcpy.S", "string/aarch64/memrchr.S", "string/aarch64/stpcpy-mte.S", "string/aarch64/stpcpy.S", -- cgit v1.2.3 From 56a92ffa259a0e77411541a013951a3fa76d7062 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Thu, 17 Nov 2022 10:42:20 +0000 Subject: pl/math: Add scalar and vector/Neon sinh New routines are based on the single-precision versions and are accurate to 3 ULP. 
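The expm1-based reduction used by both routines follows from a short
identity. With t = exp(|x|) - 1:

  t + t / (t + 1) = t * (t + 2) / (t + 1)
                  = (exp(|x|) - 1) * (exp(|x|) + 1) / exp(|x|)
                  = exp(|x|) - exp(-|x|)

so sinh(x) = +-(t + t / (t + 1)) / 2 with the sign of x, which the code below
applies as a multiplication by a +-0.5 halfsign factor.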
--- pl/math/include/mathlib.h | 5 +++ pl/math/s_sinh_3u.c | 6 ++++ pl/math/sinh_3u.c | 55 ++++++++++++++++++++++++++++++++ pl/math/test/mathbench_funcs.h | 2 ++ pl/math/test/runulp.sh | 23 ++++++++++++- pl/math/test/testcases/directed/sinh.tst | 21 ++++++++++++ pl/math/test/ulp_funcs.h | 2 ++ pl/math/test/ulp_wrappers.h | 1 + pl/math/v_sinh_3u.c | 45 ++++++++++++++++++++++++++ pl/math/vn_sinh_3u.c | 12 +++++++ 10 files changed, 171 insertions(+), 1 deletion(-) create mode 100644 pl/math/s_sinh_3u.c create mode 100644 pl/math/sinh_3u.c create mode 100644 pl/math/test/testcases/directed/sinh.tst create mode 100644 pl/math/v_sinh_3u.c create mode 100644 pl/math/vn_sinh_3u.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 9ebe539..96da00a 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -28,6 +28,7 @@ double erfc (double); double expm1 (double); double log10 (double); double log1p (double); +double sinh (double); float __s_asinhf (float); float __s_atanf (float); @@ -50,6 +51,7 @@ double __s_expm1 (double); double __s_log10 (double); double __s_log1p (double); double __s_log2 (double); +double __s_sinh (double); #if __aarch64__ #if __GNUC__ >= 5 @@ -82,6 +84,7 @@ __f64x2_t __v_log1p (__f64x2_t); __f32x4_t __v_log2f (__f32x4_t); __f64x2_t __v_log2 (__f64x2_t); __f32x4_t __v_sinhf (__f32x4_t); +__f64x2_t __v_sinh (__f64x2_t); __f32x4_t __v_tanf (__f32x4_t); #if __GNUC__ >= 9 || __clang_major__ >= 8 @@ -107,6 +110,7 @@ __vpcs __f64x2_t __vn_log1p (__f64x2_t); __vpcs __f32x4_t __vn_log2f (__f32x4_t); __vpcs __f64x2_t __vn_log2 (__f64x2_t); __vpcs __f32x4_t __vn_sinhf (__f32x4_t); +__vpcs __f64x2_t __vn_sinh (__f64x2_t); __vpcs __f32x4_t __vn_tanf (__f32x4_t); /* Vector functions following the vector PCS using ABI names. */ @@ -129,6 +133,7 @@ __vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t); #endif diff --git a/pl/math/s_sinh_3u.c b/pl/math/s_sinh_3u.c new file mode 100644 index 0000000..2c08fa1 --- /dev/null +++ b/pl/math/s_sinh_3u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_sinh_3u.c" diff --git a/pl/math/sinh_3u.c b/pl/math/sinh_3u.c new file mode 100644 index 0000000..ce3ff13 --- /dev/null +++ b/pl/math/sinh_3u.c @@ -0,0 +1,55 @@ +/* + * Double-precision sinh(x) function. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define AbsMask 0x7fffffffffffffff +#define Half 0x3fe0000000000000 +#define OFlowBound \ + 0x40862e42fefa39f0 /* 0x1.62e42fefa39fp+9, above which using expm1 results \ + in NaN. */ + +double +__exp_dd (double, double); + +/* Approximation for double-precision sinh(x) using expm1. + sinh(x) = (exp(x) - exp(-x)) / 2. + The greatest observed error is 2.57 ULP: + __v_sinh(0x1.9fb1d49d1d58bp-2) got 0x1.ab34e59d678dcp-2 + want 0x1.ab34e59d678d9p-2. */ +double +sinh (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t iax = ix & AbsMask; + double ax = asdouble (iax); + uint64_t sign = ix & ~AbsMask; + double halfsign = asdouble (Half | sign); + + if (unlikely (iax >= OFlowBound)) + { + /* Special values and overflow. 
*/ + if (unlikely (iax > 0x7ff0000000000000)) + return __math_invalidf (x); + /* expm1 overflows a little before sinh. We have to fill this + gap by using a different algorithm, in this case we use a + double-precision exp helper. For large x sinh(x) is dominated + by exp(x), however we cannot compute exp without overflow + either. We use the identity: exp(a) = (exp(a / 2)) ^ 2 + to compute sinh(x) ~= (exp(|x| / 2)) ^ 2 / 2 for x > 0 + ~= (exp(|x| / 2)) ^ 2 / -2 for x < 0. */ + double e = __exp_dd (ax / 2, 0); + return (e * halfsign) * e; + } + + /* Use expm1f to retain acceptable precision for small numbers. + Let t = e^(|x|) - 1. */ + double t = expm1 (ax); + /* Then sinh(x) = (t + t / (t + 1)) / 2 for x > 0 + (t + t / (t + 1)) / -2 for x < 0. */ + return (t + t / (t + 1)) * halfsign; +} diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 9289f45..49208de 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -55,6 +55,7 @@ D (log1p, -0.9, 10.0) D (log2, 0.01, 11.1) {"powi", 'd', 0, 0.01, 11.1, {.d = powi_wrap}}, D (sin, -3.1, 3.1) +D (sinh, -10.0, 10.0) #if WANT_VMATH ZVNF (asinhf, -10.0, 10.0) @@ -74,6 +75,7 @@ ZVND (log1p, -0.9, 10.0) ZVNF (log2f, 0.01, 11.1) ZVND (log2, 0.01, 11.1) ZVNF (sinhf, -10.0, 10.0) +ZVND (sinh, -10.0, 10.0) ZVNF (tanf, -3.1, 3.1) {"__s_atan2f", 'f', 0, -10.0, 10.0, {.f = __s_atan2f_wrap}}, {"__s_atan2", 'd', 0, -10.0, 10.0, {.d = __s_atan2_wrap}}, diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index c92892b..8835db1 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -179,7 +179,6 @@ t coshf -0 -0x1p-63 100 t coshf -0 -0x1.5a92d8p+6 80000 t coshf -0x1.5a92d8p+6 -inf 2000 - L=1.68 t expm1 0 0x1p-51 1000 t expm1 -0 -0x1p-51 1000 @@ -188,6 +187,14 @@ t expm1 -0x1p-51 -0x1.740bf7c0d927dp+9 100000 t expm1 0x1.63108c75a1937p+9 inf 100 t expm1 -0x1.740bf7c0d927dp+9 -inf 100 +L=2.08 +t sinh 0 0x1p-51 100 +t sinh -0 -0x1p-51 100 +t sinh 0x1p-51 0x1.62e42fefa39fp+9 100000 +t sinh -0x1p-51 -0x1.62e42fefa39fp+9 100000 +t sinh 0x1.62e42fefa39fp+9 inf 1000 +t sinh -0x1.62e42fefa39fp+9 -inf 1000 + done # vector functions @@ -393,6 +400,15 @@ range_expm1=' -0x1.740bf7c0d927dp+9 -inf 100 ' +range_sinh=' + 0 0x1p-51 100 + -0 -0x1p-51 100 + 0x1p-51 0x1.62e42fefa39fp+9 100000 + -0x1p-51 -0x1.62e42fefa39fp+9 100000 + 0x1.62e42fefa39fp+9 inf 1000 + -0x1.62e42fefa39fp+9 -inf 1000 +' + range_sve_cosf=' 0 0xffff0000 10000 0x1p-4 0x1p4 500000 @@ -556,6 +572,7 @@ L_expm1f=1.02 L_sinhf=1.76 L_coshf=1.89 L_expm1=1.68 +L_sinh=2.08 L_sve_cosf=1.57 L_sve_cos=1.61 @@ -638,6 +655,10 @@ expm1 __s_expm1 $runs fenv expm1 __v_expm1 $runv fenv expm1 __vn_expm1 $runvn fenv expm1 _ZGVnN2v_expm1 $runvn fenv +sinh __s_sinh $runs fenv +sinh __v_sinh $runv fenv +sinh __vn_sinh $runvn fenv +sinh _ZGVnN2v_sinh $runvn fenv atanf __s_atanf $runs atanf __v_atanf $runv diff --git a/pl/math/test/testcases/directed/sinh.tst b/pl/math/test/testcases/directed/sinh.tst new file mode 100644 index 0000000..d8c7d91 --- /dev/null +++ b/pl/math/test/testcases/directed/sinh.tst @@ -0,0 +1,21 @@ +; sinh.tst +; +; Copyright 1999-2022, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=sinh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=sinh op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=sinh op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=sinh op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=sinh op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 +func=sinh op1=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=ox +func=sinh op1=fff00000.00000000 result=fff00000.00000000 errno=0 +func=sinh op1=ffefffff.ffffffff result=fff00000.00000000 errno=ERANGE status=ox +func=sinh op1=00000000.00000000 result=00000000.00000000 errno=0 +func=sinh op1=80000000.00000000 result=80000000.00000000 errno=0 + +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=sinh op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux +func=sinh op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index a6c3866..b3bd940 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -51,6 +51,7 @@ D1 (erfc) D1 (expm1) D1 (log10) D1 (log1p) +D1 (sinh) #if WANT_VMATH _ZVNF1 (asinh) _ZVNF1 (atan) @@ -71,6 +72,7 @@ _ZVND1 (log1p) _ZVNF1 (log2) _ZVND1 (log2) _ZVNF1 (sinh) +_ZVND1 (sinh) _ZVNF1 (tan) #if WANT_SVE_MATH _ZSVF2 (atan2) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 18dfe13..91f8e76 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -135,6 +135,7 @@ ZVND1_WRAP(expm1) ZVND1_WRAP(log10) ZVND1_WRAP(log1p) ZVND1_WRAP(log2) +ZVND1_WRAP(sinh) #if WANT_SVE_MATH ZSVNF2_WRAP(atan2) ZSVNF1_WRAP(atan) diff --git a/pl/math/v_sinh_3u.c b/pl/math/v_sinh_3u.c new file mode 100644 index 0000000..c707364 --- /dev/null +++ b/pl/math/v_sinh_3u.c @@ -0,0 +1,45 @@ +/* + * Double-precision vector sinh(x) function. + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "mathlib.h" + +#define AbsMask 0x7fffffffffffffff +#define Half 0x3fe0000000000000 +#define OFlowBound \ + 0x40862e42fefa39f0 /* 0x1.62e42fefa39fp+9, above which using expm1 results \ + in NaN. */ + +#if V_SUPPORTED + +/* Approximation for vector double-precision sinh(x) using expm1. + sinh(x) = (exp(x) - exp(-x)) / 2. + The greatest observed error is 2.57 ULP: + sinh(0x1.9fb1d49d1d58bp-2) got 0x1.ab34e59d678dcp-2 + want 0x1.ab34e59d678d9p-2. */ +VPCS_ATTR v_f64_t V_NAME (sinh) (v_f64_t x) +{ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t iax = ix & AbsMask; + v_f64_t ax = v_as_f64_u64 (iax); + v_u64_t sign = ix & ~AbsMask; + v_f64_t halfsign = v_as_f64_u64 (sign | Half); + + v_u64_t special = v_cond_u64 (iax >= OFlowBound); + /* Fall back to the scalar variant for all lanes if any of them should trigger + an exception. */ + if (unlikely (v_any_u64 (special))) + return v_call_f64 (sinh, x, x, v_u64 (-1)); + + /* Up to the point that expm1 overflows, we can use it to calculate sinh + using a slight rearrangement of the definition of asinh. This allows us to + retain acceptable accuracy for very small inputs. 
*/ + v_f64_t t = V_NAME (expm1) (ax); + return (t + t / (t + 1)) * halfsign; +} +VPCS_ALIAS + +#endif diff --git a/pl/math/vn_sinh_3u.c b/pl/math/vn_sinh_3u.c new file mode 100644 index 0000000..2b68578 --- /dev/null +++ b/pl/math/vn_sinh_3u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_sinh. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_sinh, _ZGVnN2v_sinh) +#include "v_sinh_3u.c" +#endif -- cgit v1.2.3 From 50fec2627f460a4b30cbfd11f4e4609cdcd2aa4d Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Thu, 17 Nov 2022 10:42:29 +0000 Subject: pl/math: Add scalar & vector/Neon cosh New routines are based on double-precision exp, both accurate to 2 ULP. --- pl/math/cosh_2u.c | 55 ++++++++++++++++++++ pl/math/include/mathlib.h | 5 ++ pl/math/s_cosh_2u.c | 6 +++ pl/math/test/mathbench_funcs.h | 2 + pl/math/test/runulp.sh | 20 ++++++++ pl/math/test/testcases/directed/cosh.tst | 15 ++++++ pl/math/test/ulp_funcs.h | 4 +- pl/math/test/ulp_wrappers.h | 1 + pl/math/v_cosh_2u.c | 86 ++++++++++++++++++++++++++++++++ pl/math/vn_cosh_2u.c | 12 +++++ 10 files changed, 205 insertions(+), 1 deletion(-) create mode 100644 pl/math/cosh_2u.c create mode 100644 pl/math/s_cosh_2u.c create mode 100644 pl/math/test/testcases/directed/cosh.tst create mode 100644 pl/math/v_cosh_2u.c create mode 100644 pl/math/vn_cosh_2u.c diff --git a/pl/math/cosh_2u.c b/pl/math/cosh_2u.c new file mode 100644 index 0000000..7526cdf --- /dev/null +++ b/pl/math/cosh_2u.c @@ -0,0 +1,55 @@ +/* + * Double-precision cosh(x) function. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define AbsMask 0x7fffffffffffffff +#define SpecialBound \ + 0x40861da04cbafe44 /* 0x1.61da04cbafe44p+9, above which exp overflows. */ + +double +__exp_dd (double, double); + +static double +specialcase (double x, uint64_t iax) +{ + if (iax == 0x7ff0000000000000) + return INFINITY; + if (iax > 0x7ff0000000000000) + return __math_invalid (x); + /* exp overflows above SpecialBound. At this magnitude cosh(x) is dominated by + exp(x), so we can approximate cosh(x) by (exp(|x|/2)) ^ 2 / 2. */ + double t = __exp_dd (asdouble (iax) / 2, 0); + return (0.5 * t) * t; +} + +/* Approximation for double-precision cosh(x). + cosh(x) = (exp(x) + exp(-x)) / 2. + The greatest observed error is in the special region, 1.93 ULP: + cosh(0x1.628af341989dap+9) got 0x1.fdf28623ef921p+1021 + want 0x1.fdf28623ef923p+1021. + + The greatest observed error in the non-special region is 1.03 ULP: + cosh(0x1.502cd8e56ab3bp+0) got 0x1.fe54962842d0ep+0 + want 0x1.fe54962842d0fp+0. */ +double +cosh (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t iax = ix & AbsMask; + + /* exp overflows a little bit before cosh, so use special-case handler for the + gap, as well as special values. */ + if (unlikely (iax >= SpecialBound)) + return specialcase (x, iax); + + double ax = asdouble (iax); + /* Use double-precision exp helper to calculate exp(x), then: + cosh(x) = exp(|x|) / 2 + 1 / (exp(|x| * 2). 
*/ + double t = __exp_dd (ax, 0); + return 0.5 * t + 0.5 / t; +} diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 96da00a..e74cd6f 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -24,6 +24,7 @@ float tanf (float); double acosh (double); double asinh (double); double atan2 (double, double); +double cosh (double); double erfc (double); double expm1 (double); double log10 (double); @@ -45,6 +46,7 @@ float __s_tanf (float); double __s_atan (double); double __s_atan2 (double, double); +double __s_cosh (double); double __s_erf (double); double __s_erfc (double); double __s_expm1 (double); @@ -71,6 +73,7 @@ __f64x2_t __v_atan (__f64x2_t); __f32x4_t __v_atan2f (__f32x4_t, __f32x4_t); __f64x2_t __v_atan2 (__f64x2_t, __f64x2_t); __f32x4_t __v_coshf (__f32x4_t); +__f64x2_t __v_cosh (__f64x2_t); __f32x4_t __v_erff (__f32x4_t); __f64x2_t __v_erf (__f64x2_t); __f32x4_t __v_erfcf (__f32x4_t); @@ -97,6 +100,7 @@ __vpcs __f64x2_t __vn_atan (__f64x2_t); __vpcs __f32x4_t __vn_atan2f (__f32x4_t, __f32x4_t); __vpcs __f64x2_t __vn_atan2 (__f64x2_t, __f64x2_t); __vpcs __f32x4_t __vn_coshf (__f32x4_t); +__vpcs __f64x2_t __vn_cosh (__f64x2_t); __vpcs __f32x4_t __vn_erff (__f32x4_t); __vpcs __f64x2_t __vn_erf (__f64x2_t); __vpcs __f32x4_t __vn_erfcf (__f32x4_t); @@ -120,6 +124,7 @@ __vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t); __vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t); __vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t); __vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_erfcf (__f32x4_t); diff --git a/pl/math/s_cosh_2u.c b/pl/math/s_cosh_2u.c new file mode 100644 index 0000000..f9c681c --- /dev/null +++ b/pl/math/s_cosh_2u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_cosh_2u.c" diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 49208de..583c8fb 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -47,6 +47,7 @@ D (asinh, -10.0, 10.0) D (atan, -10.0, 10.0) {"atan2", 'd', 0, -10.0, 10.0, {.d = atan2_wrap}}, D (cos, -3.1, 3.1) +D (cosh, -10.0, 10.0) D (erf, -6,6) D (erfc, -6.0, 28.0) D (expm1, -9.9, 9.9) @@ -62,6 +63,7 @@ ZVNF (asinhf, -10.0, 10.0) ZVNF (atanf, -10.0, 10.0) ZVND (atan, -10.0, 10.0) ZVNF (coshf, -10.0, 10.0) +ZVND (cosh, -10.0, 10.0) ZVNF (erff, -4.0, 4.0) ZVND (erf, -6.0, 6.0) ZVNF (erfcf, -6.0, 28.0) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 8835db1..02a5a97 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -195,6 +195,14 @@ t sinh -0x1p-51 -0x1.62e42fefa39fp+9 100000 t sinh 0x1.62e42fefa39fp+9 inf 1000 t sinh -0x1.62e42fefa39fp+9 -inf 1000 +L=1.43 +t cosh 0 0x1.61da04cbafe44p+9 100000 +t cosh -0 -0x1.61da04cbafe44p+9 100000 +t cosh 0x1.61da04cbafe44p+9 0x1p10 1000 +t cosh -0x1.61da04cbafe44p+9 -0x1p10 1000 +t cosh 0x1p10 inf 100 +t cosh -0x1p10 -inf 100 + done # vector functions @@ -409,6 +417,13 @@ range_sinh=' -0x1.62e42fefa39fp+9 -inf 1000 ' +range_cosh=' + 0 0x1.6p9 100000 + -0 -0x1.6p9 100000 + 0x1.6p9 inf 1000 + -0x1.6p9 -inf 1000 +' + range_sve_cosf=' 0 0xffff0000 10000 0x1p-4 0x1p4 500000 @@ -573,6 +588,7 @@ L_sinhf=1.76 L_coshf=1.89 L_expm1=1.68 L_sinh=2.08 +L_cosh=1.43 L_sve_cosf=1.57 L_sve_cos=1.61 @@ -659,6 +675,10 @@ sinh __s_sinh $runs fenv sinh __v_sinh $runv fenv sinh __vn_sinh $runvn fenv sinh _ZGVnN2v_sinh $runvn fenv +cosh __s_cosh $runs fenv +cosh __v_cosh $runv fenv +cosh __vn_cosh $runvn fenv +cosh _ZGVnN2v_cosh $runvn fenv atanf __s_atanf $runs atanf __v_atanf $runv diff --git a/pl/math/test/testcases/directed/cosh.tst b/pl/math/test/testcases/directed/cosh.tst new file mode 100644 index 0000000..5fdc94b --- /dev/null +++ b/pl/math/test/testcases/directed/cosh.tst @@ -0,0 +1,15 @@ +; cosh.tst +; +; Copyright 1999-2022, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=cosh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=cosh op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=cosh op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=cosh op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=cosh op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 +func=cosh op1=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=ox +func=cosh op1=fff00000.00000000 result=7ff00000.00000000 errno=0 +func=cosh op1=ffefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=ox +func=cosh op1=00000000.00000000 result=3ff00000.00000000 errno=0 +func=cosh op1=80000000.00000000 result=3ff00000.00000000 errno=0 diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index b3bd940..1b674a6 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -43,10 +43,11 @@ F1 (expm1) F1 (log10) F1 (log1p) F1 (sinh) +F1 (tan) D1 (acosh) D1 (asinh) D2 (atan2) -F1 (tan) +D1 (cosh) D1 (erfc) D1 (expm1) D1 (log10) @@ -59,6 +60,7 @@ _ZVND1 (atan) _ZVNF2 (atan2) _ZVND2 (atan2) _ZVNF1 (cosh) +_ZVND1 (cosh) _ZVNF1 (erf) _ZVND1 (erf) _ZVNF1 (erfc) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 91f8e76..a9f2d15 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -129,6 +129,7 @@ ZVNF1_WRAP(sinh) ZVNF1_WRAP(tan) ZVND1_WRAP(atan) ZVND2_WRAP(atan2) +ZVND1_WRAP(cosh) ZVND1_WRAP(erf) ZVND1_WRAP(erfc) ZVND1_WRAP(expm1) diff --git a/pl/math/v_cosh_2u.c b/pl/math/v_cosh_2u.c new file mode 100644 index 0000000..6d1a9ed --- /dev/null +++ b/pl/math/v_cosh_2u.c @@ -0,0 +1,86 @@ +/* + * Double-precision vector cosh(x) function. + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" + +#include "v_exp_tail.h" +#define C1 v_f64 (C1_scal) +#define C2 v_f64 (C2_scal) +#define C3 v_f64 (C3_scal) +#define InvLn2 v_f64 (InvLn2_scal) +#define Ln2hi v_f64 (Ln2hi_scal) +#define Ln2lo v_f64 (Ln2lo_scal) +#define IndexMask v_u64 (IndexMask_scal) +#define Shift v_f64 (Shift_scal) +#define Thres v_f64 (Thres_scal) + +#define AbsMask 0x7fffffffffffffff +#define Half v_f64 (0.5) +#define SpecialBound \ + 0x4086000000000000 /* 0x1.6p9, above which exp overflows. */ + +#if V_SUPPORTED + +static inline v_f64_t +exp_inline (v_f64_t x) +{ + /* Helper for approximating exp(x). Copied from v_exp_tail, with no + special-case handling or tail. */ + + /* n = round(x/(ln2/N)). */ + v_f64_t z = v_fma_f64 (x, InvLn2, Shift); + v_u64_t u = v_as_u64_f64 (z); + v_f64_t n = z - Shift; + + /* r = x - n*ln2/N. */ + v_f64_t r = x; + r = v_fma_f64 (-Ln2hi, n, r); + r = v_fma_f64 (-Ln2lo, n, r); + + v_u64_t e = u << (52 - V_EXP_TAIL_TABLE_BITS); + v_u64_t i = u & IndexMask; + + /* y = tail + exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */ + v_f64_t y = v_fma_f64 (C3, r, C2); + y = v_fma_f64 (y, r, C1); + y = v_fma_f64 (y, r, v_f64 (1)) * r; + + /* s = 2^(n/N). */ + u = v_lookup_u64 (Tab, i); + v_f64_t s = v_as_f64_u64 (u + e); + + return v_fma_f64 (y, s, s); +} + +/* Approximation for vector double-precision cosh(x) using exp_inline. + cosh(x) = (exp(x) + exp(-x)) / 2. + The greatest observed error is in the scalar fall-back region, so is the same + as the scalar routine, 1.93 ULP: + __v_cosh(0x1.628af341989dap+9) got 0x1.fdf28623ef921p+1021 + want 0x1.fdf28623ef923p+1021. 
+ + The greatest observed error in the non-special region is 1.54 ULP: + __v_cosh(0x1.8e205b6ecacf7p+2) got 0x1.f711dcb0c77afp+7 + want 0x1.f711dcb0c77b1p+7. */ +VPCS_ATTR v_f64_t V_NAME (cosh) (v_f64_t x) +{ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t iax = ix & AbsMask; + v_u64_t special = v_cond_u64 (iax > SpecialBound); + + /* If any inputs are special, fall back to scalar for all lanes. */ + if (unlikely (v_any_u64 (special))) + return v_call_f64 (cosh, x, x, v_u64 (-1)); + + v_f64_t ax = v_as_f64_u64 (iax); + /* Up to the point that exp overflows, we can use it to calculate cosh by + exp(|x|) / 2 + 1 / (2 * exp(|x|)). */ + v_f64_t t = exp_inline (ax); + return t * Half + Half / t; +} +VPCS_ALIAS + +#endif diff --git a/pl/math/vn_cosh_2u.c b/pl/math/vn_cosh_2u.c new file mode 100644 index 0000000..5f02efd --- /dev/null +++ b/pl/math/vn_cosh_2u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_cosh. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_cosh, _ZGVnN2v_cosh) +#include "v_cosh_2u.c" +#endif -- cgit v1.2.3 From d16610ab7efa066d49a1cdeb101de32bcae962a0 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Thu, 17 Nov 2022 11:16:42 +0000 Subject: math/test: Fix ulp for non-AArch64 targets fv and dv are only declared under __aarch64__ - for other targets the new -c option should be disabled. --- math/test/ulp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/math/test/ulp.c b/math/test/ulp.c index e011ae3..8589ee9 100644 --- a/math/test/ulp.c +++ b/math/test/ulp.c @@ -634,9 +634,11 @@ usage (void) puts ("-q: quiet."); puts ("-m: use mpfr even if faster method is available."); puts ("-f: disable fenv testing (rounding modes and exceptions)."); +#if __aarch64__ && WANT_VMATH puts ("-c: neutral 'control value' to test behaviour when one lane can affect another. \n" " This should be different from tested input in other lanes, and non-special \n" " (i.e. should not trigger fenv exceptions). Default is 1."); +#endif puts ("Supported func:"); for (const struct fun *f = fun; f->name; f++) printf ("\t%s\n", f->name); @@ -804,12 +806,14 @@ main (int argc, char *argv[]) conf.rc = argv[0][0]; } break; +#if __aarch64__ && WANT_VMATH case 'c': argc--; argv++; fv[0] = strtof(argv[0], 0); dv[0] = strtod(argv[0], 0); break; +#endif default: usage (); } -- cgit v1.2.3 From 8e0a666f7840c745fcb9fe9d977a95c3d13ae246 Mon Sep 17 00:00:00 2001 From: Szabolcs Nagy Date: Thu, 17 Nov 2022 10:32:55 +0000 Subject: string: arm: Fix build failure asmdefs.h ifdef logic was wrong: arm only macro definitions were outside of defined(__arm__). Added some ifdef indentation to make the code more readable. --- string/asmdefs.h | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/string/asmdefs.h b/string/asmdefs.h index a814190..b4e5123 100644 --- a/string/asmdefs.h +++ b/string/asmdefs.h @@ -12,15 +12,12 @@ #define ARM_FNSTART .fnstart #if defined (IS_LEAF) -#define ARM_FNEND \ +# define ARM_FNEND \ .cantunwind \ .fnend #else -#define ARM_FNEND .fnend -# endif -#else -#define ARM_FNSTART -#define ARM_FNEND +# define ARM_FNEND .fnend +#endif /* Check whether leaf function PAC signing has been requested in the -mbranch-protect compile-time option. 
*/ @@ -142,16 +139,16 @@ .endif .endif #if HAVE_PAC_LEAF -#if __ARM_FEATURE_BTI_DEFAULT +# if __ARM_FEATURE_BTI_DEFAULT pacbti ip, lr, sp -#else +# else pac ip, lr, sp -#endif /* __ARM_FEATURE_BTI_DEFAULT */ +# endif /* __ARM_FEATURE_BTI_DEFAULT */ .cfi_register 143, 12 #else -#if __ARM_FEATURE_BTI_DEFAULT +# if __ARM_FEATURE_BTI_DEFAULT bti -#endif /* __ARM_FEATURE_BTI_DEFAULT */ +# endif /* __ARM_FEATURE_BTI_DEFAULT */ #endif /* HAVE_PAC_LEAF */ .if \first != -1 .if \last != \first @@ -459,6 +456,11 @@ .endif .endm +#else /* !defined (__arm__) */ + +#define ARM_FNSTART +#define ARM_FNEND + #endif #if defined(__aarch64__) -- cgit v1.2.3 From 67a49eeffa7bd4d90a4edf54cb9d4d784ab170a7 Mon Sep 17 00:00:00 2001 From: Szabolcs Nagy Date: Thu, 17 Nov 2022 11:13:22 +0000 Subject: string: Add separate asmdefs.h per target The definitions in this header are necessarily target specific, so better to have a separate version in each target directory. --- string/aarch64/__mtag_tag_region.S | 2 +- string/aarch64/__mtag_tag_zero_region.S | 2 +- string/aarch64/asmdefs.h | 83 +++++ string/aarch64/check-arch.S | 2 +- string/aarch64/memchr-mte.S | 2 +- string/aarch64/memchr-sve.S | 2 +- string/aarch64/memchr.S | 2 +- string/aarch64/memcmp-sve.S | 2 +- string/aarch64/memcmp.S | 2 +- string/aarch64/memcpy-advsimd.S | 2 +- string/aarch64/memcpy-sve.S | 2 +- string/aarch64/memcpy.S | 2 +- string/aarch64/memrchr.S | 2 +- string/aarch64/memset.S | 2 +- string/aarch64/strchr-mte.S | 2 +- string/aarch64/strchr-sve.S | 2 +- string/aarch64/strchr.S | 2 +- string/aarch64/strchrnul-mte.S | 2 +- string/aarch64/strchrnul.S | 2 +- string/aarch64/strcmp-sve.S | 2 +- string/aarch64/strcmp.S | 2 +- string/aarch64/strcpy-sve.S | 2 +- string/aarch64/strcpy.S | 2 +- string/aarch64/strlen-mte.S | 2 +- string/aarch64/strlen-sve.S | 2 +- string/aarch64/strlen.S | 2 +- string/aarch64/strncmp-sve.S | 2 +- string/aarch64/strncmp.S | 2 +- string/aarch64/strnlen-sve.S | 2 +- string/aarch64/strnlen.S | 2 +- string/aarch64/strrchr-mte.S | 2 +- string/aarch64/strrchr-sve.S | 2 +- string/aarch64/strrchr.S | 2 +- string/arm/asmdefs.h | 480 +++++++++++++++++++++++++++ string/arm/memchr.S | 2 +- string/arm/memcpy.S | 2 +- string/arm/strcmp-armv6m.S | 2 +- string/arm/strcmp.S | 2 +- string/arm/strlen-armv6t2.S | 2 +- string/asmdefs.h | 556 -------------------------------- 40 files changed, 600 insertions(+), 593 deletions(-) create mode 100644 string/aarch64/asmdefs.h create mode 100644 string/arm/asmdefs.h delete mode 100644 string/asmdefs.h diff --git a/string/aarch64/__mtag_tag_region.S b/string/aarch64/__mtag_tag_region.S index 84b8c94..d9b7b64 100644 --- a/string/aarch64/__mtag_tag_region.S +++ b/string/aarch64/__mtag_tag_region.S @@ -15,7 +15,7 @@ * The memory region may remain untagged if tagging is not enabled. */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_MEMORY_TAGGING diff --git a/string/aarch64/__mtag_tag_zero_region.S b/string/aarch64/__mtag_tag_zero_region.S index 97ae68e..874acf5 100644 --- a/string/aarch64/__mtag_tag_zero_region.S +++ b/string/aarch64/__mtag_tag_zero_region.S @@ -15,7 +15,7 @@ * The memory region may remain untagged if tagging is not enabled. */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_MEMORY_TAGGING diff --git a/string/aarch64/asmdefs.h b/string/aarch64/asmdefs.h new file mode 100644 index 0000000..b5ad6fb --- /dev/null +++ b/string/aarch64/asmdefs.h @@ -0,0 +1,83 @@ +/* + * Macros for asm code. AArch64 version. + * + * Copyright (c) 2019-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef _ASMDEFS_H +#define _ASMDEFS_H + +/* Branch Target Identitication support. */ +#define BTI_C hint 34 +#define BTI_J hint 36 +/* Return address signing support (pac-ret). */ +#define PACIASP hint 25; .cfi_window_save +#define AUTIASP hint 29; .cfi_window_save + +/* GNU_PROPERTY_AARCH64_* macros from elf.h. */ +#define FEATURE_1_AND 0xc0000000 +#define FEATURE_1_BTI 1 +#define FEATURE_1_PAC 2 + +/* Add a NT_GNU_PROPERTY_TYPE_0 note. */ +#define GNU_PROPERTY(type, value) \ + .section .note.gnu.property, "a"; \ + .p2align 3; \ + .word 4; \ + .word 16; \ + .word 5; \ + .asciz "GNU"; \ + .word type; \ + .word 4; \ + .word value; \ + .word 0; \ + .text + +/* If set then the GNU Property Note section will be added to + mark objects to support BTI and PAC-RET. */ +#ifndef WANT_GNU_PROPERTY +#define WANT_GNU_PROPERTY 1 +#endif + +#if WANT_GNU_PROPERTY +/* Add property note with supported features to all asm files. */ +GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) +#endif + +#define ENTRY_ALIGN(name, alignment) \ + .global name; \ + .type name,%function; \ + .align alignment; \ + name: \ + .cfi_startproc; \ + BTI_C; + +#define ENTRY(name) ENTRY_ALIGN(name, 6) + +#define ENTRY_ALIAS(name) \ + .global name; \ + .type name,%function; \ + name: + +#define END(name) \ + .cfi_endproc; \ + .size name, .-name; + +#define L(l) .L ## l + +#ifdef __ILP32__ + /* Sanitize padding bits of pointer arguments as per aapcs64 */ +#define PTR_ARG(n) mov w##n, w##n +#else +#define PTR_ARG(n) +#endif + +#ifdef __ILP32__ + /* Sanitize padding bits of size arguments as per aapcs64 */ +#define SIZE_ARG(n) mov w##n, w##n +#else +#define SIZE_ARG(n) +#endif + +#endif diff --git a/string/aarch64/check-arch.S b/string/aarch64/check-arch.S index 1565465..58f92d9 100644 --- a/string/aarch64/check-arch.S +++ b/string/aarch64/check-arch.S @@ -10,4 +10,4 @@ #endif /* Include for GNU property notes. */ -#include "../asmdefs.h" +#include "asmdefs.h" diff --git a/string/aarch64/memchr-mte.S b/string/aarch64/memchr-mte.S index d4673b3..a2870d3 100644 --- a/string/aarch64/memchr-mte.S +++ b/string/aarch64/memchr-mte.S @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 diff --git a/string/aarch64/memchr-sve.S b/string/aarch64/memchr-sve.S index 820228e..3b358b1 100644 --- a/string/aarch64/memchr-sve.S +++ b/string/aarch64/memchr-sve.S @@ -5,7 +5,7 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S index 5879c1c..53eadf7 100644 --- a/string/aarch64/memchr.S +++ b/string/aarch64/memchr.S @@ -11,7 +11,7 @@ * Neon Available. */ -#include "../asmdefs.h" +#include "asmdefs.h" /* Arguments and results. */ #define srcin x0 diff --git a/string/aarch64/memcmp-sve.S b/string/aarch64/memcmp-sve.S index d29588c..22e6d2c 100644 --- a/string/aarch64/memcmp-sve.S +++ b/string/aarch64/memcmp-sve.S @@ -5,7 +5,7 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S index e19521f..aa180e8 100644 --- a/string/aarch64/memcmp.S +++ b/string/aarch64/memcmp.S @@ -9,7 +9,7 @@ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. 
*/ -#include "../asmdefs.h" +#include "asmdefs.h" #define src1 x0 #define src2 x1 diff --git a/string/aarch64/memcpy-advsimd.S b/string/aarch64/memcpy-advsimd.S index d1368d0..e86d7a3 100644 --- a/string/aarch64/memcpy-advsimd.S +++ b/string/aarch64/memcpy-advsimd.S @@ -11,7 +11,7 @@ * */ -#include "../asmdefs.h" +#include "asmdefs.h" #define dstin x0 #define src x1 diff --git a/string/aarch64/memcpy-sve.S b/string/aarch64/memcpy-sve.S index 66ae896..b82510a 100644 --- a/string/aarch64/memcpy-sve.S +++ b/string/aarch64/memcpy-sve.S @@ -13,7 +13,7 @@ #if __ARM_FEATURE_SVE -#include "../asmdefs.h" +#include "asmdefs.h" #define dstin x0 #define src x1 diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S index 36aaf60..2415bd6 100644 --- a/string/aarch64/memcpy.S +++ b/string/aarch64/memcpy.S @@ -11,7 +11,7 @@ * */ -#include "../asmdefs.h" +#include "asmdefs.h" #define dstin x0 #define src x1 diff --git a/string/aarch64/memrchr.S b/string/aarch64/memrchr.S index 4726618..bee71ef 100644 --- a/string/aarch64/memrchr.S +++ b/string/aarch64/memrchr.S @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S index ad0b0d6..6bbcedf 100644 --- a/string/aarch64/memset.S +++ b/string/aarch64/memset.S @@ -11,7 +11,7 @@ * */ -#include "../asmdefs.h" +#include "asmdefs.h" #define dstin x0 #define val x1 diff --git a/string/aarch64/strchr-mte.S b/string/aarch64/strchr-mte.S index 8840f0d..04f269f 100644 --- a/string/aarch64/strchr-mte.S +++ b/string/aarch64/strchr-mte.S @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 diff --git a/string/aarch64/strchr-sve.S b/string/aarch64/strchr-sve.S index 1b984b9..e18640c 100644 --- a/string/aarch64/strchr-sve.S +++ b/string/aarch64/strchr-sve.S @@ -5,7 +5,7 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S index 3aab56c..a041e57 100644 --- a/string/aarch64/strchr.S +++ b/string/aarch64/strchr.S @@ -11,7 +11,7 @@ * Neon Available. */ -#include "../asmdefs.h" +#include "asmdefs.h" /* Arguments and results. */ #define srcin x0 diff --git a/string/aarch64/strchrnul-mte.S b/string/aarch64/strchrnul-mte.S index b1ac4db..cd67858 100644 --- a/string/aarch64/strchrnul-mte.S +++ b/string/aarch64/strchrnul-mte.S @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S index dc57f5f..c6b295d 100644 --- a/string/aarch64/strchrnul.S +++ b/string/aarch64/strchrnul.S @@ -11,7 +11,7 @@ * Neon Available. */ -#include "../asmdefs.h" +#include "asmdefs.h" /* Arguments and results. */ #define srcin x0 diff --git a/string/aarch64/strcmp-sve.S b/string/aarch64/strcmp-sve.S index 6ce80e3..4c00463 100644 --- a/string/aarch64/strcmp-sve.S +++ b/string/aarch64/strcmp-sve.S @@ -5,7 +5,7 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S index bc1f74e..137a9aa 100644 --- a/string/aarch64/strcmp.S +++ b/string/aarch64/strcmp.S @@ -12,7 +12,7 @@ * MTE compatible. 
*/ -#include "../asmdefs.h" +#include "asmdefs.h" #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f diff --git a/string/aarch64/strcpy-sve.S b/string/aarch64/strcpy-sve.S index 3ce951c..803e603 100644 --- a/string/aarch64/strcpy-sve.S +++ b/string/aarch64/strcpy-sve.S @@ -5,7 +5,7 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S index ba4a7d8..9aca330 100644 --- a/string/aarch64/strcpy.S +++ b/string/aarch64/strcpy.S @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define dstin x0 #define srcin x1 diff --git a/string/aarch64/strlen-mte.S b/string/aarch64/strlen-mte.S index fdb07ae..a83b9b6 100644 --- a/string/aarch64/strlen-mte.S +++ b/string/aarch64/strlen-mte.S @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define result x0 diff --git a/string/aarch64/strlen-sve.S b/string/aarch64/strlen-sve.S index 0fd663f..1171558 100644 --- a/string/aarch64/strlen-sve.S +++ b/string/aarch64/strlen-sve.S @@ -5,7 +5,7 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S index 103fac1..f164322 100644 --- a/string/aarch64/strlen.S +++ b/string/aarch64/strlen.S @@ -11,7 +11,7 @@ * Not MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define len x0 diff --git a/string/aarch64/strncmp-sve.S b/string/aarch64/strncmp-sve.S index 08b9a7e..4a7be2d 100644 --- a/string/aarch64/strncmp-sve.S +++ b/string/aarch64/strncmp-sve.S @@ -5,7 +5,7 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S index 6957d07..128a10c 100644 --- a/string/aarch64/strncmp.S +++ b/string/aarch64/strncmp.S @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f diff --git a/string/aarch64/strnlen-sve.S b/string/aarch64/strnlen-sve.S index ec6f881..498a335 100644 --- a/string/aarch64/strnlen-sve.S +++ b/string/aarch64/strnlen-sve.S @@ -5,7 +5,7 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S index eecfad3..03a4706 100644 --- a/string/aarch64/strnlen.S +++ b/string/aarch64/strnlen.S @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define cntin x1 diff --git a/string/aarch64/strrchr-mte.S b/string/aarch64/strrchr-mte.S index dcee1bf..e05579f 100644 --- a/string/aarch64/strrchr-mte.S +++ b/string/aarch64/strrchr-mte.S @@ -11,7 +11,7 @@ * MTE compatible. 
*/ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 diff --git a/string/aarch64/strrchr-sve.S b/string/aarch64/strrchr-sve.S index f907166..fbcd5ba 100644 --- a/string/aarch64/strrchr-sve.S +++ b/string/aarch64/strrchr-sve.S @@ -5,7 +5,7 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strrchr.S b/string/aarch64/strrchr.S index a1b43ca..8f10c96 100644 --- a/string/aarch64/strrchr.S +++ b/string/aarch64/strrchr.S @@ -11,7 +11,7 @@ * Neon Available. */ -#include "../asmdefs.h" +#include "asmdefs.h" /* Arguments and results. */ #define srcin x0 diff --git a/string/arm/asmdefs.h b/string/arm/asmdefs.h new file mode 100644 index 0000000..a275aa6 --- /dev/null +++ b/string/arm/asmdefs.h @@ -0,0 +1,480 @@ +/* + * Macros for asm code. Arm version. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef _ASMDEFS_H +#define _ASMDEFS_H + +#define ARM_FNSTART .fnstart +#if defined (IS_LEAF) +# define ARM_FNEND \ + .cantunwind \ + .fnend +#else +# define ARM_FNEND .fnend +#endif + +/* Check whether leaf function PAC signing has been requested in the + -mbranch-protect compile-time option. */ +#define LEAF_PROTECT_BIT 2 + +#ifdef __ARM_FEATURE_PAC_DEFAULT +# define HAVE_PAC_LEAF \ + ((__ARM_FEATURE_PAC_DEFAULT & (1 << LEAF_PROTECT_BIT)) && 1) +#else +# define HAVE_PAC_LEAF 0 +#endif + +/* Provide default parameters for PAC-code handling in leaf-functions. */ +#if HAVE_PAC_LEAF +# ifndef PAC_LEAF_PUSH_IP +# define PAC_LEAF_PUSH_IP 1 +# endif +#else /* !HAVE_PAC_LEAF */ +# undef PAC_LEAF_PUSH_IP +# define PAC_LEAF_PUSH_IP 0 +#endif /* HAVE_PAC_LEAF */ + +#define STACK_ALIGN_ENFORCE 0 + +/****************************************************************************** +* Implementation of the prologue and epilogue assembler macros and their +* associated helper functions. +* +* These functions add support for the following: +* +* - M-profile branch target identification (BTI) landing-pads when compiled +* with `-mbranch-protection=bti'. +* - PAC-signing and verification instructions, depending on hardware support +* and whether the PAC-signing of leaf functions has been requested via the +* `-mbranch-protection=pac-ret+leaf' compiler argument. +* - 8-byte stack alignment preservation at function entry, defaulting to the +* value of STACK_ALIGN_ENFORCE. +* +* Notes: +* - Prologue stack alignment is implemented by detecting a push with an odd +* number of registers and prepending a dummy register to the list. +* - If alignment is attempted on a list containing r0, compilation will result +* in an error. +* - If alignment is attempted in a list containing r1, r0 will be prepended to +* the register list and r0 will be restored prior to function return. for +* functions with non-void return types, this will result in the corruption of +* the result register. +* - Stack alignment is enforced via the following helper macro call-chain: +* +* {prologue|epilogue} ->_align8 -> _preprocess_reglist -> +* _preprocess_reglist1 -> {_prologue|_epilogue} +* +* - Debug CFI directives are automatically added to prologues and epilogues, +* assisted by `cfisavelist' and `cfirestorelist', respectively. 
+* +* Arguments: +* prologue +* -------- +* - first - If `last' specified, this serves as start of general-purpose +* register (GPR) range to push onto stack, otherwise represents +* single GPR to push onto stack. If omitted, no GPRs pushed +* onto stack at prologue. +* - last - If given, specifies inclusive upper-bound of GPR range. +* - push_ip - Determines whether IP register is to be pushed to stack at +* prologue. When pac-signing is requested, this holds the +* the pac-key. Either 1 or 0 to push or not push, respectively. +* Default behavior: Set to value of PAC_LEAF_PUSH_IP macro. +* - push_lr - Determines whether to push lr to the stack on function entry. +* Either 1 or 0 to push or not push, respectively. +* - align8 - Whether to enforce alignment. Either 1 or 0, with 1 requesting +* alignment. +* +* epilogue +* -------- +* The epilogue should be called passing the same arguments as those passed to +* the prologue to ensure the stack is not corrupted on function return. +* +* Usage examples: +* +* prologue push_ip=1 -> push {ip} +* epilogue push_ip=1, align8=1 -> pop {r2, ip} +* prologue push_ip=1, push_lr=1 -> push {ip, lr} +* epilogue 1 -> pop {r1} +* prologue 1, align8=1 -> push {r0, r1} +* epilogue 1, push_ip=1 -> pop {r1, ip} +* prologue 1, 4 -> push {r1-r4} +* epilogue 1, 4 push_ip=1 -> pop {r1-r4, ip} +* +******************************************************************************/ + +/* Emit .cfi_restore directives for a consecutive sequence of registers. */ + .macro cfirestorelist first, last + .cfi_restore \last + .if \last-\first + cfirestorelist \first, \last-1 + .endif + .endm + +/* Emit .cfi_offset directives for a consecutive sequence of registers. */ + .macro cfisavelist first, last, index=1 + .cfi_offset \last, -4*(\index) + .if \last-\first + cfisavelist \first, \last-1, \index+1 + .endif + .endm + +.macro _prologue first=-1, last=-1, push_ip=PAC_LEAF_PUSH_IP, push_lr=0 + .if \push_ip & 1 != \push_ip + .error "push_ip may be either 0 or 1" + .endif + .if \push_lr & 1 != \push_lr + .error "push_lr may be either 0 or 1" + .endif + .if \first != -1 + .if \last == -1 + /* Upper-bound not provided: Set upper = lower. */ + _prologue \first, \first, \push_ip, \push_lr + .exitm + .endif + .endif +#if HAVE_PAC_LEAF +# if __ARM_FEATURE_BTI_DEFAULT + pacbti ip, lr, sp +# else + pac ip, lr, sp +# endif /* __ARM_FEATURE_BTI_DEFAULT */ + .cfi_register 143, 12 +#else +# if __ARM_FEATURE_BTI_DEFAULT + bti +# endif /* __ARM_FEATURE_BTI_DEFAULT */ +#endif /* HAVE_PAC_LEAF */ + .if \first != -1 + .if \last != \first + .if \last >= 13 + .error "SP cannot be in the save list" + .endif + .if \push_ip + .if \push_lr + /* Case 1: push register range, ip and lr registers. */ + push {r\first-r\last, ip, lr} + .cfi_adjust_cfa_offset ((\last-\first)+3)*4 + .cfi_offset 14, -4 + .cfi_offset 143, -8 + cfisavelist \first, \last, 3 + .else // !\push_lr + /* Case 2: push register range and ip register. */ + push {r\first-r\last, ip} + .cfi_adjust_cfa_offset ((\last-\first)+2)*4 + .cfi_offset 143, -4 + cfisavelist \first, \last, 2 + .endif + .else // !\push_ip + .if \push_lr + /* Case 3: push register range and lr register. */ + push {r\first-r\last, lr} + .cfi_adjust_cfa_offset ((\last-\first)+2)*4 + .cfi_offset 14, -4 + cfisavelist \first, \last, 2 + .else // !\push_lr + /* Case 4: push register range. 
*/ + push {r\first-r\last} + .cfi_adjust_cfa_offset ((\last-\first)+1)*4 + cfisavelist \first, \last, 1 + .endif + .endif + .else // \last == \first + .if \push_ip + .if \push_lr + /* Case 5: push single GP register plus ip and lr registers. */ + push {r\first, ip, lr} + .cfi_adjust_cfa_offset 12 + .cfi_offset 14, -4 + .cfi_offset 143, -8 + cfisavelist \first, \first, 3 + .else // !\push_lr + /* Case 6: push single GP register plus ip register. */ + push {r\first, ip} + .cfi_adjust_cfa_offset 8 + .cfi_offset 143, -4 + cfisavelist \first, \first, 2 + .endif + .else // !\push_ip + .if \push_lr + /* Case 7: push single GP register plus lr register. */ + push {r\first, lr} + .cfi_adjust_cfa_offset 8 + .cfi_offset 14, -4 + cfisavelist \first, \first, 2 + .else // !\push_lr + /* Case 8: push single GP register. */ + push {r\first} + .cfi_adjust_cfa_offset 4 + cfisavelist \first, \first, 1 + .endif + .endif + .endif + .else // \first == -1 + .if \push_ip + .if \push_lr + /* Case 9: push ip and lr registers. */ + push {ip, lr} + .cfi_adjust_cfa_offset 8 + .cfi_offset 14, -4 + .cfi_offset 143, -8 + .else // !\push_lr + /* Case 10: push ip register. */ + push {ip} + .cfi_adjust_cfa_offset 4 + .cfi_offset 143, -4 + .endif + .else // !\push_ip + .if \push_lr + /* Case 11: push lr register. */ + push {lr} + .cfi_adjust_cfa_offset 4 + .cfi_offset 14, -4 + .endif + .endif + .endif +.endm + +.macro _epilogue first=-1, last=-1, push_ip=PAC_LEAF_PUSH_IP, push_lr=0 + .if \push_ip & 1 != \push_ip + .error "push_ip may be either 0 or 1" + .endif + .if \push_lr & 1 != \push_lr + .error "push_lr may be either 0 or 1" + .endif + .if \first != -1 + .if \last == -1 + /* Upper-bound not provided: Set upper = lower. */ + _epilogue \first, \first, \push_ip, \push_lr + .exitm + .endif + .if \last != \first + .if \last >= 13 + .error "SP cannot be in the save list" + .endif + .if \push_ip + .if \push_lr + /* Case 1: pop register range, ip and lr registers. */ + pop {r\first-r\last, ip, lr} + .cfi_restore 14 + .cfi_register 143, 12 + cfirestorelist \first, \last + .else // !\push_lr + /* Case 2: pop register range and ip register. */ + pop {r\first-r\last, ip} + .cfi_register 143, 12 + cfirestorelist \first, \last + .endif + .else // !\push_ip + .if \push_lr + /* Case 3: pop register range and lr register. */ + pop {r\first-r\last, lr} + .cfi_restore 14 + cfirestorelist \first, \last + .else // !\push_lr + /* Case 4: pop register range. */ + pop {r\first-r\last} + cfirestorelist \first, \last + .endif + .endif + .else // \last == \first + .if \push_ip + .if \push_lr + /* Case 5: pop single GP register plus ip and lr registers. */ + pop {r\first, ip, lr} + .cfi_restore 14 + .cfi_register 143, 12 + cfirestorelist \first, \first + .else // !\push_lr + /* Case 6: pop single GP register plus ip register. */ + pop {r\first, ip} + .cfi_register 143, 12 + cfirestorelist \first, \first + .endif + .else // !\push_ip + .if \push_lr + /* Case 7: pop single GP register plus lr register. */ + pop {r\first, lr} + .cfi_restore 14 + cfirestorelist \first, \first + .else // !\push_lr + /* Case 8: pop single GP register. */ + pop {r\first} + cfirestorelist \first, \first + .endif + .endif + .endif + .else // \first == -1 + .if \push_ip + .if \push_lr + /* Case 9: pop ip and lr registers. */ + pop {ip, lr} + .cfi_restore 14 + .cfi_register 143, 12 + .else // !\push_lr + /* Case 10: pop ip register. */ + pop {ip} + .cfi_register 143, 12 + .endif + .else // !\push_ip + .if \push_lr + /* Case 11: pop lr register. 
*/ + pop {lr} + .cfi_restore 14 + .endif + .endif + .endif +#if HAVE_PAC_LEAF + aut ip, lr, sp +#endif /* HAVE_PAC_LEAF */ + bx lr +.endm + +# clean up expressions in 'last' +.macro _preprocess_reglist1 first:req, last:req, push_ip:req, push_lr:req, reglist_op:req + .if \last == 0 + \reglist_op \first, 0, \push_ip, \push_lr + .elseif \last == 1 + \reglist_op \first, 1, \push_ip, \push_lr + .elseif \last == 2 + \reglist_op \first, 2, \push_ip, \push_lr + .elseif \last == 3 + \reglist_op \first, 3, \push_ip, \push_lr + .elseif \last == 4 + \reglist_op \first, 4, \push_ip, \push_lr + .elseif \last == 5 + \reglist_op \first, 5, \push_ip, \push_lr + .elseif \last == 6 + \reglist_op \first, 6, \push_ip, \push_lr + .elseif \last == 7 + \reglist_op \first, 7, \push_ip, \push_lr + .elseif \last == 8 + \reglist_op \first, 8, \push_ip, \push_lr + .elseif \last == 9 + \reglist_op \first, 9, \push_ip, \push_lr + .elseif \last == 10 + \reglist_op \first, 10, \push_ip, \push_lr + .elseif \last == 11 + \reglist_op \first, 11, \push_ip, \push_lr + .else + .error "last (\last) out of range" + .endif +.endm + +# clean up expressions in 'first' +.macro _preprocess_reglist first:req, last, push_ip=0, push_lr=0, reglist_op:req + .ifb \last + _preprocess_reglist \first \first \push_ip \push_lr + .else + .if \first > \last + .error "last (\last) must be at least as great as first (\first)" + .endif + .if \first == 0 + _preprocess_reglist1 0, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 1 + _preprocess_reglist1 1, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 2 + _preprocess_reglist1 2, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 3 + _preprocess_reglist1 3, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 4 + _preprocess_reglist1 4, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 5 + _preprocess_reglist1 5, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 6 + _preprocess_reglist1 6, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 7 + _preprocess_reglist1 7, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 8 + _preprocess_reglist1 8, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 9 + _preprocess_reglist1 9, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 10 + _preprocess_reglist1 10, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 11 + _preprocess_reglist1 11, \last, \push_ip, \push_lr, \reglist_op + .else + .error "first (\first) out of range" + .endif + .endif +.endm + +.macro _align8 first, last, push_ip=0, push_lr=0, reglist_op=_prologue + .ifb \first + .ifnb \last + .error "can't have last (\last) without specifying first" + .else // \last not blank + .if ((\push_ip + \push_lr) % 2) == 0 + \reglist_op first=-1, last=-1, push_ip=\push_ip, push_lr=\push_lr + .exitm + .else // ((\push_ip + \push_lr) % 2) odd + _align8 2, 2, \push_ip, \push_lr, \reglist_op + .exitm + .endif // ((\push_ip + \push_lr) % 2) == 0 + .endif // .ifnb \last + .endif // .ifb \first + + .ifb \last + _align8 \first, \first, \push_ip, \push_lr, \reglist_op + .else + .if \push_ip & 1 <> \push_ip + .error "push_ip may be 0 or 1" + .endif + .if \push_lr & 1 <> \push_lr + .error "push_lr may be 0 or 1" + .endif + .ifeq (\last - \first + \push_ip + \push_lr) % 2 + .if \first == 0 + .error "Alignment required and first register is r0" + .exitm + .endif + _preprocess_reglist \first-1, \last, \push_ip, \push_lr, \reglist_op + .else + _preprocess_reglist \first \last, \push_ip, \push_lr, \reglist_op + .endif + .endif +.endm + 
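+/* Illustrative note (worked alignment example, not part of the macro
+   API): `prologue 1, 4, push_ip=1, align8=1' would push r1-r4 plus ip,
+   i.e. five registers (20 bytes), breaking 8-byte stack alignment.
+   Since (last - first + push_ip + push_lr) = 4 - 1 + 1 + 0 is even,
+   _align8 prepends r0 via _preprocess_reglist and emits
+   `push {r0-r4, ip}' (24 bytes) instead; the matching epilogue must be
+   passed the same arguments so that `pop {r0-r4, ip}' restores the
+   stack correctly.  */
+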
+.macro prologue first, last, push_ip=PAC_LEAF_PUSH_IP, push_lr=0, align8=STACK_ALIGN_ENFORCE + .if \align8 + _align8 \first, \last, \push_ip, \push_lr, _prologue + .else + _prologue first=\first, last=\last, push_ip=\push_ip, push_lr=\push_lr + .endif +.endm + +.macro epilogue first, last, push_ip=PAC_LEAF_PUSH_IP, push_lr=0, align8=STACK_ALIGN_ENFORCE + .if \align8 + _align8 \first, \last, \push_ip, \push_lr, reglist_op=_epilogue + .else + _epilogue first=\first, last=\last, push_ip=\push_ip, push_lr=\push_lr + .endif +.endm + + +#define ENTRY_ALIGN(name, alignment) \ + .global name; \ + .type name,%function; \ + .align alignment; \ + name: \ + ARM_FNSTART; \ + .cfi_startproc; + +#define ENTRY(name) ENTRY_ALIGN(name, 6) + +#define ENTRY_ALIAS(name) \ + .global name; \ + .type name,%function; \ + name: + +#define END(name) \ + .cfi_endproc; \ + ARM_FNEND; \ + .size name, .-name; + +#define L(l) .L ## l + +#endif diff --git a/string/arm/memchr.S b/string/arm/memchr.S index 125618d..9649e10 100644 --- a/string/arm/memchr.S +++ b/string/arm/memchr.S @@ -36,7 +36,7 @@ #define CHARTSTMASK(c) 1<<(c*8) #endif .thumb -#include "../asmdefs.h" +#include "asmdefs.h" @ --------------------------------------------------------------------------- diff --git a/string/arm/memcpy.S b/string/arm/memcpy.S index 77f4553..c4dfa8a 100644 --- a/string/arm/memcpy.S +++ b/string/arm/memcpy.S @@ -17,7 +17,7 @@ */ -#include "../asmdefs.h" +#include "asmdefs.h" .syntax unified /* This implementation requires ARM state. */ diff --git a/string/arm/strcmp-armv6m.S b/string/arm/strcmp-armv6m.S index 0e49d09..699fa1b 100644 --- a/string/arm/strcmp-armv6m.S +++ b/string/arm/strcmp-armv6m.S @@ -5,7 +5,7 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1 diff --git a/string/arm/strcmp.S b/string/arm/strcmp.S index b01c02e..a69dbff 100644 --- a/string/arm/strcmp.S +++ b/string/arm/strcmp.S @@ -12,7 +12,7 @@ is sufficiently aligned. Use saturating arithmetic to optimize the compares. */ -#include "../asmdefs.h" +#include "asmdefs.h" /* Build Options: STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first diff --git a/string/arm/strlen-armv6t2.S b/string/arm/strlen-armv6t2.S index f06b238..f9f50c0 100644 --- a/string/arm/strlen-armv6t2.S +++ b/string/arm/strlen-armv6t2.S @@ -13,7 +13,7 @@ */ -#include "../asmdefs.h" +#include "asmdefs.h" #ifdef __ARMEB__ #define S2LO lsl diff --git a/string/asmdefs.h b/string/asmdefs.h deleted file mode 100644 index b4e5123..0000000 --- a/string/asmdefs.h +++ /dev/null @@ -1,556 +0,0 @@ -/* - * Macros for asm code. - * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#ifndef _ASMDEFS_H -#define _ASMDEFS_H - -#if defined (__arm__) - -#define ARM_FNSTART .fnstart -#if defined (IS_LEAF) -# define ARM_FNEND \ - .cantunwind \ - .fnend -#else -# define ARM_FNEND .fnend -#endif - -/* Check whether leaf function PAC signing has been requested in the - -mbranch-protect compile-time option. */ -#define LEAF_PROTECT_BIT 2 - -#ifdef __ARM_FEATURE_PAC_DEFAULT -# define HAVE_PAC_LEAF \ - ((__ARM_FEATURE_PAC_DEFAULT & (1 << LEAF_PROTECT_BIT)) && 1) -#else -# define HAVE_PAC_LEAF 0 -#endif - -/* Provide default parameters for PAC-code handling in leaf-functions. 
*/ -#if HAVE_PAC_LEAF -# ifndef PAC_LEAF_PUSH_IP -# define PAC_LEAF_PUSH_IP 1 -# endif -#else /* !HAVE_PAC_LEAF */ -# undef PAC_LEAF_PUSH_IP -# define PAC_LEAF_PUSH_IP 0 -#endif /* HAVE_PAC_LEAF */ - -#define STACK_ALIGN_ENFORCE 0 - -/****************************************************************************** -* Implementation of the prologue and epilogue assembler macros and their -* associated helper functions. -* -* These functions add support for the following: -* -* - M-profile branch target identification (BTI) landing-pads when compiled -* with `-mbranch-protection=bti'. -* - PAC-signing and verification instructions, depending on hardware support -* and whether the PAC-signing of leaf functions has been requested via the -* `-mbranch-protection=pac-ret+leaf' compiler argument. -* - 8-byte stack alignment preservation at function entry, defaulting to the -* value of STACK_ALIGN_ENFORCE. -* -* Notes: -* - Prologue stack alignment is implemented by detecting a push with an odd -* number of registers and prepending a dummy register to the list. -* - If alignment is attempted on a list containing r0, compilation will result -* in an error. -* - If alignment is attempted in a list containing r1, r0 will be prepended to -* the register list and r0 will be restored prior to function return. for -* functions with non-void return types, this will result in the corruption of -* the result register. -* - Stack alignment is enforced via the following helper macro call-chain: -* -* {prologue|epilogue} ->_align8 -> _preprocess_reglist -> -* _preprocess_reglist1 -> {_prologue|_epilogue} -* -* - Debug CFI directives are automatically added to prologues and epilogues, -* assisted by `cfisavelist' and `cfirestorelist', respectively. -* -* Arguments: -* prologue -* -------- -* - first - If `last' specified, this serves as start of general-purpose -* register (GPR) range to push onto stack, otherwise represents -* single GPR to push onto stack. If omitted, no GPRs pushed -* onto stack at prologue. -* - last - If given, specifies inclusive upper-bound of GPR range. -* - push_ip - Determines whether IP register is to be pushed to stack at -* prologue. When pac-signing is requested, this holds the -* the pac-key. Either 1 or 0 to push or not push, respectively. -* Default behavior: Set to value of PAC_LEAF_PUSH_IP macro. -* - push_lr - Determines whether to push lr to the stack on function entry. -* Either 1 or 0 to push or not push, respectively. -* - align8 - Whether to enforce alignment. Either 1 or 0, with 1 requesting -* alignment. -* -* epilogue -* -------- -* The epilogue should be called passing the same arguments as those passed to -* the prologue to ensure the stack is not corrupted on function return. -* -* Usage examples: -* -* prologue push_ip=1 -> push {ip} -* epilogue push_ip=1, align8=1 -> pop {r2, ip} -* prologue push_ip=1, push_lr=1 -> push {ip, lr} -* epilogue 1 -> pop {r1} -* prologue 1, align8=1 -> push {r0, r1} -* epilogue 1, push_ip=1 -> pop {r1, ip} -* prologue 1, 4 -> push {r1-r4} -* epilogue 1, 4 push_ip=1 -> pop {r1-r4, ip} -* -******************************************************************************/ - -/* Emit .cfi_restore directives for a consecutive sequence of registers. */ - .macro cfirestorelist first, last - .cfi_restore \last - .if \last-\first - cfirestorelist \first, \last-1 - .endif - .endm - -/* Emit .cfi_offset directives for a consecutive sequence of registers. 
*/ - .macro cfisavelist first, last, index=1 - .cfi_offset \last, -4*(\index) - .if \last-\first - cfisavelist \first, \last-1, \index+1 - .endif - .endm - -.macro _prologue first=-1, last=-1, push_ip=PAC_LEAF_PUSH_IP, push_lr=0 - .if \push_ip & 1 != \push_ip - .error "push_ip may be either 0 or 1" - .endif - .if \push_lr & 1 != \push_lr - .error "push_lr may be either 0 or 1" - .endif - .if \first != -1 - .if \last == -1 - /* Upper-bound not provided: Set upper = lower. */ - _prologue \first, \first, \push_ip, \push_lr - .exitm - .endif - .endif -#if HAVE_PAC_LEAF -# if __ARM_FEATURE_BTI_DEFAULT - pacbti ip, lr, sp -# else - pac ip, lr, sp -# endif /* __ARM_FEATURE_BTI_DEFAULT */ - .cfi_register 143, 12 -#else -# if __ARM_FEATURE_BTI_DEFAULT - bti -# endif /* __ARM_FEATURE_BTI_DEFAULT */ -#endif /* HAVE_PAC_LEAF */ - .if \first != -1 - .if \last != \first - .if \last >= 13 - .error "SP cannot be in the save list" - .endif - .if \push_ip - .if \push_lr - /* Case 1: push register range, ip and lr registers. */ - push {r\first-r\last, ip, lr} - .cfi_adjust_cfa_offset ((\last-\first)+3)*4 - .cfi_offset 14, -4 - .cfi_offset 143, -8 - cfisavelist \first, \last, 3 - .else // !\push_lr - /* Case 2: push register range and ip register. */ - push {r\first-r\last, ip} - .cfi_adjust_cfa_offset ((\last-\first)+2)*4 - .cfi_offset 143, -4 - cfisavelist \first, \last, 2 - .endif - .else // !\push_ip - .if \push_lr - /* Case 3: push register range and lr register. */ - push {r\first-r\last, lr} - .cfi_adjust_cfa_offset ((\last-\first)+2)*4 - .cfi_offset 14, -4 - cfisavelist \first, \last, 2 - .else // !\push_lr - /* Case 4: push register range. */ - push {r\first-r\last} - .cfi_adjust_cfa_offset ((\last-\first)+1)*4 - cfisavelist \first, \last, 1 - .endif - .endif - .else // \last == \first - .if \push_ip - .if \push_lr - /* Case 5: push single GP register plus ip and lr registers. */ - push {r\first, ip, lr} - .cfi_adjust_cfa_offset 12 - .cfi_offset 14, -4 - .cfi_offset 143, -8 - cfisavelist \first, \first, 3 - .else // !\push_lr - /* Case 6: push single GP register plus ip register. */ - push {r\first, ip} - .cfi_adjust_cfa_offset 8 - .cfi_offset 143, -4 - cfisavelist \first, \first, 2 - .endif - .else // !\push_ip - .if \push_lr - /* Case 7: push single GP register plus lr register. */ - push {r\first, lr} - .cfi_adjust_cfa_offset 8 - .cfi_offset 14, -4 - cfisavelist \first, \first, 2 - .else // !\push_lr - /* Case 8: push single GP register. */ - push {r\first} - .cfi_adjust_cfa_offset 4 - cfisavelist \first, \first, 1 - .endif - .endif - .endif - .else // \first == -1 - .if \push_ip - .if \push_lr - /* Case 9: push ip and lr registers. */ - push {ip, lr} - .cfi_adjust_cfa_offset 8 - .cfi_offset 14, -4 - .cfi_offset 143, -8 - .else // !\push_lr - /* Case 10: push ip register. */ - push {ip} - .cfi_adjust_cfa_offset 4 - .cfi_offset 143, -4 - .endif - .else // !\push_ip - .if \push_lr - /* Case 11: push lr register. */ - push {lr} - .cfi_adjust_cfa_offset 4 - .cfi_offset 14, -4 - .endif - .endif - .endif -.endm - -.macro _epilogue first=-1, last=-1, push_ip=PAC_LEAF_PUSH_IP, push_lr=0 - .if \push_ip & 1 != \push_ip - .error "push_ip may be either 0 or 1" - .endif - .if \push_lr & 1 != \push_lr - .error "push_lr may be either 0 or 1" - .endif - .if \first != -1 - .if \last == -1 - /* Upper-bound not provided: Set upper = lower. 
*/ - _epilogue \first, \first, \push_ip, \push_lr - .exitm - .endif - .if \last != \first - .if \last >= 13 - .error "SP cannot be in the save list" - .endif - .if \push_ip - .if \push_lr - /* Case 1: pop register range, ip and lr registers. */ - pop {r\first-r\last, ip, lr} - .cfi_restore 14 - .cfi_register 143, 12 - cfirestorelist \first, \last - .else // !\push_lr - /* Case 2: pop register range and ip register. */ - pop {r\first-r\last, ip} - .cfi_register 143, 12 - cfirestorelist \first, \last - .endif - .else // !\push_ip - .if \push_lr - /* Case 3: pop register range and lr register. */ - pop {r\first-r\last, lr} - .cfi_restore 14 - cfirestorelist \first, \last - .else // !\push_lr - /* Case 4: pop register range. */ - pop {r\first-r\last} - cfirestorelist \first, \last - .endif - .endif - .else // \last == \first - .if \push_ip - .if \push_lr - /* Case 5: pop single GP register plus ip and lr registers. */ - pop {r\first, ip, lr} - .cfi_restore 14 - .cfi_register 143, 12 - cfirestorelist \first, \first - .else // !\push_lr - /* Case 6: pop single GP register plus ip register. */ - pop {r\first, ip} - .cfi_register 143, 12 - cfirestorelist \first, \first - .endif - .else // !\push_ip - .if \push_lr - /* Case 7: pop single GP register plus lr register. */ - pop {r\first, lr} - .cfi_restore 14 - cfirestorelist \first, \first - .else // !\push_lr - /* Case 8: pop single GP register. */ - pop {r\first} - cfirestorelist \first, \first - .endif - .endif - .endif - .else // \first == -1 - .if \push_ip - .if \push_lr - /* Case 9: pop ip and lr registers. */ - pop {ip, lr} - .cfi_restore 14 - .cfi_register 143, 12 - .else // !\push_lr - /* Case 10: pop ip register. */ - pop {ip} - .cfi_register 143, 12 - .endif - .else // !\push_ip - .if \push_lr - /* Case 11: pop lr register. 
*/ - pop {lr} - .cfi_restore 14 - .endif - .endif - .endif -#if HAVE_PAC_LEAF - aut ip, lr, sp -#endif /* HAVE_PAC_LEAF */ - bx lr -.endm - -# clean up expressions in 'last' -.macro _preprocess_reglist1 first:req, last:req, push_ip:req, push_lr:req, reglist_op:req - .if \last == 0 - \reglist_op \first, 0, \push_ip, \push_lr - .elseif \last == 1 - \reglist_op \first, 1, \push_ip, \push_lr - .elseif \last == 2 - \reglist_op \first, 2, \push_ip, \push_lr - .elseif \last == 3 - \reglist_op \first, 3, \push_ip, \push_lr - .elseif \last == 4 - \reglist_op \first, 4, \push_ip, \push_lr - .elseif \last == 5 - \reglist_op \first, 5, \push_ip, \push_lr - .elseif \last == 6 - \reglist_op \first, 6, \push_ip, \push_lr - .elseif \last == 7 - \reglist_op \first, 7, \push_ip, \push_lr - .elseif \last == 8 - \reglist_op \first, 8, \push_ip, \push_lr - .elseif \last == 9 - \reglist_op \first, 9, \push_ip, \push_lr - .elseif \last == 10 - \reglist_op \first, 10, \push_ip, \push_lr - .elseif \last == 11 - \reglist_op \first, 11, \push_ip, \push_lr - .else - .error "last (\last) out of range" - .endif -.endm - -# clean up expressions in 'first' -.macro _preprocess_reglist first:req, last, push_ip=0, push_lr=0, reglist_op:req - .ifb \last - _preprocess_reglist \first \first \push_ip \push_lr - .else - .if \first > \last - .error "last (\last) must be at least as great as first (\first)" - .endif - .if \first == 0 - _preprocess_reglist1 0, \last, \push_ip, \push_lr, \reglist_op - .elseif \first == 1 - _preprocess_reglist1 1, \last, \push_ip, \push_lr, \reglist_op - .elseif \first == 2 - _preprocess_reglist1 2, \last, \push_ip, \push_lr, \reglist_op - .elseif \first == 3 - _preprocess_reglist1 3, \last, \push_ip, \push_lr, \reglist_op - .elseif \first == 4 - _preprocess_reglist1 4, \last, \push_ip, \push_lr, \reglist_op - .elseif \first == 5 - _preprocess_reglist1 5, \last, \push_ip, \push_lr, \reglist_op - .elseif \first == 6 - _preprocess_reglist1 6, \last, \push_ip, \push_lr, \reglist_op - .elseif \first == 7 - _preprocess_reglist1 7, \last, \push_ip, \push_lr, \reglist_op - .elseif \first == 8 - _preprocess_reglist1 8, \last, \push_ip, \push_lr, \reglist_op - .elseif \first == 9 - _preprocess_reglist1 9, \last, \push_ip, \push_lr, \reglist_op - .elseif \first == 10 - _preprocess_reglist1 10, \last, \push_ip, \push_lr, \reglist_op - .elseif \first == 11 - _preprocess_reglist1 11, \last, \push_ip, \push_lr, \reglist_op - .else - .error "first (\first) out of range" - .endif - .endif -.endm - -.macro _align8 first, last, push_ip=0, push_lr=0, reglist_op=_prologue - .ifb \first - .ifnb \last - .error "can't have last (\last) without specifying first" - .else // \last not blank - .if ((\push_ip + \push_lr) % 2) == 0 - \reglist_op first=-1, last=-1, push_ip=\push_ip, push_lr=\push_lr - .exitm - .else // ((\push_ip + \push_lr) % 2) odd - _align8 2, 2, \push_ip, \push_lr, \reglist_op - .exitm - .endif // ((\push_ip + \push_lr) % 2) == 0 - .endif // .ifnb \last - .endif // .ifb \first - - .ifb \last - _align8 \first, \first, \push_ip, \push_lr, \reglist_op - .else - .if \push_ip & 1 <> \push_ip - .error "push_ip may be 0 or 1" - .endif - .if \push_lr & 1 <> \push_lr - .error "push_lr may be 0 or 1" - .endif - .ifeq (\last - \first + \push_ip + \push_lr) % 2 - .if \first == 0 - .error "Alignment required and first register is r0" - .exitm - .endif - _preprocess_reglist \first-1, \last, \push_ip, \push_lr, \reglist_op - .else - _preprocess_reglist \first \last, \push_ip, \push_lr, \reglist_op - .endif - .endif -.endm - 
-.macro prologue first, last, push_ip=PAC_LEAF_PUSH_IP, push_lr=0, align8=STACK_ALIGN_ENFORCE - .if \align8 - _align8 \first, \last, \push_ip, \push_lr, _prologue - .else - _prologue first=\first, last=\last, push_ip=\push_ip, push_lr=\push_lr - .endif -.endm - -.macro epilogue first, last, push_ip=PAC_LEAF_PUSH_IP, push_lr=0, align8=STACK_ALIGN_ENFORCE - .if \align8 - _align8 \first, \last, \push_ip, \push_lr, reglist_op=_epilogue - .else - _epilogue first=\first, last=\last, push_ip=\push_ip, push_lr=\push_lr - .endif -.endm - -#else /* !defined (__arm__) */ - -#define ARM_FNSTART -#define ARM_FNEND - -#endif - -#if defined(__aarch64__) - -/* Branch Target Identitication support. */ -#define BTI_C hint 34 -#define BTI_J hint 36 -/* Return address signing support (pac-ret). */ -#define PACIASP hint 25; .cfi_window_save -#define AUTIASP hint 29; .cfi_window_save - -/* GNU_PROPERTY_AARCH64_* macros from elf.h. */ -#define FEATURE_1_AND 0xc0000000 -#define FEATURE_1_BTI 1 -#define FEATURE_1_PAC 2 - -/* Add a NT_GNU_PROPERTY_TYPE_0 note. */ -#define GNU_PROPERTY(type, value) \ - .section .note.gnu.property, "a"; \ - .p2align 3; \ - .word 4; \ - .word 16; \ - .word 5; \ - .asciz "GNU"; \ - .word type; \ - .word 4; \ - .word value; \ - .word 0; \ - .text - -/* If set then the GNU Property Note section will be added to - mark objects to support BTI and PAC-RET. */ -#ifndef WANT_GNU_PROPERTY -#define WANT_GNU_PROPERTY 1 -#endif - -#if WANT_GNU_PROPERTY -/* Add property note with supported features to all asm files. */ -GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) -#endif - -#define ENTRY_ALIGN(name, alignment) \ - .global name; \ - .type name,%function; \ - .align alignment; \ - name: \ - ARM_FNSTART; \ - .cfi_startproc; \ - BTI_C; - -#else - -#define END_FILE - -#define ENTRY_ALIGN(name, alignment) \ - .global name; \ - .type name,%function; \ - .align alignment; \ - name: \ - ARM_FNSTART; \ - .cfi_startproc; - -#endif - -#define ENTRY(name) ENTRY_ALIGN(name, 6) - -#define ENTRY_ALIAS(name) \ - .global name; \ - .type name,%function; \ - name: - -#define END(name) \ - .cfi_endproc; \ - ARM_FNEND; \ - .size name, .-name; - -#define L(l) .L ## l - -#ifdef __ILP32__ - /* Sanitize padding bits of pointer arguments as per aapcs64 */ -#define PTR_ARG(n) mov w##n, w##n -#else -#define PTR_ARG(n) -#endif - -#ifdef __ILP32__ - /* Sanitize padding bits of size arguments as per aapcs64 */ -#define SIZE_ARG(n) mov w##n, w##n -#else -#define SIZE_ARG(n) -#endif - -#endif -- cgit v1.2.3 From 044e81634cea7a77b3d0e26ea7408b0ad7d06036 Mon Sep 17 00:00:00 2001 From: Szabolcs Nagy Date: Thu, 17 Nov 2022 11:58:18 +0000 Subject: string: arm: Include asmdefs.h even into empty asm files Currently this is not expected to change behaviour, but if global directives are added in asmdefs.h (like .thumb) those should be in all asm files in case the link ABI is affected. --- string/arm/check-arch.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/string/arm/check-arch.S b/string/arm/check-arch.S index f69e112..b893f32 100644 --- a/string/arm/check-arch.S +++ b/string/arm/check-arch.S @@ -8,3 +8,6 @@ #if !__arm__ # error ARCH setting does not match the compiler. #endif + +/* For attributes that may affect ABI. */ +#include "asmdefs.h" -- cgit v1.2.3 From df9815ceabe700880890ab7baf00ba0d853f3b57 Mon Sep 17 00:00:00 2001 From: Szabolcs Nagy Date: Thu, 17 Nov 2022 11:47:12 +0000 Subject: string: arm: Use /**/ comments in asmdefs.h This is preprocessed asm code, so /**/ style comments are most appropriate. 
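To illustrate the hazard with a hypothetical example (not taken from this
tree): since these sources are run through the C preprocessor, a '#'
comment whose first word collides with a directive name, e.g.

    # include the register-list helpers below

is treated by cpp as a malformed #include line and breaks the build,
whereas /**/ comments avoid that ambiguity entirely.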
--- string/arm/asmdefs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/string/arm/asmdefs.h b/string/arm/asmdefs.h index a275aa6..8f21a78 100644 --- a/string/arm/asmdefs.h +++ b/string/arm/asmdefs.h @@ -333,7 +333,7 @@ bx lr .endm -# clean up expressions in 'last' +/* Clean up expressions in 'last'. */ .macro _preprocess_reglist1 first:req, last:req, push_ip:req, push_lr:req, reglist_op:req .if \last == 0 \reglist_op \first, 0, \push_ip, \push_lr @@ -364,7 +364,7 @@ .endif .endm -# clean up expressions in 'first' +/* Clean up expressions in 'first'. */ .macro _preprocess_reglist first:req, last, push_ip=0, push_lr=0, reglist_op:req .ifb \last _preprocess_reglist \first \first \push_ip \push_lr -- cgit v1.2.3 From 9ace52019cab223febddb3fcd166b6b4ba64081f Mon Sep 17 00:00:00 2001 From: Szabolcs Nagy Date: Thu, 17 Nov 2022 11:55:43 +0000 Subject: string: arm: Refactor ENTRY/END macros The .fnstart/.fnend directives can be inlined now that asmdefs.h is arm specific. --- string/arm/asmdefs.h | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/string/arm/asmdefs.h b/string/arm/asmdefs.h index 8f21a78..e311888 100644 --- a/string/arm/asmdefs.h +++ b/string/arm/asmdefs.h @@ -8,15 +8,6 @@ #ifndef _ASMDEFS_H #define _ASMDEFS_H -#define ARM_FNSTART .fnstart -#if defined (IS_LEAF) -# define ARM_FNEND \ - .cantunwind \ - .fnend -#else -# define ARM_FNEND .fnend -#endif - /* Check whether leaf function PAC signing has been requested in the -mbranch-protect compile-time option. */ #define LEAF_PROTECT_BIT 2 @@ -454,13 +445,12 @@ .endif .endm - #define ENTRY_ALIGN(name, alignment) \ .global name; \ .type name,%function; \ .align alignment; \ name: \ - ARM_FNSTART; \ + .fnstart; \ .cfi_startproc; #define ENTRY(name) ENTRY_ALIGN(name, 6) @@ -470,9 +460,16 @@ .type name,%function; \ name: +#if defined (IS_LEAF) +# define END_UNWIND .cantunwind; +#else +# define END_UNWIND +#endif + #define END(name) \ .cfi_endproc; \ - ARM_FNEND; \ + END_UNWIND \ + .fnend; \ .size name, .-name; #define L(l) .L ## l -- cgit v1.2.3 From 7aea3e8806f1275d473168089b6933a53360eb7d Mon Sep 17 00:00:00 2001 From: Elliott Hughes Date: Thu, 17 Nov 2022 19:29:14 +0000 Subject: Build the optimized memset(). Test: treehugger Change-Id: Ibae0859e9683d10ba53113baeba26f720d44d674 --- Android.bp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Android.bp b/Android.bp index f79e6d6..3df808c 100644 --- a/Android.bp +++ b/Android.bp @@ -114,6 +114,7 @@ cc_library_static { "string/aarch64/memcpy-advsimd.S", "string/aarch64/memcpy.S", "string/aarch64/memrchr.S", + "string/aarch64/memset.S", "string/aarch64/stpcpy-mte.S", "string/aarch64/stpcpy.S", "string/aarch64/strchrnul-mte.S", @@ -134,6 +135,7 @@ cc_library_static { ], asflags: [ "-D__memcmp_aarch64=memcmp", + "-D__memset_aarch64=memset", "-D__memrchr_aarch64=memrchr", "-D__strnlen_aarch64=strnlen", ] -- cgit v1.2.3 From d5844d863d498cb1c43fde4ae3e47f3eeb5d0024 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Tue, 22 Nov 2022 13:40:07 +0000 Subject: pl/math: Add scalar and vector/Neon atanhf Both routines are based on a simplified version of log1pf, and are accurate to 3.1 ULP. Also enabled -c flag from runulp.sh - we need this for atanhf so that we can set the control lane to something other than 1, since atanh(1) is infinite. 
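For reference, the identity both routines build on, as a minimal C sketch
(hypothetical helper name; it calls the standard libm log1pf instead of
the simplified inlined version added by this patch, and omits the
special-case handling for |x| >= 1):

    #include <math.h>

    /* atanh(x) = sign(x) * 0.5 * log1p(2|x| / (1 - |x|)) for |x| < 1.  */
    static float
    atanhf_sketch (float x)
    {
      float ax = fabsf (x);
      return copysignf (0.5f, x) * log1pf ((2 * ax) / (1 - ax));
    }

The patch replaces log1pf with an inlined polynomial evaluation with no
special-case handling; the quoted 3.1 ULP figure is the measured worst
case for that scheme (3.08 ULP in the code comments below).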
--- pl/math/atanhf_3u1.c | 76 ++++++++++++++++++++++++++ pl/math/include/mathlib.h | 5 ++ pl/math/s_atanhf_3u1.c | 6 ++ pl/math/test/mathbench_funcs.h | 2 + pl/math/test/runulp.sh | 30 +++++++++- pl/math/test/testcases/directed/atanhf.tst | 23 ++++++++ pl/math/test/ulp_funcs.h | 2 + pl/math/test/ulp_wrappers.h | 1 + pl/math/v_atanhf_3u1.c | 88 ++++++++++++++++++++++++++++++ pl/math/v_math.h | 10 ++++ pl/math/vn_atanhf_3u1.c | 12 ++++ 11 files changed, 253 insertions(+), 2 deletions(-) create mode 100644 pl/math/atanhf_3u1.c create mode 100644 pl/math/s_atanhf_3u1.c create mode 100644 pl/math/test/testcases/directed/atanhf.tst create mode 100644 pl/math/v_atanhf_3u1.c create mode 100644 pl/math/vn_atanhf_3u1.c diff --git a/pl/math/atanhf_3u1.c b/pl/math/atanhf_3u1.c new file mode 100644 index 0000000..77795c8 --- /dev/null +++ b/pl/math/atanhf_3u1.c @@ -0,0 +1,76 @@ +/* + * Single-precision atanh(x) function. + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "mathlib.h" + +#define AbsMask 0x7fffffff +#define Half 0x3f000000 +#define One 0x3f800000 +#define Four 0x40800000 +#define Ln2 0x1.62e43p-1f +#define TinyBound 0x39800000 /* 0x1p-12, below which atanhf(x) rounds to x. */ + +#define C(i) __log1pf_data.coeffs[i] + +static inline float +eval_poly (float m) +{ + /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme. */ + float p_12 = fmaf (m, C (1), C (0)); + float p_34 = fmaf (m, C (3), C (2)); + float p_56 = fmaf (m, C (5), C (4)); + float p_78 = fmaf (m, C (7), C (6)); + + float m2 = m * m; + float p_02 = fmaf (m2, p_12, m); + float p_36 = fmaf (m2, p_56, p_34); + float p_79 = fmaf (m2, C (8), p_78); + + float m4 = m2 * m2; + float p_06 = fmaf (m4, p_36, p_02); + + return fmaf (m4 * p_79, m4, p_06); +} + +static inline float +log1pf_inline (float x) +{ + /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no + special-case handling. See that file for details of the algorithm. */ + float m = x + 1.0f; + int k = (asuint (m) - 0x3f400000) & 0xff800000; + float s = asfloat (Four - k); + float m_scale = asfloat (asuint (x) - k) + fmaf (0.25f, s, -1.0f); + float p = eval_poly (m_scale); + float scale_back = (float) k * 0x1.0p-23f; + return fmaf (scale_back, Ln2, p); +} + +/* Approximation for single-precision inverse tanh(x), using a simplified + version of log1p. Maximum error is 3.08 ULP: + atanhf(0x1.ff0d5p-5) got 0x1.ffb768p-5 + want 0x1.ffb76ep-5. 
*/ +float +atanhf (float x) +{ + uint32_t ix = asuint (x); + uint32_t iax = ix & AbsMask; + uint32_t sign = ix & ~AbsMask; + + if (unlikely (iax < TinyBound)) + return x; + + if (iax == One) + return __math_divzero (sign); + + if (unlikely (iax > One)) + return __math_invalidf (x); + + float halfsign = asfloat (Half | sign); + float ax = asfloat (iax); + return halfsign * log1pf_inline ((2 * ax) / (1 - ax)); +} diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index e74cd6f..f9faf2f 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -12,6 +12,7 @@ float acoshf (float); float asinhf (float); float atan2f (float, float); +float atanhf (float); float coshf (float); float erfcf (float); float erff (float); @@ -34,6 +35,7 @@ double sinh (double); float __s_asinhf (float); float __s_atanf (float); float __s_atan2f (float, float); +float __s_atanhf (float); float __s_coshf (float); float __s_erfcf (float); float __s_erff (float); @@ -72,6 +74,7 @@ __f32x4_t __v_atanf (__f32x4_t); __f64x2_t __v_atan (__f64x2_t); __f32x4_t __v_atan2f (__f32x4_t, __f32x4_t); __f64x2_t __v_atan2 (__f64x2_t, __f64x2_t); +__f32x4_t __v_atanhf (__f32x4_t); __f32x4_t __v_coshf (__f32x4_t); __f64x2_t __v_cosh (__f64x2_t); __f32x4_t __v_erff (__f32x4_t); @@ -99,6 +102,7 @@ __vpcs __f32x4_t __vn_atanf (__f32x4_t); __vpcs __f64x2_t __vn_atan (__f64x2_t); __vpcs __f32x4_t __vn_atan2f (__f32x4_t, __f32x4_t); __vpcs __f64x2_t __vn_atan2 (__f64x2_t, __f64x2_t); +__vpcs __f32x4_t __vn_atanhf (__f32x4_t); __vpcs __f32x4_t __vn_coshf (__f32x4_t); __vpcs __f64x2_t __vn_cosh (__f64x2_t); __vpcs __f32x4_t __vn_erff (__f32x4_t); @@ -123,6 +127,7 @@ __vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t); __vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t); __vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_atanhf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t); diff --git a/pl/math/s_atanhf_3u1.c b/pl/math/s_atanhf_3u1.c new file mode 100644 index 0000000..9f75962 --- /dev/null +++ b/pl/math/s_atanhf_3u1.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_atanhf_3u1.c" diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 583c8fb..de009a3 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -30,6 +30,7 @@ F (acoshf, 1.0, 10.0) F (asinhf, -10.0, 10.0) F (atanf, -10.0, 10.0) {"atan2f", 'f', 0, -10.0, 10.0, {.f = atan2f_wrap}}, +F (atanhf, -1.0, 1.0) F (cosf, -3.1, 3.1) F (coshf, -10.0, 10.0) F (erfcf, -4.0, 10.0) @@ -61,6 +62,7 @@ D (sinh, -10.0, 10.0) #if WANT_VMATH ZVNF (asinhf, -10.0, 10.0) ZVNF (atanf, -10.0, 10.0) +ZVNF (atanhf, -1.0, 1.0) ZVND (atan, -10.0, 10.0) ZVNF (coshf, -10.0, 10.0) ZVND (cosh, -10.0, 10.0) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 02a5a97..53b4a43 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -203,6 +203,14 @@ t cosh -0x1.61da04cbafe44p+9 -0x1p10 1000 t cosh 0x1p10 inf 100 t cosh -0x1p10 -inf 100 +L=2.59 +t atanhf 0 0x1p-12 500 +t atanhf 0x1p-12 1 200000 +t atanhf 1 inf 1000 +t atanhf -0 -0x1p-12 500 +t atanhf -0x1p-12 -1 200000 +t atanhf -1 -inf 1000 + done # vector functions @@ -424,6 +432,15 @@ range_cosh=' -0x1.6p9 -inf 1000 ' +range_atanhf=' + 0 0x1p-12 500 + 0x1p-12 1 200000 + 1 inf 1000 + -0 -0x1p-12 500 + -0x1p-12 -1 200000 + -1 -inf 1000 +' + range_sve_cosf=' 0 0xffff0000 10000 0x1p-4 0x1p4 500000 @@ -589,6 +606,7 @@ L_coshf=1.89 L_expm1=1.68 L_sinh=2.08 L_cosh=1.43 +L_atanhf=2.59 L_sve_cosf=1.57 L_sve_cos=1.61 @@ -608,7 +626,7 @@ L_sve_erf=1.97 L_sve_tanf=2.7 L_sve_erfc=3.15 -while read G F R D +while read G F R D A do [ "$R" = 1 ] && { [[ $G != sve_* ]] || [ $WANT_SVE_MATH -eq 1 ]; } || continue case "$G" in \#*) continue ;; esac @@ -630,13 +648,17 @@ do if [ $WANT_ERRNO -eq 1 ]; then if [ "$D" = "fenv" ]; then f="" + elif [ "$D" = "nofenv" ]; then + # Need to pass this if you want additional + # arguments but keep fenv checking disabled. + f="-f" elif [ ! -z "$D" ]; then echo "Unrecognised 4th argument: $D" exit 1 fi fi case "$X" in \#*) continue ;; esac - t $f $F $X + t $A $f $F $X done << EOF $range EOF @@ -732,6 +754,10 @@ coshf __s_coshf $runs fenv coshf __v_coshf $runv fenv coshf __vn_coshf $runvn fenv coshf _ZGVnN4v_coshf $runvn fenv +atanhf __s_atanhf $runs fenv -c 0 +atanhf __v_atanhf $runv fenv -c 0 +atanhf __vn_atanhf $runvn fenv -c 0 +atanhf _ZGVnN4v_atanhf $runvn fenv -c 0 sve_cosf __sv_cosf $runsv sve_cosf _ZGVsMxv_cosf $runsv diff --git a/pl/math/test/testcases/directed/atanhf.tst b/pl/math/test/testcases/directed/atanhf.tst new file mode 100644 index 0000000..616b59d --- /dev/null +++ b/pl/math/test/testcases/directed/atanhf.tst @@ -0,0 +1,23 @@ +; atanhf.tst +; +; Copyright 2009-2022, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=atanhf op1=7fc00001 result=7fc00001 errno=0 +func=atanhf op1=ffc00001 result=7fc00001 errno=0 +func=atanhf op1=7f800001 result=7fc00001 errno=0 status=i +func=atanhf op1=ff800001 result=7fc00001 errno=0 status=i +func=atanhf op1=7f800000 result=7fc00001 errno=EDOM status=i +func=atanhf op1=ff800000 result=7fc00001 errno=EDOM status=i +func=atanhf op1=3f800001 result=7fc00001 errno=EDOM status=i +func=atanhf op1=bf800001 result=7fc00001 errno=EDOM status=i +func=atanhf op1=3f800000 result=7f800000 errno=ERANGE status=z +func=atanhf op1=bf800000 result=ff800000 errno=ERANGE status=z +func=atanhf op1=00000000 result=00000000 errno=0 +func=atanhf op1=80000000 result=80000000 errno=0 + +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=atanhf op1=00000001 result=00000001 errno=0 maybestatus=ux +func=atanhf op1=80000001 result=80000001 errno=0 maybestatus=ux diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 1b674a6..84f4182 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -36,6 +36,7 @@ F1 (acosh) F1 (asinh) F2 (atan2) +F1 (atanh) F1 (cosh) F1 (erfc) F1 (erf) @@ -59,6 +60,7 @@ _ZVNF1 (atan) _ZVND1 (atan) _ZVNF2 (atan2) _ZVND2 (atan2) +_ZVNF1 (atanh) _ZVNF1 (cosh) _ZVND1 (cosh) _ZVNF1 (erf) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index a9f2d15..0ee07e9 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -118,6 +118,7 @@ DECL_POW_INT_REF(ref_powi, long double, double, int) ZVNF1_WRAP(asinh) ZVNF1_WRAP(atan) ZVNF2_WRAP(atan2) +ZVNF1_WRAP(atanh) ZVNF1_WRAP(cosh) ZVNF1_WRAP(erf) ZVNF1_WRAP(erfc) diff --git a/pl/math/v_atanhf_3u1.c b/pl/math/v_atanhf_3u1.c new file mode 100644 index 0000000..54dcb9b --- /dev/null +++ b/pl/math/v_atanhf_3u1.c @@ -0,0 +1,88 @@ +/* + * Single-precision vector atanh(x) function. + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "mathlib.h" + +#if V_SUPPORTED + +#define AbsMask 0x7fffffff +#define Half 0x3f000000 +#define One 0x3f800000 +#define Four 0x40800000 +#define Ln2 0x1.62e43p-1f +#define TinyBound 0x39800000 /* 0x1p-12, below which atanhf(x) rounds to x. */ + +#define C(i) v_f32 (__log1pf_data.coeffs[i]) + +static inline v_f32_t +eval_poly (v_f32_t m) +{ + /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme. */ + v_f32_t p_12 = v_fma_f32 (m, C (1), C (0)); + v_f32_t p_34 = v_fma_f32 (m, C (3), C (2)); + v_f32_t p_56 = v_fma_f32 (m, C (5), C (4)); + v_f32_t p_78 = v_fma_f32 (m, C (7), C (6)); + + v_f32_t m2 = m * m; + v_f32_t p_02 = v_fma_f32 (m2, p_12, m); + v_f32_t p_36 = v_fma_f32 (m2, p_56, p_34); + v_f32_t p_79 = v_fma_f32 (m2, C (8), p_78); + + v_f32_t m4 = m2 * m2; + v_f32_t p_06 = v_fma_f32 (m4, p_36, p_02); + + return v_fma_f32 (m4, m4 * p_79, p_06); +} + +static inline v_f32_t +log1pf_inline (v_f32_t x) +{ + /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no + special-case handling. See that file for details of the algorithm. 
*/ + v_f32_t m = x + 1.0f; + v_u32_t k = (v_as_u32_f32 (m) - 0x3f400000) & 0xff800000; + v_f32_t s = v_as_f32_u32 (v_u32 (Four) - k); + v_f32_t m_scale = v_as_f32_u32 (v_as_u32_f32 (x) - k) + + v_fma_f32 (v_f32 (0.25f), s, v_f32 (-1.0f)); + v_f32_t p = eval_poly (m_scale); + v_f32_t scale_back = v_to_f32_u32 (k) * 0x1.0p-23f; + return v_fma_f32 (scale_back, v_f32 (Ln2), p); +} + +/* Approximation for vector single-precision atanh(x) using modified log1p. + The maximum error is 3.08 ULP: + __v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5 + want 0x1.ffcb82p-5. */ +VPCS_ATTR v_f32_t V_NAME (atanhf) (v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_f32_t halfsign + = v_as_f32_u32 (v_bsl_u32 (v_u32 (AbsMask), v_u32 (Half), ix)); + v_u32_t iax = ix & AbsMask; + + v_f32_t ax = v_as_f32_u32 (iax); + +#if WANT_ERRNO + v_u32_t special = v_cond_u32 ((iax >= One) | (iax <= TinyBound)); + /* Side-step special cases by setting those lanes to 0, which will trigger no + exceptions. These will be fixed up later. */ + if (unlikely (v_any_u32 (special))) + ax = v_sel_f32 (special, v_f32 (0), ax); +#else + v_u32_t special = v_cond_u32 (iax >= One); +#endif + + v_f32_t y = halfsign * log1pf_inline ((2 * ax) / (1 - ax)); + + if (unlikely (v_any_u32 (special))) + return v_call_f32 (atanhf, x, y, special); + return y; +} + +VPCS_ALIAS + +#endif diff --git a/pl/math/v_math.h b/pl/math/v_math.h index d4597c8..3ed6244 100644 --- a/pl/math/v_math.h +++ b/pl/math/v_math.h @@ -174,6 +174,11 @@ v_abs_f32 (v_f32_t x) return __builtin_fabsf (x); } static inline v_u32_t +v_bsl_u32 (v_u32_t m, v_u32_t x, v_u32_t y) +{ + return (y & ~m) | (x & m); +} +static inline v_u32_t v_cagt_f32 (v_f32_t x, v_f32_t y) { return fabsf (x) > fabsf (y); @@ -536,6 +541,11 @@ v_abs_f32 (v_f32_t x) return vabsq_f32 (x); } static inline v_u32_t +v_bsl_u32 (v_u32_t m, v_u32_t x, v_u32_t y) +{ + return vbslq_u32 (m, x, y); +} +static inline v_u32_t v_cagt_f32 (v_f32_t x, v_f32_t y) { return vcagtq_f32 (x, y); diff --git a/pl/math/vn_atanhf_3u1.c b/pl/math/vn_atanhf_3u1.c new file mode 100644 index 0000000..d4ad391 --- /dev/null +++ b/pl/math/vn_atanhf_3u1.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_atanhf. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_atanhf, _ZGVnN4v_atanhf) +#include "v_atanhf_3u1.c" +#endif -- cgit v1.2.3 From 6e875e8b27fd103bb41590a580c8ee03ea5d7138 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Tue, 22 Nov 2022 13:40:13 +0000 Subject: pl/math: Add scalar & vector/Neon cbrtf Both routines use the same algorithm - one Newton iteration with the initial guess obtained by a low-order polynomial. Scalar is used as a fallback for subnormal and special cases for the vector routine, which allows vastly simplified argument reduction and reassembly. Both routines accurate to 1.5 ULP. 
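For reference, the Newton step shared by both routines follows from
applying Newton's method to f(y) = y^3 - m, where m is the reduced
mantissa in [0.5, 1.0] and p is the polynomial initial guess:

    y' = y - (y^3 - m) / (3*y^2)
       = (2/3)*y + m / (3*y^2)

which matches the fused form used in the code below,
a = fmaf (TwoThirds, p, m_by_3 / (p * p)); a single iteration suffices
because the initial guess is already close on the reduced interval.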
--- pl/math/cbrtf_1u5.c | 64 ++++++++++++++++++++++ pl/math/cbrtf_data.c | 15 ++++++ pl/math/include/mathlib.h | 5 ++ pl/math/math_config.h | 6 +++ pl/math/s_cbrtf_1u5.c | 6 +++ pl/math/test/mathbench_funcs.h | 2 + pl/math/test/runulp.sh | 14 +++++ pl/math/test/testcases/directed/cbrtf.tst | 29 ++++++++++ pl/math/test/ulp_funcs.h | 2 + pl/math/test/ulp_wrappers.h | 1 + pl/math/tools/cbrtf.sollya | 20 +++++++ pl/math/v_cbrtf_1u5.c | 88 +++++++++++++++++++++++++++++++ pl/math/vn_cbrtf_1u5.c | 12 +++++ 13 files changed, 264 insertions(+) create mode 100644 pl/math/cbrtf_1u5.c create mode 100644 pl/math/cbrtf_data.c create mode 100644 pl/math/s_cbrtf_1u5.c create mode 100644 pl/math/test/testcases/directed/cbrtf.tst create mode 100644 pl/math/tools/cbrtf.sollya create mode 100644 pl/math/v_cbrtf_1u5.c create mode 100644 pl/math/vn_cbrtf_1u5.c diff --git a/pl/math/cbrtf_1u5.c b/pl/math/cbrtf_1u5.c new file mode 100644 index 0000000..73b9049 --- /dev/null +++ b/pl/math/cbrtf_1u5.c @@ -0,0 +1,64 @@ +/* + * Single-precision cbrt(x) function. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include + +#include "math_config.h" + +#define AbsMask 0x7fffffff +#define SignMask 0x80000000 +#define TwoThirds 0x1.555556p-1f + +#define C(i) __cbrtf_data.poly[i] +#define T(i) __cbrtf_data.table[i] + +/* Approximation for single-precision cbrt(x), using low-order polynomial and + one Newton iteration on a reduced interval. Greatest error is 1.5 ULP. This + is observed for every value where the mantissa is 0x1.81410e and the exponent + is a multiple of 3, for example: + cbrtf(0x1.81410ep+30) got 0x1.255d96p+10 + want 0x1.255d92p+10. */ +float +cbrtf (float x) +{ + uint32_t ix = asuint (x); + uint32_t iax = ix & AbsMask; + uint32_t sign = ix & SignMask; + + if (unlikely (iax == 0 || iax == 0x7f800000)) + return x; + + /* |x| = m * 2^e, where m is in [0.5, 1.0]. + We can easily decompose x into m and e using frexpf. */ + int e; + float m = frexpf (asfloat (iax), &e); + + /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is, + the less accurate the next stage of the algorithm needs to be. An order-4 + polynomial is enough for one Newton iteration. */ + float p_01 = fmaf (C (1), m, C (0)); + float p_23 = fmaf (C (3), m, C (2)); + float p = fmaf (m * m, p_23, p_01); + + /* One iteration of Newton's method for iteratively approximating cbrt. */ + float m_by_3 = m / 3; + float a = fmaf (TwoThirds, p, m_by_3 / (p * p)); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + Let t = (2 ^ (e / 3)) / (2 ^ round(e / 3)). + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3. + i is an integer in [-2, 2], so t can be looked up in the table T. + Hence the result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. + Which can be done easily using ldexpf. */ + return asfloat (asuint (ldexpf (a * T (2 + e % 3), e / 3)) | sign); +} diff --git a/pl/math/cbrtf_data.c b/pl/math/cbrtf_data.c new file mode 100644 index 0000000..386a2b4 --- /dev/null +++ b/pl/math/cbrtf_data.c @@ -0,0 +1,15 @@ +/* + * Coefficients and table entries for single-precision cbrt(x). + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +const struct cbrtf_data __cbrtf_data + = {.poly = { /* Coefficients for very rough approximation of cbrt(x) in [0.5, 1]. + See cbrtf.sollya for details of generation. 
*/ + 0x1.c14e96p-2, 0x1.dd2d3p-1, -0x1.08e81ap-1, 0x1.2c74c2p-3}, + .table = { /* table[i] = 2^((i - 2) / 3). */ + 0x1.428a3p-1, 0x1.965feap-1, 0x1p0, 0x1.428a3p0, 0x1.965feap0}}; diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index f9faf2f..7bec5e1 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -13,6 +13,7 @@ float acoshf (float); float asinhf (float); float atan2f (float, float); float atanhf (float); +float cbrtf (float); float coshf (float); float erfcf (float); float erff (float); @@ -36,6 +37,7 @@ float __s_asinhf (float); float __s_atanf (float); float __s_atan2f (float, float); float __s_atanhf (float); +float __s_cbrtf (float); float __s_coshf (float); float __s_erfcf (float); float __s_erff (float); @@ -75,6 +77,7 @@ __f64x2_t __v_atan (__f64x2_t); __f32x4_t __v_atan2f (__f32x4_t, __f32x4_t); __f64x2_t __v_atan2 (__f64x2_t, __f64x2_t); __f32x4_t __v_atanhf (__f32x4_t); +__f32x4_t __v_cbrtf (__f32x4_t); __f32x4_t __v_coshf (__f32x4_t); __f64x2_t __v_cosh (__f64x2_t); __f32x4_t __v_erff (__f32x4_t); @@ -103,6 +106,7 @@ __vpcs __f64x2_t __vn_atan (__f64x2_t); __vpcs __f32x4_t __vn_atan2f (__f32x4_t, __f32x4_t); __vpcs __f64x2_t __vn_atan2 (__f64x2_t, __f64x2_t); __vpcs __f32x4_t __vn_atanhf (__f32x4_t); +__vpcs __f32x4_t __vn_cbrtf (__f32x4_t); __vpcs __f32x4_t __vn_coshf (__f32x4_t); __vpcs __f64x2_t __vn_cosh (__f64x2_t); __vpcs __f32x4_t __vn_erff (__f32x4_t); @@ -128,6 +132,7 @@ __vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t); __vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t); __vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t); __vpcs __f32x4_t _ZGVnN4v_atanhf (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4v_cbrtf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t); diff --git a/pl/math/math_config.h b/pl/math/math_config.h index 7472395..99132a0 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -553,4 +553,10 @@ extern const struct expf_data #define EXPM1_POLY_ORDER 11 extern const double __expm1_poly[EXPM1_POLY_ORDER] HIDDEN; +extern const struct cbrtf_data +{ + float poly[4]; + float table[5]; +} __cbrtf_data HIDDEN; + #endif diff --git a/pl/math/s_cbrtf_1u5.c b/pl/math/s_cbrtf_1u5.c new file mode 100644 index 0000000..d60508e --- /dev/null +++ b/pl/math/s_cbrtf_1u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_cbrtf_1u5.c" diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index de009a3..42dc292 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -31,6 +31,7 @@ F (asinhf, -10.0, 10.0) F (atanf, -10.0, 10.0) {"atan2f", 'f', 0, -10.0, 10.0, {.f = atan2f_wrap}}, F (atanhf, -1.0, 1.0) +F (cbrtf, -10.0, 10.0) F (cosf, -3.1, 3.1) F (coshf, -10.0, 10.0) F (erfcf, -4.0, 10.0) @@ -64,6 +65,7 @@ ZVNF (asinhf, -10.0, 10.0) ZVNF (atanf, -10.0, 10.0) ZVNF (atanhf, -1.0, 1.0) ZVND (atan, -10.0, 10.0) +ZVNF (cbrtf, -10.0, 10.0) ZVNF (coshf, -10.0, 10.0) ZVND (cosh, -10.0, 10.0) ZVNF (erff, -4.0, 4.0) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 53b4a43..6410dd9 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -211,6 +211,10 @@ t atanhf -0 -0x1p-12 500 t atanhf -0x1p-12 -1 200000 t atanhf -1 -inf 1000 +L=1.03 +t cbrtf 0 inf 1000000 +t cbrtf -0 -inf 1000000 + done # vector functions @@ -441,6 +445,11 @@ range_atanhf=' -1 -inf 1000 ' +range_cbrtf=' + 0 inf 1000000 + -0 -inf 1000000 +' + range_sve_cosf=' 0 0xffff0000 10000 0x1p-4 0x1p4 500000 @@ -607,6 +616,7 @@ L_expm1=1.68 L_sinh=2.08 L_cosh=1.43 L_atanhf=2.59 +L_cbrtf=1.03 L_sve_cosf=1.57 L_sve_cos=1.61 @@ -758,6 +768,10 @@ atanhf __s_atanhf $runs fenv -c 0 atanhf __v_atanhf $runv fenv -c 0 atanhf __vn_atanhf $runvn fenv -c 0 atanhf _ZGVnN4v_atanhf $runvn fenv -c 0 +cbrtf __s_cbrtf $runs fenv +cbrtf __v_cbrtf $runv fenv +cbrtf __vn_cbrtf $runvn fenv +cbrtf _ZGVnN4v_cbrtf $runvn fenv sve_cosf __sv_cosf $runsv sve_cosf _ZGVsMxv_cosf $runsv diff --git a/pl/math/test/testcases/directed/cbrtf.tst b/pl/math/test/testcases/directed/cbrtf.tst new file mode 100644 index 0000000..5f8b97f --- /dev/null +++ b/pl/math/test/testcases/directed/cbrtf.tst @@ -0,0 +1,29 @@ +; cbrtf.tst +; +; Copyright 2009-2022, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=cbrtf op1=7f800000 result=7f800000 errno=0 +func=cbrtf op1=ff800000 result=ff800000 errno=0 +func=cbrtf op1=7f800001 result=7fc00001 errno=0 status=i +func=cbrtf op1=7fc00001 result=7fc00001 errno=0 +func=cbrtf op1=00000000 result=00000000 errno=0 +func=cbrtf op1=00000001 result=26a14517.cc7 errno=0 +func=cbrtf op1=00000002 result=26cb2ff5.29f errno=0 +func=cbrtf op1=00000003 result=26e89768.579 errno=0 +func=cbrtf op1=00000004 result=27000000.000 errno=0 +func=cbrtf op1=00400000 result=2a4b2ff5.29f errno=0 +func=cbrtf op1=00800000 result=2a800000.000 errno=0 +func=cbrtf op1=3f800000 result=3f800000.000 errno=0 +func=cbrtf op1=40000000 result=3fa14517.cc7 errno=0 +func=cbrtf op1=7f7fffff result=54cb2ff4.e63 errno=0 +func=cbrtf op1=80000000 result=80000000 errno=0 +func=cbrtf op1=80000001 result=a6a14517.cc7 errno=0 +func=cbrtf op1=80000002 result=a6cb2ff5.29f errno=0 +func=cbrtf op1=80000003 result=a6e89768.579 errno=0 +func=cbrtf op1=80000004 result=a7000000.000 errno=0 +func=cbrtf op1=80400000 result=aa4b2ff5.29f errno=0 +func=cbrtf op1=80800000 result=aa800000.000 errno=0 +func=cbrtf op1=bf800000 result=bf800000.000 errno=0 +func=cbrtf op1=c0000000 result=bfa14517.cc7 errno=0 +func=cbrtf op1=ff7fffff result=d4cb2ff4.e63 errno=0 diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 84f4182..8e41ccf 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -37,6 +37,7 @@ F1 (acosh) F1 (asinh) F2 (atan2) F1 (atanh) +F1 (cbrt) F1 (cosh) F1 (erfc) F1 (erf) @@ -61,6 +62,7 @@ _ZVND1 (atan) _ZVNF2 (atan2) _ZVND2 (atan2) _ZVNF1 (atanh) +_ZVNF1 (cbrt) _ZVNF1 (cosh) _ZVND1 (cosh) _ZVNF1 (erf) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 0ee07e9..fec18a7 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -119,6 +119,7 @@ ZVNF1_WRAP(asinh) ZVNF1_WRAP(atan) ZVNF2_WRAP(atan2) ZVNF1_WRAP(atanh) +ZVNF1_WRAP(cbrt) ZVNF1_WRAP(cosh) ZVNF1_WRAP(erf) ZVNF1_WRAP(erfc) diff --git a/pl/math/tools/cbrtf.sollya b/pl/math/tools/cbrtf.sollya new file mode 100644 index 0000000..9cd1259 --- /dev/null +++ b/pl/math/tools/cbrtf.sollya @@ -0,0 +1,20 @@ +// polynomial for approximating cbrt(x) in single precision +// +// Copyright (c) 2022, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 3; + +a = 0.5; +b = 1; + + +f = x^(1/3); + +poly = fpminimax(f, deg, [|single ...|], [a;b]); + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do round(coeff(poly,i), SG, RN); diff --git a/pl/math/v_cbrtf_1u5.c b/pl/math/v_cbrtf_1u5.c new file mode 100644 index 0000000..fd43051 --- /dev/null +++ b/pl/math/v_cbrtf_1u5.c @@ -0,0 +1,88 @@ +/* + * Single-precision vector cbrt(x) function. + * Copyright (c) 2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "mathlib.h" + +#if V_SUPPORTED + +#define AbsMask 0x7fffffff +#define SignMask v_u32 (0x80000000) +#define TwoThirds v_f32 (0x1.555556p-1f) +#define SmallestNormal 0x00800000 +#define MantissaMask 0x007fffff +#define HalfExp 0x3f000000 + +#define C(i) v_f32 (__cbrtf_data.poly[i]) +#define T(i) v_lookup_f32 (__cbrtf_data.table, i) + +static NOINLINE v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t special) +{ + return v_call_f32 (cbrtf, x, y, special); +} + +/* Approximation for vector single-precision cbrt(x) using Newton iteration with + initial guess obtained by a low-order polynomial. Greatest error is 1.5 ULP. + This is observed for every value where the mantissa is 0x1.81410e and the + exponent is a multiple of 3, for example: + __v_cbrtf(0x1.81410ep+30) got 0x1.255d96p+10 + want 0x1.255d92p+10. */ +VPCS_ATTR v_f32_t V_NAME (cbrtf) (v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t iax = ix & AbsMask; + + /* Subnormal, +/-0 and special values. */ + v_u32_t special = v_cond_u32 ((iax < SmallestNormal) | (iax >= 0x7f800000)); + + /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector + version of frexpf, which gets subnormal values wrong - these have to be + special-cased as a result. */ + v_f32_t m = v_as_f32_u32 ((iax & MantissaMask) | HalfExp); + v_s32_t e = v_as_s32_u32 (iax >> 23) - 126; + + /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is, + the less accurate the next stage of the algorithm needs to be. An order-4 + polynomial is enough for one Newton iteration. */ + v_f32_t p_01 = v_fma_f32 (C (1), m, C (0)); + v_f32_t p_23 = v_fma_f32 (C (3), m, C (2)); + v_f32_t p = v_fma_f32 (m * m, p_23, p_01); + + /* One iteration of Newton's method for iteratively approximating cbrt. */ + v_f32_t m_by_3 = m / 3; + v_f32_t a = v_fma_f32 (TwoThirds, p, m_by_3 / (p * p)); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is + not necessarily a multiple of 3 we lose some information. + + Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q. + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which is + an integer in [-2, 2], and can be looked up in the table T. Hence the + result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */ + + v_s32_t ey = e / 3; + v_f32_t my = a * T (v_as_u32_s32 (e % 3 + 2)); + + /* Vector version of ldexpf. */ + v_f32_t y = v_as_f32_u32 ((v_as_u32_s32 (ey + 127) << 23)) * my; + /* Copy sign. */ + y = v_as_f32_u32 (v_bsl_u32 (SignMask, ix, v_as_u32_f32 (y))); + + if (unlikely (v_any_u32 (special))) + return specialcase (x, y, special); + return y; +} +VPCS_ALIAS + +#endif diff --git a/pl/math/vn_cbrtf_1u5.c b/pl/math/vn_cbrtf_1u5.c new file mode 100644 index 0000000..3452807 --- /dev/null +++ b/pl/math/vn_cbrtf_1u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_cbrtf. + * + * Copyright (c) 2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_cbrtf, _ZGVnN4v_cbrtf) +#include "v_cbrtf_1u5.c" +#endif -- cgit v1.2.3 From 57c26e8cb23de471123872931dc5bb6277acd498 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Thu, 24 Nov 2022 11:34:39 +0000 Subject: pl/math: Update ULP threshold for vector atans New max observed for both Neon and SVE. --- pl/math/sv_atan_2u5.c | 6 +++--- pl/math/test/runulp.sh | 4 ++-- pl/math/v_atan_2u5.c | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pl/math/sv_atan_2u5.c b/pl/math/sv_atan_2u5.c index e0b621f..49c5e82 100644 --- a/pl/math/sv_atan_2u5.c +++ b/pl/math/sv_atan_2u5.c @@ -17,9 +17,9 @@ /* Fast implementation of SVE atan. Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using z=1/x and shift = pi/2. Largest errors are close to 1. The maximum observed - error is 2.22 ulps: - __sv_atan(0x1.0005fd947bf57p+0) got 0x1.9225b2c6cd6cdp-1 - want 0x1.9225b2c6cd6cfp-1. */ + error is 2.27 ulps: + __sv_atan(0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1 + want 0x1.9225645bdd7c3p-1. */ sv_f64_t __sv_atan_x (sv_f64_t x, const svbool_t pg) { diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 6410dd9..dd38e37 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -600,7 +600,7 @@ L_erf=1.26 L_erff=0.76 # TODO tighten this once __v_atan2 is fixed L_atan2=2.9 -L_atan=1.73 +L_atan=1.78 L_atan2f=2.46 L_atanf=2.5 L_log1pf=1.53 @@ -623,7 +623,7 @@ L_sve_cos=1.61 L_sve_sinf=1.40 L_sve_sin=2.03 L_sve_atanf=2.9 -L_sve_atan=1.73 +L_sve_atan=1.78 L_sve_atan2f=2.45 L_sve_atan2=1.73 L_sve_log10=1.97 diff --git a/pl/math/v_atan_2u5.c b/pl/math/v_atan_2u5.c index a0223ed..05c77c0 100644 --- a/pl/math/v_atan_2u5.c +++ b/pl/math/v_atan_2u5.c @@ -15,9 +15,9 @@ /* Fast implementation of vector atan. Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using - z=1/x and shift = pi/2. Maximum observed error is 2.22 ulps: - __v_atan(0x1.0005fd947bf57p+0) got 0x1.9225b2c6cd6cdp-1 - want 0x1.9225b2c6cd6cfp-1. */ + z=1/x and shift = pi/2. Maximum observed error is 2.27 ulps: + __v_atan(0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1 + want 0x1.9225645bdd7c3p-1. */ VPCS_ATTR v_f64_t V_NAME (atan) (v_f64_t x) { -- cgit v1.2.3 From 8a0d24f8af39bda7b54341067c6ccc8a7f12ff27 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Tue, 29 Nov 2022 10:31:10 +0000 Subject: pl/math: Add vector/Neon asinh New routine uses two separate algorithms for input greater and less than 1 (similar to the scalar routine). It is accurate to 2.5 ULP. 
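As a rough scalar sketch of the scheme (illustrative only: asinh_poly is a
hypothetical stand-in for the shared polynomial P, and the committed code
below is the vectorised, errno-aware version):

    #include <math.h>

    /* Sketch of the two-algorithm split; asinh_poly is a placeholder for
       the polynomial P shared with the scalar routine.  */
    static double
    asinh_sketch (double x)
    {
      double ax = fabs (x);
      double y = ax >= 1.0
		   /* Large: asinh(x) = log(x + sqrt(x^2 + 1)).  */
		   ? log (ax + sqrt (ax * ax + 1))
		   /* Small: asinh(x) = x + x^3 * P(x^2).  */
		   : ax + (ax * ax * ax) * asinh_poly (ax * ax);
      return copysign (y, x);
    }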
--- pl/math/include/mathlib.h | 4 + pl/math/s_asinh_2u5.c | 6 ++ pl/math/test/mathbench_funcs.h | 1 + pl/math/test/runulp.sh | 24 ++++++ pl/math/test/ulp_funcs.h | 1 + pl/math/test/ulp_wrappers.h | 1 + pl/math/v_asinh_2u5.c | 179 +++++++++++++++++++++++++++++++++++++++++ pl/math/v_math.h | 20 +++++ pl/math/vn_asinh_2u5.c | 12 +++ 9 files changed, 248 insertions(+) create mode 100644 pl/math/s_asinh_2u5.c create mode 100644 pl/math/v_asinh_2u5.c create mode 100644 pl/math/vn_asinh_2u5.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 7bec5e1..6721d45 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -48,6 +48,7 @@ float __s_log2f (float); float __s_sinhf (float); float __s_tanf (float); +double __s_asinh (double); double __s_atan (double); double __s_atan2 (double, double); double __s_cosh (double); @@ -72,6 +73,7 @@ typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; /* Vector functions following the base PCS. */ __f32x4_t __v_asinhf (__f32x4_t); +__f64x2_t __v_asinh (__f64x2_t); __f32x4_t __v_atanf (__f32x4_t); __f64x2_t __v_atan (__f64x2_t); __f32x4_t __v_atan2f (__f32x4_t, __f32x4_t); @@ -101,6 +103,7 @@ __f32x4_t __v_tanf (__f32x4_t); /* Vector functions following the vector PCS. */ __vpcs __f32x4_t __vn_asinhf (__f32x4_t); +__vpcs __f64x2_t __vn_asinh (__f64x2_t); __vpcs __f32x4_t __vn_atanf (__f32x4_t); __vpcs __f64x2_t __vn_atan (__f64x2_t); __vpcs __f32x4_t __vn_atan2f (__f32x4_t, __f32x4_t); @@ -127,6 +130,7 @@ __vpcs __f32x4_t __vn_tanf (__f32x4_t); /* Vector functions following the vector PCS using ABI names. */ __vpcs __f32x4_t _ZGVnN4v_asinhf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_asinh (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t); __vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t); diff --git a/pl/math/s_asinh_2u5.c b/pl/math/s_asinh_2u5.c new file mode 100644 index 0000000..6da30bd --- /dev/null +++ b/pl/math/s_asinh_2u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_asinh_2u5.c" diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index 42dc292..b972f83 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -62,6 +62,7 @@ D (sinh, -10.0, 10.0) #if WANT_VMATH ZVNF (asinhf, -10.0, 10.0) +ZVND (asinh, -10.0, 10.0) ZVNF (atanf, -10.0, 10.0) ZVNF (atanhf, -1.0, 1.0) ZVND (atan, -10.0, 10.0) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index dd38e37..5d0188d 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -450,6 +450,17 @@ range_cbrtf=' -0 -inf 1000000 ' +range_asinh=' + 0 0x1p-26 50000 + 0x1p-26 1 50000 + 1 0x1p511 50000 + 0x1p511 inf 40000 + -0 -0x1p-26 50000 + -0x1p-26 -1 50000 + -1 -0x1p511 50000 + -0x1p511 -inf 40000 +' + range_sve_cosf=' 0 0xffff0000 10000 0x1p-4 0x1p4 500000 @@ -617,6 +628,7 @@ L_sinh=2.08 L_cosh=1.43 L_atanhf=2.59 L_cbrtf=1.03 +L_asinh=1.54 L_sve_cosf=1.57 L_sve_cos=1.61 @@ -772,6 +784,18 @@ cbrtf __s_cbrtf $runs fenv cbrtf __v_cbrtf $runv fenv cbrtf __vn_cbrtf $runvn fenv cbrtf _ZGVnN4v_cbrtf $runvn fenv +asinh __s_asinh $runs fenv +# Test vector asinh 3 times, with control lane < 1, > 1 and special. +# Ensures the v_sel is choosing the right option in all cases. 
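+# (The -c value appears to be used for the control lanes, i.e. the lanes
+# not under test, so each run below forces the whole vector down one of
+# the three paths.)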
+asinh __v_asinh $runv fenv -c 0.5 +asinh __vn_asinh $runvn fenv -c 0.5 +asinh _ZGVnN2v_asinh $runvn fenv -c 0.5 +asinh __v_asinh $runv fenv -c 2 +asinh __vn_asinh $runvn fenv -c 2 +asinh _ZGVnN2v_asinh $runvn fenv -c 2 +asinh __v_asinh $runv fenv -c 0x1p600 +asinh __vn_asinh $runvn fenv -c 0x1p600 +asinh _ZGVnN2v_asinh $runvn fenv -c 0x1p600 sve_cosf __sv_cosf $runsv sve_cosf _ZGVsMxv_cosf $runsv diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 8e41ccf..465630d 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -57,6 +57,7 @@ D1 (log1p) D1 (sinh) #if WANT_VMATH _ZVNF1 (asinh) +_ZVND1 (asinh) _ZVNF1 (atan) _ZVND1 (atan) _ZVNF2 (atan2) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index fec18a7..28ee251 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -129,6 +129,7 @@ ZVNF1_WRAP(log1p) ZVNF1_WRAP(log2) ZVNF1_WRAP(sinh) ZVNF1_WRAP(tan) +ZVND1_WRAP(asinh) ZVND1_WRAP(atan) ZVND2_WRAP(atan2) ZVND1_WRAP(cosh) diff --git a/pl/math/v_asinh_2u5.c b/pl/math/v_asinh_2u5.c new file mode 100644 index 0000000..a1bdf5b --- /dev/null +++ b/pl/math/v_asinh_2u5.c @@ -0,0 +1,179 @@ +/* + * Double-precision vector asinh(x) function. + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" + +#if V_SUPPORTED + +#define OneTop 0x3ff /* top12(asuint64(1.0f)). */ +#define HugeBound 0x5fe /* top12(asuint64(0x1p511)). */ +#define TinyBound 0x3e5 /* top12(asuint64(0x1p-26)). */ +#define AbsMask v_u64 (0x7fffffffffffffff) +#define C(i) v_f64 (__asinh_data.poly[i]) + +/* Constants & data for log. */ +#define OFF 0x3fe6000000000000 +#define Ln2 v_f64 (0x1.62e42fefa39efp-1) +#define A(i) v_f64 (__sv_log_data.poly[i]) +#define T(i) __log_data.tab[i] +#define N (1 << LOG_TABLE_BITS) + +static NOINLINE v_f64_t +special_case (v_f64_t x, v_f64_t y, v_u64_t special) +{ + return v_call_f64 (asinh, x, y, special); +} + +struct entry +{ + v_f64_t invc; + v_f64_t logc; +}; + +static inline struct entry +lookup (v_u64_t i) +{ + struct entry e; +#ifdef SCALAR + e.invc = T (i).invc; + e.logc = T (i).logc; +#else + e.invc[0] = T (i[0]).invc; + e.logc[0] = T (i[0]).logc; + e.invc[1] = T (i[1]).invc; + e.logc[1] = T (i[1]).logc; +#endif + return e; +} + +static inline v_f64_t +log_inline (v_f64_t x) +{ + /* Double-precision vector log, copied from math/v_log.c with some cosmetic + modification and special-cases removed. See that file for details of the + algorithm used. */ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t tmp = ix - OFF; + v_u64_t i = (tmp >> (52 - LOG_TABLE_BITS)) % N; + v_s64_t k = v_as_s64_u64 (tmp) >> 52; + v_u64_t iz = ix - (tmp & 0xfffULL << 52); + v_f64_t z = v_as_f64_u64 (iz); + struct entry e = lookup (i); + v_f64_t r = v_fma_f64 (z, e.invc, v_f64 (-1.0)); + v_f64_t kd = v_to_f64_s64 (k); + v_f64_t hi = v_fma_f64 (kd, Ln2, e.logc + r); + v_f64_t r2 = r * r; + v_f64_t y = v_fma_f64 (A (3), r, A (2)); + v_f64_t p = v_fma_f64 (A (1), r, A (0)); + y = v_fma_f64 (A (4), r2, y); + y = v_fma_f64 (y, r2, p); + y = v_fma_f64 (y, r2, hi); + return y; +} + +static inline v_f64_t +eval_poly (v_f64_t z) +{ + /* Custom polynomial, shared with scalar routine, for calculating asinh(x) in + [2^-26, 1]. Evaluated with Estrin scheme. 
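+     Estrin here means the 18 coefficients are first paired into
+     independent fmas p_01 ... p_gh, which are then combined with z^2,
+     z^4, z^8 and z^16, shortening the dependency chain relative to
+     Horner at the cost of a few extra multiplies.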
*/
+  v_f64_t p_01 = v_fma_f64 (z, C (1), C (0));
+  v_f64_t p_23 = v_fma_f64 (z, C (3), C (2));
+  v_f64_t p_45 = v_fma_f64 (z, C (5), C (4));
+  v_f64_t p_67 = v_fma_f64 (z, C (7), C (6));
+  v_f64_t p_89 = v_fma_f64 (z, C (9), C (8));
+  v_f64_t p_ab = v_fma_f64 (z, C (11), C (10));
+  v_f64_t p_cd = v_fma_f64 (z, C (13), C (12));
+  v_f64_t p_ef = v_fma_f64 (z, C (15), C (14));
+  v_f64_t p_gh = v_fma_f64 (z, C (17), C (16));
+
+  v_f64_t z2 = z * z;
+  v_f64_t p_03 = v_fma_f64 (z2, p_23, p_01);
+  v_f64_t p_47 = v_fma_f64 (z2, p_67, p_45);
+  v_f64_t p_8b = v_fma_f64 (z2, p_ab, p_89);
+  v_f64_t p_cf = v_fma_f64 (z2, p_ef, p_cd);
+
+  v_f64_t z4 = z2 * z2;
+  v_f64_t p_07 = v_fma_f64 (z4, p_47, p_03);
+  v_f64_t p_8f = v_fma_f64 (z4, p_cf, p_8b);
+
+  v_f64_t z8 = z4 * z4;
+  v_f64_t p_0f = v_fma_f64 (z8, p_8f, p_07);
+
+  v_f64_t z16 = z8 * z8;
+  return v_fma_f64 (z16, p_gh, p_0f);
+}
+
+/* Double-precision implementation of vector asinh(x).
+   asinh is very sensitive around 1, so it is impractical to devise a single
+   low-cost algorithm which is sufficiently accurate on a wide range of input.
+   Instead we use two different algorithms:
+   asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1))    if |x| >= 1
+	    = sign(x) * (|x| + |x|^3 * P(x^2))      otherwise
+   where log(x) is an optimized log approximation, and P(x) is a polynomial
+   shared with the scalar routine. The greatest observed error is 2.03 ULP,
+   in |x| >= 1:
+   __v_asinh(-0x1.00094e0f39574p+0) got -0x1.c3508eb6a681ep-1
+				    want -0x1.c3508eb6a682p-1. */
+VPCS_ATTR v_f64_t V_NAME (asinh) (v_f64_t x)
+{
+  v_u64_t ix = v_as_u64_f64 (x);
+  v_u64_t iax = ix & AbsMask;
+  v_f64_t ax = v_as_f64_u64 (iax);
+  v_u64_t top12 = iax >> 52;
+
+  v_u64_t gt1 = v_cond_u64 (top12 >= OneTop);
+  v_u64_t special = v_cond_u64 (top12 >= HugeBound);
+
+  /* Option 1: |x| >= 1.
+     Compute asinh(x) according to asinh(x) = log(x + sqrt(x^2 + 1)).
+     If WANT_ERRNO is enabled, sidestep special values, which will overflow, by
+     setting special lanes to 1. These will be fixed later. */
+  v_f64_t option_1 = v_f64 (0);
+  if (likely (v_any_u64 (gt1)))
+    {
+#if WANT_ERRNO
+      v_f64_t xm = v_sel_f64 (special, v_f64 (1), ax);
+#else
+      v_f64_t xm = ax;
+#endif
+      option_1 = log_inline (xm + v_sqrt_f64 (xm * xm + 1));
+    }
+
+  /* Option 2: |x| < 1.
+     Compute asinh(x) using a polynomial.
+     If WANT_ERRNO is enabled, sidestep special lanes, which will overflow, and
+     tiny lanes, which will underflow, by setting them to 0. They will be fixed
+     later, either by selecting x or falling back to the scalar special-case.
+     The largest observed error in this region is 1.47 ULPs:
+     __v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1
+				     want 0x1.c1d6bf874019cp-1. */
+  v_f64_t option_2 = v_f64 (0);
+  if (likely (v_any_u64 (~gt1)))
+    {
+#if WANT_ERRNO
+      v_u64_t tiny = v_cond_u64 (top12 < TinyBound);
+      ax = v_sel_f64 (tiny | gt1, v_f64 (0), ax);
+#endif
+      v_f64_t x2 = ax * ax;
+      v_f64_t p = eval_poly (x2);
+      option_2 = v_fma_f64 (p, x2 * ax, ax);
+#if WANT_ERRNO
+      option_2 = v_sel_f64 (tiny, x, option_2);
+#endif
+    }
+
+  /* Choose the right option for each lane. */
+  v_f64_t y = v_sel_f64 (gt1, option_1, option_2);
+  /* Copy sign.
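+     v_bsl_u64 (m, x, y) computes (x & m) | (y & ~m), so the line below
+     keeps the magnitude bits of y and takes the sign bit from ix.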
*/ + y = v_as_f64_u64 (v_bsl_u64 (AbsMask, v_as_u64_f64 (y), ix)); + + if (unlikely (v_any_u64 (special))) + return special_case (x, y, special); + return y; +} +VPCS_ALIAS + +#endif diff --git a/pl/math/v_math.h b/pl/math/v_math.h index 3ed6244..0ff3db3 100644 --- a/pl/math/v_math.h +++ b/pl/math/v_math.h @@ -356,6 +356,11 @@ v_abs_f64 (v_f64_t x) return __builtin_fabs (x); } static inline v_u64_t +v_bsl_u64 (v_u64_t m, v_u64_t x, v_u64_t y) +{ + return (y & ~m) | (x & m); +} +static inline v_u64_t v_cagt_f64 (v_f64_t x, v_f64_t y) { return fabs (x) > fabs (y); @@ -384,6 +389,11 @@ v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y) { return p ? x : y; } +static inline v_f64_t +v_sqrt_f64 (v_f64_t x) +{ + return __builtin_sqrt (x); +} static inline v_s64_t v_round_s64 (v_f64_t x) { @@ -728,6 +738,11 @@ v_abs_f64 (v_f64_t x) return vabsq_f64 (x); } static inline v_u64_t +v_bsl_u64 (v_u64_t m, v_u64_t x, v_u64_t y) +{ + return vbslq_u64 (m, x, y); +} +static inline v_u64_t v_cagt_f64 (v_f64_t x, v_f64_t y) { return vcagtq_f64 (x, y); @@ -756,6 +771,11 @@ v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y) { return vbslq_f64 (p, x, y); } +static inline v_f64_t +v_sqrt_f64 (v_f64_t x) +{ + return vsqrtq_f64 (x); +} static inline v_s64_t v_round_s64 (v_f64_t x) { diff --git a/pl/math/vn_asinh_2u5.c b/pl/math/vn_asinh_2u5.c new file mode 100644 index 0000000..ecc61ed --- /dev/null +++ b/pl/math/vn_asinh_2u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_asinh. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_asinh, _ZGVnN2v_asinh) +#include "v_asinh_2u5.c" +#endif -- cgit v1.2.3 From 8a644bf15812edaba38b41ca142e8e7e328e7918 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Wed, 30 Nov 2022 09:42:41 +0000 Subject: pl/math: Add scalar and vector/Neon tanhf Both routines use simplified inline versions of expm1f, and are accurate to 2.6 ULP. 
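Both reductions rest on the same identity; as a sketch (this mirrors the
shared core of the two routines in this patch, with scalar types for
clarity):

    /* tanh(x) = (e^2x - 1) / (e^2x + 1); writing q = e^2x - 1 makes the
       denominator q + 2, so a single expm1f evaluation suffices.  */
    float q = expm1f_inline (2 * x);
    return q / (q + 2);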
--- pl/math/include/mathlib.h | 5 ++ pl/math/s_tanhf_2u6.c | 6 ++ pl/math/tanhf_2u6.c | 80 ++++++++++++++++++++++++++ pl/math/test/mathbench_funcs.h | 2 + pl/math/test/runulp.sh | 22 ++++++++ pl/math/test/testcases/directed/tanhf.tst | 20 +++++++ pl/math/test/ulp_funcs.h | 2 + pl/math/test/ulp_wrappers.h | 1 + pl/math/v_tanhf_2u6.c | 93 +++++++++++++++++++++++++++++++ pl/math/vn_tanhf_2u6.c | 12 ++++ 10 files changed, 243 insertions(+) create mode 100644 pl/math/s_tanhf_2u6.c create mode 100644 pl/math/tanhf_2u6.c create mode 100644 pl/math/test/testcases/directed/tanhf.tst create mode 100644 pl/math/v_tanhf_2u6.c create mode 100644 pl/math/vn_tanhf_2u6.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 6721d45..1266eb7 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -22,6 +22,7 @@ float log10f (float); float log1pf (float); float sinhf (float); float tanf (float); +float tanhf (float); double acosh (double); double asinh (double); @@ -47,6 +48,7 @@ float __s_log1pf (float); float __s_log2f (float); float __s_sinhf (float); float __s_tanf (float); +float __s_tanhf (float); double __s_asinh (double); double __s_atan (double); @@ -97,6 +99,7 @@ __f64x2_t __v_log2 (__f64x2_t); __f32x4_t __v_sinhf (__f32x4_t); __f64x2_t __v_sinh (__f64x2_t); __f32x4_t __v_tanf (__f32x4_t); +__f32x4_t __v_tanhf (__f32x4_t); #if __GNUC__ >= 9 || __clang_major__ >= 8 #define __vpcs __attribute__((__aarch64_vector_pcs__)) @@ -127,6 +130,7 @@ __vpcs __f64x2_t __vn_log2 (__f64x2_t); __vpcs __f32x4_t __vn_sinhf (__f32x4_t); __vpcs __f64x2_t __vn_sinh (__f64x2_t); __vpcs __f32x4_t __vn_tanf (__f32x4_t); +__vpcs __f32x4_t __vn_tanhf (__f32x4_t); /* Vector functions following the vector PCS using ABI names. */ __vpcs __f32x4_t _ZGVnN4v_asinhf (__f32x4_t); @@ -154,6 +158,7 @@ __vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4v_tanhf (__f32x4_t); #endif diff --git a/pl/math/s_tanhf_2u6.c b/pl/math/s_tanhf_2u6.c new file mode 100644 index 0000000..bbb4569 --- /dev/null +++ b/pl/math/s_tanhf_2u6.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_tanhf_2u6.c" diff --git a/pl/math/tanhf_2u6.c b/pl/math/tanhf_2u6.c new file mode 100644 index 0000000..9db2533 --- /dev/null +++ b/pl/math/tanhf_2u6.c @@ -0,0 +1,80 @@ +/* + * Single-precision tanh(x) function. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "math_config.h" + +#define BoringBound \ + 0x41102cb3 /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for \ + negative). */ +#define AbsMask 0x7fffffff +#define One 0x3f800000 + +#define Shift (0x1.8p23f) +#define InvLn2 (0x1.715476p+0f) +#define Ln2hi (0x1.62e4p-1f) +#define Ln2lo (0x1.7f7d1cp-20f) + +#define C(i) __expm1f_poly[i] + +static inline float +expm1f_inline (float x) +{ + /* Helper routine for calculating exp(x) - 1. + Copied from expm1f_1u6.c, with several simplifications: + - No special-case handling for tiny or special values, instead return early + from the main routine. + - No special handling for large values: + - No early return for infinity. + - Simpler combination of p and t in final stage of algorithm. + - |i| < 27, so can calculate t by simpler shift-and-add, instead of + ldexpf (same as vector algorithm). 
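+     Here "shift-and-add" means t = 2^i is built directly as a bit
+     pattern: shift i into the exponent field and add the bits of 1.0f,
+     which stays in range because |i| < 27.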
*/ + + /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ + float j = fmaf (InvLn2, x, Shift) - Shift; + int32_t i = j; + float f = fmaf (j, -Ln2hi, x); + f = fmaf (j, -Ln2lo, f); + + /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). + Uses Estrin scheme, where the main expm1f routine uses Horner. */ + float f2 = f * f; + float p_01 = fmaf (f, C (1), C (0)); + float p_23 = fmaf (f, C (3), C (2)); + float p = fmaf (f2, p_23, p_01); + p = fmaf (f2 * f2, C (4), p); + p = fmaf (f2, p, f); + + /* t = 2^i. */ + float t = asfloat ((i << 23) + One); + /* expm1(x) ~= p * t + (t - 1). */ + return fmaf (p, t, t - 1); +} + +/* Approximation for single-precision tanh(x), using a simplified version of + expm1f. The maximum error is 2.58 ULP: + tanhf(0x1.fa5eep-5) got 0x1.f9ba02p-5 + want 0x1.f9ba08p-5. */ +float +tanhf (float x) +{ + uint32_t ix = asuint (x); + uint32_t iax = ix & AbsMask; + uint32_t sign = ix & ~AbsMask; + + if (unlikely (iax > BoringBound)) + { + if (iax > 0x7f800000) + return __math_invalidf (x); + return asfloat (One | sign); + } + + if (unlikely (iax < 0x34000000)) + return x; + + /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ + float q = expm1f_inline (2 * x); + return q / (q + 2); +} diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index b972f83..9e3b9a0 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -43,6 +43,7 @@ F (log2f, 0.01, 11.1) F (sinf, -3.1, 3.1) F (sinhf, -10.0, 10.0) F (tanf, -3.1, 3.1) +F (tanhf, -10.0, 10.0) D (acosh, 1.0, 10.0) D (asinh, -10.0, 10.0) @@ -84,6 +85,7 @@ ZVND (log2, 0.01, 11.1) ZVNF (sinhf, -10.0, 10.0) ZVND (sinh, -10.0, 10.0) ZVNF (tanf, -3.1, 3.1) +ZVNF (tanhf, -10.0, 10.0) {"__s_atan2f", 'f', 0, -10.0, 10.0, {.f = __s_atan2f_wrap}}, {"__s_atan2", 'd', 0, -10.0, 10.0, {.d = __s_atan2_wrap}}, {"__v_atan2f", 'f', 'v', -10.0, 10.0, {.vf = __v_atan2f_wrap}}, diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 5d0188d..484ebdf 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -215,6 +215,14 @@ L=1.03 t cbrtf 0 inf 1000000 t cbrtf -0 -inf 1000000 +L=2.09 +t tanhf 0 0x1p-23 1000 +t tanhf -0 -0x1p-23 1000 +t tanhf 0x1p-23 0x1.205966p+3 100000 +t tanhf -0x1p-23 -0x1.205966p+3 100000 +t tanhf 0x1.205966p+3 inf 100 +t tanhf -0x1.205966p+3 -inf 100 + done # vector functions @@ -461,6 +469,15 @@ range_asinh=' -0x1p511 -inf 40000 ' +range_tanhf=' + 0 0x1p-23 1000 + -0 -0x1p-23 1000 + 0x1p-23 0x1.205966p+3 100000 + -0x1p-23 -0x1.205966p+3 100000 + 0x1.205966p+3 inf 100 + -0x1.205966p+3 -inf 100 +' + range_sve_cosf=' 0 0xffff0000 10000 0x1p-4 0x1p4 500000 @@ -629,6 +646,7 @@ L_cosh=1.43 L_atanhf=2.59 L_cbrtf=1.03 L_asinh=1.54 +L_tanhf=2.09 L_sve_cosf=1.57 L_sve_cos=1.61 @@ -796,6 +814,10 @@ asinh _ZGVnN2v_asinh $runvn fenv -c 2 asinh __v_asinh $runv fenv -c 0x1p600 asinh __vn_asinh $runvn fenv -c 0x1p600 asinh _ZGVnN2v_asinh $runvn fenv -c 0x1p600 +tanhf __s_tanhf $runs fenv +tanhf __v_tanhf $runv fenv +tanhf __vn_tanhf $runvn fenv +tanhf _ZGVnN4v_tanhf $runvn fenv sve_cosf __sv_cosf $runsv sve_cosf _ZGVsMxv_cosf $runsv diff --git a/pl/math/test/testcases/directed/tanhf.tst b/pl/math/test/testcases/directed/tanhf.tst new file mode 100644 index 0000000..c3edb50 --- /dev/null +++ b/pl/math/test/testcases/directed/tanhf.tst @@ -0,0 +1,20 @@ +; tanhf.tst +; +; Copyright 2007-2022, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=tanhf op1=7fc00001 result=7fc00001 errno=0 +func=tanhf op1=ffc00001 result=7fc00001 errno=0 +func=tanhf op1=7f800001 result=7fc00001 errno=0 status=i +func=tanhf op1=ff800001 result=7fc00001 errno=0 status=i +func=tanhf op1=7f800000 result=3f800000 errno=0 +func=tanhf op1=ff800000 result=bf800000 errno=0 +func=tanhf op1=00000000 result=00000000 errno=0 +func=tanhf op1=80000000 result=80000000 errno=0 +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +; func=tanhf op1=00000001 result=00000001 errno=0 maybestatus=ux +; func=tanhf op1=80000001 result=80000001 errno=0 maybestatus=ux +func=tanhf op1=00000001 result=00000001 errno=0 maybestatus=ux +func=tanhf op1=80000001 result=80000001 errno=0 maybestatus=ux diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 465630d..86e2bed 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -46,6 +46,7 @@ F1 (log10) F1 (log1p) F1 (sinh) F1 (tan) +F1 (tanh) D1 (acosh) D1 (asinh) D2 (atan2) @@ -81,6 +82,7 @@ _ZVND1 (log2) _ZVNF1 (sinh) _ZVND1 (sinh) _ZVNF1 (tan) +_ZVNF1 (tanh) #if WANT_SVE_MATH _ZSVF2 (atan2) _ZSVD2 (atan2) diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index 28ee251..be87c21 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -129,6 +129,7 @@ ZVNF1_WRAP(log1p) ZVNF1_WRAP(log2) ZVNF1_WRAP(sinh) ZVNF1_WRAP(tan) +ZVNF1_WRAP(tanh) ZVND1_WRAP(asinh) ZVND1_WRAP(atan) ZVND2_WRAP(atan2) diff --git a/pl/math/v_tanhf_2u6.c b/pl/math/v_tanhf_2u6.c new file mode 100644 index 0000000..571fd8b --- /dev/null +++ b/pl/math/v_tanhf_2u6.c @@ -0,0 +1,93 @@ +/* + * Single-precision vector tanh(x) function. + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "mathlib.h" + +#if V_SUPPORTED + +#define BoringBound \ + 0x41102cb3 /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for \ + negative). */ +#define AbsMask 0x7fffffff +#define One 0x3f800000 + +#define Shift v_f32 (0x1.8p23f) +#define InvLn2 v_f32 (0x1.715476p+0f) +#define MLn2hi v_f32 (-0x1.62e4p-1f) +#define MLn2lo v_f32 (-0x1.7f7d1cp-20f) + +#define C(i) v_f32 (__expm1f_poly[i]) + +static inline v_f32_t +expm1f_inline (v_f32_t x) +{ + /* Helper routine for calculating exp(x) - 1. + Copied from v_expm1f_1u6.c, with all special-case handling removed, as + special, tiny and large values are all dealt with in the main tanhf + routine. */ + + /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ + v_f32_t j = v_fma_f32 (InvLn2, x, Shift) - Shift; + v_s32_t i = v_to_s32_f32 (j); + v_f32_t f = v_fma_f32 (j, MLn2hi, x); + f = v_fma_f32 (j, MLn2lo, f); + + /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). + Uses Estrin scheme, where the main __v_expm1f routine uses Horner. */ + v_f32_t f2 = f * f; + v_f32_t p_01 = v_fma_f32 (f, C (1), C (0)); + v_f32_t p_23 = v_fma_f32 (f, C (3), C (2)); + v_f32_t p = v_fma_f32 (f2, p_23, p_01); + p = v_fma_f32 (f2 * f2, C (4), p); + p = v_fma_f32 (f2, p, f); + + /* t = 2^i. */ + v_f32_t t = v_as_f32_u32 (v_as_u32_s32 (i << 23) + One); + /* expm1(x) ~= p * t + (t - 1). 
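+     (This follows from e^x = t * e^f = t * (1 + p), so
+     e^x - 1 = p * t + (t - 1), which maps onto a single fma.)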
*/
+  return v_fma_f32 (p, t, t - 1);
+}
+
+static NOINLINE v_f32_t
+special_case (v_f32_t x, v_f32_t y, v_u32_t special)
+{
+  return v_call_f32 (tanhf, x, y, special);
+}
+
+/* Approximation for single-precision vector tanh(x), using a simplified version
+   of expm1f. The maximum error is 2.58 ULP:
+   __v_tanhf(0x1.fa5eep-5) got 0x1.f9ba02p-5
+			   want 0x1.f9ba08p-5. */
+VPCS_ATTR v_f32_t V_NAME (tanhf) (v_f32_t x)
+{
+  v_u32_t ix = v_as_u32_f32 (x);
+  v_u32_t iax = ix & AbsMask;
+  v_u32_t sign = ix & ~AbsMask;
+  v_u32_t is_boring = v_cond_u32 (iax > BoringBound);
+  v_f32_t boring = v_as_f32_u32 (sign | One);
+
+#if WANT_ERRNO
+  /* If errno needs to be set properly, set all special and boring lanes to 1,
+     which will trigger no exceptions, and fix them up later. */
+  v_u32_t special = v_cond_u32 ((iax > 0x7f800000) | (iax < 0x34000000));
+  ix = v_sel_u32 (is_boring, v_u32 (One), ix);
+  if (unlikely (v_any_u32 (special)))
+    ix = v_sel_u32 (special, v_u32 (One), ix);
+#else
+  v_u32_t special = v_cond_u32 ((iax > 0x7f800000) | (iax == 0));
+#endif
+
+  /* tanh(x) = (e^2x - 1) / (e^2x + 1). */
+  v_f32_t q = expm1f_inline (2 * v_as_f32_u32 (ix));
+  v_f32_t y = q / (q + 2);
+  y = v_sel_f32 (is_boring, boring, y);
+  if (unlikely (v_any_u32 (special)))
+    return special_case (x, y, special);
+  return y;
+}
+VPCS_ALIAS
+
+#endif
diff --git a/pl/math/vn_tanhf_2u6.c b/pl/math/vn_tanhf_2u6.c
new file mode 100644
index 0000000..96fd67a
--- /dev/null
+++ b/pl/math/vn_tanhf_2u6.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_tanhf.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_tanhf, _ZGVnN4v_tanhf)
+#include "v_tanhf_2u6.c"
+#endif
--
cgit v1.2.3


From 0d3c3cd35440d224ddbcd1496b48835443f4c7c1 Mon Sep 17 00:00:00 2001
From: Joe Ramsay
Date: Mon, 5 Dec 2022 11:56:33 +0000
Subject: pl/math: Avoid UB in scalar tanhf

The ldexp shortcut was left-shifting a signed value. We now bias the
exponent first, which allows the shift to be done on an unsigned value.
---
 pl/math/tanhf_2u6.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pl/math/tanhf_2u6.c b/pl/math/tanhf_2u6.c
index 9db2533..145f437 100644
--- a/pl/math/tanhf_2u6.c
+++ b/pl/math/tanhf_2u6.c
@@ -48,7 +48,7 @@ expm1f_inline (float x)
   p = fmaf (f2, p, f);
 
   /* t = 2^i. */
-  float t = asfloat ((i << 23) + One);
+  float t = asfloat ((uint32_t) (i + 127) << 23);
   /* expm1(x) ~= p * t + (t - 1). */
   return fmaf (p, t, t - 1);
 }
--
cgit v1.2.3


From 2a963bbff4f16998def16ab5c7b1c7ab92f825a8 Mon Sep 17 00:00:00 2001
From: Joe Ramsay
Date: Tue, 6 Dec 2022 10:40:54 +0000
Subject: pl/math: Set fenv flags in Neon asinhf

The routine no longer relies on vector log1pf, as this has to become more
complex to deal with fenv itself. Instead we re-use a log1pf helper from
Neon atanhf which does no special-case handling, leaving it all up to the
main routine. We now just fall back to the scalar routine for special-case
handling. This uncovered a mistake in asinhf's handling of NaNs, which has
been fixed.
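The lane-masking pattern used here, in outline (a sketch of the shape of
the change, not the literal diff below; main_path stands in for the
special-case-free computation):

    /* Mask lanes that cannot take the fast path with a value that is
       neutral for fenv (1.0), then let the scalar fallback recompute just
       those lanes, raising the right flags and setting errno itself.  */
    v_u32_t special = v_cond_u32 (iax >= BigBound);
    ax = v_sel_f32 (special, One, ax);
    v_f32_t y = main_path (ax);
    if (unlikely (v_any_u32 (special)))
      return v_call_f32 (asinhf, x, y, special);
    return y;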
--- pl/math/asinhf_3u5.c | 10 ++++----- pl/math/test/runulp.sh | 8 +++---- pl/math/v_asinhf_2u7.c | 43 ++++++++++++++++++++++-------------- pl/math/v_atanhf_3u1.c | 41 ++--------------------------------- pl/math/v_log1pf_inline.h | 55 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 93 insertions(+), 64 deletions(-) create mode 100644 pl/math/v_log1pf_inline.h diff --git a/pl/math/asinhf_3u5.c b/pl/math/asinhf_3u5.c index 10f9f31..8aa62ad 100644 --- a/pl/math/asinhf_3u5.c +++ b/pl/math/asinhf_3u5.c @@ -11,7 +11,6 @@ #define Ln2 (0x1.62e4p-1f) #define One (0x3f8) #define ExpM12 (0x398) -#define QNaN (0x7fc) #define C(i) __asinhf_data.coeffs[i] @@ -45,10 +44,11 @@ asinhf (float x) float ax = asfloat (ia); uint32_t sign = ix & ~AbsMask; - if (ia12 < ExpM12 || ia12 == QNaN) - { - return x; - } + if (unlikely (ia12 < ExpM12 || ia == 0x7f800000)) + return x; + + if (unlikely (ia12 >= 0x7f8)) + return __math_invalidf (x); if (ia12 < One) { diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 484ebdf..ed45c73 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -766,10 +766,10 @@ log1pf __s_log1pf $runs log1pf __v_log1pf $runv log1pf __vn_log1pf $runvn log1pf _ZGVnN4v_log1pf $runvn -asinhf __s_asinhf $runs -asinhf __v_asinhf $runv -asinhf __vn_asinhf $runvn -asinhf _ZGVnN4v_asinhf $runvn +asinhf __s_asinhf $runs fenv +asinhf __v_asinhf $runv fenv +asinhf __vn_asinhf $runvn fenv +asinhf _ZGVnN4v_asinhf $runvn fenv log2f __s_log2f $runs log2f __v_log2f $runv log2f __vn_log2f $runvn diff --git a/pl/math/v_asinhf_2u7.c b/pl/math/v_asinhf_2u7.c index 675b8a8..7bce7ff 100644 --- a/pl/math/v_asinhf_2u7.c +++ b/pl/math/v_asinhf_2u7.c @@ -11,34 +11,45 @@ #define SignMask v_u32 (0x80000000) #define One v_f32 (1.0f) -#define Ln2 v_f32 (0x1.62e43p-1f) -#define SpecialBound v_u32 (0x5f800000) /* asuint(0x1p64). */ +#define BigBound v_u32 (0x5f800000) /* asuint(0x1p64). */ +#define TinyBound v_u32 (0x30800000) /* asuint(0x1p-30). */ + +#include "v_log1pf_inline.h" + +static NOINLINE v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t special) +{ + return v_call_f32 (asinhf, x, y, special); +} /* Single-precision implementation of vector asinh(x), using vector log1p. Worst-case error is 2.66 ULP, at roughly +/-0.25: __v_asinhf(0x1.01b04p-2) got 0x1.fe163ep-3 want 0x1.fe1638p-3. */ VPCS_ATTR v_f32_t V_NAME (asinhf) (v_f32_t x) { - v_f32_t ax = v_abs_f32 (x); - v_u32_t special = v_cond_u32 (v_as_u32_f32 (ax) >= SpecialBound); - v_u32_t sign = v_as_u32_f32 (x) & SignMask; + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t iax = ix & ~SignMask; + v_u32_t sign = ix & SignMask; + v_f32_t ax = v_as_f32_u32 (iax); + v_u32_t special = v_cond_u32 (iax >= BigBound); + +#if WANT_ERRNO + /* Sidestep tiny and large values to avoid inadvertently triggering + under/overflow. */ + special |= v_cond_u32 (iax < TinyBound); + if (unlikely (v_any_u32 (special))) + ax = v_sel_f32 (special, One, ax); +#endif /* asinh(x) = log(x + sqrt(x * x + 1)). For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */ v_f32_t d = One + v_sqrt_f32 (ax * ax + One); - v_f32_t y = V_NAME (log1pf) (ax + ax * ax / d); + v_f32_t y = log1pf_inline (ax + ax * ax / d); + y = v_as_f32_u32 (sign | v_as_u32_f32 (y)); if (unlikely (v_any_u32 (special))) - { - /* If |x| is too large, we cannot square it at low cost without overflow. - At very large x, asinh(x) ~= log(2x) and log(x) ~= log1p(x), so we - calculate asinh(x) as log1p(x) + log(2). 
*/ - v_f32_t y_large = V_NAME (log1pf) (ax) + Ln2; - return v_as_f32_u32 (sign - | v_as_u32_f32 (v_sel_f32 (special, y_large, y))); - } - - return v_as_f32_u32 (sign | v_as_u32_f32 (y)); + return specialcase (x, y, special); + return y; } VPCS_ALIAS diff --git a/pl/math/v_atanhf_3u1.c b/pl/math/v_atanhf_3u1.c index 54dcb9b..1e3a561 100644 --- a/pl/math/v_atanhf_3u1.c +++ b/pl/math/v_atanhf_3u1.c @@ -9,50 +9,13 @@ #if V_SUPPORTED +#include "v_log1pf_inline.h" + #define AbsMask 0x7fffffff #define Half 0x3f000000 #define One 0x3f800000 -#define Four 0x40800000 -#define Ln2 0x1.62e43p-1f #define TinyBound 0x39800000 /* 0x1p-12, below which atanhf(x) rounds to x. */ -#define C(i) v_f32 (__log1pf_data.coeffs[i]) - -static inline v_f32_t -eval_poly (v_f32_t m) -{ - /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme. */ - v_f32_t p_12 = v_fma_f32 (m, C (1), C (0)); - v_f32_t p_34 = v_fma_f32 (m, C (3), C (2)); - v_f32_t p_56 = v_fma_f32 (m, C (5), C (4)); - v_f32_t p_78 = v_fma_f32 (m, C (7), C (6)); - - v_f32_t m2 = m * m; - v_f32_t p_02 = v_fma_f32 (m2, p_12, m); - v_f32_t p_36 = v_fma_f32 (m2, p_56, p_34); - v_f32_t p_79 = v_fma_f32 (m2, C (8), p_78); - - v_f32_t m4 = m2 * m2; - v_f32_t p_06 = v_fma_f32 (m4, p_36, p_02); - - return v_fma_f32 (m4, m4 * p_79, p_06); -} - -static inline v_f32_t -log1pf_inline (v_f32_t x) -{ - /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no - special-case handling. See that file for details of the algorithm. */ - v_f32_t m = x + 1.0f; - v_u32_t k = (v_as_u32_f32 (m) - 0x3f400000) & 0xff800000; - v_f32_t s = v_as_f32_u32 (v_u32 (Four) - k); - v_f32_t m_scale = v_as_f32_u32 (v_as_u32_f32 (x) - k) - + v_fma_f32 (v_f32 (0.25f), s, v_f32 (-1.0f)); - v_f32_t p = eval_poly (m_scale); - v_f32_t scale_back = v_to_f32_u32 (k) * 0x1.0p-23f; - return v_fma_f32 (scale_back, v_f32 (Ln2), p); -} - /* Approximation for vector single-precision atanh(x) using modified log1p. The maximum error is 3.08 ULP: __v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5 diff --git a/pl/math/v_log1pf_inline.h b/pl/math/v_log1pf_inline.h new file mode 100644 index 0000000..cf32b2a --- /dev/null +++ b/pl/math/v_log1pf_inline.h @@ -0,0 +1,55 @@ +/* + * Helper for single-precision routines which calculate log(1 + x) and do not + * need special-case handling + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_V_LOG1PF_INLINE_H +#define PL_MATH_V_LOG1PF_INLINE_H + +#include "v_math.h" +#include "math_config.h" + +#define Four 0x40800000 +#define Ln2 v_f32 (0x1.62e43p-1f) + +#define C(i) v_f32 (__log1pf_data.coeffs[i]) + +static inline v_f32_t +eval_poly (v_f32_t m) +{ + /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme. */ + v_f32_t p_12 = v_fma_f32 (m, C (1), C (0)); + v_f32_t p_34 = v_fma_f32 (m, C (3), C (2)); + v_f32_t p_56 = v_fma_f32 (m, C (5), C (4)); + v_f32_t p_78 = v_fma_f32 (m, C (7), C (6)); + + v_f32_t m2 = m * m; + v_f32_t p_02 = v_fma_f32 (m2, p_12, m); + v_f32_t p_36 = v_fma_f32 (m2, p_56, p_34); + v_f32_t p_79 = v_fma_f32 (m2, C (8), p_78); + + v_f32_t m4 = m2 * m2; + v_f32_t p_06 = v_fma_f32 (m4, p_36, p_02); + + return v_fma_f32 (m4, m4 * p_79, p_06); +} + +static inline v_f32_t +log1pf_inline (v_f32_t x) +{ + /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no + special-case handling. See that file for details of the algorithm. 
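+   In outline: 1 + x is scaled to m/2^k in [0.75, 1.5) by reading k
+   straight from the exponent bits, log1p of the reduced argument (in
+   [-0.25, 0.5]) is approximated by eval_poly above, and k * ln(2) is
+   added back at the end.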
*/
+  v_f32_t m = x + 1.0f;
+  v_u32_t k = (v_as_u32_f32 (m) - 0x3f400000) & 0xff800000;
+  v_f32_t s = v_as_f32_u32 (v_u32 (Four) - k);
+  v_f32_t m_scale = v_as_f32_u32 (v_as_u32_f32 (x) - k)
+		    + v_fma_f32 (v_f32 (0.25f), s, v_f32 (-1.0f));
+  v_f32_t p = eval_poly (m_scale);
+  v_f32_t scale_back = v_to_f32_u32 (k) * 0x1.0p-23f;
+  return v_fma_f32 (scale_back, Ln2, p);
+}
+
+#endif // PL_MATH_V_LOG1PF_INLINE_H
--
cgit v1.2.3


From fa1ba23f84ab1c65dea77d3940535261181766cf Mon Sep 17 00:00:00 2001
From: Victor Do Nascimento
Date: Wed, 7 Dec 2022 14:52:38 +0000
Subject: string: arm: Ensure correct cfi state at strcmp entry

Move code fragment corresponding to L(fastpath_exit) to after function
entry so that a .cfi_remember_state/.cfi_restore_state pair is not
needed prior to strcmp start.

The resulting reshuffle of code cleans up the entry part, fixing the
.size directive calculation, which at present calculates the function
size based on the address of __strcmp_arm and not L(strcmp_start_addr).
---
 string/arm/strcmp.S | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/string/arm/strcmp.S b/string/arm/strcmp.S
index a69dbff..bc6f75f 100644
--- a/string/arm/strcmp.S
+++ b/string/arm/strcmp.S
@@ -132,20 +132,7 @@
 #endif
 	.endm
 
-	.p2align	5
-L(strcmp_start_addr):
-	.fnstart
-	.cfi_startproc
-#if STRCMP_NO_PRECHECK == 0
-L(fastpath_exit):
-	sub	r0, r2, r3
-	epilogue push_ip=HAVE_PAC_LEAF
-	nop
-#endif
-	.global __strcmp_arm
-	.type	__strcmp_arm,%function
-	.align 0
-__strcmp_arm:
+ENTRY(__strcmp_arm)
 	prologue push_ip=HAVE_PAC_LEAF
 #if STRCMP_NO_PRECHECK == 0
 	ldrb	r2, [src1]
@@ -332,11 +319,18 @@ L(misaligned_exit):
 	ldr	r4, [sp], #16
 	.cfi_restore 4
 	.cfi_adjust_cfa_offset -16
-	epilogue push_ip=HAVE_PAC_LEAF
 
 #if STRCMP_NO_PRECHECK == 0
+L(fastpath_exit):
+	.cfi_restore_state
+	.cfi_remember_state
+	sub	r0, r2, r3
+	epilogue push_ip=HAVE_PAC_LEAF
+
 L(aligned_m1):
+	.cfi_restore_state
+	.cfi_remember_state
	add	src2, src2, #4
 #endif
 L(src1_aligned):
--
cgit v1.2.3


From 7d205b8787a4462d6e605ee826edf2666f899a34 Mon Sep 17 00:00:00 2001
From: Victor Do Nascimento
Date: Wed, 7 Dec 2022 14:54:18 +0000
Subject: string: arm: Fix cfi restore info for hot loop exit

The branch out of the core memchr loop to label 60 jumps over the
popping of registers r4-r7. The restoration of the cfi state at 60 is
adjusted to reflect this fact, avoiding restoring a state where r4-r7
have already been popped off the stack.

Built w/ arm-none-linux-gnueabihf, ran make check-string w/
qemu-arm-static.
--- string/arm/memchr.S | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/string/arm/memchr.S b/string/arm/memchr.S index 9649e10..9b77b75 100644 --- a/string/arm/memchr.S +++ b/string/arm/memchr.S @@ -129,7 +129,12 @@ __memchr_arm: 60: @ We're here because the fast path found a hit - now we have to track down exactly which word it was @ r0 points to the start of the double word after the one that was tested @ r5 has the 00/ff pattern for the first word, r6 has the chained value - .cfi_restore_state + .cfi_restore_state @ Standard post-prologue state + .cfi_adjust_cfa_offset 16 + .cfi_rel_offset 4, 0 + .cfi_rel_offset 5, 4 + .cfi_rel_offset 6, 8 + .cfi_rel_offset 7, 12 cmp r5, #0 itte eq moveq r5, r6 @ the end is in the 2nd word -- cgit v1.2.3 From a5e45e4e299f5fe6b51601694cc3cb066a20723a Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Wed, 7 Dec 2022 15:31:03 +0000 Subject: math: Set fenv exceptions for several Neon routines In most cases, we mask lanes which should not trigger exceptions with a neutral value, then let the existing special-case handler fix them up later. For exp and exp2 we replace the more complex special-case handler with a simple scalar fallback. All new behaviour is tested in runulp.sh, with a new option to pass -f to the run line. We also extend the fenv testing to Neon log and logf, which already triggered exceptions correctly. New behaviour is mostly hidden behind a new config setting, WANT_SIMD_EXCEPT. --- config.mk.dist | 4 ++++ math/Dir.mk | 2 +- math/test/runulp.sh | 43 ++++++++++++++++++++++++++----------------- math/v_cos.c | 8 ++++++++ math/v_cosf.c | 8 ++++++++ math/v_exp.c | 34 ++++++++++++++++++++++++++++++++++ math/v_exp2f.c | 47 +++++++++++++++++++++++++++++++++++++++++++---- math/v_expf.c | 47 +++++++++++++++++++++++++++++++++++++++++++---- math/v_math.h | 20 ++++++++++++++++++++ math/v_sin.c | 25 +++++++++++++++++++++---- math/v_sinf.c | 19 ++++++++++++++++--- 11 files changed, 224 insertions(+), 33 deletions(-) diff --git a/config.mk.dist b/config.mk.dist index 25cfdca..352136d 100644 --- a/config.mk.dist +++ b/config.mk.dist @@ -75,6 +75,10 @@ math-cflags += -DWANT_SVE_MATH=$(WANT_SVE_MATH) WANT_ERRNO = 0 math-cflags += -DWANT_ERRNO=$(WANT_ERRNO) +# If set to 1, set fenv in vector math routines. 
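+# When left at 0 (the default here), fenv behaviour of the vector routines
+# is unspecified, and runulp.sh passes -f to the ulp tool to disable its
+# fenv checks.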
+WANT_SIMD_EXCEPT = 0 +math-cflags += -DWANT_SIMD_EXCEPT=$(WANT_SIMD_EXCEPT) + # Disable fenv checks #math-ulpflags = -q -f #math-testflags = -nostatus diff --git a/math/Dir.mk b/math/Dir.mk index 534f997..a84528d 100644 --- a/math/Dir.mk +++ b/math/Dir.mk @@ -101,7 +101,7 @@ check-math-rtest: $(math-host-tools) $(math-tools) cat $(math-rtests) | build/bin/rtest | $(EMULATOR) build/bin/mathtest $(math-testflags) check-math-ulp: $(math-tools) - ULPFLAGS="$(math-ulpflags)" build/bin/runulp.sh $(EMULATOR) + ULPFLAGS="$(math-ulpflags)" WANT_SIMD_EXCEPT="$(WANT_SIMD_EXCEPT)" build/bin/runulp.sh $(EMULATOR) check-math: check-math-test check-math-rtest check-math-ulp diff --git a/math/test/runulp.sh b/math/test/runulp.sh index 2b42ae1..4793b84 100755 --- a/math/test/runulp.sh +++ b/math/test/runulp.sh @@ -145,7 +145,7 @@ done # vector functions Ldir=0.5 r='n' -flags="${ULPFLAGS:--q} -f" +flags="${ULPFLAGS:--q}" runs= check __s_exp 1 && runs=1 runv= @@ -229,7 +229,7 @@ L_sinf=1.4 L_cosf=1.4 L_powf=2.1 -while read G F R +while read G F R D do [ "$R" = 1 ] || continue case "$G" in \#*) continue ;; esac @@ -239,7 +239,16 @@ do do [ -n "$X" ] || continue case "$X" in \#*) continue ;; esac - t $F $X + disable_fenv="" + if [ -z "$WANT_SIMD_EXCEPT" ] || [ $WANT_SIMD_EXCEPT -eq 0 ]; then + # If library was built with SIMD exceptions + # disabled, disable fenv checking in ulp + # tool. Otherwise, fenv checking may still be + # disabled by adding -f to the end of the run + # line. + disable_fenv="-f" + fi + t $D $disable_fenv $F $X done << EOF $range EOF @@ -255,10 +264,10 @@ log __v_log $runv log __vn_log $runvn log _ZGVnN2v_log $runvn -pow __s_pow $runs -pow __v_pow $runv -pow __vn_pow $runvn -pow _ZGVnN2vv_pow $runvn +pow __s_pow $runs -f +pow __v_pow $runv -f +pow __vn_pow $runvn -f +pow _ZGVnN2vv_pow $runvn -f sin __s_sin $runs sin __v_sin $runv @@ -275,18 +284,18 @@ expf __v_expf $runv expf __vn_expf $runvn expf _ZGVnN4v_expf $runvn -expf_1u __s_expf_1u $runs -expf_1u __v_expf_1u $runv -expf_1u __vn_expf_1u $runvn +expf_1u __s_expf_1u $runs -f +expf_1u __v_expf_1u $runv -f +expf_1u __vn_expf_1u $runvn -f exp2f __s_exp2f $runs exp2f __v_exp2f $runv exp2f __vn_exp2f $runvn exp2f _ZGVnN4v_exp2f $runvn -exp2f_1u __s_exp2f_1u $runs -exp2f_1u __v_exp2f_1u $runv -exp2f_1u __vn_exp2f_1u $runvn +exp2f_1u __s_exp2f_1u $runs -f +exp2f_1u __v_exp2f_1u $runv -f +exp2f_1u __vn_exp2f_1u $runvn -f logf __s_logf $runs logf __v_logf $runv @@ -303,10 +312,10 @@ cosf __v_cosf $runv cosf __vn_cosf $runvn cosf _ZGVnN4v_cosf $runvn -powf __s_powf $runs -powf __v_powf $runv -powf __vn_powf $runvn -powf _ZGVnN4vv_powf $runvn +powf __s_powf $runs -f +powf __v_powf $runv -f +powf __vn_powf $runvn -f +powf _ZGVnN4vv_powf $runvn -f EOF [ 0 -eq $FAIL ] || { diff --git a/math/v_cos.c b/math/v_cos.c index eb7e337..0a51481 100644 --- a/math/v_cos.c +++ b/math/v_cos.c @@ -55,6 +55,14 @@ V_NAME(cos) (v_f64_t x) r = v_as_f64_u64 (v_as_u64_f64 (x) & AbsMask); cmp = v_cond_u64 (v_as_u64_f64 (r) >= v_as_u64_f64 (RangeVal)); +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u64 (cmp))) + /* If fenv exceptions are to be triggered correctly, set any special lanes + to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by + specialcase later. */ + r = v_sel_f64 (cmp, v_f64 (1.0), r); +#endif + /* n = rint((|x|+pi/2)/pi) - 0.5. 
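     The rint here uses the usual round-to-nearest shift trick: adding
     Shift = 0x1.8p52 leaves the rounded integer in the low mantissa bits
     of n, so its low bit (the parity) can be shifted straight up into the
     sign mask odd below.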
*/ n = v_fma_f64 (InvPi, r + HalfPi, Shift); odd = v_as_u64_f64 (n) << 63; diff --git a/math/v_cosf.c b/math/v_cosf.c index e1d656c..55ecbbb 100644 --- a/math/v_cosf.c +++ b/math/v_cosf.c @@ -47,6 +47,14 @@ V_NAME(cosf) (v_f32_t x) r = v_as_f32_u32 (v_as_u32_f32 (x) & AbsMask); cmp = v_cond_u32 (v_as_u32_f32 (r) >= v_as_u32_f32 (RangeVal)); +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u32 (cmp))) + /* If fenv exceptions are to be triggered correctly, set any special lanes + to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by + specialcase later. */ + r = v_sel_f32 (cmp, v_f32 (1.0f), r); +#endif + /* n = rint((|x|+pi/2)/pi) - 0.5 */ n = v_fma_f32 (InvPi, r + HalfPi, Shift); odd = v_as_u32_f32 (n) << 31; diff --git a/math/v_exp.c b/math/v_exp.c index 039504d..c25825f 100644 --- a/math/v_exp.c +++ b/math/v_exp.c @@ -36,6 +36,22 @@ #define Tab __v_exp_data #define IndexMask v_u64 (N - 1) #define Shift v_f64 (0x1.8p+52) + +#if WANT_SIMD_EXCEPT + +#define TinyBound 0x200 /* top12 (asuint64 (0x1p-511)). */ +#define BigBound 0x408 /* top12 (asuint64 (0x1p9)). */ + +VPCS_ATTR static NOINLINE v_f64_t +specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) +{ + /* If fenv exceptions are to be triggered correctly, fall back to the scalar + routine to special lanes. */ + return v_call_f64 (exp, x, y, cmp); +} + +#else + #define Thres v_f64 (704.0) VPCS_ATTR @@ -54,6 +70,8 @@ specialcase (v_f64_t s, v_f64_t y, v_f64_t n) return v_as_f64_u64 ((cmp & v_as_u64_f64 (r1)) | (~cmp & v_as_u64_f64 (r0))); } +#endif + VPCS_ATTR v_f64_t V_NAME(exp) (v_f64_t x) @@ -61,7 +79,18 @@ V_NAME(exp) (v_f64_t x) v_f64_t n, r, r2, s, y, z; v_u64_t cmp, u, e, i; +#if WANT_SIMD_EXCEPT + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + specialcase to fix special lanes later. This is only necessary if fenv + exceptions are to be triggered correctly. */ + v_f64_t xm = x; + cmp = v_cond_u64 ((v_as_u64_f64 (v_abs_f64 (x)) >> 52) - TinyBound + >= BigBound - TinyBound); + if (unlikely (v_any_u64 (cmp))) + x = v_sel_f64 (cmp, v_f64 (1), x); +#else cmp = v_cond_u64 (v_abs_f64 (x) > Thres); +#endif /* n = round(x/(ln2/N)). */ z = v_fma_f64 (x, InvLn2, Shift); @@ -87,7 +116,12 @@ V_NAME(exp) (v_f64_t x) s = v_as_f64_u64 (u + e); if (unlikely (v_any_u64 (cmp))) +#if WANT_SIMD_EXCEPT + return specialcase (xm, v_fma_f64 (y, s, s), cmp); +#else return specialcase (s, y, n); +#endif + return v_fma_f64 (y, s, s); } VPCS_ALIAS diff --git a/math/v_exp2f.c b/math/v_exp2f.c index b817560..22039ca 100644 --- a/math/v_exp2f.c +++ b/math/v_exp2f.c @@ -25,6 +25,22 @@ static const float Poly[] = { #define Shift v_f32 (0x1.8p23f) +#if WANT_SIMD_EXCEPT + +#define TinyBound 0x20000000 /* asuint (0x1p-63). */ +#define BigBound 0x42800000 /* asuint (0x1p6). */ + +VPCS_ATTR +static NOINLINE v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) +{ + /* If fenv exceptions are to be triggered correctly, fall back to the scalar + routine to special lanes. 
*/ + return v_call_f32 (exp2f, x, y, cmp); +} + +#else + VPCS_ATTR static v_f32_t specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale) @@ -41,15 +57,28 @@ specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0)); } +#endif + VPCS_ATTR v_f32_t V_NAME(exp2f) (v_f32_t x) { - v_f32_t n, r, r2, scale, p, q, poly, absn; + v_f32_t n, r, r2, scale, p, q, poly; v_u32_t cmp, e; - /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = n + r, with r in [-1/2, 1/2]. */ +#if WANT_SIMD_EXCEPT + cmp = v_cond_u32 ((v_as_u32_f32 (x) & 0x7fffffff) - TinyBound + >= BigBound - TinyBound); + v_f32_t xm = x; + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + specialcase to fix special lanes later. This is only necessary if fenv + exceptions are to be triggered correctly. */ + if (unlikely (v_any_u32 (cmp))) + x = v_sel_f32 (cmp, v_f32 (1), x); +#endif + + /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = n + r, with r in [-1/2, 1/2]. */ #if 0 v_f32_t z; z = x + Shift; @@ -62,16 +91,26 @@ V_NAME(exp2f) (v_f32_t x) e = v_as_u32_s32 (v_round_s32 (x)) << 23; #endif scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); - absn = v_abs_f32 (n); + +#if !WANT_SIMD_EXCEPT + v_f32_t absn = v_abs_f32 (n); cmp = v_cond_u32 (absn > v_f32 (126.0f)); +#endif + r2 = r * r; p = v_fma_f32 (C0, r, C1); q = v_fma_f32 (C2, r, C3); q = v_fma_f32 (p, r2, q); p = C4 * r; poly = v_fma_f32 (q, r2, p); + if (unlikely (v_any_u32 (cmp))) +#if WANT_SIMD_EXCEPT + return specialcase (xm, v_fma_f32 (poly, scale, scale), cmp); +#else return specialcase (poly, n, e, absn, cmp, scale); +#endif + return v_fma_f32 (poly, scale, scale); } VPCS_ALIAS diff --git a/math/v_expf.c b/math/v_expf.c index 2707ebc..cb4348e 100644 --- a/math/v_expf.c +++ b/math/v_expf.c @@ -28,6 +28,22 @@ static const float Poly[] = { #define Ln2hi v_f32 (0x1.62e4p-1f) #define Ln2lo v_f32 (0x1.7f7d1cp-20f) +#if WANT_SIMD_EXCEPT + +#define TinyBound 0x20000000 /* asuint (0x1p-63). */ +#define BigBound 0x42800000 /* asuint (0x1p6). */ + +VPCS_ATTR +static NOINLINE v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) +{ + /* If fenv exceptions are to be triggered correctly, fall back to the scalar + routine to special lanes. */ + return v_call_f32 (expf, x, y, cmp); +} + +#else + VPCS_ATTR static v_f32_t specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale) @@ -44,15 +60,28 @@ specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0)); } +#endif + VPCS_ATTR v_f32_t V_NAME(expf) (v_f32_t x) { - v_f32_t n, r, r2, scale, p, q, poly, absn, z; + v_f32_t n, r, r2, scale, p, q, poly, z; v_u32_t cmp, e; - /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ +#if WANT_SIMD_EXCEPT + cmp = v_cond_u32 ((v_as_u32_f32 (x) & 0x7fffffff) - TinyBound + >= BigBound - TinyBound); + v_f32_t xm = x; + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + specialcase to fix special lanes later. This is only necessary if fenv + exceptions are to be triggered correctly. */ + if (unlikely (v_any_u32 (cmp))) + x = v_sel_f32 (cmp, v_f32 (1), x); +#endif + + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. 
*/ #if 1 z = v_fma_f32 (x, InvLn2, Shift); n = z - Shift; @@ -67,16 +96,26 @@ V_NAME(expf) (v_f32_t x) e = v_as_u32_s32 (v_round_s32 (z)) << 23; #endif scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); - absn = v_abs_f32 (n); + +#if !WANT_SIMD_EXCEPT + v_f32_t absn = v_abs_f32 (n); cmp = v_cond_u32 (absn > v_f32 (126.0f)); +#endif + r2 = r * r; p = v_fma_f32 (C0, r, C1); q = v_fma_f32 (C2, r, C3); q = v_fma_f32 (p, r2, q); p = C4 * r; poly = v_fma_f32 (q, r2, p); + if (unlikely (v_any_u32 (cmp))) +#if WANT_SIMD_EXCEPT + return specialcase (xm, v_fma_f32 (poly, scale, scale), cmp); +#else return specialcase (poly, n, e, absn, cmp, scale); +#endif + return v_fma_f32 (poly, scale, scale); } VPCS_ALIAS diff --git a/math/v_math.h b/math/v_math.h index 31df7ee..5848349 100644 --- a/math/v_math.h +++ b/math/v_math.h @@ -191,6 +191,11 @@ v_round_s32 (v_f32_t x) { return __builtin_lroundf (x); /* relies on -fno-math-errno. */ } +static inline v_f32_t +v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y) +{ + return p ? x : y; +} /* convert to type1 from type2. */ static inline v_f32_t v_to_f32_s32 (v_s32_t x) @@ -311,6 +316,11 @@ v_round_s64 (v_f64_t x) { return __builtin_lround (x); /* relies on -fno-math-errno. */ } +static inline v_f64_t +v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y) +{ + return p ? x : y; +} /* convert to type1 from type2. */ static inline v_f64_t v_to_f64_s64 (v_s64_t x) @@ -460,6 +470,11 @@ v_round_s32 (v_f32_t x) { return vcvtaq_s32_f32 (x); } +static inline v_f32_t +v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y) +{ + return vbslq_f32 (p, x, y); +} /* convert to type1 from type2. */ static inline v_f32_t v_to_f32_s32 (v_s32_t x) @@ -584,6 +599,11 @@ v_round_s64 (v_f64_t x) { return vcvtaq_s64_f64 (x); } +static inline v_f64_t +v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y) +{ + return vbslq_f64 (p, x, y); +} /* convert to type1 from type2. */ static inline v_f64_t v_to_f64_s64 (v_s64_t x) diff --git a/math/v_sin.c b/math/v_sin.c index 4e03576..af7ccf7 100644 --- a/math/v_sin.c +++ b/math/v_sin.c @@ -34,9 +34,15 @@ static const double Poly[] = { #define Pi2 v_f64 (0x1.1a62633145c06p-53) #define Pi3 v_f64 (0x1.c1cd129024e09p-106) #define Shift v_f64 (0x1.8p52) -#define RangeVal v_f64 (0x1p23) #define AbsMask v_u64 (0x7fffffffffffffff) +#if WANT_SIMD_EXCEPT +#define TinyBound 0x202 /* top12 (asuint64 (0x1p-509)). */ +#define Thresh 0x214 /* top12 (asuint64 (RangeVal)) - TinyBound. */ +#else +#define RangeVal v_f64 (0x1p23) +#endif + VPCS_ATTR __attribute__ ((noinline)) static v_f64_t specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) @@ -49,11 +55,22 @@ v_f64_t V_NAME(sin) (v_f64_t x) { v_f64_t n, r, r2, y; - v_u64_t sign, odd, cmp; + v_u64_t sign, odd, cmp, ir; - r = v_as_f64_u64 (v_as_u64_f64 (x) & AbsMask); + ir = v_as_u64_f64 (x) & AbsMask; + r = v_as_f64_u64 (ir); sign = v_as_u64_f64 (x) & ~AbsMask; - cmp = v_cond_u64 (v_as_u64_f64 (r) >= v_as_u64_f64 (RangeVal)); + +#if WANT_SIMD_EXCEPT + /* Detect |x| <= 0x1p-509 or |x| >= RangeVal. If fenv exceptions are to be + triggered correctly, set any special lanes to 1 (which is neutral w.r.t. + fenv). These lanes will be fixed by specialcase later. */ + cmp = v_cond_u64 ((ir >> 52) - TinyBound >= Thresh); + if (unlikely (v_any_u64 (cmp))) + r = v_sel_f64 (cmp, v_f64 (1), r); +#else + cmp = v_cond_u64 (ir >= v_as_u64_f64 (RangeVal)); +#endif /* n = rint(|x|/pi). 
*/ n = v_fma_f64 (InvPi, r, Shift); diff --git a/math/v_sinf.c b/math/v_sinf.c index d2e18b5..ee6ed9a 100644 --- a/math/v_sinf.c +++ b/math/v_sinf.c @@ -24,6 +24,7 @@ static const float Poly[] = { #define A7 v_f32 (Poly[1]) #define A9 v_f32 (Poly[0]) #define RangeVal v_f32 (0x1p20f) +#define TinyBound v_f32 (0x1p-61f) #define InvPi v_f32 (0x1.45f306p-2f) #define Shift v_f32 (0x1.8p+23f) #define AbsMask v_u32 (0x7fffffff) @@ -41,11 +42,23 @@ v_f32_t V_NAME(sinf) (v_f32_t x) { v_f32_t n, r, r2, y; - v_u32_t sign, odd, cmp; + v_u32_t sign, odd, cmp, ir; - r = v_as_f32_u32 (v_as_u32_f32 (x) & AbsMask); + ir = v_as_u32_f32 (x) & AbsMask; + r = v_as_f32_u32 (ir); sign = v_as_u32_f32 (x) & ~AbsMask; - cmp = v_cond_u32 (v_as_u32_f32 (r) >= v_as_u32_f32 (RangeVal)); + +#if WANT_SIMD_EXCEPT + cmp = v_cond_u32 ((ir - v_as_u32_f32 (TinyBound) + >= v_as_u32_f32 (RangeVal) - v_as_u32_f32 (TinyBound))); + if (unlikely (v_any_u32 (cmp))) + /* If fenv exceptions are to be triggered correctly, set any special lanes + to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by + specialcase later. */ + r = v_sel_f32 (cmp, v_f32 (1), r); +#else + cmp = v_cond_u32 (ir >= v_as_u32_f32 (RangeVal)); +#endif /* n = rint(|x|/pi) */ n = v_fma_f32 (InvPi, r, Shift); -- cgit v1.2.3 From 61056f3ccca43a5e79edee0e359713614a1efd3c Mon Sep 17 00:00:00 2001 From: Pierre Blanchard Date: Thu, 8 Dec 2022 10:37:53 +0000 Subject: pl/math: Fix vector/SVE erf Fixing a bug that resulted in potentially random results in boring domain by saturating index at an appropriate value. --- pl/math/sv_erf_2u5.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/pl/math/sv_erf_2u5.c b/pl/math/sv_erf_2u5.c index a68b9e3..1265047 100644 --- a/pl/math/sv_erf_2u5.c +++ b/pl/math/sv_erf_2u5.c @@ -8,7 +8,7 @@ #include "sv_math.h" #if SV_SUPPORTED -#define Scale (8.0f) +#define Scale (8.0) #define AbsMask (0x7fffffffffffffff) static NOINLINE sv_f64_t @@ -33,13 +33,14 @@ __sv_erf_x (sv_f64_t x, const svbool_t pg) = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, atop, 0x3e30), 0x7ff0 - 0x3e30); /* Get sign and absolute value. */ - sv_f64_t a = svabs_f64_x (pg, x); + sv_f64_t a = sv_as_f64_u64 (svand_n_u64_x (pg, ix, AbsMask)); sv_u64_t sign = svand_n_u64_x (pg, ix, ~AbsMask); /* i = trunc(Scale*x). */ - sv_u64_t i = svcvt_u64_f64_x (pg, svmul_n_f64_x (pg, a, Scale)); + sv_f64_t a_scale = svmul_n_f64_x (pg, a, Scale); /* Saturate index of intervals. */ - i = svmin_u64_x (pg, i, sv_u64 (V_ERF_NINTS)); + svbool_t a_lt_6 = svcmplt_n_u64 (pg, atop, 0x4018); + sv_u64_t i = svcvt_u64_f64_m (sv_u64 (V_ERF_NINTS - 1), a_lt_6, a_scale); /* Load polynomial coefficients. */ sv_f64_t P_0 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[0], i); @@ -56,8 +57,9 @@ __sv_erf_x (sv_f64_t x, const svbool_t pg) /* Get shift and scale. */ sv_f64_t shift = sv_lookup_f64_x (pg, __v_erf_data.shifts, i); - /* Transform polynomial variable. */ - sv_f64_t z = sv_fma_n_f64_x (pg, Scale, a, shift); + /* Transform polynomial variable. + Set z = 0 in the boring domain to avoid overflow. */ + sv_f64_t z = svmla_f64_m (a_lt_6, shift, sv_f64 (Scale), a); /* Evaluate polynomial P(z) using level-2 Estrin. */ sv_f64_t r1 = sv_fma_f64_x (pg, z, P_1, P_0); @@ -75,10 +77,6 @@ __sv_erf_x (sv_f64_t x, const svbool_t pg) sv_f64_t y = sv_fma_f64_x (pg, z4, r5, q2); y = sv_fma_f64_x (pg, z4, y, q1); - /* Saturate y. This works because using the last interval on the boring domain - produces y > 1. 
*/ - y = svmin_n_f64_x (pg, y, 1.0); - /* y = erf(x) if x > 0, -erf(-x) otherwise. */ y = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (y), sign)); -- cgit v1.2.3 From 08d6ff3d286d707f1c83ddebd4e5b4532dfc6051 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Thu, 8 Dec 2022 11:36:23 +0000 Subject: pl/math: Fix fenv in asinh Special lanes were not being properly masked when a lane was tiny. This is now fixed. --- pl/math/v_asinh_2u5.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pl/math/v_asinh_2u5.c b/pl/math/v_asinh_2u5.c index a1bdf5b..508ced1 100644 --- a/pl/math/v_asinh_2u5.c +++ b/pl/math/v_asinh_2u5.c @@ -127,6 +127,11 @@ VPCS_ATTR v_f64_t V_NAME (asinh) (v_f64_t x) v_u64_t gt1 = v_cond_u64 (top12 >= OneTop); v_u64_t special = v_cond_u64 (top12 >= HugeBound); +#if WANT_ERRNO + v_u64_t tiny = v_cond_u64 (top12 < TinyBound); + special |= tiny; +#endif + /* Option 1: |x| >= 1. Compute asinh(x) according by asinh(x) = log(x + sqrt(x^2 + 1)). If WANT_ERRNO is enabled, sidestep special values, which will overflow, by @@ -154,7 +159,6 @@ VPCS_ATTR v_f64_t V_NAME (asinh) (v_f64_t x) if (likely (v_any_u64 (~gt1))) { #if WANT_ERRNO - v_u64_t tiny = v_cond_u64 (top12 < TinyBound); ax = v_sel_f64 (tiny | gt1, v_f64 (0), ax); #endif v_f64_t x2 = ax * ax; -- cgit v1.2.3 From 132d2f5da6155e64ff39a54fdbb46145a3892d6a Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Fri, 9 Dec 2022 07:46:37 +0000 Subject: pl/math/test: Simplify runulp.sh Small simplification - pl routines do not support different rounding modes, so there is no need to support them in runulp.sh. As a result we can also remove Ldir. --- pl/math/test/runulp.sh | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index ed45c73..14abfc6 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -11,7 +11,6 @@ set -eu # cd to bin directory. 
cd "${0%/*}" -rmodes='n' flags="${ULPFLAGS:--q}" emu="$@" @@ -22,20 +21,14 @@ FAIL=0 PASS=0 t() { - [ $r = "n" ] && Lt=$L || Lt=$Ldir - $emu ./ulp -r $r -e $Lt $flags "$@" && PASS=$((PASS+1)) || FAIL=$((FAIL+1)) + $emu ./ulp -e $L $flags "$@" && PASS=$((PASS+1)) || FAIL=$((FAIL+1)) } check() { $emu ./ulp -f -q "$@" #>/dev/null } -Ldir=0.5 -for r in $rmodes -do - L=0.6 -Ldir=0.9 t erff 0 0xffff0000 10000 t erff 0x1p-127 0x1p-26 40000 t erff -0x1p-127 -0x1p-26 40000 @@ -44,7 +37,6 @@ t erff -0x1p-26 -0x1p3 40000 t erff 0 inf 40000 L=0.30 -Ldir= t log10f 0 0xffff0000 10000 t log10f 0x1p-127 0x1p-26 50000 t log10f 0x1p-26 0x1p3 50000 @@ -52,7 +44,6 @@ t log10f 0x1p-4 0x1p4 50000 t log10f 0 inf 50000 L=1.11 -Ldir= t log10 0 0xffff000000000000 10000 t log10 0x1p-4 0x1p4 40000 t log10 0 inf 40000 @@ -64,7 +55,6 @@ t erfc -0x1p-1022 -0x1p-26 40000 t erfc 0x1p-26 0x1p5 40000 t erfc -0x1p-26 -0x1p3 40000 t erfc 0 inf 40000 -Ldir=0.5 L=1.5 t erfcf 0 0xffff0000 10000 @@ -128,7 +118,6 @@ t log1pf -0.001 -1.0 50000 t log1pf -1.0 inf 5000 L=2.80 -Ldir= t tanf 0 0xffff0000 10000 t tanf 0x1p-127 0x1p-14 50000 t tanf -0x1p-127 -0x1p-14 50000 @@ -223,11 +212,7 @@ t tanhf -0x1p-23 -0x1.205966p+3 100000 t tanhf 0x1.205966p+3 inf 100 t tanhf -0x1.205966p+3 -inf 100 -done - # vector functions -Ldir=0.5 -r='n' flags="${ULPFLAGS:--q}" runs= check __s_log10f 1 && runs=1 -- cgit v1.2.3 From bc7cc9d2a762a26b2fcbf150b3fc9c6993ffa16c Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Fri, 9 Dec 2022 12:19:38 +0000 Subject: pl/math: Add polynomial helpers Add macros for simplifying polynomial evaluation using either Horner, pairwise Horner or Estrin. Several routines have been modified to use the new helpers. Readability is improved slightly, and we expect that this will make prototyping new routines simpler. 
--- pl/math/asinh_2u5.c | 39 +++++---------------------- pl/math/asinhf_3u5.c | 13 ++------- pl/math/atan_common.h | 34 ++++------------------- pl/math/atanf_common.h | 8 ++---- pl/math/cbrtf_1u5.c | 6 ++--- pl/math/erfc_4u5.c | 23 +++------------- pl/math/erfcf.h | 38 +++++++++----------------- pl/math/erfcf_2u.c | 4 +-- pl/math/erff_1u5.c | 22 +++++++-------- pl/math/estrin.h | 16 +++++++++++ pl/math/estrin_wrap.h | 48 +++++++++++++++++++++++++++++++++ pl/math/estrinf.h | 14 ++++++++++ pl/math/expm1_2u5.c | 23 +++------------- pl/math/expm1f_1u6.c | 8 ++---- pl/math/horner.h | 14 ++++++++++ pl/math/horner_wrap.h | 34 +++++++++++++++++++++++ pl/math/hornerf.h | 14 ++++++++++ pl/math/log1p_2u.c | 13 +++++++-- pl/math/log1p_common.h | 61 ------------------------------------------ pl/math/log1pf_2u1.c | 15 +++-------- pl/math/pairwise_horner.h | 14 ++++++++++ pl/math/pairwise_horner_wrap.h | 36 +++++++++++++++++++++++++ pl/math/pairwise_hornerf.h | 14 ++++++++++ pl/math/test/runulp.sh | 2 +- pl/math/v_asinh_2u5.c | 38 ++++---------------------- pl/math/v_erfc_4u.c | 26 +++--------------- pl/math/v_erfcf_1u.c | 36 ++++--------------------- pl/math/v_log1p_2u5.c | 12 ++++++++- pl/math/v_tanf_3u2.c | 10 +++---- pl/math/v_tanhf_2u6.c | 6 ++--- 30 files changed, 299 insertions(+), 342 deletions(-) create mode 100644 pl/math/estrin.h create mode 100644 pl/math/estrin_wrap.h create mode 100644 pl/math/estrinf.h create mode 100644 pl/math/horner.h create mode 100644 pl/math/horner_wrap.h create mode 100644 pl/math/hornerf.h delete mode 100644 pl/math/log1p_common.h create mode 100644 pl/math/pairwise_horner.h create mode 100644 pl/math/pairwise_horner_wrap.h create mode 100644 pl/math/pairwise_hornerf.h diff --git a/pl/math/asinh_2u5.c b/pl/math/asinh_2u5.c index f22b342..9cbdd33 100644 --- a/pl/math/asinh_2u5.c +++ b/pl/math/asinh_2u5.c @@ -5,48 +5,17 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" +#include "estrin.h" #define AbsMask 0x7fffffffffffffff #define ExpM26 0x3e50000000000000 /* asuint64(0x1.0p-26). */ #define One 0x3ff0000000000000 /* asuint64(1.0). */ #define Exp511 0x5fe0000000000000 /* asuint64(0x1.0p511). */ #define Ln2 0x1.62e42fefa39efp-1 -#define C(i) __asinh_data.poly[i] double optr_aor_log_f64 (double); -static inline double -eval_poly (double z) -{ - /* Evaluate polynomial using Estrin scheme. */ - double p_01 = fma (z, C (1), C (0)); - double p_23 = fma (z, C (3), C (2)); - double p_45 = fma (z, C (5), C (4)); - double p_67 = fma (z, C (7), C (6)); - double p_89 = fma (z, C (9), C (8)); - double p_ab = fma (z, C (11), C (10)); - double p_cd = fma (z, C (13), C (12)); - double p_ef = fma (z, C (15), C (14)); - double p_gh = fma (z, C (17), C (16)); - - double z2 = z * z; - double p_03 = fma (z2, p_23, p_01); - double p_47 = fma (z2, p_67, p_45); - double p_8b = fma (z2, p_ab, p_89); - double p_cf = fma (z2, p_ef, p_cd); - - double z4 = z2 * z2; - double p_07 = fma (z4, p_47, p_03); - double p_8f = fma (z4, p_cf, p_8b); - - double z8 = z4 * z4; - double p_0f = fma (z8, p_8f, p_07); - - double z16 = z8 * z8; - return fma (z16, p_gh, p_0f); -} - /* Scalar double-precision asinh implementation. 
This routine uses different approaches on different intervals: @@ -86,7 +55,11 @@ asinh (double x) if (ia < One) { double x2 = x * x; - double p = eval_poly (x2); + double z2 = x2 * x2; + double z4 = z2 * z2; + double z8 = z4 * z4; +#define C(i) __asinh_data.poly[i] + double p = ESTRIN_17 (x2, z2, z4, z8, z8 * z8, C); double y = fma (p, x2 * ax, ax); return asdouble (asuint64 (y) | sign); } diff --git a/pl/math/asinhf_3u5.c b/pl/math/asinhf_3u5.c index 8aa62ad..48acdef 100644 --- a/pl/math/asinhf_3u5.c +++ b/pl/math/asinhf_3u5.c @@ -5,6 +5,7 @@ */ #include "math_config.h" +#include "estrinf.h" #define AbsMask (0x7fffffff) #define SqrtFltMax (0x1.749e96p+10f) @@ -53,17 +54,7 @@ asinhf (float x) if (ia12 < One) { float x2 = ax * ax; - float x4 = x2 * x2; - - float p_01 = fmaf (ax, C (1), C (0)); - float p_23 = fmaf (ax, C (3), C (2)); - float p_45 = fmaf (ax, C (5), C (4)); - float p_67 = fmaf (ax, C (7), C (6)); - - float p_03 = fmaf (x2, p_23, p_01); - float p_47 = fmaf (x2, p_67, p_45); - - float p = fmaf (x4, p_47, p_03); + float p = ESTRIN_7 (ax, x2, x2 * x2, C); float y = fmaf (x2, p, ax); return asfloat (asuint (y) | sign); } diff --git a/pl/math/atan_common.h b/pl/math/atan_common.h index 1690e7e..331c1bb 100644 --- a/pl/math/atan_common.h +++ b/pl/math/atan_common.h @@ -7,19 +7,18 @@ */ #include "math_config.h" +#include "estrin.h" #if V_SUPPORTED #include "v_math.h" #define DBL_T v_f64_t -#define FMA v_fma_f64 #define P(i) v_f64 (__atan_poly_data.poly[i]) #else #define DBL_T double -#define FMA fma #define P(i) __atan_poly_data.poly[i] #endif @@ -29,37 +28,14 @@ static inline DBL_T eval_poly (DBL_T z, DBL_T az, DBL_T shift) { - /* Use full Estrin scheme for P(z^2) with deg(P)=19. */ + /* Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of + full scheme to avoid underflow in x^16. */ DBL_T z2 = z * z; - /* Level 1. */ - DBL_T P_1_0 = FMA (P (1), z2, P (0)); - DBL_T P_3_2 = FMA (P (3), z2, P (2)); - DBL_T P_5_4 = FMA (P (5), z2, P (4)); - DBL_T P_7_6 = FMA (P (7), z2, P (6)); - DBL_T P_9_8 = FMA (P (9), z2, P (8)); - DBL_T P_11_10 = FMA (P (11), z2, P (10)); - DBL_T P_13_12 = FMA (P (13), z2, P (12)); - DBL_T P_15_14 = FMA (P (15), z2, P (14)); - DBL_T P_17_16 = FMA (P (17), z2, P (16)); - DBL_T P_19_18 = FMA (P (19), z2, P (18)); - - /* Level 2. */ DBL_T x2 = z2 * z2; - DBL_T P_3_0 = FMA (P_3_2, x2, P_1_0); - DBL_T P_7_4 = FMA (P_7_6, x2, P_5_4); - DBL_T P_11_8 = FMA (P_11_10, x2, P_9_8); - DBL_T P_15_12 = FMA (P_15_14, x2, P_13_12); - DBL_T P_19_16 = FMA (P_19_18, x2, P_17_16); - - /* Level 3. */ DBL_T x4 = x2 * x2; - DBL_T P_7_0 = FMA (P_7_4, x4, P_3_0); - DBL_T P_15_8 = FMA (P_15_12, x4, P_11_8); - - /* Level 4. */ DBL_T x8 = x4 * x4; - DBL_T y = FMA (P_19_16, x8, P_15_8); - y = FMA (y, x8, P_7_0); + DBL_T y + = FMA (ESTRIN_11_ (z2, x2, x4, x8, P, 8), x8, ESTRIN_7 (z2, x2, x4, P)); /* Finalize. y = shift + z + z^3 * P(z^2). */ y = FMA (y, z2 * az, az); diff --git a/pl/math/atanf_common.h b/pl/math/atanf_common.h index 436b88b..3038e54 100644 --- a/pl/math/atanf_common.h +++ b/pl/math/atanf_common.h @@ -10,19 +10,18 @@ #define PL_MATH_ATANF_COMMON_H #include "math_config.h" +#include "estrinf.h" #if V_SUPPORTED #include "v_math.h" #define FLT_T v_f32_t -#define FMA v_fma_f32 #define P(i) v_f32 (__atanf_poly_data.poly[i]) #else #define FLT_T float -#define FMA fmaf #define P(i) __atanf_poly_data.poly[i] #endif @@ -42,10 +41,7 @@ eval_poly (FLT_T z, FLT_T az, FLT_T shift) FLT_T z4 = z2 * z2; /* Then assemble polynomial. 
*/ - FLT_T y - = FMA (z4, - z4 * FMA (z4, (FMA (z2, P (7), P (6))), (FMA (z2, P (5), P (4)))), - FMA (z4, (FMA (z2, P (3), P (2))), (FMA (z2, P (1), P (0))))); + FLT_T y = FMA (z4, z4 * ESTRIN_3_ (z2, z4, P, 4), ESTRIN_3 (z2, z4, P)); /* Finalize: y = shift + z * P(z^2). */ diff --git a/pl/math/cbrtf_1u5.c b/pl/math/cbrtf_1u5.c index 73b9049..d544a68 100644 --- a/pl/math/cbrtf_1u5.c +++ b/pl/math/cbrtf_1u5.c @@ -8,6 +8,7 @@ #include #include "math_config.h" +#include "estrinf.h" #define AbsMask 0x7fffffff #define SignMask 0x80000000 @@ -40,10 +41,7 @@ cbrtf (float x) /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is, the less accurate the next stage of the algorithm needs to be. An order-4 polynomial is enough for one Newton iteration. */ - float p_01 = fmaf (C (1), m, C (0)); - float p_23 = fmaf (C (3), m, C (2)); - float p = fmaf (m * m, p_23, p_01); - + float p = ESTRIN_3 (m, m * m, C); /* One iteration of Newton's method for iteratively approximating cbrt. */ float m_by_3 = m / 3; float a = fmaf (TwoThirds, p, m_by_3 / (p * p)); diff --git a/pl/math/erfc_4u5.c b/pl/math/erfc_4u5.c index 003a0cc..8088562 100644 --- a/pl/math/erfc_4u5.c +++ b/pl/math/erfc_4u5.c @@ -9,6 +9,7 @@ #include #include #include "math_config.h" +#include "pairwise_horner.h" #define AbsMask (0x7fffffffffffffff) @@ -19,28 +20,12 @@ double __exp_dd (double x, double xtail); -/* Evaluate order-12 polynomials using - pairwise summation and Horner scheme - in double precision. */ static inline double eval_poly_horner (double z, int i) { - double r1, r2, r3, r4, r5, r6, z2; - r1 = fma (z, PX[i][1], PX[i][0]); - r2 = fma (z, PX[i][3], PX[i][2]); - r3 = fma (z, PX[i][5], PX[i][4]); - r4 = fma (z, PX[i][7], PX[i][6]); - r5 = fma (z, PX[i][9], PX[i][8]); - r6 = fma (z, PX[i][11], PX[i][10]); - z2 = z * z; - double r = PX[i][12]; - r = fma (z2, r, r6); - r = fma (z2, r, r5); - r = fma (z2, r, r4); - r = fma (z2, r, r3); - r = fma (z2, r, r2); - r = fma (z2, r, r1); - return r; + double z2 = z * z; +#define C(j) PX[i][j] + return PAIRWISE_HORNER_12 (z, z2, C); } /* Accurate evaluation of exp(x^2) diff --git a/pl/math/erfcf.h b/pl/math/erfcf.h index 6adc6b4..98ead38 100644 --- a/pl/math/erfcf.h +++ b/pl/math/erfcf.h @@ -8,39 +8,24 @@ #ifndef PL_MATH_ERFCF_H #define PL_MATH_ERFCF_H -#include +#include "math_config.h" + +#define FMA fma +#include "estrin_wrap.h" /* Accurate exponential from optimized-routines. */ double __exp_dd (double x, double xtail); -/* Evaluate order-12 polynomials using pairwise summation and Horner scheme in - double precision. 
*/ static inline double -eval_poly_horner_lvl2 (double z, const double *coeff) +eval_poly (double z, const double *coeff) { - double r1, r2, r3, r4, r5, r6, r7, r8; - double R1, R2, R3, R4; - double Q1, Q2; - double z2, z4, z8; - z2 = z * z; - r1 = fma (z, coeff[1], coeff[0]); - r2 = fma (z, coeff[3], coeff[2]); - z4 = z2 * z2; - z8 = z4 * z4; - R1 = fma (z2, r2, r1); - r3 = fma (z, coeff[5], coeff[4]); - r4 = fma (z, coeff[7], coeff[6]); - R2 = fma (z2, r4, r3); - Q1 = fma (z4, R2, R1); - r5 = fma (z, coeff[9], coeff[8]); - r6 = fma (z, coeff[11], coeff[10]); - R3 = fma (z2, r6, r5); - r7 = fma (z, coeff[13], coeff[12]); - r8 = fma (z, coeff[15], coeff[14]); - R4 = fma (z2, r8, r7); - Q2 = fma (z4, R4, R3); - return fma (z8, Q2, Q1); + double z2 = z * z; + double z4 = z2 * z2; + double z8 = z4 * z4; +#define C(i) coeff[i] + return ESTRIN_15 (z, z2, z4, z8, C); +#undef C } static inline double @@ -49,4 +34,5 @@ eval_exp_mx2 (double x) return __exp_dd (-(x * x), 0.0); } +#undef FMA #endif // PL_MATH_ERFCF_H diff --git a/pl/math/erfcf_2u.c b/pl/math/erfcf_2u.c index 80dba83..8d4bba1 100644 --- a/pl/math/erfcf_2u.c +++ b/pl/math/erfcf_2u.c @@ -21,7 +21,7 @@ approx_erfcf_hi (float x, uint32_t sign, const double *coeff) /* Polynomial contribution. */ double z = (double) fabs (x); - float p = (float) eval_poly_horner_lvl2 (z, coeff); + float p = (float) eval_poly (z, coeff); /* Gaussian contribution. */ float e_mx2 = (float) eval_exp_mx2 (z); @@ -34,7 +34,7 @@ approx_erfcf_lo (float x, uint32_t sign, const double *coeff) { /* Polynomial contribution. */ double z = (double) fabs (x); - float p = (float) eval_poly_horner_lvl2 (z, coeff); + float p = (float) eval_poly (z, coeff); /* Gaussian contribution. */ float e_mx2 = (float) eval_exp_mx2 (z); diff --git a/pl/math/erff_1u5.c b/pl/math/erff_1u5.c index 1073603..bad68a6 100644 --- a/pl/math/erff_1u5.c +++ b/pl/math/erff_1u5.c @@ -7,7 +7,10 @@ #include #include + #include "math_config.h" +#include "hornerf.h" +#include "estrinf.h" #define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f #define A __erff_data.erff_poly_A @@ -26,7 +29,7 @@ top12 (float x) float erff (float x) { - float r, x2, u; + float r, x2; /* Get top word. */ uint32_t ix = asuint (x); @@ -54,23 +57,18 @@ erff (float x) /* Normalized cases (|x| < 0.921875) - Use Horner scheme for x+x*P(x^2). */ - r = A[5]; - r = fmaf (r, x2, A[4]); - r = fmaf (r, x2, A[3]); - r = fmaf (r, x2, A[2]); - r = fmaf (r, x2, A[1]); - r = fmaf (r, x2, A[0]); - r = fmaf (r, x, x); +#define C(i) A[i] + r = fmaf (HORNER_5 (x2, C), x, x); +#undef C } else if (ia12 < 0x408) { /* |x| < 4.0 - Use a custom Estrin scheme. */ float a = fabsf (x); /* Use Estrin scheme on high order (small magnitude) coefficients. */ - r = fmaf (B[6], a, B[5]); - u = fmaf (B[4], a, B[3]); - x2 = x * x; - r = fmaf (r, x2, u); +#define C(i) B[i] + r = ESTRIN_3_ (a, x * x, C, 3); +#undef C /* Then switch to pure Horner scheme. */ r = fmaf (r, a, B[2]); r = fmaf (r, a, B[1]); diff --git a/pl/math/estrin.h b/pl/math/estrin.h new file mode 100644 index 0000000..89df329 --- /dev/null +++ b/pl/math/estrin.h @@ -0,0 +1,16 @@ +/* + * Helper macros for double-precision Estrin polynomial evaluation. + * + * Copyright (c) 2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#if V_SUPPORTED +#define FMA v_fma_f64 +#else +#define FMA fma +#endif + +#include "estrin_wrap.h" diff --git a/pl/math/estrin_wrap.h b/pl/math/estrin_wrap.h new file mode 100644 index 0000000..93af2ab --- /dev/null +++ b/pl/math/estrin_wrap.h @@ -0,0 +1,48 @@ +/* + * Helper macros for double-precision Estrin polynomial evaluation. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +// clang-format off +#define ESTRIN_1_(x, c, i) FMA(x, c(1 + i), c(i)) +#define ESTRIN_2_(x, x2, c, i) FMA(x2, c(2 + i), ESTRIN_1_(x, c, i)) +#define ESTRIN_3_(x, x2, c, i) FMA(x2, ESTRIN_1_(x, c, 2 + i), ESTRIN_1_(x, c, i)) +#define ESTRIN_4_(x, x2, x4, c, i) FMA(x4, c(4 + i), ESTRIN_3_(x, x2, c, i)) +#define ESTRIN_5_(x, x2, x4, c, i) FMA(x4, ESTRIN_1_(x, c, 4 + i), ESTRIN_3_(x, x2, c, i)) +#define ESTRIN_6_(x, x2, x4, c, i) FMA(x4, ESTRIN_2_(x, x2, c, 4 + i), ESTRIN_3_(x, x2, c, i)) +#define ESTRIN_7_(x, x2, x4, c, i) FMA(x4, ESTRIN_3_(x, x2, c, 4 + i), ESTRIN_3_(x, x2, c, i)) +#define ESTRIN_8_(x, x2, x4, x8, c, i) FMA(x8, c(8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_9_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_1_(x, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_10_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_2_(x, x2, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_11_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_3_(x, x2, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_12_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_4_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_13_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_5_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_14_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_6_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_15_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_7_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_16_(x, x2, x4, x8, x16, c, i) FMA(x16, c(16 + i), ESTRIN_15_(x, x2, x4, x8, c, i)) +#define ESTRIN_17_(x, x2, x4, x8, x16, c, i) FMA(x16, ESTRIN_1_(x, c, 16 + i), ESTRIN_15_(x, x2, x4, x8, c, i)) +#define ESTRIN_18_(x, x2, x4, x8, x16, c, i) FMA(x16, ESTRIN_2_(x, x2, c, 16 + i), ESTRIN_15_(x, x2, x4, x8, c, i)) +#define ESTRIN_19_(x, x2, x4, x8, x16, c, i) FMA(x16, ESTRIN_3_(x, x2, c, 16 + i), ESTRIN_15_(x, x2, x4, x8, c, i)) + +#define ESTRIN_1(x, c) ESTRIN_1_(x, c, 0) +#define ESTRIN_2(x, x2, c) ESTRIN_2_(x, x2, c, 0) +#define ESTRIN_3(x, x2, c) ESTRIN_3_(x, x2, c, 0) +#define ESTRIN_4(x, x2, x4, c) ESTRIN_4_(x, x2, x4, c, 0) +#define ESTRIN_5(x, x2, x4, c) ESTRIN_5_(x, x2, x4, c, 0) +#define ESTRIN_6(x, x2, x4, c) ESTRIN_6_(x, x2, x4, c, 0) +#define ESTRIN_7(x, x2, x4, c) ESTRIN_7_(x, x2, x4, c, 0) +#define ESTRIN_8(x, x2, x4, x8, c) ESTRIN_8_(x, x2, x4, x8, c, 0) +#define ESTRIN_9(x, x2, x4, x8, c) ESTRIN_9_(x, x2, x4, x8, c, 0) +#define ESTRIN_10(x, x2, x4, x8, c) ESTRIN_10_(x, x2, x4, x8, c, 0) +#define ESTRIN_11(x, x2, x4, x8, c) ESTRIN_11_(x, x2, x4, x8, c, 0) +#define ESTRIN_12(x, x2, x4, x8, c) ESTRIN_12_(x, x2, x4, x8, c, 0) +#define ESTRIN_13(x, x2, x4, x8, c) ESTRIN_13_(x, x2, x4, x8, c, 0) +#define ESTRIN_14(x, x2, x4, x8, c) ESTRIN_14_(x, x2, x4, x8, c, 0) +#define ESTRIN_15(x, x2, x4, x8, c) ESTRIN_15_(x, x2, x4, x8, c, 0) +#define ESTRIN_16(x, x2, x4, x8, x16, c) ESTRIN_16_(x, x2, x4, x8, x16, c, 0) +#define ESTRIN_17(x, x2, x4, x8, x16, c) ESTRIN_17_(x, x2, x4, x8, x16, c, 0) +#define ESTRIN_18(x, x2, x4, x8, x16, c) 
ESTRIN_18_(x, x2, x4, x8, x16, c, 0)
+#define ESTRIN_19(x, x2, x4, x8, x16, c) ESTRIN_19_(x, x2, x4, x8, x16, c, 0)
+// clang-format on
diff --git a/pl/math/estrinf.h b/pl/math/estrinf.h
new file mode 100644
index 0000000..be52ab5
--- /dev/null
+++ b/pl/math/estrinf.h
@@ -0,0 +1,14 @@
+/*
+ * Helper macros for single-precision Estrin polynomial evaluation.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#if V_SUPPORTED
+#define FMA v_fma_f32
+#else
+#define FMA fmaf
+#endif
+
+#include "estrin_wrap.h"
diff --git a/pl/math/expm1_2u5.c b/pl/math/expm1_2u5.c
index c701d7e..98ef078 100644
--- a/pl/math/expm1_2u5.c
+++ b/pl/math/expm1_2u5.c
@@ -6,6 +6,7 @@
  */
 
 #include "math_config.h"
+#include "estrin.h"
 
 #define InvLn2 0x1.71547652b82fep0
 #define Ln2hi 0x1.62e42fefa39efp-1
@@ -19,25 +20,6 @@
 
 #define C(i) __expm1_poly[i]
 
-static inline double
-eval_poly (double f, double f2)
-{
-  /* Evaluate custom polynomial using Estrin scheme. */
-  double p_01 = fma (f, C (1), C (0));
-  double p_23 = fma (f, C (3), C (2));
-  double p_45 = fma (f, C (5), C (4));
-  double p_67 = fma (f, C (7), C (6));
-  double p_89 = fma (f, C (9), C (8));
-
-  double p_03 = fma (f2, p_23, p_01);
-  double p_47 = fma (f2, p_67, p_45);
-  double p_8a = fma (f2, C (10), p_89);
-
-  double f4 = f2 * f2;
-  double p_07 = fma (f4, p_47, p_03);
-  return fma (f4 * f4, p_8a, p_07);
-}
-
 /* Approximation for exp(x) - 1 using polynomial on a reduced interval.
    The maximum observed error is 2.17 ULP:
    expm1(0x1.63f90a866748dp-2) got 0x1.a9af56603878ap-2
@@ -80,7 +62,8 @@ expm1 (double x)
      So we calculate the polynomial P(f) = a + bf + cf^2 + ...
      and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
   double f2 = f * f;
-  double p = fma (f2, eval_poly (f, f2), f);
+  double f4 = f2 * f2;
+  double p = fma (f2, ESTRIN_10 (f, f2, f4, f4 * f4, C), f);
 
   /* Assemble the result, using a slight rearrangement to achieve acceptable
      accuracy.
diff --git a/pl/math/expm1f_1u6.c b/pl/math/expm1f_1u6.c
index 44981ca..0904652 100644
--- a/pl/math/expm1f_1u6.c
+++ b/pl/math/expm1f_1u6.c
@@ -6,6 +6,7 @@
  */
 
 #include "math_config.h"
+#include "hornerf.h"
 
 #define Shift (0x1.8p23f)
 #define InvLn2 (0x1.715476p+0f)
@@ -59,12 +60,7 @@ expm1f (float x)
      x + ax^2 + bx^3 + cx^4 ....
      So we calculate the polynomial P(f) = a + bf + cf^2 + ...
      and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
-  float p = fmaf (C (4), f, C (3));
-  p = fmaf (p, f, C (2));
-  p = fmaf (p, f, C (1));
-  p = fmaf (p, f, C (0));
-  p = fmaf (f * f, p, f);
-
+  float p = fmaf (f * f, HORNER_4 (f, C), f);
   /* Assemble the result, using a slight rearrangement to achieve acceptable
      accuracy.
      expm1(x) ~= 2^i * (p + 1) - 1
diff --git a/pl/math/horner.h b/pl/math/horner.h
new file mode 100644
index 0000000..4dbc122
--- /dev/null
+++ b/pl/math/horner.h
@@ -0,0 +1,14 @@
+/*
+ * Helper macros for double-precision Horner polynomial evaluation.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#if V_SUPPORTED
+#define FMA v_fma_f64
+#else
+#define FMA fma
+#endif
+
+#include "horner_wrap.h"
diff --git a/pl/math/horner_wrap.h b/pl/math/horner_wrap.h
new file mode 100644
index 0000000..892d63b
--- /dev/null
+++ b/pl/math/horner_wrap.h
@@ -0,0 +1,34 @@
+/*
+ * Helper macros for Horner polynomial evaluation.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+// clang-format off
+#define HORNER_1_(x, c, i) FMA(c(i + 1), x, c(i))
+#define HORNER_2_(x, c, i) FMA(HORNER_1_ (x, c, i + 1), x, c(i))
+#define HORNER_3_(x, c, i) FMA(HORNER_2_ (x, c, i + 1), x, c(i))
+#define HORNER_4_(x, c, i) FMA(HORNER_3_ (x, c, i + 1), x, c(i))
+#define HORNER_5_(x, c, i) FMA(HORNER_4_ (x, c, i + 1), x, c(i))
+#define HORNER_6_(x, c, i) FMA(HORNER_5_ (x, c, i + 1), x, c(i))
+#define HORNER_7_(x, c, i) FMA(HORNER_6_ (x, c, i + 1), x, c(i))
+#define HORNER_8_(x, c, i) FMA(HORNER_7_ (x, c, i + 1), x, c(i))
+#define HORNER_9_(x, c, i) FMA(HORNER_8_ (x, c, i + 1), x, c(i))
+#define HORNER_10_(x, c, i) FMA(HORNER_9_ (x, c, i + 1), x, c(i))
+#define HORNER_11_(x, c, i) FMA(HORNER_10_(x, c, i + 1), x, c(i))
+#define HORNER_12_(x, c, i) FMA(HORNER_11_(x, c, i + 1), x, c(i))
+
+#define HORNER_1(x, c) HORNER_1_ (x, c, 0)
+#define HORNER_2(x, c) HORNER_2_ (x, c, 0)
+#define HORNER_3(x, c) HORNER_3_ (x, c, 0)
+#define HORNER_4(x, c) HORNER_4_ (x, c, 0)
+#define HORNER_5(x, c) HORNER_5_ (x, c, 0)
+#define HORNER_6(x, c) HORNER_6_ (x, c, 0)
+#define HORNER_7(x, c) HORNER_7_ (x, c, 0)
+#define HORNER_8(x, c) HORNER_8_ (x, c, 0)
+#define HORNER_9(x, c) HORNER_9_ (x, c, 0)
+#define HORNER_10(x, c) HORNER_10_(x, c, 0)
+#define HORNER_11(x, c) HORNER_11_(x, c, 0)
+#define HORNER_12(x, c) HORNER_12_(x, c, 0)
+// clang-format on
diff --git a/pl/math/hornerf.h b/pl/math/hornerf.h
new file mode 100644
index 0000000..bec1593
--- /dev/null
+++ b/pl/math/hornerf.h
@@ -0,0 +1,14 @@
+/*
+ * Helper macros for single-precision Horner polynomial evaluation.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#if V_SUPPORTED
+#define FMA v_fma_f32
+#else
+#define FMA fmaf
+#endif
+
+#include "horner_wrap.h"
diff --git a/pl/math/log1p_2u.c b/pl/math/log1p_2u.c
index b9c7e9e..ade5d87 100644
--- a/pl/math/log1p_2u.c
+++ b/pl/math/log1p_2u.c
@@ -5,8 +5,7 @@
  */
 
 #include "math_config.h"
-
-#include "log1p_common.h"
+#include "estrin.h"
 
 #define Ln2Hi 0x1.62e42fefa3800p-1
 #define Ln2Lo 0x1.ef35793c76730p-45
@@ -19,6 +18,16 @@
 #define Rt2MOne 0x3fda827999fcef32
 #define AbsMask 0x7fffffffffffffff
 #define ExpM63 0x3c00
+#define C(i) __log1p_data.coeffs[i]
+
+static inline double
+eval_poly (double f)
+{
+  double f2 = f * f;
+  double f4 = f2 * f2;
+  double f8 = f4 * f4;
+  return ESTRIN_18 (f, f2, f4, f8, f8 * f8, C);
+}
 
 /* log1p approximation using polynomial on reduced interval. Largest
    observed errors are near the lower boundary of the region where k
diff --git a/pl/math/log1p_common.h b/pl/math/log1p_common.h
deleted file mode 100644
index 24e6f20..0000000
--- a/pl/math/log1p_common.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Double-precision polynomial evaluation function for scalar and vector
- * log1p(x)
- *
- * Copyright (c) 2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#ifndef PL_MATH_LOG1P_COMMON_H
-#define PL_MATH_LOG1P_COMMON_H
-
-#include "math_config.h"
-
-#if V_SUPPORTED
-
-#include "v_math.h"
-
-#define DBL_T v_f64_t
-#define FMA v_fma_f64
-#define C(i) v_f64 (__log1p_data.coeffs[i])
-
-#else
-
-#define DBL_T double
-#define FMA fma
-#define C(i) __log1p_data.coeffs[i]
-
-#endif
-
-static inline DBL_T
-eval_poly (DBL_T f)
-{
-  /* Evaluate polynomial using Estrin's method.
*/ - DBL_T p_01 = FMA (f, C (1), C (0)); - DBL_T p_23 = FMA (f, C (3), C (2)); - DBL_T p_45 = FMA (f, C (5), C (4)); - DBL_T p_67 = FMA (f, C (7), C (6)); - DBL_T p_89 = FMA (f, C (9), C (8)); - DBL_T p_ab = FMA (f, C (11), C (10)); - DBL_T p_cd = FMA (f, C (13), C (12)); - DBL_T p_ef = FMA (f, C (15), C (14)); - DBL_T p_gh = FMA (f, C (17), C (16)); - - DBL_T f2 = f * f; - DBL_T p_03 = FMA (f2, p_23, p_01); - DBL_T p_47 = FMA (f2, p_67, p_45); - DBL_T p_8b = FMA (f2, p_ab, p_89); - DBL_T p_cf = FMA (f2, p_ef, p_cd); - DBL_T p_gi = FMA (f2, C (18), p_gh); - - DBL_T f4 = f2 * f2; - DBL_T p_07 = FMA (f4, p_47, p_03); - DBL_T p_8f = FMA (f4, p_cf, p_8b); - - DBL_T f8 = f4 * f4; - DBL_T p_0f = FMA (f8, p_8f, p_07); - - return FMA (f8 * f8, p_gi, p_0f); -} - -#endif // PL_MATH_LOG1P_COMMON_H diff --git a/pl/math/log1pf_2u1.c b/pl/math/log1pf_2u1.c index 5b0d542..9b7cb94 100644 --- a/pl/math/log1pf_2u1.c +++ b/pl/math/log1pf_2u1.c @@ -5,6 +5,7 @@ */ #include "math_config.h" +#include "hornerf.h" #define Ln2 (0x1.62e43p-1f) #define SignMask (0x80000000) @@ -21,8 +22,8 @@ eval_poly (float m, uint32_t e) { #ifdef LOG1PF_2U5 - /* 2.5 ulp variant. Approximate log(1+m) on [-0.25, 0.5] using Estrin - scheme. */ + /* 2.5 ulp variant. Approximate log(1+m) on [-0.25, 0.5] using + slightly modified Estrin scheme (no x^0 term, and x term is just x). */ float p_12 = fmaf (m, C (1), C (0)); float p_34 = fmaf (m, C (3), C (2)); float p_56 = fmaf (m, C (5), C (4)); @@ -49,15 +50,7 @@ eval_poly (float m, uint32_t e) x + C1 * x^2 + C2 * x^3 + C3 * x^4 + ... Hence approximation has the form m + m^2 * P(m) where P(x) = C1 + C2 * x + C3 * x^2 + ... . */ - float p = fmaf (C (8), m, C (7)); - p = fmaf (p, m, C (6)); - p = fmaf (p, m, C (5)); - p = fmaf (p, m, C (4)); - p = fmaf (p, m, C (3)); - p = fmaf (p, m, C (2)); - p = fmaf (p, m, C (1)); - p = fmaf (p, m, C (0)); - return fmaf (m, m * p, m); + return fmaf (m, m * HORNER_8 (m, C), m); #else #error No log1pf approximation exists with the requested precision. Options are 13 or 25. diff --git a/pl/math/pairwise_horner.h b/pl/math/pairwise_horner.h new file mode 100644 index 0000000..bee7592 --- /dev/null +++ b/pl/math/pairwise_horner.h @@ -0,0 +1,14 @@ +/* + * Helper macros for double-precision pairwise Horner polynomial evaluation. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#if V_SUPPORTED +#define FMA v_fma_f64 +#else +#define FMA fma +#endif + +#include "pairwise_horner_wrap.h" diff --git a/pl/math/pairwise_horner_wrap.h b/pl/math/pairwise_horner_wrap.h new file mode 100644 index 0000000..5bc287b --- /dev/null +++ b/pl/math/pairwise_horner_wrap.h @@ -0,0 +1,36 @@ +/* + * Helper macros for pairwise Horner polynomial evaluation. + * + * Copyright (c) 2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+// clang-format off
+#define PW_HORNER_1_(x, c, i) FMA(x, c(i + 1), c(i))
+#define PW_HORNER_3_(x, x2, c, i) FMA(x2, PW_HORNER_1_(x, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_5_(x, x2, c, i) FMA(x2, PW_HORNER_3_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_7_(x, x2, c, i) FMA(x2, PW_HORNER_5_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_9_(x, x2, c, i) FMA(x2, PW_HORNER_7_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_11_(x, x2, c, i) FMA(x2, PW_HORNER_9_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+
+#define PAIRWISE_HORNER_1(x, c) PW_HORNER_1_ (x, c, 0)
+#define PAIRWISE_HORNER_3(x, x2, c) PW_HORNER_3_ (x, x2, c, 0)
+#define PAIRWISE_HORNER_5(x, x2, c) PW_HORNER_5_ (x, x2, c, 0)
+#define PAIRWISE_HORNER_7(x, x2, c) PW_HORNER_7_ (x, x2, c, 0)
+#define PAIRWISE_HORNER_9(x, x2, c) PW_HORNER_9_ (x, x2, c, 0)
+#define PAIRWISE_HORNER_11(x, x2, c) PW_HORNER_11_(x, x2, c, 0)
+
+#define PW_HORNER_2_(x, x2, c, i) FMA(x2, c(i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_4_(x, x2, c, i) FMA(x2, PW_HORNER_2_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_6_(x, x2, c, i) FMA(x2, PW_HORNER_4_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_8_(x, x2, c, i) FMA(x2, PW_HORNER_6_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_10_(x, x2, c, i) FMA(x2, PW_HORNER_8_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_12_(x, x2, c, i) FMA(x2, PW_HORNER_10_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+
+#define PAIRWISE_HORNER_2(x, x2, c) PW_HORNER_2_ (x, x2, c, 0)
+#define PAIRWISE_HORNER_4(x, x2, c) PW_HORNER_4_ (x, x2, c, 0)
+#define PAIRWISE_HORNER_6(x, x2, c) PW_HORNER_6_ (x, x2, c, 0)
+#define PAIRWISE_HORNER_8(x, x2, c) PW_HORNER_8_(x, x2, c, 0)
+#define PAIRWISE_HORNER_10(x, x2, c) PW_HORNER_10_(x, x2, c, 0)
+#define PAIRWISE_HORNER_12(x, x2, c) PW_HORNER_12_(x, x2, c, 0)
+// clang-format on
diff --git a/pl/math/pairwise_hornerf.h b/pl/math/pairwise_hornerf.h
new file mode 100644
index 0000000..a8aa4d1
--- /dev/null
+++ b/pl/math/pairwise_hornerf.h
@@ -0,0 +1,14 @@
+/*
+ * Helper macros for single-precision pairwise Horner polynomial evaluation.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#if V_SUPPORTED
+#define FMA v_fma_f32
+#else
+#define FMA fmaf
+#endif
+
+#include "pairwise_horner_wrap.h"
diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh
index 14abfc6..bd4c4c2 100755
--- a/pl/math/test/runulp.sh
+++ b/pl/math/test/runulp.sh
@@ -789,7 +789,7 @@ cbrtf __vn_cbrtf $runvn fenv
 cbrtf _ZGVnN4v_cbrtf $runvn fenv
 asinh __s_asinh $runs fenv
 # Test vector asinh 3 times, with control lane < 1, > 1 and special.
-# Ensures the v_sel is choosing the right option in all cases. 
+# Ensures the v_sel is choosing the right option in all cases.
 asinh __v_asinh $runv fenv -c 0.5
 asinh __vn_asinh $runvn fenv -c 0.5
 asinh _ZGVnN2v_asinh $runvn fenv -c 0.5
diff --git a/pl/math/v_asinh_2u5.c b/pl/math/v_asinh_2u5.c
index 508ced1..974e6df 100644
--- a/pl/math/v_asinh_2u5.c
+++ b/pl/math/v_asinh_2u5.c
@@ -5,6 +5,7 @@
  */
 
 #include "v_math.h"
+#include "estrin.h"
 
 #if V_SUPPORTED
 
@@ -74,38 +75,6 @@ log_inline (v_f64_t x)
   return y;
 }
 
-static inline v_f64_t
-eval_poly (v_f64_t z)
-{
-  /* Custom polynomial, shared with scalar routine, for calculating asinh(x) in
-     [2^-26, 1]. Evaluated with Estrin scheme.
*/ - v_f64_t p_01 = v_fma_f64 (z, C (1), C (0)); - v_f64_t p_23 = v_fma_f64 (z, C (3), C (2)); - v_f64_t p_45 = v_fma_f64 (z, C (5), C (4)); - v_f64_t p_67 = v_fma_f64 (z, C (7), C (6)); - v_f64_t p_89 = v_fma_f64 (z, C (9), C (8)); - v_f64_t p_ab = v_fma_f64 (z, C (11), C (10)); - v_f64_t p_cd = v_fma_f64 (z, C (13), C (12)); - v_f64_t p_ef = v_fma_f64 (z, C (15), C (14)); - v_f64_t p_gh = v_fma_f64 (z, C (17), C (16)); - - v_f64_t z2 = z * z; - v_f64_t p_03 = v_fma_f64 (z2, p_23, p_01); - v_f64_t p_47 = v_fma_f64 (z2, p_67, p_45); - v_f64_t p_8b = v_fma_f64 (z2, p_ab, p_89); - v_f64_t p_cf = v_fma_f64 (z2, p_ef, p_cd); - - v_f64_t z4 = z2 * z2; - v_f64_t p_07 = v_fma_f64 (z4, p_47, p_03); - v_f64_t p_8f = v_fma_f64 (z4, p_cf, p_8b); - - v_f64_t z8 = z4 * z4; - v_f64_t p_0f = v_fma_f64 (z8, p_8f, p_07); - - v_f64_t z16 = z8 * z8; - return v_fma_f64 (z16, p_gh, p_0f); -} - /* Double-precision implementation of vector asinh(x). asinh is very sensitive around 1, so it is impractical to devise a single low-cost algorithm which is sufficiently accurate on a wide range of input. @@ -162,7 +131,10 @@ VPCS_ATTR v_f64_t V_NAME (asinh) (v_f64_t x) ax = v_sel_f64 (tiny | gt1, v_f64 (0), ax); #endif v_f64_t x2 = ax * ax; - v_f64_t p = eval_poly (x2); + v_f64_t z2 = x2 * x2; + v_f64_t z4 = z2 * z2; + v_f64_t z8 = z4 * z4; + v_f64_t p = ESTRIN_17 (x2, z2, z4, z8, z8 * z8, C); option_2 = v_fma_f64 (p, x2 * ax, ax); #if WANT_ERRNO option_2 = v_sel_f64 (tiny, x, option_2); diff --git a/pl/math/v_erfc_4u.c b/pl/math/v_erfc_4u.c index 603d8f5..80e11e7 100644 --- a/pl/math/v_erfc_4u.c +++ b/pl/math/v_erfc_4u.c @@ -7,6 +7,7 @@ #include "math_config.h" #include "v_math.h" +#include "horner.h" #if V_SUPPORTED /* Accurate exponential (vector variant of exp_dd). */ @@ -56,28 +57,6 @@ lookup (v_u64_t i) return e; } -/* Evaluate order-12 polynomials using pairwise summation and Horner - scheme. */ -static inline v_f64_t -v_eval_poly (v_f64_t z, struct entry e) -{ - v_f64_t r = e.P[12]; - r = v_fma_f64 (z, r, e.P[11]); - r = v_fma_f64 (z, r, e.P[10]); - r = v_fma_f64 (z, r, e.P[9]); - r = v_fma_f64 (z, r, e.P[8]); - r = v_fma_f64 (z, r, e.P[7]); - r = v_fma_f64 (z, r, e.P[6]); - r = v_fma_f64 (z, r, e.P[5]); - r = v_fma_f64 (z, r, e.P[4]); - r = v_fma_f64 (z, r, e.P[3]); - r = v_fma_f64 (z, r, e.P[2]); - r = v_fma_f64 (z, r, e.P[1]); - r = v_fma_f64 (z, r, e.P[0]); - - return r; -} - /* Accurate evaluation of exp(x^2) using compensated product (x^2 ~ x*x + e2) and custom exp(y+d) routine for small corrections d< Date: Fri, 9 Dec 2022 18:45:54 +0000 Subject: optimized-routines-mem: Share with bionic Migrate the libc dependencies for Rust to Bionic but keep using this library as a back-end for arm64 so update the visibility to point to libc instead of client code directly. 
Test: m pvmfw_bin && atest vmbase_example.integration_test Change-Id: I0e8b2ef862fcb47fcb66494aa180a7e66575a0a7 --- Android.bp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Android.bp b/Android.bp index 3df808c..aaad058 100644 --- a/Android.bp +++ b/Android.bp @@ -189,7 +189,7 @@ cc_library_static { ], }, }, - visibility: ["//packages/modules/Virtualization:__subpackages__"], + visibility: ["//bionic/libc"], } // adb shell "/data/nativetest64/mathtest/mathtest /data/nativetest64/mathtest/test/testcases/directed/*" -- cgit v1.2.3 From b490d4ce3ca0090833e26a12bd8f2afb8d4417f1 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Tue, 13 Dec 2022 09:26:56 +0000 Subject: pl/math: Set fenv flags in Neon log1pf New behaviour is hidden behind WANT_ERRNO config option. --- pl/math/test/runulp.sh | 8 ++++---- pl/math/v_log1pf_2u1.c | 39 ++++++++++++++++++++++++--------------- 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index bd4c4c2..3d644ce 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -747,10 +747,10 @@ log10f __s_log10f $runs log10f __v_log10f $runv log10f __vn_log10f $runvn log10f _ZGVnN4v_log10f $runvn -log1pf __s_log1pf $runs -log1pf __v_log1pf $runv -log1pf __vn_log1pf $runvn -log1pf _ZGVnN4v_log1pf $runvn +log1pf __s_log1pf $runs fenv +log1pf __v_log1pf $runv fenv +log1pf __vn_log1pf $runvn fenv +log1pf _ZGVnN4v_log1pf $runvn fenv asinhf __s_asinhf $runs fenv asinhf __v_asinhf $runv fenv asinhf __vn_asinhf $runvn fenv diff --git a/pl/math/v_log1pf_2u1.c b/pl/math/v_log1pf_2u1.c index 9e81ff4..3ef8416 100644 --- a/pl/math/v_log1pf_2u1.c +++ b/pl/math/v_log1pf_2u1.c @@ -48,8 +48,7 @@ eval_poly (v_f32_t m) v_f32_t m4 = m2 * m2; v_f32_t p_06 = v_fma_f32 (m4, p_36, p_02); - v_f32_t m8 = m4 * m4; - return v_fma_f32 (m8, p_79, p_06); + return v_fma_f32 (m4, m4 * p_79, p_06); #else #error No precision specified for v_log1pf @@ -60,22 +59,26 @@ static inline float handle_special (float x) { uint32_t ix = asuint (x); - if (ix == 0xff800000 || ix > 0xbf800000) + uint32_t ia = ix & AbsMask; + if (ix == 0xff800000 || ia > 0x7f800000 || ix > 0xbf800000) { - /* x == -Inf => log1pf(x) = NaN. - x < -1.0 => log1pf(x) = NaN. */ + /* x == -Inf => log1pf(x) = NaN. + x < -1.0 => log1pf(x) = NaN. + x == +/-NaN => log1pf(x) = NaN. */ +#if WANT_ERRNO + return __math_invalidf (asfloat (ia)); +#else return NAN; +#endif } if (ix == 0xbf800000) { /* x == -1.0 => log1pf(x) = -Inf. */ +#if WANT_ERRNO + return __math_divzerof (ix); +#else return -INFINITY; - } - uint32_t ia = ix & AbsMask; - if (ia >= 0x7f800000) - { - /* x == +/-NaN => log1pf(x) = NaN, needs to be propagated. */ - return asfloat (ia); +#endif } /* |x| < TinyBound => log1p(x) = x. */ return x; @@ -92,6 +95,14 @@ VPCS_ATTR v_f32_t V_NAME (log1pf) (v_f32_t x) v_u32_t special_cases = v_cond_u32 (ia12 - v_u32 (TinyBound) >= (0x7f8 - TinyBound)) | v_cond_u32 (ix >= MinusOne); + v_f32_t special_arg = x; + +#if WANT_ERRNO + if (unlikely (v_any_u32 (special_cases))) + /* Side-step special lanes so fenv exceptions are not triggered + inadvertently. */ + x = v_sel_f32 (special_cases, v_f32 (1), x); +#endif /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m is in [-0.25, 0.5]): @@ -108,7 +119,7 @@ VPCS_ATTR v_f32_t V_NAME (log1pf) (v_f32_t x) v_s32_t k = (v_as_s32_f32 (m) - ThreeQuarters) & v_u32 (0xff800000); /* Scale x by exponent manipulation. 
*/ - v_f32_t m_scale = v_as_f32_u32 (ix - v_as_u32_s32 (k)); + v_f32_t m_scale = v_as_f32_u32 (v_as_u32_f32 (x) - v_as_u32_s32 (k)); /* Scale up to ensure that the scale factor is representable as normalised fp32 number, and scale m down accordingly. */ @@ -126,9 +137,7 @@ VPCS_ATTR v_f32_t V_NAME (log1pf) (v_f32_t x) v_f32_t y = v_fma_f32 (scale_back, v_f32 (Ln2), p); if (unlikely (v_any_u32 (special_cases))) - { - return v_call_f32 (handle_special, x, y, special_cases); - } + return v_call_f32 (handle_special, special_arg, y, special_cases); return y; } VPCS_ALIAS -- cgit v1.2.3 From bfa600d23ba19dedabb69603ab17ac9b6f6510cb Mon Sep 17 00:00:00 2001 From: Pierre Blanchard Date: Mon, 12 Dec 2022 10:48:42 +0000 Subject: pl/math: Update ULP threshold for SVE atan2 Test threshold fixed. --- pl/math/test/runulp.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 3d644ce..35b4e6b 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -640,7 +640,7 @@ L_sve_sin=2.03 L_sve_atanf=2.9 L_sve_atan=1.78 L_sve_atan2f=2.45 -L_sve_atan2=1.73 +L_sve_atan2=1.78 L_sve_log10=1.97 L_sve_log10f=2.82 L_sve_logf=2.85 -- cgit v1.2.3 From 0d1cc42cec9fb097102bd603dea29c6581a4b7cd Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Tue, 13 Dec 2022 10:18:51 +0000 Subject: pl/math: Set fenv flags in Neon log2f Flags set correctly regardless of WANT_ERRNO. --- pl/math/test/runulp.sh | 8 ++++---- pl/math/v_log2f_2u6.c | 25 ++++++++++++++++++------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 35b4e6b..9f49270 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -755,10 +755,10 @@ asinhf __s_asinhf $runs fenv asinhf __v_asinhf $runv fenv asinhf __vn_asinhf $runvn fenv asinhf _ZGVnN4v_asinhf $runvn fenv -log2f __s_log2f $runs -log2f __v_log2f $runv -log2f __vn_log2f $runvn -log2f _ZGVnN4v_log2f $runvn +log2f __s_log2f $runs fenv +log2f __v_log2f $runv fenv +log2f __vn_log2f $runvn fenv +log2f _ZGVnN4v_log2f $runvn fenv tanf __s_tanf $runs tanf __v_tanf $runv tanf __vn_tanf $runvn diff --git a/pl/math/v_log2f_2u6.c b/pl/math/v_log2f_2u6.c index ce46206..73bb84f 100644 --- a/pl/math/v_log2f_2u6.c +++ b/pl/math/v_log2f_2u6.c @@ -13,19 +13,22 @@ #define T __v_log2f_data.tab #define A __v_log2f_data.poly #define OFF 0x3f330000 +#define SubnormLim 0x800000 +#define One v_u32 (0x3f800000) static float handle_special (float x) { + if (x != x) + /* NaN - return NaN but do not trigger invalid. */ + return x; if (x < 0) /* log2f(-anything) = NaN. */ - return NAN; + return __math_invalidf (x); if (x == 0) /* log2f(0) = Inf. */ return __math_divzerof (1); - /* log2f(Inf) = Inf - log2f(Nan) = Nan - log2f(-NaN) = -NaN. */ + /* log2f(Inf) = Inf. */ return x; } @@ -67,11 +70,19 @@ VPCS_ATTR v_f32_t V_NAME (log2f) (v_f32_t x) /* x is +-Inf, +-NaN, 0 or -ve. */ v_u32_t special = v_cond_u32 (ix >= 0x7f800000) | v_cond_u32 (ix == 0); /* |x| < 2^126 (i.e. x is subnormal). */ - v_u32_t subnorm = v_cond_u32 (v_calt_f32 (x, v_f32 (0x1p-126f))); + v_u32_t subnorm = v_cond_u32 (ix < SubnormLim); + + /* Sidestep special lanes so they do not inadvertently trigger fenv + exceptions. They will be fixed up later. */ + if (unlikely (v_any_u32 (special))) + ix = v_sel_u32 (special, One, ix); if (unlikely (v_any_u32 (subnorm))) - /* Normalize any subnormals. */ - ix = v_as_u32_f32 (v_call_f32 (normalise, x, x, subnorm)); + { + /* Normalize any subnormals. 
*/
+      v_f32_t tmp_x = v_as_f32_u32 (ix);
+      ix = v_as_u32_f32 (v_call_f32 (normalise, tmp_x, tmp_x, subnorm));
+    }
 
   /* x = 2^k z; where z is in range [OFF,2*OFF] and exact.
      The range is split into N subintervals.
-- 
cgit v1.2.3


From 80abd605ee62de59fbfbaba397028326a1148a16 Mon Sep 17 00:00:00 2001
From: Joe Ramsay
Date: Tue, 13 Dec 2022 10:21:39 +0000
Subject: pl/math: Set fenv flags in Neon tanf

New behaviour is hidden behind WANT_ERRNO config option.
---
 pl/math/test/runulp.sh |  8 ++++----
 pl/math/v_tanf_3u2.c   | 36 +++++++++++++++++++++++++++++-------
 2 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh
index 9f49270..21d0a8c 100755
--- a/pl/math/test/runulp.sh
+++ b/pl/math/test/runulp.sh
@@ -759,10 +759,10 @@ log2f __s_log2f $runs fenv
 log2f __v_log2f $runv fenv
 log2f __vn_log2f $runvn fenv
 log2f _ZGVnN4v_log2f $runvn fenv
-tanf __s_tanf $runs
-tanf __v_tanf $runv
-tanf __vn_tanf $runvn
-tanf _ZGVnN4v_tanf $runvn
+tanf __s_tanf $runs fenv
+tanf __v_tanf $runv fenv
+tanf __vn_tanf $runvn fenv
+tanf _ZGVnN4v_tanf $runvn fenv
 log1p __s_log1p $runs
 log1p __v_log1p $runv
 log1p __vn_log1p $runvn
 log1p _ZGVnN2v_log1p $runvn
diff --git a/pl/math/v_tanf_3u2.c b/pl/math/v_tanf_3u2.c
index 01f7f65..8b3869c 100644
--- a/pl/math/v_tanf_3u2.c
+++ b/pl/math/v_tanf_3u2.c
@@ -15,8 +15,10 @@
 #define NegPio2_2 (v_f32 (0x1.777a5cp-25f))
 #define NegPio2_3 (v_f32 (0x1.ee59dap-50f))
 #define InvPio2 (v_f32 (0x1.45f306p-1f))
-#define RangeVal (v_f32 (0x1p17f))
+#define RangeVal (0x48000000) /* asuint32(0x1p17f). */
+#define TinyBound (0x30000000) /* asuint32 (0x1p-31). */
 #define Shift (v_f32 (0x1.8p+23f))
+#define AbsMask (v_u32 (0x7fffffff))
 
 #define poly(i) v_f32 (__tanf_poly_data.poly_tan[i])
 
@@ -33,6 +35,13 @@ static inline v_f32_t
 eval_poly (v_f32_t z)
 {
   v_f32_t z2 = z * z;
+#if WANT_ERRNO
+  /* Tiny z (<= 0x1p-31) will underflow when calculating z^4. If errno is to be
+     set correctly, sidestep this by fixing such lanes to 0. */
+  v_u32_t will_uflow = v_cond_u32 ((v_as_u32_f32 (z) & AbsMask) <= TinyBound);
+  if (unlikely (v_any_u32 (will_uflow)))
+    z2 = v_sel_f32 (will_uflow, v_f32 (0), z2);
+#endif
   v_f32_t z4 = z2 * z2;
   return ESTRIN_6 (z, z2, z4, poly);
 }
@@ -44,8 +53,23 @@ eval_poly (v_f32_t z)
 VPCS_ATTR
 v_f32_t V_NAME (tanf) (v_f32_t x)
 {
-  /* Determine whether input is too large to perform fast regression. */
-  v_u32_t cmp = v_cage_f32 (x, RangeVal);
+  v_f32_t special_arg = x;
+  v_u32_t ix = v_as_u32_f32 (x);
+  v_u32_t iax = ix & AbsMask;
+
+  /* iax >= RangeVal means x, if not inf or NaN, is too large to perform fast
+     regression. */
+#if WANT_ERRNO
+  /* If errno is to be set correctly, also special-case tiny input, as this
+     will lead to underflow later. Fix any special lanes to 1 to prevent any
+     exceptions being triggered. */
+  v_u32_t special = v_cond_u32 (iax - TinyBound >= RangeVal - TinyBound);
+  if (unlikely (v_any_u32 (special)))
+    x = v_sel_f32 (special, v_f32 (1.0f), x);
+#else
+  /* Otherwise, special-case large and special values. */
+  v_u32_t special = v_cond_u32 (iax >= RangeVal);
+#endif
 
   /* n = rint(x/(pi/2)). */
   v_f32_t q = v_fma_f32 (InvPio2, x, Shift);
@@ -85,10 +109,8 @@ v_f32_t V_NAME (tanf) (v_f32_t x)
      therefore it is fixed here. */
   y = v_sel_f32 (x == v_f32 (-0.0), x, y);
 
-  /* No need to pass pg to specialcase here since cmp is a strict subset,
-     guaranteed by the cmpge above.
*/ - if (unlikely (v_any_u32 (cmp))) - return specialcase (x, y, cmp); + if (unlikely (v_any_u32 (special))) + return specialcase (special_arg, y, special); return y; } VPCS_ALIAS -- cgit v1.2.3 From 3c0af1a73df1551a372294d4b05573b2a47f051e Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Tue, 13 Dec 2022 10:22:26 +0000 Subject: pl/math: Set fenv flags in Neon log1p New behaviour is hidden behind WANT_ERRNO config option. --- pl/math/test/runulp.sh | 8 ++++---- pl/math/v_log1p_2u5.c | 7 ++++++- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 21d0a8c..a1410b4 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -763,10 +763,10 @@ tanf __s_tanf $runs fenv tanf __v_tanf $runv fenv tanf __vn_tanf $runvn fenv tanf _ZGVnN4v_tanf $runvn fenv -log1p __s_log1p $runs -log1p __v_log1p $runv -log1p __vn_log1p $runvn -log1p _ZGVnN2v_log1p $runvn +log1p __s_log1p $runs fenv +log1p __v_log1p $runv fenv +log1p __vn_log1p $runvn fenv +log1p _ZGVnN2v_log1p $runvn fenv expm1f __s_expm1f $runs fenv expm1f __v_expm1f $runv fenv expm1f __vn_expm1f $runvn fenv diff --git a/pl/math/v_log1p_2u5.c b/pl/math/v_log1p_2u5.c index d97a622..3781522 100644 --- a/pl/math/v_log1p_2u5.c +++ b/pl/math/v_log1p_2u5.c @@ -49,6 +49,11 @@ VPCS_ATTR v_f64_t V_NAME (log1p) (v_f64_t x) = v_cond_u64 ((ia >= v_u64 (0x7ff0000000000000)) | (ix >= 0xbff0000000000000) | (ix == 0x8000000000000000)); +#if WANT_ERRNO + if (unlikely (v_any_u64 (special))) + x = v_sel_f64 (special, v_f64 (0), x); +#endif + /* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f is in [sqrt(2)/2, sqrt(2)]): log1p(x) = k*log(2) + log1p(f). @@ -92,7 +97,7 @@ VPCS_ATTR v_f64_t V_NAME (log1p) (v_f64_t x) v_f64_t y = v_fma_f64 (f * f, p, ylo + yhi); if (unlikely (v_any_u64 (special))) - return specialcase (x, y, special); + return specialcase (v_as_f64_u64 (ix), y, special); return y; } -- cgit v1.2.3 From 1bca1a541cce13c352296acd5dfa16160fc27bc9 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Thu, 15 Dec 2022 13:27:31 +0000 Subject: pl/math: Auto-generate mathbench and ulp headers Instead of maintaining three separate lists of routines, which are cumbersome and prone to merge conflicts, we provide a new macro, PL_SIG, which by some preprocessor machinery outputs the lists in the required format (macro formats have been changed very slightly to make the generation simpler). Only routines with simple signatures are handled - binary functions still need mathbench wrappers defined manually. As well, routines with non-standard references (i.e. powi/powk) still need entries and wrappers manually defined. 
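(Editorial illustration, not part of the original commit message.) Given a
source file containing the annotation

  PL_SIG (S, D, 1, acosh, 1.0, 10.0)

the ulp_funcs rule below re-preprocesses it with
-D'PL_SIG(v, t, a, f, ...)=_Z##v##t##a(f)', so the line token-pastes to

  _ZSD1 (acosh)

while the mathbench rule also forwards the trailing range arguments, giving
_ZSD1 (acosh, 1.0, 10.0), and the wrapper rule gives ZSND1_WRAP (acosh).
Each generated header is then consumed with the _Z*/Z*_WRAP tokens defined
as macros that expand to a table entry or wrapper in the required format.
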
--- pl/math/Dir.mk | 18 +++++- pl/math/acosh_3u.c | 3 + pl/math/acoshf_2u8.c | 3 + pl/math/asinh_2u5.c | 5 +- pl/math/asinhf_3u5.c | 5 +- pl/math/atan2_2u5.c | 6 +- pl/math/atan2f_3u.c | 6 +- pl/math/atanhf_3u1.c | 3 + pl/math/cbrtf_1u5.c | 7 ++- pl/math/cosh_2u.c | 3 + pl/math/coshf_1u9.c | 3 + pl/math/erfc_4u5.c | 6 +- pl/math/erfcf_2u.c | 3 + pl/math/erff_1u5.c | 11 ++-- pl/math/expm1_2u5.c | 5 +- pl/math/expm1f_1u6.c | 5 +- pl/math/log10_2u.c | 6 +- pl/math/log10f.c | 6 ++ pl/math/log1p_2u.c | 5 +- pl/math/log1pf_2u1.c | 5 +- pl/math/pl_sig.h | 43 ++++++++++++++ pl/math/sinh_3u.c | 3 + pl/math/sinhf_2u3.c | 3 + pl/math/sv_atan2_2u5.c | 5 ++ pl/math/sv_atan2f_3u.c | 5 ++ pl/math/sv_atan_2u5.c | 3 + pl/math/sv_atanf_2u9.c | 3 + pl/math/sv_cos_2u5.c | 3 + pl/math/sv_cosf_2u1.c | 3 + pl/math/sv_erf_2u5.c | 3 + pl/math/sv_erfc_4u.c | 2 + pl/math/sv_erff_1u3.c | 3 + pl/math/sv_expf_2u.c | 3 + pl/math/sv_log10_2u5.c | 4 +- pl/math/sv_log10f_3u5.c | 2 + pl/math/sv_log_2u5.c | 3 + pl/math/sv_logf_3u4.c | 3 + pl/math/sv_sin_3u.c | 3 + pl/math/sv_sinf_1u9.c | 3 + pl/math/sv_tanf_3u2.c | 3 + pl/math/tanf_3u3.c | 3 + pl/math/tanhf_2u6.c | 3 + pl/math/test/mathbench_funcs.h | 125 ++++++++++++++--------------------------- pl/math/test/ulp_funcs.h | 117 ++++++++++++-------------------------- pl/math/test/ulp_wrappers.h | 75 +++++++++---------------- pl/math/v_asinh_2u5.c | 2 + pl/math/v_asinhf_2u7.c | 2 + pl/math/v_atan2_3u.c | 4 ++ pl/math/v_atan2f_3u.c | 4 ++ pl/math/v_atan_2u5.c | 4 ++ pl/math/v_atanf_3u.c | 4 ++ pl/math/v_atanhf_3u1.c | 2 + pl/math/v_cbrtf_1u5.c | 2 + pl/math/v_cosh_2u.c | 2 + pl/math/v_coshf_2u4.c | 2 + pl/math/v_erf_2u.c | 6 +- pl/math/v_erfc_4u.c | 6 +- pl/math/v_erfcf_1u.c | 3 + pl/math/v_erff_1u5.c | 6 +- pl/math/v_exp_tail.c | 2 +- pl/math/v_expf.c | 2 +- pl/math/v_expm1_2u5.c | 3 + pl/math/v_expm1f_1u6.c | 3 + pl/math/v_log10_2u5.c | 5 +- pl/math/v_log10f_3u5.c | 6 +- pl/math/v_log1p_2u5.c | 3 + pl/math/v_log1pf_2u1.c | 3 + pl/math/v_log2_3u.c | 5 +- pl/math/v_log2f_2u6.c | 6 +- pl/math/v_sinh_3u.c | 2 + pl/math/v_sinhf_2u3.c | 2 + pl/math/v_tanf_3u2.c | 3 + pl/math/v_tanhf_2u6.c | 4 +- 73 files changed, 393 insertions(+), 245 deletions(-) create mode 100644 pl/math/pl_sig.h diff --git a/pl/math/Dir.mk b/pl/math/Dir.mk index 0fe1e67..b866fa4 100644 --- a/pl/math/Dir.mk +++ b/pl/math/Dir.mk @@ -54,7 +54,23 @@ $(B)/test/mathtest.o: CFLAGS_PL += -fmath-errno $(math-host-objs): CC = $(HOST_CC) $(math-host-objs): CFLAGS_PL = $(HOST_CFLAGS) -$(B)/test/ulp.o: $(AOR)/test/ulp.h +build/pl/include/test/ulp_funcs_gen.h: $(math-lib-srcs) + # Replace PL_SIG + cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=_Z##v##t##a(f)" -P > $@ + +build/pl/include/test/mathbench_funcs_gen.h: $(math-lib-srcs) + # Replace PL_SIG macros with mathbench func entries + cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=_Z##v##t##a(f, ##__VA_ARGS__)" -P > $@ + +build/pl/include/test/ulp_wrappers_gen.h: $(math-lib-srcs) + # Replace PL_SIG macros with ULP wrapper declarations + cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=Z##v##N##t##a##_WRAP(f)" -P > $@ + +$(B)/test/ulp.o: $(AOR)/test/ulp.h build/pl/include/test/ulp_funcs_gen.h build/pl/include/test/ulp_wrappers_gen.h +$(B)/test/ulp.o: CFLAGS_PL += -I build/pl/include/test + +$(B)/test/mathbench.o: build/pl/include/test/mathbench_funcs_gen.h +$(B)/test/mathbench.o: CFLAGS_PL += -I build/pl/include/test build/pl/lib/libmathlib.so: $(math-lib-objs:%.o=%.os) $(CC) $(CFLAGS_PL) $(LDFLAGS) -shared 
-o $@ $^ diff --git a/pl/math/acosh_3u.c b/pl/math/acosh_3u.c index 6ac64f6..e0014d6 100644 --- a/pl/math/acosh_3u.c +++ b/pl/math/acosh_3u.c @@ -5,6 +5,7 @@ */ #include "math_config.h" +#include "pl_sig.h" #define Ln2 (0x1.62e42fefa39efp-1) #define MinusZero (0x8000000000000000) @@ -53,3 +54,5 @@ acosh (double x) double xm1 = x - 1; return log1p (xm1 + sqrt (2 * xm1 + xm1 * xm1)); } + +PL_SIG (S, D, 1, acosh, 1.0, 10.0) diff --git a/pl/math/acoshf_2u8.c b/pl/math/acoshf_2u8.c index fb8d12d..0b1e9c7 100644 --- a/pl/math/acoshf_2u8.c +++ b/pl/math/acoshf_2u8.c @@ -5,6 +5,7 @@ */ #include "math_config.h" +#include "pl_sig.h" #define Ln2 (0x1.62e4p-1f) #define MinusZero 0x80000000 @@ -50,3 +51,5 @@ acoshf (float x) float xm1 = x - 1; return log1pf (xm1 + sqrtf (2 * xm1 + xm1 * xm1)); } + +PL_SIG (S, F, 1, acosh, 1.0, 10.0) diff --git a/pl/math/asinh_2u5.c b/pl/math/asinh_2u5.c index 9cbdd33..bbe6bee 100644 --- a/pl/math/asinh_2u5.c +++ b/pl/math/asinh_2u5.c @@ -4,8 +4,9 @@ * Copyright (c) 2022, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "math_config.h" #include "estrin.h" +#include "math_config.h" +#include "pl_sig.h" #define AbsMask 0x7fffffffffffffff #define ExpM26 0x3e50000000000000 /* asuint64(0x1.0p-26). */ @@ -72,3 +73,5 @@ asinh (double x) return asdouble (asuint64 (optr_aor_log_f64 (ax + sqrt (ax * ax + 1))) | sign); } + +PL_SIG (S, D, 1, asinh, -10.0, 10.0) diff --git a/pl/math/asinhf_3u5.c b/pl/math/asinhf_3u5.c index 48acdef..ec3dd9b 100644 --- a/pl/math/asinhf_3u5.c +++ b/pl/math/asinhf_3u5.c @@ -4,8 +4,9 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "math_config.h" #include "estrinf.h" +#include "math_config.h" +#include "pl_sig.h" #define AbsMask (0x7fffffff) #define SqrtFltMax (0x1.749e96p+10f) @@ -66,3 +67,5 @@ asinhf (float x) return asfloat (asuint (optr_aor_log_f32 (ax + sqrtf (ax * ax + 1))) | sign); } + +PL_SIG (S, F, 1, asinh, -10.0, 10.0) diff --git a/pl/math/atan2_2u5.c b/pl/math/atan2_2u5.c index 471c5c9..c1cf7a3 100644 --- a/pl/math/atan2_2u5.c +++ b/pl/math/atan2_2u5.c @@ -7,8 +7,9 @@ #include -#include "math_config.h" #include "atan_common.h" +#include "math_config.h" +#include "pl_sig.h" #define Pi (0x1.921fb54442d18p+1) #define PiOver2 (0x1.921fb54442d18p+0) @@ -146,3 +147,6 @@ atan2 (double y, double x) /* Account for the sign of x and y. */ return asdouble (asuint64 (ret) ^ sign_xy); } + +/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */ +PL_SIG (S, D, 2, atan2) diff --git a/pl/math/atan2f_3u.c b/pl/math/atan2f_3u.c index 3fa6296..7780be6 100644 --- a/pl/math/atan2f_3u.c +++ b/pl/math/atan2f_3u.c @@ -7,8 +7,9 @@ #include -#include "math_config.h" #include "atanf_common.h" +#include "math_config.h" +#include "pl_sig.h" #define Pi (0x1.921fb6p+1f) #define PiOver2 (0x1.921fb6p+0f) @@ -154,3 +155,6 @@ atan2f (float y, float x) /* Account for the sign of x and y. */ return asfloat (asuint (ret) ^ sign_xy); } + +/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. 
*/ +PL_SIG (S, F, 2, atan2) diff --git a/pl/math/atanhf_3u1.c b/pl/math/atanhf_3u1.c index 77795c8..db663bf 100644 --- a/pl/math/atanhf_3u1.c +++ b/pl/math/atanhf_3u1.c @@ -6,6 +6,7 @@ #include "math_config.h" #include "mathlib.h" +#include "pl_sig.h" #define AbsMask 0x7fffffff #define Half 0x3f000000 @@ -74,3 +75,5 @@ atanhf (float x) float ax = asfloat (iax); return halfsign * log1pf_inline ((2 * ax) / (1 - ax)); } + +PL_SIG (S, F, 1, atanh, -1.0, 1.0) diff --git a/pl/math/cbrtf_1u5.c b/pl/math/cbrtf_1u5.c index d544a68..74e7a49 100644 --- a/pl/math/cbrtf_1u5.c +++ b/pl/math/cbrtf_1u5.c @@ -5,10 +5,9 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include - -#include "math_config.h" #include "estrinf.h" +#include "math_config.h" +#include "pl_sig.h" #define AbsMask 0x7fffffff #define SignMask 0x80000000 @@ -60,3 +59,5 @@ cbrtf (float x) Which can be done easily using ldexpf. */ return asfloat (asuint (ldexpf (a * T (2 + e % 3), e / 3)) | sign); } + +PL_SIG (S, F, 1, cbrt, -10.0, 10.0) diff --git a/pl/math/cosh_2u.c b/pl/math/cosh_2u.c index 7526cdf..6be189d 100644 --- a/pl/math/cosh_2u.c +++ b/pl/math/cosh_2u.c @@ -6,6 +6,7 @@ */ #include "math_config.h" +#include "pl_sig.h" #define AbsMask 0x7fffffffffffffff #define SpecialBound \ @@ -53,3 +54,5 @@ cosh (double x) double t = __exp_dd (ax, 0); return 0.5 * t + 0.5 / t; } + +PL_SIG (S, D, 1, cosh, -10.0, 10.0) diff --git a/pl/math/coshf_1u9.c b/pl/math/coshf_1u9.c index ca3f767..b9cbe54 100644 --- a/pl/math/coshf_1u9.c +++ b/pl/math/coshf_1u9.c @@ -6,6 +6,7 @@ */ #include "math_config.h" +#include "pl_sig.h" #define AbsMask 0x7fffffff #define TinyBound 0x20000000 /* 0x1p-63: Round to 1 below this. */ @@ -58,3 +59,5 @@ coshf (float x) float t = optr_aor_exp_f32 (ax); return 0.5f * t + 0.5f / t; } + +PL_SIG (S, F, 1, cosh, -10.0, 10.0) diff --git a/pl/math/erfc_4u5.c b/pl/math/erfc_4u5.c index 8088562..b418421 100644 --- a/pl/math/erfc_4u5.c +++ b/pl/math/erfc_4u5.c @@ -5,11 +5,9 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include -#include -#include #include "math_config.h" #include "pairwise_horner.h" +#include "pl_sig.h" #define AbsMask (0x7fffffffffffffff) @@ -145,3 +143,5 @@ erfc (double x) return __math_uflow (0); } } + +PL_SIG (S, D, 1, erfc, -6.0, 28.0) diff --git a/pl/math/erfcf_2u.c b/pl/math/erfcf_2u.c index 8d4bba1..32a96dc 100644 --- a/pl/math/erfcf_2u.c +++ b/pl/math/erfcf_2u.c @@ -7,6 +7,7 @@ #include "erfcf.h" #include "math_config.h" +#include "pl_sig.h" #define P(i) __erfcf_poly_data.poly[i] @@ -120,3 +121,5 @@ erfcf (float x) } return __math_uflowf (0); } + +PL_SIG (S, F, 1, erfc, -4.0, 10.0) diff --git a/pl/math/erff_1u5.c b/pl/math/erff_1u5.c index bad68a6..afa5880 100644 --- a/pl/math/erff_1u5.c +++ b/pl/math/erff_1u5.c @@ -4,13 +4,10 @@ * Copyright (c) 2020-2022, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ - -#include -#include - -#include "math_config.h" -#include "hornerf.h" #include "estrinf.h" +#include "hornerf.h" +#include "math_config.h" +#include "pl_sig.h" #define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f #define A __erff_data.erff_poly_A @@ -99,3 +96,5 @@ erff (float x) } return r; } + +PL_SIG (S, F, 1, erf, -4.0, 4.0) diff --git a/pl/math/expm1_2u5.c b/pl/math/expm1_2u5.c index 98ef078..55ddbd1 100644 --- a/pl/math/expm1_2u5.c +++ b/pl/math/expm1_2u5.c @@ -5,8 +5,9 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "math_config.h" #include "estrin.h" +#include "math_config.h" +#include "pl_sig.h" #define InvLn2 0x1.71547652b82fep0 #define Ln2hi 0x1.62e42fefa39efp-1 @@ -73,3 +74,5 @@ expm1 (double x) /* expm1(x) ~= 2 * (p * t + (t - 1/2)). */ return 2 * fma (p, t, t - 0.5); } + +PL_SIG (S, D, 1, expm1, -9.9, 9.9) diff --git a/pl/math/expm1f_1u6.c b/pl/math/expm1f_1u6.c index 0904652..9c0c178 100644 --- a/pl/math/expm1f_1u6.c +++ b/pl/math/expm1f_1u6.c @@ -5,8 +5,9 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "math_config.h" #include "hornerf.h" +#include "math_config.h" +#include "pl_sig.h" #define Shift (0x1.8p23f) #define InvLn2 (0x1.715476p+0f) @@ -69,3 +70,5 @@ expm1f (float x) /* expm1(x) ~= 2 * (p * t + (t - 1/2)). */ return 2 * fmaf (p, t, t - 0.5f); } + +PL_SIG (S, F, 1, expm1, -9.9, 9.9) diff --git a/pl/math/log10_2u.c b/pl/math/log10_2u.c index 3330389..b05e17b 100644 --- a/pl/math/log10_2u.c +++ b/pl/math/log10_2u.c @@ -6,9 +6,7 @@ */ #include "math_config.h" -#include -#include -#include +#include "pl_sig.h" /* Polynomial coefficients and lookup tables. */ #define T __log10_data.tab @@ -143,3 +141,5 @@ log10l (long double x) #endif #endif // clang-format on + +PL_SIG (S, D, 1, log10, 0.01, 11.1) diff --git a/pl/math/log10f.c b/pl/math/log10f.c index 79f5d12..ea67b4b 100644 --- a/pl/math/log10f.c +++ b/pl/math/log10f.c @@ -6,6 +6,7 @@ */ #include "math_config.h" +#include "pl_sig.h" #include #include @@ -84,7 +85,12 @@ log10f (float x) return eval_as_float (y); } + +// clang-format off #if USE_GLIBC_ABI strong_alias (log10f, __log10f_finite) hidden_alias (log10f, __ieee754_log10f) #endif + +PL_SIG (S, F, 1, log10, 0.01, 11.1) + // clang-format on diff --git a/pl/math/log1p_2u.c b/pl/math/log1p_2u.c index ade5d87..20b4811 100644 --- a/pl/math/log1p_2u.c +++ b/pl/math/log1p_2u.c @@ -4,8 +4,9 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "math_config.h" #include "estrin.h" +#include "math_config.h" +#include "pl_sig.h" #define Ln2Hi 0x1.62e42fefa3800p-1 #define Ln2Lo 0x1.ef35793c76730p-45 @@ -120,3 +121,5 @@ log1p (double x) double y = fma (Ln2Lo, kd, cm); return y + fma (Ln2Hi, kd, p); } + +PL_SIG (S, D, 1, log1p, -0.9, 10.0) diff --git a/pl/math/log1pf_2u1.c b/pl/math/log1pf_2u1.c index 9b7cb94..97dd1c4 100644 --- a/pl/math/log1pf_2u1.c +++ b/pl/math/log1pf_2u1.c @@ -4,8 +4,9 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "math_config.h" #include "hornerf.h" +#include "math_config.h" +#include "pl_sig.h" #define Ln2 (0x1.62e43p-1f) #define SignMask (0x80000000) @@ -149,3 +150,5 @@ log1pf (float x) /* Apply the scaling back. 
*/ return fmaf (scale_back, Ln2, p); } + +PL_SIG (S, F, 1, log1p, -0.9, 10.0) diff --git a/pl/math/pl_sig.h b/pl/math/pl_sig.h new file mode 100644 index 0000000..e9f54c0 --- /dev/null +++ b/pl/math/pl_sig.h @@ -0,0 +1,43 @@ +/* + * PL macros for emitting various ulp/bench entries based on function signature + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception. + */ +#define PL_DECL_SF1(fun) float fun##f (float); +#define PL_DECL_SF2(fun) float fun##f (float, float); +#define PL_DECL_SD1(fun) double fun (double); +#define PL_DECL_SD2(fun) double fun (double, double); + +#if V_SUPPORTED +#define PL_DECL_VF1(fun) VPCS_ATTR v_f32_t V_NAME (fun##f) (v_f32_t); +#define PL_DECL_VF2(fun) VPCS_ATTR v_f32_t V_NAME (fun##f) (v_f32_t, v_f32_t); +#define PL_DECL_VD1(fun) VPCS_ATTR v_f64_t V_NAME (fun) (v_f64_t); +#define PL_DECL_VD2(fun) VPCS_ATTR v_f64_t V_NAME (fun) (v_f64_t, v_f64_t); +#else +#define PL_DECL_VF1(fun) +#define PL_DECL_VF2(fun) +#define PL_DECL_VD1(fun) +#define PL_DECL_VD2(fun) +#endif + +#if SV_SUPPORTED +#define PL_DECL_SVF1(fun) sv_f32_t __sv_##fun##f_x (sv_f32_t, svbool_t); +#define PL_DECL_SVF2(fun) \ + sv_f32_t __sv_##fun##f_x (sv_f32_t, sv_f32_t, svbool_t); +#define PL_DECL_SVD1(fun) sv_f64_t __sv_##fun##_x (sv_f64_t, svbool_t); +#define PL_DECL_SVD2(fun) \ + sv_f64_t __sv_##fun##_x (sv_f64_t, sv_f64_t, svbool_t); +#else +#define PL_DECL_SVF1(fun) +#define PL_DECL_SVF2(fun) +#define PL_DECL_SVD1(fun) +#define PL_DECL_SVD2(fun) +#endif + +/* For building the routines, emit function prototype from PL_SIG. This + ensures that the correct signature has been chosen (wrong one will be a + compile error). PL_SIG is defined differently by various components of the + build system to emit entries in the wrappers and entries for mathbench and + ulp. */ +#define PL_SIG(v, t, a, f, ...) PL_DECL_##v##t##a (f) diff --git a/pl/math/sinh_3u.c b/pl/math/sinh_3u.c index ce3ff13..f56b8d0 100644 --- a/pl/math/sinh_3u.c +++ b/pl/math/sinh_3u.c @@ -6,6 +6,7 @@ */ #include "math_config.h" +#include "pl_sig.h" #define AbsMask 0x7fffffffffffffff #define Half 0x3fe0000000000000 @@ -53,3 +54,5 @@ sinh (double x) (t + t / (t + 1)) / -2 for x < 0. */ return (t + t / (t + 1)) * halfsign; } + +PL_SIG (S, D, 1, sinh, -10.0, 10.0) diff --git a/pl/math/sinhf_2u3.c b/pl/math/sinhf_2u3.c index c616dac..cb5eb51 100644 --- a/pl/math/sinhf_2u3.c +++ b/pl/math/sinhf_2u3.c @@ -6,6 +6,7 @@ */ #include "math_config.h" +#include "pl_sig.h" #define AbsMask 0x7fffffff #define Half 0x3f000000 @@ -63,3 +64,5 @@ sinhf (float x) (t + t / (t + 1)) / -2 for x < 0. */ return (t + t / (t + 1)) * halfsign; } + +PL_SIG (S, F, 1, sinh, -10.0, 10.0) diff --git a/pl/math/sv_atan2_2u5.c b/pl/math/sv_atan2_2u5.c index bc98ccd..c047595 100644 --- a/pl/math/sv_atan2_2u5.c +++ b/pl/math/sv_atan2_2u5.c @@ -6,6 +6,8 @@ */ #include "sv_math.h" +#include "pl_sig.h" + #if SV_SUPPORTED #include "sv_atan_common.h" @@ -79,4 +81,7 @@ __sv_atan2_x (sv_f64_t y, sv_f64_t x, const svbool_t pg) strong_alias (__sv_atan2_x, _ZGVsMxvv_atan2) + /* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. 
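+ (For this arity-2 signature the generated _ZSVD2 macro is defined to nothing in test/mathbench_funcs.h, so only the ulp-harness entries are emitted; the benchmark entry is added by hand in that file instead.)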
+ */ + PL_SIG (SV, D, 2, atan2) #endif diff --git a/pl/math/sv_atan2f_3u.c b/pl/math/sv_atan2f_3u.c index 5f93c49..0ce7071 100644 --- a/pl/math/sv_atan2f_3u.c +++ b/pl/math/sv_atan2f_3u.c @@ -6,6 +6,8 @@ */ #include "sv_math.h" +#include "pl_sig.h" + #if SV_SUPPORTED #include "sv_atanf_common.h" @@ -80,4 +82,7 @@ __sv_atan2f_x (sv_f32_t y, sv_f32_t x, const svbool_t pg) strong_alias (__sv_atan2f_x, _ZGVsMxvv_atan2f) + /* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. + */ + PL_SIG (SV, F, 2, atan2) #endif diff --git a/pl/math/sv_atan_2u5.c b/pl/math/sv_atan_2u5.c index 49c5e82..4f52b43 100644 --- a/pl/math/sv_atan_2u5.c +++ b/pl/math/sv_atan_2u5.c @@ -6,6 +6,8 @@ */ #include "sv_math.h" +#include "pl_sig.h" + #if SV_SUPPORTED #include "sv_atan_common.h" @@ -49,4 +51,5 @@ __sv_atan_x (sv_f64_t x, const svbool_t pg) strong_alias (__sv_atan_x, _ZGVsMxv_atan) + PL_SIG (SV, D, 1, atan, -3.1, 3.1) #endif diff --git a/pl/math/sv_atanf_2u9.c b/pl/math/sv_atanf_2u9.c index d195ca5..db15830 100644 --- a/pl/math/sv_atanf_2u9.c +++ b/pl/math/sv_atanf_2u9.c @@ -6,6 +6,8 @@ */ #include "sv_math.h" +#include "pl_sig.h" + #if SV_SUPPORTED #include "sv_atanf_common.h" @@ -46,4 +48,5 @@ __sv_atanf_x (sv_f32_t x, const svbool_t pg) strong_alias (__sv_atanf_x, _ZGVsMxv_atanf) + PL_SIG (SV, F, 1, atan, -3.1, 3.1) #endif diff --git a/pl/math/sv_cos_2u5.c b/pl/math/sv_cos_2u5.c index 483c73f..a19be9b 100644 --- a/pl/math/sv_cos_2u5.c +++ b/pl/math/sv_cos_2u5.c @@ -6,6 +6,8 @@ */ #include "sv_math.h" +#include "pl_sig.h" + #if SV_SUPPORTED #define InvPio2 (sv_f64 (0x1.45f306dc9c882p-1)) @@ -74,4 +76,5 @@ __sv_cos_x (sv_f64_t x, const svbool_t pg) strong_alias (__sv_cos_x, _ZGVsMxv_cos) + PL_SIG (SV, D, 1, cos, -3.1, 3.1) #endif diff --git a/pl/math/sv_cosf_2u1.c b/pl/math/sv_cosf_2u1.c index 70057ea..3bc3d71 100644 --- a/pl/math/sv_cosf_2u1.c +++ b/pl/math/sv_cosf_2u1.c @@ -6,6 +6,8 @@ */ #include "sv_math.h" +#include "pl_sig.h" + #if SV_SUPPORTED #define NegPio2_1 (sv_f32 (-0x1.921fb6p+0f)) @@ -72,4 +74,5 @@ __sv_cosf_x (sv_f32_t x, const svbool_t pg) strong_alias (__sv_cosf_x, _ZGVsMxv_cosf) + PL_SIG (SV, F, 1, cos, -3.1, 3.1) #endif diff --git a/pl/math/sv_erf_2u5.c b/pl/math/sv_erf_2u5.c index 1265047..eac500c 100644 --- a/pl/math/sv_erf_2u5.c +++ b/pl/math/sv_erf_2u5.c @@ -6,6 +6,8 @@ */ #include "sv_math.h" +#include "pl_sig.h" + #if SV_SUPPORTED #define Scale (8.0) @@ -87,4 +89,5 @@ __sv_erf_x (sv_f64_t x, const svbool_t pg) strong_alias (__sv_erf_x, _ZGVsMxv_erf) + PL_SIG (SV, D, 1, erf, -4.0, 4.0) #endif diff --git a/pl/math/sv_erfc_4u.c b/pl/math/sv_erfc_4u.c index 33c1c62..41fb654 100644 --- a/pl/math/sv_erfc_4u.c +++ b/pl/math/sv_erfc_4u.c @@ -6,6 +6,7 @@ */ #include "sv_math.h" +#include "pl_sig.h" #if SV_SUPPORTED #include "sv_exp_tail.h" @@ -133,4 +134,5 @@ __sv_erfc_x (sv_f64_t x, const svbool_t pg) strong_alias (__sv_erfc_x, _ZGVsMxv_erfc) + PL_SIG (SV, D, 1, erfc, -4.0, 10.0) #endif diff --git a/pl/math/sv_erff_1u3.c b/pl/math/sv_erff_1u3.c index f0af98e..02d7625 100644 --- a/pl/math/sv_erff_1u3.c +++ b/pl/math/sv_erff_1u3.c @@ -6,6 +6,8 @@ */ #include "sv_math.h" +#include "pl_sig.h" + #if SV_SUPPORTED #define AbsMask (0x7fffffff) @@ -88,4 +90,5 @@ __sv_erff_x (sv_f32_t x, const svbool_t pg) strong_alias (__sv_erff_x, _ZGVsMxv_erff) + PL_SIG (SV, F, 1, erf, -4.0, 4.0) #endif diff --git a/pl/math/sv_expf_2u.c b/pl/math/sv_expf_2u.c index 9ae9d60..d301392 100644 --- a/pl/math/sv_expf_2u.c +++ b/pl/math/sv_expf_2u.c @@ -6,6 +6,8 @@ */ #include "sv_math.h" +#include 
"pl_sig.h" + #if SV_SUPPORTED #define C(i) __sv_expf_poly[i] @@ -140,4 +142,5 @@ __sv_expf_x (sv_f32_t x, const svbool_t pg) strong_alias (__sv_expf_x, _ZGVsMxv_expf) + PL_SIG (SV, F, 1, exp, -9.9, 9.9) #endif // SV_SUPPORTED diff --git a/pl/math/sv_log10_2u5.c b/pl/math/sv_log10_2u5.c index 92dbfa4..d6ed49a 100644 --- a/pl/math/sv_log10_2u5.c +++ b/pl/math/sv_log10_2u5.c @@ -5,8 +5,9 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "math_config.h" #include "sv_math.h" +#include "math_config.h" +#include "pl_sig.h" #if SV_SUPPORTED @@ -76,4 +77,5 @@ __sv_log10_x (sv_f64_t x, const svbool_t pg) strong_alias (__sv_log10_x, _ZGVsMxv_log10) + PL_SIG (SV, D, 1, log10, 0.01, 11.1) #endif diff --git a/pl/math/sv_log10f_3u5.c b/pl/math/sv_log10f_3u5.c index fe8ecfd..c1ff196 100644 --- a/pl/math/sv_log10f_3u5.c +++ b/pl/math/sv_log10f_3u5.c @@ -6,6 +6,7 @@ */ #include "sv_math.h" +#include "pl_sig.h" #if SV_SUPPORTED @@ -75,4 +76,5 @@ __sv_log10f_x (sv_f32_t x, const svbool_t pg) strong_alias (__sv_log10f_x, _ZGVsMxv_log10f) + PL_SIG (SV, F, 1, log10, 0.01, 11.1) #endif diff --git a/pl/math/sv_log_2u5.c b/pl/math/sv_log_2u5.c index c10299c..a50c3d6 100644 --- a/pl/math/sv_log_2u5.c +++ b/pl/math/sv_log_2u5.c @@ -6,6 +6,8 @@ */ #include "sv_math.h" +#include "pl_sig.h" + #if SV_SUPPORTED #define A(i) __sv_log_data.poly[i] @@ -71,4 +73,5 @@ __sv_log_x (sv_f64_t x, const svbool_t pg) strong_alias (__sv_log_x, _ZGVsMxv_log); +PL_SIG (SV, D, 1, log, 0.01, 11.1) #endif // SV_SUPPORTED diff --git a/pl/math/sv_logf_3u4.c b/pl/math/sv_logf_3u4.c index 125f806..e9147e4 100644 --- a/pl/math/sv_logf_3u4.c +++ b/pl/math/sv_logf_3u4.c @@ -6,6 +6,8 @@ */ #include "sv_math.h" +#include "pl_sig.h" + #if SV_SUPPORTED #define P(i) __sv_logf_poly[i] @@ -63,4 +65,5 @@ __sv_logf_x (sv_f32_t x, const svbool_t pg) strong_alias (__sv_logf_x, _ZGVsMxv_logf) + PL_SIG (SV, F, 1, log, 0.01, 11.1) #endif // SV_SUPPORTED diff --git a/pl/math/sv_sin_3u.c b/pl/math/sv_sin_3u.c index be873a2..4d879e0 100644 --- a/pl/math/sv_sin_3u.c +++ b/pl/math/sv_sin_3u.c @@ -6,6 +6,8 @@ */ #include "sv_math.h" +#include "pl_sig.h" + #if SV_SUPPORTED #define InvPi (sv_f64 (0x1.45f306dc9c883p-2)) @@ -79,4 +81,5 @@ __sv_sin_x (sv_f64_t x, const svbool_t pg) strong_alias (__sv_sin_x, _ZGVsMxv_sin) + PL_SIG (SV, D, 1, sin, -3.1, 3.1) #endif diff --git a/pl/math/sv_sinf_1u9.c b/pl/math/sv_sinf_1u9.c index f7913ca..5634a87 100644 --- a/pl/math/sv_sinf_1u9.c +++ b/pl/math/sv_sinf_1u9.c @@ -6,6 +6,8 @@ */ #include "sv_math.h" +#include "pl_sig.h" + #if SV_SUPPORTED #define A3 (sv_f32 (__sv_sinf_data.coeffs[3])) @@ -74,4 +76,5 @@ __sv_sinf_x (sv_f32_t x, const svbool_t pg) strong_alias (__sv_sinf_x, _ZGVsMxv_sinf) + PL_SIG (SV, F, 1, sin, -3.1, 3.1) #endif diff --git a/pl/math/sv_tanf_3u2.c b/pl/math/sv_tanf_3u2.c index e1d3757..2f28239 100644 --- a/pl/math/sv_tanf_3u2.c +++ b/pl/math/sv_tanf_3u2.c @@ -6,6 +6,8 @@ */ #include "sv_math.h" +#include "pl_sig.h" + #if SV_SUPPORTED /* Constants. */ @@ -98,4 +100,5 @@ __sv_tanf_x (sv_f32_t x, const svbool_t pg) strong_alias (__sv_tanf_x, _ZGVsMxv_tanf) + PL_SIG (SV, F, 1, tan, -3.1, 3.1) #endif diff --git a/pl/math/tanf_3u3.c b/pl/math/tanf_3u3.c index e6f899f..3e4ad38 100644 --- a/pl/math/tanf_3u3.c +++ b/pl/math/tanf_3u3.c @@ -5,6 +5,7 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" +#include "pl_sig.h" /* Useful constants. 
*/
 #define NegPio2_1 (-0x1.921fb6p+0f)
@@ -190,3 +191,5 @@ tanf (float x)
   /* A unified way of assembling the result on both interval types. */
   return fmaf (scale, p, offset);
 }
+
+PL_SIG (S, F, 1, tan, -3.1, 3.1)
diff --git a/pl/math/tanhf_2u6.c b/pl/math/tanhf_2u6.c
index 145f437..90f561f 100644
--- a/pl/math/tanhf_2u6.c
+++ b/pl/math/tanhf_2u6.c
@@ -5,6 +5,7 @@
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 #include "math_config.h"
+#include "pl_sig.h"
 
 #define BoringBound \
   0x41102cb3 /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for \
@@ -78,3 +79,5 @@ tanhf (float x)
   float q = expm1f_inline (2 * x);
   return q / (q + 2);
 }
+
+PL_SIG (S, F, 1, tanh, -10.0, 10.0)
diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h
index 9e3b9a0..e3eda6f 100644
--- a/pl/math/test/mathbench_funcs.h
+++ b/pl/math/test/mathbench_funcs.h
@@ -6,86 +6,64 @@
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
+#define _ZSF1(fun, a, b) F(fun##f, a, b)
+#define _ZSD1(f, a, b) D(f, a, b)
+
 #ifdef __vpcs
-#define ZVNF(f, a, b) F(__s_##f, a, b) VF(__v_##f, a, b) VNF(__vn_##f, a, b) VNF(_ZGVnN4v_##f, a, b)
-#define ZVND(f, a, b) D(__s_##f, a, b) VD(__v_##f, a, b) VND(__vn_##f, a, b) VND(_ZGVnN2v_##f, a, b)
+#define _ZVF1(fun, a, b) F(__s_##fun##f, a, b) VF(__v_##fun##f, a, b) VNF(__vn_##fun##f, a, b) VNF(_ZGVnN4v_##fun##f, a, b)
+#define _ZVD1(f, a, b) D(__s_##f, a, b) VD(__v_##f, a, b) VND(__vn_##f, a, b) VND(_ZGVnN2v_##f, a, b)
 
 #elif __aarch64__
-#define ZVNF(f, a, b) F(__s_##f, a, b) VF(__v_##f, a, b)
-#define ZVND(f, a, b) D(__s_##f, a, b) VD(__v_##f, a, b)
+#define _ZVF1(fun, a, b) F(__s_##fun##f, a, b) VF(__v_##fun##f, a, b)
+#define _ZVD1(f, a, b) D(__s_##f, a, b) VD(__v_##f, a, b)
+
+#elif WANT_VMATH
+
+#define _ZVF1(fun, a, b) F(__s_##fun##f, a, b)
+#define _ZVD1(f, a, b) D(__s_##f, a, b)
+
+#else
+
+#define _ZVF1(f, a, b)
+#define _ZVD1(f, a, b)
+
+#endif
+
+#if WANT_SVE_MATH
+
+#define _ZSVF1(fun, a, b) SVF(__sv_##fun##f_x, a, b) SVF(_ZGVsMxv_##fun##f, a, b)
+#define _ZSVD1(f, a, b) SVD(__sv_##f##_x, a, b) SVD(_ZGVsMxv_##f, a, b)
+#else
-#define ZVNF(f, a, b) F(__s_##f, a, b)
-#define ZVND(f, a, b) D(__s_##f, a, b)
+#define _ZSVF1(f, a, b)
+#define _ZSVD1(f, a, b)
 #endif
-#define VZSVF(f, a, b) SVF(__sv_##f##_x, a, b) SVF(_ZGVsMxv_##f, a, b)
-#define VZSVD(f, a, b) SVD(__sv_##f##_x, a, b) SVD(_ZGVsMxv_##f, a, b)
+/* No auto-generated wrappers for binary functions - they have to be
+   manually defined in mathbench_wrappers.h. We have to define silent
+   macros for them anyway as they will be emitted by PL_SIG. */
+#define _ZSF2(...)
+#define _ZSD2(...)
+#define _ZVF2(...)
+#define _ZVD2(...)
+#define _ZSVF2(...)
+#define _ZSVD2(...)
+
+#include "mathbench_funcs_gen.h"
+
+/* PL_SIG only emits entries for unary functions, since if a function
+   needs to be wrapped in mathbench there is no way for it to know the
+   name of the wrapper. Add entries for binary functions, or any other
+   exotic signatures that need wrapping, below.
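+
+   For example, PL_SIG (S, F, 1, acosh, 1.0, 10.0) is rewritten by the
+   Dir.mk rule into _ZSF1 (acosh, 1.0, 10.0), which the macros above
+   expand to the benchmark entry F (acoshf, 1.0, 10.0) - replacing the
+   hand-maintained entry removed below.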
*/ -F (acoshf, 1.0, 10.0) -F (asinhf, -10.0, 10.0) -F (atanf, -10.0, 10.0) {"atan2f", 'f', 0, -10.0, 10.0, {.f = atan2f_wrap}}, -F (atanhf, -1.0, 1.0) -F (cbrtf, -10.0, 10.0) -F (cosf, -3.1, 3.1) -F (coshf, -10.0, 10.0) -F (erfcf, -4.0, 10.0) -F (erff, -4.0, 4.0) -F (expm1f, -9.9, 9.9) -F (log10f, 0.01, 11.1) -F (log1pf, -0.9, 10.0) -F (log2f, 0.01, 11.1) -F (sinf, -3.1, 3.1) -F (sinhf, -10.0, 10.0) -F (tanf, -3.1, 3.1) -F (tanhf, -10.0, 10.0) - -D (acosh, 1.0, 10.0) -D (asinh, -10.0, 10.0) -D (atan, -10.0, 10.0) {"atan2", 'd', 0, -10.0, 10.0, {.d = atan2_wrap}}, -D (cos, -3.1, 3.1) -D (cosh, -10.0, 10.0) -D (erf, -6,6) -D (erfc, -6.0, 28.0) -D (expm1, -9.9, 9.9) -D (log10, 0.01, 11.1) -D (log1p, -0.9, 10.0) -D (log2, 0.01, 11.1) {"powi", 'd', 0, 0.01, 11.1, {.d = powi_wrap}}, -D (sin, -3.1, 3.1) -D (sinh, -10.0, 10.0) - -#if WANT_VMATH -ZVNF (asinhf, -10.0, 10.0) -ZVND (asinh, -10.0, 10.0) -ZVNF (atanf, -10.0, 10.0) -ZVNF (atanhf, -1.0, 1.0) -ZVND (atan, -10.0, 10.0) -ZVNF (cbrtf, -10.0, 10.0) -ZVNF (coshf, -10.0, 10.0) -ZVND (cosh, -10.0, 10.0) -ZVNF (erff, -4.0, 4.0) -ZVND (erf, -6.0, 6.0) -ZVNF (erfcf, -6.0, 28.0) -ZVND (erfc, -6.0, 28.0) -ZVNF (expm1f, -9.9, 9.9) -ZVND (expm1, -9.9, 9.9) -ZVNF (log10f, 0.01, 11.1) -ZVND (log10, 0.01, 11.1) -ZVNF (log1pf, -0.9, 10.0) -ZVND (log1p, -0.9, 10.0) -ZVNF (log2f, 0.01, 11.1) -ZVND (log2, 0.01, 11.1) -ZVNF (sinhf, -10.0, 10.0) -ZVND (sinh, -10.0, 10.0) -ZVNF (tanf, -3.1, 3.1) -ZVNF (tanhf, -10.0, 10.0) + {"__s_atan2f", 'f', 0, -10.0, 10.0, {.f = __s_atan2f_wrap}}, {"__s_atan2", 'd', 0, -10.0, 10.0, {.d = __s_atan2_wrap}}, {"__v_atan2f", 'f', 'v', -10.0, 10.0, {.vf = __v_atan2f_wrap}}, @@ -96,30 +74,13 @@ ZVNF (tanhf, -10.0, 10.0) {"_ZGVnN2vv_atan2", 'd', 'n', -10.0, 10.0, {.vnd = _Z_atan2_wrap}}, #if WANT_SVE_MATH -VZSVF (atanf, -3.1, 3.1) -VZSVD (atan, -3.1, 3.1) {"__sv_atan2f_x", 'f', 'n', -10.0, 10.0, {.svf = __sv_atan2f_wrap}}, {"_ZGVsMxvv_atan2f", 'f', 'n', -10.0, 10.0, {.svf = _Z_sv_atan2f_wrap}}, {"__sv_atan2_x", 'd', 'n', -10.0, 10.0, {.svd = __sv_atan2_wrap}}, {"_ZGVsM2vv_atan2", 'd', 'n', -10.0, 10.0, {.svd = _Z_sv_atan2_wrap}}, -VZSVF (erff, -4.0, 4.0) -VZSVD (erf, -4.0, 4.0) -VZSVD (erfc, -4, 10) -VZSVF (expf, -9.9, 9.9) -VZSVF (cosf, -3.1, 3.1) -VZSVD (cos, -3.1, 3.1) -VZSVF (sinf, -3.1, 3.1) -VZSVD (sin, -3.1, 3.1) -VZSVF (logf, 0.01, 11.1) -VZSVD (log, 0.01, 11.1) -VZSVF (log10f, 0.01, 11.1) -VZSVD (log10, 0.01, 11.1) -VZSVF (tanf, -3.1, 3.1) {"__sv_powif_x", 'f', 'n', -10.0, 10.0, {.svf = __sv_powif_wrap}}, {"_ZGVsMxvv_powi", 'f', 'n', -10.0, 10.0, {.svf = _Z_sv_powi_wrap}}, {"__sv_powi_x", 'd', 'n', -10.0, 10.0, {.svd = __sv_powi_wrap}}, {"_ZGVsMxvv_powk", 'd', 'n', -10.0, 10.0, {.svd = _Z_sv_powk_wrap}}, - -#endif #endif // clang-format on diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index 86e2bed..af1c464 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -7,103 +7,60 @@ #ifdef __vpcs -#define _ZVNF1(f) SF1 (f) VF1 (f) ZVNF1 (f) -#define _ZVND1(f) SD1 (f) VD1 (f) ZVND1 (f) -#define _ZVNF2(f) SF2 (f) VF2 (f) ZVNF2 (f) -#define _ZVND2(f) SD2 (f) VD2 (f) ZVND2 (f) +#define _ZVF1(f) SF1 (f) VF1 (f) ZVNF1 (f) +#define _ZVD1(f) SD1 (f) VD1 (f) ZVND1 (f) +#define _ZVF2(f) SF2 (f) VF2 (f) ZVNF2 (f) +#define _ZVD2(f) SD2 (f) VD2 (f) ZVND2 (f) #elif __aarch64 -#define _ZVNF1(f) SF1 (f) VF1 (f) -#define _ZVND1(f) SD1 (f) VD1 (f) -#define _ZVNF2(f) SF2 (f) VF2 (f) -#define _ZVND2(f) SD2 (f) VD2 (f) +#define _ZVF1(f) SF1 (f) VF1 (f) +#define _ZVD1(f) SD1 (f) VD1 (f) +#define _ZVF2(f) SF2 (f) VF2 
(f) +#define _ZVD2(f) SD2 (f) VD2 (f) + +#elif WANT_VMATH + +#define _ZVF1(f) SF1 (f) +#define _ZVD1(f) SD1 (f) +#define _ZVF2(f) SF2 (f) +#define _ZVD2(f) SD2 (f) #else -#define _ZVNF1(f) SF1 (f) -#define _ZVND1(f) SD1 (f) -#define _ZVNF2(f) SF2 (f) -#define _ZVND2(f) SD2 (f) +#define _ZVF1(f) +#define _ZVD1(f) +#define _ZVF2(f) +#define _ZVD2(f) #endif +#if WANT_SVE_MATH + #define _ZSVF1(f) SVF1 (f) ZSVF1 (f) #define _ZSVF2(f) SVF2 (f) ZSVF2 (f) #define _ZSVD1(f) SVD1 (f) ZSVD1 (f) #define _ZSVD2(f) SVD2 (f) ZSVD2 (f) -F1 (acosh) -F1 (asinh) -F2 (atan2) -F1 (atanh) -F1 (cbrt) -F1 (cosh) -F1 (erfc) -F1 (erf) -F1 (expm1) -F1 (log10) -F1 (log1p) -F1 (sinh) -F1 (tan) -F1 (tanh) -D1 (acosh) -D1 (asinh) -D2 (atan2) -D1 (cosh) -D1 (erfc) -D1 (expm1) -D1 (log10) -D1 (log1p) -D1 (sinh) -#if WANT_VMATH -_ZVNF1 (asinh) -_ZVND1 (asinh) -_ZVNF1 (atan) -_ZVND1 (atan) -_ZVNF2 (atan2) -_ZVND2 (atan2) -_ZVNF1 (atanh) -_ZVNF1 (cbrt) -_ZVNF1 (cosh) -_ZVND1 (cosh) -_ZVNF1 (erf) -_ZVND1 (erf) -_ZVNF1 (erfc) -_ZVND1 (erfc) -_ZVNF1 (expm1) -_ZVND1 (expm1) -_ZVNF1 (log10) -_ZVND1 (log10) -_ZVNF1 (log1p) -_ZVND1 (log1p) -_ZVNF1 (log2) -_ZVND1 (log2) -_ZVNF1 (sinh) -_ZVND1 (sinh) -_ZVNF1 (tan) -_ZVNF1 (tanh) +#else + +#define _ZSVF1(f) +#define _ZSVF2(f) +#define _ZSVD1(f) +#define _ZSVD2(f) + +#endif + +#define _ZSF1(f) F1 (f) +#define _ZSF2(f) F2 (f) +#define _ZSD1(f) D1 (f) +#define _ZSD2(f) D2 (f) + +#include "ulp_funcs_gen.h" + #if WANT_SVE_MATH -_ZSVF2 (atan2) -_ZSVD2 (atan2) -_ZSVF1 (atan) -_ZSVD1 (atan) -_ZSVF1 (cos) -_ZSVD1 (cos) -_ZSVF1 (erf) -_ZSVD1 (erf) -_ZSVD1 (erfc) -_ZSVF1 (exp) -_ZSVF1 (log) -_ZSVD1 (log) -_ZSVF1 (log10) -_ZSVD1 (log10) F (__sv_powi, sv_powi, ref_powi, mpfr_powi, 2, 0, d2, 0) F (_ZGVsMxvv_powk, Z_sv_powk, ref_powi, mpfr_powi, 2, 0, d2, 0) F (__sv_powif, sv_powif, ref_powif, mpfr_powi, 2, 1, f2, 0) F (_ZGVsMxvv_powi, Z_sv_powi, ref_powif, mpfr_powi, 2, 1, f2, 0) -_ZSVF1 (sin) -_ZSVD1 (sin) -_ZSVF1 (tan) -#endif #endif diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index be87c21..e91cbe5 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -89,13 +89,20 @@ DECL_POW_INT_REF(ref_powi, long double, double, int) #define ZVND1_WRAP(func) VD1_WRAP(func) VND1_WRAP(func) #define ZVND2_WRAP(func) VD2_WRAP(func) VND2_WRAP(func) -#else +#elif WANT_VMATH #define ZVNF1_WRAP(func) VF1_WRAP(func) #define ZVNF2_WRAP(func) VF2_WRAP(func) #define ZVND1_WRAP(func) VD1_WRAP(func) #define ZVND2_WRAP(func) VD2_WRAP(func) +#else + +#define ZVNF1_WRAP(func) +#define ZVNF2_WRAP(func) +#define ZVND1_WRAP(func) +#define ZVND2_WRAP(func) + #endif #define SVF1_WRAP(func) static float sv_##func##f(float x) { return svretf(__sv_##func##f_x(svargf(x), svptrue_b32())); } @@ -108,62 +115,34 @@ DECL_POW_INT_REF(ref_powi, long double, double, int) #define ZSVD1_WRAP(func) static double Z_sv_##func(double x) { return svretd(_ZGVsMxv_##func(svargd(x), svptrue_b64())); } #define ZSVD2_WRAP(func) static double Z_sv_##func(double x, double y) { return svretd(_ZGVsMxvv_##func(svargd(x), svargd(y), svptrue_b64())); } +#if WANT_SVE_MATH + #define ZSVNF1_WRAP(func) SVF1_WRAP(func) ZSVF1_WRAP(func) #define ZSVNF2_WRAP(func) SVF2_WRAP(func) ZSVF2_WRAP(func) #define ZSVND1_WRAP(func) SVD1_WRAP(func) ZSVD1_WRAP(func) #define ZSVND2_WRAP(func) SVD2_WRAP(func) ZSVD2_WRAP(func) -/* Wrappers for vector functions. 
*/ -#if WANT_VMATH -ZVNF1_WRAP(asinh) -ZVNF1_WRAP(atan) -ZVNF2_WRAP(atan2) -ZVNF1_WRAP(atanh) -ZVNF1_WRAP(cbrt) -ZVNF1_WRAP(cosh) -ZVNF1_WRAP(erf) -ZVNF1_WRAP(erfc) -ZVNF1_WRAP(expm1) -ZVNF1_WRAP(log10) -ZVNF1_WRAP(log1p) -ZVNF1_WRAP(log2) -ZVNF1_WRAP(sinh) -ZVNF1_WRAP(tan) -ZVNF1_WRAP(tanh) -ZVND1_WRAP(asinh) -ZVND1_WRAP(atan) -ZVND2_WRAP(atan2) -ZVND1_WRAP(cosh) -ZVND1_WRAP(erf) -ZVND1_WRAP(erfc) -ZVND1_WRAP(expm1) -ZVND1_WRAP(log10) -ZVND1_WRAP(log1p) -ZVND1_WRAP(log2) -ZVND1_WRAP(sinh) +#else + +#define ZSVNF1_WRAP(func) +#define ZSVNF2_WRAP(func) +#define ZSVND1_WRAP(func) +#define ZSVND2_WRAP(func) + +#endif + +/* No wrappers for scalar routines, but PL_SIG will emit them. */ +#define ZSNF1_WRAP(func) +#define ZSNF2_WRAP(func) +#define ZSND1_WRAP(func) +#define ZSND2_WRAP(func) + +#include "ulp_wrappers_gen.h" + #if WANT_SVE_MATH -ZSVNF2_WRAP(atan2) -ZSVNF1_WRAP(atan) -ZSVNF1_WRAP(cos) -ZSVNF1_WRAP(erf) -ZSVNF1_WRAP(exp) -ZSVNF1_WRAP(log) -ZSVNF1_WRAP(log10) -ZSVNF1_WRAP(sin) -ZSVNF1_WRAP(tan) static float Z_sv_powi(float x, float y) { return svretf(_ZGVsMxvv_powi(svargf(x), svdup_n_s32((int)round(y)), svptrue_b32())); } static float sv_powif(float x, float y) { return svretf(__sv_powif_x(svargf(x), svdup_n_s32((int)round(y)), svptrue_b32())); } - -ZSVND2_WRAP(atan2) -ZSVND1_WRAP(atan) -ZSVND1_WRAP(cos) -ZSVND1_WRAP(erf) -ZSVND1_WRAP(erfc) -ZSVND1_WRAP(log) -ZSVND1_WRAP(log10) -ZSVND1_WRAP(sin) static double Z_sv_powk(double x, double y) { return svretd(_ZGVsMxvv_powk(svargd(x), svdup_n_s64((long)round(y)), svptrue_b64())); } static double sv_powi(double x, double y) { return svretd(__sv_powi_x(svargd(x), svdup_n_s64((long)round(y)), svptrue_b64())); } #endif -#endif // clang-format on diff --git a/pl/math/v_asinh_2u5.c b/pl/math/v_asinh_2u5.c index 974e6df..02e8098 100644 --- a/pl/math/v_asinh_2u5.c +++ b/pl/math/v_asinh_2u5.c @@ -6,6 +6,7 @@ #include "v_math.h" #include "estrin.h" +#include "pl_sig.h" #if V_SUPPORTED @@ -152,4 +153,5 @@ VPCS_ATTR v_f64_t V_NAME (asinh) (v_f64_t x) } VPCS_ALIAS +PL_SIG (V, D, 1, asinh, -10.0, 10.0) #endif diff --git a/pl/math/v_asinhf_2u7.c b/pl/math/v_asinhf_2u7.c index 7bce7ff..18a2395 100644 --- a/pl/math/v_asinhf_2u7.c +++ b/pl/math/v_asinhf_2u7.c @@ -6,6 +6,7 @@ #include "v_math.h" #include "include/mathlib.h" +#include "pl_sig.h" #if V_SUPPORTED @@ -53,4 +54,5 @@ VPCS_ATTR v_f32_t V_NAME (asinhf) (v_f32_t x) } VPCS_ALIAS +PL_SIG (V, F, 1, asinh, -10.0, 10.0) #endif diff --git a/pl/math/v_atan2_3u.c b/pl/math/v_atan2_3u.c index d69d221..2b31bec 100644 --- a/pl/math/v_atan2_3u.c +++ b/pl/math/v_atan2_3u.c @@ -6,6 +6,8 @@ */ #include "v_math.h" +#include "pl_sig.h" + #if V_SUPPORTED #include "atan_common.h" @@ -75,4 +77,6 @@ v_f64_t V_NAME (atan2) (v_f64_t y, v_f64_t x) } VPCS_ALIAS +/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */ +PL_SIG (V, D, 2, atan2) #endif diff --git a/pl/math/v_atan2f_3u.c b/pl/math/v_atan2f_3u.c index dc0fbca..8c2c8f2 100644 --- a/pl/math/v_atan2f_3u.c +++ b/pl/math/v_atan2f_3u.c @@ -6,6 +6,8 @@ */ #include "v_math.h" +#include "pl_sig.h" + #if V_SUPPORTED #include "atanf_common.h" @@ -75,4 +77,6 @@ v_f32_t V_NAME (atan2f) (v_f32_t y, v_f32_t x) } VPCS_ALIAS +/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. 
*/ +PL_SIG (V, F, 2, atan2) #endif diff --git a/pl/math/v_atan_2u5.c b/pl/math/v_atan_2u5.c index 05c77c0..3e504e7 100644 --- a/pl/math/v_atan_2u5.c +++ b/pl/math/v_atan_2u5.c @@ -6,6 +6,8 @@ */ #include "v_math.h" +#include "pl_sig.h" + #if V_SUPPORTED #include "atan_common.h" @@ -47,4 +49,6 @@ v_f64_t V_NAME (atan) (v_f64_t x) return y; } VPCS_ALIAS + +PL_SIG (V, D, 1, atan, -10.0, 10.0) #endif diff --git a/pl/math/v_atanf_3u.c b/pl/math/v_atanf_3u.c index 7c84244..9a0230a 100644 --- a/pl/math/v_atanf_3u.c +++ b/pl/math/v_atanf_3u.c @@ -6,6 +6,8 @@ */ #include "v_math.h" +#include "pl_sig.h" + #if V_SUPPORTED #include "atanf_common.h" @@ -46,4 +48,6 @@ v_f32_t V_NAME (atanf) (v_f32_t x) return y; } VPCS_ALIAS + +PL_SIG (V, F, 1, atan, -10.0, 10.0) #endif diff --git a/pl/math/v_atanhf_3u1.c b/pl/math/v_atanhf_3u1.c index 1e3a561..5c9ceb1 100644 --- a/pl/math/v_atanhf_3u1.c +++ b/pl/math/v_atanhf_3u1.c @@ -6,6 +6,7 @@ #include "v_math.h" #include "mathlib.h" +#include "pl_sig.h" #if V_SUPPORTED @@ -48,4 +49,5 @@ VPCS_ATTR v_f32_t V_NAME (atanhf) (v_f32_t x) VPCS_ALIAS +PL_SIG (V, F, 1, atanh, -1.0, 1.0) #endif diff --git a/pl/math/v_cbrtf_1u5.c b/pl/math/v_cbrtf_1u5.c index fd43051..9db2579 100644 --- a/pl/math/v_cbrtf_1u5.c +++ b/pl/math/v_cbrtf_1u5.c @@ -6,6 +6,7 @@ #include "v_math.h" #include "mathlib.h" +#include "pl_sig.h" #if V_SUPPORTED @@ -85,4 +86,5 @@ VPCS_ATTR v_f32_t V_NAME (cbrtf) (v_f32_t x) } VPCS_ALIAS +PL_SIG (V, F, 1, cbrt, -10.0, 10.0) #endif diff --git a/pl/math/v_cosh_2u.c b/pl/math/v_cosh_2u.c index 6d1a9ed..fa4f1e6 100644 --- a/pl/math/v_cosh_2u.c +++ b/pl/math/v_cosh_2u.c @@ -5,6 +5,7 @@ */ #include "v_math.h" +#include "pl_sig.h" #include "v_exp_tail.h" #define C1 v_f64 (C1_scal) @@ -83,4 +84,5 @@ VPCS_ATTR v_f64_t V_NAME (cosh) (v_f64_t x) } VPCS_ALIAS +PL_SIG (V, D, 1, cosh, -10.0, 10.0) #endif diff --git a/pl/math/v_coshf_2u4.c b/pl/math/v_coshf_2u4.c index 7d7a228..95288d9 100644 --- a/pl/math/v_coshf_2u4.c +++ b/pl/math/v_coshf_2u4.c @@ -6,6 +6,7 @@ #include "v_math.h" #include "mathlib.h" +#include "pl_sig.h" #define AbsMask 0x7fffffff #define TinyBound 0x20000000 /* 0x1p-63: Round to 1 below this. */ @@ -59,4 +60,5 @@ VPCS_ATTR v_f32_t V_NAME (coshf) (v_f32_t x) } VPCS_ALIAS +PL_SIG (V, F, 1, cosh, -10.0, 10.0) #endif diff --git a/pl/math/v_erf_2u.c b/pl/math/v_erf_2u.c index 5a7403f..192b6ed 100644 --- a/pl/math/v_erf_2u.c +++ b/pl/math/v_erf_2u.c @@ -5,9 +5,11 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +#include "v_math.h" #include "include/mathlib.h" #include "math_config.h" -#include "v_math.h" +#include "pl_sig.h" + #if V_SUPPORTED #define AbsMask v_u64 (0x7fffffffffffffff) @@ -101,4 +103,6 @@ v_f64_t V_NAME (erf) (v_f64_t x) return y; } VPCS_ALIAS + +PL_SIG (V, D, 1, erf, -6.0, 6.0) #endif diff --git a/pl/math/v_erfc_4u.c b/pl/math/v_erfc_4u.c index 80e11e7..88f5172 100644 --- a/pl/math/v_erfc_4u.c +++ b/pl/math/v_erfc_4u.c @@ -5,9 +5,11 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "math_config.h" #include "v_math.h" #include "horner.h" +#include "math_config.h" +#include "pl_sig.h" + #if V_SUPPORTED /* Accurate exponential (vector variant of exp_dd). 
*/ @@ -153,4 +155,6 @@ v_f64_t V_NAME (erfc) (v_f64_t x) return y; } VPCS_ALIAS + +PL_SIG (V, D, 1, erfc, -6.0, 28.0) #endif diff --git a/pl/math/v_erfcf_1u.c b/pl/math/v_erfcf_1u.c index d9c65a5..cf2b174 100644 --- a/pl/math/v_erfcf_1u.c +++ b/pl/math/v_erfcf_1u.c @@ -8,6 +8,7 @@ #include "v_math.h" #include "erfcf.h" #include "estrin.h" +#include "pl_sig.h" #if V_SUPPORTED @@ -169,4 +170,6 @@ v_f32_t V_NAME (erfcf) (v_f32_t x) return y; } VPCS_ALIAS + +PL_SIG (V, F, 1, erfc, -6.0, 28.0) #endif diff --git a/pl/math/v_erff_1u5.c b/pl/math/v_erff_1u5.c index 4407cd1..3e8d400 100644 --- a/pl/math/v_erff_1u5.c +++ b/pl/math/v_erff_1u5.c @@ -5,9 +5,11 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +#include "v_math.h" #include "include/mathlib.h" #include "math_config.h" -#include "v_math.h" +#include "pl_sig.h" + #if V_SUPPORTED VPCS_ATTR v_f32_t V_NAME (expf) (v_f32_t); @@ -101,4 +103,6 @@ v_f32_t V_NAME (erff) (v_f32_t x) return y; } VPCS_ALIAS + +PL_SIG (V, F, 1, erf, -4.0, 4.0) #endif diff --git a/pl/math/v_exp_tail.c b/pl/math/v_exp_tail.c index cf834e6..fabc110 100644 --- a/pl/math/v_exp_tail.c +++ b/pl/math/v_exp_tail.c @@ -5,8 +5,8 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "math_config.h" #include "v_math.h" +#include "math_config.h" #if V_SUPPORTED #include "v_exp_tail.h" diff --git a/pl/math/v_expf.c b/pl/math/v_expf.c index 2707ebc..d6e5720 100644 --- a/pl/math/v_expf.c +++ b/pl/math/v_expf.c @@ -5,8 +5,8 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "mathlib.h" #include "v_math.h" +#include "mathlib.h" #if V_SUPPORTED static const float Poly[] = { diff --git a/pl/math/v_expm1_2u5.c b/pl/math/v_expm1_2u5.c index 7a5818b..78576a9 100644 --- a/pl/math/v_expm1_2u5.c +++ b/pl/math/v_expm1_2u5.c @@ -6,6 +6,7 @@ */ #include "v_math.h" +#include "pl_sig.h" #if V_SUPPORTED @@ -98,4 +99,6 @@ v_f64_t V_NAME (expm1) (v_f64_t x) return y; } VPCS_ALIAS + +PL_SIG (V, D, 1, expm1, -9.9, 9.9) #endif diff --git a/pl/math/v_expm1f_1u6.c b/pl/math/v_expm1f_1u6.c index e18814e..6d1ae0e 100644 --- a/pl/math/v_expm1f_1u6.c +++ b/pl/math/v_expm1f_1u6.c @@ -6,6 +6,7 @@ */ #include "v_math.h" +#include "pl_sig.h" #if V_SUPPORTED @@ -81,4 +82,6 @@ v_f32_t V_NAME (expm1f) (v_f32_t x) return y; } VPCS_ALIAS + +PL_SIG (V, F, 1, expm1, -9.9, 9.9) #endif diff --git a/pl/math/v_log10_2u5.c b/pl/math/v_log10_2u5.c index c34167f..c481b00 100644 --- a/pl/math/v_log10_2u5.c +++ b/pl/math/v_log10_2u5.c @@ -5,8 +5,10 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "include/mathlib.h" #include "v_math.h" +#include "include/mathlib.h" +#include "pl_sig.h" + #if V_SUPPORTED #define A(i) v_f64 (__v_log10_data.poly[i]) @@ -98,4 +100,5 @@ v_f64_t V_NAME (log10) (v_f64_t x) } VPCS_ALIAS +PL_SIG (V, D, 1, log10, 0.01, 11.1) #endif diff --git a/pl/math/v_log10f_3u5.c b/pl/math/v_log10f_3u5.c index 4dede3d..8bf8bb3 100644 --- a/pl/math/v_log10f_3u5.c +++ b/pl/math/v_log10f_3u5.c @@ -5,8 +5,10 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "mathlib.h" #include "v_math.h" +#include "mathlib.h" +#include "pl_sig.h" + #if V_SUPPORTED #define P(i) v_f32 (__v_log10f_poly[i]) @@ -70,4 +72,6 @@ v_f32_t V_NAME (log10f) (v_f32_t x) return y; } VPCS_ALIAS + +PL_SIG (V, F, 1, log10, 0.01, 11.1) #endif diff --git a/pl/math/v_log1p_2u5.c b/pl/math/v_log1p_2u5.c index 3781522..e0d58ec 100644 --- a/pl/math/v_log1p_2u5.c +++ b/pl/math/v_log1p_2u5.c @@ -5,6 +5,8 @@ */ #include 
"v_math.h" +#include "pl_sig.h" + #if V_SUPPORTED #include "estrin.h" @@ -104,4 +106,5 @@ VPCS_ATTR v_f64_t V_NAME (log1p) (v_f64_t x) VPCS_ALIAS +PL_SIG (V, D, 1, log1p, -0.9, 10.0) #endif diff --git a/pl/math/v_log1pf_2u1.c b/pl/math/v_log1pf_2u1.c index 3ef8416..361fa4f 100644 --- a/pl/math/v_log1pf_2u1.c +++ b/pl/math/v_log1pf_2u1.c @@ -5,6 +5,8 @@ */ #include "v_math.h" +#include "pl_sig.h" + #if V_SUPPORTED #define AbsMask 0x7fffffff @@ -142,4 +144,5 @@ VPCS_ATTR v_f32_t V_NAME (log1pf) (v_f32_t x) } VPCS_ALIAS +PL_SIG (V, F, 1, log1p, -0.9, 10.0) #endif diff --git a/pl/math/v_log2_3u.c b/pl/math/v_log2_3u.c index af7340d..07afced 100644 --- a/pl/math/v_log2_3u.c +++ b/pl/math/v_log2_3u.c @@ -5,8 +5,10 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "include/mathlib.h" #include "v_math.h" +#include "include/mathlib.h" +#include "pl_sig.h" + #if V_SUPPORTED #define InvLn2 v_f64 (0x1.71547652b82fep0) @@ -85,4 +87,5 @@ v_f64_t V_NAME (log2) (v_f64_t x) } VPCS_ALIAS +PL_SIG (V, D, 1, log2, 0.01, 11.1) #endif diff --git a/pl/math/v_log2f_2u6.c b/pl/math/v_log2f_2u6.c index 73bb84f..335b324 100644 --- a/pl/math/v_log2f_2u6.c +++ b/pl/math/v_log2f_2u6.c @@ -5,8 +5,10 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "math_config.h" #include "v_math.h" +#include "math_config.h" +#include "pl_sig.h" + #if V_SUPPORTED #define N (1 << V_LOG2F_TABLE_BITS) @@ -116,4 +118,6 @@ VPCS_ATTR v_f32_t V_NAME (log2f) (v_f32_t x) return y; } VPCS_ALIAS + +PL_SIG (V, F, 1, log2, 0.01, 11.1) #endif diff --git a/pl/math/v_sinh_3u.c b/pl/math/v_sinh_3u.c index c707364..cec8208 100644 --- a/pl/math/v_sinh_3u.c +++ b/pl/math/v_sinh_3u.c @@ -6,6 +6,7 @@ #include "v_math.h" #include "mathlib.h" +#include "pl_sig.h" #define AbsMask 0x7fffffffffffffff #define Half 0x3fe0000000000000 @@ -42,4 +43,5 @@ VPCS_ATTR v_f64_t V_NAME (sinh) (v_f64_t x) } VPCS_ALIAS +PL_SIG (V, D, 1, sinh, -10.0, 10.0) #endif diff --git a/pl/math/v_sinhf_2u3.c b/pl/math/v_sinhf_2u3.c index 4397bca..299f081 100644 --- a/pl/math/v_sinhf_2u3.c +++ b/pl/math/v_sinhf_2u3.c @@ -6,6 +6,7 @@ #include "v_math.h" #include "mathlib.h" +#include "pl_sig.h" #if V_SUPPORTED @@ -41,4 +42,5 @@ VPCS_ATTR v_f32_t V_NAME (sinhf) (v_f32_t x) } VPCS_ALIAS +PL_SIG (V, F, 1, sinh, -10.0, 10.0) #endif diff --git a/pl/math/v_tanf_3u2.c b/pl/math/v_tanf_3u2.c index 8b3869c..9a11ba4 100644 --- a/pl/math/v_tanf_3u2.c +++ b/pl/math/v_tanf_3u2.c @@ -7,6 +7,7 @@ #include "v_math.h" #include "estrinf.h" +#include "pl_sig.h" #if V_SUPPORTED @@ -114,4 +115,6 @@ v_f32_t V_NAME (tanf) (v_f32_t x) return y; } VPCS_ALIAS + +PL_SIG (V, F, 1, tan, -3.1, 3.1) #endif diff --git a/pl/math/v_tanhf_2u6.c b/pl/math/v_tanhf_2u6.c index 67e4520..1196c4a 100644 --- a/pl/math/v_tanhf_2u6.c +++ b/pl/math/v_tanhf_2u6.c @@ -5,8 +5,9 @@ */ #include "v_math.h" -#include "mathlib.h" #include "estrinf.h" +#include "mathlib.h" +#include "pl_sig.h" #if V_SUPPORTED @@ -88,4 +89,5 @@ VPCS_ATTR v_f32_t V_NAME (tanhf) (v_f32_t x) } VPCS_ALIAS +PL_SIG (V, F, 1, tanh, -10.0, 10.0) #endif -- cgit v1.2.3 From ecb1c6f6ea7872645cb4c26514d5f64815b61a1b Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Thu, 15 Dec 2022 13:27:39 +0000 Subject: pl/math: Move ULP limits to routine source files Introduces a new set of macros and Make rules for mechanically generating a list of ULP limits for each routine, to be consumed by runulp.sh. This removes the need to maintain long lists of thresholds in runulp.sh. 
---
 pl/math/Dir.mk | 33 +++++++++++++++++--
 pl/math/acosh_3u.c | 2 ++
 pl/math/acoshf_2u8.c | 2 ++
 pl/math/asinh_2u5.c | 2 ++
 pl/math/asinhf_3u5.c | 2 ++
 pl/math/atan2_2u5.c | 2 ++
 pl/math/atan2f_3u.c | 2 ++
 pl/math/atanhf_3u1.c | 2 ++
 pl/math/cbrtf_1u5.c | 2 ++
 pl/math/cosh_2u.c | 2 ++
 pl/math/coshf_1u9.c | 2 ++
 pl/math/erfc_4u5.c | 2 ++
 pl/math/erfcf_2u.c | 2 ++
 pl/math/erff_1u5.c | 2 ++
 pl/math/expm1_2u5.c | 2 ++
 pl/math/expm1f_1u6.c | 2 ++
 pl/math/include/pl_test.h | 16 ++++++++++
 pl/math/log10_2u.c | 2 ++
 pl/math/log10f.c | 14 +++------
 pl/math/log1p_2u.c | 2 ++
 pl/math/log1pf_2u1.c | 2 ++
 pl/math/sinh_3u.c | 2 ++
 pl/math/sinhf_2u3.c | 2 ++
 pl/math/sv_atan2_2u5.c | 9 +++---
 pl/math/sv_atan2f_3u.c | 9 +++---
 pl/math/sv_atan_2u5.c | 6 ++--
 pl/math/sv_atanf_2u9.c | 6 ++--
 pl/math/sv_cos_2u5.c | 6 ++--
 pl/math/sv_cosf_2u1.c | 6 ++--
 pl/math/sv_erf_2u5.c | 6 ++--
 pl/math/sv_erfc_4u.c | 6 ++--
 pl/math/sv_erff_1u3.c | 6 ++--
 pl/math/sv_expf_2u.c | 6 ++--
 pl/math/sv_log10_2u5.c | 6 ++--
 pl/math/sv_log10f_3u5.c | 6 ++--
 pl/math/sv_log_2u5.c | 4 ++-
 pl/math/sv_logf_3u4.c | 6 ++--
 pl/math/sv_sin_3u.c | 6 ++--
 pl/math/sv_sinf_1u9.c | 6 ++--
 pl/math/sv_tanf_3u2.c | 6 ++--
 pl/math/tanf_3u3.c | 2 ++
 pl/math/tanhf_2u6.c | 2 ++
 pl/math/test/pl_test.h | 14 +++++++++
 pl/math/test/runulp.sh | 80 ++++------------------------------------
 pl/math/v_asinh_2u5.c | 2 ++
 pl/math/v_asinhf_2u7.c | 2 ++
 pl/math/v_atan2_3u.c | 3 ++
 pl/math/v_atan2f_3u.c | 2 ++
 pl/math/v_atan_2u5.c | 2 ++
 pl/math/v_atanf_3u.c | 2 ++
 pl/math/v_atanhf_3u1.c | 3 +-
 pl/math/v_cbrtf_1u5.c | 2 ++
 pl/math/v_cosh_2u.c | 4 ++-
 pl/math/v_coshf_2u4.c | 2 ++
 pl/math/v_erf_2u.c | 2 ++
 pl/math/v_erfc_4u.c | 2 ++
 pl/math/v_erfcf_1u.c | 2 ++
 pl/math/v_erff_1u5.c | 2 ++
 pl/math/v_expm1_2u5.c | 2 ++
 pl/math/v_expm1f_1u6.c | 2 ++
 pl/math/v_log10_2u5.c | 2 ++
 pl/math/v_log10f_3u5.c | 2 ++
 pl/math/v_log1p_2u5.c | 6 ++--
 pl/math/v_log1pf_2u1.c | 2 ++
 pl/math/v_log2_3u.c | 2 ++
 pl/math/v_log2f_2u6.c | 2 ++
 pl/math/v_sinh_3u.c | 2 ++
 pl/math/v_sinhf_2u3.c | 2 ++
 pl/math/v_tanf_3u2.c | 2 ++
 pl/math/v_tanhf_2u6.c | 2 ++
 pl/math/vn_asinh_2u5.c | 2 +-
 pl/math/vn_asinhf_2u7.c | 2 +-
 pl/math/vn_atan2_3u.c | 2 +-
 pl/math/vn_atan2f_3u.c | 2 +-
 pl/math/vn_atan_2u5.c | 2 +-
 pl/math/vn_atanf_3u.c | 2 +-
 pl/math/vn_atanhf_3u1.c | 2 +-
 pl/math/vn_cbrtf_1u5.c | 2 +-
 pl/math/vn_cosh_2u.c | 2 +-
 pl/math/vn_coshf_2u4.c | 2 +-
 pl/math/vn_erf_2u.c | 2 +-
 pl/math/vn_erfc_4u.c | 2 +-
 pl/math/vn_erfcf_1u.c | 2 +-
 pl/math/vn_erff_1u5.c | 2 +-
 pl/math/vn_expm1_2u5.c | 2 +-
 pl/math/vn_expm1f_1u6.c | 2 +-
 pl/math/vn_log10_2u5.c | 2 +-
 pl/math/vn_log10f_3u5.c | 2 +-
 pl/math/vn_log1p_2u5.c | 2 +-
 pl/math/vn_log1pf_2u1.c | 2 +-
 pl/math/vn_log2_3u.c | 2 +-
 pl/math/vn_log2f_2u6.c | 2 +-
 pl/math/vn_sinh_3u.c | 2 +-
 pl/math/vn_sinhf_2u3.c | 2 +-
 pl/math/vn_tanf_3u2.c | 2 +-
 pl/math/vn_tanhf_2u6.c | 2 +-
 96 files changed, 266 insertions(+), 153 deletions(-)
 create mode 100644 pl/math/include/pl_test.h
 create mode 100644 pl/math/test/pl_test.h

diff --git a/pl/math/Dir.mk b/pl/math/Dir.mk
index b866fa4..0cae7be 100644
--- a/pl/math/Dir.mk
+++ b/pl/math/Dir.mk
@@ -143,8 +143,37 @@ check-pl/math-test: $(math-tools)
 check-pl/math-rtest: $(math-host-tools) $(math-tools)
 	cat $(pl-math-rtests) | build/pl/bin/rtest | $(EMULATOR) build/pl/bin/mathtest $(math-testflags)
 
-check-pl/math-ulp: $(math-tools)
-	WANT_ERRNO=$(WANT_ERRNO) WANT_SVE_MATH=$(WANT_SVE_MATH) ULPFLAGS="$(math-ulpflags)" build/pl/bin/runulp.sh $(EMULATOR)
+ulp-input-dir=$(B)/test/inputs
+
+math-lib-lims = $(patsubst
$(PLM)/%,$(ulp-input-dir)/%.ulp,$(basename $(math-lib-srcs))) +math-lib-aliases = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.alias,$(basename $(math-lib-srcs))) + +$(math-lib-lims): CFLAGS_PL += -I$(PLM) -I$(PLM)/include $(math-cflags) +$(math-lib-aliases): CFLAGS_PL += -I$(PLM) -I$(PLM)/include $(math-cflags) + +$(ulp-input-dir)/%.ulp: $(PLM)/%.c + mkdir -p $(@D) + $(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep "PL_TEST_ULP" || true; } > $@ + +$(ulp-input-dir)/%.alias: $(PLM)/%.c + mkdir -p $(@D) + $(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep "PL_TEST_ALIAS" || true; } | sed "s/_x / /g"> $@ + +ulp-lims := $(ulp-input-dir)/limits +$(ulp-lims): $(math-lib-lims) + cat $^ | sed "s/PL_TEST_ULP //g;s/^ *//g" > $@ + +ulp-aliases := $(ulp-input-dir)/aliases +$(ulp-aliases): $(math-lib-aliases) + cat $^ | sed "s/PL_TEST_ALIAS //g;s/^ *//g" > $@ + +check-pl/math-ulp: $(math-tools) $(ulp-lims) $(ulp-aliases) + WANT_ERRNO=$(WANT_ERRNO) \ + WANT_SVE_MATH=$(WANT_SVE_MATH) \ + ULPFLAGS="$(math-ulpflags)" \ + LIMITS=../../../$(ulp-lims) \ + ALIASES=../../../$(ulp-aliases) \ + build/pl/bin/runulp.sh $(EMULATOR) check-pl/math: check-pl/math-test check-pl/math-rtest check-pl/math-ulp diff --git a/pl/math/acosh_3u.c b/pl/math/acosh_3u.c index e0014d6..f135b5d 100644 --- a/pl/math/acosh_3u.c +++ b/pl/math/acosh_3u.c @@ -6,6 +6,7 @@ #include "math_config.h" #include "pl_sig.h" +#include "pl_test.h" #define Ln2 (0x1.62e42fefa39efp-1) #define MinusZero (0x8000000000000000) @@ -56,3 +57,4 @@ acosh (double x) } PL_SIG (S, D, 1, acosh, 1.0, 10.0) +PL_TEST_ULP (acosh, 2.19) diff --git a/pl/math/acoshf_2u8.c b/pl/math/acoshf_2u8.c index 0b1e9c7..0f9824d 100644 --- a/pl/math/acoshf_2u8.c +++ b/pl/math/acoshf_2u8.c @@ -6,6 +6,7 @@ #include "math_config.h" #include "pl_sig.h" +#include "pl_test.h" #define Ln2 (0x1.62e4p-1f) #define MinusZero 0x80000000 @@ -53,3 +54,4 @@ acoshf (float x) } PL_SIG (S, F, 1, acosh, 1.0, 10.0) +PL_TEST_ULP (acoshf, 2.30) diff --git a/pl/math/asinh_2u5.c b/pl/math/asinh_2u5.c index bbe6bee..44435be 100644 --- a/pl/math/asinh_2u5.c +++ b/pl/math/asinh_2u5.c @@ -7,6 +7,7 @@ #include "estrin.h" #include "math_config.h" #include "pl_sig.h" +#include "pl_test.h" #define AbsMask 0x7fffffffffffffff #define ExpM26 0x3e50000000000000 /* asuint64(0x1.0p-26). */ @@ -75,3 +76,4 @@ asinh (double x) } PL_SIG (S, D, 1, asinh, -10.0, 10.0) +PL_TEST_ULP (asinh, 1.54) diff --git a/pl/math/asinhf_3u5.c b/pl/math/asinhf_3u5.c index ec3dd9b..36c332d 100644 --- a/pl/math/asinhf_3u5.c +++ b/pl/math/asinhf_3u5.c @@ -7,6 +7,7 @@ #include "estrinf.h" #include "math_config.h" #include "pl_sig.h" +#include "pl_test.h" #define AbsMask (0x7fffffff) #define SqrtFltMax (0x1.749e96p+10f) @@ -69,3 +70,4 @@ asinhf (float x) } PL_SIG (S, F, 1, asinh, -10.0, 10.0) +PL_TEST_ULP (asinhf, 2.9) diff --git a/pl/math/atan2_2u5.c b/pl/math/atan2_2u5.c index c1cf7a3..fb5ced4 100644 --- a/pl/math/atan2_2u5.c +++ b/pl/math/atan2_2u5.c @@ -10,6 +10,7 @@ #include "atan_common.h" #include "math_config.h" #include "pl_sig.h" +#include "pl_test.h" #define Pi (0x1.921fb54442d18p+1) #define PiOver2 (0x1.921fb54442d18p+0) @@ -150,3 +151,4 @@ atan2 (double y, double x) /* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. 
*/ PL_SIG (S, D, 2, atan2) +PL_TEST_ULP (atan2, 1.78) diff --git a/pl/math/atan2f_3u.c b/pl/math/atan2f_3u.c index 7780be6..0a3e975 100644 --- a/pl/math/atan2f_3u.c +++ b/pl/math/atan2f_3u.c @@ -10,6 +10,7 @@ #include "atanf_common.h" #include "math_config.h" #include "pl_sig.h" +#include "pl_test.h" #define Pi (0x1.921fb6p+1f) #define PiOver2 (0x1.921fb6p+0f) @@ -158,3 +159,4 @@ atan2f (float y, float x) /* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */ PL_SIG (S, F, 2, atan2) +PL_TEST_ULP (atan2f, 2.4) diff --git a/pl/math/atanhf_3u1.c b/pl/math/atanhf_3u1.c index db663bf..47b9486 100644 --- a/pl/math/atanhf_3u1.c +++ b/pl/math/atanhf_3u1.c @@ -7,6 +7,7 @@ #include "math_config.h" #include "mathlib.h" #include "pl_sig.h" +#include "pl_test.h" #define AbsMask 0x7fffffff #define Half 0x3f000000 @@ -77,3 +78,4 @@ atanhf (float x) } PL_SIG (S, F, 1, atanh, -1.0, 1.0) +PL_TEST_ULP (atanhf, 2.59) diff --git a/pl/math/cbrtf_1u5.c b/pl/math/cbrtf_1u5.c index 74e7a49..c6d1de9 100644 --- a/pl/math/cbrtf_1u5.c +++ b/pl/math/cbrtf_1u5.c @@ -8,6 +8,7 @@ #include "estrinf.h" #include "math_config.h" #include "pl_sig.h" +#include "pl_test.h" #define AbsMask 0x7fffffff #define SignMask 0x80000000 @@ -61,3 +62,4 @@ cbrtf (float x) } PL_SIG (S, F, 1, cbrt, -10.0, 10.0) +PL_TEST_ULP (cbrtf, 1.03) diff --git a/pl/math/cosh_2u.c b/pl/math/cosh_2u.c index 6be189d..9e137ff 100644 --- a/pl/math/cosh_2u.c +++ b/pl/math/cosh_2u.c @@ -7,6 +7,7 @@ #include "math_config.h" #include "pl_sig.h" +#include "pl_test.h" #define AbsMask 0x7fffffffffffffff #define SpecialBound \ @@ -56,3 +57,4 @@ cosh (double x) } PL_SIG (S, D, 1, cosh, -10.0, 10.0) +PL_TEST_ULP (cosh, 1.43) diff --git a/pl/math/coshf_1u9.c b/pl/math/coshf_1u9.c index b9cbe54..0e7b30f 100644 --- a/pl/math/coshf_1u9.c +++ b/pl/math/coshf_1u9.c @@ -7,6 +7,7 @@ #include "math_config.h" #include "pl_sig.h" +#include "pl_test.h" #define AbsMask 0x7fffffff #define TinyBound 0x20000000 /* 0x1p-63: Round to 1 below this. 
*/ @@ -61,3 +62,4 @@ coshf (float x) } PL_SIG (S, F, 1, cosh, -10.0, 10.0) +PL_TEST_ULP (coshf, 1.89) diff --git a/pl/math/erfc_4u5.c b/pl/math/erfc_4u5.c index b418421..8394e48 100644 --- a/pl/math/erfc_4u5.c +++ b/pl/math/erfc_4u5.c @@ -8,6 +8,7 @@ #include "math_config.h" #include "pairwise_horner.h" #include "pl_sig.h" +#include "pl_test.h" #define AbsMask (0x7fffffffffffffff) @@ -145,3 +146,4 @@ erfc (double x) } PL_SIG (S, D, 1, erfc, -6.0, 28.0) +PL_TEST_ULP (erfc, 3.56) diff --git a/pl/math/erfcf_2u.c b/pl/math/erfcf_2u.c index 32a96dc..f76a11a 100644 --- a/pl/math/erfcf_2u.c +++ b/pl/math/erfcf_2u.c @@ -8,6 +8,7 @@ #include "erfcf.h" #include "math_config.h" #include "pl_sig.h" +#include "pl_test.h" #define P(i) __erfcf_poly_data.poly[i] @@ -123,3 +124,4 @@ erfcf (float x) } PL_SIG (S, F, 1, erfc, -4.0, 10.0) +PL_TEST_ULP (erfcf, 1.5) diff --git a/pl/math/erff_1u5.c b/pl/math/erff_1u5.c index afa5880..fa1e55f 100644 --- a/pl/math/erff_1u5.c +++ b/pl/math/erff_1u5.c @@ -8,6 +8,7 @@ #include "hornerf.h" #include "math_config.h" #include "pl_sig.h" +#include "pl_test.h" #define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f #define A __erff_data.erff_poly_A @@ -98,3 +99,4 @@ erff (float x) } PL_SIG (S, F, 1, erf, -4.0, 4.0) +PL_TEST_ULP (erff, 0.6) diff --git a/pl/math/expm1_2u5.c b/pl/math/expm1_2u5.c index 55ddbd1..cfde806 100644 --- a/pl/math/expm1_2u5.c +++ b/pl/math/expm1_2u5.c @@ -8,6 +8,7 @@ #include "estrin.h" #include "math_config.h" #include "pl_sig.h" +#include "pl_test.h" #define InvLn2 0x1.71547652b82fep0 #define Ln2hi 0x1.62e42fefa39efp-1 @@ -76,3 +77,4 @@ expm1 (double x) } PL_SIG (S, D, 1, expm1, -9.9, 9.9) +PL_TEST_ULP (expm1, 1.68) diff --git a/pl/math/expm1f_1u6.c b/pl/math/expm1f_1u6.c index 9c0c178..82dc28d 100644 --- a/pl/math/expm1f_1u6.c +++ b/pl/math/expm1f_1u6.c @@ -8,6 +8,7 @@ #include "hornerf.h" #include "math_config.h" #include "pl_sig.h" +#include "pl_test.h" #define Shift (0x1.8p23f) #define InvLn2 (0x1.715476p+0f) @@ -72,3 +73,4 @@ expm1f (float x) } PL_SIG (S, F, 1, expm1, -9.9, 9.9) +PL_TEST_ULP (expm1f, 1.02) diff --git a/pl/math/include/pl_test.h b/pl/math/include/pl_test.h new file mode 100644 index 0000000..1ab3814 --- /dev/null +++ b/pl/math/include/pl_test.h @@ -0,0 +1,16 @@ +/* + * PL macros to aid testing. This version of this file is used for building the + * routine, not the tests. Separate definitions are found in test/pl_test.h + * which emit test parameters. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception. + */ + +/* Emit max ULP threshold - silenced for building the routine. */ +#define PL_TEST_ULP(f, l) + +/* Emit alias. The PL_TEST_ALIAS declaration is piggy-backed on top of + strong_alias. Use PL_ALIAS instead of strong_alias to make sure the alias is + also added to the test suite. */ +#define PL_ALIAS(a, b) strong_alias (a, b) diff --git a/pl/math/log10_2u.c b/pl/math/log10_2u.c index b05e17b..1827bb9 100644 --- a/pl/math/log10_2u.c +++ b/pl/math/log10_2u.c @@ -7,6 +7,7 @@ #include "math_config.h" #include "pl_sig.h" +#include "pl_test.h" /* Polynomial coefficients and lookup tables. 
*/ #define T __log10_data.tab @@ -143,3 +144,4 @@ log10l (long double x) // clang-format on PL_SIG (S, D, 1, log10, 0.01, 11.1) +PL_TEST_ULP (log10, 1.11) diff --git a/pl/math/log10f.c b/pl/math/log10f.c index ea67b4b..84db420 100644 --- a/pl/math/log10f.c +++ b/pl/math/log10f.c @@ -5,11 +5,13 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "math_config.h" -#include "pl_sig.h" #include #include +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + /* Data associated to logf: LOGF_TABLE_BITS = 4 @@ -86,11 +88,5 @@ log10f (float x) return eval_as_float (y); } -// clang-format off -#if USE_GLIBC_ABI -strong_alias (log10f, __log10f_finite) -hidden_alias (log10f, __ieee754_log10f) -#endif - PL_SIG (S, F, 1, log10, 0.01, 11.1) - // clang-format on +PL_TEST_ULP (log10f, 0.30) diff --git a/pl/math/log1p_2u.c b/pl/math/log1p_2u.c index 20b4811..5a6f798 100644 --- a/pl/math/log1p_2u.c +++ b/pl/math/log1p_2u.c @@ -7,6 +7,7 @@ #include "estrin.h" #include "math_config.h" #include "pl_sig.h" +#include "pl_test.h" #define Ln2Hi 0x1.62e42fefa3800p-1 #define Ln2Lo 0x1.ef35793c76730p-45 @@ -123,3 +124,4 @@ log1p (double x) } PL_SIG (S, D, 1, log1p, -0.9, 10.0) +PL_TEST_ULP (log1p, 1.26) diff --git a/pl/math/log1pf_2u1.c b/pl/math/log1pf_2u1.c index 97dd1c4..f791105 100644 --- a/pl/math/log1pf_2u1.c +++ b/pl/math/log1pf_2u1.c @@ -7,6 +7,7 @@ #include "hornerf.h" #include "math_config.h" #include "pl_sig.h" +#include "pl_test.h" #define Ln2 (0x1.62e43p-1f) #define SignMask (0x80000000) @@ -152,3 +153,4 @@ log1pf (float x) } PL_SIG (S, F, 1, log1p, -0.9, 10.0) +PL_TEST_ULP (log1pf, 1.52) diff --git a/pl/math/sinh_3u.c b/pl/math/sinh_3u.c index f56b8d0..86f00a1 100644 --- a/pl/math/sinh_3u.c +++ b/pl/math/sinh_3u.c @@ -7,6 +7,7 @@ #include "math_config.h" #include "pl_sig.h" +#include "pl_test.h" #define AbsMask 0x7fffffffffffffff #define Half 0x3fe0000000000000 @@ -56,3 +57,4 @@ sinh (double x) } PL_SIG (S, D, 1, sinh, -10.0, 10.0) +PL_TEST_ULP (sinh, 2.08) diff --git a/pl/math/sinhf_2u3.c b/pl/math/sinhf_2u3.c index cb5eb51..15786d9 100644 --- a/pl/math/sinhf_2u3.c +++ b/pl/math/sinhf_2u3.c @@ -7,6 +7,7 @@ #include "math_config.h" #include "pl_sig.h" +#include "pl_test.h" #define AbsMask 0x7fffffff #define Half 0x3f000000 @@ -66,3 +67,4 @@ sinhf (float x) } PL_SIG (S, F, 1, sinh, -10.0, 10.0) +PL_TEST_ULP (sinhf, 1.76) diff --git a/pl/math/sv_atan2_2u5.c b/pl/math/sv_atan2_2u5.c index c047595..4ab2fea 100644 --- a/pl/math/sv_atan2_2u5.c +++ b/pl/math/sv_atan2_2u5.c @@ -7,6 +7,7 @@ #include "sv_math.h" #include "pl_sig.h" +#include "pl_test.h" #if SV_SUPPORTED @@ -79,9 +80,9 @@ __sv_atan2_x (sv_f64_t y, sv_f64_t x, const svbool_t pg) return ret; } -strong_alias (__sv_atan2_x, _ZGVsMxvv_atan2) +PL_ALIAS (__sv_atan2_x, _ZGVsMxvv_atan2) - /* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. - */ - PL_SIG (SV, D, 2, atan2) +/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */ +PL_SIG (SV, D, 2, atan2) +PL_TEST_ULP (__sv_atan2, 1.78) #endif diff --git a/pl/math/sv_atan2f_3u.c b/pl/math/sv_atan2f_3u.c index 0ce7071..90656f0 100644 --- a/pl/math/sv_atan2f_3u.c +++ b/pl/math/sv_atan2f_3u.c @@ -7,6 +7,7 @@ #include "sv_math.h" #include "pl_sig.h" +#include "pl_test.h" #if SV_SUPPORTED @@ -80,9 +81,9 @@ __sv_atan2f_x (sv_f32_t y, sv_f32_t x, const svbool_t pg) return ret; } -strong_alias (__sv_atan2f_x, _ZGVsMxvv_atan2f) +PL_ALIAS (__sv_atan2f_x, _ZGVsMxvv_atan2f) - /* Arity of 2 means no mathbench entry emitted. 
See test/mathbench_funcs.h. - */ - PL_SIG (SV, F, 2, atan2) +/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */ +PL_SIG (SV, F, 2, atan2) +PL_TEST_ULP (__sv_atan2f, 2.45) #endif diff --git a/pl/math/sv_atan_2u5.c b/pl/math/sv_atan_2u5.c index 4f52b43..93b39b1 100644 --- a/pl/math/sv_atan_2u5.c +++ b/pl/math/sv_atan_2u5.c @@ -7,6 +7,7 @@ #include "sv_math.h" #include "pl_sig.h" +#include "pl_test.h" #if SV_SUPPORTED @@ -49,7 +50,8 @@ __sv_atan_x (sv_f64_t x, const svbool_t pg) return y; } -strong_alias (__sv_atan_x, _ZGVsMxv_atan) +PL_ALIAS (__sv_atan_x, _ZGVsMxv_atan) - PL_SIG (SV, D, 1, atan, -3.1, 3.1) +PL_SIG (SV, D, 1, atan, -3.1, 3.1) +PL_TEST_ULP (__sv_atan, 1.78) #endif diff --git a/pl/math/sv_atanf_2u9.c b/pl/math/sv_atanf_2u9.c index db15830..386c28e 100644 --- a/pl/math/sv_atanf_2u9.c +++ b/pl/math/sv_atanf_2u9.c @@ -7,6 +7,7 @@ #include "sv_math.h" #include "pl_sig.h" +#include "pl_test.h" #if SV_SUPPORTED @@ -46,7 +47,8 @@ __sv_atanf_x (sv_f32_t x, const svbool_t pg) return sv_as_f32_u32 (sveor_u32_x (pg, sv_as_u32_f32 (y), sign)); } -strong_alias (__sv_atanf_x, _ZGVsMxv_atanf) +PL_ALIAS (__sv_atanf_x, _ZGVsMxv_atanf) - PL_SIG (SV, F, 1, atan, -3.1, 3.1) +PL_SIG (SV, F, 1, atan, -3.1, 3.1) +PL_TEST_ULP (__sv_atanf, 2.9) #endif diff --git a/pl/math/sv_cos_2u5.c b/pl/math/sv_cos_2u5.c index a19be9b..146ca22 100644 --- a/pl/math/sv_cos_2u5.c +++ b/pl/math/sv_cos_2u5.c @@ -7,6 +7,7 @@ #include "sv_math.h" #include "pl_sig.h" +#include "pl_test.h" #if SV_SUPPORTED @@ -74,7 +75,8 @@ __sv_cos_x (sv_f64_t x, const svbool_t pg) return y; } -strong_alias (__sv_cos_x, _ZGVsMxv_cos) +PL_ALIAS (__sv_cos_x, _ZGVsMxv_cos) - PL_SIG (SV, D, 1, cos, -3.1, 3.1) +PL_SIG (SV, D, 1, cos, -3.1, 3.1) +PL_TEST_ULP (__sv_cos, 1.61) #endif diff --git a/pl/math/sv_cosf_2u1.c b/pl/math/sv_cosf_2u1.c index 3bc3d71..fdc4b1e 100644 --- a/pl/math/sv_cosf_2u1.c +++ b/pl/math/sv_cosf_2u1.c @@ -7,6 +7,7 @@ #include "sv_math.h" #include "pl_sig.h" +#include "pl_test.h" #if SV_SUPPORTED @@ -72,7 +73,8 @@ __sv_cosf_x (sv_f32_t x, const svbool_t pg) return y; } -strong_alias (__sv_cosf_x, _ZGVsMxv_cosf) +PL_ALIAS (__sv_cosf_x, _ZGVsMxv_cosf) - PL_SIG (SV, F, 1, cos, -3.1, 3.1) +PL_SIG (SV, F, 1, cos, -3.1, 3.1) +PL_TEST_ULP (__sv_cosf, 1.57) #endif diff --git a/pl/math/sv_erf_2u5.c b/pl/math/sv_erf_2u5.c index eac500c..f91aa41 100644 --- a/pl/math/sv_erf_2u5.c +++ b/pl/math/sv_erf_2u5.c @@ -7,6 +7,7 @@ #include "sv_math.h" #include "pl_sig.h" +#include "pl_test.h" #if SV_SUPPORTED @@ -87,7 +88,8 @@ __sv_erf_x (sv_f64_t x, const svbool_t pg) return y; } -strong_alias (__sv_erf_x, _ZGVsMxv_erf) +PL_ALIAS (__sv_erf_x, _ZGVsMxv_erf) - PL_SIG (SV, D, 1, erf, -4.0, 4.0) +PL_SIG (SV, D, 1, erf, -4.0, 4.0) +PL_TEST_ULP (__sv_erf, 1.97) #endif diff --git a/pl/math/sv_erfc_4u.c b/pl/math/sv_erfc_4u.c index 41fb654..d426fa9 100644 --- a/pl/math/sv_erfc_4u.c +++ b/pl/math/sv_erfc_4u.c @@ -7,6 +7,7 @@ #include "sv_math.h" #include "pl_sig.h" +#include "pl_test.h" #if SV_SUPPORTED #include "sv_exp_tail.h" @@ -132,7 +133,8 @@ __sv_erfc_x (sv_f64_t x, const svbool_t pg) return y; } -strong_alias (__sv_erfc_x, _ZGVsMxv_erfc) +PL_ALIAS (__sv_erfc_x, _ZGVsMxv_erfc) - PL_SIG (SV, D, 1, erfc, -4.0, 10.0) +PL_SIG (SV, D, 1, erfc, -4.0, 10.0) +PL_TEST_ULP (__sv_erfc, 3.15) #endif diff --git a/pl/math/sv_erff_1u3.c b/pl/math/sv_erff_1u3.c index 02d7625..9589fb3 100644 --- a/pl/math/sv_erff_1u3.c +++ b/pl/math/sv_erff_1u3.c @@ -7,6 +7,7 @@ #include "sv_math.h" #include "pl_sig.h" +#include "pl_test.h" #if 
SV_SUPPORTED @@ -88,7 +89,8 @@ __sv_erff_x (sv_f32_t x, const svbool_t pg) return y; } -strong_alias (__sv_erff_x, _ZGVsMxv_erff) +PL_ALIAS (__sv_erff_x, _ZGVsMxv_erff) - PL_SIG (SV, F, 1, erf, -4.0, 4.0) +PL_SIG (SV, F, 1, erf, -4.0, 4.0) +PL_TEST_ULP (__sv_erff, 0.76) #endif diff --git a/pl/math/sv_expf_2u.c b/pl/math/sv_expf_2u.c index d301392..f97a762 100644 --- a/pl/math/sv_expf_2u.c +++ b/pl/math/sv_expf_2u.c @@ -7,6 +7,7 @@ #include "sv_math.h" #include "pl_sig.h" +#include "pl_test.h" #if SV_SUPPORTED @@ -140,7 +141,8 @@ __sv_expf_x (sv_f32_t x, const svbool_t pg) return sv_fma_f32_x (pg, poly, scale, scale); } -strong_alias (__sv_expf_x, _ZGVsMxv_expf) +PL_ALIAS (__sv_expf_x, _ZGVsMxv_expf) - PL_SIG (SV, F, 1, exp, -9.9, 9.9) +PL_SIG (SV, F, 1, exp, -9.9, 9.9) +PL_TEST_ULP (__sv_expf, 1.46) #endif // SV_SUPPORTED diff --git a/pl/math/sv_log10_2u5.c b/pl/math/sv_log10_2u5.c index d6ed49a..a9b002b 100644 --- a/pl/math/sv_log10_2u5.c +++ b/pl/math/sv_log10_2u5.c @@ -8,6 +8,7 @@ #include "sv_math.h" #include "math_config.h" #include "pl_sig.h" +#include "pl_test.h" #if SV_SUPPORTED @@ -75,7 +76,8 @@ __sv_log10_x (sv_f64_t x, const svbool_t pg) return y; } -strong_alias (__sv_log10_x, _ZGVsMxv_log10) +PL_ALIAS (__sv_log10_x, _ZGVsMxv_log10) - PL_SIG (SV, D, 1, log10, 0.01, 11.1) +PL_SIG (SV, D, 1, log10, 0.01, 11.1) +PL_TEST_ULP (__sv_log10, 1.97) #endif diff --git a/pl/math/sv_log10f_3u5.c b/pl/math/sv_log10f_3u5.c index c1ff196..b29ee80 100644 --- a/pl/math/sv_log10f_3u5.c +++ b/pl/math/sv_log10f_3u5.c @@ -7,6 +7,7 @@ #include "sv_math.h" #include "pl_sig.h" +#include "pl_test.h" #if SV_SUPPORTED @@ -74,7 +75,8 @@ __sv_log10f_x (sv_f32_t x, const svbool_t pg) return y; } -strong_alias (__sv_log10f_x, _ZGVsMxv_log10f) +PL_ALIAS (__sv_log10f_x, _ZGVsMxv_log10f) - PL_SIG (SV, F, 1, log10, 0.01, 11.1) +PL_SIG (SV, F, 1, log10, 0.01, 11.1) +PL_TEST_ULP (__sv_log10f, 2.82) #endif diff --git a/pl/math/sv_log_2u5.c b/pl/math/sv_log_2u5.c index a50c3d6..8477739 100644 --- a/pl/math/sv_log_2u5.c +++ b/pl/math/sv_log_2u5.c @@ -7,6 +7,7 @@ #include "sv_math.h" #include "pl_sig.h" +#include "pl_test.h" #if SV_SUPPORTED @@ -71,7 +72,8 @@ __sv_log_x (sv_f64_t x, const svbool_t pg) return y; } -strong_alias (__sv_log_x, _ZGVsMxv_log); +PL_ALIAS (__sv_log_x, _ZGVsMxv_log) PL_SIG (SV, D, 1, log, 0.01, 11.1) +PL_TEST_ULP (__sv_log, 1.68) #endif // SV_SUPPORTED diff --git a/pl/math/sv_logf_3u4.c b/pl/math/sv_logf_3u4.c index e9147e4..8fea406 100644 --- a/pl/math/sv_logf_3u4.c +++ b/pl/math/sv_logf_3u4.c @@ -7,6 +7,7 @@ #include "sv_math.h" #include "pl_sig.h" +#include "pl_test.h" #if SV_SUPPORTED @@ -63,7 +64,8 @@ __sv_logf_x (sv_f32_t x, const svbool_t pg) return y; } -strong_alias (__sv_logf_x, _ZGVsMxv_logf) +PL_ALIAS (__sv_logf_x, _ZGVsMxv_logf) - PL_SIG (SV, F, 1, log, 0.01, 11.1) +PL_SIG (SV, F, 1, log, 0.01, 11.1) +PL_TEST_ULP (__sv_logf, 2.85) #endif // SV_SUPPORTED diff --git a/pl/math/sv_sin_3u.c b/pl/math/sv_sin_3u.c index 4d879e0..5637ebe 100644 --- a/pl/math/sv_sin_3u.c +++ b/pl/math/sv_sin_3u.c @@ -7,6 +7,7 @@ #include "sv_math.h" #include "pl_sig.h" +#include "pl_test.h" #if SV_SUPPORTED @@ -79,7 +80,8 @@ __sv_sin_x (sv_f64_t x, const svbool_t pg) return y; } -strong_alias (__sv_sin_x, _ZGVsMxv_sin) +PL_ALIAS (__sv_sin_x, _ZGVsMxv_sin) - PL_SIG (SV, D, 1, sin, -3.1, 3.1) +PL_SIG (SV, D, 1, sin, -3.1, 3.1) +PL_TEST_ULP (__sv_sin, 2.03) #endif diff --git a/pl/math/sv_sinf_1u9.c b/pl/math/sv_sinf_1u9.c index 5634a87..ca26e92 100644 --- a/pl/math/sv_sinf_1u9.c +++ b/pl/math/sv_sinf_1u9.c @@ 
-7,6 +7,7 @@ #include "sv_math.h" #include "pl_sig.h" +#include "pl_test.h" #if SV_SUPPORTED @@ -74,7 +75,8 @@ __sv_sinf_x (sv_f32_t x, const svbool_t pg) return y; } -strong_alias (__sv_sinf_x, _ZGVsMxv_sinf) +PL_ALIAS (__sv_sinf_x, _ZGVsMxv_sinf) - PL_SIG (SV, F, 1, sin, -3.1, 3.1) +PL_SIG (SV, F, 1, sin, -3.1, 3.1) +PL_TEST_ULP (__sv_sinf, 1.40) #endif diff --git a/pl/math/sv_tanf_3u2.c b/pl/math/sv_tanf_3u2.c index 2f28239..8629b05 100644 --- a/pl/math/sv_tanf_3u2.c +++ b/pl/math/sv_tanf_3u2.c @@ -7,6 +7,7 @@ #include "sv_math.h" #include "pl_sig.h" +#include "pl_test.h" #if SV_SUPPORTED @@ -98,7 +99,8 @@ __sv_tanf_x (sv_f32_t x, const svbool_t pg) return y; } -strong_alias (__sv_tanf_x, _ZGVsMxv_tanf) +PL_ALIAS (__sv_tanf_x, _ZGVsMxv_tanf) - PL_SIG (SV, F, 1, tan, -3.1, 3.1) +PL_SIG (SV, F, 1, tan, -3.1, 3.1) +PL_TEST_ULP (__sv_tanf, 2.7) #endif diff --git a/pl/math/tanf_3u3.c b/pl/math/tanf_3u3.c index 3e4ad38..e8784d8 100644 --- a/pl/math/tanf_3u3.c +++ b/pl/math/tanf_3u3.c @@ -6,6 +6,7 @@ */ #include "math_config.h" #include "pl_sig.h" +#include "pl_test.h" /* Useful constants. */ #define NegPio2_1 (-0x1.921fb6p+0f) @@ -193,3 +194,4 @@ tanf (float x) } PL_SIG (S, F, 1, tan, -3.1, 3.1) +PL_TEST_ULP (tanf, 2.80) diff --git a/pl/math/tanhf_2u6.c b/pl/math/tanhf_2u6.c index 90f561f..e6cbbd0 100644 --- a/pl/math/tanhf_2u6.c +++ b/pl/math/tanhf_2u6.c @@ -6,6 +6,7 @@ */ #include "math_config.h" #include "pl_sig.h" +#include "pl_test.h" #define BoringBound \ 0x41102cb3 /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for \ @@ -81,3 +82,4 @@ tanhf (float x) } PL_SIG (S, F, 1, tanh, -10.0, 10.0) +PL_TEST_ULP (tanhf, 2.09) diff --git a/pl/math/test/pl_test.h b/pl/math/test/pl_test.h new file mode 100644 index 0000000..d4901b1 --- /dev/null +++ b/pl/math/test/pl_test.h @@ -0,0 +1,14 @@ +/* + * PL macros for emitting various details about routines for consumption by + * runulp.sh. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception. + */ + +/* Emit the max ULP threshold, l, for routine f. */ +#define PL_TEST_ULP(f, l) PL_TEST_ULP f l + +/* Emit aliases to allow test params to be mapped from aliases back to their + aliasees. 
*/ +#define PL_ALIAS(a, b) PL_TEST_ALIAS a b diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index a1410b4..3e31c00 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -21,14 +21,16 @@ FAIL=0 PASS=0 t() { - $emu ./ulp -e $L $flags "$@" && PASS=$((PASS+1)) || FAIL=$((FAIL+1)) + key=$(cat $ALIASES | { grep " $1$" || echo $1; } | awk '{print $1}') + L=$(cat $LIMITS | grep "^$key " | awk '{print $2}') + [[ $L =~ ^[0-9]+\.[0-9]+$ ]] + $emu ./ulp -e $L $flags ${5:-} $1 $2 $3 $4 && PASS=$((PASS+1)) || FAIL=$((FAIL+1)) } check() { $emu ./ulp -f -q "$@" #>/dev/null } -L=0.6 t erff 0 0xffff0000 10000 t erff 0x1p-127 0x1p-26 40000 t erff -0x1p-127 -0x1p-26 40000 @@ -36,19 +38,16 @@ t erff 0x1p-26 0x1p3 40000 t erff -0x1p-26 -0x1p3 40000 t erff 0 inf 40000 -L=0.30 t log10f 0 0xffff0000 10000 t log10f 0x1p-127 0x1p-26 50000 t log10f 0x1p-26 0x1p3 50000 t log10f 0x1p-4 0x1p4 50000 t log10f 0 inf 50000 -L=1.11 t log10 0 0xffff000000000000 10000 t log10 0x1p-4 0x1p4 40000 t log10 0 inf 40000 -L=3.56 t erfc 0 0xffff0000 10000 t erfc 0x1p-1022 0x1p-26 40000 t erfc -0x1p-1022 -0x1p-26 40000 @@ -56,7 +55,6 @@ t erfc 0x1p-26 0x1p5 40000 t erfc -0x1p-26 -0x1p3 40000 t erfc 0 inf 40000 -L=1.5 t erfcf 0 0xffff0000 10000 t erfcf 0x1p-127 0x1p-26 40000 t erfcf -0x1p-127 -0x1p-26 40000 @@ -64,31 +62,27 @@ t erfcf 0x1p-26 0x1p5 40000 t erfcf -0x1p-26 -0x1p3 40000 t erfcf 0 inf 40000 -L=1.78 t atan2 -10.0 10.0 50000 t atan2 -1.0 1.0 40000 t atan2 0.0 1.0 40000 t atan2 1.0 100.0 40000 t atan2 1e6 1e32 40000 -t atan2 0x1p-1022 0x1p-1000 x 0 0x1p-1022 40000 +check atan2 0x1p-1022 0x1p-1000 x 0 0x1p-1022 40000 # Regression-test for correct NaN handling check atan2 0x1.7887a0a717aefp+1017 0x1.7887a0a717aefp+1017 x -nan -nan check atan2 nan nan x -nan -nan -L=2.4 t atan2f -10.0 10.0 50000 t atan2f -1.0 1.0 40000 t atan2f 0.0 1.0 40000 t atan2f 1.0 100.0 40000 t atan2f 1e6 1e32 40000 -L=2.9 t asinhf 0 0x1p-12 5000 t asinhf 0x1p-12 1.0 50000 t asinhf 1.0 0x1p11 50000 t asinhf 0x1p11 0x1p127 20000 -L=1.54 t asinh -0x1p-26 0x1p-26 50000 t asinh 0x1p-26 1.0 40000 t asinh -0x1p-26 -1.0 10000 @@ -97,7 +91,6 @@ t asinh -1.0 -100.0 10000 t asinh 100.0 inf 50000 t asinh -100.0 -inf 10000 -L=1.26 t log1p -10.0 10.0 10000 t log1p 0.0 0x1p-23 50000 t log1p 0x1p-23 0.001 50000 @@ -107,7 +100,6 @@ t log1p -0x1p-23 -0.001 50000 t log1p -0.001 -1.0 50000 t log1p -1.0 inf 5000 -L=1.52 t log1pf -10.0 10.0 10000 t log1pf 0.0 0x1p-23 50000 t log1pf 0x1p-23 0.001 50000 @@ -117,7 +109,6 @@ t log1pf -0x1p-23 -0.001 50000 t log1pf -0.001 -1.0 50000 t log1pf -1.0 inf 5000 -L=2.80 t tanf 0 0xffff0000 10000 t tanf 0x1p-127 0x1p-14 50000 t tanf -0x1p-127 -0x1p-14 50000 @@ -132,27 +123,23 @@ t tanf -0x1p17 -0x1p54 50000 t tanf 0x1p54 inf 50000 t tanf -0x1p54 -inf 50000 -L=2.30 t acoshf 0 1 100 t acoshf 1 2 10000 t acoshf 2 0x1p64 100000 t acoshf 0x1p64 inf 100000 t acoshf -0 -inf 10000 -L=2.19 t acosh 0 1 10000 t acosh 1 2 100000 t acosh 2 0x1p511 100000 t acosh 0x1p511 inf 100000 t acosh -0 -inf 10000 -L=1.02 t expm1f 0 0x1p-23 1000 t expm1f -0 -0x1p-23 1000 t expm1f 0x1p-23 0x1.644716p6 100000 t expm1f -0x1p-23 -0x1.9bbabcp+6 100000 -L=1.76 t sinhf 0 0x1.62e43p+6 100000 t sinhf -0 -0x1.62e43p+6 100000 t sinhf 0x1.62e43p+6 0x1.65a9fap+6 100 @@ -160,7 +147,6 @@ t sinhf -0x1.62e43p+6 -0x1.65a9fap+6 100 t sinhf 0x1.65a9fap+6 inf 100 t sinhf -0x1.65a9fap+6 -inf 100 -L=1.89 t coshf 0 0x1p-63 100 t coshf 0 0x1.5a92d8p+6 80000 t coshf 0x1.5a92d8p+6 inf 2000 @@ -168,7 +154,6 @@ t coshf -0 -0x1p-63 100 t coshf -0 -0x1.5a92d8p+6 
80000 t coshf -0x1.5a92d8p+6 -inf 2000 -L=1.68 t expm1 0 0x1p-51 1000 t expm1 -0 -0x1p-51 1000 t expm1 0x1p-51 0x1.63108c75a1937p+9 100000 @@ -176,7 +161,6 @@ t expm1 -0x1p-51 -0x1.740bf7c0d927dp+9 100000 t expm1 0x1.63108c75a1937p+9 inf 100 t expm1 -0x1.740bf7c0d927dp+9 -inf 100 -L=2.08 t sinh 0 0x1p-51 100 t sinh -0 -0x1p-51 100 t sinh 0x1p-51 0x1.62e42fefa39fp+9 100000 @@ -184,7 +168,6 @@ t sinh -0x1p-51 -0x1.62e42fefa39fp+9 100000 t sinh 0x1.62e42fefa39fp+9 inf 1000 t sinh -0x1.62e42fefa39fp+9 -inf 1000 -L=1.43 t cosh 0 0x1.61da04cbafe44p+9 100000 t cosh -0 -0x1.61da04cbafe44p+9 100000 t cosh 0x1.61da04cbafe44p+9 0x1p10 1000 @@ -192,7 +175,6 @@ t cosh -0x1.61da04cbafe44p+9 -0x1p10 1000 t cosh 0x1p10 inf 100 t cosh -0x1p10 -inf 100 -L=2.59 t atanhf 0 0x1p-12 500 t atanhf 0x1p-12 1 200000 t atanhf 1 inf 1000 @@ -200,11 +182,9 @@ t atanhf -0 -0x1p-12 500 t atanhf -0x1p-12 -1 200000 t atanhf -1 -inf 1000 -L=1.03 t cbrtf 0 inf 1000000 t cbrtf -0 -inf 1000000 -L=2.09 t tanhf 0 0x1p-23 1000 t tanhf -0 -0x1p-23 1000 t tanhf 0x1p-23 0x1.205966p+3 100000 @@ -604,59 +584,11 @@ range_sve_erfc=' 0 inf 40000 ' -# error limits -L_erfc=3.15 -L_erfcf=0.26 -L_log10=1.97 -L_log10f=2.81 -L_erf=1.26 -L_erff=0.76 -# TODO tighten this once __v_atan2 is fixed -L_atan2=2.9 -L_atan=1.78 -L_atan2f=2.46 -L_atanf=2.5 -L_log1pf=1.53 -L_asinhf=2.17 -L_log2f=2.10 -L_log2=2.10 -L_tanf=2.7 -L_log1p=1.97 -L_expm1f=1.02 -L_sinhf=1.76 -L_coshf=1.89 -L_expm1=1.68 -L_sinh=2.08 -L_cosh=1.43 -L_atanhf=2.59 -L_cbrtf=1.03 -L_asinh=1.54 -L_tanhf=2.09 - -L_sve_cosf=1.57 -L_sve_cos=1.61 -L_sve_sinf=1.40 -L_sve_sin=2.03 -L_sve_atanf=2.9 -L_sve_atan=1.78 -L_sve_atan2f=2.45 -L_sve_atan2=1.78 -L_sve_log10=1.97 -L_sve_log10f=2.82 -L_sve_logf=2.85 -L_sve_log=1.68 -L_sve_expf=1.46 -L_sve_erff=0.76 -L_sve_erf=1.97 -L_sve_tanf=2.7 -L_sve_erfc=3.15 - while read G F R D A do [ "$R" = 1 ] && { [[ $G != sve_* ]] || [ $WANT_SVE_MATH -eq 1 ]; } || continue case "$G" in \#*) continue ;; esac eval range="\${range_$G}" - eval L="\${L_$G}" while read X do [ -n "$X" ] || continue @@ -683,7 +615,7 @@ do fi fi case "$X" in \#*) continue ;; esac - t $A $f $F $X + t $F $X "$A $f" done << EOF $range EOF diff --git a/pl/math/v_asinh_2u5.c b/pl/math/v_asinh_2u5.c index 02e8098..d7f9a50 100644 --- a/pl/math/v_asinh_2u5.c +++ b/pl/math/v_asinh_2u5.c @@ -7,6 +7,7 @@ #include "v_math.h" #include "estrin.h" #include "pl_sig.h" +#include "pl_test.h" #if V_SUPPORTED @@ -154,4 +155,5 @@ VPCS_ATTR v_f64_t V_NAME (asinh) (v_f64_t x) VPCS_ALIAS PL_SIG (V, D, 1, asinh, -10.0, 10.0) +PL_TEST_ULP (V_NAME (asinh), 1.54) #endif diff --git a/pl/math/v_asinhf_2u7.c b/pl/math/v_asinhf_2u7.c index 18a2395..812e28f 100644 --- a/pl/math/v_asinhf_2u7.c +++ b/pl/math/v_asinhf_2u7.c @@ -7,6 +7,7 @@ #include "v_math.h" #include "include/mathlib.h" #include "pl_sig.h" +#include "pl_test.h" #if V_SUPPORTED @@ -55,4 +56,5 @@ VPCS_ATTR v_f32_t V_NAME (asinhf) (v_f32_t x) VPCS_ALIAS PL_SIG (V, F, 1, asinh, -10.0, 10.0) +PL_TEST_ULP (V_NAME (asinhf), 2.17) #endif diff --git a/pl/math/v_atan2_3u.c b/pl/math/v_atan2_3u.c index 2b31bec..27af80d 100644 --- a/pl/math/v_atan2_3u.c +++ b/pl/math/v_atan2_3u.c @@ -7,6 +7,7 @@ #include "v_math.h" #include "pl_sig.h" +#include "pl_test.h" #if V_SUPPORTED @@ -79,4 +80,6 @@ VPCS_ALIAS /* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. 
*/ PL_SIG (V, D, 2, atan2) +// TODO tighten this once __v_atan2 is fixed +PL_TEST_ULP (V_NAME (atan2), 2.9) #endif diff --git a/pl/math/v_atan2f_3u.c b/pl/math/v_atan2f_3u.c index 8c2c8f2..3d8f9fc 100644 --- a/pl/math/v_atan2f_3u.c +++ b/pl/math/v_atan2f_3u.c @@ -7,6 +7,7 @@ #include "v_math.h" #include "pl_sig.h" +#include "pl_test.h" #if V_SUPPORTED @@ -79,4 +80,5 @@ VPCS_ALIAS /* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */ PL_SIG (V, F, 2, atan2) +PL_TEST_ULP (V_NAME (atan2f), 2.46) #endif diff --git a/pl/math/v_atan_2u5.c b/pl/math/v_atan_2u5.c index 3e504e7..de39fa7 100644 --- a/pl/math/v_atan_2u5.c +++ b/pl/math/v_atan_2u5.c @@ -7,6 +7,7 @@ #include "v_math.h" #include "pl_sig.h" +#include "pl_test.h" #if V_SUPPORTED @@ -51,4 +52,5 @@ v_f64_t V_NAME (atan) (v_f64_t x) VPCS_ALIAS PL_SIG (V, D, 1, atan, -10.0, 10.0) +PL_TEST_ULP (V_NAME (atan), 1.78) #endif diff --git a/pl/math/v_atanf_3u.c b/pl/math/v_atanf_3u.c index 9a0230a..8014d65 100644 --- a/pl/math/v_atanf_3u.c +++ b/pl/math/v_atanf_3u.c @@ -7,6 +7,7 @@ #include "v_math.h" #include "pl_sig.h" +#include "pl_test.h" #if V_SUPPORTED @@ -50,4 +51,5 @@ v_f32_t V_NAME (atanf) (v_f32_t x) VPCS_ALIAS PL_SIG (V, F, 1, atan, -10.0, 10.0) +PL_TEST_ULP (V_NAME (atanf), 2.5) #endif diff --git a/pl/math/v_atanhf_3u1.c b/pl/math/v_atanhf_3u1.c index 5c9ceb1..c950c46 100644 --- a/pl/math/v_atanhf_3u1.c +++ b/pl/math/v_atanhf_3u1.c @@ -7,6 +7,7 @@ #include "v_math.h" #include "mathlib.h" #include "pl_sig.h" +#include "pl_test.h" #if V_SUPPORTED @@ -46,8 +47,8 @@ VPCS_ATTR v_f32_t V_NAME (atanhf) (v_f32_t x) return v_call_f32 (atanhf, x, y, special); return y; } - VPCS_ALIAS PL_SIG (V, F, 1, atanh, -1.0, 1.0) +PL_TEST_ULP (V_NAME (atanhf), 2.59) #endif diff --git a/pl/math/v_cbrtf_1u5.c b/pl/math/v_cbrtf_1u5.c index 9db2579..b5f4c72 100644 --- a/pl/math/v_cbrtf_1u5.c +++ b/pl/math/v_cbrtf_1u5.c @@ -7,6 +7,7 @@ #include "v_math.h" #include "mathlib.h" #include "pl_sig.h" +#include "pl_test.h" #if V_SUPPORTED @@ -87,4 +88,5 @@ VPCS_ATTR v_f32_t V_NAME (cbrtf) (v_f32_t x) VPCS_ALIAS PL_SIG (V, F, 1, cbrt, -10.0, 10.0) +PL_TEST_ULP (V_NAME (cbrtf), 1.03) #endif diff --git a/pl/math/v_cosh_2u.c b/pl/math/v_cosh_2u.c index fa4f1e6..1cac350 100644 --- a/pl/math/v_cosh_2u.c +++ b/pl/math/v_cosh_2u.c @@ -6,8 +6,9 @@ #include "v_math.h" #include "pl_sig.h" - +#include "pl_test.h" #include "v_exp_tail.h" + #define C1 v_f64 (C1_scal) #define C2 v_f64 (C2_scal) #define C3 v_f64 (C3_scal) @@ -85,4 +86,5 @@ VPCS_ATTR v_f64_t V_NAME (cosh) (v_f64_t x) VPCS_ALIAS PL_SIG (V, D, 1, cosh, -10.0, 10.0) +PL_TEST_ULP (V_NAME (cosh), 1.43) #endif diff --git a/pl/math/v_coshf_2u4.c b/pl/math/v_coshf_2u4.c index 95288d9..b0a2be1 100644 --- a/pl/math/v_coshf_2u4.c +++ b/pl/math/v_coshf_2u4.c @@ -7,6 +7,7 @@ #include "v_math.h" #include "mathlib.h" #include "pl_sig.h" +#include "pl_test.h" #define AbsMask 0x7fffffff #define TinyBound 0x20000000 /* 0x1p-63: Round to 1 below this. 
*/ @@ -61,4 +62,5 @@ VPCS_ATTR v_f32_t V_NAME (coshf) (v_f32_t x) VPCS_ALIAS PL_SIG (V, F, 1, cosh, -10.0, 10.0) +PL_TEST_ULP (V_NAME (coshf), 1.89) #endif diff --git a/pl/math/v_erf_2u.c b/pl/math/v_erf_2u.c index 192b6ed..e33d405 100644 --- a/pl/math/v_erf_2u.c +++ b/pl/math/v_erf_2u.c @@ -9,6 +9,7 @@ #include "include/mathlib.h" #include "math_config.h" #include "pl_sig.h" +#include "pl_test.h" #if V_SUPPORTED @@ -105,4 +106,5 @@ v_f64_t V_NAME (erf) (v_f64_t x) VPCS_ALIAS PL_SIG (V, D, 1, erf, -6.0, 6.0) +PL_TEST_ULP (V_NAME (erf), 1.26) #endif diff --git a/pl/math/v_erfc_4u.c b/pl/math/v_erfc_4u.c index 88f5172..9b08ead 100644 --- a/pl/math/v_erfc_4u.c +++ b/pl/math/v_erfc_4u.c @@ -9,6 +9,7 @@ #include "horner.h" #include "math_config.h" #include "pl_sig.h" +#include "pl_test.h" #if V_SUPPORTED @@ -157,4 +158,5 @@ v_f64_t V_NAME (erfc) (v_f64_t x) VPCS_ALIAS PL_SIG (V, D, 1, erfc, -6.0, 28.0) +PL_TEST_ULP (V_NAME (erfc), 3.15) #endif diff --git a/pl/math/v_erfcf_1u.c b/pl/math/v_erfcf_1u.c index cf2b174..e39801e 100644 --- a/pl/math/v_erfcf_1u.c +++ b/pl/math/v_erfcf_1u.c @@ -9,6 +9,7 @@ #include "erfcf.h" #include "estrin.h" #include "pl_sig.h" +#include "pl_test.h" #if V_SUPPORTED @@ -172,4 +173,5 @@ v_f32_t V_NAME (erfcf) (v_f32_t x) VPCS_ALIAS PL_SIG (V, F, 1, erfc, -6.0, 28.0) +PL_TEST_ULP (V_NAME (erfcf), 0.26) #endif diff --git a/pl/math/v_erff_1u5.c b/pl/math/v_erff_1u5.c index 3e8d400..52f063c 100644 --- a/pl/math/v_erff_1u5.c +++ b/pl/math/v_erff_1u5.c @@ -9,6 +9,7 @@ #include "include/mathlib.h" #include "math_config.h" #include "pl_sig.h" +#include "pl_test.h" #if V_SUPPORTED @@ -105,4 +106,5 @@ v_f32_t V_NAME (erff) (v_f32_t x) VPCS_ALIAS PL_SIG (V, F, 1, erf, -4.0, 4.0) +PL_TEST_ULP (V_NAME (erff), 0.76) #endif diff --git a/pl/math/v_expm1_2u5.c b/pl/math/v_expm1_2u5.c index 78576a9..e0a31a5 100644 --- a/pl/math/v_expm1_2u5.c +++ b/pl/math/v_expm1_2u5.c @@ -7,6 +7,7 @@ #include "v_math.h" #include "pl_sig.h" +#include "pl_test.h" #if V_SUPPORTED @@ -101,4 +102,5 @@ v_f64_t V_NAME (expm1) (v_f64_t x) VPCS_ALIAS PL_SIG (V, D, 1, expm1, -9.9, 9.9) +PL_TEST_ULP (V_NAME (expm1), 1.68) #endif diff --git a/pl/math/v_expm1f_1u6.c b/pl/math/v_expm1f_1u6.c index 6d1ae0e..dd211f9 100644 --- a/pl/math/v_expm1f_1u6.c +++ b/pl/math/v_expm1f_1u6.c @@ -7,6 +7,7 @@ #include "v_math.h" #include "pl_sig.h" +#include "pl_test.h" #if V_SUPPORTED @@ -84,4 +85,5 @@ v_f32_t V_NAME (expm1f) (v_f32_t x) VPCS_ALIAS PL_SIG (V, F, 1, expm1, -9.9, 9.9) +PL_TEST_ULP (V_NAME (expm1f), 1.02) #endif diff --git a/pl/math/v_log10_2u5.c b/pl/math/v_log10_2u5.c index c481b00..014accc 100644 --- a/pl/math/v_log10_2u5.c +++ b/pl/math/v_log10_2u5.c @@ -8,6 +8,7 @@ #include "v_math.h" #include "include/mathlib.h" #include "pl_sig.h" +#include "pl_test.h" #if V_SUPPORTED @@ -101,4 +102,5 @@ v_f64_t V_NAME (log10) (v_f64_t x) VPCS_ALIAS PL_SIG (V, D, 1, log10, 0.01, 11.1) +PL_TEST_ULP (V_NAME (log10), 1.97) #endif diff --git a/pl/math/v_log10f_3u5.c b/pl/math/v_log10f_3u5.c index 8bf8bb3..f25da91 100644 --- a/pl/math/v_log10f_3u5.c +++ b/pl/math/v_log10f_3u5.c @@ -8,6 +8,7 @@ #include "v_math.h" #include "mathlib.h" #include "pl_sig.h" +#include "pl_test.h" #if V_SUPPORTED @@ -74,4 +75,5 @@ v_f32_t V_NAME (log10f) (v_f32_t x) VPCS_ALIAS PL_SIG (V, F, 1, log10, 0.01, 11.1) +PL_TEST_ULP (V_NAME (log10f), 2.81) #endif diff --git a/pl/math/v_log1p_2u5.c b/pl/math/v_log1p_2u5.c index e0d58ec..889fac0 100644 --- a/pl/math/v_log1p_2u5.c +++ b/pl/math/v_log1p_2u5.c @@ -5,12 +5,12 @@ */ #include "v_math.h" +#include 
"estrin.h" #include "pl_sig.h" +#include "pl_test.h" #if V_SUPPORTED -#include "estrin.h" - #define Ln2Hi v_f64 (0x1.62e42fefa3800p-1) #define Ln2Lo v_f64 (0x1.ef35793c76730p-45) #define HfRt2Top 0x3fe6a09e00000000 /* top32(asuint64(sqrt(2)/2)) << 32. */ @@ -103,8 +103,8 @@ VPCS_ATTR v_f64_t V_NAME (log1p) (v_f64_t x) return y; } - VPCS_ALIAS PL_SIG (V, D, 1, log1p, -0.9, 10.0) +PL_TEST_ULP (V_NAME (log1p), 1.97) #endif diff --git a/pl/math/v_log1pf_2u1.c b/pl/math/v_log1pf_2u1.c index 361fa4f..93c896b 100644 --- a/pl/math/v_log1pf_2u1.c +++ b/pl/math/v_log1pf_2u1.c @@ -6,6 +6,7 @@ #include "v_math.h" #include "pl_sig.h" +#include "pl_test.h" #if V_SUPPORTED @@ -145,4 +146,5 @@ VPCS_ATTR v_f32_t V_NAME (log1pf) (v_f32_t x) VPCS_ALIAS PL_SIG (V, F, 1, log1p, -0.9, 10.0) +PL_TEST_ULP (V_NAME (log1pf), 1.53) #endif diff --git a/pl/math/v_log2_3u.c b/pl/math/v_log2_3u.c index 07afced..3bdfd2e 100644 --- a/pl/math/v_log2_3u.c +++ b/pl/math/v_log2_3u.c @@ -8,6 +8,7 @@ #include "v_math.h" #include "include/mathlib.h" #include "pl_sig.h" +#include "pl_test.h" #if V_SUPPORTED @@ -88,4 +89,5 @@ v_f64_t V_NAME (log2) (v_f64_t x) VPCS_ALIAS PL_SIG (V, D, 1, log2, 0.01, 11.1) +PL_TEST_ULP (V_NAME (log2), 2.10) #endif diff --git a/pl/math/v_log2f_2u6.c b/pl/math/v_log2f_2u6.c index 335b324..8d7d138 100644 --- a/pl/math/v_log2f_2u6.c +++ b/pl/math/v_log2f_2u6.c @@ -8,6 +8,7 @@ #include "v_math.h" #include "math_config.h" #include "pl_sig.h" +#include "pl_test.h" #if V_SUPPORTED @@ -120,4 +121,5 @@ VPCS_ATTR v_f32_t V_NAME (log2f) (v_f32_t x) VPCS_ALIAS PL_SIG (V, F, 1, log2, 0.01, 11.1) +PL_TEST_ULP (V_NAME (log2f), 2.10) #endif diff --git a/pl/math/v_sinh_3u.c b/pl/math/v_sinh_3u.c index cec8208..7d6b612 100644 --- a/pl/math/v_sinh_3u.c +++ b/pl/math/v_sinh_3u.c @@ -7,6 +7,7 @@ #include "v_math.h" #include "mathlib.h" #include "pl_sig.h" +#include "pl_test.h" #define AbsMask 0x7fffffffffffffff #define Half 0x3fe0000000000000 @@ -44,4 +45,5 @@ VPCS_ATTR v_f64_t V_NAME (sinh) (v_f64_t x) VPCS_ALIAS PL_SIG (V, D, 1, sinh, -10.0, 10.0) +PL_TEST_ULP (V_NAME (sinh), 2.08) #endif diff --git a/pl/math/v_sinhf_2u3.c b/pl/math/v_sinhf_2u3.c index 299f081..76bfe78 100644 --- a/pl/math/v_sinhf_2u3.c +++ b/pl/math/v_sinhf_2u3.c @@ -7,6 +7,7 @@ #include "v_math.h" #include "mathlib.h" #include "pl_sig.h" +#include "pl_test.h" #if V_SUPPORTED @@ -43,4 +44,5 @@ VPCS_ATTR v_f32_t V_NAME (sinhf) (v_f32_t x) VPCS_ALIAS PL_SIG (V, F, 1, sinh, -10.0, 10.0) +PL_TEST_ULP (V_NAME (sinhf), 1.76) #endif diff --git a/pl/math/v_tanf_3u2.c b/pl/math/v_tanf_3u2.c index 9a11ba4..73b0807 100644 --- a/pl/math/v_tanf_3u2.c +++ b/pl/math/v_tanf_3u2.c @@ -8,6 +8,7 @@ #include "v_math.h" #include "estrinf.h" #include "pl_sig.h" +#include "pl_test.h" #if V_SUPPORTED @@ -117,4 +118,5 @@ v_f32_t V_NAME (tanf) (v_f32_t x) VPCS_ALIAS PL_SIG (V, F, 1, tan, -3.1, 3.1) +PL_TEST_ULP (V_NAME (tanf), 2.7) #endif diff --git a/pl/math/v_tanhf_2u6.c b/pl/math/v_tanhf_2u6.c index 1196c4a..bb86794 100644 --- a/pl/math/v_tanhf_2u6.c +++ b/pl/math/v_tanhf_2u6.c @@ -8,6 +8,7 @@ #include "estrinf.h" #include "mathlib.h" #include "pl_sig.h" +#include "pl_test.h" #if V_SUPPORTED @@ -90,4 +91,5 @@ VPCS_ATTR v_f32_t V_NAME (tanhf) (v_f32_t x) VPCS_ALIAS PL_SIG (V, F, 1, tanh, -10.0, 10.0) +PL_TEST_ULP (V_NAME (tanhf), 2.09) #endif diff --git a/pl/math/vn_asinh_2u5.c b/pl/math/vn_asinh_2u5.c index ecc61ed..e349530 100644 --- a/pl/math/vn_asinh_2u5.c +++ b/pl/math/vn_asinh_2u5.c @@ -7,6 +7,6 @@ #include "include/mathlib.h" #ifdef __vpcs #define VPCS 1 -#define 
VPCS_ALIAS strong_alias (__vn_asinh, _ZGVnN2v_asinh) +#define VPCS_ALIAS PL_ALIAS (__vn_asinh, _ZGVnN2v_asinh) #include "v_asinh_2u5.c" #endif diff --git a/pl/math/vn_asinhf_2u7.c b/pl/math/vn_asinhf_2u7.c index c42e37e..8efe099 100644 --- a/pl/math/vn_asinhf_2u7.c +++ b/pl/math/vn_asinhf_2u7.c @@ -7,6 +7,6 @@ #include "include/mathlib.h" #ifdef __vpcs #define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_asinhf, _ZGVnN4v_asinhf) +#define VPCS_ALIAS PL_ALIAS (__vn_asinhf, _ZGVnN4v_asinhf) #include "v_asinhf_2u7.c" #endif diff --git a/pl/math/vn_atan2_3u.c b/pl/math/vn_atan2_3u.c index b7c46e9..7575bff 100644 --- a/pl/math/vn_atan2_3u.c +++ b/pl/math/vn_atan2_3u.c @@ -7,6 +7,6 @@ #include "include/mathlib.h" #ifdef __vpcs #define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_atan2, _ZGVnN2vv_atan2) +#define VPCS_ALIAS PL_ALIAS (__vn_atan2, _ZGVnN2vv_atan2) #include "v_atan2_3u.c" #endif diff --git a/pl/math/vn_atan2f_3u.c b/pl/math/vn_atan2f_3u.c index 23aad38..b378806 100644 --- a/pl/math/vn_atan2f_3u.c +++ b/pl/math/vn_atan2f_3u.c @@ -7,6 +7,6 @@ #include "include/mathlib.h" #ifdef __vpcs #define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_atan2f, _ZGVnN4vv_atan2f) +#define VPCS_ALIAS PL_ALIAS (__vn_atan2f, _ZGVnN4vv_atan2f) #include "v_atan2f_3u.c" #endif diff --git a/pl/math/vn_atan_2u5.c b/pl/math/vn_atan_2u5.c index 22baab9..539e61b 100644 --- a/pl/math/vn_atan_2u5.c +++ b/pl/math/vn_atan_2u5.c @@ -7,6 +7,6 @@ #include "include/mathlib.h" #ifdef __vpcs #define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_atan, _ZGVnN2v_atan) +#define VPCS_ALIAS PL_ALIAS (__vn_atan, _ZGVnN2v_atan) #include "v_atan_2u5.c" #endif diff --git a/pl/math/vn_atanf_3u.c b/pl/math/vn_atanf_3u.c index 17ba6b8..aaeef5b 100644 --- a/pl/math/vn_atanf_3u.c +++ b/pl/math/vn_atanf_3u.c @@ -7,6 +7,6 @@ #include "include/mathlib.h" #ifdef __vpcs #define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_atanf, _ZGVnN4v_atanf) +#define VPCS_ALIAS PL_ALIAS (__vn_atanf, _ZGVnN4v_atanf) #include "v_atanf_3u.c" #endif diff --git a/pl/math/vn_atanhf_3u1.c b/pl/math/vn_atanhf_3u1.c index d4ad391..32e2c45 100644 --- a/pl/math/vn_atanhf_3u1.c +++ b/pl/math/vn_atanhf_3u1.c @@ -7,6 +7,6 @@ #include "include/mathlib.h" #ifdef __vpcs #define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_atanhf, _ZGVnN4v_atanhf) +#define VPCS_ALIAS PL_ALIAS (__vn_atanhf, _ZGVnN4v_atanhf) #include "v_atanhf_3u1.c" #endif diff --git a/pl/math/vn_cbrtf_1u5.c b/pl/math/vn_cbrtf_1u5.c index 3452807..53774cf 100644 --- a/pl/math/vn_cbrtf_1u5.c +++ b/pl/math/vn_cbrtf_1u5.c @@ -7,6 +7,6 @@ #include "include/mathlib.h" #ifdef __vpcs #define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_cbrtf, _ZGVnN4v_cbrtf) +#define VPCS_ALIAS PL_ALIAS (__vn_cbrtf, _ZGVnN4v_cbrtf) #include "v_cbrtf_1u5.c" #endif diff --git a/pl/math/vn_cosh_2u.c b/pl/math/vn_cosh_2u.c index 5f02efd..5950e2d 100644 --- a/pl/math/vn_cosh_2u.c +++ b/pl/math/vn_cosh_2u.c @@ -7,6 +7,6 @@ #include "include/mathlib.h" #ifdef __vpcs #define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_cosh, _ZGVnN2v_cosh) +#define VPCS_ALIAS PL_ALIAS (__vn_cosh, _ZGVnN2v_cosh) #include "v_cosh_2u.c" #endif diff --git a/pl/math/vn_coshf_2u4.c b/pl/math/vn_coshf_2u4.c index 6bc4635..e2fdc13 100644 --- a/pl/math/vn_coshf_2u4.c +++ b/pl/math/vn_coshf_2u4.c @@ -7,6 +7,6 @@ #include "include/mathlib.h" #ifdef __vpcs #define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_coshf, _ZGVnN4v_coshf) +#define VPCS_ALIAS PL_ALIAS (__vn_coshf, _ZGVnN4v_coshf) #include "v_coshf_2u4.c" #endif diff --git a/pl/math/vn_erf_2u.c 
b/pl/math/vn_erf_2u.c index 2841eca..0ffad52 100644 --- a/pl/math/vn_erf_2u.c +++ b/pl/math/vn_erf_2u.c @@ -7,6 +7,6 @@ #include "include/mathlib.h" #ifdef __vpcs #define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_erf, _ZGVnN2v_erf) +#define VPCS_ALIAS PL_ALIAS (__vn_erf, _ZGVnN2v_erf) #include "v_erf_2u.c" #endif diff --git a/pl/math/vn_erfc_4u.c b/pl/math/vn_erfc_4u.c index 678e316..940188a 100644 --- a/pl/math/vn_erfc_4u.c +++ b/pl/math/vn_erfc_4u.c @@ -7,6 +7,6 @@ #include "include/mathlib.h" #ifdef __vpcs #define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_erfc, _ZGVnN2v_erfc) +#define VPCS_ALIAS PL_ALIAS (__vn_erfc, _ZGVnN2v_erfc) #include "v_erfc_4u.c" #endif diff --git a/pl/math/vn_erfcf_1u.c b/pl/math/vn_erfcf_1u.c index 2248f79..58829b5 100644 --- a/pl/math/vn_erfcf_1u.c +++ b/pl/math/vn_erfcf_1u.c @@ -7,6 +7,6 @@ #include "include/mathlib.h" #ifdef __vpcs #define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_erfcf, _ZGVnN4v_erfcf) +#define VPCS_ALIAS PL_ALIAS (__vn_erfcf, _ZGVnN4v_erfcf) #include "v_erfcf_1u.c" #endif diff --git a/pl/math/vn_erff_1u5.c b/pl/math/vn_erff_1u5.c index 5b48442..f39560e 100644 --- a/pl/math/vn_erff_1u5.c +++ b/pl/math/vn_erff_1u5.c @@ -7,6 +7,6 @@ #include "include/mathlib.h" #ifdef __vpcs #define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_erff, _ZGVnN4v_erff) +#define VPCS_ALIAS PL_ALIAS (__vn_erff, _ZGVnN4v_erff) #include "v_erff_1u5.c" #endif diff --git a/pl/math/vn_expm1_2u5.c b/pl/math/vn_expm1_2u5.c index fc88b06..d946808 100644 --- a/pl/math/vn_expm1_2u5.c +++ b/pl/math/vn_expm1_2u5.c @@ -7,6 +7,6 @@ #include "include/mathlib.h" #ifdef __vpcs #define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_expm1, _ZGVnN2v_expm1) +#define VPCS_ALIAS PL_ALIAS (__vn_expm1, _ZGVnN2v_expm1) #include "v_expm1_2u5.c" #endif diff --git a/pl/math/vn_expm1f_1u6.c b/pl/math/vn_expm1f_1u6.c index 5cbb929..304e0a5 100644 --- a/pl/math/vn_expm1f_1u6.c +++ b/pl/math/vn_expm1f_1u6.c @@ -7,6 +7,6 @@ #include "include/mathlib.h" #ifdef __vpcs #define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_expm1f, _ZGVnN4v_expm1f) +#define VPCS_ALIAS PL_ALIAS (__vn_expm1f, _ZGVnN4v_expm1f) #include "v_expm1f_1u6.c" #endif diff --git a/pl/math/vn_log10_2u5.c b/pl/math/vn_log10_2u5.c index b94499b..e52285c 100644 --- a/pl/math/vn_log10_2u5.c +++ b/pl/math/vn_log10_2u5.c @@ -7,6 +7,6 @@ #include "include/mathlib.h" #ifdef __vpcs #define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_log10, _ZGVnN2v_log10) +#define VPCS_ALIAS PL_ALIAS (__vn_log10, _ZGVnN2v_log10) #include "v_log10_2u5.c" #endif diff --git a/pl/math/vn_log10f_3u5.c b/pl/math/vn_log10f_3u5.c index b419d0a..7d6fe25 100644 --- a/pl/math/vn_log10f_3u5.c +++ b/pl/math/vn_log10f_3u5.c @@ -7,6 +7,6 @@ #include "include/mathlib.h" #ifdef __vpcs #define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_log10f, _ZGVnN4v_log10f) +#define VPCS_ALIAS PL_ALIAS (__vn_log10f, _ZGVnN4v_log10f) #include "v_log10f_3u5.c" #endif diff --git a/pl/math/vn_log1p_2u5.c b/pl/math/vn_log1p_2u5.c index 4fed0b3..7beab12 100644 --- a/pl/math/vn_log1p_2u5.c +++ b/pl/math/vn_log1p_2u5.c @@ -7,6 +7,6 @@ #include "include/mathlib.h" #ifdef __vpcs #define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_log1p, _ZGVnN2v_log1p) +#define VPCS_ALIAS PL_ALIAS (__vn_log1p, _ZGVnN2v_log1p) #include "v_log1p_2u5.c" #endif diff --git a/pl/math/vn_log1pf_2u1.c b/pl/math/vn_log1pf_2u1.c index 429d167..f5ebcd8 100644 --- a/pl/math/vn_log1pf_2u1.c +++ b/pl/math/vn_log1pf_2u1.c @@ -7,6 +7,6 @@ #include "include/mathlib.h" #ifdef __vpcs #define VPCS 1 -#define 
VPCS_ALIAS strong_alias (__vn_log1pf, _ZGVnN4v_log1pf) +#define VPCS_ALIAS PL_ALIAS (__vn_log1pf, _ZGVnN4v_log1pf) #include "v_log1pf_2u1.c" #endif diff --git a/pl/math/vn_log2_3u.c b/pl/math/vn_log2_3u.c index d74f9ca..3a67e03 100644 --- a/pl/math/vn_log2_3u.c +++ b/pl/math/vn_log2_3u.c @@ -7,6 +7,6 @@ #include "include/mathlib.h" #ifdef __vpcs #define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_log2, _ZGVnN2v_log2) +#define VPCS_ALIAS PL_ALIAS (__vn_log2, _ZGVnN2v_log2) #include "v_log2_3u.c" #endif diff --git a/pl/math/vn_log2f_2u6.c b/pl/math/vn_log2f_2u6.c index dc5ab03..18effaf 100644 --- a/pl/math/vn_log2f_2u6.c +++ b/pl/math/vn_log2f_2u6.c @@ -7,6 +7,6 @@ #include "include/mathlib.h" #ifdef __vpcs #define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_log2f, _ZGVnN4v_log2f) +#define VPCS_ALIAS PL_ALIAS (__vn_log2f, _ZGVnN4v_log2f) #include "v_log2f_2u6.c" #endif diff --git a/pl/math/vn_sinh_3u.c b/pl/math/vn_sinh_3u.c index 2b68578..fb42f20 100644 --- a/pl/math/vn_sinh_3u.c +++ b/pl/math/vn_sinh_3u.c @@ -7,6 +7,6 @@ #include "include/mathlib.h" #ifdef __vpcs #define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_sinh, _ZGVnN2v_sinh) +#define VPCS_ALIAS PL_ALIAS (__vn_sinh, _ZGVnN2v_sinh) #include "v_sinh_3u.c" #endif diff --git a/pl/math/vn_sinhf_2u3.c b/pl/math/vn_sinhf_2u3.c index fcedb6d..230ee6e 100644 --- a/pl/math/vn_sinhf_2u3.c +++ b/pl/math/vn_sinhf_2u3.c @@ -7,6 +7,6 @@ #include "include/mathlib.h" #ifdef __vpcs #define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_sinhf, _ZGVnN4v_sinhf) +#define VPCS_ALIAS PL_ALIAS (__vn_sinhf, _ZGVnN4v_sinhf) #include "v_sinhf_2u3.c" #endif diff --git a/pl/math/vn_tanf_3u2.c b/pl/math/vn_tanf_3u2.c index a086cc9..e37976d 100644 --- a/pl/math/vn_tanf_3u2.c +++ b/pl/math/vn_tanf_3u2.c @@ -7,6 +7,6 @@ #include "include/mathlib.h" #ifdef __vpcs #define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_tanf, _ZGVnN4v_tanf) +#define VPCS_ALIAS PL_ALIAS (__vn_tanf, _ZGVnN4v_tanf) #include "v_tanf_3u2.c" #endif diff --git a/pl/math/vn_tanhf_2u6.c b/pl/math/vn_tanhf_2u6.c index 96fd67a..86e460c 100644 --- a/pl/math/vn_tanhf_2u6.c +++ b/pl/math/vn_tanhf_2u6.c @@ -7,6 +7,6 @@ #include "include/mathlib.h" #ifdef __vpcs #define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_tanhf, _ZGVnN4v_tanhf) +#define VPCS_ALIAS PL_ALIAS (__vn_tanhf, _ZGVnN4v_tanhf) #include "v_tanhf_2u6.c" #endif -- cgit v1.2.3 From d748e1520dd2ff5ad3574bd0827cdd882bf6bed8 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Thu, 15 Dec 2022 13:27:57 +0000 Subject: pl/math: Move fenv expectations out of runulp.sh Introduces a new macro, similar to how ULP thresholds are now handled, that emits a list of routines which are expected to correctly trigger fenv exceptions, to be consumed by runulp.sh. All scalar routines are expected to do so. A small number of Neon routines are also expected to, dependent on WANT_ERRNO. 
--- pl/math/Dir.mk | 15 ++++- pl/math/include/pl_test.h | 6 ++ pl/math/test/pl_test.h | 17 ++++- pl/math/test/runulp.sh | 162 ++++++++++++++++++++-------------------------- pl/math/v_asinh_2u5.c | 1 + pl/math/v_asinhf_2u7.c | 1 + pl/math/v_atanhf_3u1.c | 1 + pl/math/v_cbrtf_1u5.c | 1 + pl/math/v_cosh_2u.c | 1 + pl/math/v_coshf_2u4.c | 1 + pl/math/v_expm1_2u5.c | 1 + pl/math/v_expm1f_1u6.c | 1 + pl/math/v_log1p_2u5.c | 1 + pl/math/v_log1pf_2u1.c | 1 + pl/math/v_log2f_2u6.c | 1 + pl/math/v_sinh_3u.c | 1 + pl/math/v_sinhf_2u3.c | 1 + pl/math/v_tanf_3u2.c | 1 + pl/math/v_tanhf_2u6.c | 1 + 19 files changed, 121 insertions(+), 94 deletions(-) diff --git a/pl/math/Dir.mk b/pl/math/Dir.mk index 0cae7be..b27bcee 100644 --- a/pl/math/Dir.mk +++ b/pl/math/Dir.mk @@ -147,18 +147,24 @@ ulp-input-dir=$(B)/test/inputs math-lib-lims = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.ulp,$(basename $(math-lib-srcs))) math-lib-aliases = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.alias,$(basename $(math-lib-srcs))) +math-lib-fenvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.fenv,$(basename $(math-lib-srcs))) $(math-lib-lims): CFLAGS_PL += -I$(PLM) -I$(PLM)/include $(math-cflags) $(math-lib-aliases): CFLAGS_PL += -I$(PLM) -I$(PLM)/include $(math-cflags) +$(math-lib-fenvs): CFLAGS_PL += -I$(PLM) -I$(PLM)/include $(math-cflags) $(ulp-input-dir)/%.ulp: $(PLM)/%.c mkdir -p $(@D) - $(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep "PL_TEST_ULP" || true; } > $@ + $(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep -o "PL_TEST_ULP [^ ]* [^ ]*" || true; } > $@ $(ulp-input-dir)/%.alias: $(PLM)/%.c mkdir -p $(@D) $(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep "PL_TEST_ALIAS" || true; } | sed "s/_x / /g"> $@ +$(ulp-input-dir)/%.fenv: $(PLM)/%.c + mkdir -p $(@D) + $(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep -o "PL_TEST_EXPECT_FENV_ENABLED [^ ]*" || true; } > $@ + ulp-lims := $(ulp-input-dir)/limits $(ulp-lims): $(math-lib-lims) cat $^ | sed "s/PL_TEST_ULP //g;s/^ *//g" > $@ @@ -167,12 +173,17 @@ ulp-aliases := $(ulp-input-dir)/aliases $(ulp-aliases): $(math-lib-aliases) cat $^ | sed "s/PL_TEST_ALIAS //g;s/^ *//g" > $@ -check-pl/math-ulp: $(math-tools) $(ulp-lims) $(ulp-aliases) +fenv-exps := $(ulp-input-dir)/fenv +$(fenv-exps): $(math-lib-fenvs) + cat $^ | sed "s/PL_TEST_EXPECT_FENV_ENABLED //g;s/^ *//g" > $@ + +check-pl/math-ulp: $(math-tools) $(ulp-lims) $(ulp-aliases) $(fenv-exps) WANT_ERRNO=$(WANT_ERRNO) \ WANT_SVE_MATH=$(WANT_SVE_MATH) \ ULPFLAGS="$(math-ulpflags)" \ LIMITS=../../../$(ulp-lims) \ ALIASES=../../../$(ulp-aliases) \ + FENV=../../../$(fenv-exps) \ build/pl/bin/runulp.sh $(EMULATOR) check-pl/math: check-pl/math-test check-pl/math-rtest check-pl/math-ulp diff --git a/pl/math/include/pl_test.h b/pl/math/include/pl_test.h index 1ab3814..8999efa 100644 --- a/pl/math/include/pl_test.h +++ b/pl/math/include/pl_test.h @@ -14,3 +14,9 @@ strong_alias. Use PL_ALIAS instead of strong_alias to make sure the alias is also added to the test suite. */ #define PL_ALIAS(a, b) strong_alias (a, b) + +/* Emit routine name if e == 1 and f is expected to correctly trigger fenv + exceptions. e allows declaration to be emitted conditionally upon certain + build flags - defer expansion by one pass to allow those flags to be expanded + properly. */ +#define PL_TEST_EXPECT_FENV(f, e) diff --git a/pl/math/test/pl_test.h b/pl/math/test/pl_test.h index d4901b1..66dc73a 100644 --- a/pl/math/test/pl_test.h +++ b/pl/math/test/pl_test.h @@ -6,9 +6,24 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception. 
*/ -/* Emit the max ULP threshold, l, for routine f. */ +/* Emit the max ULP threshold, l, for routine f. Piggy-back PL_TEST_EXPECT_FENV + on PL_TEST_ULP to add EXPECT_FENV to all scalar routines. */ +#if !(V_SUPPORTED || SV_SUPPORTED) +#define PL_TEST_ULP(f, l) \ + PL_TEST_EXPECT_FENV (f, 1) \ + PL_TEST_ULP f l +#else #define PL_TEST_ULP(f, l) PL_TEST_ULP f l +#endif /* Emit aliases to allow test params to be mapped from aliases back to their aliasees. */ #define PL_ALIAS(a, b) PL_TEST_ALIAS a b + +/* Emit routine name if e == 1 and f is expected to correctly trigger fenv + exceptions. e allows declaration to be emitted conditionally upon certain + build flags - defer expansion by one pass to allow those flags to be expanded + properly. */ +#define PL_TEST_EXPECT_FENV(f, e) PL_TEST_EXPECT_FENV_ (f, e) +#define PL_TEST_EXPECT_FENV_(f, e) PL_TEST_EXPECT_FENV_##e (f) +#define PL_TEST_EXPECT_FENV_1(f) PL_TEST_EXPECT_FENV_ENABLED f diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 3e31c00..d6c3196 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -24,7 +24,9 @@ t() { key=$(cat $ALIASES | { grep " $1$" || echo $1; } | awk '{print $1}') L=$(cat $LIMITS | grep "^$key " | awk '{print $2}') [[ $L =~ ^[0-9]+\.[0-9]+$ ]] - $emu ./ulp -e $L $flags ${5:-} $1 $2 $3 $4 && PASS=$((PASS+1)) || FAIL=$((FAIL+1)) + extra_flags="${5:-}" + grep -q "^$key$" $FENV || extra_flags="$extra_flags -f" + $emu ./ulp -e $L $flags ${extra_flags} $1 $2 $3 $4 && PASS=$((PASS+1)) || FAIL=$((FAIL+1)) } check() { @@ -584,7 +586,7 @@ range_sve_erfc=' 0 inf 40000 ' -while read G F R D A +while read G F R A do [ "$R" = 1 ] && { [[ $G != sve_* ]] || [ $WANT_SVE_MATH -eq 1 ]; } || continue case "$G" in \#*) continue ;; esac @@ -592,30 +594,8 @@ do while read X do [ -n "$X" ] || continue - # fenv checking is enabled by default, but we almost - # always want to disable it for vector routines. There - # are, however, a small number of vector routines in - # pl/math which are supposed to set fenv correctly - # when WANT_ERRNO is enabled. A hack is needed to - # ensure fenv checking is enabled for routines where - # this is the case. Pass "fenv" as fourth argument to - # prevent -f being added to the run line when - # WANT_ERRNO is enabled. - f="-f" - if [ $WANT_ERRNO -eq 1 ]; then - if [ "$D" = "fenv" ]; then - f="" - elif [ "$D" = "nofenv" ]; then - # Need to pass this if you want additional - # arguments but keep fenv checking disabled. - f="-f" - elif [ ! 
-z "$D" ]; then - echo "Unrecognised 4th argument: $D" - exit 1 - fi - fi case "$X" in \#*) continue ;; esac - t $F $X "$A $f" + t $F $X "$A" done << EOF $range EOF @@ -646,18 +626,18 @@ log2 __s_log2 $runs log2 __v_log2 $runv log2 __vn_log2 $runvn log2 _ZGVnN2v_log2 $runvn -expm1 __s_expm1 $runs fenv -expm1 __v_expm1 $runv fenv -expm1 __vn_expm1 $runvn fenv -expm1 _ZGVnN2v_expm1 $runvn fenv -sinh __s_sinh $runs fenv -sinh __v_sinh $runv fenv -sinh __vn_sinh $runvn fenv -sinh _ZGVnN2v_sinh $runvn fenv -cosh __s_cosh $runs fenv -cosh __v_cosh $runv fenv -cosh __vn_cosh $runvn fenv -cosh _ZGVnN2v_cosh $runvn fenv +expm1 __s_expm1 $runs +expm1 __v_expm1 $runv +expm1 __vn_expm1 $runvn +expm1 _ZGVnN2v_expm1 $runvn +sinh __s_sinh $runs +sinh __v_sinh $runv +sinh __vn_sinh $runvn +sinh _ZGVnN2v_sinh $runvn +cosh __s_cosh $runs +cosh __v_cosh $runv +cosh __vn_cosh $runvn +cosh _ZGVnN2v_cosh $runvn atanf __s_atanf $runs atanf __v_atanf $runv @@ -679,62 +659,62 @@ log10f __s_log10f $runs log10f __v_log10f $runv log10f __vn_log10f $runvn log10f _ZGVnN4v_log10f $runvn -log1pf __s_log1pf $runs fenv -log1pf __v_log1pf $runv fenv -log1pf __vn_log1pf $runvn fenv -log1pf _ZGVnN4v_log1pf $runvn fenv -asinhf __s_asinhf $runs fenv -asinhf __v_asinhf $runv fenv -asinhf __vn_asinhf $runvn fenv -asinhf _ZGVnN4v_asinhf $runvn fenv -log2f __s_log2f $runs fenv -log2f __v_log2f $runv fenv -log2f __vn_log2f $runvn fenv -log2f _ZGVnN4v_log2f $runvn fenv -tanf __s_tanf $runs fenv -tanf __v_tanf $runv fenv -tanf __vn_tanf $runvn fenv -tanf _ZGVnN4v_tanf $runvn fenv -log1p __s_log1p $runs fenv -log1p __v_log1p $runv fenv -log1p __vn_log1p $runvn fenv -log1p _ZGVnN2v_log1p $runvn fenv -expm1f __s_expm1f $runs fenv -expm1f __v_expm1f $runv fenv -expm1f __vn_expm1f $runvn fenv -expm1f _ZGVnN4v_expm1f $runvn fenv -sinhf __s_sinhf $runs fenv -sinhf __v_sinhf $runv fenv -sinhf __vn_sinhf $runvn fenv -sinhf _ZGVnN4v_sinhf $runvn fenv -coshf __s_coshf $runs fenv -coshf __v_coshf $runv fenv -coshf __vn_coshf $runvn fenv -coshf _ZGVnN4v_coshf $runvn fenv -atanhf __s_atanhf $runs fenv -c 0 -atanhf __v_atanhf $runv fenv -c 0 -atanhf __vn_atanhf $runvn fenv -c 0 -atanhf _ZGVnN4v_atanhf $runvn fenv -c 0 -cbrtf __s_cbrtf $runs fenv -cbrtf __v_cbrtf $runv fenv -cbrtf __vn_cbrtf $runvn fenv -cbrtf _ZGVnN4v_cbrtf $runvn fenv -asinh __s_asinh $runs fenv +log1pf __s_log1pf $runs +log1pf __v_log1pf $runv +log1pf __vn_log1pf $runvn +log1pf _ZGVnN4v_log1pf $runvn +asinhf __s_asinhf $runs +asinhf __v_asinhf $runv +asinhf __vn_asinhf $runvn +asinhf _ZGVnN4v_asinhf $runvn +log2f __s_log2f $runs +log2f __v_log2f $runv +log2f __vn_log2f $runvn +log2f _ZGVnN4v_log2f $runvn +tanf __s_tanf $runs +tanf __v_tanf $runv +tanf __vn_tanf $runvn +tanf _ZGVnN4v_tanf $runvn +log1p __s_log1p $runs +log1p __v_log1p $runv +log1p __vn_log1p $runvn +log1p _ZGVnN2v_log1p $runvn +expm1f __s_expm1f $runs +expm1f __v_expm1f $runv +expm1f __vn_expm1f $runvn +expm1f _ZGVnN4v_expm1f $runvn +sinhf __s_sinhf $runs +sinhf __v_sinhf $runv +sinhf __vn_sinhf $runvn +sinhf _ZGVnN4v_sinhf $runvn +coshf __s_coshf $runs +coshf __v_coshf $runv +coshf __vn_coshf $runvn +coshf _ZGVnN4v_coshf $runvn +atanhf __s_atanhf $runs -c 0 +atanhf __v_atanhf $runv -c 0 +atanhf __vn_atanhf $runvn -c 0 +atanhf _ZGVnN4v_atanhf $runvn -c 0 +cbrtf __s_cbrtf $runs +cbrtf __v_cbrtf $runv +cbrtf __vn_cbrtf $runvn +cbrtf _ZGVnN4v_cbrtf $runvn +asinh __s_asinh $runs # Test vector asinh 3 times, with control lane < 1, > 1 and special. # Ensures the v_sel is choosing the right option in all cases. 
-asinh __v_asinh $runv fenv -c 0.5 -asinh __vn_asinh $runvn fenv -c 0.5 -asinh _ZGVnN2v_asinh $runvn fenv -c 0.5 -asinh __v_asinh $runv fenv -c 2 -asinh __vn_asinh $runvn fenv -c 2 -asinh _ZGVnN2v_asinh $runvn fenv -c 2 -asinh __v_asinh $runv fenv -c 0x1p600 -asinh __vn_asinh $runvn fenv -c 0x1p600 -asinh _ZGVnN2v_asinh $runvn fenv -c 0x1p600 -tanhf __s_tanhf $runs fenv -tanhf __v_tanhf $runv fenv -tanhf __vn_tanhf $runvn fenv -tanhf _ZGVnN4v_tanhf $runvn fenv +asinh __v_asinh $runv -c 0.5 +asinh __vn_asinh $runvn -c 0.5 +asinh _ZGVnN2v_asinh $runvn -c 0.5 +asinh __v_asinh $runv -c 2 +asinh __vn_asinh $runvn -c 2 +asinh _ZGVnN2v_asinh $runvn -c 2 +asinh __v_asinh $runv -c 0x1p600 +asinh __vn_asinh $runvn -c 0x1p600 +asinh _ZGVnN2v_asinh $runvn -c 0x1p600 +tanhf __s_tanhf $runs +tanhf __v_tanhf $runv +tanhf __vn_tanhf $runvn +tanhf _ZGVnN4v_tanhf $runvn sve_cosf __sv_cosf $runsv sve_cosf _ZGVsMxv_cosf $runsv diff --git a/pl/math/v_asinh_2u5.c b/pl/math/v_asinh_2u5.c index d7f9a50..23171a1 100644 --- a/pl/math/v_asinh_2u5.c +++ b/pl/math/v_asinh_2u5.c @@ -156,4 +156,5 @@ VPCS_ALIAS PL_SIG (V, D, 1, asinh, -10.0, 10.0) PL_TEST_ULP (V_NAME (asinh), 1.54) +PL_TEST_EXPECT_FENV (V_NAME (asinh), WANT_ERRNO) #endif diff --git a/pl/math/v_asinhf_2u7.c b/pl/math/v_asinhf_2u7.c index 812e28f..32fe773 100644 --- a/pl/math/v_asinhf_2u7.c +++ b/pl/math/v_asinhf_2u7.c @@ -57,4 +57,5 @@ VPCS_ALIAS PL_SIG (V, F, 1, asinh, -10.0, 10.0) PL_TEST_ULP (V_NAME (asinhf), 2.17) +PL_TEST_EXPECT_FENV (V_NAME (asinhf), WANT_ERRNO) #endif diff --git a/pl/math/v_atanhf_3u1.c b/pl/math/v_atanhf_3u1.c index c950c46..4cff1fc 100644 --- a/pl/math/v_atanhf_3u1.c +++ b/pl/math/v_atanhf_3u1.c @@ -51,4 +51,5 @@ VPCS_ALIAS PL_SIG (V, F, 1, atanh, -1.0, 1.0) PL_TEST_ULP (V_NAME (atanhf), 2.59) +PL_TEST_EXPECT_FENV (V_NAME (atanhf), WANT_ERRNO) #endif diff --git a/pl/math/v_cbrtf_1u5.c b/pl/math/v_cbrtf_1u5.c index b5f4c72..756a468 100644 --- a/pl/math/v_cbrtf_1u5.c +++ b/pl/math/v_cbrtf_1u5.c @@ -89,4 +89,5 @@ VPCS_ALIAS PL_SIG (V, F, 1, cbrt, -10.0, 10.0) PL_TEST_ULP (V_NAME (cbrtf), 1.03) +PL_TEST_EXPECT_FENV (V_NAME (cbrtf), WANT_ERRNO) #endif diff --git a/pl/math/v_cosh_2u.c b/pl/math/v_cosh_2u.c index 1cac350..63f877e 100644 --- a/pl/math/v_cosh_2u.c +++ b/pl/math/v_cosh_2u.c @@ -87,4 +87,5 @@ VPCS_ALIAS PL_SIG (V, D, 1, cosh, -10.0, 10.0) PL_TEST_ULP (V_NAME (cosh), 1.43) +PL_TEST_EXPECT_FENV (V_NAME (cosh), WANT_ERRNO) #endif diff --git a/pl/math/v_coshf_2u4.c b/pl/math/v_coshf_2u4.c index b0a2be1..f101681 100644 --- a/pl/math/v_coshf_2u4.c +++ b/pl/math/v_coshf_2u4.c @@ -63,4 +63,5 @@ VPCS_ALIAS PL_SIG (V, F, 1, cosh, -10.0, 10.0) PL_TEST_ULP (V_NAME (coshf), 1.89) +PL_TEST_EXPECT_FENV (V_NAME (coshf), WANT_ERRNO) #endif diff --git a/pl/math/v_expm1_2u5.c b/pl/math/v_expm1_2u5.c index e0a31a5..216bdbc 100644 --- a/pl/math/v_expm1_2u5.c +++ b/pl/math/v_expm1_2u5.c @@ -103,4 +103,5 @@ VPCS_ALIAS PL_SIG (V, D, 1, expm1, -9.9, 9.9) PL_TEST_ULP (V_NAME (expm1), 1.68) +PL_TEST_EXPECT_FENV (V_NAME (expm1), WANT_ERRNO) #endif diff --git a/pl/math/v_expm1f_1u6.c b/pl/math/v_expm1f_1u6.c index dd211f9..6e47fac 100644 --- a/pl/math/v_expm1f_1u6.c +++ b/pl/math/v_expm1f_1u6.c @@ -86,4 +86,5 @@ VPCS_ALIAS PL_SIG (V, F, 1, expm1, -9.9, 9.9) PL_TEST_ULP (V_NAME (expm1f), 1.02) +PL_TEST_EXPECT_FENV (V_NAME (expm1f), WANT_ERRNO) #endif diff --git a/pl/math/v_log1p_2u5.c b/pl/math/v_log1p_2u5.c index 889fac0..7a8c6bf 100644 --- a/pl/math/v_log1p_2u5.c +++ b/pl/math/v_log1p_2u5.c @@ -107,4 +107,5 @@ VPCS_ALIAS PL_SIG (V, D, 1, log1p, -0.9, 
10.0) PL_TEST_ULP (V_NAME (log1p), 1.97) +PL_TEST_EXPECT_FENV (V_NAME (log1p), WANT_ERRNO) #endif diff --git a/pl/math/v_log1pf_2u1.c b/pl/math/v_log1pf_2u1.c index 93c896b..f351ecd 100644 --- a/pl/math/v_log1pf_2u1.c +++ b/pl/math/v_log1pf_2u1.c @@ -147,4 +147,5 @@ VPCS_ALIAS PL_SIG (V, F, 1, log1p, -0.9, 10.0) PL_TEST_ULP (V_NAME (log1pf), 1.53) +PL_TEST_EXPECT_FENV (V_NAME (log1pf), WANT_ERRNO) #endif diff --git a/pl/math/v_log2f_2u6.c b/pl/math/v_log2f_2u6.c index 8d7d138..a3c9aac 100644 --- a/pl/math/v_log2f_2u6.c +++ b/pl/math/v_log2f_2u6.c @@ -122,4 +122,5 @@ VPCS_ALIAS PL_SIG (V, F, 1, log2, 0.01, 11.1) PL_TEST_ULP (V_NAME (log2f), 2.10) +PL_TEST_EXPECT_FENV (V_NAME (log2f), WANT_ERRNO) #endif diff --git a/pl/math/v_sinh_3u.c b/pl/math/v_sinh_3u.c index 7d6b612..bab8896 100644 --- a/pl/math/v_sinh_3u.c +++ b/pl/math/v_sinh_3u.c @@ -46,4 +46,5 @@ VPCS_ALIAS PL_SIG (V, D, 1, sinh, -10.0, 10.0) PL_TEST_ULP (V_NAME (sinh), 2.08) +PL_TEST_EXPECT_FENV (V_NAME (sinh), WANT_ERRNO) #endif diff --git a/pl/math/v_sinhf_2u3.c b/pl/math/v_sinhf_2u3.c index 76bfe78..ecedf55 100644 --- a/pl/math/v_sinhf_2u3.c +++ b/pl/math/v_sinhf_2u3.c @@ -45,4 +45,5 @@ VPCS_ALIAS PL_SIG (V, F, 1, sinh, -10.0, 10.0) PL_TEST_ULP (V_NAME (sinhf), 1.76) +PL_TEST_EXPECT_FENV (V_NAME (sinhf), WANT_ERRNO) #endif diff --git a/pl/math/v_tanf_3u2.c b/pl/math/v_tanf_3u2.c index 73b0807..51ede3c 100644 --- a/pl/math/v_tanf_3u2.c +++ b/pl/math/v_tanf_3u2.c @@ -119,4 +119,5 @@ VPCS_ALIAS PL_SIG (V, F, 1, tan, -3.1, 3.1) PL_TEST_ULP (V_NAME (tanf), 2.7) +PL_TEST_EXPECT_FENV (V_NAME (tanf), WANT_ERRNO) #endif diff --git a/pl/math/v_tanhf_2u6.c b/pl/math/v_tanhf_2u6.c index bb86794..ae87f50 100644 --- a/pl/math/v_tanhf_2u6.c +++ b/pl/math/v_tanhf_2u6.c @@ -92,4 +92,5 @@ VPCS_ALIAS PL_SIG (V, F, 1, tanh, -10.0, 10.0) PL_TEST_ULP (V_NAME (tanhf), 2.09) +PL_TEST_EXPECT_FENV (V_NAME (tanhf), WANT_ERRNO) #endif -- cgit v1.2.3 From 202e46317ee8983516b6413066a57bd624ffa044 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Thu, 15 Dec 2022 13:28:06 +0000 Subject: pl/math: Move test intervals to routine source files To conclude the work on simplifying the runulp.sh script, a new macro has been introduced to specify the intervals in which a routine should be tested in the routine source. This is eventually consumed by runulp.sh. 
--- pl/math/Dir.mk | 31 ++- pl/math/acosh_3u.c | 5 + pl/math/acoshf_2u8.c | 5 + pl/math/asinh_2u5.c | 7 + pl/math/asinhf_3u5.c | 4 + pl/math/atan2_2u5.c | 5 + pl/math/atan2f_3u.c | 5 + pl/math/atanhf_3u1.c | 6 + pl/math/cbrtf_1u5.c | 2 + pl/math/cosh_2u.c | 6 + pl/math/coshf_1u9.c | 6 + pl/math/erfc_4u5.c | 6 + pl/math/erfcf_2u.c | 6 + pl/math/erff_1u5.c | 6 + pl/math/expm1_2u5.c | 6 + pl/math/expm1f_1u6.c | 4 + pl/math/include/pl_test.h | 3 + pl/math/log10_2u.c | 3 + pl/math/log10f.c | 5 + pl/math/log1p_2u.c | 8 + pl/math/log1pf_2u1.c | 8 + pl/math/sinh_3u.c | 6 + pl/math/sinhf_2u3.c | 6 + pl/math/sv_atan2_2u5.c | 5 + pl/math/sv_atan2f_3u.c | 5 + pl/math/sv_atan_2u5.c | 5 + pl/math/sv_atanf_2u9.c | 5 + pl/math/sv_cos_2u5.c | 2 + pl/math/sv_cosf_2u1.c | 2 + pl/math/sv_erf_2u5.c | 8 + pl/math/sv_erfc_4u.c | 6 + pl/math/sv_erff_1u3.c | 8 + pl/math/sv_expf_2u.c | 8 + pl/math/sv_log10_2u5.c | 6 + pl/math/sv_log10f_3u5.c | 6 + pl/math/sv_log_2u5.c | 6 + pl/math/sv_logf_3u4.c | 6 + pl/math/sv_sin_3u.c | 2 + pl/math/sv_sinf_1u9.c | 2 + pl/math/sv_tanf_3u2.c | 8 + pl/math/tanf_3u3.c | 13 + pl/math/tanhf_2u6.c | 6 + pl/math/test/pl_test.h | 3 + pl/math/test/runulp.sh | 695 +--------------------------------------------- pl/math/v_asinh_2u5.c | 14 + pl/math/v_asinhf_2u7.c | 8 + pl/math/v_atan2_3u.c | 5 + pl/math/v_atan2f_3u.c | 5 + pl/math/v_atan_2u5.c | 5 + pl/math/v_atanf_3u.c | 5 + pl/math/v_atanhf_3u1.c | 6 + pl/math/v_cbrtf_1u5.c | 2 + pl/math/v_cosh_2u.c | 4 + pl/math/v_coshf_2u4.c | 6 + pl/math/v_erf_2u.c | 6 + pl/math/v_erfc_4u.c | 6 + pl/math/v_erfcf_1u.c | 6 + pl/math/v_erff_1u5.c | 6 + pl/math/v_expm1_2u5.c | 6 + pl/math/v_expm1f_1u6.c | 4 + pl/math/v_log10_2u5.c | 3 + pl/math/v_log10f_3u5.c | 2 + pl/math/v_log1p_2u5.c | 8 + pl/math/v_log1pf_2u1.c | 8 + pl/math/v_log2_3u.c | 6 + pl/math/v_log2f_2u6.c | 6 + pl/math/v_sinh_3u.c | 6 + pl/math/v_sinhf_2u3.c | 6 + pl/math/v_tanf_3u2.c | 8 + pl/math/v_tanhf_2u6.c | 6 + 70 files changed, 416 insertions(+), 693 deletions(-) diff --git a/pl/math/Dir.mk b/pl/math/Dir.mk index b27bcee..1433b7b 100644 --- a/pl/math/Dir.mk +++ b/pl/math/Dir.mk @@ -148,10 +148,11 @@ ulp-input-dir=$(B)/test/inputs math-lib-lims = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.ulp,$(basename $(math-lib-srcs))) math-lib-aliases = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.alias,$(basename $(math-lib-srcs))) math-lib-fenvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.fenv,$(basename $(math-lib-srcs))) +math-lib-itvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.itv,$(basename $(math-lib-srcs))) -$(math-lib-lims): CFLAGS_PL += -I$(PLM) -I$(PLM)/include $(math-cflags) -$(math-lib-aliases): CFLAGS_PL += -I$(PLM) -I$(PLM)/include $(math-cflags) -$(math-lib-fenvs): CFLAGS_PL += -I$(PLM) -I$(PLM)/include $(math-cflags) +ulp-inputs = $(math-lib-lims) $(math-lib-aliases) $(math-lib-fenvs) $(math-lib-itvs) + +$(ulp-inputs): CFLAGS_PL += -I$(PLM) -I$(PLM)/include $(math-cflags) $(ulp-input-dir)/%.ulp: $(PLM)/%.c mkdir -p $(@D) @@ -165,6 +166,10 @@ $(ulp-input-dir)/%.fenv: $(PLM)/%.c mkdir -p $(@D) $(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep -o "PL_TEST_EXPECT_FENV_ENABLED [^ ]*" || true; } > $@ +$(ulp-input-dir)/%.itv: $(PLM)/%.c + mkdir -p $(dir $@) + $(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep "PL_TEST_INTERVAL " || true; } | sed "s/ PL_TEST_INTERVAL/\nPL_TEST_INTERVAL/g" > $@ + ulp-lims := $(ulp-input-dir)/limits $(ulp-lims): $(math-lib-lims) cat $^ | sed "s/PL_TEST_ULP //g;s/^ *//g" > $@ @@ -177,12 +182,30 @@ fenv-exps := $(ulp-input-dir)/fenv $(fenv-exps): $(math-lib-fenvs) cat $^ 
| sed "s/PL_TEST_EXPECT_FENV_ENABLED //g;s/^ *//g" > $@ -check-pl/math-ulp: $(math-tools) $(ulp-lims) $(ulp-aliases) $(fenv-exps) +ulp-itvs-noalias := $(ulp-input-dir)/itvs_noalias +$(ulp-itvs-noalias): $(math-lib-itvs) + cat $^ > $@ + +rename-aliases := $(ulp-input-dir)/rename_alias.sed +$(rename-aliases): $(ulp-aliases) + # Build sed script for replacing aliases from generated alias file + cat $< | awk '{ print "s/ " $$1 " / " $$2 " /g" }' > $@ + +ulp-itvs-alias := $(ulp-input-dir)/itvs_alias +$(ulp-itvs-alias): $(ulp-itvs-noalias) $(rename-aliases) + cat $< | sed -f $(rename-aliases) > $@ + +ulp-itvs := $(ulp-input-dir)/intervals +$(ulp-itvs): $(ulp-itvs-alias) $(ulp-itvs-noalias) + cat $^ | sort -u | sed "s/PL_TEST_INTERVAL //g" > $@ + +check-pl/math-ulp: $(math-tools) $(ulp-lims) $(ulp-aliases) $(fenv-exps) $(ulp-itvs) WANT_ERRNO=$(WANT_ERRNO) \ WANT_SVE_MATH=$(WANT_SVE_MATH) \ ULPFLAGS="$(math-ulpflags)" \ LIMITS=../../../$(ulp-lims) \ ALIASES=../../../$(ulp-aliases) \ + INTERVALS=../../../$(ulp-itvs) \ FENV=../../../$(fenv-exps) \ build/pl/bin/runulp.sh $(EMULATOR) diff --git a/pl/math/acosh_3u.c b/pl/math/acosh_3u.c index f135b5d..d2c195f 100644 --- a/pl/math/acosh_3u.c +++ b/pl/math/acosh_3u.c @@ -58,3 +58,8 @@ acosh (double x) PL_SIG (S, D, 1, acosh, 1.0, 10.0) PL_TEST_ULP (acosh, 2.19) +PL_TEST_INTERVAL (acosh, 0, 1, 10000) +PL_TEST_INTERVAL (acosh, 1, 2, 100000) +PL_TEST_INTERVAL (acosh, 2, 0x1p511, 100000) +PL_TEST_INTERVAL (acosh, 0x1p511, inf, 100000) +PL_TEST_INTERVAL (acosh, -0, -inf, 10000) diff --git a/pl/math/acoshf_2u8.c b/pl/math/acoshf_2u8.c index 0f9824d..bd9c561 100644 --- a/pl/math/acoshf_2u8.c +++ b/pl/math/acoshf_2u8.c @@ -55,3 +55,8 @@ acoshf (float x) PL_SIG (S, F, 1, acosh, 1.0, 10.0) PL_TEST_ULP (acoshf, 2.30) +PL_TEST_INTERVAL (acoshf, 0, 1, 100) +PL_TEST_INTERVAL (acoshf, 1, 2, 10000) +PL_TEST_INTERVAL (acoshf, 2, 0x1p64, 100000) +PL_TEST_INTERVAL (acoshf, 0x1p64, inf, 100000) +PL_TEST_INTERVAL (acoshf, -0, -inf, 10000) diff --git a/pl/math/asinh_2u5.c b/pl/math/asinh_2u5.c index 44435be..064d81e 100644 --- a/pl/math/asinh_2u5.c +++ b/pl/math/asinh_2u5.c @@ -77,3 +77,10 @@ asinh (double x) PL_SIG (S, D, 1, asinh, -10.0, 10.0) PL_TEST_ULP (asinh, 1.54) +PL_TEST_INTERVAL (asinh, -0x1p-26, 0x1p-26, 50000) +PL_TEST_INTERVAL (asinh, 0x1p-26, 1.0, 40000) +PL_TEST_INTERVAL (asinh, -0x1p-26, -1.0, 10000) +PL_TEST_INTERVAL (asinh, 1.0, 100.0, 40000) +PL_TEST_INTERVAL (asinh, -1.0, -100.0, 10000) +PL_TEST_INTERVAL (asinh, 100.0, inf, 50000) +PL_TEST_INTERVAL (asinh, -100.0, -inf, 10000) diff --git a/pl/math/asinhf_3u5.c b/pl/math/asinhf_3u5.c index 36c332d..2429e82 100644 --- a/pl/math/asinhf_3u5.c +++ b/pl/math/asinhf_3u5.c @@ -71,3 +71,7 @@ asinhf (float x) PL_SIG (S, F, 1, asinh, -10.0, 10.0) PL_TEST_ULP (asinhf, 2.9) +PL_TEST_INTERVAL (asinhf, 0, 0x1p-12, 5000) +PL_TEST_INTERVAL (asinhf, 0x1p-12, 1.0, 50000) +PL_TEST_INTERVAL (asinhf, 1.0, 0x1p11, 50000) +PL_TEST_INTERVAL (asinhf, 0x1p11, 0x1p127, 20000) diff --git a/pl/math/atan2_2u5.c b/pl/math/atan2_2u5.c index fb5ced4..ba39d9a 100644 --- a/pl/math/atan2_2u5.c +++ b/pl/math/atan2_2u5.c @@ -152,3 +152,8 @@ atan2 (double y, double x) /* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. 
*/ PL_SIG (S, D, 2, atan2) PL_TEST_ULP (atan2, 1.78) +PL_TEST_INTERVAL (atan2, -10.0, 10.0, 50000) +PL_TEST_INTERVAL (atan2, -1.0, 1.0, 40000) +PL_TEST_INTERVAL (atan2, 0.0, 1.0, 40000) +PL_TEST_INTERVAL (atan2, 1.0, 100.0, 40000) +PL_TEST_INTERVAL (atan2, 1e6, 1e32, 40000) diff --git a/pl/math/atan2f_3u.c b/pl/math/atan2f_3u.c index 0a3e975..e84ea0b 100644 --- a/pl/math/atan2f_3u.c +++ b/pl/math/atan2f_3u.c @@ -160,3 +160,8 @@ atan2f (float y, float x) /* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */ PL_SIG (S, F, 2, atan2) PL_TEST_ULP (atan2f, 2.4) +PL_TEST_INTERVAL (atan2f, -10.0, 10.0, 50000) +PL_TEST_INTERVAL (atan2f, -1.0, 1.0, 40000) +PL_TEST_INTERVAL (atan2f, 0.0, 1.0, 40000) +PL_TEST_INTERVAL (atan2f, 1.0, 100.0, 40000) +PL_TEST_INTERVAL (atan2f, 1e6, 1e32, 40000) diff --git a/pl/math/atanhf_3u1.c b/pl/math/atanhf_3u1.c index 47b9486..c7f80b0 100644 --- a/pl/math/atanhf_3u1.c +++ b/pl/math/atanhf_3u1.c @@ -79,3 +79,9 @@ atanhf (float x) PL_SIG (S, F, 1, atanh, -1.0, 1.0) PL_TEST_ULP (atanhf, 2.59) +PL_TEST_INTERVAL (atanhf, 0, 0x1p-12, 500) +PL_TEST_INTERVAL (atanhf, 0x1p-12, 1, 200000) +PL_TEST_INTERVAL (atanhf, 1, inf, 1000) +PL_TEST_INTERVAL (atanhf, -0, -0x1p-12, 500) +PL_TEST_INTERVAL (atanhf, -0x1p-12, -1, 200000) +PL_TEST_INTERVAL (atanhf, -1, -inf, 1000) diff --git a/pl/math/cbrtf_1u5.c b/pl/math/cbrtf_1u5.c index c6d1de9..86a6088 100644 --- a/pl/math/cbrtf_1u5.c +++ b/pl/math/cbrtf_1u5.c @@ -63,3 +63,5 @@ cbrtf (float x) PL_SIG (S, F, 1, cbrt, -10.0, 10.0) PL_TEST_ULP (cbrtf, 1.03) +PL_TEST_INTERVAL (cbrtf, 0, inf, 1000000) +PL_TEST_INTERVAL (cbrtf, -0, -inf, 1000000) diff --git a/pl/math/cosh_2u.c b/pl/math/cosh_2u.c index 9e137ff..5ec3b77 100644 --- a/pl/math/cosh_2u.c +++ b/pl/math/cosh_2u.c @@ -58,3 +58,9 @@ cosh (double x) PL_SIG (S, D, 1, cosh, -10.0, 10.0) PL_TEST_ULP (cosh, 1.43) +PL_TEST_INTERVAL (cosh, 0, 0x1.61da04cbafe44p+9, 100000) +PL_TEST_INTERVAL (cosh, -0, -0x1.61da04cbafe44p+9, 100000) +PL_TEST_INTERVAL (cosh, 0x1.61da04cbafe44p+9, 0x1p10, 1000) +PL_TEST_INTERVAL (cosh, -0x1.61da04cbafe44p+9, -0x1p10, 1000) +PL_TEST_INTERVAL (cosh, 0x1p10, inf, 100) +PL_TEST_INTERVAL (cosh, -0x1p10, -inf, 100) diff --git a/pl/math/coshf_1u9.c b/pl/math/coshf_1u9.c index 0e7b30f..2f93f1c 100644 --- a/pl/math/coshf_1u9.c +++ b/pl/math/coshf_1u9.c @@ -63,3 +63,9 @@ coshf (float x) PL_SIG (S, F, 1, cosh, -10.0, 10.0) PL_TEST_ULP (coshf, 1.89) +PL_TEST_INTERVAL (coshf, 0, 0x1p-63, 100) +PL_TEST_INTERVAL (coshf, 0, 0x1.5a92d8p+6, 80000) +PL_TEST_INTERVAL (coshf, 0x1.5a92d8p+6, inf, 2000) +PL_TEST_INTERVAL (coshf, -0, -0x1p-63, 100) +PL_TEST_INTERVAL (coshf, -0, -0x1.5a92d8p+6, 80000) +PL_TEST_INTERVAL (coshf, -0x1.5a92d8p+6, -inf, 2000) diff --git a/pl/math/erfc_4u5.c b/pl/math/erfc_4u5.c index 8394e48..6d4a29a 100644 --- a/pl/math/erfc_4u5.c +++ b/pl/math/erfc_4u5.c @@ -147,3 +147,9 @@ erfc (double x) PL_SIG (S, D, 1, erfc, -6.0, 28.0) PL_TEST_ULP (erfc, 3.56) +PL_TEST_INTERVAL (erfc, 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (erfc, 0x1p-1022, 0x1p-26, 40000) +PL_TEST_INTERVAL (erfc, -0x1p-1022, -0x1p-26, 40000) +PL_TEST_INTERVAL (erfc, 0x1p-26, 0x1p5, 40000) +PL_TEST_INTERVAL (erfc, -0x1p-26, -0x1p3, 40000) +PL_TEST_INTERVAL (erfc, 0, inf, 40000) diff --git a/pl/math/erfcf_2u.c b/pl/math/erfcf_2u.c index f76a11a..7a55000 100644 --- a/pl/math/erfcf_2u.c +++ b/pl/math/erfcf_2u.c @@ -125,3 +125,9 @@ erfcf (float x) PL_SIG (S, F, 1, erfc, -4.0, 10.0) PL_TEST_ULP (erfcf, 1.5) +PL_TEST_INTERVAL (erfcf, 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (erfcf, 
0x1p-127, 0x1p-26, 40000) +PL_TEST_INTERVAL (erfcf, -0x1p-127, -0x1p-26, 40000) +PL_TEST_INTERVAL (erfcf, 0x1p-26, 0x1p5, 40000) +PL_TEST_INTERVAL (erfcf, -0x1p-26, -0x1p3, 40000) +PL_TEST_INTERVAL (erfcf, 0, inf, 40000) diff --git a/pl/math/erff_1u5.c b/pl/math/erff_1u5.c index fa1e55f..3d8cfee 100644 --- a/pl/math/erff_1u5.c +++ b/pl/math/erff_1u5.c @@ -100,3 +100,9 @@ erff (float x) PL_SIG (S, F, 1, erf, -4.0, 4.0) PL_TEST_ULP (erff, 0.6) +PL_TEST_INTERVAL (erff, 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (erff, 0x1p-127, 0x1p-26, 40000) +PL_TEST_INTERVAL (erff, -0x1p-127, -0x1p-26, 40000) +PL_TEST_INTERVAL (erff, 0x1p-26, 0x1p3, 40000) +PL_TEST_INTERVAL (erff, -0x1p-26, -0x1p3, 40000) +PL_TEST_INTERVAL (erff, 0, inf, 40000) diff --git a/pl/math/expm1_2u5.c b/pl/math/expm1_2u5.c index cfde806..60a556e 100644 --- a/pl/math/expm1_2u5.c +++ b/pl/math/expm1_2u5.c @@ -78,3 +78,9 @@ expm1 (double x) PL_SIG (S, D, 1, expm1, -9.9, 9.9) PL_TEST_ULP (expm1, 1.68) +PL_TEST_INTERVAL (expm1, 0, 0x1p-51, 1000) +PL_TEST_INTERVAL (expm1, -0, -0x1p-51, 1000) +PL_TEST_INTERVAL (expm1, 0x1p-51, 0x1.63108c75a1937p+9, 100000) +PL_TEST_INTERVAL (expm1, -0x1p-51, -0x1.740bf7c0d927dp+9, 100000) +PL_TEST_INTERVAL (expm1, 0x1.63108c75a1937p+9, inf, 100) +PL_TEST_INTERVAL (expm1, -0x1.740bf7c0d927dp+9, -inf, 100) diff --git a/pl/math/expm1f_1u6.c b/pl/math/expm1f_1u6.c index 82dc28d..5138865 100644 --- a/pl/math/expm1f_1u6.c +++ b/pl/math/expm1f_1u6.c @@ -74,3 +74,7 @@ expm1f (float x) PL_SIG (S, F, 1, expm1, -9.9, 9.9) PL_TEST_ULP (expm1f, 1.02) +PL_TEST_INTERVAL (expm1f, 0, 0x1p-23, 1000) +PL_TEST_INTERVAL (expm1f, -0, -0x1p-23, 1000) +PL_TEST_INTERVAL (expm1f, 0x1p-23, 0x1.644716p6, 100000) +PL_TEST_INTERVAL (expm1f, -0x1p-23, -0x1.9bbabcp+6, 100000) diff --git a/pl/math/include/pl_test.h b/pl/math/include/pl_test.h index 8999efa..e578a0d 100644 --- a/pl/math/include/pl_test.h +++ b/pl/math/include/pl_test.h @@ -20,3 +20,6 @@ build flags - defer expansion by one pass to allow those flags to be expanded properly. 
*/ #define PL_TEST_EXPECT_FENV(f, e) + +#define PL_TEST_INTERVAL(f, lo, hi, n) +#define PL_TEST_INTERVAL_C(f, lo, hi, n, c) diff --git a/pl/math/log10_2u.c b/pl/math/log10_2u.c index 1827bb9..81f73a8 100644 --- a/pl/math/log10_2u.c +++ b/pl/math/log10_2u.c @@ -145,3 +145,6 @@ log10l (long double x) PL_SIG (S, D, 1, log10, 0.01, 11.1) PL_TEST_ULP (log10, 1.11) +PL_TEST_INTERVAL (log10, 0, 0xffff000000000000, 10000) +PL_TEST_INTERVAL (log10, 0x1p-4, 0x1p4, 40000) +PL_TEST_INTERVAL (log10, 0, inf, 40000) diff --git a/pl/math/log10f.c b/pl/math/log10f.c index 84db420..32de42f 100644 --- a/pl/math/log10f.c +++ b/pl/math/log10f.c @@ -90,3 +90,8 @@ log10f (float x) PL_SIG (S, F, 1, log10, 0.01, 11.1) PL_TEST_ULP (log10f, 0.30) +PL_TEST_INTERVAL (log10f, 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (log10f, 0x1p-127, 0x1p-26, 50000) +PL_TEST_INTERVAL (log10f, 0x1p-26, 0x1p3, 50000) +PL_TEST_INTERVAL (log10f, 0x1p-4, 0x1p4, 50000) +PL_TEST_INTERVAL (log10f, 0, inf, 50000) diff --git a/pl/math/log1p_2u.c b/pl/math/log1p_2u.c index 5a6f798..519df42 100644 --- a/pl/math/log1p_2u.c +++ b/pl/math/log1p_2u.c @@ -125,3 +125,11 @@ log1p (double x) PL_SIG (S, D, 1, log1p, -0.9, 10.0) PL_TEST_ULP (log1p, 1.26) +PL_TEST_INTERVAL (log1p, -10.0, 10.0, 10000) +PL_TEST_INTERVAL (log1p, 0.0, 0x1p-23, 50000) +PL_TEST_INTERVAL (log1p, 0x1p-23, 0.001, 50000) +PL_TEST_INTERVAL (log1p, 0.001, 1.0, 50000) +PL_TEST_INTERVAL (log1p, 0.0, -0x1p-23, 50000) +PL_TEST_INTERVAL (log1p, -0x1p-23, -0.001, 50000) +PL_TEST_INTERVAL (log1p, -0.001, -1.0, 50000) +PL_TEST_INTERVAL (log1p, -1.0, inf, 5000) diff --git a/pl/math/log1pf_2u1.c b/pl/math/log1pf_2u1.c index f791105..cb1d4bc 100644 --- a/pl/math/log1pf_2u1.c +++ b/pl/math/log1pf_2u1.c @@ -154,3 +154,11 @@ log1pf (float x) PL_SIG (S, F, 1, log1p, -0.9, 10.0) PL_TEST_ULP (log1pf, 1.52) +PL_TEST_INTERVAL (log1pf, -10.0, 10.0, 10000) +PL_TEST_INTERVAL (log1pf, 0.0, 0x1p-23, 50000) +PL_TEST_INTERVAL (log1pf, 0x1p-23, 0.001, 50000) +PL_TEST_INTERVAL (log1pf, 0.001, 1.0, 50000) +PL_TEST_INTERVAL (log1pf, 0.0, -0x1p-23, 50000) +PL_TEST_INTERVAL (log1pf, -0x1p-23, -0.001, 50000) +PL_TEST_INTERVAL (log1pf, -0.001, -1.0, 50000) +PL_TEST_INTERVAL (log1pf, -1.0, inf, 5000) diff --git a/pl/math/sinh_3u.c b/pl/math/sinh_3u.c index 86f00a1..52ca156 100644 --- a/pl/math/sinh_3u.c +++ b/pl/math/sinh_3u.c @@ -58,3 +58,9 @@ sinh (double x) PL_SIG (S, D, 1, sinh, -10.0, 10.0) PL_TEST_ULP (sinh, 2.08) +PL_TEST_INTERVAL (sinh, 0, 0x1p-51, 100) +PL_TEST_INTERVAL (sinh, -0, -0x1p-51, 100) +PL_TEST_INTERVAL (sinh, 0x1p-51, 0x1.62e42fefa39fp+9, 100000) +PL_TEST_INTERVAL (sinh, -0x1p-51, -0x1.62e42fefa39fp+9, 100000) +PL_TEST_INTERVAL (sinh, 0x1.62e42fefa39fp+9, inf, 1000) +PL_TEST_INTERVAL (sinh, -0x1.62e42fefa39fp+9, -inf, 1000) diff --git a/pl/math/sinhf_2u3.c b/pl/math/sinhf_2u3.c index 15786d9..38f59b0 100644 --- a/pl/math/sinhf_2u3.c +++ b/pl/math/sinhf_2u3.c @@ -68,3 +68,9 @@ sinhf (float x) PL_SIG (S, F, 1, sinh, -10.0, 10.0) PL_TEST_ULP (sinhf, 1.76) +PL_TEST_INTERVAL (sinhf, 0, 0x1.62e43p+6, 100000) +PL_TEST_INTERVAL (sinhf, -0, -0x1.62e43p+6, 100000) +PL_TEST_INTERVAL (sinhf, 0x1.62e43p+6, 0x1.65a9fap+6, 100) +PL_TEST_INTERVAL (sinhf, -0x1.62e43p+6, -0x1.65a9fap+6, 100) +PL_TEST_INTERVAL (sinhf, 0x1.65a9fap+6, inf, 100) +PL_TEST_INTERVAL (sinhf, -0x1.65a9fap+6, -inf, 100) diff --git a/pl/math/sv_atan2_2u5.c b/pl/math/sv_atan2_2u5.c index 4ab2fea..b230b36 100644 --- a/pl/math/sv_atan2_2u5.c +++ b/pl/math/sv_atan2_2u5.c @@ -85,4 +85,9 @@ PL_ALIAS (__sv_atan2_x, _ZGVsMxvv_atan2) /* Arity of 2 means 
no mathbench entry emitted. See test/mathbench_funcs.h. */ PL_SIG (SV, D, 2, atan2) PL_TEST_ULP (__sv_atan2, 1.78) +PL_TEST_INTERVAL (__sv_atan2, -10.0, 10.0, 50000) +PL_TEST_INTERVAL (__sv_atan2, -1.0, 1.0, 40000) +PL_TEST_INTERVAL (__sv_atan2, 0.0, 1.0, 40000) +PL_TEST_INTERVAL (__sv_atan2, 1.0, 100.0, 40000) +PL_TEST_INTERVAL (__sv_atan2, 1e6, 1e32, 40000) #endif diff --git a/pl/math/sv_atan2f_3u.c b/pl/math/sv_atan2f_3u.c index 90656f0..5e9d59b 100644 --- a/pl/math/sv_atan2f_3u.c +++ b/pl/math/sv_atan2f_3u.c @@ -86,4 +86,9 @@ PL_ALIAS (__sv_atan2f_x, _ZGVsMxvv_atan2f) /* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */ PL_SIG (SV, F, 2, atan2) PL_TEST_ULP (__sv_atan2f, 2.45) +PL_TEST_INTERVAL (__sv_atan2f, -10.0, 10.0, 50000) +PL_TEST_INTERVAL (__sv_atan2f, -1.0, 1.0, 40000) +PL_TEST_INTERVAL (__sv_atan2f, 0.0, 1.0, 40000) +PL_TEST_INTERVAL (__sv_atan2f, 1.0, 100.0, 40000) +PL_TEST_INTERVAL (__sv_atan2f, 1e6, 1e32, 40000) #endif diff --git a/pl/math/sv_atan_2u5.c b/pl/math/sv_atan_2u5.c index 93b39b1..16430a2 100644 --- a/pl/math/sv_atan_2u5.c +++ b/pl/math/sv_atan_2u5.c @@ -54,4 +54,9 @@ PL_ALIAS (__sv_atan_x, _ZGVsMxv_atan) PL_SIG (SV, D, 1, atan, -3.1, 3.1) PL_TEST_ULP (__sv_atan, 1.78) +PL_TEST_INTERVAL (__sv_atan, -10.0, 10.0, 50000) +PL_TEST_INTERVAL (__sv_atan, -1.0, 1.0, 40000) +PL_TEST_INTERVAL (__sv_atan, 0.0, 1.0, 40000) +PL_TEST_INTERVAL (__sv_atan, 1.0, 100.0, 40000) +PL_TEST_INTERVAL (__sv_atan, 1e6, 1e32, 40000) #endif diff --git a/pl/math/sv_atanf_2u9.c b/pl/math/sv_atanf_2u9.c index 386c28e..41f99e5 100644 --- a/pl/math/sv_atanf_2u9.c +++ b/pl/math/sv_atanf_2u9.c @@ -51,4 +51,9 @@ PL_ALIAS (__sv_atanf_x, _ZGVsMxv_atanf) PL_SIG (SV, F, 1, atan, -3.1, 3.1) PL_TEST_ULP (__sv_atanf, 2.9) +PL_TEST_INTERVAL (__sv_atanf, -10.0, 10.0, 50000) +PL_TEST_INTERVAL (__sv_atanf, -1.0, 1.0, 40000) +PL_TEST_INTERVAL (__sv_atanf, 0.0, 1.0, 40000) +PL_TEST_INTERVAL (__sv_atanf, 1.0, 100.0, 40000) +PL_TEST_INTERVAL (__sv_atanf, 1e6, 1e32, 40000) #endif diff --git a/pl/math/sv_cos_2u5.c b/pl/math/sv_cos_2u5.c index 146ca22..a06ab9a 100644 --- a/pl/math/sv_cos_2u5.c +++ b/pl/math/sv_cos_2u5.c @@ -79,4 +79,6 @@ PL_ALIAS (__sv_cos_x, _ZGVsMxv_cos) PL_SIG (SV, D, 1, cos, -3.1, 3.1) PL_TEST_ULP (__sv_cos, 1.61) +PL_TEST_INTERVAL (__sv_cos, 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (__sv_cos, 0x1p-4, 0x1p4, 500000) #endif diff --git a/pl/math/sv_cosf_2u1.c b/pl/math/sv_cosf_2u1.c index fdc4b1e..b8ec846 100644 --- a/pl/math/sv_cosf_2u1.c +++ b/pl/math/sv_cosf_2u1.c @@ -77,4 +77,6 @@ PL_ALIAS (__sv_cosf_x, _ZGVsMxv_cosf) PL_SIG (SV, F, 1, cos, -3.1, 3.1) PL_TEST_ULP (__sv_cosf, 1.57) +PL_TEST_INTERVAL (__sv_cosf, 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (__sv_cosf, 0x1p-4, 0x1p4, 500000) #endif diff --git a/pl/math/sv_erf_2u5.c b/pl/math/sv_erf_2u5.c index f91aa41..b4c9186 100644 --- a/pl/math/sv_erf_2u5.c +++ b/pl/math/sv_erf_2u5.c @@ -92,4 +92,12 @@ PL_ALIAS (__sv_erf_x, _ZGVsMxv_erf) PL_SIG (SV, D, 1, erf, -4.0, 4.0) PL_TEST_ULP (__sv_erf, 1.97) +PL_TEST_INTERVAL (__sv_erf, 0, 0x1p-28, 20000) +PL_TEST_INTERVAL (__sv_erf, 0x1p-28, 1, 60000) +PL_TEST_INTERVAL (__sv_erf, 1, 0x1p28, 60000) +PL_TEST_INTERVAL (__sv_erf, 0x1p28, inf, 20000) +PL_TEST_INTERVAL (__sv_erf, -0, -0x1p-28, 20000) +PL_TEST_INTERVAL (__sv_erf, -0x1p-28, -1, 60000) +PL_TEST_INTERVAL (__sv_erf, -1, -0x1p28, 60000) +PL_TEST_INTERVAL (__sv_erf, -0x1p28, -inf, 20000) #endif diff --git a/pl/math/sv_erfc_4u.c b/pl/math/sv_erfc_4u.c index d426fa9..5b2fc18 100644 --- a/pl/math/sv_erfc_4u.c +++ 
b/pl/math/sv_erfc_4u.c @@ -137,4 +137,10 @@ PL_ALIAS (__sv_erfc_x, _ZGVsMxv_erfc) PL_SIG (SV, D, 1, erfc, -4.0, 10.0) PL_TEST_ULP (__sv_erfc, 3.15) +PL_TEST_INTERVAL (__sv_erfc, 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (__sv_erfc, 0x1p-127, 0x1p-26, 40000) +PL_TEST_INTERVAL (__sv_erfc, -0x1p-127, -0x1p-26, 40000) +PL_TEST_INTERVAL (__sv_erfc, 0x1p-26, 0x1p5, 40000) +PL_TEST_INTERVAL (__sv_erfc, -0x1p-26, -0x1p3, 40000) +PL_TEST_INTERVAL (__sv_erfc, 0, inf, 40000) #endif diff --git a/pl/math/sv_erff_1u3.c b/pl/math/sv_erff_1u3.c index 9589fb3..fb1bef8 100644 --- a/pl/math/sv_erff_1u3.c +++ b/pl/math/sv_erff_1u3.c @@ -93,4 +93,12 @@ PL_ALIAS (__sv_erff_x, _ZGVsMxv_erff) PL_SIG (SV, F, 1, erf, -4.0, 4.0) PL_TEST_ULP (__sv_erff, 0.76) +PL_TEST_INTERVAL (__sv_erff, 0, 0x1p-28, 20000) +PL_TEST_INTERVAL (__sv_erff, 0x1p-28, 1, 60000) +PL_TEST_INTERVAL (__sv_erff, 1, 0x1p28, 60000) +PL_TEST_INTERVAL (__sv_erff, 0x1p28, inf, 20000) +PL_TEST_INTERVAL (__sv_erff, -0, -0x1p-28, 20000) +PL_TEST_INTERVAL (__sv_erff, -0x1p-28, -1, 60000) +PL_TEST_INTERVAL (__sv_erff, -1, -0x1p28, 60000) +PL_TEST_INTERVAL (__sv_erff, -0x1p28, -inf, 20000) #endif diff --git a/pl/math/sv_expf_2u.c b/pl/math/sv_expf_2u.c index f97a762..30a6c62 100644 --- a/pl/math/sv_expf_2u.c +++ b/pl/math/sv_expf_2u.c @@ -145,4 +145,12 @@ PL_ALIAS (__sv_expf_x, _ZGVsMxv_expf) PL_SIG (SV, F, 1, exp, -9.9, 9.9) PL_TEST_ULP (__sv_expf, 1.46) +PL_TEST_INTERVAL (__sv_expf, 0, 0x1p-23, 40000) +PL_TEST_INTERVAL (__sv_expf, 0x1p-23, 1, 50000) +PL_TEST_INTERVAL (__sv_expf, 1, 0x1p23, 50000) +PL_TEST_INTERVAL (__sv_expf, 0x1p23, inf, 50000) +PL_TEST_INTERVAL (__sv_expf, -0, -0x1p-23, 40000) +PL_TEST_INTERVAL (__sv_expf, -0x1p-23, -1, 50000) +PL_TEST_INTERVAL (__sv_expf, -1, -0x1p23, 50000) +PL_TEST_INTERVAL (__sv_expf, -0x1p23, -inf, 50000) #endif // SV_SUPPORTED diff --git a/pl/math/sv_log10_2u5.c b/pl/math/sv_log10_2u5.c index a9b002b..770b964 100644 --- a/pl/math/sv_log10_2u5.c +++ b/pl/math/sv_log10_2u5.c @@ -80,4 +80,10 @@ PL_ALIAS (__sv_log10_x, _ZGVsMxv_log10) PL_SIG (SV, D, 1, log10, 0.01, 11.1) PL_TEST_ULP (__sv_log10, 1.97) +PL_TEST_INTERVAL (__sv_log10, -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (__sv_log10, 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (__sv_log10, 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (__sv_log10, 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (__sv_log10, 1.0, 100, 50000) +PL_TEST_INTERVAL (__sv_log10, 100, inf, 50000) #endif diff --git a/pl/math/sv_log10f_3u5.c b/pl/math/sv_log10f_3u5.c index b29ee80..06c0908 100644 --- a/pl/math/sv_log10f_3u5.c +++ b/pl/math/sv_log10f_3u5.c @@ -79,4 +79,10 @@ PL_ALIAS (__sv_log10f_x, _ZGVsMxv_log10f) PL_SIG (SV, F, 1, log10, 0.01, 11.1) PL_TEST_ULP (__sv_log10f, 2.82) +PL_TEST_INTERVAL (__sv_log10f, -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (__sv_log10f, 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (__sv_log10f, 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (__sv_log10f, 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (__sv_log10f, 1.0, 100, 50000) +PL_TEST_INTERVAL (__sv_log10f, 100, inf, 50000) #endif diff --git a/pl/math/sv_log_2u5.c b/pl/math/sv_log_2u5.c index 8477739..7eeb206 100644 --- a/pl/math/sv_log_2u5.c +++ b/pl/math/sv_log_2u5.c @@ -76,4 +76,10 @@ PL_ALIAS (__sv_log_x, _ZGVsMxv_log) PL_SIG (SV, D, 1, log, 0.01, 11.1) PL_TEST_ULP (__sv_log, 1.68) +PL_TEST_INTERVAL (__sv_log, -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (__sv_log, 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (__sv_log, 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (__sv_log, 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (__sv_log, 1.0, 100, 
50000) +PL_TEST_INTERVAL (__sv_log, 100, inf, 50000) #endif // SV_SUPPORTED diff --git a/pl/math/sv_logf_3u4.c b/pl/math/sv_logf_3u4.c index 8fea406..4ca1ead 100644 --- a/pl/math/sv_logf_3u4.c +++ b/pl/math/sv_logf_3u4.c @@ -68,4 +68,10 @@ PL_ALIAS (__sv_logf_x, _ZGVsMxv_logf) PL_SIG (SV, F, 1, log, 0.01, 11.1) PL_TEST_ULP (__sv_logf, 2.85) +PL_TEST_INTERVAL (__sv_logf, -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (__sv_logf, 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (__sv_logf, 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (__sv_logf, 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (__sv_logf, 1.0, 100, 50000) +PL_TEST_INTERVAL (__sv_logf, 100, inf, 50000) #endif // SV_SUPPORTED diff --git a/pl/math/sv_sin_3u.c b/pl/math/sv_sin_3u.c index 5637ebe..9072ef4 100644 --- a/pl/math/sv_sin_3u.c +++ b/pl/math/sv_sin_3u.c @@ -84,4 +84,6 @@ PL_ALIAS (__sv_sin_x, _ZGVsMxv_sin) PL_SIG (SV, D, 1, sin, -3.1, 3.1) PL_TEST_ULP (__sv_sin, 2.03) +PL_TEST_INTERVAL (__sv_sin, 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (__sv_sin, 0x1p-4, 0x1p4, 500000) #endif diff --git a/pl/math/sv_sinf_1u9.c b/pl/math/sv_sinf_1u9.c index ca26e92..576baea 100644 --- a/pl/math/sv_sinf_1u9.c +++ b/pl/math/sv_sinf_1u9.c @@ -79,4 +79,6 @@ PL_ALIAS (__sv_sinf_x, _ZGVsMxv_sinf) PL_SIG (SV, F, 1, sin, -3.1, 3.1) PL_TEST_ULP (__sv_sinf, 1.40) +PL_TEST_INTERVAL (__sv_sinf, 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (__sv_sinf, 0x1p-4, 0x1p4, 500000) #endif diff --git a/pl/math/sv_tanf_3u2.c b/pl/math/sv_tanf_3u2.c index 8629b05..ca5c5de 100644 --- a/pl/math/sv_tanf_3u2.c +++ b/pl/math/sv_tanf_3u2.c @@ -103,4 +103,12 @@ PL_ALIAS (__sv_tanf_x, _ZGVsMxv_tanf) PL_SIG (SV, F, 1, tan, -3.1, 3.1) PL_TEST_ULP (__sv_tanf, 2.7) +PL_TEST_INTERVAL (__sv_tanf, -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (__sv_tanf, 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (__sv_tanf, 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (__sv_tanf, 0x1p-23, 0.7, 50000) +PL_TEST_INTERVAL (__sv_tanf, 0.7, 1.5, 50000) +PL_TEST_INTERVAL (__sv_tanf, 1.5, 100, 50000) +PL_TEST_INTERVAL (__sv_tanf, 100, 0x1p17, 50000) +PL_TEST_INTERVAL (__sv_tanf, 0x1p17, inf, 50000) #endif diff --git a/pl/math/tanf_3u3.c b/pl/math/tanf_3u3.c index e8784d8..f6673f5 100644 --- a/pl/math/tanf_3u3.c +++ b/pl/math/tanf_3u3.c @@ -195,3 +195,16 @@ tanf (float x) PL_SIG (S, F, 1, tan, -3.1, 3.1) PL_TEST_ULP (tanf, 2.80) +PL_TEST_INTERVAL (tanf, 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (tanf, 0x1p-127, 0x1p-14, 50000) +PL_TEST_INTERVAL (tanf, -0x1p-127, -0x1p-14, 50000) +PL_TEST_INTERVAL (tanf, 0x1p-14, 0.7, 50000) +PL_TEST_INTERVAL (tanf, -0x1p-14, -0.7, 50000) +PL_TEST_INTERVAL (tanf, 0.7, 1.5, 50000) +PL_TEST_INTERVAL (tanf, -0.7, -1.5, 50000) +PL_TEST_INTERVAL (tanf, 1.5, 0x1p17, 50000) +PL_TEST_INTERVAL (tanf, -1.5, -0x1p17, 50000) +PL_TEST_INTERVAL (tanf, 0x1p17, 0x1p54, 50000) +PL_TEST_INTERVAL (tanf, -0x1p17, -0x1p54, 50000) +PL_TEST_INTERVAL (tanf, 0x1p54, inf, 50000) +PL_TEST_INTERVAL (tanf, -0x1p54, -inf, 50000) diff --git a/pl/math/tanhf_2u6.c b/pl/math/tanhf_2u6.c index e6cbbd0..745e5e3 100644 --- a/pl/math/tanhf_2u6.c +++ b/pl/math/tanhf_2u6.c @@ -83,3 +83,9 @@ tanhf (float x) PL_SIG (S, F, 1, tanh, -10.0, 10.0) PL_TEST_ULP (tanhf, 2.09) +PL_TEST_INTERVAL (tanhf, 0, 0x1p-23, 1000) +PL_TEST_INTERVAL (tanhf, -0, -0x1p-23, 1000) +PL_TEST_INTERVAL (tanhf, 0x1p-23, 0x1.205966p+3, 100000) +PL_TEST_INTERVAL (tanhf, -0x1p-23, -0x1.205966p+3, 100000) +PL_TEST_INTERVAL (tanhf, 0x1.205966p+3, inf, 100) +PL_TEST_INTERVAL (tanhf, -0x1.205966p+3, -inf, 100) diff --git a/pl/math/test/pl_test.h b/pl/math/test/pl_test.h index 
66dc73a..9bbcaf1 100644 --- a/pl/math/test/pl_test.h +++ b/pl/math/test/pl_test.h @@ -27,3 +27,6 @@ #define PL_TEST_EXPECT_FENV(f, e) PL_TEST_EXPECT_FENV_ (f, e) #define PL_TEST_EXPECT_FENV_(f, e) PL_TEST_EXPECT_FENV_##e (f) #define PL_TEST_EXPECT_FENV_1(f) PL_TEST_EXPECT_FENV_ENABLED f + +#define PL_TEST_INTERVAL(f, lo, hi, n) PL_TEST_INTERVAL f lo hi n +#define PL_TEST_INTERVAL_C(f, lo, hi, n, c) PL_TEST_INTERVAL f lo hi n c diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index d6c3196..7fa4058 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -24,7 +24,8 @@ t() { key=$(cat $ALIASES | { grep " $1$" || echo $1; } | awk '{print $1}') L=$(cat $LIMITS | grep "^$key " | awk '{print $2}') [[ $L =~ ^[0-9]+\.[0-9]+$ ]] - extra_flags="${5:-}" + extra_flags="" + [[ -z "${5:-}" ]] || extra_flags="$extra_flags -c $5" grep -q "^$key$" $FENV || extra_flags="$extra_flags -f" $emu ./ulp -e $L $flags ${extra_flags} $1 $2 $3 $4 && PASS=$((PASS+1)) || FAIL=$((FAIL+1)) } @@ -33,167 +34,11 @@ check() { $emu ./ulp -f -q "$@" #>/dev/null } -t erff 0 0xffff0000 10000 -t erff 0x1p-127 0x1p-26 40000 -t erff -0x1p-127 -0x1p-26 40000 -t erff 0x1p-26 0x1p3 40000 -t erff -0x1p-26 -0x1p3 40000 -t erff 0 inf 40000 - -t log10f 0 0xffff0000 10000 -t log10f 0x1p-127 0x1p-26 50000 -t log10f 0x1p-26 0x1p3 50000 -t log10f 0x1p-4 0x1p4 50000 -t log10f 0 inf 50000 - -t log10 0 0xffff000000000000 10000 -t log10 0x1p-4 0x1p4 40000 -t log10 0 inf 40000 - -t erfc 0 0xffff0000 10000 -t erfc 0x1p-1022 0x1p-26 40000 -t erfc -0x1p-1022 -0x1p-26 40000 -t erfc 0x1p-26 0x1p5 40000 -t erfc -0x1p-26 -0x1p3 40000 -t erfc 0 inf 40000 - -t erfcf 0 0xffff0000 10000 -t erfcf 0x1p-127 0x1p-26 40000 -t erfcf -0x1p-127 -0x1p-26 40000 -t erfcf 0x1p-26 0x1p5 40000 -t erfcf -0x1p-26 -0x1p3 40000 -t erfcf 0 inf 40000 - -t atan2 -10.0 10.0 50000 -t atan2 -1.0 1.0 40000 -t atan2 0.0 1.0 40000 -t atan2 1.0 100.0 40000 -t atan2 1e6 1e32 40000 +# Regression-test for correct NaN handling in atan2 check atan2 0x1p-1022 0x1p-1000 x 0 0x1p-1022 40000 -# Regression-test for correct NaN handling check atan2 0x1.7887a0a717aefp+1017 0x1.7887a0a717aefp+1017 x -nan -nan check atan2 nan nan x -nan -nan -t atan2f -10.0 10.0 50000 -t atan2f -1.0 1.0 40000 -t atan2f 0.0 1.0 40000 -t atan2f 1.0 100.0 40000 -t atan2f 1e6 1e32 40000 - -t asinhf 0 0x1p-12 5000 -t asinhf 0x1p-12 1.0 50000 -t asinhf 1.0 0x1p11 50000 -t asinhf 0x1p11 0x1p127 20000 - -t asinh -0x1p-26 0x1p-26 50000 -t asinh 0x1p-26 1.0 40000 -t asinh -0x1p-26 -1.0 10000 -t asinh 1.0 100.0 40000 -t asinh -1.0 -100.0 10000 -t asinh 100.0 inf 50000 -t asinh -100.0 -inf 10000 - -t log1p -10.0 10.0 10000 -t log1p 0.0 0x1p-23 50000 -t log1p 0x1p-23 0.001 50000 -t log1p 0.001 1.0 50000 -t log1p 0.0 -0x1p-23 50000 -t log1p -0x1p-23 -0.001 50000 -t log1p -0.001 -1.0 50000 -t log1p -1.0 inf 5000 - -t log1pf -10.0 10.0 10000 -t log1pf 0.0 0x1p-23 50000 -t log1pf 0x1p-23 0.001 50000 -t log1pf 0.001 1.0 50000 -t log1pf 0.0 -0x1p-23 50000 -t log1pf -0x1p-23 -0.001 50000 -t log1pf -0.001 -1.0 50000 -t log1pf -1.0 inf 5000 - -t tanf 0 0xffff0000 10000 -t tanf 0x1p-127 0x1p-14 50000 -t tanf -0x1p-127 -0x1p-14 50000 -t tanf 0x1p-14 0.7 50000 -t tanf -0x1p-14 -0.7 50000 -t tanf 0.7 1.5 50000 -t tanf -0.7 -1.5 50000 -t tanf 1.5 0x1p17 50000 -t tanf -1.5 -0x1p17 50000 -t tanf 0x1p17 0x1p54 50000 -t tanf -0x1p17 -0x1p54 50000 -t tanf 0x1p54 inf 50000 -t tanf -0x1p54 -inf 50000 - -t acoshf 0 1 100 -t acoshf 1 2 10000 -t acoshf 2 0x1p64 100000 -t acoshf 0x1p64 inf 100000 -t acoshf -0 -inf 10000 - -t 
acosh 0 1 10000 -t acosh 1 2 100000 -t acosh 2 0x1p511 100000 -t acosh 0x1p511 inf 100000 -t acosh -0 -inf 10000 - -t expm1f 0 0x1p-23 1000 -t expm1f -0 -0x1p-23 1000 -t expm1f 0x1p-23 0x1.644716p6 100000 -t expm1f -0x1p-23 -0x1.9bbabcp+6 100000 - -t sinhf 0 0x1.62e43p+6 100000 -t sinhf -0 -0x1.62e43p+6 100000 -t sinhf 0x1.62e43p+6 0x1.65a9fap+6 100 -t sinhf -0x1.62e43p+6 -0x1.65a9fap+6 100 -t sinhf 0x1.65a9fap+6 inf 100 -t sinhf -0x1.65a9fap+6 -inf 100 - -t coshf 0 0x1p-63 100 -t coshf 0 0x1.5a92d8p+6 80000 -t coshf 0x1.5a92d8p+6 inf 2000 -t coshf -0 -0x1p-63 100 -t coshf -0 -0x1.5a92d8p+6 80000 -t coshf -0x1.5a92d8p+6 -inf 2000 - -t expm1 0 0x1p-51 1000 -t expm1 -0 -0x1p-51 1000 -t expm1 0x1p-51 0x1.63108c75a1937p+9 100000 -t expm1 -0x1p-51 -0x1.740bf7c0d927dp+9 100000 -t expm1 0x1.63108c75a1937p+9 inf 100 -t expm1 -0x1.740bf7c0d927dp+9 -inf 100 - -t sinh 0 0x1p-51 100 -t sinh -0 -0x1p-51 100 -t sinh 0x1p-51 0x1.62e42fefa39fp+9 100000 -t sinh -0x1p-51 -0x1.62e42fefa39fp+9 100000 -t sinh 0x1.62e42fefa39fp+9 inf 1000 -t sinh -0x1.62e42fefa39fp+9 -inf 1000 - -t cosh 0 0x1.61da04cbafe44p+9 100000 -t cosh -0 -0x1.61da04cbafe44p+9 100000 -t cosh 0x1.61da04cbafe44p+9 0x1p10 1000 -t cosh -0x1.61da04cbafe44p+9 -0x1p10 1000 -t cosh 0x1p10 inf 100 -t cosh -0x1p10 -inf 100 - -t atanhf 0 0x1p-12 500 -t atanhf 0x1p-12 1 200000 -t atanhf 1 inf 1000 -t atanhf -0 -0x1p-12 500 -t atanhf -0x1p-12 -1 200000 -t atanhf -1 -inf 1000 - -t cbrtf 0 inf 1000000 -t cbrtf -0 -inf 1000000 - -t tanhf 0 0x1p-23 1000 -t tanhf -0 -0x1p-23 1000 -t tanhf 0x1p-23 0x1.205966p+3 100000 -t tanhf -0x1p-23 -0x1.205966p+3 100000 -t tanhf 0x1.205966p+3 inf 100 -t tanhf -0x1.205966p+3 -inf 100 - # vector functions flags="${ULPFLAGS:--q}" runs= @@ -220,538 +65,10 @@ check -q -f -e 0 __sv_powi 0 inf x -0 -1000 100000 && runsv=1 check -q -f -e 0 __sv_powi -0 -inf x -0 -1000 100000 && runsv=1 fi -range_erfc=' - 0 0xffff0000 10000 - 0x1p-1022 0x1p-26 40000 - -0x1p-1022 -0x1p-26 40000 - 0x1p-26 0x1p5 40000 - -0x1p-26 -0x1p3 40000 - 0 inf 40000 -' - -range_erfcf=' - 0 0xffff0000 10000 - 0x1p-127 0x1p-26 40000 - -0x1p-127 -0x1p-26 40000 - 0x1p-26 0x1p5 40000 - -0x1p-26 -0x1p3 40000 - 0 inf 40000 -' - -range_log10=' - 0 0xffff000000000000 10000 - 0x1p-4 0x1p4 400000 - 0 inf 400000 -' - -range_log10f=' - 0 0xffff0000 10000 - 0x1p-4 0x1p4 500000 -' - -range_erf=' - 0 0xffff0000 10000 - 0x1p-127 0x1p-26 40000 --0x1p-127 -0x1p-26 40000 - 0x1p-26 0x1p3 40000 --0x1p-26 -0x1p3 40000 - 0 inf 40000 -' - -range_erff=' - 0 0xffff0000 10000 - 0x1p-127 0x1p-26 40000 --0x1p-127 -0x1p-26 40000 - 0x1p-26 0x1p3 40000 --0x1p-26 -0x1p3 40000 - 0 inf 40000 -' - -range_atan2=' - -10.0 10.0 50000 - -1.0 1.0 40000 - 0.0 1.0 40000 - 1.0 100.0 40000 - 1e6 1e32 40000 -' - -range_atan=' - -10.0 10.0 50000 - -1.0 1.0 40000 - 0.0 1.0 40000 - 1.0 100.0 40000 - 1e6 1e32 40000 -' - -range_atan2f=' - -10.0 10.0 50000 - -1.0 1.0 40000 - 0.0 1.0 40000 - 1.0 100.0 40000 - 1e6 1e32 40000 -' - -range_atanf=' - -10.0 10.0 50000 - -1.0 1.0 40000 - 0.0 1.0 40000 - 1.0 100.0 40000 - 1e6 1e32 40000 -' - -range_log1pf=' - -10.0 10.0 10000 - 0.0 0x1p-23 30000 - 0x1p-23 0.001 50000 - 0.001 1.0 50000 - 0.0 -0x1p-23 30000 - -0x1p-23 -0.001 30000 - -0.001 -1.0 50000 - -1.0 inf 1000 -' - -range_asinhf=' - 0 0x1p-12 40000 - 0x1p-12 1.0 40000 - 1.0 0x1p11 40000 - 0x1p11 inf 40000 - 0 -0x1p-12 20000 - -0x1p-12 -1.0 20000 - -1.0 -0x1p11 20000 - -0x1p11 -inf 20000 -' - -range_log2f=' - -0.0 -0x1p126 100 - 0x1p-149 0x1p-126 4000 - 0x1p-126 0x1p-23 50000 - 0x1p-23 1.0 50000 - 1.0 100 50000 - 100 
inf 50000 -' - -range_log2=' - -0.0 -0x1p126 100 - 0x1p-149 0x1p-126 4000 - 0x1p-126 0x1p-23 50000 - 0x1p-23 1.0 50000 - 1.0 100 50000 - 100 inf 50000 -' - -range_tanf=' - -0.0 -0x1p126 100 - 0x1p-149 0x1p-126 4000 - 0x1p-126 0x1p-23 50000 - 0x1p-23 0.7 50000 - 0.7 1.5 50000 - 1.5 100 50000 - 100 0x1p17 50000 - 0x1p17 inf 50000 -' - -range_log1p=' - -10.0 10.0 10000 - 0.0 0x1p-23 50000 - 0x1p-23 0.001 50000 - 0.001 1.0 50000 - 0.0 -0x1p-23 50000 - -0x1p-23 -0.001 50000 - -0.001 -1.0 50000 - -1.0 inf 5000 -' - -range_expm1f=' - 0 0x1p-23 1000 - -0 -0x1p-23 1000 - 0x1p-23 0x1.644716p6 1000000 - -0x1p-23 -0x1.9bbabcp+6 1000000 -' - -range_sinhf=' - 0 0x1.62e43p+6 100000 - -0 -0x1.62e43p+6 100000 - 0x1.62e43p+6 0x1.65a9fap+6 100 - -0x1.62e43p+6 -0x1.65a9fap+6 100 - 0x1.65a9fap+6 inf 100 - -0x1.65a9fap+6 -inf 100 -' - -range_coshf=' - 0 0x1p-63 100 - 0 0x1.5a92d8p+6 80000 - 0x1.5a92d8p+6 inf 2000 - -0 -0x1p-63 100 - -0 -0x1.5a92d8p+6 80000 - -0x1.5a92d8p+6 -inf 2000 -' - -range_expm1=' - 0 0x1p-51 1000 - -0 -0x1p-51 1000 - 0x1p-51 0x1.63108c75a1937p+9 100000 - -0x1p-51 -0x1.740bf7c0d927dp+9 100000 - 0x1.63108c75a1937p+9 inf 100 - -0x1.740bf7c0d927dp+9 -inf 100 -' - -range_sinh=' - 0 0x1p-51 100 - -0 -0x1p-51 100 - 0x1p-51 0x1.62e42fefa39fp+9 100000 - -0x1p-51 -0x1.62e42fefa39fp+9 100000 - 0x1.62e42fefa39fp+9 inf 1000 - -0x1.62e42fefa39fp+9 -inf 1000 -' - -range_cosh=' - 0 0x1.6p9 100000 - -0 -0x1.6p9 100000 - 0x1.6p9 inf 1000 - -0x1.6p9 -inf 1000 -' - -range_atanhf=' - 0 0x1p-12 500 - 0x1p-12 1 200000 - 1 inf 1000 - -0 -0x1p-12 500 - -0x1p-12 -1 200000 - -1 -inf 1000 -' - -range_cbrtf=' - 0 inf 1000000 - -0 -inf 1000000 -' - -range_asinh=' - 0 0x1p-26 50000 - 0x1p-26 1 50000 - 1 0x1p511 50000 - 0x1p511 inf 40000 - -0 -0x1p-26 50000 - -0x1p-26 -1 50000 - -1 -0x1p511 50000 - -0x1p511 -inf 40000 -' - -range_tanhf=' - 0 0x1p-23 1000 - -0 -0x1p-23 1000 - 0x1p-23 0x1.205966p+3 100000 - -0x1p-23 -0x1.205966p+3 100000 - 0x1.205966p+3 inf 100 - -0x1.205966p+3 -inf 100 -' - -range_sve_cosf=' - 0 0xffff0000 10000 - 0x1p-4 0x1p4 500000 -' - -range_sve_cos=' - 0 0xffff0000 10000 - 0x1p-4 0x1p4 500000 -' - -range_sve_sinf=' - 0 0xffff0000 10000 - 0x1p-4 0x1p4 500000 -' - -range_sve_sin=' - 0 0xffff0000 10000 - 0x1p-4 0x1p4 500000 -' - -range_sve_atanf=' - -10.0 10.0 50000 - -1.0 1.0 40000 - 0.0 1.0 40000 - 1.0 100.0 40000 - 1e6 1e32 40000 -' - -range_sve_atan=' - -10.0 10.0 50000 - -1.0 1.0 40000 - 0.0 1.0 40000 - 1.0 100.0 40000 - 1e6 1e32 40000 -' - -range_sve_atan2f=' - -10.0 10.0 50000 - -1.0 1.0 40000 - 0.0 1.0 40000 - 1.0 100.0 40000 - 1e6 1e32 40000 -' - -range_sve_atan2=' - -10.0 10.0 50000 - -1.0 1.0 40000 - 0.0 1.0 40000 - 1.0 100.0 40000 - 1e6 1e32 40000 -' - -range_sve_log10=' - -0.0 -0x1p126 100 - 0x1p-149 0x1p-126 4000 - 0x1p-126 0x1p-23 50000 - 0x1p-23 1.0 50000 - 1.0 100 50000 - 100 inf 50000 -' - -range_sve_log10f=' - -0.0 -0x1p126 100 - 0x1p-149 0x1p-126 4000 - 0x1p-126 0x1p-23 50000 - 0x1p-23 1.0 50000 - 1.0 100 50000 - 100 inf 50000 -' - -range_sve_logf=' - -0.0 -0x1p126 100 - 0x1p-149 0x1p-126 4000 - 0x1p-126 0x1p-23 50000 - 0x1p-23 1.0 50000 - 1.0 100 50000 - 100 inf 50000 -' - -range_sve_log=' - -0.0 -0x1p126 100 - 0x1p-149 0x1p-126 4000 - 0x1p-126 0x1p-23 50000 - 0x1p-23 1.0 50000 - 1.0 100 50000 - 100 inf 50000 -' - -range_sve_expf=' - 0 0x1p-23 40000 - 0x1p-23 1 50000 - 1 0x1p23 50000 - 0x1p23 inf 50000 - -0 -0x1p-23 40000 - -0x1p-23 -1 50000 - -1 -0x1p23 50000 - -0x1p23 -inf 50000 -' - -range_sve_erff=' - 0 0x1p-28 20000 - 0x1p-28 1 60000 - 1 0x1p28 60000 - 0x1p28 inf 20000 - -0 
-0x1p-28 20000 - -0x1p-28 -1 60000 - -1 -0x1p28 60000 - -0x1p28 -inf 20000 -' - -range_sve_erf=' - 0 0x1p-28 20000 - 0x1p-28 1 60000 - 1 0x1p28 60000 - 0x1p28 inf 20000 - -0 -0x1p-28 20000 - -0x1p-28 -1 60000 - -1 -0x1p28 60000 - -0x1p28 -inf 20000 -' - -range_sve_tanf=' - -0.0 -0x1p126 100 - 0x1p-149 0x1p-126 4000 - 0x1p-126 0x1p-23 50000 - 0x1p-23 0.7 50000 - 0.7 1.5 50000 - 1.5 100 50000 - 100 0x1p17 50000 - 0x1p17 inf 50000 -' - -range_sve_erfc=' - 0 0xffff0000 10000 - 0x1p-127 0x1p-26 40000 - -0x1p-127 -0x1p-26 40000 - 0x1p-26 0x1p5 40000 - -0x1p-26 -0x1p3 40000 - 0 inf 40000 -' - -while read G F R A +cat $INTERVALS | while read F LO HI N C do - [ "$R" = 1 ] && { [[ $G != sve_* ]] || [ $WANT_SVE_MATH -eq 1 ]; } || continue - case "$G" in \#*) continue ;; esac - eval range="\${range_$G}" - while read X - do - [ -n "$X" ] || continue - case "$X" in \#*) continue ;; esac - t $F $X "$A" - done << EOF -$range -EOF -done << EOF -# group symbol run - -atan __s_atan $runs -atan __v_atan $runv -atan __vn_atan $runvn -atan _ZGVnN2v_atan $runvn -atan2 __s_atan2 $runs -atan2 __v_atan2 $runv -atan2 __vn_atan2 $runvn -atan2 _ZGVnN2vv_atan2 $runvn -erf __s_erf $runs -erf __v_erf $runv -erf __vn_erf $runvn -erf _ZGVnN2v_erf $runvn -erfc __s_erfc $runs -erfc __v_erfc $runv -erfc __vn_erfc $runvn -erfc _ZGVnN2v_erfc $runvn -log10 __s_log10 $runs -log10 __v_log10 $runv -log10 __vn_log10 $runvn -log10 _ZGVnN2v_log10 $runvn -log2 __s_log2 $runs -log2 __v_log2 $runv -log2 __vn_log2 $runvn -log2 _ZGVnN2v_log2 $runvn -expm1 __s_expm1 $runs -expm1 __v_expm1 $runv -expm1 __vn_expm1 $runvn -expm1 _ZGVnN2v_expm1 $runvn -sinh __s_sinh $runs -sinh __v_sinh $runv -sinh __vn_sinh $runvn -sinh _ZGVnN2v_sinh $runvn -cosh __s_cosh $runs -cosh __v_cosh $runv -cosh __vn_cosh $runvn -cosh _ZGVnN2v_cosh $runvn - -atanf __s_atanf $runs -atanf __v_atanf $runv -atanf __vn_atanf $runvn -atanf _ZGVnN4v_atanf $runvn -atan2f __s_atan2f $runs -atan2f __v_atan2f $runv -atan2f __vn_atan2f $runvn -atan2f _ZGVnN4vv_atan2f $runvn -erff __s_erff $runs -erff __v_erff $runv -erff __vn_erff $runvn -erff _ZGVnN4v_erff $runvn -erfcf __s_erfcf $runs -erfcf __v_erfcf $runv -erfcf __vn_erfcf $runvn -erfcf _ZGVnN4v_erfcf $runvn -log10f __s_log10f $runs -log10f __v_log10f $runv -log10f __vn_log10f $runvn -log10f _ZGVnN4v_log10f $runvn -log1pf __s_log1pf $runs -log1pf __v_log1pf $runv -log1pf __vn_log1pf $runvn -log1pf _ZGVnN4v_log1pf $runvn -asinhf __s_asinhf $runs -asinhf __v_asinhf $runv -asinhf __vn_asinhf $runvn -asinhf _ZGVnN4v_asinhf $runvn -log2f __s_log2f $runs -log2f __v_log2f $runv -log2f __vn_log2f $runvn -log2f _ZGVnN4v_log2f $runvn -tanf __s_tanf $runs -tanf __v_tanf $runv -tanf __vn_tanf $runvn -tanf _ZGVnN4v_tanf $runvn -log1p __s_log1p $runs -log1p __v_log1p $runv -log1p __vn_log1p $runvn -log1p _ZGVnN2v_log1p $runvn -expm1f __s_expm1f $runs -expm1f __v_expm1f $runv -expm1f __vn_expm1f $runvn -expm1f _ZGVnN4v_expm1f $runvn -sinhf __s_sinhf $runs -sinhf __v_sinhf $runv -sinhf __vn_sinhf $runvn -sinhf _ZGVnN4v_sinhf $runvn -coshf __s_coshf $runs -coshf __v_coshf $runv -coshf __vn_coshf $runvn -coshf _ZGVnN4v_coshf $runvn -atanhf __s_atanhf $runs -c 0 -atanhf __v_atanhf $runv -c 0 -atanhf __vn_atanhf $runvn -c 0 -atanhf _ZGVnN4v_atanhf $runvn -c 0 -cbrtf __s_cbrtf $runs -cbrtf __v_cbrtf $runv -cbrtf __vn_cbrtf $runvn -cbrtf _ZGVnN4v_cbrtf $runvn -asinh __s_asinh $runs -# Test vector asinh 3 times, with control lane < 1, > 1 and special. -# Ensures the v_sel is choosing the right option in all cases. 
-asinh __v_asinh $runv -c 0.5 -asinh __vn_asinh $runvn -c 0.5 -asinh _ZGVnN2v_asinh $runvn -c 0.5 -asinh __v_asinh $runv -c 2 -asinh __vn_asinh $runvn -c 2 -asinh _ZGVnN2v_asinh $runvn -c 2 -asinh __v_asinh $runv -c 0x1p600 -asinh __vn_asinh $runvn -c 0x1p600 -asinh _ZGVnN2v_asinh $runvn -c 0x1p600 -tanhf __s_tanhf $runs -tanhf __v_tanhf $runv -tanhf __vn_tanhf $runvn -tanhf _ZGVnN4v_tanhf $runvn - -sve_cosf __sv_cosf $runsv -sve_cosf _ZGVsMxv_cosf $runsv -sve_sinf __sv_sinf $runsv -sve_sinf _ZGVsMxv_sinf $runsv -sve_atan2f __sv_atan2f $runsv -sve_atan2f _ZGVsMxvv_atan2f $runsv -sve_atanf __sv_atanf $runsv -sve_atanf _ZGVsMxv_atanf $runsv -sve_log10f __sv_log10f $runsv -sve_log10f _ZGVsMxv_log10f $runsv -sve_logf __sv_logf $runsv -sve_logf _ZGVsMxv_logf $runsv -sve_expf __sv_expf $runsv -sve_expf _ZGVsMxv_expf $runsv -sve_erff __sv_erff $runsv -sve_erff _ZGVsMxv_erff $runsv -sve_tanf __sv_tanf $runsv -sve_tanf _ZGVsMxv_tanf $runsv - -sve_cos __sv_cos $runsv -sve_cos _ZGVsMxv_cos $runsv -sve_sin __sv_sin $runsv -sve_sin _ZGVsMxv_sin $runsv -sve_atan __sv_atan $runsv -sve_atan _ZGVsMxv_atan $runsv -sve_atan2 __sv_atan2 $runsv -sve_atan2 _ZGVsMxvv_atan2 $runsv -sve_log10 __sv_log10 $runsv -sve_log10 _ZGVsMxv_log10 $runsv -sve_log __sv_log $runsv -sve_log _ZGVsMxv_log $runsv -sve_erf __sv_erf $runsv -sve_erf _ZGVsMxv_erf $runsv -sve_erfc __sv_erfc $runsv -sve_erfc _ZGVsMxv_erfc $runsv -EOF + t $F $LO $HI $N $C +done [ 0 -eq $FAIL ] || { echo "FAILED $FAIL PASSED $PASS" diff --git a/pl/math/v_asinh_2u5.c b/pl/math/v_asinh_2u5.c index 23171a1..4eeec4a 100644 --- a/pl/math/v_asinh_2u5.c +++ b/pl/math/v_asinh_2u5.c @@ -157,4 +157,18 @@ VPCS_ALIAS PL_SIG (V, D, 1, asinh, -10.0, 10.0) PL_TEST_ULP (V_NAME (asinh), 1.54) PL_TEST_EXPECT_FENV (V_NAME (asinh), WANT_ERRNO) +/* Test vector asinh 3 times, with control lane < 1, > 1 and special. + Ensures the v_sel is choosing the right option in all cases. 
*/ +#define V_ASINH_INTERVAL(lo, hi, n) \ + PL_TEST_INTERVAL_C (V_NAME (asinh), lo, hi, n, 0.5) \ + PL_TEST_INTERVAL_C (V_NAME (asinh), lo, hi, n, 2) \ + PL_TEST_INTERVAL_C (V_NAME (asinh), lo, hi, n, 0x1p600) +V_ASINH_INTERVAL (0, 0x1p-26, 50000) +V_ASINH_INTERVAL (0x1p-26, 1, 50000) +V_ASINH_INTERVAL (1, 0x1p511, 50000) +V_ASINH_INTERVAL (0x1p511, inf, 40000) +V_ASINH_INTERVAL (-0, -0x1p-26, 50000) +V_ASINH_INTERVAL (-0x1p-26, -1, 50000) +V_ASINH_INTERVAL (-1, -0x1p511, 50000) +V_ASINH_INTERVAL (-0x1p511, -inf, 40000) #endif diff --git a/pl/math/v_asinhf_2u7.c b/pl/math/v_asinhf_2u7.c index 32fe773..79bf80f 100644 --- a/pl/math/v_asinhf_2u7.c +++ b/pl/math/v_asinhf_2u7.c @@ -58,4 +58,12 @@ VPCS_ALIAS PL_SIG (V, F, 1, asinh, -10.0, 10.0) PL_TEST_ULP (V_NAME (asinhf), 2.17) PL_TEST_EXPECT_FENV (V_NAME (asinhf), WANT_ERRNO) +PL_TEST_INTERVAL (V_NAME (asinhf), 0, 0x1p-12, 40000) +PL_TEST_INTERVAL (V_NAME (asinhf), 0x1p-12, 1.0, 40000) +PL_TEST_INTERVAL (V_NAME (asinhf), 1.0, 0x1p11, 40000) +PL_TEST_INTERVAL (V_NAME (asinhf), 0x1p11, inf, 40000) +PL_TEST_INTERVAL (V_NAME (asinhf), 0, -0x1p-12, 20000) +PL_TEST_INTERVAL (V_NAME (asinhf), -0x1p-12, -1.0, 20000) +PL_TEST_INTERVAL (V_NAME (asinhf), -1.0, -0x1p11, 20000) +PL_TEST_INTERVAL (V_NAME (asinhf), -0x1p11, -inf, 20000) #endif diff --git a/pl/math/v_atan2_3u.c b/pl/math/v_atan2_3u.c index 27af80d..b123cfa 100644 --- a/pl/math/v_atan2_3u.c +++ b/pl/math/v_atan2_3u.c @@ -82,4 +82,9 @@ VPCS_ALIAS PL_SIG (V, D, 2, atan2) // TODO tighten this once __v_atan2 is fixed PL_TEST_ULP (V_NAME (atan2), 2.9) +PL_TEST_INTERVAL (V_NAME (atan2), -10.0, 10.0, 50000) +PL_TEST_INTERVAL (V_NAME (atan2), -1.0, 1.0, 40000) +PL_TEST_INTERVAL (V_NAME (atan2), 0.0, 1.0, 40000) +PL_TEST_INTERVAL (V_NAME (atan2), 1.0, 100.0, 40000) +PL_TEST_INTERVAL (V_NAME (atan2), 1e6, 1e32, 40000) #endif diff --git a/pl/math/v_atan2f_3u.c b/pl/math/v_atan2f_3u.c index 3d8f9fc..abf8f5e 100644 --- a/pl/math/v_atan2f_3u.c +++ b/pl/math/v_atan2f_3u.c @@ -81,4 +81,9 @@ VPCS_ALIAS /* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. 
*/ PL_SIG (V, F, 2, atan2) PL_TEST_ULP (V_NAME (atan2f), 2.46) +PL_TEST_INTERVAL (V_NAME (atan2f), -10.0, 10.0, 50000) +PL_TEST_INTERVAL (V_NAME (atan2f), -1.0, 1.0, 40000) +PL_TEST_INTERVAL (V_NAME (atan2f), 0.0, 1.0, 40000) +PL_TEST_INTERVAL (V_NAME (atan2f), 1.0, 100.0, 40000) +PL_TEST_INTERVAL (V_NAME (atan2f), 1e6, 1e32, 40000) #endif diff --git a/pl/math/v_atan_2u5.c b/pl/math/v_atan_2u5.c index de39fa7..43b4abd 100644 --- a/pl/math/v_atan_2u5.c +++ b/pl/math/v_atan_2u5.c @@ -53,4 +53,9 @@ VPCS_ALIAS PL_SIG (V, D, 1, atan, -10.0, 10.0) PL_TEST_ULP (V_NAME (atan), 1.78) +PL_TEST_INTERVAL (V_NAME (atan), -10.0, 10.0, 50000) +PL_TEST_INTERVAL (V_NAME (atan), -1.0, 1.0, 40000) +PL_TEST_INTERVAL (V_NAME (atan), 0.0, 1.0, 40000) +PL_TEST_INTERVAL (V_NAME (atan), 1.0, 100.0, 40000) +PL_TEST_INTERVAL (V_NAME (atan), 1e6, 1e32, 40000) #endif diff --git a/pl/math/v_atanf_3u.c b/pl/math/v_atanf_3u.c index 8014d65..3cb51b1 100644 --- a/pl/math/v_atanf_3u.c +++ b/pl/math/v_atanf_3u.c @@ -52,4 +52,9 @@ VPCS_ALIAS PL_SIG (V, F, 1, atan, -10.0, 10.0) PL_TEST_ULP (V_NAME (atanf), 2.5) +PL_TEST_INTERVAL (V_NAME (atanf), -10.0, 10.0, 50000) +PL_TEST_INTERVAL (V_NAME (atanf), -1.0, 1.0, 40000) +PL_TEST_INTERVAL (V_NAME (atanf), 0.0, 1.0, 40000) +PL_TEST_INTERVAL (V_NAME (atanf), 1.0, 100.0, 40000) +PL_TEST_INTERVAL (V_NAME (atanf), 1e6, 1e32, 40000) #endif diff --git a/pl/math/v_atanhf_3u1.c b/pl/math/v_atanhf_3u1.c index 4cff1fc..68dbdf6 100644 --- a/pl/math/v_atanhf_3u1.c +++ b/pl/math/v_atanhf_3u1.c @@ -52,4 +52,10 @@ VPCS_ALIAS PL_SIG (V, F, 1, atanh, -1.0, 1.0) PL_TEST_ULP (V_NAME (atanhf), 2.59) PL_TEST_EXPECT_FENV (V_NAME (atanhf), WANT_ERRNO) +PL_TEST_INTERVAL_C (V_NAME (atanhf), 0, 0x1p-12, 500, 0) +PL_TEST_INTERVAL_C (V_NAME (atanhf), 0x1p-12, 1, 200000, 0) +PL_TEST_INTERVAL_C (V_NAME (atanhf), 1, inf, 1000, 0) +PL_TEST_INTERVAL_C (V_NAME (atanhf), -0, -0x1p-12, 500, 0) +PL_TEST_INTERVAL_C (V_NAME (atanhf), -0x1p-12, -1, 200000, 0) +PL_TEST_INTERVAL_C (V_NAME (atanhf), -1, -inf, 1000, 0) #endif diff --git a/pl/math/v_cbrtf_1u5.c b/pl/math/v_cbrtf_1u5.c index 756a468..cffc488 100644 --- a/pl/math/v_cbrtf_1u5.c +++ b/pl/math/v_cbrtf_1u5.c @@ -90,4 +90,6 @@ VPCS_ALIAS PL_SIG (V, F, 1, cbrt, -10.0, 10.0) PL_TEST_ULP (V_NAME (cbrtf), 1.03) PL_TEST_EXPECT_FENV (V_NAME (cbrtf), WANT_ERRNO) +PL_TEST_INTERVAL (V_NAME (cbrtf), 0, inf, 1000000) +PL_TEST_INTERVAL (V_NAME (cbrtf), -0, -inf, 1000000) #endif diff --git a/pl/math/v_cosh_2u.c b/pl/math/v_cosh_2u.c index 63f877e..20d5b38 100644 --- a/pl/math/v_cosh_2u.c +++ b/pl/math/v_cosh_2u.c @@ -88,4 +88,8 @@ VPCS_ALIAS PL_SIG (V, D, 1, cosh, -10.0, 10.0) PL_TEST_ULP (V_NAME (cosh), 1.43) PL_TEST_EXPECT_FENV (V_NAME (cosh), WANT_ERRNO) +PL_TEST_INTERVAL (V_NAME (cosh), 0, 0x1.6p9, 100000) +PL_TEST_INTERVAL (V_NAME (cosh), -0, -0x1.6p9, 100000) +PL_TEST_INTERVAL (V_NAME (cosh), 0x1.6p9, inf, 1000) +PL_TEST_INTERVAL (V_NAME (cosh), -0x1.6p9, -inf, 1000) #endif diff --git a/pl/math/v_coshf_2u4.c b/pl/math/v_coshf_2u4.c index f101681..6ea6eb3 100644 --- a/pl/math/v_coshf_2u4.c +++ b/pl/math/v_coshf_2u4.c @@ -64,4 +64,10 @@ VPCS_ALIAS PL_SIG (V, F, 1, cosh, -10.0, 10.0) PL_TEST_ULP (V_NAME (coshf), 1.89) PL_TEST_EXPECT_FENV (V_NAME (coshf), WANT_ERRNO) +PL_TEST_INTERVAL (V_NAME (coshf), 0, 0x1p-63, 100) +PL_TEST_INTERVAL (V_NAME (coshf), 0, 0x1.5a92d8p+6, 80000) +PL_TEST_INTERVAL (V_NAME (coshf), 0x1.5a92d8p+6, inf, 2000) +PL_TEST_INTERVAL (V_NAME (coshf), -0, -0x1p-63, 100) +PL_TEST_INTERVAL (V_NAME (coshf), -0, -0x1.5a92d8p+6, 80000) +PL_TEST_INTERVAL (V_NAME 
(coshf), -0x1.5a92d8p+6, -inf, 2000) #endif diff --git a/pl/math/v_erf_2u.c b/pl/math/v_erf_2u.c index e33d405..caec4d8 100644 --- a/pl/math/v_erf_2u.c +++ b/pl/math/v_erf_2u.c @@ -107,4 +107,10 @@ VPCS_ALIAS PL_SIG (V, D, 1, erf, -6.0, 6.0) PL_TEST_ULP (V_NAME (erf), 1.26) +PL_TEST_INTERVAL (V_NAME (erf), 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (V_NAME (erf), 0x1p-127, 0x1p-26, 40000) +PL_TEST_INTERVAL (V_NAME (erf), -0x1p-127, -0x1p-26, 40000) +PL_TEST_INTERVAL (V_NAME (erf), 0x1p-26, 0x1p3, 40000) +PL_TEST_INTERVAL (V_NAME (erf), -0x1p-26, -0x1p3, 40000) +PL_TEST_INTERVAL (V_NAME (erf), 0, inf, 40000) #endif diff --git a/pl/math/v_erfc_4u.c b/pl/math/v_erfc_4u.c index 9b08ead..9247f87 100644 --- a/pl/math/v_erfc_4u.c +++ b/pl/math/v_erfc_4u.c @@ -159,4 +159,10 @@ VPCS_ALIAS PL_SIG (V, D, 1, erfc, -6.0, 28.0) PL_TEST_ULP (V_NAME (erfc), 3.15) +PL_TEST_INTERVAL (V_NAME (erfc), 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (V_NAME (erfc), 0x1p-1022, 0x1p-26, 40000) +PL_TEST_INTERVAL (V_NAME (erfc), -0x1p-1022, -0x1p-26, 40000) +PL_TEST_INTERVAL (V_NAME (erfc), 0x1p-26, 0x1p5, 40000) +PL_TEST_INTERVAL (V_NAME (erfc), -0x1p-26, -0x1p3, 40000) +PL_TEST_INTERVAL (V_NAME (erfc), 0, inf, 40000) #endif diff --git a/pl/math/v_erfcf_1u.c b/pl/math/v_erfcf_1u.c index e39801e..4b495d0 100644 --- a/pl/math/v_erfcf_1u.c +++ b/pl/math/v_erfcf_1u.c @@ -174,4 +174,10 @@ VPCS_ALIAS PL_SIG (V, F, 1, erfc, -6.0, 28.0) PL_TEST_ULP (V_NAME (erfcf), 0.26) +PL_TEST_INTERVAL (V_NAME (erfcf), 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-127, 0x1p-26, 40000) +PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-127, -0x1p-26, 40000) +PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-26, 0x1p5, 40000) +PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-26, -0x1p3, 40000) +PL_TEST_INTERVAL (V_NAME (erfcf), 0, inf, 40000) #endif diff --git a/pl/math/v_erff_1u5.c b/pl/math/v_erff_1u5.c index 52f063c..bb9b786 100644 --- a/pl/math/v_erff_1u5.c +++ b/pl/math/v_erff_1u5.c @@ -107,4 +107,10 @@ VPCS_ALIAS PL_SIG (V, F, 1, erf, -4.0, 4.0) PL_TEST_ULP (V_NAME (erff), 0.76) +PL_TEST_INTERVAL (V_NAME (erff), 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (V_NAME (erff), 0x1p-127, 0x1p-26, 40000) +PL_TEST_INTERVAL (V_NAME (erff), -0x1p-127, -0x1p-26, 40000) +PL_TEST_INTERVAL (V_NAME (erff), 0x1p-26, 0x1p3, 40000) +PL_TEST_INTERVAL (V_NAME (erff), -0x1p-26, -0x1p3, 40000) +PL_TEST_INTERVAL (V_NAME (erff), 0, inf, 40000) #endif diff --git a/pl/math/v_expm1_2u5.c b/pl/math/v_expm1_2u5.c index 216bdbc..3f63760 100644 --- a/pl/math/v_expm1_2u5.c +++ b/pl/math/v_expm1_2u5.c @@ -104,4 +104,10 @@ VPCS_ALIAS PL_SIG (V, D, 1, expm1, -9.9, 9.9) PL_TEST_ULP (V_NAME (expm1), 1.68) PL_TEST_EXPECT_FENV (V_NAME (expm1), WANT_ERRNO) +PL_TEST_INTERVAL (V_NAME (expm1), 0, 0x1p-51, 1000) +PL_TEST_INTERVAL (V_NAME (expm1), -0, -0x1p-51, 1000) +PL_TEST_INTERVAL (V_NAME (expm1), 0x1p-51, 0x1.63108c75a1937p+9, 100000) +PL_TEST_INTERVAL (V_NAME (expm1), -0x1p-51, -0x1.740bf7c0d927dp+9, 100000) +PL_TEST_INTERVAL (V_NAME (expm1), 0x1.63108c75a1937p+9, inf, 100) +PL_TEST_INTERVAL (V_NAME (expm1), -0x1.740bf7c0d927dp+9, -inf, 100) #endif diff --git a/pl/math/v_expm1f_1u6.c b/pl/math/v_expm1f_1u6.c index 6e47fac..9977b8e 100644 --- a/pl/math/v_expm1f_1u6.c +++ b/pl/math/v_expm1f_1u6.c @@ -87,4 +87,8 @@ VPCS_ALIAS PL_SIG (V, F, 1, expm1, -9.9, 9.9) PL_TEST_ULP (V_NAME (expm1f), 1.02) PL_TEST_EXPECT_FENV (V_NAME (expm1f), WANT_ERRNO) +PL_TEST_INTERVAL (V_NAME (expm1f), 0, 0x1p-23, 1000) +PL_TEST_INTERVAL (V_NAME (expm1f), -0, -0x1p-23, 1000) +PL_TEST_INTERVAL (V_NAME (expm1f), 0x1p-23, 
0x1.644716p6, 1000000) +PL_TEST_INTERVAL (V_NAME (expm1f), -0x1p-23, -0x1.9bbabcp+6, 1000000) #endif diff --git a/pl/math/v_log10_2u5.c b/pl/math/v_log10_2u5.c index 014accc..6fb7447 100644 --- a/pl/math/v_log10_2u5.c +++ b/pl/math/v_log10_2u5.c @@ -103,4 +103,7 @@ VPCS_ALIAS PL_SIG (V, D, 1, log10, 0.01, 11.1) PL_TEST_ULP (V_NAME (log10), 1.97) +PL_TEST_INTERVAL (V_NAME (log10), 0, 0xffff000000000000, 10000) +PL_TEST_INTERVAL (V_NAME (log10), 0x1p-4, 0x1p4, 400000) +PL_TEST_INTERVAL (V_NAME (log10), 0, inf, 400000) #endif diff --git a/pl/math/v_log10f_3u5.c b/pl/math/v_log10f_3u5.c index f25da91..4c22540 100644 --- a/pl/math/v_log10f_3u5.c +++ b/pl/math/v_log10f_3u5.c @@ -76,4 +76,6 @@ VPCS_ALIAS PL_SIG (V, F, 1, log10, 0.01, 11.1) PL_TEST_ULP (V_NAME (log10f), 2.81) +PL_TEST_INTERVAL (V_NAME (log10f), 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (V_NAME (log10f), 0x1p-4, 0x1p4, 500000) #endif diff --git a/pl/math/v_log1p_2u5.c b/pl/math/v_log1p_2u5.c index 7a8c6bf..ccde382 100644 --- a/pl/math/v_log1p_2u5.c +++ b/pl/math/v_log1p_2u5.c @@ -108,4 +108,12 @@ VPCS_ALIAS PL_SIG (V, D, 1, log1p, -0.9, 10.0) PL_TEST_ULP (V_NAME (log1p), 1.97) PL_TEST_EXPECT_FENV (V_NAME (log1p), WANT_ERRNO) +PL_TEST_INTERVAL (V_NAME (log1p), -10.0, 10.0, 10000) +PL_TEST_INTERVAL (V_NAME (log1p), 0.0, 0x1p-23, 50000) +PL_TEST_INTERVAL (V_NAME (log1p), 0x1p-23, 0.001, 50000) +PL_TEST_INTERVAL (V_NAME (log1p), 0.001, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME (log1p), 0.0, -0x1p-23, 50000) +PL_TEST_INTERVAL (V_NAME (log1p), -0x1p-23, -0.001, 50000) +PL_TEST_INTERVAL (V_NAME (log1p), -0.001, -1.0, 50000) +PL_TEST_INTERVAL (V_NAME (log1p), -1.0, inf, 5000) #endif diff --git a/pl/math/v_log1pf_2u1.c b/pl/math/v_log1pf_2u1.c index f351ecd..96ac02d 100644 --- a/pl/math/v_log1pf_2u1.c +++ b/pl/math/v_log1pf_2u1.c @@ -148,4 +148,12 @@ VPCS_ALIAS PL_SIG (V, F, 1, log1p, -0.9, 10.0) PL_TEST_ULP (V_NAME (log1pf), 1.53) PL_TEST_EXPECT_FENV (V_NAME (log1pf), WANT_ERRNO) +PL_TEST_INTERVAL (V_NAME (log1pf), -10.0, 10.0, 10000) +PL_TEST_INTERVAL (V_NAME (log1pf), 0.0, 0x1p-23, 30000) +PL_TEST_INTERVAL (V_NAME (log1pf), 0x1p-23, 0.001, 50000) +PL_TEST_INTERVAL (V_NAME (log1pf), 0.001, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME (log1pf), 0.0, -0x1p-23, 30000) +PL_TEST_INTERVAL (V_NAME (log1pf), -0x1p-23, -0.001, 30000) +PL_TEST_INTERVAL (V_NAME (log1pf), -0.001, -1.0, 50000) +PL_TEST_INTERVAL (V_NAME (log1pf), -1.0, inf, 1000) #endif diff --git a/pl/math/v_log2_3u.c b/pl/math/v_log2_3u.c index 3bdfd2e..7dca684 100644 --- a/pl/math/v_log2_3u.c +++ b/pl/math/v_log2_3u.c @@ -90,4 +90,10 @@ VPCS_ALIAS PL_SIG (V, D, 1, log2, 0.01, 11.1) PL_TEST_ULP (V_NAME (log2), 2.10) +PL_TEST_INTERVAL (V_NAME (log2), -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (V_NAME (log2), 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (V_NAME (log2), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (V_NAME (log2), 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME (log2), 1.0, 100, 50000) +PL_TEST_INTERVAL (V_NAME (log2), 100, inf, 50000) #endif diff --git a/pl/math/v_log2f_2u6.c b/pl/math/v_log2f_2u6.c index a3c9aac..aa011cd 100644 --- a/pl/math/v_log2f_2u6.c +++ b/pl/math/v_log2f_2u6.c @@ -123,4 +123,10 @@ VPCS_ALIAS PL_SIG (V, F, 1, log2, 0.01, 11.1) PL_TEST_ULP (V_NAME (log2f), 2.10) PL_TEST_EXPECT_FENV (V_NAME (log2f), WANT_ERRNO) +PL_TEST_INTERVAL (V_NAME (log2f), -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME 
(log2f), 1.0, 100, 50000)
+PL_TEST_INTERVAL (V_NAME (log2f), 100, inf, 50000)
 #endif
diff --git a/pl/math/v_sinh_3u.c b/pl/math/v_sinh_3u.c
index bab8896..019cf84 100644
--- a/pl/math/v_sinh_3u.c
+++ b/pl/math/v_sinh_3u.c
@@ -47,4 +47,10 @@ VPCS_ALIAS
 PL_SIG (V, D, 1, sinh, -10.0, 10.0)
 PL_TEST_ULP (V_NAME (sinh), 2.08)
 PL_TEST_EXPECT_FENV (V_NAME (sinh), WANT_ERRNO)
+PL_TEST_INTERVAL (V_NAME (sinh), 0, 0x1p-51, 100)
+PL_TEST_INTERVAL (V_NAME (sinh), -0, -0x1p-51, 100)
+PL_TEST_INTERVAL (V_NAME (sinh), 0x1p-51, 0x1.62e42fefa39fp+9, 100000)
+PL_TEST_INTERVAL (V_NAME (sinh), -0x1p-51, -0x1.62e42fefa39fp+9, 100000)
+PL_TEST_INTERVAL (V_NAME (sinh), 0x1.62e42fefa39fp+9, inf, 1000)
+PL_TEST_INTERVAL (V_NAME (sinh), -0x1.62e42fefa39fp+9, -inf, 1000)
 #endif
diff --git a/pl/math/v_sinhf_2u3.c b/pl/math/v_sinhf_2u3.c
index ecedf55..a8bf5ae 100644
--- a/pl/math/v_sinhf_2u3.c
+++ b/pl/math/v_sinhf_2u3.c
@@ -46,4 +46,10 @@ VPCS_ALIAS
 PL_SIG (V, F, 1, sinh, -10.0, 10.0)
 PL_TEST_ULP (V_NAME (sinhf), 1.76)
 PL_TEST_EXPECT_FENV (V_NAME (sinhf), WANT_ERRNO)
+PL_TEST_INTERVAL (V_NAME (sinhf), 0, 0x1.62e43p+6, 100000)
+PL_TEST_INTERVAL (V_NAME (sinhf), -0, -0x1.62e43p+6, 100000)
+PL_TEST_INTERVAL (V_NAME (sinhf), 0x1.62e43p+6, 0x1.65a9fap+6, 100)
+PL_TEST_INTERVAL (V_NAME (sinhf), -0x1.62e43p+6, -0x1.65a9fap+6, 100)
+PL_TEST_INTERVAL (V_NAME (sinhf), 0x1.65a9fap+6, inf, 100)
+PL_TEST_INTERVAL (V_NAME (sinhf), -0x1.65a9fap+6, -inf, 100)
 #endif
diff --git a/pl/math/v_tanf_3u2.c b/pl/math/v_tanf_3u2.c
index 51ede3c..648690d 100644
--- a/pl/math/v_tanf_3u2.c
+++ b/pl/math/v_tanf_3u2.c
@@ -120,4 +120,12 @@ VPCS_ALIAS
 PL_SIG (V, F, 1, tan, -3.1, 3.1)
 PL_TEST_ULP (V_NAME (tanf), 2.7)
 PL_TEST_EXPECT_FENV (V_NAME (tanf), WANT_ERRNO)
+PL_TEST_INTERVAL (V_NAME (tanf), -0.0, -0x1p126, 100)
+PL_TEST_INTERVAL (V_NAME (tanf), 0x1p-149, 0x1p-126, 4000)
+PL_TEST_INTERVAL (V_NAME (tanf), 0x1p-126, 0x1p-23, 50000)
+PL_TEST_INTERVAL (V_NAME (tanf), 0x1p-23, 0.7, 50000)
+PL_TEST_INTERVAL (V_NAME (tanf), 0.7, 1.5, 50000)
+PL_TEST_INTERVAL (V_NAME (tanf), 1.5, 100, 50000)
+PL_TEST_INTERVAL (V_NAME (tanf), 100, 0x1p17, 50000)
+PL_TEST_INTERVAL (V_NAME (tanf), 0x1p17, inf, 50000)
 #endif
diff --git a/pl/math/v_tanhf_2u6.c b/pl/math/v_tanhf_2u6.c
index ae87f50..c10be40 100644
--- a/pl/math/v_tanhf_2u6.c
+++ b/pl/math/v_tanhf_2u6.c
@@ -93,4 +93,10 @@ VPCS_ALIAS
 PL_SIG (V, F, 1, tanh, -10.0, 10.0)
 PL_TEST_ULP (V_NAME (tanhf), 2.09)
 PL_TEST_EXPECT_FENV (V_NAME (tanhf), WANT_ERRNO)
+PL_TEST_INTERVAL (V_NAME (tanhf), 0, 0x1p-23, 1000)
+PL_TEST_INTERVAL (V_NAME (tanhf), -0, -0x1p-23, 1000)
+PL_TEST_INTERVAL (V_NAME (tanhf), 0x1p-23, 0x1.205966p+3, 100000)
+PL_TEST_INTERVAL (V_NAME (tanhf), -0x1p-23, -0x1.205966p+3, 100000)
+PL_TEST_INTERVAL (V_NAME (tanhf), 0x1.205966p+3, inf, 100)
+PL_TEST_INTERVAL (V_NAME (tanhf), -0x1.205966p+3, -inf, 100)
 #endif
--
cgit v1.2.3


From 0976cbd22158e1152070079642c21a6e02c21141 Mon Sep 17 00:00:00 2001
From: Pierre Blanchard
Date: Mon, 19 Dec 2022 12:06:01 +0000
Subject: pl/math: Improve vector/Neon log2f

A new implementation based on the same approach as Neon logf, accurate
to 2.48 ULPs. Flags are set correctly regardless of WANT_ERRNO.
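A minimal scalar sketch of the reduction the new routine relies on
(illustrative only: the function name is invented here, special cases are
not handled, and libm's log2f stands in for the committed polynomial; the
bit patterns are the Off and Mask constants from the diff below):

#include <math.h>
#include <stdint.h>
#include <string.h>

/* Split x into 2^n * (1+r) with 2/3 <= 1+r < 4/3, so that
   log2(x) = n + log2(1+r).  The committed routine approximates
   log2(1+r)/r by a degree-8 polynomial p and returns fma (p, r, n);
   calling log2f on 1+r keeps this sketch self-contained.  */
static float
log2f_reduction_sketch (float x)
{
  uint32_t u;
  memcpy (&u, &x, sizeof u);           /* Bit pattern of x.  */
  u -= 0x3f2aaaab;                     /* Subtract bits of ~2/3 (Off).  */
  float n = (int32_t) u >> 23;         /* Arithmetic shift extracts n.  */
  u = (u & 0x007fffff) + 0x3f2aaaab;   /* Mantissa back into [2/3, 4/3).  */
  float z;                             /* z = 1 + r.  */
  memcpy (&z, &u, sizeof z);
  return n + log2f (z);                /* Real code: fma (p, z - 1, n).  */
}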
---
 pl/math/math_config.h        | 10 +---
 pl/math/s_log2f_2u5.c        | 6 ++
 pl/math/s_log2f_2u6.c        | 6 --
 pl/math/tools/v_log2f.sollya | 38 +++++++++++++
 pl/math/v_log2f_2u5.c        | 68 ++++++++++++++++++++++
 pl/math/v_log2f_2u6.c        | 132 -------------------------------------------
 pl/math/v_log2f_data.c       | 36 +++---------
 pl/math/vn_log2f_2u5.c       | 12 ++++
 pl/math/vn_log2f_2u6.c       | 12 ----
 9 files changed, 133 insertions(+), 187 deletions(-)
 create mode 100644 pl/math/s_log2f_2u5.c
 delete mode 100644 pl/math/s_log2f_2u6.c
 create mode 100644 pl/math/tools/v_log2f.sollya
 create mode 100644 pl/math/v_log2f_2u5.c
 delete mode 100644 pl/math/v_log2f_2u6.c
 create mode 100644 pl/math/vn_log2f_2u5.c
 delete mode 100644 pl/math/vn_log2f_2u6.c

diff --git a/pl/math/math_config.h b/pl/math/math_config.h
index 99132a0..81da863 100644
--- a/pl/math/math_config.h
+++ b/pl/math/math_config.h
@@ -476,16 +476,10 @@ extern const struct tanf_poly_data
   float poly_cotan[TANF_Q_POLY_NCOEFFS];
 } __tanf_poly_data HIDDEN;
 
-#define V_LOG2F_TABLE_BITS 4
-#define V_LOG2F_POLY_ORDER 4
+#define V_LOG2F_POLY_NCOEFFS 9
 extern const struct v_log2f_data
 {
-  struct
-  {
-    /* Pad with dummy for quad-aligned memory access. */
-    float invc_hi, invc_lo, logc, dummy;
-  } tab[1 << V_LOG2F_TABLE_BITS];
-  float poly[V_LOG2F_POLY_ORDER];
+  float poly[V_LOG2F_POLY_NCOEFFS];
 } __v_log2f_data HIDDEN;
 
 #define V_LOG2_TABLE_BITS 7
diff --git a/pl/math/s_log2f_2u5.c b/pl/math/s_log2f_2u5.c
new file mode 100644
index 0000000..7077814
--- /dev/null
+++ b/pl/math/s_log2f_2u5.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_log2f_2u5.c"
diff --git a/pl/math/s_log2f_2u6.c b/pl/math/s_log2f_2u6.c
deleted file mode 100644
index 8e5569d..0000000
--- a/pl/math/s_log2f_2u6.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_log2f_2u6.c"
diff --git a/pl/math/tools/v_log2f.sollya b/pl/math/tools/v_log2f.sollya
new file mode 100644
index 0000000..18869a5
--- /dev/null
+++ b/pl/math/tools/v_log2f.sollya
@@ -0,0 +1,38 @@
+// polynomial used for __v_log2f(x)
+//
+// Copyright (c) 2022, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+deg = 9; // poly degree
+a = -1/3;
+b = 1/3;
+
+ln2 = evaluate(log(2),0);
+invln2 = single(1/ln2);
+
+// find log2(1+x)/x polynomial with minimal relative error
+// (minimal relative error polynomial for log2(1+x) is the same * x)
+deg = deg-1; // because of /x
+
+// f = log2(1+x)/x; using taylor series
+f = 0;
+for i from 0 to 60 do { f = f + (-x)^i/(i+1); };
+f = f * invln2;
+
+// return p that minimizes |f(x) - poly(x) - x^d*p(x)|/|f(x)|
+approx = proc(poly,d) {
+  return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10);
+};
+
+// first coeff is fixed, iteratively find optimal single prec coeffs
+poly = invln2;
+for i from 1 to deg do {
+  p = roundcoefficients(approx(poly,i), [|SG ...|]);
+  poly = poly + x^i*coeff(p,0);
+};
+
+display = hexadecimal;
+print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30));
+print("in [",a,b,"]");
+print("coeffs:");
+for i from 0 to deg do coeff(poly,i);
diff --git a/pl/math/v_log2f_2u5.c b/pl/math/v_log2f_2u5.c
new file mode 100644
index 0000000..343185c
--- /dev/null
+++ b/pl/math/v_log2f_2u5.c
@@ -0,0 +1,68 @@
+/*
+ * Single-precision vector log2 function.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pairwise_hornerf.h" +#include "pl_sig.h" +#include "pl_test.h" +#if V_SUPPORTED + +#define C(i) v_f32 (__v_log2f_data.poly[i]) + +#define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218 */ +#define Min v_u32 (0x00800000) +#define Max v_u32 (0x7f800000) +#define Mask v_u32 (0x007fffff) +#define Off v_u32 (0x3f2aaaab) /* 0.666667 */ + +VPCS_ATTR +NOINLINE static v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) +{ + /* Fall back to scalar code. */ + return v_call_f32 (log2f, x, y, cmp); +} + +/* Fast implementation for single precision log2, + relies on same argument reduction as Neon logf. + Maximum error: 2.48 ULPs + __v_log2f(0x1.558174p+0) got 0x1.a9be84p-2 + want 0x1.a9be8p-2. */ +VPCS_ATTR +v_f32_t V_NAME (log2f) (v_f32_t x) +{ + v_u32_t u = v_as_u32_f32 (x); + v_u32_t cmp = v_cond_u32 (u - Min >= Max - Min); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + u -= Off; + v_f32_t n = v_to_f32_s32 (v_as_s32_u32 (u) >> 23); /* signextend. */ + u &= Mask; + u += Off; + v_f32_t r = v_as_f32_u32 (u) - v_f32 (1.0f); + + /* y = log2(1+r) + n. */ + v_f32_t r2 = r * r; + v_f32_t p = PAIRWISE_HORNER_8 (r, r2, C); + v_f32_t y = v_fma_f32 (p, r, n); + + if (unlikely (v_any_u32 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, log2, 0.01, 11.1) +PL_TEST_ULP (V_NAME (log2f), 1.99) +PL_TEST_EXPECT_FENV (V_NAME (log2f), WANT_ERRNO) +PL_TEST_INTERVAL (V_NAME (log2f), -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME (log2f), 1.0, 100, 50000) +PL_TEST_INTERVAL (V_NAME (log2f), 100, inf, 50000) +#endif diff --git a/pl/math/v_log2f_2u6.c b/pl/math/v_log2f_2u6.c deleted file mode 100644 index aa011cd..0000000 --- a/pl/math/v_log2f_2u6.c +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Single-precision vector log2 function. - * - * Copyright (c) 2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" - -#if V_SUPPORTED - -#define N (1 << V_LOG2F_TABLE_BITS) -#define T __v_log2f_data.tab -#define A __v_log2f_data.poly -#define OFF 0x3f330000 -#define SubnormLim 0x800000 -#define One v_u32 (0x3f800000) - -static float -handle_special (float x) -{ - if (x != x) - /* NaN - return NaN but do not trigger invalid. */ - return x; - if (x < 0) - /* log2f(-anything) = NaN. */ - return __math_invalidf (x); - if (x == 0) - /* log2f(0) = Inf. */ - return __math_divzerof (1); - /* log2f(Inf) = Inf. */ - return x; -} - -static float -normalise (float x) -{ - return asfloat (asuint (x * 0x1p23f) - (23 << 23)); -} - -#ifdef SCALAR - -#define DEFINE_LOOKUP_FUNC(p) \ - static inline float lookup_##p (uint32_t i) { return T[i].p; } - -#else - -#define DEFINE_LOOKUP_FUNC(p) \ - static inline v_f32_t lookup_##p (v_u32_t i) \ - { \ - return (v_f32_t){T[i[0]].p, T[i[1]].p, T[i[2]].p, T[i[3]].p}; \ - } - -#endif - -DEFINE_LOOKUP_FUNC (invc_lo) -DEFINE_LOOKUP_FUNC (invc_hi) -DEFINE_LOOKUP_FUNC (logc) - -/* Single-precision vector log2 routine. Implements the same algorithms as - scalar log2f, but using only single-precision arithmetic, with invc - represented as a two-limb float. Accurate to 2.6 ulp. The maximum error is - near sqrt(2): - __v_log2f(0x1.6a0484p+0) got 0x1.ffea02p-2 - want 0x1.ffea08p-2. 
*/ -VPCS_ATTR v_f32_t V_NAME (log2f) (v_f32_t x) -{ - v_u32_t ix = v_as_u32_f32 (x); - - /* x is +-Inf, +-NaN, 0 or -ve. */ - v_u32_t special = v_cond_u32 (ix >= 0x7f800000) | v_cond_u32 (ix == 0); - /* |x| < 2^126 (i.e. x is subnormal). */ - v_u32_t subnorm = v_cond_u32 (ix < SubnormLim); - - /* Sidestep special lanes so they do not inadvertently trigger fenv - exceptions. They will be fixed up later. */ - if (unlikely (v_any_u32 (special))) - ix = v_sel_u32 (special, One, ix); - - if (unlikely (v_any_u32 (subnorm))) - { - /* Normalize any subnormals. */ - v_f32_t tmp_x = v_as_f32_u32 (ix); - ix = v_as_u32_f32 (v_call_f32 (normalise, tmp_x, tmp_x, subnorm)); - } - - /* x = 2^k z; where z is in range [OFF,2*OFF] and exact. - The range is split into N subintervals. - The ith subinterval contains z and c is near its center. */ - v_u32_t tmp = ix - OFF; - v_u32_t i = (tmp >> (23 - V_LOG2F_TABLE_BITS)) % N; - v_u32_t top = tmp & 0xff800000; - v_u32_t iz = ix - top; - v_f32_t k = v_to_f32_s32 (v_as_s32_u32 (tmp) >> 23); /* Arithmetic shift. */ - v_f32_t z = v_as_f32_u32 (iz); - - v_f32_t invc_lo = lookup_invc_lo (i); - v_f32_t invc_hi = lookup_invc_hi (i); - v_f32_t logc = lookup_logc (i); - - /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */ - v_f32_t r = v_fma_f32 (z, invc_hi, v_f32 (-1)); - r = v_fma_f32 (z, invc_lo, r); - v_f32_t y0 = logc + k; - - /* Pipelined polynomial evaluation to approximate log1p(r)/ln2. */ - v_f32_t r2 = r * r; - v_f32_t y = v_fma_f32 (v_f32 (A[1]), r, v_f32 (A[2])); - y = v_fma_f32 (v_f32 (A[0]), r2, y); - v_f32_t p = v_fma_f32 (v_f32 (A[3]), r, y0); - y = v_fma_f32 (y, r2, p); - - if (unlikely (v_any_u32 (special))) - return v_call_f32 (handle_special, x, y, special); - - return y; -} -VPCS_ALIAS - -PL_SIG (V, F, 1, log2, 0.01, 11.1) -PL_TEST_ULP (V_NAME (log2f), 2.10) -PL_TEST_EXPECT_FENV (V_NAME (log2f), WANT_ERRNO) -PL_TEST_INTERVAL (V_NAME (log2f), -0.0, -0x1p126, 100) -PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-149, 0x1p-126, 4000) -PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-23, 1.0, 50000) -PL_TEST_INTERVAL (V_NAME (log2f), 1.0, 100, 50000) -PL_TEST_INTERVAL (V_NAME (log2f), 100, inf, 50000) -#endif diff --git a/pl/math/v_log2f_data.c b/pl/math/v_log2f_data.c index e6c1f71..7e5cb1e 100644 --- a/pl/math/v_log2f_data.c +++ b/pl/math/v_log2f_data.c @@ -1,5 +1,5 @@ /* - * Coefficients and table entries for vector log2f + * Coefficients for vector log2f * * Copyright (c) 2022, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception @@ -7,31 +7,9 @@ #include "math_config.h" -const struct v_log2f_data __v_log2f_data = { -/* All values here are derived from the values in math/log2f_data.c. - For all i: - tab[i].invc_hi = (float) log2f_data.invc - tab[i].invc_lo = log2f_data.invc - (double) tab[i].invc_hi - tab[i].logc = (float) log2f_data.logc - poly[i] = (float) log2f_data.poly[i]. 
*/ - .tab = { - { 0x1.661ec8p+0, -0x1.81c31p-26, -0x1.efec66p-2, 0}, - { 0x1.571ed4p+0, 0x1.55f108p-25, -0x1.b0b684p-2, 0}, - { 0x1.4953ap+0, -0x1.e1fdeap-25, -0x1.7418bp-2, 0}, - { 0x1.3c995cp+0, -0x1.e8ff9p-25, -0x1.39de92p-2, 0}, - { 0x1.30d19p+0, 0x1.910c94p-25, -0x1.01d9cp-2, 0}, - { 0x1.25e228p+0, -0x1.3d1c58p-26, -0x1.97c1d2p-3, 0}, - { 0x1.1bb4a4p+0, 0x1.434688p-25, -0x1.2f9e3ap-3, 0}, - { 0x1.12359p+0, -0x1.eea348p-25, -0x1.960cbcp-4, 0}, - { 0x1.0953f4p+0, 0x1.9900a8p-28, -0x1.a6f9dcp-5, 0}, - { 0x1p+0, 0x0p+0, 0x0p+0, 0}, - { 0x1.e608dp-1, -0x1.32dc2ap-28, 0x1.338caap-4, 0}, - { 0x1.ca4b32p-1, -0x1.fb2acp-30, 0x1.476a96p-3, 0}, - { 0x1.b20366p-1, -0x1.12a064p-26, 0x1.e840b4p-3, 0}, - { 0x1.9c2d16p-1, 0x1.d0d516p-28, 0x1.40646p-2, 0}, - { 0x1.886e6p-1, 0x1.bc20f6p-28, 0x1.88e9c2p-2, 0}, - { 0x1.767ddp-1, -0x1.5596f4p-26, 0x1.ce0a44p-2, 0}, - }, - .poly = { -0x1.712b70p-2, 0x1.ecabf4p-2, - -0x1.71547ap-1, 0x1.715476p+0 } -}; +/* See tools/v_log2f.sollya for the algorithm used to generate these + coefficients. */ +const struct v_log2f_data __v_log2f_data + = {.poly = {0x1.715476p0f, /* (float)(1 / ln(2)). */ + -0x1.715458p-1f, 0x1.ec701cp-2f, -0x1.7171a4p-2f, 0x1.27a0b8p-2f, + -0x1.e5143ep-3f, 0x1.9d8ecap-3f, -0x1.c675bp-3f, 0x1.9e495p-3f}}; diff --git a/pl/math/vn_log2f_2u5.c b/pl/math/vn_log2f_2u5.c new file mode 100644 index 0000000..b1e491a --- /dev/null +++ b/pl/math/vn_log2f_2u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_log2f. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_log2f, _ZGVnN4v_log2f) +#include "v_log2f_2u5.c" +#endif diff --git a/pl/math/vn_log2f_2u6.c b/pl/math/vn_log2f_2u6.c deleted file mode 100644 index 18effaf..0000000 --- a/pl/math/vn_log2f_2u6.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_log2f. - * - * Copyright (c) 2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_log2f, _ZGVnN4v_log2f) -#include "v_log2f_2u6.c" -#endif -- cgit v1.2.3 From d05594e6718e6d86959c823bea4f019dea878bcb Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Mon, 19 Dec 2022 12:34:51 +0000 Subject: pl/math: Replace WANT_ERRNO with WANT_SIMD_EXCEPT for Neon fenv We were previously misusing the WANT_ERRNO build flag. This is now replaced everywhere appropriate with WANT_SIMD_EXCEPT. A small number of vector routines get fp exceptions right with no modification - the tests have been updated to track this. 
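For readers unfamiliar with the guard, the pattern applied throughout this
patch can be sketched in a few lines of plain C. This is an illustration
only, not the library's code: the f32x4 type and the toy coshf body are
stand-ins for the real v_f32_t types and helpers, and the overflow bound is
taken from the coshf test intervals in the diffs below.

#include <math.h>
#include <stdint.h>
#include <string.h>

typedef struct { float v[4]; } f32x4;

static uint32_t
abs_bits (float x)
{
  uint32_t u;
  memcpy (&u, &x, sizeof u);
  return u & 0x7fffffff;
}

static f32x4
vec_coshf_sketch (f32x4 x)
{
  f32x4 y;
#if WANT_SIMD_EXCEPT
  /* If fp exceptions must match the scalar routine, any special lane forces
     a scalar fallback for all lanes, so the vector fast path itself never
     raises a spurious exception and never misses a required one.  */
  for (int i = 0; i < 4; i++)
    if (abs_bits (x.v[i]) >= 0x42ad496c) /* |x| >= 0x1.5a92d8p+6: expf overflows.  */
      {
	for (int j = 0; j < 4; j++)
	  y.v[j] = coshf (x.v[j]); /* Scalar call sets fenv correctly.  */
	return y;
      }
#endif
  for (int i = 0; i < 4; i++)
    {
      /* Fast path: cosh(x) = (e^|x| + e^-|x|) / 2.  */
      float t = expf (fabsf (x.v[i]));
      y.v[i] = 0.5f * t + 0.5f / t;
    }
  return y;
}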
--- pl/math/Dir.mk | 1 - pl/math/include/pl_test.h | 1 + pl/math/math_config.h | 5 +++++ pl/math/test/pl_test.h | 3 ++- pl/math/v_asinh_2u5.c | 22 +++++++++++----------- pl/math/v_asinhf_2u7.c | 4 ++-- pl/math/v_atanhf_3u1.c | 4 ++-- pl/math/v_cbrtf_1u5.c | 2 +- pl/math/v_cosh_2u.c | 2 +- pl/math/v_coshf_2u4.c | 12 ++++++------ pl/math/v_expm1_2u5.c | 10 +++++----- pl/math/v_expm1f_1u6.c | 10 +++++----- pl/math/v_log10_2u5.c | 1 + pl/math/v_log10f_3u5.c | 1 + pl/math/v_log1p_2u5.c | 4 ++-- pl/math/v_log1pf_2u1.c | 8 ++++---- pl/math/v_log2_3u.c | 1 + pl/math/v_log2f_2u5.c | 2 +- pl/math/v_sinh_3u.c | 2 +- pl/math/v_sinhf_2u3.c | 2 +- pl/math/v_tanf_3u2.c | 16 ++++++++-------- pl/math/v_tanhf_2u6.c | 8 ++++---- 22 files changed, 65 insertions(+), 56 deletions(-) diff --git a/pl/math/Dir.mk b/pl/math/Dir.mk index 1433b7b..60814f8 100644 --- a/pl/math/Dir.mk +++ b/pl/math/Dir.mk @@ -200,7 +200,6 @@ $(ulp-itvs): $(ulp-itvs-alias) $(ulp-itvs-noalias) cat $^ | sort -u | sed "s/PL_TEST_INTERVAL //g" > $@ check-pl/math-ulp: $(math-tools) $(ulp-lims) $(ulp-aliases) $(fenv-exps) $(ulp-itvs) - WANT_ERRNO=$(WANT_ERRNO) \ WANT_SVE_MATH=$(WANT_SVE_MATH) \ ULPFLAGS="$(math-ulpflags)" \ LIMITS=../../../$(ulp-lims) \ diff --git a/pl/math/include/pl_test.h b/pl/math/include/pl_test.h index e578a0d..30d39c1 100644 --- a/pl/math/include/pl_test.h +++ b/pl/math/include/pl_test.h @@ -20,6 +20,7 @@ build flags - defer expansion by one pass to allow those flags to be expanded properly. */ #define PL_TEST_EXPECT_FENV(f, e) +#define PL_TEST_EXPECT_FENV_ALWAYS(f) #define PL_TEST_INTERVAL(f, lo, hi, n) #define PL_TEST_INTERVAL_C(f, lo, hi, n, c) diff --git a/pl/math/math_config.h b/pl/math/math_config.h index 81da863..90d571c 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -24,6 +24,11 @@ set to 1 if math.h has (math_errhandling & MATH_ERRNO) != 0. */ # define WANT_ERRNO 0 #endif +#ifndef WANT_SIMD_EXCEPT +/* If defined to 1, trigger fp exceptions in vector routines, consistently with + behaviour expected from the corresponding scalar routine. */ +#define WANT_SIMD_EXCEPT 0 +#endif /* Compiler can inline round as a single instruction. */ #ifndef HAVE_FAST_ROUND diff --git a/pl/math/test/pl_test.h b/pl/math/test/pl_test.h index 9bbcaf1..158db5f 100644 --- a/pl/math/test/pl_test.h +++ b/pl/math/test/pl_test.h @@ -10,7 +10,7 @@ on PL_TEST_ULP to add EXPECT_FENV to all scalar routines. */ #if !(V_SUPPORTED || SV_SUPPORTED) #define PL_TEST_ULP(f, l) \ - PL_TEST_EXPECT_FENV (f, 1) \ + PL_TEST_EXPECT_FENV_ALWAYS (f) \ PL_TEST_ULP f l #else #define PL_TEST_ULP(f, l) PL_TEST_ULP f l @@ -27,6 +27,7 @@ #define PL_TEST_EXPECT_FENV(f, e) PL_TEST_EXPECT_FENV_ (f, e) #define PL_TEST_EXPECT_FENV_(f, e) PL_TEST_EXPECT_FENV_##e (f) #define PL_TEST_EXPECT_FENV_1(f) PL_TEST_EXPECT_FENV_ENABLED f +#define PL_TEST_EXPECT_FENV_ALWAYS(f) PL_TEST_EXPECT_FENV (f, 1) #define PL_TEST_INTERVAL(f, lo, hi, n) PL_TEST_INTERVAL f lo hi n #define PL_TEST_INTERVAL_C(f, lo, hi, n, c) PL_TEST_INTERVAL f lo hi n c diff --git a/pl/math/v_asinh_2u5.c b/pl/math/v_asinh_2u5.c index 4eeec4a..04d369d 100644 --- a/pl/math/v_asinh_2u5.c +++ b/pl/math/v_asinh_2u5.c @@ -98,19 +98,19 @@ VPCS_ATTR v_f64_t V_NAME (asinh) (v_f64_t x) v_u64_t gt1 = v_cond_u64 (top12 >= OneTop); v_u64_t special = v_cond_u64 (top12 >= HugeBound); -#if WANT_ERRNO +#if WANT_SIMD_EXCEPT v_u64_t tiny = v_cond_u64 (top12 < TinyBound); special |= tiny; #endif /* Option 1: |x| >= 1. Compute asinh(x) according by asinh(x) = log(x + sqrt(x^2 + 1)). 
- If WANT_ERRNO is enabled, sidestep special values, which will overflow, by - setting special lanes to 1. These will be fixed later. */ + If WANT_SIMD_EXCEPT is enabled, sidestep special values, which will + overflow, by setting special lanes to 1. These will be fixed later. */ v_f64_t option_1 = v_f64 (0); if (likely (v_any_u64 (gt1))) { -#if WANT_ERRNO +#if WANT_SIMD_EXCEPT v_f64_t xm = v_sel_f64 (special, v_f64 (1), ax); #else v_f64_t xm = ax; @@ -120,16 +120,16 @@ VPCS_ATTR v_f64_t V_NAME (asinh) (v_f64_t x) /* Option 2: |x| < 1. Compute asinh(x) using a polynomial. - If WANT_ERRNO is enabled, sidestep special lanes, which will overflow, and - tiny lanes, which will underflow, by setting them to 0. They will be fixed - later, either by selecting x or falling back to the scalar special-case. - The largest observed error in this region is 1.47 ULPs: + If WANT_SIMD_EXCEPT is enabled, sidestep special lanes, which will + overflow, and tiny lanes, which will underflow, by setting them to 0. They + will be fixed later, either by selecting x or falling back to the scalar + special-case. The largest observed error in this region is 1.47 ULPs: __v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1 want 0x1.c1d6bf874019cp-1. */ v_f64_t option_2 = v_f64 (0); if (likely (v_any_u64 (~gt1))) { -#if WANT_ERRNO +#if WANT_SIMD_EXCEPT ax = v_sel_f64 (tiny | gt1, v_f64 (0), ax); #endif v_f64_t x2 = ax * ax; @@ -138,7 +138,7 @@ VPCS_ATTR v_f64_t V_NAME (asinh) (v_f64_t x) v_f64_t z8 = z4 * z4; v_f64_t p = ESTRIN_17 (x2, z2, z4, z8, z8 * z8, C); option_2 = v_fma_f64 (p, x2 * ax, ax); -#if WANT_ERRNO +#if WANT_SIMD_EXCEPT option_2 = v_sel_f64 (tiny, x, option_2); #endif } @@ -156,7 +156,7 @@ VPCS_ALIAS PL_SIG (V, D, 1, asinh, -10.0, 10.0) PL_TEST_ULP (V_NAME (asinh), 1.54) -PL_TEST_EXPECT_FENV (V_NAME (asinh), WANT_ERRNO) +PL_TEST_EXPECT_FENV (V_NAME (asinh), WANT_SIMD_EXCEPT) /* Test vector asinh 3 times, with control lane < 1, > 1 and special. Ensures the v_sel is choosing the right option in all cases. */ #define V_ASINH_INTERVAL(lo, hi, n) \ diff --git a/pl/math/v_asinhf_2u7.c b/pl/math/v_asinhf_2u7.c index 79bf80f..4710a22 100644 --- a/pl/math/v_asinhf_2u7.c +++ b/pl/math/v_asinhf_2u7.c @@ -35,7 +35,7 @@ VPCS_ATTR v_f32_t V_NAME (asinhf) (v_f32_t x) v_f32_t ax = v_as_f32_u32 (iax); v_u32_t special = v_cond_u32 (iax >= BigBound); -#if WANT_ERRNO +#if WANT_SIMD_EXCEPT /* Sidestep tiny and large values to avoid inadvertently triggering under/overflow. */ special |= v_cond_u32 (iax < TinyBound); @@ -57,7 +57,7 @@ VPCS_ALIAS PL_SIG (V, F, 1, asinh, -10.0, 10.0) PL_TEST_ULP (V_NAME (asinhf), 2.17) -PL_TEST_EXPECT_FENV (V_NAME (asinhf), WANT_ERRNO) +PL_TEST_EXPECT_FENV (V_NAME (asinhf), WANT_SIMD_EXCEPT) PL_TEST_INTERVAL (V_NAME (asinhf), 0, 0x1p-12, 40000) PL_TEST_INTERVAL (V_NAME (asinhf), 0x1p-12, 1.0, 40000) PL_TEST_INTERVAL (V_NAME (asinhf), 1.0, 0x1p11, 40000) diff --git a/pl/math/v_atanhf_3u1.c b/pl/math/v_atanhf_3u1.c index 68dbdf6..7a027fc 100644 --- a/pl/math/v_atanhf_3u1.c +++ b/pl/math/v_atanhf_3u1.c @@ -31,7 +31,7 @@ VPCS_ATTR v_f32_t V_NAME (atanhf) (v_f32_t x) v_f32_t ax = v_as_f32_u32 (iax); -#if WANT_ERRNO +#if WANT_SIMD_EXCEPT v_u32_t special = v_cond_u32 ((iax >= One) | (iax <= TinyBound)); /* Side-step special cases by setting those lanes to 0, which will trigger no exceptions. These will be fixed up later. 
*/ @@ -51,7 +51,7 @@ VPCS_ALIAS PL_SIG (V, F, 1, atanh, -1.0, 1.0) PL_TEST_ULP (V_NAME (atanhf), 2.59) -PL_TEST_EXPECT_FENV (V_NAME (atanhf), WANT_ERRNO) +PL_TEST_EXPECT_FENV (V_NAME (atanhf), WANT_SIMD_EXCEPT) PL_TEST_INTERVAL_C (V_NAME (atanhf), 0, 0x1p-12, 500, 0) PL_TEST_INTERVAL_C (V_NAME (atanhf), 0x1p-12, 1, 200000, 0) PL_TEST_INTERVAL_C (V_NAME (atanhf), 1, inf, 1000, 0) diff --git a/pl/math/v_cbrtf_1u5.c b/pl/math/v_cbrtf_1u5.c index cffc488..38c20e3 100644 --- a/pl/math/v_cbrtf_1u5.c +++ b/pl/math/v_cbrtf_1u5.c @@ -89,7 +89,7 @@ VPCS_ALIAS PL_SIG (V, F, 1, cbrt, -10.0, 10.0) PL_TEST_ULP (V_NAME (cbrtf), 1.03) -PL_TEST_EXPECT_FENV (V_NAME (cbrtf), WANT_ERRNO) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (cbrtf)) PL_TEST_INTERVAL (V_NAME (cbrtf), 0, inf, 1000000) PL_TEST_INTERVAL (V_NAME (cbrtf), -0, -inf, 1000000) #endif diff --git a/pl/math/v_cosh_2u.c b/pl/math/v_cosh_2u.c index 20d5b38..67390d4 100644 --- a/pl/math/v_cosh_2u.c +++ b/pl/math/v_cosh_2u.c @@ -87,7 +87,7 @@ VPCS_ALIAS PL_SIG (V, D, 1, cosh, -10.0, 10.0) PL_TEST_ULP (V_NAME (cosh), 1.43) -PL_TEST_EXPECT_FENV (V_NAME (cosh), WANT_ERRNO) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (cosh)) PL_TEST_INTERVAL (V_NAME (cosh), 0, 0x1.6p9, 100000) PL_TEST_INTERVAL (V_NAME (cosh), -0, -0x1.6p9, 100000) PL_TEST_INTERVAL (V_NAME (cosh), 0x1.6p9, inf, 1000) diff --git a/pl/math/v_coshf_2u4.c b/pl/math/v_coshf_2u4.c index 6ea6eb3..bee46ed 100644 --- a/pl/math/v_coshf_2u4.c +++ b/pl/math/v_coshf_2u4.c @@ -30,10 +30,10 @@ VPCS_ATTR v_f32_t V_NAME (coshf) (v_f32_t x) v_f32_t ax = v_as_f32_u32 (iax); v_u32_t special = v_cond_u32 (iax >= SpecialBound); -#if WANT_ERRNO - /* If errno is to be set correctly, fall back to the scalar variant for all - inputs if any input is a special value or above the bound at which expf - overflows. */ +#if WANT_SIMD_EXCEPT + /* If fp exceptions are to be triggered correctly, fall back to the scalar + variant for all inputs if any input is a special value or above the bound + at which expf overflows. */ if (unlikely (v_any_u32 (special))) return v_call_f32 (coshf, x, x, v_u32 (-1)); @@ -49,7 +49,7 @@ VPCS_ATTR v_f32_t V_NAME (coshf) (v_f32_t x) v_f32_t t = V_NAME (expf) (ax); v_f32_t y = t * Half + Half / t; -#if WANT_ERRNO +#if WANT_SIMD_EXCEPT if (unlikely (v_any_u32 (tiny))) return v_sel_f32 (tiny, v_f32 (1), y); #else @@ -63,7 +63,7 @@ VPCS_ALIAS PL_SIG (V, F, 1, cosh, -10.0, 10.0) PL_TEST_ULP (V_NAME (coshf), 1.89) -PL_TEST_EXPECT_FENV (V_NAME (coshf), WANT_ERRNO) +PL_TEST_EXPECT_FENV (V_NAME (coshf), WANT_SIMD_EXCEPT) PL_TEST_INTERVAL (V_NAME (coshf), 0, 0x1p-63, 100) PL_TEST_INTERVAL (V_NAME (coshf), 0, 0x1.5a92d8p+6, 80000) PL_TEST_INTERVAL (V_NAME (coshf), 0x1.5a92d8p+6, inf, 2000) diff --git a/pl/math/v_expm1_2u5.c b/pl/math/v_expm1_2u5.c index 3f63760..879fcb6 100644 --- a/pl/math/v_expm1_2u5.c +++ b/pl/math/v_expm1_2u5.c @@ -55,9 +55,9 @@ v_f64_t V_NAME (expm1) (v_f64_t x) v_u64_t ix = v_as_u64_f64 (x); v_u64_t ax = ix & AbsMask; -#if WANT_ERRNO - /* If errno is to be set correctly, fall back to the scalar variant for all - lanes if any of them should trigger an exception. */ +#if WANT_SIMD_EXCEPT + /* If fp exceptions are to be triggered correctly, fall back to the scalar + variant for all lanes if any of them should trigger an exception. */ v_u64_t special = v_cond_u64 ((ax >= SpecialBound) | (ax <= TinyBound)); if (unlikely (v_any_u64 (special))) return v_call_f64 (expm1, x, x, v_u64 (-1)); @@ -92,7 +92,7 @@ v_f64_t V_NAME (expm1) (v_f64_t x) /* expm1(x) ~= p * t + (t - 1). 
*/ v_f64_t y = v_fma_f64 (p, t, t - 1); -#if !WANT_ERRNO +#if !WANT_SIMD_EXCEPT if (unlikely (v_any_u64 (special))) return v_call_f64 (expm1, x, y, special); #endif @@ -103,7 +103,7 @@ VPCS_ALIAS PL_SIG (V, D, 1, expm1, -9.9, 9.9) PL_TEST_ULP (V_NAME (expm1), 1.68) -PL_TEST_EXPECT_FENV (V_NAME (expm1), WANT_ERRNO) +PL_TEST_EXPECT_FENV (V_NAME (expm1), WANT_SIMD_EXCEPT) PL_TEST_INTERVAL (V_NAME (expm1), 0, 0x1p-51, 1000) PL_TEST_INTERVAL (V_NAME (expm1), -0, -0x1p-51, 1000) PL_TEST_INTERVAL (V_NAME (expm1), 0x1p-51, 0x1.63108c75a1937p+9, 100000) diff --git a/pl/math/v_expm1f_1u6.c b/pl/math/v_expm1f_1u6.c index 9977b8e..7a59ddc 100644 --- a/pl/math/v_expm1f_1u6.c +++ b/pl/math/v_expm1f_1u6.c @@ -34,9 +34,9 @@ v_f32_t V_NAME (expm1f) (v_f32_t x) v_u32_t ix = v_as_u32_f32 (x); v_u32_t ax = ix & AbsMask; -#if WANT_ERRNO - /* If errno is to be set correctly, fall back to the scalar variant for all - lanes if any of them should trigger an exception. */ +#if WANT_SIMD_EXCEPT + /* If fp exceptions are to be triggered correctly, fall back to the scalar + variant for all lanes if any of them should trigger an exception. */ v_u32_t special = v_cond_u32 ((ax >= SpecialBound) | (ix == 0x80000000) | (ax < TinyBound)); if (unlikely (v_any_u32 (special))) @@ -75,7 +75,7 @@ v_f32_t V_NAME (expm1f) (v_f32_t x) /* expm1(x) ~= p * t + (t - 1). */ v_f32_t y = v_fma_f32 (p, t, t - 1); -#if !WANT_ERRNO +#if !WANT_SIMD_EXCEPT if (unlikely (v_any_u32 (special))) return v_call_f32 (expm1f, x, y, special); #endif @@ -86,7 +86,7 @@ VPCS_ALIAS PL_SIG (V, F, 1, expm1, -9.9, 9.9) PL_TEST_ULP (V_NAME (expm1f), 1.02) -PL_TEST_EXPECT_FENV (V_NAME (expm1f), WANT_ERRNO) +PL_TEST_EXPECT_FENV (V_NAME (expm1f), WANT_SIMD_EXCEPT) PL_TEST_INTERVAL (V_NAME (expm1f), 0, 0x1p-23, 1000) PL_TEST_INTERVAL (V_NAME (expm1f), -0, -0x1p-23, 1000) PL_TEST_INTERVAL (V_NAME (expm1f), 0x1p-23, 0x1.644716p6, 1000000) diff --git a/pl/math/v_log10_2u5.c b/pl/math/v_log10_2u5.c index 6fb7447..e8d8021 100644 --- a/pl/math/v_log10_2u5.c +++ b/pl/math/v_log10_2u5.c @@ -103,6 +103,7 @@ VPCS_ALIAS PL_SIG (V, D, 1, log10, 0.01, 11.1) PL_TEST_ULP (V_NAME (log10), 1.97) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (log10)) PL_TEST_INTERVAL (V_NAME (log10), 0, 0xffff000000000000, 10000) PL_TEST_INTERVAL (V_NAME (log10), 0x1p-4, 0x1p4, 400000) PL_TEST_INTERVAL (V_NAME (log10), 0, inf, 400000) diff --git a/pl/math/v_log10f_3u5.c b/pl/math/v_log10f_3u5.c index 4c22540..a032fa9 100644 --- a/pl/math/v_log10f_3u5.c +++ b/pl/math/v_log10f_3u5.c @@ -76,6 +76,7 @@ VPCS_ALIAS PL_SIG (V, F, 1, log10, 0.01, 11.1) PL_TEST_ULP (V_NAME (log10f), 2.81) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (log10f)) PL_TEST_INTERVAL (V_NAME (log10f), 0, 0xffff0000, 10000) PL_TEST_INTERVAL (V_NAME (log10f), 0x1p-4, 0x1p4, 500000) #endif diff --git a/pl/math/v_log1p_2u5.c b/pl/math/v_log1p_2u5.c index ccde382..7ff948f 100644 --- a/pl/math/v_log1p_2u5.c +++ b/pl/math/v_log1p_2u5.c @@ -51,7 +51,7 @@ VPCS_ATTR v_f64_t V_NAME (log1p) (v_f64_t x) = v_cond_u64 ((ia >= v_u64 (0x7ff0000000000000)) | (ix >= 0xbff0000000000000) | (ix == 0x8000000000000000)); -#if WANT_ERRNO +#if WANT_SIMD_EXCEPT if (unlikely (v_any_u64 (special))) x = v_sel_f64 (special, v_f64 (0), x); #endif @@ -107,7 +107,7 @@ VPCS_ALIAS PL_SIG (V, D, 1, log1p, -0.9, 10.0) PL_TEST_ULP (V_NAME (log1p), 1.97) -PL_TEST_EXPECT_FENV (V_NAME (log1p), WANT_ERRNO) +PL_TEST_EXPECT_FENV (V_NAME (log1p), WANT_SIMD_EXCEPT) PL_TEST_INTERVAL (V_NAME (log1p), -10.0, 10.0, 10000) PL_TEST_INTERVAL (V_NAME (log1p), 0.0, 0x1p-23, 50000) PL_TEST_INTERVAL (V_NAME 
(log1p), 0x1p-23, 0.001, 50000) diff --git a/pl/math/v_log1pf_2u1.c b/pl/math/v_log1pf_2u1.c index 96ac02d..ab5e7b7 100644 --- a/pl/math/v_log1pf_2u1.c +++ b/pl/math/v_log1pf_2u1.c @@ -68,7 +68,7 @@ handle_special (float x) /* x == -Inf => log1pf(x) = NaN. x < -1.0 => log1pf(x) = NaN. x == +/-NaN => log1pf(x) = NaN. */ -#if WANT_ERRNO +#if WANT_SIMD_EXCEPT return __math_invalidf (asfloat (ia)); #else return NAN; @@ -77,7 +77,7 @@ handle_special (float x) if (ix == 0xbf800000) { /* x == -1.0 => log1pf(x) = -Inf. */ -#if WANT_ERRNO +#if WANT_SIMD_EXCEPT return __math_divzerof (ix); #else return -INFINITY; @@ -100,7 +100,7 @@ VPCS_ATTR v_f32_t V_NAME (log1pf) (v_f32_t x) | v_cond_u32 (ix >= MinusOne); v_f32_t special_arg = x; -#if WANT_ERRNO +#if WANT_SIMD_EXCEPT if (unlikely (v_any_u32 (special_cases))) /* Side-step special lanes so fenv exceptions are not triggered inadvertently. */ @@ -147,7 +147,7 @@ VPCS_ALIAS PL_SIG (V, F, 1, log1p, -0.9, 10.0) PL_TEST_ULP (V_NAME (log1pf), 1.53) -PL_TEST_EXPECT_FENV (V_NAME (log1pf), WANT_ERRNO) +PL_TEST_EXPECT_FENV (V_NAME (log1pf), WANT_SIMD_EXCEPT) PL_TEST_INTERVAL (V_NAME (log1pf), -10.0, 10.0, 10000) PL_TEST_INTERVAL (V_NAME (log1pf), 0.0, 0x1p-23, 30000) PL_TEST_INTERVAL (V_NAME (log1pf), 0x1p-23, 0.001, 50000) diff --git a/pl/math/v_log2_3u.c b/pl/math/v_log2_3u.c index 7dca684..e0a854f 100644 --- a/pl/math/v_log2_3u.c +++ b/pl/math/v_log2_3u.c @@ -90,6 +90,7 @@ VPCS_ALIAS PL_SIG (V, D, 1, log2, 0.01, 11.1) PL_TEST_ULP (V_NAME (log2), 2.10) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (log2)) PL_TEST_INTERVAL (V_NAME (log2), -0.0, -0x1p126, 100) PL_TEST_INTERVAL (V_NAME (log2), 0x1p-149, 0x1p-126, 4000) PL_TEST_INTERVAL (V_NAME (log2), 0x1p-126, 0x1p-23, 50000) diff --git a/pl/math/v_log2f_2u5.c b/pl/math/v_log2f_2u5.c index 343185c..f4fa0ab 100644 --- a/pl/math/v_log2f_2u5.c +++ b/pl/math/v_log2f_2u5.c @@ -58,7 +58,7 @@ VPCS_ALIAS PL_SIG (V, F, 1, log2, 0.01, 11.1) PL_TEST_ULP (V_NAME (log2f), 1.99) -PL_TEST_EXPECT_FENV (V_NAME (log2f), WANT_ERRNO) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (log2f)) PL_TEST_INTERVAL (V_NAME (log2f), -0.0, -0x1p126, 100) PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-149, 0x1p-126, 4000) PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-126, 0x1p-23, 50000) diff --git a/pl/math/v_sinh_3u.c b/pl/math/v_sinh_3u.c index 019cf84..37d7e45 100644 --- a/pl/math/v_sinh_3u.c +++ b/pl/math/v_sinh_3u.c @@ -46,7 +46,7 @@ VPCS_ALIAS PL_SIG (V, D, 1, sinh, -10.0, 10.0) PL_TEST_ULP (V_NAME (sinh), 2.08) -PL_TEST_EXPECT_FENV (V_NAME (sinh), WANT_ERRNO) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (sinh)) PL_TEST_INTERVAL (V_NAME (sinh), 0, 0x1p-51, 100) PL_TEST_INTERVAL (V_NAME (sinh), -0, -0x1p-51, 100) PL_TEST_INTERVAL (V_NAME (sinh), 0x1p-51, 0x1.62e42fefa39fp+9, 100000) diff --git a/pl/math/v_sinhf_2u3.c b/pl/math/v_sinhf_2u3.c index a8bf5ae..50fc786 100644 --- a/pl/math/v_sinhf_2u3.c +++ b/pl/math/v_sinhf_2u3.c @@ -45,7 +45,7 @@ VPCS_ALIAS PL_SIG (V, F, 1, sinh, -10.0, 10.0) PL_TEST_ULP (V_NAME (sinhf), 1.76) -PL_TEST_EXPECT_FENV (V_NAME (sinhf), WANT_ERRNO) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (sinhf)) PL_TEST_INTERVAL (V_NAME (sinhf), 0, 0x1.62e43p+6, 100000) PL_TEST_INTERVAL (V_NAME (sinhf), -0, -0x1.62e43p+6, 100000) PL_TEST_INTERVAL (V_NAME (sinhf), 0x1.62e43p+6, 0x1.65a9fap+6, 100) diff --git a/pl/math/v_tanf_3u2.c b/pl/math/v_tanf_3u2.c index 648690d..6125319 100644 --- a/pl/math/v_tanf_3u2.c +++ b/pl/math/v_tanf_3u2.c @@ -37,9 +37,9 @@ static inline v_f32_t eval_poly (v_f32_t z) { v_f32_t z2 = z * z; -#if WANT_ERRNO - /* Tiny z (<= 0x1p-31) will underflow 
when calculating z^4. If errno is to be - set correctly, sidestep this by fixing such lanes to 0. */ +#if WANT_SIMD_EXCEPT + /* Tiny z (<= 0x1p-31) will underflow when calculating z^4. If fp exceptions + are to be triggered correctly, sidestep this by fixing such lanes to 0. */ v_u32_t will_uflow = v_cond_u32 ((v_as_u32_f32 (z) & AbsMask) <= TinyBound); if (unlikely (v_any_u32 (will_uflow))) z2 = v_sel_f32 (will_uflow, v_f32 (0), z2); @@ -61,10 +61,10 @@ v_f32_t V_NAME (tanf) (v_f32_t x) /* iax >= RangeVal means x, if not inf or NaN, is too large to perform fast regression. */ -#if WANT_ERRNO - /* If errno is to be set correctly, also special-case tiny input, as this will - load to overflow later. Fix any special lanes to 1 to prevent any - exceptions being triggered. */ +#if WANT_SIMD_EXCEPT + /* If fp exceptions are to be triggered correctly, also special-case tiny + input, as this will load to overflow later. Fix any special lanes to 1 to + prevent any exceptions being triggered. */ v_u32_t special = v_cond_u32 (iax - TinyBound >= RangeVal - TinyBound); if (unlikely (v_any_u32 (special))) x = v_sel_f32 (special, v_f32 (1.0f), x); @@ -119,7 +119,7 @@ VPCS_ALIAS PL_SIG (V, F, 1, tan, -3.1, 3.1) PL_TEST_ULP (V_NAME (tanf), 2.7) -PL_TEST_EXPECT_FENV (V_NAME (tanf), WANT_ERRNO) +PL_TEST_EXPECT_FENV (V_NAME (tanf), WANT_SIMD_EXCEPT) PL_TEST_INTERVAL (V_NAME (tanf), -0.0, -0x1p126, 100) PL_TEST_INTERVAL (V_NAME (tanf), 0x1p-149, 0x1p-126, 4000) PL_TEST_INTERVAL (V_NAME (tanf), 0x1p-126, 0x1p-23, 50000) diff --git a/pl/math/v_tanhf_2u6.c b/pl/math/v_tanhf_2u6.c index c10be40..dedc085 100644 --- a/pl/math/v_tanhf_2u6.c +++ b/pl/math/v_tanhf_2u6.c @@ -69,9 +69,9 @@ VPCS_ATTR v_f32_t V_NAME (tanhf) (v_f32_t x) v_u32_t is_boring = v_cond_u32 (iax > BoringBound); v_f32_t boring = v_as_f32_u32 (sign | One); -#if WANT_ERRNO - /* If errno needs to be set properly, set all special and boring lanes to 1, - which will trigger no exceptions, and fix them up later. */ +#if WANT_SIMD_EXCEPT + /* If fp exceptions are to be triggered properly, set all special and boring + lanes to 1, which will trigger no exceptions, and fix them up later. */ v_u32_t special = v_cond_u32 ((iax > 0x7f800000) | (iax < 0x34000000)); ix = v_sel_u32 (is_boring, v_u32 (One), ix); if (unlikely (v_any_u32 (special))) @@ -92,7 +92,7 @@ VPCS_ALIAS PL_SIG (V, F, 1, tanh, -10.0, 10.0) PL_TEST_ULP (V_NAME (tanhf), 2.09) -PL_TEST_EXPECT_FENV (V_NAME (tanhf), WANT_ERRNO) +PL_TEST_EXPECT_FENV (V_NAME (tanhf), WANT_SIMD_EXCEPT) PL_TEST_INTERVAL (V_NAME (tanhf), 0, 0x1p-23, 1000) PL_TEST_INTERVAL (V_NAME (tanhf), -0, -0x1p-23, 1000) PL_TEST_INTERVAL (V_NAME (tanhf), 0x1p-23, 0x1.205966p+3, 100000) -- cgit v1.2.3 From a5fc3ed57ba4bc6df2e582f6a51c5fcc8e4459cd Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Mon, 19 Dec 2022 12:45:41 +0000 Subject: pl/math: Update ULP threshold for Neon asinh New max observed - updated filenames, comments and runulp threshold. 
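As a reminder of how these thresholds are read, the integer ULP distance
between the "got" and "want" values quoted in such commits can be computed
with a helper like the sketch below. This is illustrative only, not the
runulp tool's implementation, and it only yields whole ULPs: the fractional
figures (e.g. 3.29 ULP) come from comparing against a higher-precision
reference rather than the nearest representable double.

#include <stdint.h>
#include <string.h>

/* Map the IEEE-754 bit pattern of x to an integer that increases
   monotonically with the value of x across negative and positive doubles
   (-0.0 and +0.0 both map to 0).  */
static int64_t
ordered (double x)
{
  int64_t i;
  memcpy (&i, &x, sizeof i);
  return i < 0 ? INT64_MIN - i : i;
}

/* Number of representable doubles between got and want.  For the asinh
   example quoted in this patch, got 0x1.ffffcfd0e234fp-1 and want
   0x1.ffffcfd0e2352p-1 are 3 ULP apart.  */
static uint64_t
ulp_dist (double got, double want)
{
  uint64_t a = (uint64_t) ordered (got), b = (uint64_t) ordered (want);
  return a > b ? a - b : b - a;
}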
--- pl/math/s_asinh_2u5.c | 6 -- pl/math/s_asinh_3u5.c | 6 ++ pl/math/v_asinh_2u5.c | 174 ------------------------------------------------- pl/math/v_asinh_3u5.c | 174 +++++++++++++++++++++++++++++++++++++++++++++++++ pl/math/vn_asinh_2u5.c | 12 ---- pl/math/vn_asinh_3u5.c | 12 ++++ 6 files changed, 192 insertions(+), 192 deletions(-) delete mode 100644 pl/math/s_asinh_2u5.c create mode 100644 pl/math/s_asinh_3u5.c delete mode 100644 pl/math/v_asinh_2u5.c create mode 100644 pl/math/v_asinh_3u5.c delete mode 100644 pl/math/vn_asinh_2u5.c create mode 100644 pl/math/vn_asinh_3u5.c diff --git a/pl/math/s_asinh_2u5.c b/pl/math/s_asinh_2u5.c deleted file mode 100644 index 6da30bd..0000000 --- a/pl/math/s_asinh_2u5.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_asinh_2u5.c" diff --git a/pl/math/s_asinh_3u5.c b/pl/math/s_asinh_3u5.c new file mode 100644 index 0000000..d767100 --- /dev/null +++ b/pl/math/s_asinh_3u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_asinh_3u5.c" diff --git a/pl/math/v_asinh_2u5.c b/pl/math/v_asinh_2u5.c deleted file mode 100644 index 04d369d..0000000 --- a/pl/math/v_asinh_2u5.c +++ /dev/null @@ -1,174 +0,0 @@ -/* - * Double-precision vector asinh(x) function. - * Copyright (c) 2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#include "estrin.h" -#include "pl_sig.h" -#include "pl_test.h" - -#if V_SUPPORTED - -#define OneTop 0x3ff /* top12(asuint64(1.0f)). */ -#define HugeBound 0x5fe /* top12(asuint64(0x1p511)). */ -#define TinyBound 0x3e5 /* top12(asuint64(0x1p-26)). */ -#define AbsMask v_u64 (0x7fffffffffffffff) -#define C(i) v_f64 (__asinh_data.poly[i]) - -/* Constants & data for log. */ -#define OFF 0x3fe6000000000000 -#define Ln2 v_f64 (0x1.62e42fefa39efp-1) -#define A(i) v_f64 (__sv_log_data.poly[i]) -#define T(i) __log_data.tab[i] -#define N (1 << LOG_TABLE_BITS) - -static NOINLINE v_f64_t -special_case (v_f64_t x, v_f64_t y, v_u64_t special) -{ - return v_call_f64 (asinh, x, y, special); -} - -struct entry -{ - v_f64_t invc; - v_f64_t logc; -}; - -static inline struct entry -lookup (v_u64_t i) -{ - struct entry e; -#ifdef SCALAR - e.invc = T (i).invc; - e.logc = T (i).logc; -#else - e.invc[0] = T (i[0]).invc; - e.logc[0] = T (i[0]).logc; - e.invc[1] = T (i[1]).invc; - e.logc[1] = T (i[1]).logc; -#endif - return e; -} - -static inline v_f64_t -log_inline (v_f64_t x) -{ - /* Double-precision vector log, copied from math/v_log.c with some cosmetic - modification and special-cases removed. See that file for details of the - algorithm used. */ - v_u64_t ix = v_as_u64_f64 (x); - v_u64_t tmp = ix - OFF; - v_u64_t i = (tmp >> (52 - LOG_TABLE_BITS)) % N; - v_s64_t k = v_as_s64_u64 (tmp) >> 52; - v_u64_t iz = ix - (tmp & 0xfffULL << 52); - v_f64_t z = v_as_f64_u64 (iz); - struct entry e = lookup (i); - v_f64_t r = v_fma_f64 (z, e.invc, v_f64 (-1.0)); - v_f64_t kd = v_to_f64_s64 (k); - v_f64_t hi = v_fma_f64 (kd, Ln2, e.logc + r); - v_f64_t r2 = r * r; - v_f64_t y = v_fma_f64 (A (3), r, A (2)); - v_f64_t p = v_fma_f64 (A (1), r, A (0)); - y = v_fma_f64 (A (4), r2, y); - y = v_fma_f64 (y, r2, p); - y = v_fma_f64 (y, r2, hi); - return y; -} - -/* Double-precision implementation of vector asinh(x). 
- asinh is very sensitive around 1, so it is impractical to devise a single - low-cost algorithm which is sufficiently accurate on a wide range of input. - Instead we use two different algorithms: - asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1) if |x| >= 1 - = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise - where log(x) is an optimized log approximation, and P(x) is a polynomial - shared with the scalar routine. The greatest observed error 2.03 ULP, in - |x| >= 1: - __v_asinh(-0x1.00094e0f39574p+0) got -0x1.c3508eb6a681ep-1 - want -0x1.c3508eb6a682p-1. */ -VPCS_ATTR v_f64_t V_NAME (asinh) (v_f64_t x) -{ - v_u64_t ix = v_as_u64_f64 (x); - v_u64_t iax = ix & AbsMask; - v_f64_t ax = v_as_f64_u64 (iax); - v_u64_t top12 = iax >> 52; - - v_u64_t gt1 = v_cond_u64 (top12 >= OneTop); - v_u64_t special = v_cond_u64 (top12 >= HugeBound); - -#if WANT_SIMD_EXCEPT - v_u64_t tiny = v_cond_u64 (top12 < TinyBound); - special |= tiny; -#endif - - /* Option 1: |x| >= 1. - Compute asinh(x) according by asinh(x) = log(x + sqrt(x^2 + 1)). - If WANT_SIMD_EXCEPT is enabled, sidestep special values, which will - overflow, by setting special lanes to 1. These will be fixed later. */ - v_f64_t option_1 = v_f64 (0); - if (likely (v_any_u64 (gt1))) - { -#if WANT_SIMD_EXCEPT - v_f64_t xm = v_sel_f64 (special, v_f64 (1), ax); -#else - v_f64_t xm = ax; -#endif - option_1 = log_inline (xm + v_sqrt_f64 (xm * xm + 1)); - } - - /* Option 2: |x| < 1. - Compute asinh(x) using a polynomial. - If WANT_SIMD_EXCEPT is enabled, sidestep special lanes, which will - overflow, and tiny lanes, which will underflow, by setting them to 0. They - will be fixed later, either by selecting x or falling back to the scalar - special-case. The largest observed error in this region is 1.47 ULPs: - __v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1 - want 0x1.c1d6bf874019cp-1. */ - v_f64_t option_2 = v_f64 (0); - if (likely (v_any_u64 (~gt1))) - { -#if WANT_SIMD_EXCEPT - ax = v_sel_f64 (tiny | gt1, v_f64 (0), ax); -#endif - v_f64_t x2 = ax * ax; - v_f64_t z2 = x2 * x2; - v_f64_t z4 = z2 * z2; - v_f64_t z8 = z4 * z4; - v_f64_t p = ESTRIN_17 (x2, z2, z4, z8, z8 * z8, C); - option_2 = v_fma_f64 (p, x2 * ax, ax); -#if WANT_SIMD_EXCEPT - option_2 = v_sel_f64 (tiny, x, option_2); -#endif - } - - /* Choose the right option for each lane. */ - v_f64_t y = v_sel_f64 (gt1, option_1, option_2); - /* Copy sign. */ - y = v_as_f64_u64 (v_bsl_u64 (AbsMask, v_as_u64_f64 (y), ix)); - - if (unlikely (v_any_u64 (special))) - return special_case (x, y, special); - return y; -} -VPCS_ALIAS - -PL_SIG (V, D, 1, asinh, -10.0, 10.0) -PL_TEST_ULP (V_NAME (asinh), 1.54) -PL_TEST_EXPECT_FENV (V_NAME (asinh), WANT_SIMD_EXCEPT) -/* Test vector asinh 3 times, with control lane < 1, > 1 and special. - Ensures the v_sel is choosing the right option in all cases. 
*/ -#define V_ASINH_INTERVAL(lo, hi, n) \ - PL_TEST_INTERVAL_C (V_NAME (asinh), lo, hi, n, 0.5) \ - PL_TEST_INTERVAL_C (V_NAME (asinh), lo, hi, n, 2) \ - PL_TEST_INTERVAL_C (V_NAME (asinh), lo, hi, n, 0x1p600) -V_ASINH_INTERVAL (0, 0x1p-26, 50000) -V_ASINH_INTERVAL (0x1p-26, 1, 50000) -V_ASINH_INTERVAL (1, 0x1p511, 50000) -V_ASINH_INTERVAL (0x1p511, inf, 40000) -V_ASINH_INTERVAL (-0, -0x1p-26, 50000) -V_ASINH_INTERVAL (-0x1p-26, -1, 50000) -V_ASINH_INTERVAL (-1, -0x1p511, 50000) -V_ASINH_INTERVAL (-0x1p511, -inf, 40000) -#endif diff --git a/pl/math/v_asinh_3u5.c b/pl/math/v_asinh_3u5.c new file mode 100644 index 0000000..5294a3c --- /dev/null +++ b/pl/math/v_asinh_3u5.c @@ -0,0 +1,174 @@ +/* + * Double-precision vector asinh(x) function. + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "estrin.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define OneTop 0x3ff /* top12(asuint64(1.0f)). */ +#define HugeBound 0x5fe /* top12(asuint64(0x1p511)). */ +#define TinyBound 0x3e5 /* top12(asuint64(0x1p-26)). */ +#define AbsMask v_u64 (0x7fffffffffffffff) +#define C(i) v_f64 (__asinh_data.poly[i]) + +/* Constants & data for log. */ +#define OFF 0x3fe6000000000000 +#define Ln2 v_f64 (0x1.62e42fefa39efp-1) +#define A(i) v_f64 (__sv_log_data.poly[i]) +#define T(i) __log_data.tab[i] +#define N (1 << LOG_TABLE_BITS) + +static NOINLINE v_f64_t +special_case (v_f64_t x, v_f64_t y, v_u64_t special) +{ + return v_call_f64 (asinh, x, y, special); +} + +struct entry +{ + v_f64_t invc; + v_f64_t logc; +}; + +static inline struct entry +lookup (v_u64_t i) +{ + struct entry e; +#ifdef SCALAR + e.invc = T (i).invc; + e.logc = T (i).logc; +#else + e.invc[0] = T (i[0]).invc; + e.logc[0] = T (i[0]).logc; + e.invc[1] = T (i[1]).invc; + e.logc[1] = T (i[1]).logc; +#endif + return e; +} + +static inline v_f64_t +log_inline (v_f64_t x) +{ + /* Double-precision vector log, copied from math/v_log.c with some cosmetic + modification and special-cases removed. See that file for details of the + algorithm used. */ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t tmp = ix - OFF; + v_u64_t i = (tmp >> (52 - LOG_TABLE_BITS)) % N; + v_s64_t k = v_as_s64_u64 (tmp) >> 52; + v_u64_t iz = ix - (tmp & 0xfffULL << 52); + v_f64_t z = v_as_f64_u64 (iz); + struct entry e = lookup (i); + v_f64_t r = v_fma_f64 (z, e.invc, v_f64 (-1.0)); + v_f64_t kd = v_to_f64_s64 (k); + v_f64_t hi = v_fma_f64 (kd, Ln2, e.logc + r); + v_f64_t r2 = r * r; + v_f64_t y = v_fma_f64 (A (3), r, A (2)); + v_f64_t p = v_fma_f64 (A (1), r, A (0)); + y = v_fma_f64 (A (4), r2, y); + y = v_fma_f64 (y, r2, p); + y = v_fma_f64 (y, r2, hi); + return y; +} + +/* Double-precision implementation of vector asinh(x). + asinh is very sensitive around 1, so it is impractical to devise a single + low-cost algorithm which is sufficiently accurate on a wide range of input. + Instead we use two different algorithms: + asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1) if |x| >= 1 + = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise + where log(x) is an optimized log approximation, and P(x) is a polynomial + shared with the scalar routine. The greatest observed error 3.29 ULP, in + |x| >= 1: + __v_asinh(0x1.2cd9d717e2c9bp+0) got 0x1.ffffcfd0e234fp-1 + want 0x1.ffffcfd0e2352p-1. 
*/ +VPCS_ATTR v_f64_t V_NAME (asinh) (v_f64_t x) +{ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t iax = ix & AbsMask; + v_f64_t ax = v_as_f64_u64 (iax); + v_u64_t top12 = iax >> 52; + + v_u64_t gt1 = v_cond_u64 (top12 >= OneTop); + v_u64_t special = v_cond_u64 (top12 >= HugeBound); + +#if WANT_SIMD_EXCEPT + v_u64_t tiny = v_cond_u64 (top12 < TinyBound); + special |= tiny; +#endif + + /* Option 1: |x| >= 1. + Compute asinh(x) according by asinh(x) = log(x + sqrt(x^2 + 1)). + If WANT_SIMD_EXCEPT is enabled, sidestep special values, which will + overflow, by setting special lanes to 1. These will be fixed later. */ + v_f64_t option_1 = v_f64 (0); + if (likely (v_any_u64 (gt1))) + { +#if WANT_SIMD_EXCEPT + v_f64_t xm = v_sel_f64 (special, v_f64 (1), ax); +#else + v_f64_t xm = ax; +#endif + option_1 = log_inline (xm + v_sqrt_f64 (xm * xm + 1)); + } + + /* Option 2: |x| < 1. + Compute asinh(x) using a polynomial. + If WANT_SIMD_EXCEPT is enabled, sidestep special lanes, which will + overflow, and tiny lanes, which will underflow, by setting them to 0. They + will be fixed later, either by selecting x or falling back to the scalar + special-case. The largest observed error in this region is 1.47 ULPs: + __v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1 + want 0x1.c1d6bf874019cp-1. */ + v_f64_t option_2 = v_f64 (0); + if (likely (v_any_u64 (~gt1))) + { +#if WANT_SIMD_EXCEPT + ax = v_sel_f64 (tiny | gt1, v_f64 (0), ax); +#endif + v_f64_t x2 = ax * ax; + v_f64_t z2 = x2 * x2; + v_f64_t z4 = z2 * z2; + v_f64_t z8 = z4 * z4; + v_f64_t p = ESTRIN_17 (x2, z2, z4, z8, z8 * z8, C); + option_2 = v_fma_f64 (p, x2 * ax, ax); +#if WANT_SIMD_EXCEPT + option_2 = v_sel_f64 (tiny, x, option_2); +#endif + } + + /* Choose the right option for each lane. */ + v_f64_t y = v_sel_f64 (gt1, option_1, option_2); + /* Copy sign. */ + y = v_as_f64_u64 (v_bsl_u64 (AbsMask, v_as_u64_f64 (y), ix)); + + if (unlikely (v_any_u64 (special))) + return special_case (x, y, special); + return y; +} +VPCS_ALIAS + +PL_SIG (V, D, 1, asinh, -10.0, 10.0) +PL_TEST_ULP (V_NAME (asinh), 2.80) +PL_TEST_EXPECT_FENV (V_NAME (asinh), WANT_SIMD_EXCEPT) +/* Test vector asinh 3 times, with control lane < 1, > 1 and special. + Ensures the v_sel is choosing the right option in all cases. */ +#define V_ASINH_INTERVAL(lo, hi, n) \ + PL_TEST_INTERVAL_C (V_NAME (asinh), lo, hi, n, 0.5) \ + PL_TEST_INTERVAL_C (V_NAME (asinh), lo, hi, n, 2) \ + PL_TEST_INTERVAL_C (V_NAME (asinh), lo, hi, n, 0x1p600) +V_ASINH_INTERVAL (0, 0x1p-26, 50000) +V_ASINH_INTERVAL (0x1p-26, 1, 50000) +V_ASINH_INTERVAL (1, 0x1p511, 50000) +V_ASINH_INTERVAL (0x1p511, inf, 40000) +V_ASINH_INTERVAL (-0, -0x1p-26, 50000) +V_ASINH_INTERVAL (-0x1p-26, -1, 50000) +V_ASINH_INTERVAL (-1, -0x1p511, 50000) +V_ASINH_INTERVAL (-0x1p511, -inf, 40000) +#endif diff --git a/pl/math/vn_asinh_2u5.c b/pl/math/vn_asinh_2u5.c deleted file mode 100644 index e349530..0000000 --- a/pl/math/vn_asinh_2u5.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_asinh. - * - * Copyright (c) 2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_asinh, _ZGVnN2v_asinh) -#include "v_asinh_2u5.c" -#endif diff --git a/pl/math/vn_asinh_3u5.c b/pl/math/vn_asinh_3u5.c new file mode 100644 index 0000000..e2f3aeb --- /dev/null +++ b/pl/math/vn_asinh_3u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_asinh. + * + * Copyright (c) 2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_asinh, _ZGVnN2v_asinh) +#include "v_asinh_3u5.c" +#endif -- cgit v1.2.3 From 04e91eca36b0a7dbbab78bf9401c978ab1b08b67 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Tue, 20 Dec 2022 09:26:47 +0000 Subject: pl/math: Add scalar & vector/Neon cbrt New routines use the same algorithm, with simplified argument reduction and recombination in the vector variant. Both are accurate to 2 ULP. --- pl/math/cbrt_2u.c | 70 ++++++++++++++++++++++++++++++++++ pl/math/cbrt_data.c | 15 ++++++++ pl/math/include/mathlib.h | 5 +++ pl/math/math_config.h | 6 +++ pl/math/s_cbrt_2u.c | 6 +++ pl/math/tools/cbrt.sollya | 20 ++++++++++ pl/math/v_cbrt_2u.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++ pl/math/vn_cbrt_2u.c | 12 ++++++ 8 files changed, 231 insertions(+) create mode 100644 pl/math/cbrt_2u.c create mode 100644 pl/math/cbrt_data.c create mode 100644 pl/math/s_cbrt_2u.c create mode 100644 pl/math/tools/cbrt.sollya create mode 100644 pl/math/v_cbrt_2u.c create mode 100644 pl/math/vn_cbrt_2u.c diff --git a/pl/math/cbrt_2u.c b/pl/math/cbrt_2u.c new file mode 100644 index 0000000..f89dd87 --- /dev/null +++ b/pl/math/cbrt_2u.c @@ -0,0 +1,70 @@ +/* + * Double-precision cbrt(x) function. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +PL_SIG (S, D, 1, cbrt, -10.0, 10.0) + +#define AbsMask 0x7fffffffffffffff +#define TwoThirds 0x1.5555555555555p-1 + +#define C(i) __cbrt_data.poly[i] +#define T(i) __cbrt_data.table[i] + +/* Approximation for double-precision cbrt(x), using low-order polynomial and + two Newton iterations. Greatest observed error is 1.79 ULP. Errors repeat + according to the exponent, for instance an error observed for double value + m * 2^e will be observed for any input m * 2^(e + 3*i), where i is an + integer. + cbrt(0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0 + want 0x1.965fe72821e99p+0. */ +double +cbrt (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t iax = ix & AbsMask; + uint64_t sign = ix & ~AbsMask; + + if (unlikely (iax == 0 || iax == 0x7f80000000000000)) + return x; + + /* |x| = m * 2^e, where m is in [0.5, 1.0]. + We can easily decompose x into m and e using frexp. */ + int e; + double m = frexp (asdouble (iax), &e); + + /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point for + Newton iterations. */ + double p_01 = fma (C (1), m, C (0)); + double p_23 = fma (C (3), m, C (2)); + double p = fma (p_23, m * m, p_01); + + /* Two iterations of Newton's method for iteratively approximating cbrt. */ + double m_by_3 = m / 3; + double a = fma (TwoThirds, p, m_by_3 / (p * p)); + a = fma (TwoThirds, a, m_by_3 / (a * a)); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + Let t = (2 ^ (e / 3)) / (2 ^ round(e / 3)). + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3. + i is an integer in [-2, 2], so t can be looked up in the table T. + Hence the result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. + Which can be done easily using ldexp. 
*/ + return asdouble (asuint64 (ldexp (a * T (2 + e % 3), e / 3)) | sign); +} + +PL_TEST_ULP (cbrt, 1.30) +PL_TEST_INTERVAL (cbrt, 0, inf, 1000000) +PL_TEST_INTERVAL (cbrt, -0, -inf, 1000000) diff --git a/pl/math/cbrt_data.c b/pl/math/cbrt_data.c new file mode 100644 index 0000000..1c6ca73 --- /dev/null +++ b/pl/math/cbrt_data.c @@ -0,0 +1,15 @@ +/* + * Coefficients and table entries for double-precision cbrt(x). + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +const struct cbrt_data __cbrt_data + = {.poly = { /* Coefficients for very rough approximation of cbrt(x) in [0.5, 1]. + See cbrt.sollya for details of generation. */ + 0x1.c14e8ee44767p-2, 0x1.dd2d3f99e4c0ep-1, -0x1.08e83026b7e74p-1, 0x1.2c74eaa3ba428p-3}, + .table = { /* table[i] = 2^((i - 2) / 3). */ + 0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0, 0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0}}; diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 1266eb7..12d72fe 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -27,6 +27,7 @@ float tanhf (float); double acosh (double); double asinh (double); double atan2 (double, double); +double cbrt (double); double cosh (double); double erfc (double); double expm1 (double); @@ -53,6 +54,7 @@ float __s_tanhf (float); double __s_asinh (double); double __s_atan (double); double __s_atan2 (double, double); +double __s_cbrt (double); double __s_cosh (double); double __s_erf (double); double __s_erfc (double); @@ -82,6 +84,7 @@ __f32x4_t __v_atan2f (__f32x4_t, __f32x4_t); __f64x2_t __v_atan2 (__f64x2_t, __f64x2_t); __f32x4_t __v_atanhf (__f32x4_t); __f32x4_t __v_cbrtf (__f32x4_t); +__f64x2_t __v_cbrt (__f64x2_t); __f32x4_t __v_coshf (__f32x4_t); __f64x2_t __v_cosh (__f64x2_t); __f32x4_t __v_erff (__f32x4_t); @@ -113,6 +116,7 @@ __vpcs __f32x4_t __vn_atan2f (__f32x4_t, __f32x4_t); __vpcs __f64x2_t __vn_atan2 (__f64x2_t, __f64x2_t); __vpcs __f32x4_t __vn_atanhf (__f32x4_t); __vpcs __f32x4_t __vn_cbrtf (__f32x4_t); +__vpcs __f64x2_t __vn_cbrt (__f64x2_t); __vpcs __f32x4_t __vn_coshf (__f32x4_t); __vpcs __f64x2_t __vn_cosh (__f64x2_t); __vpcs __f32x4_t __vn_erff (__f32x4_t); @@ -141,6 +145,7 @@ __vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t); __vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t); __vpcs __f32x4_t _ZGVnN4v_atanhf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_cbrtf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_cbrt (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t); diff --git a/pl/math/math_config.h b/pl/math/math_config.h index 90d571c..92ccebf 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -558,4 +558,10 @@ extern const struct cbrtf_data float table[5]; } __cbrtf_data HIDDEN; +extern const struct cbrt_data +{ + double poly[4]; + double table[5]; +} __cbrt_data HIDDEN; + #endif diff --git a/pl/math/s_cbrt_2u.c b/pl/math/s_cbrt_2u.c new file mode 100644 index 0000000..22f726b --- /dev/null +++ b/pl/math/s_cbrt_2u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_cbrt_2u.c" diff --git a/pl/math/tools/cbrt.sollya b/pl/math/tools/cbrt.sollya new file mode 100644 index 0000000..7f179eb --- /dev/null +++ b/pl/math/tools/cbrt.sollya @@ -0,0 +1,20 @@ +// polynomial for approximating cbrt(x) in double precision +// +// Copyright (c) 2022, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 3; + +a = 0.5; +b = 1; + + +f = x^(1/3); + +poly = fpminimax(f, deg, [|double ...|], [a;b]); + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do round(coeff(poly,i), D, RN); diff --git a/pl/math/v_cbrt_2u.c b/pl/math/v_cbrt_2u.c new file mode 100644 index 0000000..b6e501c --- /dev/null +++ b/pl/math/v_cbrt_2u.c @@ -0,0 +1,97 @@ +/* + * Double-precision vector cbrt(x) function. + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "mathlib.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define AbsMask 0x7fffffffffffffff +#define TwoThirds v_f64 (0x1.5555555555555p-1) +#define TinyBound 0x001 /* top12 (smallest_normal). */ +#define BigBound 0x7ff /* top12 (infinity). */ +#define MantissaMask v_u64 (0x000fffffffffffff) +#define HalfExp v_u64 (0x3fe0000000000000) + +#define C(i) v_f64 (__cbrt_data.poly[i]) +#define T(i) v_lookup_f64 (__cbrt_data.table, i) + +static NOINLINE v_f64_t +specialcase (v_f64_t x, v_f64_t y, v_u64_t special) +{ + return v_call_f64 (cbrt, x, y, special); +} + +/* Approximation for double-precision vector cbrt(x), using low-order polynomial + and two Newton iterations. Greatest observed error is 1.79 ULP. Errors repeat + according to the exponent, for instance an error observed for double value + m * 2^e will be observed for any input m * 2^(e + 3*i), where i is an + integer. + __v_cbrt(0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0 + want 0x1.965fe72821e99p+0. */ +VPCS_ATTR v_f64_t V_NAME (cbrt) (v_f64_t x) +{ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t iax = ix & AbsMask; + v_u64_t ia12 = iax >> 52; + + /* Subnormal, +/-0 and special values. */ + v_u64_t special = v_cond_u64 ((ia12 < TinyBound) | (ia12 >= BigBound)); + + /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector + version of frexp, which gets subnormal values wrong - these have to be + special-cased as a result. */ + v_f64_t m = v_as_f64_u64 (v_bsl_u64 (MantissaMask, iax, HalfExp)); + v_s64_t e = v_as_s64_u64 (iax >> 52) - 1022; + + /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point for + Newton iterations. */ + v_f64_t p_01 = v_fma_f64 (C (1), m, C (0)); + v_f64_t p_23 = v_fma_f64 (C (3), m, C (2)); + v_f64_t p = v_fma_f64 (m * m, p_23, p_01); + + /* Two iterations of Newton's method for iteratively approximating cbrt. */ + v_f64_t m_by_3 = m / 3; + v_f64_t a = v_fma_f64 (TwoThirds, p, m_by_3 / (p * p)); + a = v_fma_f64 (TwoThirds, a, m_by_3 / (a * a)); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is + not necessarily a multiple of 3 we lose some information. + + Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q. + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which is + an integer in [-2, 2], and can be looked up in the table T. 
Hence the + result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */ + + v_s64_t ey = e / 3; + v_f64_t my = a * T (v_as_u64_s64 (e % 3 + 2)); + + /* Vector version of ldexp. */ + v_f64_t y = v_as_f64_u64 ((v_as_u64_s64 (ey + 1023) << 52)) * my; + /* Copy sign. */ + y = v_as_f64_u64 (v_bsl_u64 (v_u64 (AbsMask), v_as_u64_f64 (y), ix)); + + if (unlikely (v_any_u64 (special))) + return specialcase (x, y, special); + return y; +} +VPCS_ALIAS + +PL_TEST_ULP (V_NAME (cbrt), 1.30) +PL_SIG (V, D, 1, cbrt, -10.0, 10.0) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (cbrt)) +PL_TEST_INTERVAL (V_NAME (cbrt), 0, inf, 1000000) +PL_TEST_INTERVAL (V_NAME (cbrt), -0, -inf, 1000000) +#endif diff --git a/pl/math/vn_cbrt_2u.c b/pl/math/vn_cbrt_2u.c new file mode 100644 index 0000000..ccaa085 --- /dev/null +++ b/pl/math/vn_cbrt_2u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_cbrt. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_cbrt, _ZGVnN2v_cbrt) +#include "v_cbrt_2u.c" +#endif -- cgit v1.2.3 From f312cb80e4a306d8a127a1dd78b5ee0a1ee89732 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Tue, 20 Dec 2022 09:26:57 +0000 Subject: pl/math: Add scalar atanf and set fenv in Neon atanf The simplest way to set fenv in Neon atanf is by using a scalar fallback to under/overflow cases, however this routine did not have a scalar counterpart so we add a new one, based on the same algorithm and polynomial as the vector variants, and accurate to 2.9 ULP. This is now used as the fallback for all lanes, when any lane of the Neon input is special. --- pl/math/atanf_2u9.c | 76 +++++++++++++++++++++++++++++++ pl/math/include/mathlib.h | 1 + pl/math/test/testcases/directed/atanf.tst | 22 +++++++++ pl/math/v_atanf_3u.c | 37 ++++++++++++--- 4 files changed, 129 insertions(+), 7 deletions(-) create mode 100644 pl/math/atanf_2u9.c create mode 100644 pl/math/test/testcases/directed/atanf.tst diff --git a/pl/math/atanf_2u9.c b/pl/math/atanf_2u9.c new file mode 100644 index 0000000..d7071be --- /dev/null +++ b/pl/math/atanf_2u9.c @@ -0,0 +1,76 @@ +/* + * Single-precision atan(x) function. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "atanf_common.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define PiOver2 0x1.921fb6p+0f +#define AbsMask 0x7fffffff +#define TinyBound 0x30800000 /* asuint(0x1p-30). */ +#define BigBound 0x4e800000 /* asuint(0x1p30). */ +#define One 0x3f800000 + +/* Approximation of single-precision atan(x) based on + atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] + using z=-1/x and shift = pi/2. + Maximum error is 2.88 ulps: + atanf(0x1.0565ccp+0) got 0x1.97771p-1 + want 0x1.97770ap-1. */ +float +atanf (float x) +{ + uint32_t ix = asuint (x); + uint32_t sign = ix & ~AbsMask; + uint32_t ia = ix & AbsMask; + + if (unlikely (ia < TinyBound)) + /* Avoid underflow by returning x. */ + return x; + + if (unlikely (ia > BigBound)) + { + if (ia > 0x7f800000) + /* Propagate NaN. */ + return __math_invalidf (x); + /* atan(x) rounds to PiOver2 for large x. */ + return asfloat (asuint (PiOver2) ^ sign); + } + + float z, az, shift; + if (ia > One) + { + /* For x > 1, use atan(x) = pi / 2 + atan(-1 / x). */ + z = -1.0f / x; + shift = PiOver2; + /* Use absolute value only when needed (odd powers of z). 
*/ + az = -fabsf (z); + } + else + { + /* For x < 1, approximate atan(x) directly. */ + z = x; + az = asfloat (ia); + shift = 0; + } + + /* Calculate polynomial, shift + z + z^3 * P(z^2). */ + float y = eval_poly (z, az, shift); + /* Copy sign. */ + return asfloat (asuint (y) ^ sign); +} + +PL_SIG (S, F, 1, atan, -10.0, 10.0) +PL_TEST_ULP (atanf, 2.38) +PL_TEST_INTERVAL (atanf, 0, 0x1p-30, 5000) +PL_TEST_INTERVAL (atanf, -0, -0x1p-30, 5000) +PL_TEST_INTERVAL (atanf, 0x1p-30, 1, 40000) +PL_TEST_INTERVAL (atanf, -0x1p-30, -1, 40000) +PL_TEST_INTERVAL (atanf, 1, 0x1p30, 40000) +PL_TEST_INTERVAL (atanf, -1, -0x1p30, 40000) +PL_TEST_INTERVAL (atanf, 0x1p30, inf, 1000) +PL_TEST_INTERVAL (atanf, -0x1p30, -inf, 1000) diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 12d72fe..44cbc73 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -12,6 +12,7 @@ float acoshf (float); float asinhf (float); float atan2f (float, float); +float atanf (float); float atanhf (float); float cbrtf (float); float coshf (float); diff --git a/pl/math/test/testcases/directed/atanf.tst b/pl/math/test/testcases/directed/atanf.tst new file mode 100644 index 0000000..8661527 --- /dev/null +++ b/pl/math/test/testcases/directed/atanf.tst @@ -0,0 +1,22 @@ +; atanf.tst +; +; Copyright 2007-2022, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=atanf op1=7fc00001 result=7fc00001 errno=0 +func=atanf op1=ffc00001 result=7fc00001 errno=0 +func=atanf op1=7f800001 result=7fc00001 errno=0 status=i +func=atanf op1=ff800001 result=7fc00001 errno=0 status=i +func=atanf op1=7f800000 result=3fc90fda.a22 errno=0 +func=atanf op1=ff800000 result=bfc90fda.a22 errno=0 +func=atanf op1=00000000 result=00000000 errno=0 +func=atanf op1=80000000 result=80000000 errno=0 +; Inconsistent behavior was detected for the following 2 cases. +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=atanf op1=00000001 result=00000001 errno=0 maybestatus=ux +func=atanf op1=80000001 result=80000001 errno=0 maybestatus=ux + +func=atanf op1=3f800000 result=3f490fda.a22 errno=0 +func=atanf op1=bf800000 result=bf490fda.a22 errno=0 diff --git a/pl/math/v_atanf_3u.c b/pl/math/v_atanf_3u.c index 3cb51b1..c61f8f8 100644 --- a/pl/math/v_atanf_3u.c +++ b/pl/math/v_atanf_3u.c @@ -15,6 +15,16 @@ #define PiOver2 v_f32 (0x1.921fb6p+0f) #define AbsMask v_u32 (0x7fffffff) +#define TinyBound 0x308 /* top12(asuint(0x1p-30)). */ +#define BigBound 0x4e8 /* top12(asuint(0x1p30)). */ + +#if WANT_SIMD_EXCEPT +static NOINLINE v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t special) +{ + return v_call_f32 (atanf, x, y, special); +} +#endif /* Fast implementation of vector atanf based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] @@ -23,11 +33,20 @@ VPCS_ATTR v_f32_t V_NAME (atanf) (v_f32_t x) { - /* No need to trigger special case. Small cases, infs and nans - are supported by our approximation technique. */ + /* Small cases, infs and nans are supported by our approximation technique, + but do not set fenv flags correctly. Only trigger special case if we need + fenv. */ v_u32_t ix = v_as_u32_f32 (x); v_u32_t sign = ix & ~AbsMask; +#if WANT_SIMD_EXCEPT + v_u32_t ia12 = (ix >> 20) & 0x7ff; + v_u32_t special = v_cond_u32 (ia12 - TinyBound > BigBound - TinyBound); + /* If any lane is special, fall back to the scalar routine for all lanes. 
*/ + if (unlikely (v_any_u32 (special))) + return specialcase (x, x, v_u32 (-1)); +#endif + /* Argument reduction: y := arctan(x) for x < 1 y := pi/2 + arctan(-1/x) for x > 1 @@ -52,9 +71,13 @@ VPCS_ALIAS PL_SIG (V, F, 1, atan, -10.0, 10.0) PL_TEST_ULP (V_NAME (atanf), 2.5) -PL_TEST_INTERVAL (V_NAME (atanf), -10.0, 10.0, 50000) -PL_TEST_INTERVAL (V_NAME (atanf), -1.0, 1.0, 40000) -PL_TEST_INTERVAL (V_NAME (atanf), 0.0, 1.0, 40000) -PL_TEST_INTERVAL (V_NAME (atanf), 1.0, 100.0, 40000) -PL_TEST_INTERVAL (V_NAME (atanf), 1e6, 1e32, 40000) +PL_TEST_EXPECT_FENV (V_NAME (atanf), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (atanf), 0, 0x1p-30, 5000) +PL_TEST_INTERVAL (V_NAME (atanf), -0, -0x1p-30, 5000) +PL_TEST_INTERVAL (V_NAME (atanf), 0x1p-30, 1, 40000) +PL_TEST_INTERVAL (V_NAME (atanf), -0x1p-30, -1, 40000) +PL_TEST_INTERVAL (V_NAME (atanf), 1, 0x1p30, 40000) +PL_TEST_INTERVAL (V_NAME (atanf), -1, -0x1p30, 40000) +PL_TEST_INTERVAL (V_NAME (atanf), 0x1p30, inf, 1000) +PL_TEST_INTERVAL (V_NAME (atanf), -0x1p30, -inf, 1000) #endif -- cgit v1.2.3 From 7ab15c5f583cd2dd097aa31e9bdace5af0e674f4 Mon Sep 17 00:00:00 2001 From: Pierre Blanchard Date: Tue, 20 Dec 2022 10:13:08 +0000 Subject: pl/math: Update ULP threshold for SVE erf Updated comment and test threshold. --- pl/math/sv_erf_2u5.c | 103 --------------------------------------------------- pl/math/sv_erf_3u.c | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+), 103 deletions(-) delete mode 100644 pl/math/sv_erf_2u5.c create mode 100644 pl/math/sv_erf_3u.c diff --git a/pl/math/sv_erf_2u5.c b/pl/math/sv_erf_2u5.c deleted file mode 100644 index b4c9186..0000000 --- a/pl/math/sv_erf_2u5.c +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Double-precision SVE erf(x) function. - * - * Copyright (c) 2020-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" - -#if SV_SUPPORTED - -#define Scale (8.0) -#define AbsMask (0x7fffffffffffffff) - -static NOINLINE sv_f64_t -__sv_erf_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp) -{ - return sv_call_f64 (erf, x, y, cmp); -} - -/* Optimized double precision SVE error function erf. Maximum - observed error is 2.46 ULP: - __sv_erf(0x1.5644782ddd668p+2) got 0x1.ffffffffffeap-1 - want 0x1.ffffffffffe9ep-1. */ -sv_f64_t -__sv_erf_x (sv_f64_t x, const svbool_t pg) -{ - /* Use top 16 bits to test for special cases and small values. */ - sv_u64_t ix = sv_as_u64_f64 (x); - sv_u64_t atop = svand_n_u64_x (pg, svlsr_n_u64_x (pg, ix, 48), 0x7fff); - - /* Handle both inf/nan as well as small values (|x|<2^-28). */ - svbool_t cmp - = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, atop, 0x3e30), 0x7ff0 - 0x3e30); - - /* Get sign and absolute value. */ - sv_f64_t a = sv_as_f64_u64 (svand_n_u64_x (pg, ix, AbsMask)); - sv_u64_t sign = svand_n_u64_x (pg, ix, ~AbsMask); - - /* i = trunc(Scale*x). */ - sv_f64_t a_scale = svmul_n_f64_x (pg, a, Scale); - /* Saturate index of intervals. */ - svbool_t a_lt_6 = svcmplt_n_u64 (pg, atop, 0x4018); - sv_u64_t i = svcvt_u64_f64_m (sv_u64 (V_ERF_NINTS - 1), a_lt_6, a_scale); - - /* Load polynomial coefficients. 
*/ - sv_f64_t P_0 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[0], i); - sv_f64_t P_1 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[1], i); - sv_f64_t P_2 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[2], i); - sv_f64_t P_3 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[3], i); - sv_f64_t P_4 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[4], i); - sv_f64_t P_5 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[5], i); - sv_f64_t P_6 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[6], i); - sv_f64_t P_7 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[7], i); - sv_f64_t P_8 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[8], i); - sv_f64_t P_9 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[9], i); - - /* Get shift and scale. */ - sv_f64_t shift = sv_lookup_f64_x (pg, __v_erf_data.shifts, i); - - /* Transform polynomial variable. - Set z = 0 in the boring domain to avoid overflow. */ - sv_f64_t z = svmla_f64_m (a_lt_6, shift, sv_f64 (Scale), a); - - /* Evaluate polynomial P(z) using level-2 Estrin. */ - sv_f64_t r1 = sv_fma_f64_x (pg, z, P_1, P_0); - sv_f64_t r2 = sv_fma_f64_x (pg, z, P_3, P_2); - sv_f64_t r3 = sv_fma_f64_x (pg, z, P_5, P_4); - sv_f64_t r4 = sv_fma_f64_x (pg, z, P_7, P_6); - sv_f64_t r5 = sv_fma_f64_x (pg, z, P_9, P_8); - - sv_f64_t z2 = svmul_f64_x (pg, z, z); - sv_f64_t z4 = svmul_f64_x (pg, z2, z2); - - sv_f64_t q2 = sv_fma_f64_x (pg, r4, z2, r3); - sv_f64_t q1 = sv_fma_f64_x (pg, r2, z2, r1); - - sv_f64_t y = sv_fma_f64_x (pg, z4, r5, q2); - y = sv_fma_f64_x (pg, z4, y, q1); - - /* y = erf(x) if x > 0, -erf(-x) otherwise. */ - y = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (y), sign)); - - if (unlikely (svptest_any (pg, cmp))) - return __sv_erf_specialcase (x, y, cmp); - return y; -} - -PL_ALIAS (__sv_erf_x, _ZGVsMxv_erf) - -PL_SIG (SV, D, 1, erf, -4.0, 4.0) -PL_TEST_ULP (__sv_erf, 1.97) -PL_TEST_INTERVAL (__sv_erf, 0, 0x1p-28, 20000) -PL_TEST_INTERVAL (__sv_erf, 0x1p-28, 1, 60000) -PL_TEST_INTERVAL (__sv_erf, 1, 0x1p28, 60000) -PL_TEST_INTERVAL (__sv_erf, 0x1p28, inf, 20000) -PL_TEST_INTERVAL (__sv_erf, -0, -0x1p-28, 20000) -PL_TEST_INTERVAL (__sv_erf, -0x1p-28, -1, 60000) -PL_TEST_INTERVAL (__sv_erf, -1, -0x1p28, 60000) -PL_TEST_INTERVAL (__sv_erf, -0x1p28, -inf, 20000) -#endif diff --git a/pl/math/sv_erf_3u.c b/pl/math/sv_erf_3u.c new file mode 100644 index 0000000..c860e1a --- /dev/null +++ b/pl/math/sv_erf_3u.c @@ -0,0 +1,103 @@ +/* + * Double-precision SVE erf(x) function. + * + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#define Scale (8.0) +#define AbsMask (0x7fffffffffffffff) + +static NOINLINE sv_f64_t +__sv_erf_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp) +{ + return sv_call_f64 (erf, x, y, cmp); +} + +/* Optimized double precision SVE error function erf. + Maximum observed error is 2.62 ULP: + __sv_erf(0x1.79cab7e3078fap+2) got 0x1.0000000000001p+0 + want 0x1.fffffffffffffp-1. */ +sv_f64_t +__sv_erf_x (sv_f64_t x, const svbool_t pg) +{ + /* Use top 16 bits to test for special cases and small values. */ + sv_u64_t ix = sv_as_u64_f64 (x); + sv_u64_t atop = svand_n_u64_x (pg, svlsr_n_u64_x (pg, ix, 48), 0x7fff); + + /* Handle both inf/nan as well as small values (|x|<2^-28). */ + svbool_t cmp + = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, atop, 0x3e30), 0x7ff0 - 0x3e30); + + /* Get sign and absolute value. 
*/ + sv_f64_t a = sv_as_f64_u64 (svand_n_u64_x (pg, ix, AbsMask)); + sv_u64_t sign = svand_n_u64_x (pg, ix, ~AbsMask); + + /* i = trunc(Scale*x). */ + sv_f64_t a_scale = svmul_n_f64_x (pg, a, Scale); + /* Saturate index of intervals. */ + svbool_t a_lt_6 = svcmplt_n_u64 (pg, atop, 0x4018); + sv_u64_t i = svcvt_u64_f64_m (sv_u64 (V_ERF_NINTS - 1), a_lt_6, a_scale); + + /* Load polynomial coefficients. */ + sv_f64_t P_0 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[0], i); + sv_f64_t P_1 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[1], i); + sv_f64_t P_2 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[2], i); + sv_f64_t P_3 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[3], i); + sv_f64_t P_4 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[4], i); + sv_f64_t P_5 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[5], i); + sv_f64_t P_6 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[6], i); + sv_f64_t P_7 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[7], i); + sv_f64_t P_8 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[8], i); + sv_f64_t P_9 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[9], i); + + /* Get shift and scale. */ + sv_f64_t shift = sv_lookup_f64_x (pg, __v_erf_data.shifts, i); + + /* Transform polynomial variable. + Set z = 0 in the boring domain to avoid overflow. */ + sv_f64_t z = svmla_f64_m (a_lt_6, shift, sv_f64 (Scale), a); + + /* Evaluate polynomial P(z) using level-2 Estrin. */ + sv_f64_t r1 = sv_fma_f64_x (pg, z, P_1, P_0); + sv_f64_t r2 = sv_fma_f64_x (pg, z, P_3, P_2); + sv_f64_t r3 = sv_fma_f64_x (pg, z, P_5, P_4); + sv_f64_t r4 = sv_fma_f64_x (pg, z, P_7, P_6); + sv_f64_t r5 = sv_fma_f64_x (pg, z, P_9, P_8); + + sv_f64_t z2 = svmul_f64_x (pg, z, z); + sv_f64_t z4 = svmul_f64_x (pg, z2, z2); + + sv_f64_t q2 = sv_fma_f64_x (pg, r4, z2, r3); + sv_f64_t q1 = sv_fma_f64_x (pg, r2, z2, r1); + + sv_f64_t y = sv_fma_f64_x (pg, z4, r5, q2); + y = sv_fma_f64_x (pg, z4, y, q1); + + /* y = erf(x) if x > 0, -erf(-x) otherwise. */ + y = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (y), sign)); + + if (unlikely (svptest_any (pg, cmp))) + return __sv_erf_specialcase (x, y, cmp); + return y; +} + +PL_ALIAS (__sv_erf_x, _ZGVsMxv_erf) + +PL_SIG (SV, D, 1, erf, -4.0, 4.0) +PL_TEST_ULP (__sv_erf, 2.13) +PL_TEST_INTERVAL (__sv_erf, 0, 0x1p-28, 20000) +PL_TEST_INTERVAL (__sv_erf, 0x1p-28, 1, 60000) +PL_TEST_INTERVAL (__sv_erf, 1, 0x1p28, 60000) +PL_TEST_INTERVAL (__sv_erf, 0x1p28, inf, 20000) +PL_TEST_INTERVAL (__sv_erf, -0, -0x1p-28, 20000) +PL_TEST_INTERVAL (__sv_erf, -0x1p-28, -1, 60000) +PL_TEST_INTERVAL (__sv_erf, -1, -0x1p28, 60000) +PL_TEST_INTERVAL (__sv_erf, -0x1p28, -inf, 20000) +#endif -- cgit v1.2.3 From 3bfa7bd49c5576d5b1f9e6a79e3d3a15fe3823bc Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Tue, 20 Dec 2022 10:38:32 +0000 Subject: Correct exit code from runulp.sh The pipe prevented FAILs and PASSs being counted properly - the while read loop has been rewritten without a pipe, as it was prior to the changes here. fenv checking is temporarily disabled in Neon sinh and sinhf, as they do not get it right. This will be re-enabled once they have been fixed. 
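The root cause is worth spelling out: in shells such as bash, every
element of a pipeline runs in a subshell, so the FAIL and PASS counters
incremented inside the loop were discarded when the pipeline finished.
A minimal sketch of the failure mode and of the here-document fix
(illustrative only, not the actual script):

FAIL=0
# Broken: the loop body runs in a subshell, so FAIL stays 0 afterwards.
printf 'x\nx\n' | while read line; do FAIL=$((FAIL + 1)); done
echo $FAIL   # prints 0 in bash

# Fixed: feed the loop from a here-document in the current shell.
FAIL=0
while read line; do FAIL=$((FAIL + 1)); done << EOF
$(printf 'x\nx\n')
EOF
echo $FAIL   # prints 2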
--- pl/math/test/runulp.sh | 6 ++++-- pl/math/v_sinh_3u.c | 3 ++- pl/math/v_sinhf_2u3.c | 3 ++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index 7fa4058..c5902e6 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -65,10 +65,12 @@ check -q -f -e 0 __sv_powi 0 inf x -0 -1000 100000 && runsv=1 check -q -f -e 0 __sv_powi -0 -inf x -0 -1000 100000 && runsv=1 fi -cat $INTERVALS | while read F LO HI N C +while read F LO HI N C do t $F $LO $HI $N $C -done +done << EOF +$(cat $INTERVALS) +EOF [ 0 -eq $FAIL ] || { echo "FAILED $FAIL PASSED $PASS" diff --git a/pl/math/v_sinh_3u.c b/pl/math/v_sinh_3u.c index 37d7e45..9fe496e 100644 --- a/pl/math/v_sinh_3u.c +++ b/pl/math/v_sinh_3u.c @@ -46,7 +46,8 @@ VPCS_ALIAS PL_SIG (V, D, 1, sinh, -10.0, 10.0) PL_TEST_ULP (V_NAME (sinh), 2.08) -PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (sinh)) +/* TODO: reinstate PL_TEST_EXPECT_FENV here once fp exceptions are triggered + correctly. */ PL_TEST_INTERVAL (V_NAME (sinh), 0, 0x1p-51, 100) PL_TEST_INTERVAL (V_NAME (sinh), -0, -0x1p-51, 100) PL_TEST_INTERVAL (V_NAME (sinh), 0x1p-51, 0x1.62e42fefa39fp+9, 100000) diff --git a/pl/math/v_sinhf_2u3.c b/pl/math/v_sinhf_2u3.c index 50fc786..ce2fe0e 100644 --- a/pl/math/v_sinhf_2u3.c +++ b/pl/math/v_sinhf_2u3.c @@ -45,7 +45,8 @@ VPCS_ALIAS PL_SIG (V, F, 1, sinh, -10.0, 10.0) PL_TEST_ULP (V_NAME (sinhf), 1.76) -PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (sinhf)) +/* TODO: reinstate PL_TEST_EXPECT_FENV here once fp exceptions are triggered + correctly. */ PL_TEST_INTERVAL (V_NAME (sinhf), 0, 0x1.62e43p+6, 100000) PL_TEST_INTERVAL (V_NAME (sinhf), -0, -0x1.62e43p+6, 100000) PL_TEST_INTERVAL (V_NAME (sinhf), 0x1.62e43p+6, 0x1.65a9fap+6, 100) -- cgit v1.2.3 From 0a9270a27f48bea87c5bd3f0f9c759da66fb45a3 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Thu, 22 Dec 2022 16:20:22 +0000 Subject: pl/math: Fix fp exceptions in Neon sinhf and sinh Both routines previously relied on the vector expm1(f) routine exposed by the library, which depended on WANT_SIMD_EXCEPT for its fenv behaviour, however both routines were expected to always trigger fp exceptions correctly. To remedy this, both routines now use an inlined helper for expm1 (reused from vector tanhf in the case of sinhf), and special-case small input as well as large when WANT_SIMD_EXCEPT is enabled. --- pl/math/v_expm1f_inline.h | 49 ++++++++++++++++++++++++++++++++ pl/math/v_sinh_3u.c | 72 +++++++++++++++++++++++++++++++++++------------ pl/math/v_sinhf_2u3.c | 42 +++++++++++++++++---------- pl/math/v_tanhf_2u6.c | 38 ++----------------------- 4 files changed, 132 insertions(+), 69 deletions(-) create mode 100644 pl/math/v_expm1f_inline.h diff --git a/pl/math/v_expm1f_inline.h b/pl/math/v_expm1f_inline.h new file mode 100644 index 0000000..ef9e934 --- /dev/null +++ b/pl/math/v_expm1f_inline.h @@ -0,0 +1,49 @@ +/* + * Helper for single-precision routines which calculate exp(x) - 1 and do not + * need special-case handling + * + * Copyright (c) 2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_V_EXPM1F_INLINE_H +#define PL_MATH_V_EXPM1F_INLINE_H + +#include "v_math.h" +#include "math_config.h" +#include "estrinf.h" + +#define One 0x3f800000 +#define Shift v_f32 (0x1.8p23f) +#define InvLn2 v_f32 (0x1.715476p+0f) +#define MLn2hi v_f32 (-0x1.62e4p-1f) +#define MLn2lo v_f32 (-0x1.7f7d1cp-20f) + +#define C(i) v_f32 (__expm1f_poly[i]) + +static inline v_f32_t +expm1f_inline (v_f32_t x) +{ + /* Helper routine for calculating exp(x) - 1. + Copied from v_expm1f_1u6.c, with all special-case handling removed - the + calling routine should handle special values if required. */ + + /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ + v_f32_t j = v_fma_f32 (InvLn2, x, Shift) - Shift; + v_s32_t i = v_to_s32_f32 (j); + v_f32_t f = v_fma_f32 (j, MLn2hi, x); + f = v_fma_f32 (j, MLn2lo, f); + + /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). + Uses Estrin scheme, where the main __v_expm1f routine uses Horner. */ + v_f32_t f2 = f * f; + v_f32_t p = ESTRIN_4 (f, f2, f2 * f2, C); + p = v_fma_f32 (f2, p, f); + + /* t = 2^i. */ + v_f32_t t = v_as_f32_u32 (v_as_u32_s32 (i << 23) + One); + /* expm1(x) ~= p * t + (t - 1). */ + return v_fma_f32 (p, t, t - 1); +} + +#endif // PL_MATH_V_EXPM1F_INLINE_H diff --git a/pl/math/v_sinh_3u.c b/pl/math/v_sinh_3u.c index 9fe496e..8ddd29d 100644 --- a/pl/math/v_sinh_3u.c +++ b/pl/math/v_sinh_3u.c @@ -5,18 +5,51 @@ */ #include "v_math.h" -#include "mathlib.h" +#include "estrin.h" #include "pl_sig.h" #include "pl_test.h" #define AbsMask 0x7fffffffffffffff #define Half 0x3fe0000000000000 -#define OFlowBound \ - 0x40862e42fefa39f0 /* 0x1.62e42fefa39fp+9, above which using expm1 results \ - in NaN. */ +#define BigBound \ + 0x4080000000000000 /* 2^9. expm1 helper overflows for large input. */ +#define TinyBound \ + 0x3e50000000000000 /* 2^-26, below which sinh(x) rounds to x. */ +#define InvLn2 v_f64 (0x1.71547652b82fep0) +#define MLn2hi v_f64 (-0x1.62e42fefa39efp-1) +#define MLn2lo v_f64 (-0x1.abc9e3b39803fp-56) +#define Shift v_f64 (0x1.8p52) +#define One 0x3ff0000000000000 +#define C(i) v_f64 (__expm1_poly[i]) #if V_SUPPORTED +static inline v_f64_t +expm1_inline (v_f64_t x) +{ + /* Reduce argument: + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where i = round(x / ln2) + and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */ + v_f64_t j = v_fma_f64 (InvLn2, x, Shift) - Shift; + v_s64_t i = v_to_s64_f64 (j); + v_f64_t f = v_fma_f64 (j, MLn2hi, x); + f = v_fma_f64 (j, MLn2lo, f); + /* Approximate expm1(f) using polynomial. */ + v_f64_t f2 = f * f, f4 = f2 * f2, f8 = f4 * f4; + v_f64_t p = v_fma_f64 (f2, ESTRIN_10 (f, f2, f4, f8, C), f); + /* t = 2^i. */ + v_f64_t t = v_as_f64_u64 (v_as_u64_s64 (i << 52) + One); + /* expm1(x) ~= p * t + (t - 1). */ + return v_fma_f64 (p, t, t - 1); +} + +static NOINLINE VPCS_ATTR v_f64_t +special_case (v_f64_t x) +{ + return v_call_f64 (sinh, x, x, v_u64 (-1)); +} + /* Approximation for vector double-precision sinh(x) using expm1. sinh(x) = (exp(x) - exp(-x)) / 2. The greatest observed error is 2.57 ULP: @@ -30,28 +63,31 @@ VPCS_ATTR v_f64_t V_NAME (sinh) (v_f64_t x) v_u64_t sign = ix & ~AbsMask; v_f64_t halfsign = v_as_f64_u64 (sign | Half); - v_u64_t special = v_cond_u64 (iax >= OFlowBound); - /* Fall back to the scalar variant for all lanes if any of them should trigger - an exception. 
*/ +#if WANT_SIMD_EXCEPT + v_u64_t special = v_cond_u64 ((iax - TinyBound) >= (BigBound - TinyBound)); +#else + v_u64_t special = v_cond_u64 (iax >= BigBound); +#endif + + /* Fall back to scalar variant for all lanes if any of them are special. */ if (unlikely (v_any_u64 (special))) - return v_call_f64 (sinh, x, x, v_u64 (-1)); + return special_case (x); /* Up to the point that expm1 overflows, we can use it to calculate sinh - using a slight rearrangement of the definition of asinh. This allows us to + using a slight rearrangement of the definition of sinh. This allows us to retain acceptable accuracy for very small inputs. */ - v_f64_t t = V_NAME (expm1) (ax); + v_f64_t t = expm1_inline (ax); return (t + t / (t + 1)) * halfsign; } VPCS_ALIAS PL_SIG (V, D, 1, sinh, -10.0, 10.0) PL_TEST_ULP (V_NAME (sinh), 2.08) -/* TODO: reinstate PL_TEST_EXPECT_FENV here once fp exceptions are triggered - correctly. */ -PL_TEST_INTERVAL (V_NAME (sinh), 0, 0x1p-51, 100) -PL_TEST_INTERVAL (V_NAME (sinh), -0, -0x1p-51, 100) -PL_TEST_INTERVAL (V_NAME (sinh), 0x1p-51, 0x1.62e42fefa39fp+9, 100000) -PL_TEST_INTERVAL (V_NAME (sinh), -0x1p-51, -0x1.62e42fefa39fp+9, 100000) -PL_TEST_INTERVAL (V_NAME (sinh), 0x1.62e42fefa39fp+9, inf, 1000) -PL_TEST_INTERVAL (V_NAME (sinh), -0x1.62e42fefa39fp+9, -inf, 1000) +PL_TEST_EXPECT_FENV (V_NAME (sinh), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (sinh), 0, TinyBound, 1000) +PL_TEST_INTERVAL (V_NAME (sinh), -0, -TinyBound, 1000) +PL_TEST_INTERVAL (V_NAME (sinh), TinyBound, BigBound, 500000) +PL_TEST_INTERVAL (V_NAME (sinh), -TinyBound, -BigBound, 500000) +PL_TEST_INTERVAL (V_NAME (sinh), BigBound, inf, 1000) +PL_TEST_INTERVAL (V_NAME (sinh), -BigBound, -inf, 1000) #endif diff --git a/pl/math/v_sinhf_2u3.c b/pl/math/v_sinhf_2u3.c index ce2fe0e..a54c178 100644 --- a/pl/math/v_sinhf_2u3.c +++ b/pl/math/v_sinhf_2u3.c @@ -5,17 +5,25 @@ */ #include "v_math.h" -#include "mathlib.h" #include "pl_sig.h" #include "pl_test.h" #if V_SUPPORTED +#include "v_expm1f_inline.h" + #define AbsMask 0x7fffffff #define Half 0x3f000000 -#define Expm1OFlowLimit \ - 0x42b17218 /* 0x1.62e43p+6, 2^7*ln2, minimum value for which expm1f \ - overflows. */ +#define BigBound \ + 0x42b0c0a7 /* 0x1.61814ep+6, above which expm1f helper overflows. */ +#define TinyBound \ + 0x2fb504f4 /* 0x1.6a09e8p-32, below which expm1f underflows. */ + +static NOINLINE VPCS_ATTR v_f32_t +special_case (v_f32_t x) +{ + return v_call_f32 (sinhf, x, x, v_u32 (-1)); +} /* Approximation for vector single-precision sinh(x) using expm1. sinh(x) = (exp(x) - exp(-x)) / 2. @@ -29,28 +37,32 @@ VPCS_ATTR v_f32_t V_NAME (sinhf) (v_f32_t x) v_u32_t sign = ix & ~AbsMask; v_f32_t halfsign = v_as_f32_u32 (sign | Half); - v_u32_t special = v_cond_u32 (iax >= Expm1OFlowLimit); +#if WANT_SIMD_EXCEPT + v_u32_t special = v_cond_u32 ((iax - TinyBound) >= (BigBound - TinyBound)); +#else + v_u32_t special = v_cond_u32 (iax >= BigBound); +#endif + /* Fall back to the scalar variant for all lanes if any of them should trigger an exception. */ if (unlikely (v_any_u32 (special))) - return v_call_f32 (sinhf, x, x, v_u32 (-1)); + return special_case (x); /* Up to the point that expm1f overflows, we can use it to calculate sinhf using a slight rearrangement of the definition of asinh. This allows us to retain acceptable accuracy for very small inputs. 
*/ - v_f32_t t = V_NAME (expm1f) (ax); + v_f32_t t = expm1f_inline (ax); return (t + t / (t + 1)) * halfsign; } VPCS_ALIAS PL_SIG (V, F, 1, sinh, -10.0, 10.0) PL_TEST_ULP (V_NAME (sinhf), 1.76) -/* TODO: reinstate PL_TEST_EXPECT_FENV here once fp exceptions are triggered - correctly. */ -PL_TEST_INTERVAL (V_NAME (sinhf), 0, 0x1.62e43p+6, 100000) -PL_TEST_INTERVAL (V_NAME (sinhf), -0, -0x1.62e43p+6, 100000) -PL_TEST_INTERVAL (V_NAME (sinhf), 0x1.62e43p+6, 0x1.65a9fap+6, 100) -PL_TEST_INTERVAL (V_NAME (sinhf), -0x1.62e43p+6, -0x1.65a9fap+6, 100) -PL_TEST_INTERVAL (V_NAME (sinhf), 0x1.65a9fap+6, inf, 100) -PL_TEST_INTERVAL (V_NAME (sinhf), -0x1.65a9fap+6, -inf, 100) +PL_TEST_EXPECT_FENV (V_NAME (sinhf), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (sinhf), 0, TinyBound, 1000) +PL_TEST_INTERVAL (V_NAME (sinhf), -0, -TinyBound, 1000) +PL_TEST_INTERVAL (V_NAME (sinhf), TinyBound, BigBound, 100000) +PL_TEST_INTERVAL (V_NAME (sinhf), -TinyBound, -BigBound, 100000) +PL_TEST_INTERVAL (V_NAME (sinhf), BigBound, inf, 1000) +PL_TEST_INTERVAL (V_NAME (sinhf), -BigBound, -inf, 1000) #endif diff --git a/pl/math/v_tanhf_2u6.c b/pl/math/v_tanhf_2u6.c index dedc085..0e7ff69 100644 --- a/pl/math/v_tanhf_2u6.c +++ b/pl/math/v_tanhf_2u6.c @@ -5,51 +5,17 @@ */ #include "v_math.h" -#include "estrinf.h" -#include "mathlib.h" #include "pl_sig.h" #include "pl_test.h" #if V_SUPPORTED +#include "v_expm1f_inline.h" + #define BoringBound \ 0x41102cb3 /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for \ negative). */ #define AbsMask 0x7fffffff -#define One 0x3f800000 - -#define Shift v_f32 (0x1.8p23f) -#define InvLn2 v_f32 (0x1.715476p+0f) -#define MLn2hi v_f32 (-0x1.62e4p-1f) -#define MLn2lo v_f32 (-0x1.7f7d1cp-20f) - -#define C(i) v_f32 (__expm1f_poly[i]) - -static inline v_f32_t -expm1f_inline (v_f32_t x) -{ - /* Helper routine for calculating exp(x) - 1. - Copied from v_expm1f_1u6.c, with all special-case handling removed, as - special, tiny and large values are all dealt with in the main tanhf - routine. */ - - /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ - v_f32_t j = v_fma_f32 (InvLn2, x, Shift) - Shift; - v_s32_t i = v_to_s32_f32 (j); - v_f32_t f = v_fma_f32 (j, MLn2hi, x); - f = v_fma_f32 (j, MLn2lo, f); - - /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). - Uses Estrin scheme, where the main __v_expm1f routine uses Horner. */ - v_f32_t f2 = f * f; - v_f32_t p = ESTRIN_4 (f, f2, f2 * f2, C); - p = v_fma_f32 (f2, p, f); - - /* t = 2^i. */ - v_f32_t t = v_as_f32_u32 (v_as_u32_s32 (i << 23) + One); - /* expm1(x) ~= p * t + (t - 1). */ - return v_fma_f32 (p, t, t - 1); -} static NOINLINE v_f32_t special_case (v_f32_t x, v_f32_t y, v_u32_t special) -- cgit v1.2.3 From 2015eee4012b8fa766855328438f48aaf424a835 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Thu, 22 Dec 2022 16:20:40 +0000 Subject: pl/math: Add scalar atan and set fenv in Neon atan The simplest way to set fenv in Neon atan is by using a scalar fallback for under/overflow cases, however this routine did not have a scalar counterpart so we add a new one, based on the same algorithm and polynomial as the vector variants, and accurate to 2.5 ULP. This is now used as the fallback for all lanes, when any lane of the Neon input is special. 
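For reference, the shared reduction can be sketched in scalar C as
follows; eval_poly_sketch is a placeholder standing in for the common
polynomial helper, not the library's eval_poly:

#include <math.h>

#define PiOver2 0x1.921fb54442d18p+0

/* Placeholder: any polynomial approximation of atan on [-1, 1] slots
   in here.  */
static double
eval_poly_sketch (double z)
{
  return atan (z);
}

/* Sketch of the reduction: for |x| > 1 use
   atan(x) = sign(x) * pi/2 + atan(-1/x), so the polynomial only ever
   sees arguments in [-1, 1].  Special cases (NaN, tiny x) omitted.  */
static double
atan_sketch (double x)
{
  if (fabs (x) > 1.0)
    return copysign (PiOver2, x) + eval_poly_sketch (-1.0 / x);
  return eval_poly_sketch (x);
}

The vector variant applies the same identity lane-wise and, with this
patch, defers to the scalar routine whenever any lane requires
fenv-exact special-case handling.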
--- pl/math/atan_2u5.c | 73 ++++++++++++++++++++++++++++++++ pl/math/include/mathlib.h | 1 + pl/math/test/testcases/directed/atan.tst | 22 ++++++++++ pl/math/v_atan_2u5.c | 29 +++++++++---- 4 files changed, 117 insertions(+), 8 deletions(-) create mode 100644 pl/math/atan_2u5.c create mode 100644 pl/math/test/testcases/directed/atan.tst diff --git a/pl/math/atan_2u5.c b/pl/math/atan_2u5.c new file mode 100644 index 0000000..99fea0f --- /dev/null +++ b/pl/math/atan_2u5.c @@ -0,0 +1,73 @@ +/* + * Double-precision atan(x) function. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "pl_sig.h" +#include "pl_test.h" +#include "atan_common.h" + +#define AbsMask 0x7fffffffffffffff +#define PiOver2 0x1.921fb54442d18p+0 +#define TinyBound 0x3e1 /* top12(asuint64(0x1p-30)). */ +#define BigBound 0x434 /* top12(asuint64(0x1p53)). */ +#define OneTop 0x3ff + +/* Fast implementation of double-precision atan. + Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using + z=1/x and shift = pi/2. Maximum observed error is 2.27 ulps: + atan(0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1 + want 0x1.9225645bdd7c3p-1. */ +double +atan (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t sign = ix & ~AbsMask; + uint64_t ia = ix & AbsMask; + uint32_t ia12 = ia >> 52; + + if (unlikely (ia12 >= BigBound || ia12 < TinyBound)) + { + if (ia12 < TinyBound) + /* Avoid underflow by returning x. */ + return x; + if (ia > 0x7ff0000000000000) + /* Propagate NaN. */ + return __math_invalid (x); + /* atan(x) rounds to PiOver2 for large x. */ + return asdouble (asuint64 (PiOver2) ^ sign); + } + + double z, az, shift; + if (ia12 >= OneTop) + { + /* For x > 1, use atan(x) = pi / 2 + atan(-1 / x). */ + z = -1.0 / x; + shift = PiOver2; + /* Use absolute value only when needed (odd powers of z). */ + az = -fabs (z); + } + else + { + /* For x < 1, approximate atan(x) directly. */ + z = x; + shift = 0; + az = asdouble (ia); + } + + /* Calculate polynomial, shift + z + z^3 * P(z^2). */ + double y = eval_poly (z, az, shift); + /* Copy sign. */ + return asdouble (asuint64 (y) ^ sign); +} + +PL_SIG (S, D, 1, atan, -10.0, 10.0) +PL_TEST_ULP (atan, 1.78) +PL_TEST_INTERVAL (atan, 0, 0x1p-30, 10000) +PL_TEST_INTERVAL (atan, -0, -0x1p-30, 1000) +PL_TEST_INTERVAL (atan, 0x1p-30, 0x1p53, 900000) +PL_TEST_INTERVAL (atan, -0x1p-30, -0x1p53, 90000) +PL_TEST_INTERVAL (atan, 0x1p53, inf, 10000) +PL_TEST_INTERVAL (atan, -0x1p53, -inf, 1000) diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 44cbc73..05fa306 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -27,6 +27,7 @@ float tanhf (float); double acosh (double); double asinh (double); +double atan (double); double atan2 (double, double); double cbrt (double); double cosh (double); diff --git a/pl/math/test/testcases/directed/atan.tst b/pl/math/test/testcases/directed/atan.tst new file mode 100644 index 0000000..5716276 --- /dev/null +++ b/pl/math/test/testcases/directed/atan.tst @@ -0,0 +1,22 @@ +; atan.tst +; +; Copyright 1999-2022, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=atan op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan op1=7ff00000.00000000 result=3ff921fb.54442d18.469 errno=0 +func=atan op1=fff00000.00000000 result=bff921fb.54442d18.469 errno=0 +func=atan op1=00000000.00000000 result=00000000.00000000 errno=0 +func=atan op1=80000000.00000000 result=80000000.00000000 errno=0 +; Inconsistent behavior was detected for the following 2 cases. +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=atan op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux +func=atan op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux + +func=atan op1=3ff00000.00000000 result=3fe921fb.54442d18.469 errno=0 +func=atan op1=bff00000.00000000 result=bfe921fb.54442d18.469 errno=0 diff --git a/pl/math/v_atan_2u5.c b/pl/math/v_atan_2u5.c index 43b4abd..92479ab 100644 --- a/pl/math/v_atan_2u5.c +++ b/pl/math/v_atan_2u5.c @@ -15,6 +15,8 @@ #define PiOver2 v_f64 (0x1.921fb54442d18p+0) #define AbsMask v_u64 (0x7fffffffffffffff) +#define TinyBound 0x3e1 /* top12(asuint64(0x1p-30)). */ +#define BigBound 0x434 /* top12(asuint64(0x1p53)). */ /* Fast implementation of vector atan. Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using @@ -24,11 +26,20 @@ VPCS_ATTR v_f64_t V_NAME (atan) (v_f64_t x) { - /* No need to trigger special case. Small cases, infs and nans - are supported by our approximation technique. */ + /* Small cases, infs and nans are supported by our approximation technique, + but do not set fenv flags correctly. Only trigger special case if we need + fenv. */ v_u64_t ix = v_as_u64_f64 (x); v_u64_t sign = ix & ~AbsMask; +#if WANT_SIMD_EXCEPT + v_u64_t ia12 = (ix >> 52) & 0x7ff; + v_u64_t special = v_cond_u64 (ia12 - TinyBound > BigBound - TinyBound); + /* If any lane is special, fall back to the scalar routine for all lanes. */ + if (unlikely (v_any_u64 (special))) + return v_call_f64 (atan, x, v_f64 (0), v_u64 (-1)); +#endif + /* Argument reduction: y := arctan(x) for x < 1 y := pi/2 + arctan(-1/x) for x > 1 @@ -46,16 +57,18 @@ v_f64_t V_NAME (atan) (v_f64_t x) /* y = atan(x) if x>0, -atan(-x) otherwise. 
*/ y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign); - return y; } VPCS_ALIAS PL_SIG (V, D, 1, atan, -10.0, 10.0) PL_TEST_ULP (V_NAME (atan), 1.78) -PL_TEST_INTERVAL (V_NAME (atan), -10.0, 10.0, 50000) -PL_TEST_INTERVAL (V_NAME (atan), -1.0, 1.0, 40000) -PL_TEST_INTERVAL (V_NAME (atan), 0.0, 1.0, 40000) -PL_TEST_INTERVAL (V_NAME (atan), 1.0, 100.0, 40000) -PL_TEST_INTERVAL (V_NAME (atan), 1e6, 1e32, 40000) +PL_TEST_EXPECT_FENV (V_NAME (atan), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (atan), 0, 0x1p-30, 10000) +PL_TEST_INTERVAL (V_NAME (atan), -0, -0x1p-30, 1000) +PL_TEST_INTERVAL (V_NAME (atan), 0x1p-30, 0x1p53, 900000) +PL_TEST_INTERVAL (V_NAME (atan), -0x1p-30, -0x1p53, 90000) +PL_TEST_INTERVAL (V_NAME (atan), 0x1p53, inf, 10000) +PL_TEST_INTERVAL (V_NAME (atan), -0x1p53, -inf, 1000) + #endif -- cgit v1.2.3 From 0d000be25df5ecebba2cf95f219a53c218fcb761 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Thu, 22 Dec 2022 17:50:40 +0000 Subject: pl/math: Add scalar & vector/Neon atanh New routines are both based on existing log1p routines. Scalar is accurate to 3 ULP, Neon to 3.5 ULP. Both set fp exceptions correctly regardless of build config. --- pl/math/atanh_3u.c | 85 ++++++++++++++++++++++++ pl/math/include/mathlib.h | 5 ++ pl/math/pairwise_horner_wrap.h | 22 +++++-- pl/math/s_atanh_3u5.c | 6 ++ pl/math/test/testcases/directed/atanh.tst | 22 +++++++ pl/math/v_atanh_3u5.c | 104 ++++++++++++++++++++++++++++++ pl/math/vn_atanh_3u5.c | 12 ++++ 7 files changed, 251 insertions(+), 5 deletions(-) create mode 100644 pl/math/atanh_3u.c create mode 100644 pl/math/s_atanh_3u5.c create mode 100644 pl/math/test/testcases/directed/atanh.tst create mode 100644 pl/math/v_atanh_3u5.c create mode 100644 pl/math/vn_atanh_3u5.c diff --git a/pl/math/atanh_3u.c b/pl/math/atanh_3u.c new file mode 100644 index 0000000..b72326c --- /dev/null +++ b/pl/math/atanh_3u.c @@ -0,0 +1,85 @@ +/* + * Double-precision atanh(x) function. + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "estrin.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask 0x7fffffffffffffff +#define Half 0x3fe0000000000000 +#define One 0x3ff0000000000000 +#define Ln2Hi 0x1.62e42fefa3800p-1 +#define Ln2Lo 0x1.ef35793c76730p-45 +#define OneMHfRt2Top \ + 0x00095f62 /* top32(asuint64(1)) - top32(asuint64(sqrt(2)/2)). */ +#define OneTop12 0x3ff +#define HfRt2Top 0x3fe6a09e /* top32(asuint64(sqrt(2)/2)). */ +#define BottomMask 0xffffffff +#define C(i) __log1p_data.coeffs[i] + +static inline double +log1p_inline (double x) +{ + /* Helper for calculating log(1 + x) using order-18 polynomial on a reduced + interval. Copied from log1p_2u.c, with no special-case handling. See that + file for details of the algorithm. */ + double m = x + 1; + uint64_t mi = asuint64 (m); + + /* Decompose x + 1 into (f + 1) * 2^k, with k chosen such that f is in + [sqrt(2)/2, sqrt(2)]. */ + uint32_t u = (mi >> 32) + OneMHfRt2Top; + int32_t k = (int32_t) (u >> 20) - OneTop12; + uint32_t utop = (u & 0x000fffff) + HfRt2Top; + uint64_t u_red = ((uint64_t) utop << 32) | (mi & BottomMask); + double f = asdouble (u_red) - 1; + + /* Correction term for round-off in f. */ + double cm = (x - (m - 1)) / m; + + /* Approximate log1p(f) with polynomial. */ + double f2 = f * f; + double f4 = f2 * f2; + double f8 = f4 * f4; + double p = fma (f, ESTRIN_18 (f, f2, f4, f8, f8 * f8, C) * f, f); + + /* Recombine log1p(x) = k*log2 + log1p(f) + c/m. 
*/ + double kd = k; + double y = fma (Ln2Lo, kd, cm); + return y + fma (Ln2Hi, kd, p); +} + +/* Approximation for double-precision inverse tanh(x), using a simplified + version of log1p. Greatest observed error is 3.00 ULP: + atanh(0x1.e58f3c108d714p-4) got 0x1.e7da77672a647p-4 + want 0x1.e7da77672a64ap-4. */ +double +atanh (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t sign = ix & ~AbsMask; + uint64_t ia = ix & AbsMask; + + if (unlikely (ia == One)) + return __math_divzero (sign >> 32); + + if (unlikely (ia > One)) + return __math_invalid (x); + + double halfsign = asdouble (Half | sign); + double ax = asdouble (ia); + return halfsign * log1p_inline ((2 * ax) / (1 - ax)); +} + +PL_SIG (S, D, 1, atanh, -1.0, 1.0) +PL_TEST_ULP (atanh, 3.00) +PL_TEST_INTERVAL (atanh, 0, 0x1p-23, 10000) +PL_TEST_INTERVAL (atanh, -0, -0x1p-23, 10000) +PL_TEST_INTERVAL (atanh, 0x1p-23, 1, 90000) +PL_TEST_INTERVAL (atanh, -0x1p-23, -1, 90000) +PL_TEST_INTERVAL (atanh, 1, inf, 100) +PL_TEST_INTERVAL (atanh, -1, -inf, 100) diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 05fa306..041b407 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -29,6 +29,7 @@ double acosh (double); double asinh (double); double atan (double); double atan2 (double, double); +double atanh (double); double cbrt (double); double cosh (double); double erfc (double); @@ -56,6 +57,7 @@ float __s_tanhf (float); double __s_asinh (double); double __s_atan (double); double __s_atan2 (double, double); +double __s_atanh (double); double __s_cbrt (double); double __s_cosh (double); double __s_erf (double); @@ -85,6 +87,7 @@ __f64x2_t __v_atan (__f64x2_t); __f32x4_t __v_atan2f (__f32x4_t, __f32x4_t); __f64x2_t __v_atan2 (__f64x2_t, __f64x2_t); __f32x4_t __v_atanhf (__f32x4_t); +__f64x2_t __v_atanh (__f64x2_t); __f32x4_t __v_cbrtf (__f32x4_t); __f64x2_t __v_cbrt (__f64x2_t); __f32x4_t __v_coshf (__f32x4_t); @@ -117,6 +120,7 @@ __vpcs __f64x2_t __vn_atan (__f64x2_t); __vpcs __f32x4_t __vn_atan2f (__f32x4_t, __f32x4_t); __vpcs __f64x2_t __vn_atan2 (__f64x2_t, __f64x2_t); __vpcs __f32x4_t __vn_atanhf (__f32x4_t); +__vpcs __f64x2_t __vn_atanh (__f64x2_t); __vpcs __f32x4_t __vn_cbrtf (__f32x4_t); __vpcs __f64x2_t __vn_cbrt (__f64x2_t); __vpcs __f32x4_t __vn_coshf (__f32x4_t); @@ -146,6 +150,7 @@ __vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t); __vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t); __vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t); __vpcs __f32x4_t _ZGVnN4v_atanhf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_atanh (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_cbrtf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_cbrt (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t); diff --git a/pl/math/pairwise_horner_wrap.h b/pl/math/pairwise_horner_wrap.h index 5bc287b..e75a491 100644 --- a/pl/math/pairwise_horner_wrap.h +++ b/pl/math/pairwise_horner_wrap.h @@ -7,11 +7,14 @@ // clang-format off #define PW_HORNER_1_(x, c, i) FMA(x, C(i + 1), C(i)) -#define PW_HORNER_3_(x, x2, c, i) FMA(x2, PW_HORNER_1_(x, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_5_(x, x2, c, i) FMA(x2, PW_HORNER_3_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_7_(x, x2, c, i) FMA(x2, PW_HORNER_5_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_9_(x, x2, c, i) FMA(x2, PW_HORNER_7_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_11_(x, x2, c, i) FMA(x2, PW_HORNER_9_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_3_(x, x2, c, i) FMA(x2, PW_HORNER_1_ (x, c, i + 2), PW_HORNER_1_(x, c, i)) 
+#define PW_HORNER_5_(x, x2, c, i) FMA(x2, PW_HORNER_3_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_7_(x, x2, c, i) FMA(x2, PW_HORNER_5_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_9_(x, x2, c, i) FMA(x2, PW_HORNER_7_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_11_(x, x2, c, i) FMA(x2, PW_HORNER_9_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_13_(x, x2, c, i) FMA(x2, PW_HORNER_11_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_15_(x, x2, c, i) FMA(x2, PW_HORNER_13_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_17_(x, x2, c, i) FMA(x2, PW_HORNER_15_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) #define PAIRWISE_HORNER_1(x, c) PW_HORNER_1_ (x, c, 0) #define PAIRWISE_HORNER_3(x, x2, c) PW_HORNER_3_ (x, x2, c, 0) @@ -19,6 +22,9 @@ #define PAIRWISE_HORNER_7(x, x2, c) PW_HORNER_7_ (x, x2, c, 0) #define PAIRWISE_HORNER_9(x, x2, c) PW_HORNER_9_ (x, x2, c, 0) #define PAIRWISE_HORNER_11(x, x2, c) PW_HORNER_11_(x, x2, c, 0) +#define PAIRWISE_HORNER_13(x, x2, c) PW_HORNER_13_(x, x2, c, 0) +#define PAIRWISE_HORNER_15(x, x2, c) PW_HORNER_15_(x, x2, c, 0) +#define PAIRWISE_HORNER_17(x, x2, c) PW_HORNER_17_(x, x2, c, 0) #define PW_HORNER_2_(x, x2, c, i) FMA(x2, c(i + 2), PW_HORNER_1_(x, c, i)) #define PW_HORNER_4_(x, x2, c, i) FMA(x2, PW_HORNER_2_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) @@ -26,6 +32,9 @@ #define PW_HORNER_8_(x, x2, c, i) FMA(x2, PW_HORNER_6_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) #define PW_HORNER_10_(x, x2, c, i) FMA(x2, PW_HORNER_8_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) #define PW_HORNER_12_(x, x2, c, i) FMA(x2, PW_HORNER_10_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_14_(x, x2, c, i) FMA(x2, PW_HORNER_12_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_16_(x, x2, c, i) FMA(x2, PW_HORNER_14_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_18_(x, x2, c, i) FMA(x2, PW_HORNER_16_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) #define PAIRWISE_HORNER_2(x, x2, c) PW_HORNER_2_ (x, x2, c, 0) #define PAIRWISE_HORNER_4(x, x2, c) PW_HORNER_4_ (x, x2, c, 0) @@ -33,4 +42,7 @@ #define PAIRWISE_HORNER_8(x, x2, c) PW_HORNER_8_(x, x2, c, 0) #define PAIRWISE_HORNER_10(x, x2, c) PW_HORNER_10_(x, x2, c, 0) #define PAIRWISE_HORNER_12(x, x2, c) PW_HORNER_12_(x, x2, c, 0) +#define PAIRWISE_HORNER_14(x, x2, c) PW_HORNER_14_(x, x2, c, 0) +#define PAIRWISE_HORNER_16(x, x2, c) PW_HORNER_16_(x, x2, c, 0) +#define PAIRWISE_HORNER_18(x, x2, c) PW_HORNER_18_(x, x2, c, 0) // clang-format on diff --git a/pl/math/s_atanh_3u5.c b/pl/math/s_atanh_3u5.c new file mode 100644 index 0000000..11877c6 --- /dev/null +++ b/pl/math/s_atanh_3u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_atanh_3u5.c" diff --git a/pl/math/test/testcases/directed/atanh.tst b/pl/math/test/testcases/directed/atanh.tst new file mode 100644 index 0000000..530df8b --- /dev/null +++ b/pl/math/test/testcases/directed/atanh.tst @@ -0,0 +1,22 @@ +; atanh.tst +; +; Copyright 2009-2022, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=atanh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=atanh op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=atanh op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atanh op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atanh op1=7ff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=atanh op1=fff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=atanh op1=3ff00000.00000001 result=7ff80000.00000001 errno=EDOM status=i +func=atanh op1=bff00000.00000001 result=7ff80000.00000001 errno=EDOM status=i +func=atanh op1=3ff00000.00000000 result=7ff00000.00000000 errno=ERANGE status=z +func=atanh op1=bff00000.00000000 result=fff00000.00000000 errno=ERANGE status=z +func=atanh op1=00000000.00000000 result=00000000.00000000 errno=0 +func=atanh op1=80000000.00000000 result=80000000.00000000 errno=0 +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=atanh op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux +func=atanh op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux diff --git a/pl/math/v_atanh_3u5.c b/pl/math/v_atanh_3u5.c new file mode 100644 index 0000000..ca68020 --- /dev/null +++ b/pl/math/v_atanh_3u5.c @@ -0,0 +1,104 @@ +/* + * Double-precision vector atanh(x) function. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pairwise_horner.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define Ln2Hi v_f64 (0x1.62e42fefa3800p-1) +#define Ln2Lo v_f64 (0x1.ef35793c76730p-45) +#define HfRt2Top 0x3fe6a09e00000000 /* top32(asuint64(sqrt(2)/2)) << 32. */ +#define OneMHfRt2Top \ + 0x00095f6200000000 /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) \ + << 32. */ +#define OneTop12 0x3ff +#define BottomMask 0xffffffff + +#define AbsMask 0x7fffffffffffffff +#define Half 0x3fe0000000000000 +#define One 0x3ff0000000000000 + +#define C(i) v_f64 (__log1p_data.coeffs[i]) + +static inline v_f64_t +log1p_inline (v_f64_t x) +{ + /* Helper for calculating log(1 + x) using order-18 polynomial on a reduced + interval. Copied from v_log1p_2u5.c, with the following modifications: + - No special-case handling. + - Pairwise Horner instead of Estrin for improved accuracy. + - Slightly different recombination to reuse f2. + See original source for details of the algorithm. */ + v_f64_t m = x + 1; + v_u64_t mi = v_as_u64_f64 (m); + + /* Decompose x + 1 into (f + 1) * 2^k, with k chosen such that f is in + [sqrt(2)/2, sqrt(2)]. */ + v_u64_t u = mi + OneMHfRt2Top; + v_s64_t ki = v_as_s64_u64 (u >> 52) - OneTop12; + v_f64_t k = v_to_f64_s64 (ki); + v_u64_t utop = (u & 0x000fffff00000000) + HfRt2Top; + v_u64_t u_red = utop | (mi & BottomMask); + v_f64_t f = v_as_f64_u64 (u_red) - 1; + + /* Correction term for round-off in f. */ + v_f64_t cm = (x - (m - 1)) / m; + + /* Approximate log1p(f) with polynomial. */ + v_f64_t f2 = f * f; + v_f64_t p = PAIRWISE_HORNER_18 (f, f2, C); + + /* Recombine log1p(x) = k*log2 + log1p(f) + c/m. 
*/ + v_f64_t ylo = v_fma_f64 (k, Ln2Lo, cm); + v_f64_t yhi = v_fma_f64 (k, Ln2Hi, f); + v_f64_t y = v_fma_f64 (f2, p, ylo + yhi); + return y; +} + +VPCS_ATTR +NOINLINE static v_f64_t +specialcase (v_f64_t x, v_f64_t y, v_u64_t special) +{ + return v_call_f64 (atanh, x, y, special); +} + +/* Approximation for vector double-precision atanh(x) using modified log1p. + The greatest observed error is 3.31 ULP: + __v_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6 + want 0x1.ffd8ff31b501cp-6. */ +VPCS_ATTR +v_f64_t V_NAME (atanh) (v_f64_t x) +{ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t sign = ix & ~AbsMask; + v_u64_t ia = ix & AbsMask; + v_u64_t special = v_cond_u64 (ia >= One); + v_f64_t halfsign = v_as_f64_u64 (sign | Half); + + /* Mask special lanes with 0 to prevent spurious underflow. */ + v_f64_t ax = v_sel_f64 (special, v_f64 (0), v_as_f64_u64 (ia)); + v_f64_t y = halfsign * log1p_inline ((2 * ax) / (1 - ax)); + + if (unlikely (v_any_u64 (special))) + return specialcase (x, y, special); + return y; +} +VPCS_ALIAS + +PL_SIG (V, D, 1, atanh, -1.0, 1.0) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (atanh)) +PL_TEST_ULP (V_NAME (atanh), 3.32) +PL_TEST_INTERVAL_C (V_NAME (atanh), 0, 0x1p-23, 10000, 0) +PL_TEST_INTERVAL_C (V_NAME (atanh), -0, -0x1p-23, 10000, 0) +PL_TEST_INTERVAL_C (V_NAME (atanh), 0x1p-23, 1, 90000, 0) +PL_TEST_INTERVAL_C (V_NAME (atanh), -0x1p-23, -1, 90000, 0) +PL_TEST_INTERVAL_C (V_NAME (atanh), 1, inf, 100, 0) +PL_TEST_INTERVAL_C (V_NAME (atanh), -1, -inf, 100, 0) +#endif diff --git a/pl/math/vn_atanh_3u5.c b/pl/math/vn_atanh_3u5.c new file mode 100644 index 0000000..27a5af5 --- /dev/null +++ b/pl/math/vn_atanh_3u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_atanh. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_atanh, _ZGVnN2v_atanh) +#include "v_atanh_3u5.c" +#endif -- cgit v1.2.3 From 08482af8d045312f51ce407adc689459fb45dbf2 Mon Sep 17 00:00:00 2001 From: Pierre Blanchard Date: Fri, 30 Dec 2022 11:57:50 +0000 Subject: pl/math: Add vector/SVE log2f New SVE routine is an SVE port of the Neon algorithm and is accurate to 2.48 ULPs. 
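The underlying Neon algorithm, which this routine ports lane-for-lane
to SVE, can be sketched in scalar C; poly_sketch is a placeholder for
the fitted polynomial __v_log2f_data.poly, arithmetic right shift is
assumed (as in the vector code), and special cases (zero, negative,
subnormal, inf/nan inputs) are ignored:

#include <math.h>
#include <stdint.h>
#include <string.h>

/* Placeholder for the fitted polynomial P, with P(r) ~ log2(1+r)/r on
   the reduced interval.  */
static float
poly_sketch (float r)
{
  return r == 0.0f ? 0x1.715476p+0f /* 1/ln(2).  */
		   : log2f (1.0f + r) / r;
}

/* Subtracting Off = asuint(0x1.555556p-1) recentres the mantissa so
   that x = 2^n * (1+r) with 1+r in [2/3, 4/3), and then
   log2(x) = n + log2(1+r) ~ n + r * P(r).  */
static float
log2f_sketch (float x)
{
  uint32_t u;
  memcpy (&u, &x, sizeof u);
  u -= 0x3f2aaaab;		      /* Off.  */
  float n = (int32_t) u >> 23;	      /* Sign-extend the exponent.  */
  u = (u & 0x007fffff) + 0x3f2aaaab;
  float m;
  memcpy (&m, &u, sizeof m);
  float r = m - 1.0f;
  return n + r * poly_sketch (r);
}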
--- pl/math/include/mathlib.h | 2 ++ pl/math/sv_log2f_2u5.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 pl/math/sv_log2f_2u5.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 041b407..f05ad03 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -190,6 +190,7 @@ svfloat32_t __sv_logf_x (svfloat32_t, svbool_t); svfloat64_t __sv_log_x (svfloat64_t, svbool_t); svfloat32_t __sv_log10f_x (svfloat32_t, svbool_t); svfloat64_t __sv_log10_x (svfloat64_t, svbool_t); +svfloat32_t __sv_log2f_x (svfloat32_t, svbool_t); svfloat32_t __sv_powif_x (svfloat32_t, svint32_t, svbool_t); svfloat64_t __sv_powi_x (svfloat64_t, svint64_t, svbool_t); svfloat32_t __sv_sinf_x (svfloat32_t, svbool_t); @@ -210,6 +211,7 @@ svfloat32_t _ZGVsMxv_logf (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_log (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_log10f (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_log10 (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_log2f (svfloat32_t, svbool_t); svfloat32_t _ZGVsMxvv_powi(svfloat32_t, svint32_t, svbool_t); svfloat64_t _ZGVsMxvv_powk(svfloat64_t, svint64_t, svbool_t); svfloat32_t _ZGVsMxv_sinf (svfloat32_t, svbool_t); diff --git a/pl/math/sv_log2f_2u5.c b/pl/math/sv_log2f_2u5.c new file mode 100644 index 0000000..6488658 --- /dev/null +++ b/pl/math/sv_log2f_2u5.c @@ -0,0 +1,79 @@ +/* + * Single-precision vector/SVE log2 function. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#define P(i) __v_log2f_data.poly[i] + +#define Ln2 (0x1.62e43p-1f) /* 0x3f317218. */ +#define Min (0x00800000) +#define Max (0x7f800000) +#define Mask (0x007fffff) +#define Off (0x3f2aaaab) /* 0.666667. */ + +static NOINLINE sv_f32_t +specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp) +{ + return sv_call_f32 (log2f, x, y, cmp); +} + +/* Optimised implementation of SVE log2f, using the same algorithm + and polynomial as Neon log2f. Maximum error is 2.48 ULPs: + __sv_log2f(0x1.558174p+0) got 0x1.a9be84p-2 + want 0x1.a9be8p-2. */ +sv_f32_t +__sv_log2f_x (sv_f32_t x, const svbool_t pg) +{ + sv_u32_t u = sv_as_u32_f32 (x); + svbool_t special + = svcmpge_u32 (pg, svsub_n_u32_x (pg, u, Min), sv_u32 (Max - Min)); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + u = svsub_n_u32_x (pg, u, Off); + sv_f32_t n = sv_to_f32_s32_x (pg, svasr_n_s32_x (pg, sv_as_s32_u32 (u), + 23)); /* Sign-extend. */ + u = svand_n_u32_x (pg, u, Mask); + u = svadd_n_u32_x (pg, u, Off); + sv_f32_t r = svsub_n_f32_x (pg, sv_as_f32_u32 (u), 1.0f); + + /* y = log2(1+r) + n. */ + sv_f32_t r2 = svmul_f32_x (pg, r, r); + + /* Evaluate polynomial using pairwise Horner scheme. 
*/ + sv_f32_t p67 = sv_fma_n_f32_x (pg, P (7), r, sv_f32 (P (6))); + sv_f32_t p45 = sv_fma_n_f32_x (pg, P (5), r, sv_f32 (P (4))); + sv_f32_t p23 = sv_fma_n_f32_x (pg, P (3), r, sv_f32 (P (2))); + sv_f32_t p01 = sv_fma_n_f32_x (pg, P (1), r, sv_f32 (P (0))); + sv_f32_t y; + y = sv_fma_n_f32_x (pg, P (8), r2, p67); + y = sv_fma_f32_x (pg, y, r2, p45); + y = sv_fma_f32_x (pg, y, r2, p23); + y = sv_fma_f32_x (pg, y, r2, p01); + y = sv_fma_f32_x (pg, y, r, n); + + if (unlikely (svptest_any (pg, special))) + return specialcase (x, y, special); + return y; +} + +PL_ALIAS (__sv_log2f_x, _ZGVsMxv_log2f) + +PL_SIG (SV, F, 1, log2, 0.01, 11.1) +PL_TEST_ULP (__sv_log2f, 1.99) +PL_TEST_EXPECT_FENV_ALWAYS (__sv_log2f) +PL_TEST_INTERVAL (__sv_log2f, -0.0, -0x1p126, 4000) +PL_TEST_INTERVAL (__sv_log2f, 0.0, 0x1p-126, 4000) +PL_TEST_INTERVAL (__sv_log2f, 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (__sv_log2f, 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (__sv_log2f, 1.0, 100, 50000) +PL_TEST_INTERVAL (__sv_log2f, 100, inf, 50000) + +#endif // SV_SUPPORTED -- cgit v1.2.3 From 2364ce531894c760f0742e17c490069ffa0032bd Mon Sep 17 00:00:00 2001 From: Pierre Blanchard Date: Fri, 30 Dec 2022 12:06:10 +0000 Subject: pl/math: Add vector/SVE log2 The new SVE implementation is a direct port of Neon log2, and is accurate to 2.58 ULPs. Update error threshold and comments for Neon log2 too, new approximate argmax but same threshold. --- pl/math/include/mathlib.h | 2 ++ pl/math/sv_log2_3u.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++ pl/math/v_log2_3u.c | 8 ++--- 3 files changed, 91 insertions(+), 4 deletions(-) create mode 100644 pl/math/sv_log2_3u.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index f05ad03..67c3c9d 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -191,6 +191,7 @@ svfloat64_t __sv_log_x (svfloat64_t, svbool_t); svfloat32_t __sv_log10f_x (svfloat32_t, svbool_t); svfloat64_t __sv_log10_x (svfloat64_t, svbool_t); svfloat32_t __sv_log2f_x (svfloat32_t, svbool_t); +svfloat64_t __sv_log2_x (svfloat64_t, svbool_t); svfloat32_t __sv_powif_x (svfloat32_t, svint32_t, svbool_t); svfloat64_t __sv_powi_x (svfloat64_t, svint64_t, svbool_t); svfloat32_t __sv_sinf_x (svfloat32_t, svbool_t); @@ -212,6 +213,7 @@ svfloat64_t _ZGVsMxv_log (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_log10f (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_log10 (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_log2f (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_log2 (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxvv_powi(svfloat32_t, svint32_t, svbool_t); svfloat64_t _ZGVsMxvv_powk(svfloat64_t, svint64_t, svbool_t); svfloat32_t _ZGVsMxv_sinf (svfloat32_t, svbool_t); diff --git a/pl/math/sv_log2_3u.c b/pl/math/sv_log2_3u.c new file mode 100644 index 0000000..d66a474 --- /dev/null +++ b/pl/math/sv_log2_3u.c @@ -0,0 +1,85 @@ +/* + * Double-precision SVE log2 function. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#define InvLn2 sv_f64 (0x1.71547652b82fep0) +#define N (1 << V_LOG2_TABLE_BITS) +#define OFF 0x3fe6900900000000 +#define P(i) sv_f64 (__v_log2_data.poly[i]) + +NOINLINE static sv_f64_t +specialcase (sv_f64_t x, sv_f64_t y, const svbool_t cmp) +{ + return sv_call_f64 (log2, x, y, cmp); +} + +/* Double-precision SVE log2 routine. 
Implements the same algorithm as vector + log10, with coefficients and table entries scaled in extended precision. + The maximum observed error is 2.58 ULP: + __v_log2(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5 + want 0x1.fffb34198d9ddp-5. */ +sv_f64_t +__sv_log2_x (sv_f64_t x, const svbool_t pg) +{ + sv_u64_t ix = sv_as_u64_f64 (x); + sv_u64_t top = svlsr_n_u64_x (pg, ix, 48); + + svbool_t special + = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, top, 0x0010), 0x7ff0 - 0x0010); + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + sv_u64_t tmp = svsub_n_u64_x (pg, ix, OFF); + sv_u64_t i + = sv_mod_n_u64_x (pg, svlsr_n_u64_x (pg, tmp, 52 - V_LOG2_TABLE_BITS), N); + sv_f64_t k + = sv_to_f64_s64_x (pg, svasr_n_s64_x (pg, sv_as_s64_u64 (tmp), 52)); + sv_f64_t z = sv_as_f64_u64 ( + svsub_u64_x (pg, ix, svand_n_u64_x (pg, tmp, 0xfffULL << 52))); + + sv_u64_t idx = svmul_n_u64_x (pg, i, 2); + sv_f64_t invc = sv_lookup_f64_x (pg, &__v_log2_data.tab[0].invc, idx); + sv_f64_t log2c = sv_lookup_f64_x (pg, &__v_log2_data.tab[0].log2c, idx); + + /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */ + + sv_f64_t r = sv_fma_f64_x (pg, z, invc, sv_f64 (-1.0)); + sv_f64_t w = sv_fma_f64_x (pg, r, InvLn2, log2c); + + sv_f64_t r2 = svmul_f64_x (pg, r, r); + sv_f64_t p_23 = sv_fma_f64_x (pg, P (3), r, P (2)); + sv_f64_t p_01 = sv_fma_f64_x (pg, P (1), r, P (0)); + sv_f64_t y = sv_fma_f64_x (pg, P (4), r2, p_23); + y = sv_fma_f64_x (pg, y, r2, p_01); + y = sv_fma_f64_x (pg, y, r2, svadd_f64_x (pg, k, w)); + + if (unlikely (svptest_any (pg, special))) + { + return specialcase (x, y, special); + } + return y; +} + +PL_ALIAS (__sv_log2_x, _ZGVsMxv_log2) + +PL_SIG (SV, D, 1, log2, 0.01, 11.1) +PL_TEST_ULP (__sv_log2, 2.09) +PL_TEST_EXPECT_FENV_ALWAYS (__sv_log2) +PL_TEST_INTERVAL (__sv_log2, -0.0, -0x1p126, 1000) +PL_TEST_INTERVAL (__sv_log2, 0.0, 0x1p-126, 4000) +PL_TEST_INTERVAL (__sv_log2, 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (__sv_log2, 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (__sv_log2, 1.0, 100, 50000) +PL_TEST_INTERVAL (__sv_log2, 100, inf, 50000) + +#endif diff --git a/pl/math/v_log2_3u.c b/pl/math/v_log2_3u.c index e0a854f..5b9bdd8 100644 --- a/pl/math/v_log2_3u.c +++ b/pl/math/v_log2_3u.c @@ -48,9 +48,9 @@ specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) /* Double-precision vector log2 routine. Implements the same algorithm as vector log10, with coefficients and table entries scaled in extended precision. - The maximum observed error is 2.59 ULP: - __v_log2(0x1.0b555054a9bd1p+0) got 0x1.fff6977bdced3p-5 - want 0x1.fff6977bdced6p-5. */ + The maximum observed error is 2.58 ULP: + __v_log2(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5 + want 0x1.fffb34198d9ddp-5. */ VPCS_ATTR v_f64_t V_NAME (log2) (v_f64_t x) { @@ -89,7 +89,7 @@ v_f64_t V_NAME (log2) (v_f64_t x) VPCS_ALIAS PL_SIG (V, D, 1, log2, 0.01, 11.1) -PL_TEST_ULP (V_NAME (log2), 2.10) +PL_TEST_ULP (V_NAME (log2), 2.09) PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (log2)) PL_TEST_INTERVAL (V_NAME (log2), -0.0, -0x1p126, 100) PL_TEST_INTERVAL (V_NAME (log2), 0x1p-149, 0x1p-126, 4000) -- cgit v1.2.3 From 47c03a91d990e9dad25b7e9925c4b72533a242f7 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Thu, 5 Jan 2023 10:29:55 +0000 Subject: pl/math: Add scalar & vector/Neon tanh New routines use the same algorithm, reliant on a modified version of expm1, and are accurate to 3 ULP. 
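Both are built on the same identity, sketched here with libm's expm1
standing in for the inlined helper:

#include <math.h>

/* With q = e^(2x) - 1, tanh(x) = (e^(2x) - 1) / (e^(2x) + 1)
   = q / (q + 2).  The real routines special-case tiny inputs and
   |x| > 0x1.241bf835f9d5fp+4, above which tanh(x) rounds to +/-1.  */
static double
tanh_sketch (double x)
{
  double q = expm1 (2 * x);
  return q / (q + 2);
}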
--- pl/math/include/mathlib.h | 5 ++ pl/math/s_tanh_3u.c | 6 ++ pl/math/tanh_3u.c | 82 ++++++++++++++++++++++++++++ pl/math/test/testcases/directed/tanh.tst | 18 ++++++ pl/math/v_tanh_3u.c | 94 ++++++++++++++++++++++++++++++++ pl/math/vn_tanh_3u.c | 12 ++++ 6 files changed, 217 insertions(+) create mode 100644 pl/math/s_tanh_3u.c create mode 100644 pl/math/tanh_3u.c create mode 100644 pl/math/test/testcases/directed/tanh.tst create mode 100644 pl/math/v_tanh_3u.c create mode 100644 pl/math/vn_tanh_3u.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 67c3c9d..43c5ebc 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -37,6 +37,7 @@ double expm1 (double); double log10 (double); double log1p (double); double sinh (double); +double tanh (double); float __s_asinhf (float); float __s_atanf (float); @@ -67,6 +68,7 @@ double __s_log10 (double); double __s_log1p (double); double __s_log2 (double); double __s_sinh (double); +double __s_tanh (double); #if __aarch64__ #if __GNUC__ >= 5 @@ -108,6 +110,7 @@ __f32x4_t __v_sinhf (__f32x4_t); __f64x2_t __v_sinh (__f64x2_t); __f32x4_t __v_tanf (__f32x4_t); __f32x4_t __v_tanhf (__f32x4_t); +__f64x2_t __v_tanh (__f64x2_t); #if __GNUC__ >= 9 || __clang_major__ >= 8 #define __vpcs __attribute__((__aarch64_vector_pcs__)) @@ -141,6 +144,7 @@ __vpcs __f32x4_t __vn_sinhf (__f32x4_t); __vpcs __f64x2_t __vn_sinh (__f64x2_t); __vpcs __f32x4_t __vn_tanf (__f32x4_t); __vpcs __f32x4_t __vn_tanhf (__f32x4_t); +__vpcs __f64x2_t __vn_tanh (__f64x2_t); /* Vector functions following the vector PCS using ABI names. */ __vpcs __f32x4_t _ZGVnN4v_asinhf (__f32x4_t); @@ -171,6 +175,7 @@ __vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_tanhf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_tanh (__f64x2_t); #endif diff --git a/pl/math/s_tanh_3u.c b/pl/math/s_tanh_3u.c new file mode 100644 index 0000000..a4d7bce --- /dev/null +++ b/pl/math/s_tanh_3u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_tanh_3u.c" diff --git a/pl/math/tanh_3u.c b/pl/math/tanh_3u.c new file mode 100644 index 0000000..46d9fb3 --- /dev/null +++ b/pl/math/tanh_3u.c @@ -0,0 +1,82 @@ +/* + * Double-precision tanh(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "math_config.h" +#include "estrin.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask 0x7fffffffffffffff +#define InvLn2 0x1.71547652b82fep0 +#define Ln2hi 0x1.62e42fefa39efp-1 +#define Ln2lo 0x1.abc9e3b39803fp-56 +#define Shift 0x1.8p52 +#define C(i) __expm1_poly[i] + +#define BoringBound 0x403241bf835f9d5f /* asuint64 (0x1.241bf835f9d5fp+4). */ +#define TinyBound 0x3e40000000000000 /* asuint64 (0x1p-27). */ +#define One 0x3ff0000000000000 + +static inline double +expm1_inline (double x) +{ + /* Helper routine for calculating exp(x) - 1. Copied from expm1_2u5.c, with + several simplifications: + - No special-case handling for tiny or special values. + - Simpler combination of p and t in final stage of the algorithm. + - Use shift-and-add instead of ldexp to calculate t. */ + + /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ + double j = fma (InvLn2, x, Shift) - Shift; + int64_t i = j; + double f = fma (j, -Ln2hi, x); + f = fma (j, -Ln2lo, f); + + /* Approximate expm1(f) using polynomial. 
*/ + double f2 = f * f; + double f4 = f2 * f2; + double p = fma (f2, ESTRIN_10 (f, f2, f4, f4 * f4, C), f); + + /* t = 2 ^ i. */ + double t = asdouble ((uint64_t) (i + 1023) << 52); + /* expm1(x) = p * t + (t - 1). */ + return fma (p, t, t - 1); +} + +/* Approximation for double-precision tanh(x), using a simplified version of + expm1. The greatest observed error is 2.75 ULP: + tanh(-0x1.c143c3a44e087p-3) got -0x1.ba31ba4691ab7p-3 + want -0x1.ba31ba4691ab4p-3. */ +double +tanh (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t ia = ix & AbsMask; + uint64_t sign = ix & ~AbsMask; + + if (unlikely (ia > BoringBound)) + { + if (ia > 0x7ff0000000000000) + return __math_invalid (x); + return asdouble (One | sign); + } + + if (unlikely (ia < TinyBound)) + return x; + + /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ + double q = expm1_inline (2 * x); + return q / (q + 2); +} + +PL_SIG (S, D, 1, tanh, -10.0, 10.0) +PL_TEST_ULP (tanh, 2.26) +PL_TEST_INTERVAL (tanh, 0, TinyBound, 1000) +PL_TEST_INTERVAL (tanh, -0, -TinyBound, 1000) +PL_TEST_INTERVAL (tanh, TinyBound, BoringBound, 100000) +PL_TEST_INTERVAL (tanh, -TinyBound, -BoringBound, 100000) +PL_TEST_INTERVAL (tanh, BoringBound, inf, 1000) +PL_TEST_INTERVAL (tanh, -BoringBound, -inf, 1000) diff --git a/pl/math/test/testcases/directed/tanh.tst b/pl/math/test/testcases/directed/tanh.tst new file mode 100644 index 0000000..4a02c55 --- /dev/null +++ b/pl/math/test/testcases/directed/tanh.tst @@ -0,0 +1,18 @@ +; tanh.tst +; +; Copyright 1999-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=tanh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=tanh op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=tanh op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=tanh op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=tanh op1=7ff00000.00000000 result=3ff00000.00000000 errno=0 +func=tanh op1=fff00000.00000000 result=bff00000.00000000 errno=0 +func=tanh op1=00000000.00000000 result=00000000.00000000 errno=0 +func=tanh op1=80000000.00000000 result=80000000.00000000 errno=0 +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=tanh op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux +func=tanh op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux diff --git a/pl/math/v_tanh_3u.c b/pl/math/v_tanh_3u.c new file mode 100644 index 0000000..c8b6c25 --- /dev/null +++ b/pl/math/v_tanh_3u.c @@ -0,0 +1,94 @@ +/* + * Double-precision vector tanh(x) function. + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "estrin.h" +#include "mathlib.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define AbsMask v_u64 (0x7fffffffffffffff) +#define InvLn2 v_f64 (0x1.71547652b82fep0) +#define MLn2hi v_f64 (-0x1.62e42fefa39efp-1) +#define MLn2lo v_f64 (-0x1.abc9e3b39803fp-56) +#define Shift v_f64 (0x1.8p52) +#define C(i) v_f64 (__expm1_poly[i]) + +#define BoringBound 0x403241bf835f9d5f /* asuint64 (0x1.241bf835f9d5fp+4). */ +#define TinyBound 0x3e40000000000000 /* asuint64 (0x1p-27). */ +#define One v_u64 (0x3ff0000000000000) + +static inline v_f64_t +expm1_inline (v_f64_t x) +{ + /* Helper routine for calculating exp(x) - 1. Vector port of the helper from + the scalar variant of tanh. 
*/ + + /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ + v_f64_t j = v_fma_f64 (InvLn2, x, Shift) - Shift; + v_s64_t i = v_to_s64_f64 (j); + v_f64_t f = v_fma_f64 (j, MLn2hi, x); + f = v_fma_f64 (j, MLn2lo, f); + + /* Approximate expm1(f) using polynomial. */ + v_f64_t f2 = f * f; + v_f64_t f4 = f2 * f2; + v_f64_t p = v_fma_f64 (f2, ESTRIN_10 (f, f2, f4, f4 * f4, C), f); + + /* t = 2 ^ i. */ + v_f64_t t = v_as_f64_u64 (v_as_u64_s64 (i << 52) + One); + /* expm1(x) = p * t + (t - 1). */ + return v_fma_f64 (p, t, t - 1); +} + +static NOINLINE v_f64_t +special_case (v_f64_t x, v_f64_t y, v_u64_t special) +{ + return v_call_f64 (tanh, x, y, special); +} + +/* Vector approximation for double-precision tanh(x), using a simplified + version of expm1. The greatest observed error is 2.75 ULP: + __v_tanh(-0x1.c143c3a44e087p-3) got -0x1.ba31ba4691ab7p-3 + want -0x1.ba31ba4691ab4p-3. */ +VPCS_ATTR v_f64_t V_NAME (tanh) (v_f64_t x) +{ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t ia = ix & AbsMask; + + /* Trigger special-cases for tiny, boring and infinity/NaN. */ + v_u64_t special = v_cond_u64 ((ia - TinyBound) > (BoringBound - TinyBound)); + v_f64_t u; + + /* To trigger fp exceptions correctly, set special lanes to a neutral value. + They will be fixed up later by the special-case handler. */ + if (unlikely (v_any_u64 (special))) + u = v_sel_f64 (special, v_f64 (1), x) * 2; + else + u = x * 2; + + /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ + v_f64_t q = expm1_inline (u); + v_f64_t y = q / (q + 2); + + if (unlikely (v_any_u64 (special))) + return special_case (x, y, special); + return y; +} +VPCS_ALIAS + +PL_SIG (V, D, 1, tanh, -10.0, 10.0) +PL_TEST_ULP (V_NAME (tanh), 2.26) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (tanh)) +PL_TEST_INTERVAL (V_NAME (tanh), 0, TinyBound, 1000) +PL_TEST_INTERVAL (V_NAME (tanh), -0, -TinyBound, 1000) +PL_TEST_INTERVAL (V_NAME (tanh), TinyBound, BoringBound, 100000) +PL_TEST_INTERVAL (V_NAME (tanh), -TinyBound, -BoringBound, 100000) +PL_TEST_INTERVAL (V_NAME (tanh), BoringBound, inf, 1000) +PL_TEST_INTERVAL (V_NAME (tanh), -BoringBound, -inf, 1000) +#endif diff --git a/pl/math/vn_tanh_3u.c b/pl/math/vn_tanh_3u.c new file mode 100644 index 0000000..cb2746c --- /dev/null +++ b/pl/math/vn_tanh_3u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_tanh. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_tanh, _ZGVnN2v_tanh) +#include "v_tanh_3u.c" +#endif -- cgit v1.2.3 From 0866a19c42fd560ea8985f829b17c4b93b6a6f1b Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Thu, 5 Jan 2023 10:30:01 +0000 Subject: pl/math: Add vector/Neon acoshf New routine uses inlined log1pf helper, and is accurate to 3.1 ULP (2.8 ULP if fp exceptions are enabled). 
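The identity behind the routine, sketched with libm's log1pf in place
of the inlined helper:

#include <math.h>

/* For x >= 1, acosh(x) = log(x + sqrt(x^2 - 1))
   = log1p((x - 1) + sqrt((x - 1) * (x + 1))).
   The second form stays accurate as x -> 1: it avoids the cancellation
   in x^2 - 1 and hands log1p a small argument instead of taking log of
   a value barely above 1.  Special cases (x < 1, huge x) omitted.  */
static float
acoshf_sketch (float x)
{
  float xm1 = x - 1.0f;
  return log1pf (xm1 + sqrtf (xm1 * (x + 1.0f)));
}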
--- pl/math/include/mathlib.h | 4 +++ pl/math/s_acoshf_3u1.c | 6 +++++ pl/math/v_acoshf_3u1.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++ pl/math/vn_acoshf_3u1.c | 12 +++++++++ 4 files changed, 90 insertions(+) create mode 100644 pl/math/s_acoshf_3u1.c create mode 100644 pl/math/v_acoshf_3u1.c create mode 100644 pl/math/vn_acoshf_3u1.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 43c5ebc..dc2cef8 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -39,6 +39,7 @@ double log1p (double); double sinh (double); double tanh (double); +float __s_acoshf (float); float __s_asinhf (float); float __s_atanf (float); float __s_atan2f (float, float); @@ -82,6 +83,7 @@ typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; #endif /* Vector functions following the base PCS. */ +__f32x4_t __v_acoshf (__f32x4_t); __f32x4_t __v_asinhf (__f32x4_t); __f64x2_t __v_asinh (__f64x2_t); __f32x4_t __v_atanf (__f32x4_t); @@ -116,6 +118,7 @@ __f64x2_t __v_tanh (__f64x2_t); #define __vpcs __attribute__((__aarch64_vector_pcs__)) /* Vector functions following the vector PCS. */ +__vpcs __f32x4_t __vn_acoshf (__f32x4_t); __vpcs __f32x4_t __vn_asinhf (__f32x4_t); __vpcs __f64x2_t __vn_asinh (__f64x2_t); __vpcs __f32x4_t __vn_atanf (__f32x4_t); @@ -147,6 +150,7 @@ __vpcs __f32x4_t __vn_tanhf (__f32x4_t); __vpcs __f64x2_t __vn_tanh (__f64x2_t); /* Vector functions following the vector PCS using ABI names. */ +__vpcs __f32x4_t _ZGVnN4v_acoshf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_asinhf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_asinh (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t); diff --git a/pl/math/s_acoshf_3u1.c b/pl/math/s_acoshf_3u1.c new file mode 100644 index 0000000..3740666 --- /dev/null +++ b/pl/math/s_acoshf_3u1.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_acoshf_3u1.c" diff --git a/pl/math/v_acoshf_3u1.c b/pl/math/v_acoshf_3u1.c new file mode 100644 index 0000000..2b5aff5 --- /dev/null +++ b/pl/math/v_acoshf_3u1.c @@ -0,0 +1,68 @@ +/* + * Single-precision vector acosh(x) function. + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define SignMask 0x80000000 +#define One 0x3f800000 +#define SquareLim 0x5f800000 /* asuint(0x1p64). */ + +#if V_SUPPORTED + +#include "v_log1pf_inline.h" + +static NOINLINE VPCS_ATTR v_f32_t +special_case (v_f32_t x, v_f32_t y, v_u32_t special) +{ + return v_call_f32 (acoshf, x, y, special); +} + +/* Vector approximation for single-precision acosh, based on log1p. Maximum + error depends on WANT_SIMD_EXCEPT. With SIMD fp exceptions enabled, it + is 2.78 ULP: + __v_acoshf(0x1.07887p+0) got 0x1.ef9e9cp-3 + want 0x1.ef9ea2p-3. + With exceptions disabled, we can compute u with a shorter dependency chain, + which gives maximum error of 3.07 ULP: + __v_acoshf(0x1.01f83ep+0) got 0x1.fbc7fap-4 + want 0x1.fbc7f4p-4. */ + +VPCS_ATTR v_f32_t V_NAME (acoshf) (v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t special = v_cond_u32 ((ix - One) >= (SquareLim - One)); + +#if WANT_SIMD_EXCEPT + /* Mask special lanes with 1 to side-step spurious invalid or overflow. Use + only xm1 to calculate u, as operating on x will trigger invalid for NaN. 
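+ With the mask applied, special lanes compute the harmless dummy value
+ u = 1 * 1 + 2 * 1 = 3; their results are overwritten by the scalar
+ fallback in special_case.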
*/
+ v_f32_t xm1 = v_sel_f32 (special, v_f32 (1), x - 1);
+ v_f32_t u = v_fma_f32 (xm1, xm1, 2 * xm1);
+#else
+ v_f32_t xm1 = x - 1;
+ v_f32_t u = xm1 * (x + 1.0f);
+#endif
+ v_f32_t y = log1pf_inline (xm1 + v_sqrt_f32 (u));
+
+ if (unlikely (v_any_u32 (special)))
+ return special_case (x, y, special);
+ return y;
+}
+VPCS_ALIAS
+
+PL_SIG (V, F, 1, acosh, 1.0, 10.0)
+#if WANT_SIMD_EXCEPT
+PL_TEST_ULP (V_NAME (acoshf), 2.29)
+#else
+PL_TEST_ULP (V_NAME (acoshf), 2.58)
+#endif
+PL_TEST_EXPECT_FENV (V_NAME (acoshf), WANT_SIMD_EXCEPT)
+PL_TEST_INTERVAL (V_NAME (acoshf), 0, 1, 500)
+PL_TEST_INTERVAL (V_NAME (acoshf), 1, SquareLim, 100000)
+PL_TEST_INTERVAL (V_NAME (acoshf), SquareLim, inf, 1000)
+PL_TEST_INTERVAL (V_NAME (acoshf), -0, -inf, 1000)
+#endif
diff --git a/pl/math/vn_acoshf_3u1.c b/pl/math/vn_acoshf_3u1.c
new file mode 100644
index 0000000..8c5f106
--- /dev/null
+++ b/pl/math/vn_acoshf_3u1.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_acoshf.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_acoshf, _ZGVnN4v_acoshf)
+#include "v_acoshf_3u1.c"
+#endif
-- 
cgit v1.2.3

From 49d283240e503a412ae85cd5b60f323797f7b897 Mon Sep 17 00:00:00 2001
From: Joe Ramsay
Date: Thu, 5 Jan 2023 10:30:07 +0000
Subject: pl/math: Add vector/Neon acosh

New routine is based on a vector implementation of log1p, which has been
reused (with some modification for improved accuracy close to 0) from Neon
atanh. Accurate to 3.5 ULP.
---
 pl/math/include/mathlib.h | 4 +++
 pl/math/s_acosh_3u5.c | 6 ++++
 pl/math/v_acosh_3u5.c | 51 +++++++++++++++++++++++++++++
 pl/math/v_atanh_3u5.c | 47 ++---------------------------
 pl/math/v_log1p_inline.h | 77 +++++++++++++++++++++++++++++++++++++++++++++++
 pl/math/vn_acosh_3u5.c | 12 ++++++++
 6 files changed, 152 insertions(+), 45 deletions(-)
 create mode 100644 pl/math/s_acosh_3u5.c
 create mode 100644 pl/math/v_acosh_3u5.c
 create mode 100644 pl/math/v_log1p_inline.h
 create mode 100644 pl/math/vn_acosh_3u5.c

diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h
index dc2cef8..64e34d5 100644
--- a/pl/math/include/mathlib.h
+++ b/pl/math/include/mathlib.h
@@ -56,6 +56,7 @@ float __s_sinhf (float);
float __s_tanf (float);
float __s_tanhf (float);
+double __s_acosh (double);
double __s_asinh (double);
double __s_atan (double);
double __s_atan2 (double, double);
@@ -84,6 +85,7 @@ typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t;
/* Vector functions following the base PCS. */
__f32x4_t __v_acoshf (__f32x4_t);
+__f64x2_t __v_acosh (__f64x2_t);
__f32x4_t __v_asinhf (__f32x4_t);
__f64x2_t __v_asinh (__f64x2_t);
__f32x4_t __v_atanf (__f32x4_t);
@@ -119,6 +121,7 @@ __f64x2_t __v_tanh (__f64x2_t);
/* Vector functions following the vector PCS. */
__vpcs __f32x4_t __vn_acoshf (__f32x4_t);
+__vpcs __f64x2_t __vn_acosh (__f64x2_t);
__vpcs __f32x4_t __vn_asinhf (__f32x4_t);
__vpcs __f64x2_t __vn_asinh (__f64x2_t);
__vpcs __f32x4_t __vn_atanf (__f32x4_t);
@@ -151,6 +154,7 @@ __vpcs __f64x2_t __vn_tanh (__f64x2_t);
/* Vector functions following the vector PCS using ABI names.
*/
__vpcs __f32x4_t _ZGVnN4v_acoshf (__f32x4_t);
+__vpcs __f64x2_t _ZGVnN2v_acosh (__f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_asinhf (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2v_asinh (__f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t);
diff --git a/pl/math/s_acosh_3u5.c b/pl/math/s_acosh_3u5.c
new file mode 100644
index 0000000..f62cbd6
--- /dev/null
+++ b/pl/math/s_acosh_3u5.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_acosh_3u5.c"
diff --git a/pl/math/v_acosh_3u5.c b/pl/math/v_acosh_3u5.c
new file mode 100644
index 0000000..22f69d7
--- /dev/null
+++ b/pl/math/v_acosh_3u5.c
@@ -0,0 +1,51 @@
+/*
+ * Double-precision vector acosh(x) function.
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define WANT_V_LOG1P_K0_SHORTCUT 1
+#include "v_log1p_inline.h"
+
+#define BigBoundTop 0x5fe /* top12 (asuint64 (0x1p511)). */
+
+#if V_SUPPORTED
+
+static NOINLINE VPCS_ATTR v_f64_t
+special_case (v_f64_t x)
+{
+ return v_call_f64 (acosh, x, x, v_u64 (-1));
+}
+
+/* Vector approximation for double-precision acosh, based on log1p.
+ The largest observed error is 3.02 ULP in the region where the
+ argument to log1p falls in the k=0 interval, i.e. x close to 1:
+ __v_acosh(0x1.00798aaf80739p+0) got 0x1.f2d6d823bc9dfp-5
+ want 0x1.f2d6d823bc9e2p-5. */
+VPCS_ATTR v_f64_t V_NAME (acosh) (v_f64_t x)
+{
+ v_u64_t itop = v_as_u64_f64 (x) >> 52;
+ v_u64_t special = v_cond_u64 ((itop - OneTop) >= (BigBoundTop - OneTop));
+
+ /* Fall back to scalar routine for all lanes if any of them are special. */
+ if (unlikely (v_any_u64 (special)))
+ return special_case (x);
+
+ v_f64_t xm1 = x - 1;
+ v_f64_t u = xm1 * (x + 1);
+ return log1p_inline (xm1 + v_sqrt_f64 (u));
+}
+VPCS_ALIAS
+
+PL_SIG (V, D, 1, acosh, 1.0, 10.0)
+PL_TEST_ULP (V_NAME (acosh), 2.53)
+PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (acosh))
+PL_TEST_INTERVAL (V_NAME (acosh), 1, 0x1p511, 90000)
+PL_TEST_INTERVAL (V_NAME (acosh), 0x1p511, inf, 10000)
+PL_TEST_INTERVAL (V_NAME (acosh), 0, 1, 1000)
+PL_TEST_INTERVAL (V_NAME (acosh), -0, -inf, 10000)
+#endif
diff --git a/pl/math/v_atanh_3u5.c b/pl/math/v_atanh_3u5.c
index ca68020..ffd6f59 100644
--- a/pl/math/v_atanh_3u5.c
+++ b/pl/math/v_atanh_3u5.c
@@ -12,56 +12,13 @@
#if V_SUPPORTED
-#define Ln2Hi v_f64 (0x1.62e42fefa3800p-1)
-#define Ln2Lo v_f64 (0x1.ef35793c76730p-45)
-#define HfRt2Top 0x3fe6a09e00000000 /* top32(asuint64(sqrt(2)/2)) << 32. */
-#define OneMHfRt2Top \
- 0x00095f6200000000 /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) \
 << 32. */
-#define OneTop12 0x3ff
-#define BottomMask 0xffffffff
+#define WANT_V_LOG1P_K0_SHORTCUT 0
+#include "v_log1p_inline.h"
#define AbsMask 0x7fffffffffffffff
#define Half 0x3fe0000000000000
#define One 0x3ff0000000000000
-#define C(i) v_f64 (__log1p_data.coeffs[i])
-
-static inline v_f64_t
-log1p_inline (v_f64_t x)
-{
- /* Helper for calculating log(1 + x) using order-18 polynomial on a reduced
- interval. Copied from v_log1p_2u5.c, with the following modifications:
- - No special-case handling.
- - Pairwise Horner instead of Estrin for improved accuracy.
- - Slightly different recombination to reuse f2.
- See original source for details of the algorithm. */
- v_f64_t m = x + 1;
- v_u64_t mi = v_as_u64_f64 (m);
-
- /* Decompose x + 1 into (f + 1) * 2^k, with k chosen such that f is in
- [sqrt(2)/2, sqrt(2)].
*/
- v_u64_t u = mi + OneMHfRt2Top;
- v_s64_t ki = v_as_s64_u64 (u >> 52) - OneTop12;
- v_f64_t k = v_to_f64_s64 (ki);
- v_u64_t utop = (u & 0x000fffff00000000) + HfRt2Top;
- v_u64_t u_red = utop | (mi & BottomMask);
- v_f64_t f = v_as_f64_u64 (u_red) - 1;
-
- /* Correction term for round-off in f. */
- v_f64_t cm = (x - (m - 1)) / m;
-
- /* Approximate log1p(f) with polynomial. */
- v_f64_t f2 = f * f;
- v_f64_t p = PAIRWISE_HORNER_18 (f, f2, C);
-
- /* Recombine log1p(x) = k*log2 + log1p(f) + c/m. */
- v_f64_t ylo = v_fma_f64 (k, Ln2Lo, cm);
- v_f64_t yhi = v_fma_f64 (k, Ln2Hi, f);
- v_f64_t y = v_fma_f64 (f2, p, ylo + yhi);
- return y;
-}
-
VPCS_ATTR
NOINLINE
static v_f64_t
specialcase (v_f64_t x, v_f64_t y, v_u64_t special)
diff --git a/pl/math/v_log1p_inline.h b/pl/math/v_log1p_inline.h
new file mode 100644
index 0000000..e5c7339
--- /dev/null
+++ b/pl/math/v_log1p_inline.h
@@ -0,0 +1,77 @@
+/*
+ * Helper for vector double-precision routines which calculate log(1 + x) and do
+ * not need special-case handling
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#ifndef PL_MATH_V_LOG1P_INLINE_H
+#define PL_MATH_V_LOG1P_INLINE_H
+
+#include "v_math.h"
+#include "pairwise_horner.h"
+
+#define Ln2Hi v_f64 (0x1.62e42fefa3800p-1)
+#define Ln2Lo v_f64 (0x1.ef35793c76730p-45)
+#define HfRt2Top 0x3fe6a09e00000000 /* top32(asuint64(sqrt(2)/2)) << 32. */
+#define OneMHfRt2Top \
+ 0x00095f6200000000 /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) \
 << 32. */
+#define OneTop 0x3ff
+#define BottomMask 0xffffffff
+#define BigBoundTop 0x5fe /* top12 (asuint64 (0x1p511)). */
+
+#define C(i) v_f64 (__log1p_data.coeffs[i])
+
+static inline v_f64_t
+log1p_inline (v_f64_t x)
+{
+ /* Helper for calculating log(x + 1). Copied from v_log1p_2u5.c, with several
+ modifications:
+ - No special-case handling - this should be dealt with by the caller.
+ - Pairwise Horner polynomial evaluation for improved accuracy.
+ - Optionally simulate the shortcut for k=0, used in the scalar routine,
+ using v_sel, for improved accuracy when the argument to log1p is close to
+ 0. This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1 in
+ the source of the caller before including this file.
+ See v_log1p_2u5.c for details of the algorithm. */
+ v_f64_t m = x + 1;
+ v_u64_t mi = v_as_u64_f64 (m);
+ v_u64_t u = mi + OneMHfRt2Top;
+
+ v_s64_t ki = v_as_s64_u64 (u >> 52) - OneTop;
+ v_f64_t k = v_to_f64_s64 (ki);
+
+ /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */
+ v_u64_t utop = (u & 0x000fffff00000000) + HfRt2Top;
+ v_u64_t u_red = utop | (mi & BottomMask);
+ v_f64_t f = v_as_f64_u64 (u_red) - 1;
+
+ /* Correction term c/m. */
+ v_f64_t cm = (x - (m - 1)) / m;
+
+#ifndef WANT_V_LOG1P_K0_SHORTCUT
+#error \
+"Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
+#elif WANT_V_LOG1P_K0_SHORTCUT
+ /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is
+ that the approximation is solely the polynomial. */
+ v_u64_t k0 = k == 0;
+ if (unlikely (v_any_u64 (k0)))
+ {
+ cm = v_sel_f64 (k0, v_f64 (0), cm);
+ f = v_sel_f64 (k0, x, f);
+ }
+#endif
+
+ /* Approximate log1p(f) on the reduced input using a polynomial. */
+ v_f64_t f2 = f * f;
+ v_f64_t p = PAIRWISE_HORNER_18 (f, f2, C);
+
+ /* Assemble log1p(x) = k * log2 + log1p(f) + c/m.
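+ Ln2Hi + Ln2Lo sum to log(2): the dominant terms k * Ln2Hi and f are
+ combined first (yhi), the small terms k * Ln2Lo and c/m separately
+ (ylo), which keeps the rounding error of the recombination low.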
*/ + v_f64_t ylo = v_fma_f64 (k, Ln2Lo, cm); + v_f64_t yhi = v_fma_f64 (k, Ln2Hi, f); + return v_fma_f64 (f2, p, ylo + yhi); +} + +#endif // PL_MATH_V_LOG1P_INLINE_H diff --git a/pl/math/vn_acosh_3u5.c b/pl/math/vn_acosh_3u5.c new file mode 100644 index 0000000..649735b --- /dev/null +++ b/pl/math/vn_acosh_3u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_acosh. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_acosh, _ZGVnN2v_acosh) +#include "v_acosh_3u5.c" +#endif -- cgit v1.2.3 From 0f87f607b976820ef41fe64d004fe67dc7af8236 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Thu, 5 Jan 2023 11:56:20 +0000 Subject: Rewrite two abs masks as literals These were technically undefined behaviour - they have been rewritten without the shift so that their type is unsigned int by default. --- math/logf.c | 2 +- pl/math/log10f.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/math/logf.c b/math/logf.c index ea378d6..a1cd2d7 100644 --- a/math/logf.c +++ b/math/logf.c @@ -57,7 +57,7 @@ logf (float x) tmp = ix - OFF; i = (tmp >> (23 - LOGF_TABLE_BITS)) % N; k = (int32_t) tmp >> 23; /* arithmetic shift */ - iz = ix - (tmp & 0x1ff << 23); + iz = ix - (tmp & 0xff800000); invc = T[i].invc; logc = T[i].logc; z = (double_t) asfloat (iz); diff --git a/pl/math/log10f.c b/pl/math/log10f.c index 32de42f..5813982 100644 --- a/pl/math/log10f.c +++ b/pl/math/log10f.c @@ -67,7 +67,7 @@ log10f (float x) tmp = ix - OFF; i = (tmp >> (23 - LOGF_TABLE_BITS)) % N; k = (int32_t) tmp >> 23; /* arithmetic shift. */ - iz = ix - (tmp & 0x1ff << 23); + iz = ix - (tmp & 0xff800000); invc = T[i].invc; logc = T[i].logc; z = (double_t) asfloat (iz); -- cgit v1.2.3 From f0f80b8a19b2593491847ed87456694d789f6f80 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Fri, 6 Jan 2023 09:10:57 +0000 Subject: pl/math: Update copyright years All files in pl/math updated to 2023. 
--- pl/math/Dir.mk | 2 +- pl/math/acosh_3u.c | 3 ++- pl/math/acoshf_2u8.c | 3 ++- pl/math/asinh_2u5.c | 2 +- pl/math/asinh_data.c | 2 +- pl/math/asinhf_3u5.c | 3 ++- pl/math/asinhf_data.c | 3 ++- pl/math/atan2_2u5.c | 2 +- pl/math/atan2f_3u.c | 2 +- pl/math/atan_2u5.c | 2 +- pl/math/atan_common.h | 2 +- pl/math/atan_data.c | 2 +- pl/math/atanf_2u9.c | 2 +- pl/math/atanf_common.h | 2 +- pl/math/atanf_data.c | 2 +- pl/math/atanh_3u.c | 3 ++- pl/math/atanhf_3u1.c | 3 ++- pl/math/cbrt_2u.c | 2 +- pl/math/cbrt_data.c | 2 +- pl/math/cbrtf_1u5.c | 2 +- pl/math/cbrtf_data.c | 2 +- pl/math/cosh_2u.c | 2 +- pl/math/coshf_1u9.c | 2 +- pl/math/erfc_4u5.c | 2 +- pl/math/erfc_data.c | 2 +- pl/math/erfcf.h | 2 +- pl/math/erfcf_2u.c | 2 +- pl/math/erfcf_data.c | 2 +- pl/math/erff_1u5.c | 2 +- pl/math/erff_data.c | 2 +- pl/math/estrin.h | 2 +- pl/math/estrin_wrap.h | 2 +- pl/math/estrinf.h | 2 +- pl/math/exp.c | 2 +- pl/math/exp_data.c | 2 +- pl/math/expf.c | 2 +- pl/math/expf_data.c | 2 +- pl/math/expm1_2u5.c | 2 +- pl/math/expm1_data.c | 2 +- pl/math/expm1f_1u6.c | 2 +- pl/math/expm1f_data.c | 2 +- pl/math/horner.h | 2 +- pl/math/horner_wrap.h | 2 +- pl/math/hornerf.h | 2 +- pl/math/include/mathlib.h | 2 +- pl/math/include/pl_test.h | 2 +- pl/math/log.c | 2 +- pl/math/log10_2u.c | 2 +- pl/math/log10_data.c | 2 +- pl/math/log10f.c | 2 +- pl/math/log1p_2u.c | 3 ++- pl/math/log1p_data.c | 3 ++- pl/math/log1pf_2u1.c | 3 ++- pl/math/log1pf_data.c | 2 +- pl/math/log_data.c | 2 +- pl/math/logf.c | 2 +- pl/math/logf_data.c | 2 +- pl/math/math_config.h | 2 +- pl/math/math_err.c | 2 +- pl/math/math_errf.c | 2 +- pl/math/pairwise_horner.h | 2 +- pl/math/pairwise_horner_wrap.h | 2 +- pl/math/pairwise_hornerf.h | 2 +- pl/math/pl_sig.h | 2 +- pl/math/s_asinh_3u5.c | 2 +- pl/math/s_asinhf_2u7.c | 2 +- pl/math/s_atan2_3u.c | 2 +- pl/math/s_atan2f_3u.c | 2 +- pl/math/s_atan_2u5.c | 2 +- pl/math/s_atanf_3u.c | 2 +- pl/math/s_atanh_3u5.c | 2 +- pl/math/s_atanhf_3u1.c | 2 +- pl/math/s_cbrt_2u.c | 2 +- pl/math/s_cbrtf_1u5.c | 2 +- pl/math/s_cosh_2u.c | 2 +- pl/math/s_coshf_2u4.c | 2 +- pl/math/s_erf_2u.c | 2 +- pl/math/s_erfc_4u.c | 2 +- pl/math/s_erfcf_1u.c | 2 +- pl/math/s_erff_1u5.c | 2 +- pl/math/s_exp_tail.c | 2 +- pl/math/s_expf.c | 2 +- pl/math/s_expm1_2u5.c | 2 +- pl/math/s_expm1f_1u6.c | 2 +- pl/math/s_log10_2u5.c | 2 +- pl/math/s_log10f_3u5.c | 2 +- pl/math/s_log1p_2u5.c | 2 +- pl/math/s_log1pf_2u1.c | 2 +- pl/math/s_log2_3u.c | 2 +- pl/math/s_log2f_2u5.c | 2 +- pl/math/s_sinh_3u.c | 2 +- pl/math/s_sinhf_2u3.c | 2 +- pl/math/s_tanf_3u2.c | 2 +- pl/math/s_tanhf_2u6.c | 2 +- pl/math/sinh_3u.c | 2 +- pl/math/sinhf_2u3.c | 2 +- pl/math/sv_atan2_2u5.c | 2 +- pl/math/sv_atan2f_3u.c | 2 +- pl/math/sv_atan_2u5.c | 2 +- pl/math/sv_atan_common.h | 2 +- pl/math/sv_atanf_2u9.c | 2 +- pl/math/sv_atanf_common.h | 2 +- pl/math/sv_cos_2u5.c | 2 +- pl/math/sv_cosf_2u1.c | 2 +- pl/math/sv_erf_3u.c | 2 +- pl/math/sv_erfc_4u.c | 2 +- pl/math/sv_erff_1u3.c | 2 +- pl/math/sv_exp_tail.h | 2 +- pl/math/sv_expf_2u.c | 2 +- pl/math/sv_expf_data.c | 2 +- pl/math/sv_log10_2u5.c | 2 +- pl/math/sv_log10f_3u5.c | 2 +- pl/math/sv_log2_3u.c | 2 +- pl/math/sv_log2f_2u5.c | 2 +- pl/math/sv_log_2u5.c | 2 +- pl/math/sv_log_data.c | 2 +- pl/math/sv_logf_3u4.c | 2 +- pl/math/sv_logf_data.c | 2 +- pl/math/sv_math.h | 2 +- pl/math/sv_powi.c | 2 +- pl/math/sv_powif.c | 2 +- pl/math/sv_sin_3u.c | 2 +- pl/math/sv_sinf_1u9.c | 2 +- pl/math/sv_sinf_poly_data.c | 2 +- pl/math/sv_tanf_3u2.c | 2 +- pl/math/tanf_3u3.c | 2 +- pl/math/tanf_data.c | 2 +- 
pl/math/tanhf_2u6.c | 2 +- pl/math/test/mathbench_funcs.h | 2 +- pl/math/test/mathbench_wrappers.h | 2 +- pl/math/test/pl_test.h | 2 +- pl/math/test/runulp.sh | 2 +- pl/math/test/testcases/directed/acosh.tst | 2 +- pl/math/test/testcases/directed/acoshf.tst | 2 +- pl/math/test/testcases/directed/asinh.tst | 2 +- pl/math/test/testcases/directed/asinhf.tst | 2 +- pl/math/test/testcases/directed/atan.tst | 2 +- pl/math/test/testcases/directed/atan2.tst | 2 +- pl/math/test/testcases/directed/atan2f.tst | 2 +- pl/math/test/testcases/directed/atanf.tst | 2 +- pl/math/test/testcases/directed/atanh.tst | 2 +- pl/math/test/testcases/directed/atanhf.tst | 2 +- pl/math/test/testcases/directed/cbrtf.tst | 2 +- pl/math/test/testcases/directed/cosh.tst | 2 +- pl/math/test/testcases/directed/coshf.tst | 2 +- pl/math/test/testcases/directed/erfc.tst | 2 +- pl/math/test/testcases/directed/erfcf.tst | 2 +- pl/math/test/testcases/directed/erff.tst | 2 +- pl/math/test/testcases/directed/expm1.tst | 2 +- pl/math/test/testcases/directed/expm1f.tst | 2 +- pl/math/test/testcases/directed/log10.tst | 2 +- pl/math/test/testcases/directed/log10f.tst | 2 +- pl/math/test/testcases/directed/log1p.tst | 2 +- pl/math/test/testcases/directed/log1pf.tst | 2 +- pl/math/test/testcases/directed/log2.tst | 2 +- pl/math/test/testcases/directed/log2f.tst | 2 +- pl/math/test/testcases/directed/sinh.tst | 2 +- pl/math/test/testcases/directed/sinhf.tst | 2 +- pl/math/test/testcases/directed/tanf.tst | 2 +- pl/math/test/testcases/directed/tanhf.tst | 2 +- pl/math/test/testcases/random/double.tst | 2 +- pl/math/test/testcases/random/float.tst | 2 +- pl/math/test/ulp_funcs.h | 2 +- pl/math/test/ulp_wrappers.h | 2 +- pl/math/tools/asinh.sollya | 2 +- pl/math/tools/asinhf.sollya | 2 +- pl/math/tools/atan.sollya | 2 +- pl/math/tools/atanf.sollya | 2 +- pl/math/tools/cbrt.sollya | 2 +- pl/math/tools/cbrtf.sollya | 2 +- pl/math/tools/erfc.sollya | 2 +- pl/math/tools/erfcf.sollya | 2 +- pl/math/tools/expm1.sollya | 2 +- pl/math/tools/expm1f.sollya | 2 +- pl/math/tools/log10.sollya | 2 +- pl/math/tools/log10f.sollya | 2 +- pl/math/tools/log1p.sollya | 2 +- pl/math/tools/log1pf.sollya | 2 +- pl/math/tools/tanf.sollya | 2 +- pl/math/tools/v_erf.sollya | 2 +- pl/math/tools/v_erfc.sollya | 2 +- pl/math/tools/v_log10.sollya | 2 +- pl/math/tools/v_log10f.sollya | 2 +- pl/math/tools/v_log2f.sollya | 2 +- pl/math/v_asinh_3u5.c | 3 ++- pl/math/v_asinhf_2u7.c | 3 ++- pl/math/v_atan2_3u.c | 2 +- pl/math/v_atan2f_3u.c | 2 +- pl/math/v_atan_2u5.c | 2 +- pl/math/v_atanf_3u.c | 2 +- pl/math/v_atanh_3u5.c | 2 +- pl/math/v_atanhf_3u1.c | 3 ++- pl/math/v_cbrt_2u.c | 3 ++- pl/math/v_cbrtf_1u5.c | 3 ++- pl/math/v_cosh_2u.c | 3 ++- pl/math/v_coshf_2u4.c | 3 ++- pl/math/v_erf_2u.c | 2 +- pl/math/v_erf_data.c | 2 +- pl/math/v_erfc_4u.c | 2 +- pl/math/v_erfc_data.c | 2 +- pl/math/v_erfcf_1u.c | 2 +- pl/math/v_erff_1u5.c | 2 +- pl/math/v_erff_data.c | 2 +- pl/math/v_exp_tail.c | 2 +- pl/math/v_exp_tail.h | 2 +- pl/math/v_exp_tail_data.c | 2 +- pl/math/v_expf.c | 2 +- pl/math/v_expm1_2u5.c | 2 +- pl/math/v_expm1f_1u6.c | 2 +- pl/math/v_expm1f_inline.h | 2 +- pl/math/v_log10_2u5.c | 2 +- pl/math/v_log10_data.c | 2 +- pl/math/v_log10f_3u5.c | 2 +- pl/math/v_log10f_data.c | 2 +- pl/math/v_log1p_2u5.c | 3 ++- pl/math/v_log1pf_2u1.c | 3 ++- pl/math/v_log1pf_inline.h | 2 +- pl/math/v_log2_3u.c | 2 +- pl/math/v_log2_data.c | 2 +- pl/math/v_log2f_2u5.c | 2 +- pl/math/v_log2f_data.c | 2 +- pl/math/v_math.h | 2 +- pl/math/v_sinh_3u.c | 3 ++- pl/math/v_sinhf_2u3.c | 3 ++- 
pl/math/v_tanf_3u2.c | 2 +- pl/math/v_tanhf_2u6.c | 3 ++- pl/math/vn_asinh_3u5.c | 2 +- pl/math/vn_asinhf_2u7.c | 2 +- pl/math/vn_atan2_3u.c | 2 +- pl/math/vn_atan2f_3u.c | 2 +- pl/math/vn_atan_2u5.c | 2 +- pl/math/vn_atanf_3u.c | 2 +- pl/math/vn_atanh_3u5.c | 2 +- pl/math/vn_atanhf_3u1.c | 2 +- pl/math/vn_cbrt_2u.c | 2 +- pl/math/vn_cbrtf_1u5.c | 2 +- pl/math/vn_cosh_2u.c | 2 +- pl/math/vn_coshf_2u4.c | 2 +- pl/math/vn_erf_2u.c | 2 +- pl/math/vn_erfc_4u.c | 2 +- pl/math/vn_erfcf_1u.c | 2 +- pl/math/vn_erff_1u5.c | 2 +- pl/math/vn_exp_tail.c | 2 +- pl/math/vn_expf.c | 2 +- pl/math/vn_expm1_2u5.c | 2 +- pl/math/vn_expm1f_1u6.c | 2 +- pl/math/vn_log10_2u5.c | 2 +- pl/math/vn_log10f_3u5.c | 2 +- pl/math/vn_log1p_2u5.c | 2 +- pl/math/vn_log1pf_2u1.c | 2 +- pl/math/vn_log2_3u.c | 2 +- pl/math/vn_log2f_2u5.c | 2 +- pl/math/vn_sinh_3u.c | 2 +- pl/math/vn_sinhf_2u3.c | 2 +- pl/math/vn_tanf_3u2.c | 2 +- pl/math/vn_tanhf_2u6.c | 2 +- 256 files changed, 277 insertions(+), 256 deletions(-) diff --git a/pl/math/Dir.mk b/pl/math/Dir.mk index 60814f8..be65344 100644 --- a/pl/math/Dir.mk +++ b/pl/math/Dir.mk @@ -1,6 +1,6 @@ # Makefile fragment - requires GNU make # -# Copyright (c) 2019-2022, Arm Limited. +# Copyright (c) 2019-2023, Arm Limited. # SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception PLM := $(srcdir)/pl/math diff --git a/pl/math/acosh_3u.c b/pl/math/acosh_3u.c index d2c195f..4e2cb67 100644 --- a/pl/math/acosh_3u.c +++ b/pl/math/acosh_3u.c @@ -1,6 +1,7 @@ /* * Double-precision acosh(x) function. - * Copyright (c) 2022, Arm Limited. + * + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/acoshf_2u8.c b/pl/math/acoshf_2u8.c index bd9c561..c9cded7 100644 --- a/pl/math/acoshf_2u8.c +++ b/pl/math/acoshf_2u8.c @@ -1,6 +1,7 @@ /* * Single-precision acosh(x) function. - * Copyright (c) 2022, Arm Limited. + * + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/asinh_2u5.c b/pl/math/asinh_2u5.c index 064d81e..f167955 100644 --- a/pl/math/asinh_2u5.c +++ b/pl/math/asinh_2u5.c @@ -1,7 +1,7 @@ /* * Double-precision asinh(x) function * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "estrin.h" diff --git a/pl/math/asinh_data.c b/pl/math/asinh_data.c index 319c572..073b197 100644 --- a/pl/math/asinh_data.c +++ b/pl/math/asinh_data.c @@ -1,7 +1,7 @@ /* * Double-precision polynomial coefficients for scalar asinh(x) * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/asinhf_3u5.c b/pl/math/asinhf_3u5.c index 2429e82..2b2c55d 100644 --- a/pl/math/asinhf_3u5.c +++ b/pl/math/asinhf_3u5.c @@ -1,6 +1,7 @@ /* * Single-precision asinh(x) function. - * Copyright (c) 2022, Arm Limited. + * + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/asinhf_data.c b/pl/math/asinhf_data.c index ce9b632..cd1ef16 100644 --- a/pl/math/asinhf_data.c +++ b/pl/math/asinhf_data.c @@ -1,6 +1,7 @@ /* * Coefficients for single-precision asinh(x) function. - * Copyright (c) 2022, Arm Limited. + * + * Copyright (c) 2022-2023, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/atan2_2u5.c b/pl/math/atan2_2u5.c index ba39d9a..c909ac9 100644 --- a/pl/math/atan2_2u5.c +++ b/pl/math/atan2_2u5.c @@ -1,7 +1,7 @@ /* * Double-precision scalar atan2(x) function. * - * Copyright (c) 2021-2022, Arm Limited. + * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/atan2f_3u.c b/pl/math/atan2f_3u.c index e84ea0b..38e1df5 100644 --- a/pl/math/atan2f_3u.c +++ b/pl/math/atan2f_3u.c @@ -1,7 +1,7 @@ /* * Single-precision scalar atan2(x) function. * - * Copyright (c) 2021-2022, Arm Limited. + * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/atan_2u5.c b/pl/math/atan_2u5.c index 99fea0f..ee47701 100644 --- a/pl/math/atan_2u5.c +++ b/pl/math/atan_2u5.c @@ -1,7 +1,7 @@ /* * Double-precision atan(x) function. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/atan_common.h b/pl/math/atan_common.h index 331c1bb..da0da64 100644 --- a/pl/math/atan_common.h +++ b/pl/math/atan_common.h @@ -2,7 +2,7 @@ * Double-precision polynomial evaluation function for scalar and vector atan(x) * and atan2(y,x). * - * Copyright (c) 2021-2022, Arm Limited. + * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/atan_data.c b/pl/math/atan_data.c index fa34d11..91d0f61 100644 --- a/pl/math/atan_data.c +++ b/pl/math/atan_data.c @@ -1,7 +1,7 @@ /* * Double-precision polynomial coefficients for vector atan(x) and atan2(y,x). * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/atanf_2u9.c b/pl/math/atanf_2u9.c index d7071be..9d17f25 100644 --- a/pl/math/atanf_2u9.c +++ b/pl/math/atanf_2u9.c @@ -1,7 +1,7 @@ /* * Single-precision atan(x) function. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/atanf_common.h b/pl/math/atanf_common.h index 3038e54..37ca76d 100644 --- a/pl/math/atanf_common.h +++ b/pl/math/atanf_common.h @@ -2,7 +2,7 @@ * Single-precision polynomial evaluation function for scalar and vector * atan(x) and atan2(y,x). * - * Copyright (c) 2021-2022, Arm Limited. + * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/atanf_data.c b/pl/math/atanf_data.c index 8ea952a..c4cba23 100644 --- a/pl/math/atanf_data.c +++ b/pl/math/atanf_data.c @@ -1,7 +1,7 @@ /* * Single-precision polynomial coefficients for vector atan(x) and atan2(y,x). * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/atanh_3u.c b/pl/math/atanh_3u.c index b72326c..a168cd5 100644 --- a/pl/math/atanh_3u.c +++ b/pl/math/atanh_3u.c @@ -1,6 +1,7 @@ /* * Double-precision atanh(x) function. - * Copyright (c) 2022, Arm Limited. + * + * Copyright (c) 2022-2023, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/atanhf_3u1.c b/pl/math/atanhf_3u1.c index c7f80b0..fb90aa2 100644 --- a/pl/math/atanhf_3u1.c +++ b/pl/math/atanhf_3u1.c @@ -1,6 +1,7 @@ /* * Single-precision atanh(x) function. - * Copyright (c) 2022, Arm Limited. + * + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/cbrt_2u.c b/pl/math/cbrt_2u.c index f89dd87..83715dd 100644 --- a/pl/math/cbrt_2u.c +++ b/pl/math/cbrt_2u.c @@ -1,7 +1,7 @@ /* * Double-precision cbrt(x) function. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/cbrt_data.c b/pl/math/cbrt_data.c index 1c6ca73..3d484c2 100644 --- a/pl/math/cbrt_data.c +++ b/pl/math/cbrt_data.c @@ -1,7 +1,7 @@ /* * Coefficients and table entries for double-precision cbrt(x). * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/cbrtf_1u5.c b/pl/math/cbrtf_1u5.c index 86a6088..adc5917 100644 --- a/pl/math/cbrtf_1u5.c +++ b/pl/math/cbrtf_1u5.c @@ -1,7 +1,7 @@ /* * Single-precision cbrt(x) function. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/cbrtf_data.c b/pl/math/cbrtf_data.c index 386a2b4..c6cdb4d 100644 --- a/pl/math/cbrtf_data.c +++ b/pl/math/cbrtf_data.c @@ -1,7 +1,7 @@ /* * Coefficients and table entries for single-precision cbrt(x). * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/cosh_2u.c b/pl/math/cosh_2u.c index 5ec3b77..5d1df07 100644 --- a/pl/math/cosh_2u.c +++ b/pl/math/cosh_2u.c @@ -1,7 +1,7 @@ /* * Double-precision cosh(x) function. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/coshf_1u9.c b/pl/math/coshf_1u9.c index 2f93f1c..c125c92 100644 --- a/pl/math/coshf_1u9.c +++ b/pl/math/coshf_1u9.c @@ -1,7 +1,7 @@ /* * Single-precision cosh(x) function. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/erfc_4u5.c b/pl/math/erfc_4u5.c index 6d4a29a..e9af9d3 100644 --- a/pl/math/erfc_4u5.c +++ b/pl/math/erfc_4u5.c @@ -1,7 +1,7 @@ /* * Double-precision erfc(x) function. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/erfc_data.c b/pl/math/erfc_data.c index 02b7db1..fa7184f 100644 --- a/pl/math/erfc_data.c +++ b/pl/math/erfc_data.c @@ -1,7 +1,7 @@ /* * Data used in double-precision erfc(x) function. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/erfcf.h b/pl/math/erfcf.h index 98ead38..8f1e5f4 100644 --- a/pl/math/erfcf.h +++ b/pl/math/erfcf.h @@ -1,7 +1,7 @@ /* * Shared functions for scalar and vector single-precision erfc(x) functions. * - * Copyright (c) 2021-2022, Arm Limited. + * Copyright (c) 2021-2023, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/erfcf_2u.c b/pl/math/erfcf_2u.c index 7a55000..5a3f9b0 100644 --- a/pl/math/erfcf_2u.c +++ b/pl/math/erfcf_2u.c @@ -1,7 +1,7 @@ /* * Single-precision erfc(x) function. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/erfcf_data.c b/pl/math/erfcf_data.c index 34fe033..2e018c8 100644 --- a/pl/math/erfcf_data.c +++ b/pl/math/erfcf_data.c @@ -1,7 +1,7 @@ /* * Data used in single-precision erfc(x) function. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/erff_1u5.c b/pl/math/erff_1u5.c index 3d8cfee..1a69872 100644 --- a/pl/math/erff_1u5.c +++ b/pl/math/erff_1u5.c @@ -1,7 +1,7 @@ /* * Single-precision erf(x) function. * - * Copyright (c) 2020-2022, Arm Limited. + * Copyright (c) 2020-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "estrinf.h" diff --git a/pl/math/erff_data.c b/pl/math/erff_data.c index eeb0b20..2352bae 100644 --- a/pl/math/erff_data.c +++ b/pl/math/erff_data.c @@ -1,7 +1,7 @@ /* * Data for approximation of erff. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/estrin.h b/pl/math/estrin.h index 89df329..f967fb0 100644 --- a/pl/math/estrin.h +++ b/pl/math/estrin.h @@ -1,7 +1,7 @@ /* * Helper macros for double-precision Estrin polynomial evaluation. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/estrin_wrap.h b/pl/math/estrin_wrap.h index 93af2ab..2ae0700 100644 --- a/pl/math/estrin_wrap.h +++ b/pl/math/estrin_wrap.h @@ -1,7 +1,7 @@ /* * Helper macros for double-precision Estrin polynomial evaluation. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/estrinf.h b/pl/math/estrinf.h index be52ab5..175233c 100644 --- a/pl/math/estrinf.h +++ b/pl/math/estrinf.h @@ -1,7 +1,7 @@ /* * Helper macros for single-precision Estrin polynomial evaluation. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/exp.c b/pl/math/exp.c index f95c46f..90253b6 100644 --- a/pl/math/exp.c +++ b/pl/math/exp.c @@ -1,7 +1,7 @@ /* * Double-precision e^x function. * - * Copyright (c) 2018-2019, Arm Limited. + * Copyright (c) 2018-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/exp_data.c b/pl/math/exp_data.c index 714c845..2354be7 100644 --- a/pl/math/exp_data.c +++ b/pl/math/exp_data.c @@ -1,7 +1,7 @@ /* * Shared data between exp, exp2 and pow. * - * Copyright (c) 2018, Arm Limited. + * Copyright (c) 2018-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/expf.c b/pl/math/expf.c index fa03b05..c325e45 100644 --- a/pl/math/expf.c +++ b/pl/math/expf.c @@ -1,7 +1,7 @@ /* * Single-precision e^x function. * - * Copyright (c) 2017-2022, Arm Limited. + * Copyright (c) 2017-2023, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/expf_data.c b/pl/math/expf_data.c index 1525fcc..474ad57 100644 --- a/pl/math/expf_data.c +++ b/pl/math/expf_data.c @@ -2,7 +2,7 @@ * Coeffs and table entries for single-precision exp. Copied from * math/exp2f_data.c, with EXP2F_TABLE_BITS == 32. * - * Copyright (c) 2017-2022, Arm Limited. + * Copyright (c) 2017-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/expm1_2u5.c b/pl/math/expm1_2u5.c index 60a556e..a3faff7 100644 --- a/pl/math/expm1_2u5.c +++ b/pl/math/expm1_2u5.c @@ -1,7 +1,7 @@ /* * Double-precision e^x - 1 function. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/expm1_data.c b/pl/math/expm1_data.c index 93aaa47..ff7426b 100644 --- a/pl/math/expm1_data.c +++ b/pl/math/expm1_data.c @@ -1,7 +1,7 @@ /* * Coefficients for double-precision e^x - 1 function. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/expm1f_1u6.c b/pl/math/expm1f_1u6.c index 5138865..70b14e4 100644 --- a/pl/math/expm1f_1u6.c +++ b/pl/math/expm1f_1u6.c @@ -1,7 +1,7 @@ /* * Single-precision e^x - 1 function. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/expm1f_data.c b/pl/math/expm1f_data.c index fc0bd41..9d02dc4 100644 --- a/pl/math/expm1f_data.c +++ b/pl/math/expm1f_data.c @@ -1,7 +1,7 @@ /* * Coefficients for single-precision e^x - 1 function. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/horner.h b/pl/math/horner.h index 4dbc122..f92ab67 100644 --- a/pl/math/horner.h +++ b/pl/math/horner.h @@ -1,7 +1,7 @@ /* * Helper macros for single-precision Horner polynomial evaluation. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/horner_wrap.h b/pl/math/horner_wrap.h index 892d63b..a254b2d 100644 --- a/pl/math/horner_wrap.h +++ b/pl/math/horner_wrap.h @@ -1,7 +1,7 @@ /* * Helper macros for Horner polynomial evaluation. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/hornerf.h b/pl/math/hornerf.h index bec1593..0703817 100644 --- a/pl/math/hornerf.h +++ b/pl/math/hornerf.h @@ -1,7 +1,7 @@ /* * Helper macros for double-precision Horner polynomial evaluation. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 64e34d5..74c3b34 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -2,7 +2,7 @@ /* * Public API. * - * Copyright (c) 2015-2022, Arm Limited. + * Copyright (c) 2015-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/include/pl_test.h b/pl/math/include/pl_test.h index 30d39c1..6a81360 100644 --- a/pl/math/include/pl_test.h +++ b/pl/math/include/pl_test.h @@ -3,7 +3,7 @@ * routine, not the tests. 
Separate definitions are found in test/pl_test.h * which emit test parameters. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception. */ diff --git a/pl/math/log.c b/pl/math/log.c index 418c715..40b0441 100644 --- a/pl/math/log.c +++ b/pl/math/log.c @@ -1,7 +1,7 @@ /* * Double-precision log(x) function. * - * Copyright (c) 2018-2022, Arm Limited. + * Copyright (c) 2018-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/log10_2u.c b/pl/math/log10_2u.c index 81f73a8..74828ea 100644 --- a/pl/math/log10_2u.c +++ b/pl/math/log10_2u.c @@ -1,7 +1,7 @@ /* * Double-precision log10(x) function. * - * Copyright (c) 2020-2022, Arm Limited. + * Copyright (c) 2020-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/log10_data.c b/pl/math/log10_data.c index e02e9b1..9976f19 100644 --- a/pl/math/log10_data.c +++ b/pl/math/log10_data.c @@ -1,7 +1,7 @@ /* * Data for log10. * - * Copyright (c) 2020-2022, Arm Limited. + * Copyright (c) 2020-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/log10f.c b/pl/math/log10f.c index 5813982..5c80008 100644 --- a/pl/math/log10f.c +++ b/pl/math/log10f.c @@ -1,7 +1,7 @@ /* * Single-precision log10 function. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/log1p_2u.c b/pl/math/log1p_2u.c index 519df42..23c8ed4 100644 --- a/pl/math/log1p_2u.c +++ b/pl/math/log1p_2u.c @@ -1,6 +1,7 @@ /* * Double-precision log(1+x) function. - * Copyright (c) 2022, Arm Limited. + * + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/log1p_data.c b/pl/math/log1p_data.c index 9380d13..6168a0c 100644 --- a/pl/math/log1p_data.c +++ b/pl/math/log1p_data.c @@ -1,6 +1,7 @@ /* * Data used in double-precision log(1+x) function. - * Copyright (c) 2022, Arm Limited. + * + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/log1pf_2u1.c b/pl/math/log1pf_2u1.c index cb1d4bc..fcfd05a 100644 --- a/pl/math/log1pf_2u1.c +++ b/pl/math/log1pf_2u1.c @@ -1,6 +1,7 @@ /* * Single-precision log(1+x) function. - * Copyright (c) 2022, Arm Limited. + * + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/log1pf_data.c b/pl/math/log1pf_data.c index d7bc95c..8c92d57 100644 --- a/pl/math/log1pf_data.c +++ b/pl/math/log1pf_data.c @@ -1,7 +1,7 @@ /* * Data used in single-precision log1p(x) function. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/pl/math/log_data.c b/pl/math/log_data.c index ef10d33..34715e5 100644 --- a/pl/math/log_data.c +++ b/pl/math/log_data.c @@ -1,7 +1,7 @@ /* * Data for log. * - * Copyright (c) 2018-2022, Arm Limited. + * Copyright (c) 2018-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/logf.c b/pl/math/logf.c index 2962ee7..17a74ed 100644 --- a/pl/math/logf.c +++ b/pl/math/logf.c @@ -1,7 +1,7 @@ /* * Single-precision log function. * - * Copyright (c) 2017-2019, Arm Limited. 
+ * Copyright (c) 2017-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/logf_data.c b/pl/math/logf_data.c index 279a265..97d9eb8 100644 --- a/pl/math/logf_data.c +++ b/pl/math/logf_data.c @@ -1,7 +1,7 @@ /* * Data definition for logf and log10f. * - * Copyright (c) 2017-2022, Arm Limited. + * Copyright (c) 2017-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/math_config.h b/pl/math/math_config.h index 92ccebf..03b4ad4 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -1,7 +1,7 @@ /* * Configuration for math routines. * - * Copyright (c) 2017-2022, Arm Limited. + * Copyright (c) 2017-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/math_err.c b/pl/math/math_err.c index fb98361..d246a89 100644 --- a/pl/math/math_err.c +++ b/pl/math/math_err.c @@ -1,7 +1,7 @@ /* * Double-precision math error handling. * - * Copyright (c) 2018-2022, Arm Limited. + * Copyright (c) 2018-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/math_errf.c b/pl/math/math_errf.c index 5b4945f..96271ff 100644 --- a/pl/math/math_errf.c +++ b/pl/math/math_errf.c @@ -1,7 +1,7 @@ /* * Single-precision math error handling. * - * Copyright (c) 2017-2022, Arm Limited. + * Copyright (c) 2017-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/pairwise_horner.h b/pl/math/pairwise_horner.h index bee7592..6ad98dc 100644 --- a/pl/math/pairwise_horner.h +++ b/pl/math/pairwise_horner.h @@ -1,7 +1,7 @@ /* * Helper macros for double-precision pairwise Horner polynomial evaluation. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/pairwise_horner_wrap.h b/pl/math/pairwise_horner_wrap.h index e75a491..b6efb6f 100644 --- a/pl/math/pairwise_horner_wrap.h +++ b/pl/math/pairwise_horner_wrap.h @@ -1,7 +1,7 @@ /* * Helper macros for pairwise Horner polynomial evaluation. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/pairwise_hornerf.h b/pl/math/pairwise_hornerf.h index a8aa4d1..784750c 100644 --- a/pl/math/pairwise_hornerf.h +++ b/pl/math/pairwise_hornerf.h @@ -1,7 +1,7 @@ /* * Helper macros for single-precision pairwise Horner polynomial evaluation. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/pl_sig.h b/pl/math/pl_sig.h index e9f54c0..686d24f 100644 --- a/pl/math/pl_sig.h +++ b/pl/math/pl_sig.h @@ -1,7 +1,7 @@ /* * PL macros for emitting various ulp/bench entries based on function signature * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception. */ #define PL_DECL_SF1(fun) float fun##f (float); diff --git a/pl/math/s_asinh_3u5.c b/pl/math/s_asinh_3u5.c index d767100..ab8fbd9 100644 --- a/pl/math/s_asinh_3u5.c +++ b/pl/math/s_asinh_3u5.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 diff --git a/pl/math/s_asinhf_2u7.c b/pl/math/s_asinhf_2u7.c index bce86a7..13e1a5f 100644 --- a/pl/math/s_asinhf_2u7.c +++ b/pl/math/s_asinhf_2u7.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 diff --git a/pl/math/s_atan2_3u.c b/pl/math/s_atan2_3u.c index 5955e3c..4603e5f 100644 --- a/pl/math/s_atan2_3u.c +++ b/pl/math/s_atan2_3u.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, Arm Limited. + * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 diff --git a/pl/math/s_atan2f_3u.c b/pl/math/s_atan2f_3u.c index 5002d32..894d843 100644 --- a/pl/math/s_atan2f_3u.c +++ b/pl/math/s_atan2f_3u.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, Arm Limited. + * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 diff --git a/pl/math/s_atan_2u5.c b/pl/math/s_atan_2u5.c index b6b746a..4b61bc4 100644 --- a/pl/math/s_atan_2u5.c +++ b/pl/math/s_atan_2u5.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, Arm Limited. + * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 diff --git a/pl/math/s_atanf_3u.c b/pl/math/s_atanf_3u.c index 4e8a2f7..6b65719 100644 --- a/pl/math/s_atanf_3u.c +++ b/pl/math/s_atanf_3u.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, Arm Limited. + * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 diff --git a/pl/math/s_atanh_3u5.c b/pl/math/s_atanh_3u5.c index 11877c6..f6a5f75 100644 --- a/pl/math/s_atanh_3u5.c +++ b/pl/math/s_atanh_3u5.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 diff --git a/pl/math/s_atanhf_3u1.c b/pl/math/s_atanhf_3u1.c index 9f75962..e7e5c61 100644 --- a/pl/math/s_atanhf_3u1.c +++ b/pl/math/s_atanhf_3u1.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 diff --git a/pl/math/s_cbrt_2u.c b/pl/math/s_cbrt_2u.c index 22f726b..435e74a 100644 --- a/pl/math/s_cbrt_2u.c +++ b/pl/math/s_cbrt_2u.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 diff --git a/pl/math/s_cbrtf_1u5.c b/pl/math/s_cbrtf_1u5.c index d60508e..5c79370 100644 --- a/pl/math/s_cbrtf_1u5.c +++ b/pl/math/s_cbrtf_1u5.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 diff --git a/pl/math/s_cosh_2u.c b/pl/math/s_cosh_2u.c index f9c681c..cdf352c 100644 --- a/pl/math/s_cosh_2u.c +++ b/pl/math/s_cosh_2u.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 diff --git a/pl/math/s_coshf_2u4.c b/pl/math/s_coshf_2u4.c index 1b7091b..8f7d5da 100644 --- a/pl/math/s_coshf_2u4.c +++ b/pl/math/s_coshf_2u4.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 diff --git a/pl/math/s_erf_2u.c b/pl/math/s_erf_2u.c index e5c25e0..839535c 100644 --- a/pl/math/s_erf_2u.c +++ b/pl/math/s_erf_2u.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 diff --git a/pl/math/s_erfc_4u.c b/pl/math/s_erfc_4u.c index 6d80574..bf9e3e6 100644 --- a/pl/math/s_erfc_4u.c +++ b/pl/math/s_erfc_4u.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 diff --git a/pl/math/s_erfcf_1u.c b/pl/math/s_erfcf_1u.c index 615db16..024d224 100644 --- a/pl/math/s_erfcf_1u.c +++ b/pl/math/s_erfcf_1u.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 diff --git a/pl/math/s_erff_1u5.c b/pl/math/s_erff_1u5.c index f6817eb..a5b9bf9 100644 --- a/pl/math/s_erff_1u5.c +++ b/pl/math/s_erff_1u5.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 diff --git a/pl/math/s_exp_tail.c b/pl/math/s_exp_tail.c index 4db47bb..20b1b41 100644 --- a/pl/math/s_exp_tail.c +++ b/pl/math/s_exp_tail.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 diff --git a/pl/math/s_expf.c b/pl/math/s_expf.c index dacda7f..557a2e3 100644 --- a/pl/math/s_expf.c +++ b/pl/math/s_expf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 diff --git a/pl/math/s_expm1_2u5.c b/pl/math/s_expm1_2u5.c index 00827da..da2d6e7 100644 --- a/pl/math/s_expm1_2u5.c +++ b/pl/math/s_expm1_2u5.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 diff --git a/pl/math/s_expm1f_1u6.c b/pl/math/s_expm1f_1u6.c index 83385df..eea8089 100644 --- a/pl/math/s_expm1f_1u6.c +++ b/pl/math/s_expm1f_1u6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 diff --git a/pl/math/s_log10_2u5.c b/pl/math/s_log10_2u5.c index ad7f50b..2480e5a 100644 --- a/pl/math/s_log10_2u5.c +++ b/pl/math/s_log10_2u5.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 diff --git a/pl/math/s_log10f_3u5.c b/pl/math/s_log10f_3u5.c index dc804b6..173e0fd 100644 --- a/pl/math/s_log10f_3u5.c +++ b/pl/math/s_log10f_3u5.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Arm Limited. 
+ * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 diff --git a/pl/math/s_log1p_2u5.c b/pl/math/s_log1p_2u5.c index 1d96025..20b395a 100644 --- a/pl/math/s_log1p_2u5.c +++ b/pl/math/s_log1p_2u5.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 diff --git a/pl/math/s_log1pf_2u1.c b/pl/math/s_log1pf_2u1.c index fe01b05..013ec4c 100644 --- a/pl/math/s_log1pf_2u1.c +++ b/pl/math/s_log1pf_2u1.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 diff --git a/pl/math/s_log2_3u.c b/pl/math/s_log2_3u.c index 913c825..d46f3f9 100644 --- a/pl/math/s_log2_3u.c +++ b/pl/math/s_log2_3u.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 diff --git a/pl/math/s_log2f_2u5.c b/pl/math/s_log2f_2u5.c index 7077814..e76c67d 100644 --- a/pl/math/s_log2f_2u5.c +++ b/pl/math/s_log2f_2u5.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 diff --git a/pl/math/s_sinh_3u.c b/pl/math/s_sinh_3u.c index 2c08fa1..27e5e65 100644 --- a/pl/math/s_sinh_3u.c +++ b/pl/math/s_sinh_3u.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 diff --git a/pl/math/s_sinhf_2u3.c b/pl/math/s_sinhf_2u3.c index ac6a269..607f942 100644 --- a/pl/math/s_sinhf_2u3.c +++ b/pl/math/s_sinhf_2u3.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 diff --git a/pl/math/s_tanf_3u2.c b/pl/math/s_tanf_3u2.c index a47a7c0..b5ddf94 100644 --- a/pl/math/s_tanf_3u2.c +++ b/pl/math/s_tanf_3u2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 diff --git a/pl/math/s_tanhf_2u6.c b/pl/math/s_tanhf_2u6.c index bbb4569..896fc62 100644 --- a/pl/math/s_tanhf_2u6.c +++ b/pl/math/s_tanhf_2u6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 diff --git a/pl/math/sinh_3u.c b/pl/math/sinh_3u.c index 52ca156..f534815 100644 --- a/pl/math/sinh_3u.c +++ b/pl/math/sinh_3u.c @@ -1,7 +1,7 @@ /* * Double-precision sinh(x) function. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/sinhf_2u3.c b/pl/math/sinhf_2u3.c index 38f59b0..de94428 100644 --- a/pl/math/sinhf_2u3.c +++ b/pl/math/sinhf_2u3.c @@ -1,7 +1,7 @@ /* * Single-precision sinh(x) function. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/sv_atan2_2u5.c b/pl/math/sv_atan2_2u5.c index b230b36..a4bea1d 100644 --- a/pl/math/sv_atan2_2u5.c +++ b/pl/math/sv_atan2_2u5.c @@ -1,7 +1,7 @@ /* * Double-precision vector atan2(x) function. * - * Copyright (c) 2021-2022, Arm Limited. + * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/sv_atan2f_3u.c b/pl/math/sv_atan2f_3u.c index 5e9d59b..f7674c4 100644 --- a/pl/math/sv_atan2f_3u.c +++ b/pl/math/sv_atan2f_3u.c @@ -1,7 +1,7 @@ /* * Single-precision vector atan2f(x) function. * - * Copyright (c) 2021-2022, Arm Limited. + * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/sv_atan_2u5.c b/pl/math/sv_atan_2u5.c index 16430a2..02ac331 100644 --- a/pl/math/sv_atan_2u5.c +++ b/pl/math/sv_atan_2u5.c @@ -1,7 +1,7 @@ /* * Double-precision vector atan(x) function. * - * Copyright (c) 2021-2022, Arm Limited. + * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/sv_atan_common.h b/pl/math/sv_atan_common.h index 53cdbc7..bfe6998 100644 --- a/pl/math/sv_atan_common.h +++ b/pl/math/sv_atan_common.h @@ -2,7 +2,7 @@ * Double-precision polynomial evaluation function for SVE atan(x) and * atan2(y,x). * - * Copyright (c) 2021-2022, Arm Limited. + * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/sv_atanf_2u9.c b/pl/math/sv_atanf_2u9.c index 41f99e5..8d38e42 100644 --- a/pl/math/sv_atanf_2u9.c +++ b/pl/math/sv_atanf_2u9.c @@ -1,7 +1,7 @@ /* * Single-precision vector atan(x) function. * - * Copyright (c) 2021-2022, Arm Limited. + * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/sv_atanf_common.h b/pl/math/sv_atanf_common.h index 869a257..dc45eff 100644 --- a/pl/math/sv_atanf_common.h +++ b/pl/math/sv_atanf_common.h @@ -2,7 +2,7 @@ * Single-precision polynomial evaluation function for SVE atan(x) and * atan2(y,x). * - * Copyright (c) 2021-2022, Arm Limited. + * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/sv_cos_2u5.c b/pl/math/sv_cos_2u5.c index a06ab9a..1940348 100644 --- a/pl/math/sv_cos_2u5.c +++ b/pl/math/sv_cos_2u5.c @@ -1,7 +1,7 @@ /* * Double-precision SVE cos(x) function. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/sv_cosf_2u1.c b/pl/math/sv_cosf_2u1.c index b8ec846..8f138bc 100644 --- a/pl/math/sv_cosf_2u1.c +++ b/pl/math/sv_cosf_2u1.c @@ -1,7 +1,7 @@ /* * Single-precision SVE cos(x) function. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/sv_erf_3u.c b/pl/math/sv_erf_3u.c index c860e1a..bec7f8a 100644 --- a/pl/math/sv_erf_3u.c +++ b/pl/math/sv_erf_3u.c @@ -1,7 +1,7 @@ /* * Double-precision SVE erf(x) function. * - * Copyright (c) 2020-2022, Arm Limited. + * Copyright (c) 2020-2023, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/sv_erfc_4u.c b/pl/math/sv_erfc_4u.c index 5b2fc18..076b471 100644 --- a/pl/math/sv_erfc_4u.c +++ b/pl/math/sv_erfc_4u.c @@ -1,7 +1,7 @@ /* * Double-precision SVE erfc(x) function. * - * Copyright (c) 2021-2022, Arm Limited. + * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/sv_erff_1u3.c b/pl/math/sv_erff_1u3.c index fb1bef8..c7a738c 100644 --- a/pl/math/sv_erff_1u3.c +++ b/pl/math/sv_erff_1u3.c @@ -1,7 +1,7 @@ /* * Single-precision vector erf(x) function. * - * Copyright (c) 2020-2022, Arm Limited. + * Copyright (c) 2020-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/sv_exp_tail.h b/pl/math/sv_exp_tail.h index 846fe97..9b739da 100644 --- a/pl/math/sv_exp_tail.h +++ b/pl/math/sv_exp_tail.h @@ -1,7 +1,7 @@ /* * Double-precision SVE e^(x+tail) function. * - * Copyright (c) 2021-2022, Arm Limited. + * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/sv_expf_2u.c b/pl/math/sv_expf_2u.c index 30a6c62..87fbe45 100644 --- a/pl/math/sv_expf_2u.c +++ b/pl/math/sv_expf_2u.c @@ -1,7 +1,7 @@ /* * Single-precision vector e^x function. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/sv_expf_data.c b/pl/math/sv_expf_data.c index 22c8b2f..6875adf 100644 --- a/pl/math/sv_expf_data.c +++ b/pl/math/sv_expf_data.c @@ -1,7 +1,7 @@ /* * Coefficients for single-precision vector e^x function. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/sv_log10_2u5.c b/pl/math/sv_log10_2u5.c index 770b964..884e201 100644 --- a/pl/math/sv_log10_2u5.c +++ b/pl/math/sv_log10_2u5.c @@ -1,7 +1,7 @@ /* * Double-precision SVE log10(x) function. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/sv_log10f_3u5.c b/pl/math/sv_log10f_3u5.c index 06c0908..e7b1e98 100644 --- a/pl/math/sv_log10f_3u5.c +++ b/pl/math/sv_log10f_3u5.c @@ -1,7 +1,7 @@ /* * Single-precision SVE log10 function. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/sv_log2_3u.c b/pl/math/sv_log2_3u.c index d66a474..a0815bb 100644 --- a/pl/math/sv_log2_3u.c +++ b/pl/math/sv_log2_3u.c @@ -1,7 +1,7 @@ /* * Double-precision SVE log2 function. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/sv_log2f_2u5.c b/pl/math/sv_log2f_2u5.c index 6488658..fe2ab16 100644 --- a/pl/math/sv_log2f_2u5.c +++ b/pl/math/sv_log2f_2u5.c @@ -1,7 +1,7 @@ /* * Single-precision vector/SVE log2 function. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/sv_log_2u5.c b/pl/math/sv_log_2u5.c index 7eeb206..7f06fd3 100644 --- a/pl/math/sv_log_2u5.c +++ b/pl/math/sv_log_2u5.c @@ -1,7 +1,7 @@ /* * Double-precision SVE log(x) function. * - * Copyright (c) 2020-2022, Arm Limited. 
+ * Copyright (c) 2020-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/sv_log_data.c b/pl/math/sv_log_data.c index a544a69..77f9989 100644 --- a/pl/math/sv_log_data.c +++ b/pl/math/sv_log_data.c @@ -1,7 +1,7 @@ /* * Coefficients for double-precision SVE log(x) function. * - * Copyright (c) 2020-2022, Arm Limited. + * Copyright (c) 2020-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/sv_logf_3u4.c b/pl/math/sv_logf_3u4.c index 4ca1ead..11f0b8a 100644 --- a/pl/math/sv_logf_3u4.c +++ b/pl/math/sv_logf_3u4.c @@ -1,7 +1,7 @@ /* * Single-precision vector log function. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/sv_logf_data.c b/pl/math/sv_logf_data.c index 0082ee3..51dd7a7 100644 --- a/pl/math/sv_logf_data.c +++ b/pl/math/sv_logf_data.c @@ -1,7 +1,7 @@ /* * Coefficients for single-precision SVE log function. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/sv_math.h b/pl/math/sv_math.h index 7f06a11..5ef0ad3 100644 --- a/pl/math/sv_math.h +++ b/pl/math/sv_math.h @@ -1,7 +1,7 @@ /* * Wrapper functions for SVE ACLE. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/sv_powi.c b/pl/math/sv_powi.c index 4e653dc..1bb0eb3 100644 --- a/pl/math/sv_powi.c +++ b/pl/math/sv_powi.c @@ -1,7 +1,7 @@ /* * Double-precision SVE powi(x, n) function. * - * Copyright (c) 2020-2022, Arm Limited. + * Copyright (c) 2020-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/sv_powif.c b/pl/math/sv_powif.c index 819c318..d0567e3 100644 --- a/pl/math/sv_powif.c +++ b/pl/math/sv_powif.c @@ -1,7 +1,7 @@ /* * Single-precision SVE powi(x, n) function. * - * Copyright (c) 2020-2022, Arm Limited. + * Copyright (c) 2020-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/sv_sin_3u.c b/pl/math/sv_sin_3u.c index 9072ef4..3fee080 100644 --- a/pl/math/sv_sin_3u.c +++ b/pl/math/sv_sin_3u.c @@ -1,7 +1,7 @@ /* * Double-precision SVE sin(x) function. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/sv_sinf_1u9.c b/pl/math/sv_sinf_1u9.c index 576baea..9184ccd 100644 --- a/pl/math/sv_sinf_1u9.c +++ b/pl/math/sv_sinf_1u9.c @@ -1,7 +1,7 @@ /* * Single-precision SVE sin(x) function. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/sv_sinf_poly_data.c b/pl/math/sv_sinf_poly_data.c index 109ed58..1e1ab5e 100644 --- a/pl/math/sv_sinf_poly_data.c +++ b/pl/math/sv_sinf_poly_data.c @@ -1,7 +1,7 @@ /* * Data used in single-precision sin(x) function. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/sv_tanf_3u2.c b/pl/math/sv_tanf_3u2.c index ca5c5de..78ff480 100644 --- a/pl/math/sv_tanf_3u2.c +++ b/pl/math/sv_tanf_3u2.c @@ -1,7 +1,7 @@ /* * Single-precision vector tan(x) function. * - * Copyright (c) 2020-2022, Arm Limited. + * Copyright (c) 2020-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/tanf_3u3.c b/pl/math/tanf_3u3.c index f6673f5..0b1617c 100644 --- a/pl/math/tanf_3u3.c +++ b/pl/math/tanf_3u3.c @@ -1,7 +1,7 @@ /* * Single-precision scalar tan(x) function. * - * Copyright (c) 2021-2022, Arm Limited. + * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/pl/math/tanf_data.c b/pl/math/tanf_data.c index 386b911..242cfaa 100644 --- a/pl/math/tanf_data.c +++ b/pl/math/tanf_data.c @@ -1,7 +1,7 @@ /* * Data used in single-precision tan(x) function. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/tanhf_2u6.c b/pl/math/tanhf_2u6.c index 745e5e3..76e54a4 100644 --- a/pl/math/tanhf_2u6.c +++ b/pl/math/tanhf_2u6.c @@ -1,7 +1,7 @@ /* * Single-precision tanh(x) function. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index e3eda6f..bf820bd 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -2,7 +2,7 @@ /* * Function entries for mathbench. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/test/mathbench_wrappers.h b/pl/math/test/mathbench_wrappers.h index 7c990ba..eba960e 100644 --- a/pl/math/test/mathbench_wrappers.h +++ b/pl/math/test/mathbench_wrappers.h @@ -1,7 +1,7 @@ /* * Function wrappers for mathbench. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/test/pl_test.h b/pl/math/test/pl_test.h index 158db5f..467d1ca 100644 --- a/pl/math/test/pl_test.h +++ b/pl/math/test/pl_test.h @@ -2,7 +2,7 @@ * PL macros for emitting various details about routines for consumption by * runulp.sh. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception. */ diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh index c5902e6..4d02530 100755 --- a/pl/math/test/runulp.sh +++ b/pl/math/test/runulp.sh @@ -2,7 +2,7 @@ # ULP error check script. # -# Copyright (c) 2019-2022, Arm Limited. +# Copyright (c) 2019-2023, Arm Limited. # SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception #set -x diff --git a/pl/math/test/testcases/directed/acosh.tst b/pl/math/test/testcases/directed/acosh.tst index bbc1551..dd962bd 100644 --- a/pl/math/test/testcases/directed/acosh.tst +++ b/pl/math/test/testcases/directed/acosh.tst @@ -1,6 +1,6 @@ ; acosh.tst ; -; Copyright 2009-2022, Arm Limited. +; Copyright (c) 2009-2023, Arm Limited. 
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=acosh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/pl/math/test/testcases/directed/acoshf.tst b/pl/math/test/testcases/directed/acoshf.tst index ffa6208..606c615 100644 --- a/pl/math/test/testcases/directed/acoshf.tst +++ b/pl/math/test/testcases/directed/acoshf.tst @@ -1,6 +1,6 @@ ; acoshf.tst ; -; Copyright 2009-2022, Arm Limited. +; Copyright (c) 2009-2023, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=acoshf op1=7fc00001 result=7fc00001 errno=0 diff --git a/pl/math/test/testcases/directed/asinh.tst b/pl/math/test/testcases/directed/asinh.tst index f0d50ac..1485dfe 100644 --- a/pl/math/test/testcases/directed/asinh.tst +++ b/pl/math/test/testcases/directed/asinh.tst @@ -1,6 +1,6 @@ ; asinh.tst ; -; Copyright (c) 2022, Arm Limited. +; Copyright (c) 2022-2023, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=asinh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/pl/math/test/testcases/directed/asinhf.tst b/pl/math/test/testcases/directed/asinhf.tst index d832056..eb76a58 100644 --- a/pl/math/test/testcases/directed/asinhf.tst +++ b/pl/math/test/testcases/directed/asinhf.tst @@ -1,6 +1,6 @@ ; asinhf.tst ; -; Copyright (c) 2007-2022, Arm Limited. +; Copyright (c) 2007-2023, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=asinhf op1=7fc00001 result=7fc00001 errno=0 diff --git a/pl/math/test/testcases/directed/atan.tst b/pl/math/test/testcases/directed/atan.tst index 5716276..4c67055 100644 --- a/pl/math/test/testcases/directed/atan.tst +++ b/pl/math/test/testcases/directed/atan.tst @@ -1,6 +1,6 @@ ; atan.tst ; -; Copyright 1999-2022, Arm Limited. +; Copyright (c) 1999-2023, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=atan op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/pl/math/test/testcases/directed/atan2.tst b/pl/math/test/testcases/directed/atan2.tst index df16d41..647b376 100644 --- a/pl/math/test/testcases/directed/atan2.tst +++ b/pl/math/test/testcases/directed/atan2.tst @@ -1,6 +1,6 @@ ; atan2.tst ; -; Copyright (c) 1999-2022, Arm Limited. +; Copyright (c) 1999-2023, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=atan2 op1=7ff00000.00000001 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i diff --git a/pl/math/test/testcases/directed/atan2f.tst b/pl/math/test/testcases/directed/atan2f.tst index 708e867..85c5c5d 100644 --- a/pl/math/test/testcases/directed/atan2f.tst +++ b/pl/math/test/testcases/directed/atan2f.tst @@ -1,6 +1,6 @@ ; atan2f.tst ; -; Copyright (c) 1999-2022, Arm Limited. +; Copyright (c) 1999-2023, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=atan2f op1=7f800001 op2=7f800001 result=7fc00001 errno=0 status=i diff --git a/pl/math/test/testcases/directed/atanf.tst b/pl/math/test/testcases/directed/atanf.tst index 8661527..0a0bfc2 100644 --- a/pl/math/test/testcases/directed/atanf.tst +++ b/pl/math/test/testcases/directed/atanf.tst @@ -1,6 +1,6 @@ ; atanf.tst ; -; Copyright 2007-2022, Arm Limited. +; Copyright (c) 2007-2023, Arm Limited. 
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=atanf op1=7fc00001 result=7fc00001 errno=0 diff --git a/pl/math/test/testcases/directed/atanh.tst b/pl/math/test/testcases/directed/atanh.tst index 530df8b..d96ff32 100644 --- a/pl/math/test/testcases/directed/atanh.tst +++ b/pl/math/test/testcases/directed/atanh.tst @@ -1,6 +1,6 @@ ; atanh.tst ; -; Copyright 2009-2022, Arm Limited. +; Copyright (c) 2009-2023, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=atanh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/pl/math/test/testcases/directed/atanhf.tst b/pl/math/test/testcases/directed/atanhf.tst index 616b59d..21a68a6 100644 --- a/pl/math/test/testcases/directed/atanhf.tst +++ b/pl/math/test/testcases/directed/atanhf.tst @@ -1,6 +1,6 @@ ; atanhf.tst ; -; Copyright 2009-2022, Arm Limited. +; Copyright (c) 2009-2023, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=atanhf op1=7fc00001 result=7fc00001 errno=0 diff --git a/pl/math/test/testcases/directed/cbrtf.tst b/pl/math/test/testcases/directed/cbrtf.tst index 5f8b97f..0dd8d09 100644 --- a/pl/math/test/testcases/directed/cbrtf.tst +++ b/pl/math/test/testcases/directed/cbrtf.tst @@ -1,6 +1,6 @@ ; cbrtf.tst ; -; Copyright 2009-2022, Arm Limited. +; Copyright (c) 2009-2023, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=cbrtf op1=7f800000 result=7f800000 errno=0 diff --git a/pl/math/test/testcases/directed/cosh.tst b/pl/math/test/testcases/directed/cosh.tst index 5fdc94b..c4efacb 100644 --- a/pl/math/test/testcases/directed/cosh.tst +++ b/pl/math/test/testcases/directed/cosh.tst @@ -1,6 +1,6 @@ ; cosh.tst ; -; Copyright 1999-2022, Arm Limited. +; Copyright (c) 1999-2023, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=cosh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/pl/math/test/testcases/directed/coshf.tst b/pl/math/test/testcases/directed/coshf.tst index cdc1d8d..2b967e7 100644 --- a/pl/math/test/testcases/directed/coshf.tst +++ b/pl/math/test/testcases/directed/coshf.tst @@ -1,6 +1,6 @@ ; coshf.tst ; -; Copyright (c) 2007-2022, Arm Limited. +; Copyright (c) 2007-2023, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=coshf op1=7fc00001 result=7fc00001 errno=0 diff --git a/pl/math/test/testcases/directed/erfc.tst b/pl/math/test/testcases/directed/erfc.tst index 9ccf196..c03fc59 100644 --- a/pl/math/test/testcases/directed/erfc.tst +++ b/pl/math/test/testcases/directed/erfc.tst @@ -1,6 +1,6 @@ ; erfc.tst - Directed test cases for erfc ; -; Copyright (c) 2022, Arm Limited. +; Copyright (c) 2022-2023, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=erfc op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/pl/math/test/testcases/directed/erfcf.tst b/pl/math/test/testcases/directed/erfcf.tst index 4cea316..719bacc 100644 --- a/pl/math/test/testcases/directed/erfcf.tst +++ b/pl/math/test/testcases/directed/erfcf.tst @@ -1,6 +1,6 @@ ; erfcf.tst - Directed test cases for erfcf ; -; Copyright (c) 2007-2022, Arm Limited. +; Copyright (c) 2007-2023, Arm Limited. 
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=erfcf op1=7fc00001 result=7fc00001 errno=0 diff --git a/pl/math/test/testcases/directed/erff.tst b/pl/math/test/testcases/directed/erff.tst index 48a3d6e..9b1d3d5 100644 --- a/pl/math/test/testcases/directed/erff.tst +++ b/pl/math/test/testcases/directed/erff.tst @@ -1,6 +1,6 @@ ; erff.tst ; -; Copyright (c) 2007-2022, Arm Limited. +; Copyright (c) 2007-2023, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=erff op1=7fc00001 result=7fc00001 errno=0 diff --git a/pl/math/test/testcases/directed/expm1.tst b/pl/math/test/testcases/directed/expm1.tst index d382c18..609d6f4 100644 --- a/pl/math/test/testcases/directed/expm1.tst +++ b/pl/math/test/testcases/directed/expm1.tst @@ -1,6 +1,6 @@ ; expm1.tst ; -; Copyright 2009-2022, Arm Limited. +; Copyright (c) 2009-2023, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=expm1 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/pl/math/test/testcases/directed/expm1f.tst b/pl/math/test/testcases/directed/expm1f.tst index dcf3d06..44c3842 100644 --- a/pl/math/test/testcases/directed/expm1f.tst +++ b/pl/math/test/testcases/directed/expm1f.tst @@ -1,6 +1,6 @@ ; expm1f.tst ; -; Copyright 2009-2022, Arm Limited. +; Copyright (c) 2009-2023, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=expm1f op1=7fc00001 result=7fc00001 errno=0 diff --git a/pl/math/test/testcases/directed/log10.tst b/pl/math/test/testcases/directed/log10.tst index a8da6a7..3483143 100644 --- a/pl/math/test/testcases/directed/log10.tst +++ b/pl/math/test/testcases/directed/log10.tst @@ -1,6 +1,6 @@ ; log10.tst ; -; Copyright (c) 2007-2022, Arm Limited. +; Copyright (c) 2007-2023, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=log10 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/pl/math/test/testcases/directed/log10f.tst b/pl/math/test/testcases/directed/log10f.tst index 5fdd635..d5744a6 100644 --- a/pl/math/test/testcases/directed/log10f.tst +++ b/pl/math/test/testcases/directed/log10f.tst @@ -1,6 +1,6 @@ ; log10f.tst ; -; Copyright (c) 2007-2022, Arm Limited. +; Copyright (c) 2007-2023, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=log10f op1=7fc00001 result=7fc00001 errno=0 diff --git a/pl/math/test/testcases/directed/log1p.tst b/pl/math/test/testcases/directed/log1p.tst index 41a1896..9ee8c62 100644 --- a/pl/math/test/testcases/directed/log1p.tst +++ b/pl/math/test/testcases/directed/log1p.tst @@ -1,6 +1,6 @@ ; log1p.tst ; -; Copyright (c) 2009-2022, Arm Limited. +; Copyright (c) 2009-2023, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=log1p op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/pl/math/test/testcases/directed/log1pf.tst b/pl/math/test/testcases/directed/log1pf.tst index a543887..aaa01d6 100644 --- a/pl/math/test/testcases/directed/log1pf.tst +++ b/pl/math/test/testcases/directed/log1pf.tst @@ -1,6 +1,6 @@ ; log1pf.tst ; -; Copyright (c) 2009-2022, Arm Limited. +; Copyright (c) 2009-2023, Arm Limited. 
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=log1pf op1=7fc00001 result=7fc00001 errno=0 diff --git a/pl/math/test/testcases/directed/log2.tst b/pl/math/test/testcases/directed/log2.tst index c84ff65..5d1eb9b 100644 --- a/pl/math/test/testcases/directed/log2.tst +++ b/pl/math/test/testcases/directed/log2.tst @@ -1,6 +1,6 @@ ; Directed test cases for log2 ; -; Copyright (c) 2018-2022, Arm Limited. +; Copyright (c) 2018-2023, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=log2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/pl/math/test/testcases/directed/log2f.tst b/pl/math/test/testcases/directed/log2f.tst index 9e99c53..4e08110 100644 --- a/pl/math/test/testcases/directed/log2f.tst +++ b/pl/math/test/testcases/directed/log2f.tst @@ -1,6 +1,6 @@ ; log2f.tst - Directed test cases for log2f ; -; Copyright (c) 2017-2022, Arm Limited. +; Copyright (c) 2017-2023, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=log2f op1=7fc00001 result=7fc00001 errno=0 diff --git a/pl/math/test/testcases/directed/sinh.tst b/pl/math/test/testcases/directed/sinh.tst index d8c7d91..d6a3da8 100644 --- a/pl/math/test/testcases/directed/sinh.tst +++ b/pl/math/test/testcases/directed/sinh.tst @@ -1,6 +1,6 @@ ; sinh.tst ; -; Copyright 1999-2022, Arm Limited. +; Copyright (c) 1999-2023, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=sinh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/pl/math/test/testcases/directed/sinhf.tst b/pl/math/test/testcases/directed/sinhf.tst index 9a5ee56..5f7bd1b 100644 --- a/pl/math/test/testcases/directed/sinhf.tst +++ b/pl/math/test/testcases/directed/sinhf.tst @@ -1,6 +1,6 @@ ; sinhf.tst ; -; Copyright 2009-2022, Arm Limited. +; Copyright (c) 2009-2023, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=sinhf op1=7fc00001 result=7fc00001 errno=0 diff --git a/pl/math/test/testcases/directed/tanf.tst b/pl/math/test/testcases/directed/tanf.tst index 99aacc4..3161f70 100644 --- a/pl/math/test/testcases/directed/tanf.tst +++ b/pl/math/test/testcases/directed/tanf.tst @@ -1,6 +1,6 @@ ; tanf.tst ; -; Copyright (c) 2022, Arm Limited. +; Copyright (c) 2022-2023, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=tanf op1=7fc00001 result=7fc00001 errno=0 diff --git a/pl/math/test/testcases/directed/tanhf.tst b/pl/math/test/testcases/directed/tanhf.tst index c3edb50..603e310 100644 --- a/pl/math/test/testcases/directed/tanhf.tst +++ b/pl/math/test/testcases/directed/tanhf.tst @@ -1,6 +1,6 @@ ; tanhf.tst ; -; Copyright 2007-2022, Arm Limited. +; Copyright (c) 2007-2023, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=tanhf op1=7fc00001 result=7fc00001 errno=0 diff --git a/pl/math/test/testcases/random/double.tst b/pl/math/test/testcases/random/double.tst index 03d14d4..d83283e 100644 --- a/pl/math/test/testcases/random/double.tst +++ b/pl/math/test/testcases/random/double.tst @@ -1,6 +1,6 @@ !! double.tst - Random test case specification for DP functions !! -!! Copyright (c) 1999-2022, Arm Limited. +!! Copyright (c) 1999-2023, Arm Limited. !! 
SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception test log10 10000 diff --git a/pl/math/test/testcases/random/float.tst b/pl/math/test/testcases/random/float.tst index 68afbfb..fa77efe 100644 --- a/pl/math/test/testcases/random/float.tst +++ b/pl/math/test/testcases/random/float.tst @@ -1,6 +1,6 @@ !! float.tst - Random test case specification for SP functions !! -!! Copyright (c) 2022, Arm Limited. +!! Copyright (c) 2022-2023, Arm Limited. !! SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception test erff 10000 diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h index af1c464..5e3133e 100644 --- a/pl/math/test/ulp_funcs.h +++ b/pl/math/test/ulp_funcs.h @@ -1,7 +1,7 @@ /* * Function entries for ulp. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h index e91cbe5..b682e93 100644 --- a/pl/math/test/ulp_wrappers.h +++ b/pl/math/test/ulp_wrappers.h @@ -2,7 +2,7 @@ /* * Function wrappers for ulp. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/tools/asinh.sollya b/pl/math/tools/asinh.sollya index 6ff217f..663ee92 100644 --- a/pl/math/tools/asinh.sollya +++ b/pl/math/tools/asinh.sollya @@ -1,6 +1,6 @@ // polynomial for approximating asinh(x) // -// Copyright (c) 2022, Arm Limited. +// Copyright (c) 2022-2023, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception // Polynomial is used in [2^-26, 1]. However it is least accurate close to 1, so diff --git a/pl/math/tools/asinhf.sollya b/pl/math/tools/asinhf.sollya index cbe7d62..ab115b5 100644 --- a/pl/math/tools/asinhf.sollya +++ b/pl/math/tools/asinhf.sollya @@ -1,6 +1,6 @@ // polynomial for approximating asinh(x) // -// Copyright (c) 2022, Arm Limited. +// Copyright (c) 2022-2023, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 9; diff --git a/pl/math/tools/atan.sollya b/pl/math/tools/atan.sollya index f1f33c5..ad4f33b 100644 --- a/pl/math/tools/atan.sollya +++ b/pl/math/tools/atan.sollya @@ -1,6 +1,6 @@ // polynomial for approximating atan(x) and atan2(y, x) // -// Copyright (c) 2022, Arm Limited. +// Copyright (c) 2022-2023, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception // atan is odd, so approximate with an odd polynomial: diff --git a/pl/math/tools/atanf.sollya b/pl/math/tools/atanf.sollya index 42b8c36..ed88d0b 100644 --- a/pl/math/tools/atanf.sollya +++ b/pl/math/tools/atanf.sollya @@ -1,6 +1,6 @@ // polynomial for approximating atanf(x) // -// Copyright (c) 2022, Arm Limited. +// Copyright (c) 2022-2023, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception // Generate list of monomials: diff --git a/pl/math/tools/cbrt.sollya b/pl/math/tools/cbrt.sollya index 7f179eb..1d43dc7 100644 --- a/pl/math/tools/cbrt.sollya +++ b/pl/math/tools/cbrt.sollya @@ -1,6 +1,6 @@ // polynomial for approximating cbrt(x) in double precision // -// Copyright (c) 2022, Arm Limited. +// Copyright (c) 2022-2023, Arm Limited. 
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 3; diff --git a/pl/math/tools/cbrtf.sollya b/pl/math/tools/cbrtf.sollya index 9cd1259..4e0cc69 100644 --- a/pl/math/tools/cbrtf.sollya +++ b/pl/math/tools/cbrtf.sollya @@ -1,6 +1,6 @@ // polynomial for approximating cbrt(x) in single precision // -// Copyright (c) 2022, Arm Limited. +// Copyright (c) 2022-2023, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 3; diff --git a/pl/math/tools/erfc.sollya b/pl/math/tools/erfc.sollya index 55c1495..8c40b4b 100644 --- a/pl/math/tools/erfc.sollya +++ b/pl/math/tools/erfc.sollya @@ -1,6 +1,6 @@ // polynomial for approximating erfc(x)*exp(x*x) // -// Copyright (c) 2022, Arm Limited. +// Copyright (c) 2022-2023, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 12; // poly degree diff --git a/pl/math/tools/erfcf.sollya b/pl/math/tools/erfcf.sollya index bfb8451..69c6836 100644 --- a/pl/math/tools/erfcf.sollya +++ b/pl/math/tools/erfcf.sollya @@ -1,6 +1,6 @@ // polynomial for approximating erfc(x)*exp(x*x) // -// Copyright (c) 2022, Arm Limited. +// Copyright (c) 2022-2023, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 15; // poly degree diff --git a/pl/math/tools/expm1.sollya b/pl/math/tools/expm1.sollya index 587db46..7b6f324 100644 --- a/pl/math/tools/expm1.sollya +++ b/pl/math/tools/expm1.sollya @@ -1,6 +1,6 @@ // polynomial for approximating exp(x)-1 in double precision // -// Copyright (c) 2022, Arm Limited. +// Copyright (c) 2022-2023, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 12; diff --git a/pl/math/tools/expm1f.sollya b/pl/math/tools/expm1f.sollya index f5d769c..efdf1bd 100644 --- a/pl/math/tools/expm1f.sollya +++ b/pl/math/tools/expm1f.sollya @@ -1,6 +1,6 @@ // polynomial for approximating exp(x)-1 in single precision // -// Copyright (c) 2022, Arm Limited. +// Copyright (c) 2022-2023, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 5; diff --git a/pl/math/tools/log10.sollya b/pl/math/tools/log10.sollya index a353a20..85d1d15 100644 --- a/pl/math/tools/log10.sollya +++ b/pl/math/tools/log10.sollya @@ -1,6 +1,6 @@ // polynomial for approximating log10(1+x) // -// Copyright (c) 2019-2022, Arm Limited. +// Copyright (c) 2019-2023, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 6; // poly degree diff --git a/pl/math/tools/log10f.sollya b/pl/math/tools/log10f.sollya index 26a4a76..94bf32f 100644 --- a/pl/math/tools/log10f.sollya +++ b/pl/math/tools/log10f.sollya @@ -1,6 +1,6 @@ // polynomial for approximating log10f(1+x) // -// Copyright (c) 2019-2022, Arm Limited. +// Copyright (c) 2019-2023, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception // Computation of log10f(1+x) will be carried out in double precision diff --git a/pl/math/tools/log1p.sollya b/pl/math/tools/log1p.sollya index fb159b3..598a36a 100644 --- a/pl/math/tools/log1p.sollya +++ b/pl/math/tools/log1p.sollya @@ -1,6 +1,6 @@ // polynomial for approximating log(1+x) in double precision // -// Copyright (c) 2022, Arm Limited. +// Copyright (c) 2022-2023, Arm Limited. 
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 20; diff --git a/pl/math/tools/log1pf.sollya b/pl/math/tools/log1pf.sollya index 32b307b..cc1db10 100644 --- a/pl/math/tools/log1pf.sollya +++ b/pl/math/tools/log1pf.sollya @@ -1,6 +1,6 @@ // polynomial for approximating log(1+x) in single precision // -// Copyright (c) 2022, Arm Limited. +// Copyright (c) 2022-2023, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 10; diff --git a/pl/math/tools/tanf.sollya b/pl/math/tools/tanf.sollya index 73ca0f9..8b2306b 100644 --- a/pl/math/tools/tanf.sollya +++ b/pl/math/tools/tanf.sollya @@ -1,6 +1,6 @@ // polynomial for approximating single precision tan(x) // -// Copyright (c) 2022, Arm Limited. +// Copyright (c) 2022-2023, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception dtype = single; diff --git a/pl/math/tools/v_erf.sollya b/pl/math/tools/v_erf.sollya index c9deae9..394ba37 100644 --- a/pl/math/tools/v_erf.sollya +++ b/pl/math/tools/v_erf.sollya @@ -2,7 +2,7 @@ // To generate coefficients for interval i (0 to 47) do: // $ sollya v_erf.sollya $i // -// Copyright (c) 2022, Arm Limited. +// Copyright (c) 2022-2023, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception scale = 1/8; diff --git a/pl/math/tools/v_erfc.sollya b/pl/math/tools/v_erfc.sollya index e4e5fb1..3b03ba0 100644 --- a/pl/math/tools/v_erfc.sollya +++ b/pl/math/tools/v_erfc.sollya @@ -1,6 +1,6 @@ // polynomial for approximating erfc(x)*exp(x*x) // -// Copyright (c) 2022, Arm Limited. +// Copyright (c) 2022-2023, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 12; // poly degree diff --git a/pl/math/tools/v_log10.sollya b/pl/math/tools/v_log10.sollya index 76c1648..e2df436 100644 --- a/pl/math/tools/v_log10.sollya +++ b/pl/math/tools/v_log10.sollya @@ -1,6 +1,6 @@ // polynomial used for __v_log10(x) // -// Copyright (c) 2019-2022, Arm Limited. +// Copyright (c) 2019-2023, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 6; // poly degree diff --git a/pl/math/tools/v_log10f.sollya b/pl/math/tools/v_log10f.sollya index c24c2c9..396d5a9 100644 --- a/pl/math/tools/v_log10f.sollya +++ b/pl/math/tools/v_log10f.sollya @@ -1,6 +1,6 @@ // polynomial for approximating v_log10f(1+x) // -// Copyright (c) 2019-2022, Arm Limited. +// Copyright (c) 2019-2023, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 9; // poly degree diff --git a/pl/math/tools/v_log2f.sollya b/pl/math/tools/v_log2f.sollya index 18869a5..99e050c 100644 --- a/pl/math/tools/v_log2f.sollya +++ b/pl/math/tools/v_log2f.sollya @@ -1,6 +1,6 @@ // polynomial used for __v_log2f(x) // -// Copyright (c) 2022, Arm Limited. +// Copyright (c) 2022-2023, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 9; // poly degree diff --git a/pl/math/v_asinh_3u5.c b/pl/math/v_asinh_3u5.c index 5294a3c..fd329b6 100644 --- a/pl/math/v_asinh_3u5.c +++ b/pl/math/v_asinh_3u5.c @@ -1,6 +1,7 @@ /* * Double-precision vector asinh(x) function. - * Copyright (c) 2022, Arm Limited. + * + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_asinhf_2u7.c b/pl/math/v_asinhf_2u7.c index 4710a22..9d8c8a9 100644 --- a/pl/math/v_asinhf_2u7.c +++ b/pl/math/v_asinhf_2u7.c @@ -1,6 +1,7 @@ /* * Single-precision vector asinh(x) function. - * Copyright (c) 2022, Arm Limited. 
+ * + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_atan2_3u.c b/pl/math/v_atan2_3u.c index b123cfa..6327fea 100644 --- a/pl/math/v_atan2_3u.c +++ b/pl/math/v_atan2_3u.c @@ -1,7 +1,7 @@ /* * Double-precision vector atan2(x) function. * - * Copyright (c) 2021-2022, Arm Limited. + * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_atan2f_3u.c b/pl/math/v_atan2f_3u.c index abf8f5e..5d1e6ca 100644 --- a/pl/math/v_atan2f_3u.c +++ b/pl/math/v_atan2f_3u.c @@ -1,7 +1,7 @@ /* * Single-precision vector atan2(x) function. * - * Copyright (c) 2021-2022, Arm Limited. + * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_atan_2u5.c b/pl/math/v_atan_2u5.c index 92479ab..0f3c2cc 100644 --- a/pl/math/v_atan_2u5.c +++ b/pl/math/v_atan_2u5.c @@ -1,7 +1,7 @@ /* * Double-precision vector atan(x) function. * - * Copyright (c) 2021-2022, Arm Limited. + * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_atanf_3u.c b/pl/math/v_atanf_3u.c index c61f8f8..67d90b9 100644 --- a/pl/math/v_atanf_3u.c +++ b/pl/math/v_atanf_3u.c @@ -1,7 +1,7 @@ /* * Single-precision vector atan(x) function. * - * Copyright (c) 2021-2022, Arm Limited. + * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_atanh_3u5.c b/pl/math/v_atanh_3u5.c index ffd6f59..bfaf5c2 100644 --- a/pl/math/v_atanh_3u5.c +++ b/pl/math/v_atanh_3u5.c @@ -1,7 +1,7 @@ /* * Double-precision vector atanh(x) function. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_atanhf_3u1.c b/pl/math/v_atanhf_3u1.c index 7a027fc..cd30696 100644 --- a/pl/math/v_atanhf_3u1.c +++ b/pl/math/v_atanhf_3u1.c @@ -1,6 +1,7 @@ /* * Single-precision vector atanh(x) function. - * Copyright (c) 2022, Arm Limited. + * + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_cbrt_2u.c b/pl/math/v_cbrt_2u.c index b6e501c..d5abe41 100644 --- a/pl/math/v_cbrt_2u.c +++ b/pl/math/v_cbrt_2u.c @@ -1,6 +1,7 @@ /* * Double-precision vector cbrt(x) function. - * Copyright (c) 2022, Arm Limited. + * + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_cbrtf_1u5.c b/pl/math/v_cbrtf_1u5.c index 38c20e3..62fa375 100644 --- a/pl/math/v_cbrtf_1u5.c +++ b/pl/math/v_cbrtf_1u5.c @@ -1,6 +1,7 @@ /* * Single-precision vector cbrt(x) function. - * Copyright (c) 2022, Arm Limited. + * + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_cosh_2u.c b/pl/math/v_cosh_2u.c index 67390d4..0a9fbf8 100644 --- a/pl/math/v_cosh_2u.c +++ b/pl/math/v_cosh_2u.c @@ -1,6 +1,7 @@ /* * Double-precision vector cosh(x) function. - * Copyright (c) 2022, Arm Limited. + * + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_coshf_2u4.c b/pl/math/v_coshf_2u4.c index bee46ed..1422d4d 100644 --- a/pl/math/v_coshf_2u4.c +++ b/pl/math/v_coshf_2u4.c @@ -1,6 +1,7 @@ /* * Single-precision vector cosh(x) function. 
- * Copyright (c) 2022, Arm Limited. + * + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_erf_2u.c b/pl/math/v_erf_2u.c index caec4d8..1d7ddbb 100644 --- a/pl/math/v_erf_2u.c +++ b/pl/math/v_erf_2u.c @@ -1,7 +1,7 @@ /* * Double-precision vector erf(x) function. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_erf_data.c b/pl/math/v_erf_data.c index 1694f28..7bbb281 100644 --- a/pl/math/v_erf_data.c +++ b/pl/math/v_erf_data.c @@ -2,7 +2,7 @@ * Polynomial coefficients and shifts for double-precision erf(x) vector * function. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_erfc_4u.c b/pl/math/v_erfc_4u.c index 9247f87..c306351 100644 --- a/pl/math/v_erfc_4u.c +++ b/pl/math/v_erfc_4u.c @@ -1,7 +1,7 @@ /* * Double-precision vector erfc(x) function. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_erfc_data.c b/pl/math/v_erfc_data.c index c53a669..3c47033 100644 --- a/pl/math/v_erfc_data.c +++ b/pl/math/v_erfc_data.c @@ -1,7 +1,7 @@ /* * Polynomial coefficients for double-precision erfc(x) vector function. * - * Copyright (c) 2020-2022, Arm Limited. + * Copyright (c) 2020-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_erfcf_1u.c b/pl/math/v_erfcf_1u.c index 4b495d0..963490d 100644 --- a/pl/math/v_erfcf_1u.c +++ b/pl/math/v_erfcf_1u.c @@ -1,7 +1,7 @@ /* * Single-precision vector erfc(x) function. * - * Copyright (c) 2021-2022, Arm Limited. + * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_erff_1u5.c b/pl/math/v_erff_1u5.c index bb9b786..3a25cc8 100644 --- a/pl/math/v_erff_1u5.c +++ b/pl/math/v_erff_1u5.c @@ -1,7 +1,7 @@ /* * Single-precision vector erf(x) function. * - * Copyright (c) 2020-2022, Arm Limited. + * Copyright (c) 2020-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_erff_data.c b/pl/math/v_erff_data.c index 0661d20..73ccb5c 100644 --- a/pl/math/v_erff_data.c +++ b/pl/math/v_erff_data.c @@ -1,7 +1,7 @@ /* * Data for approximation of vector erff. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_exp_tail.c b/pl/math/v_exp_tail.c index fabc110..fd38aa8 100644 --- a/pl/math/v_exp_tail.c +++ b/pl/math/v_exp_tail.c @@ -1,7 +1,7 @@ /* * Double-precision vector e^(x+tail) function. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_exp_tail.h b/pl/math/v_exp_tail.h index e1417d3..903f1fd 100644 --- a/pl/math/v_exp_tail.h +++ b/pl/math/v_exp_tail.h @@ -1,7 +1,7 @@ /* * Constants for double-precision e^(x+tail) vector function. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_exp_tail_data.c b/pl/math/v_exp_tail_data.c index 97e1bc1..675eb76 100644 --- a/pl/math/v_exp_tail_data.c +++ b/pl/math/v_exp_tail_data.c @@ -1,7 +1,7 @@ /* * Lookup table for double-precision e^(x+tail) vector function. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_expf.c b/pl/math/v_expf.c index d6e5720..a422e69 100644 --- a/pl/math/v_expf.c +++ b/pl/math/v_expf.c @@ -1,7 +1,7 @@ /* * Single-precision vector e^x function. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_expm1_2u5.c b/pl/math/v_expm1_2u5.c index 879fcb6..4b491d1 100644 --- a/pl/math/v_expm1_2u5.c +++ b/pl/math/v_expm1_2u5.c @@ -1,7 +1,7 @@ /* * Double-precision vector exp(x) - 1 function. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_expm1f_1u6.c b/pl/math/v_expm1f_1u6.c index 7a59ddc..ab13242 100644 --- a/pl/math/v_expm1f_1u6.c +++ b/pl/math/v_expm1f_1u6.c @@ -1,7 +1,7 @@ /* * Single-precision vector exp(x) - 1 function. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_expm1f_inline.h b/pl/math/v_expm1f_inline.h index ef9e934..c261941 100644 --- a/pl/math/v_expm1f_inline.h +++ b/pl/math/v_expm1f_inline.h @@ -2,7 +2,7 @@ * Helper for single-precision routines which calculate exp(x) - 1 and do not * need special-case handling * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_log10_2u5.c b/pl/math/v_log10_2u5.c index e8d8021..86d398c 100644 --- a/pl/math/v_log10_2u5.c +++ b/pl/math/v_log10_2u5.c @@ -1,7 +1,7 @@ /* * Double-precision vector log10(x) function. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_log10_data.c b/pl/math/v_log10_data.c index d1db9a5..fda85c8 100644 --- a/pl/math/v_log10_data.c +++ b/pl/math/v_log10_data.c @@ -1,7 +1,7 @@ /* * Lookup table for double-precision log10(x) vector function. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_log10f_3u5.c b/pl/math/v_log10f_3u5.c index a032fa9..e9f7f03 100644 --- a/pl/math/v_log10f_3u5.c +++ b/pl/math/v_log10f_3u5.c @@ -1,7 +1,7 @@ /* * Single-precision vector log10 function. * - * Copyright (c) 2020-2022, Arm Limited. + * Copyright (c) 2020-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_log10f_data.c b/pl/math/v_log10f_data.c index c95f38b..537482a 100644 --- a/pl/math/v_log10f_data.c +++ b/pl/math/v_log10f_data.c @@ -1,7 +1,7 @@ /* * Coefficients for single-precision vector log10 function. * - * Copyright (c) 2020-2022, Arm Limited. + * Copyright (c) 2020-2023, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/pl/math/v_log1p_2u5.c b/pl/math/v_log1p_2u5.c index 7ff948f..e482910 100644 --- a/pl/math/v_log1p_2u5.c +++ b/pl/math/v_log1p_2u5.c @@ -1,6 +1,7 @@ /* * Double-precision vector log(1+x) function. - * Copyright (c) 2022, Arm Limited. + * + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_log1pf_2u1.c b/pl/math/v_log1pf_2u1.c index ab5e7b7..4a7732b 100644 --- a/pl/math/v_log1pf_2u1.c +++ b/pl/math/v_log1pf_2u1.c @@ -1,6 +1,7 @@ /* * Single-precision vector log(1+x) function. - * Copyright (c) 2022, Arm Limited. + * + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_log1pf_inline.h b/pl/math/v_log1pf_inline.h index cf32b2a..e3048e6 100644 --- a/pl/math/v_log1pf_inline.h +++ b/pl/math/v_log1pf_inline.h @@ -2,7 +2,7 @@ * Helper for single-precision routines which calculate log(1 + x) and do not * need special-case handling * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_log2_3u.c b/pl/math/v_log2_3u.c index 5b9bdd8..fac73f6 100644 --- a/pl/math/v_log2_3u.c +++ b/pl/math/v_log2_3u.c @@ -1,7 +1,7 @@ /* * Double-precision vector log2 function. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_log2_data.c b/pl/math/v_log2_data.c index e3c56c1..2a1da68 100644 --- a/pl/math/v_log2_data.c +++ b/pl/math/v_log2_data.c @@ -1,7 +1,7 @@ /* * Coefficients and table entries for vector log2 * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_log2f_2u5.c b/pl/math/v_log2f_2u5.c index f4fa0ab..8f9241b 100644 --- a/pl/math/v_log2f_2u5.c +++ b/pl/math/v_log2f_2u5.c @@ -1,7 +1,7 @@ /* * Single-precision vector log2 function. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_log2f_data.c b/pl/math/v_log2f_data.c index 7e5cb1e..b144e8f 100644 --- a/pl/math/v_log2f_data.c +++ b/pl/math/v_log2f_data.c @@ -1,7 +1,7 @@ /* * Coefficients for vector log2f * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_math.h b/pl/math/v_math.h index 0ff3db3..a8fa091 100644 --- a/pl/math/v_math.h +++ b/pl/math/v_math.h @@ -1,7 +1,7 @@ /* * Vector math abstractions. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_sinh_3u.c b/pl/math/v_sinh_3u.c index 8ddd29d..57ec66e 100644 --- a/pl/math/v_sinh_3u.c +++ b/pl/math/v_sinh_3u.c @@ -1,6 +1,7 @@ /* * Double-precision vector sinh(x) function. - * Copyright (c) 2022, Arm Limited. + * + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_sinhf_2u3.c b/pl/math/v_sinhf_2u3.c index a54c178..49cf078 100644 --- a/pl/math/v_sinhf_2u3.c +++ b/pl/math/v_sinhf_2u3.c @@ -1,6 +1,7 @@ /* * Single-precision vector sinh(x) function. 
- * Copyright (c) 2022, Arm Limited. + * + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_tanf_3u2.c b/pl/math/v_tanf_3u2.c index 6125319..a2b1fab 100644 --- a/pl/math/v_tanf_3u2.c +++ b/pl/math/v_tanf_3u2.c @@ -1,7 +1,7 @@ /* * Single-precision vector tan(x) function. * - * Copyright (c) 2021-2022, Arm Limited. + * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_tanhf_2u6.c b/pl/math/v_tanhf_2u6.c index 0e7ff69..3616611 100644 --- a/pl/math/v_tanhf_2u6.c +++ b/pl/math/v_tanhf_2u6.c @@ -1,6 +1,7 @@ /* * Single-precision vector tanh(x) function. - * Copyright (c) 2022, Arm Limited. + * + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/vn_asinh_3u5.c b/pl/math/vn_asinh_3u5.c index e2f3aeb..0d2373b 100644 --- a/pl/math/vn_asinh_3u5.c +++ b/pl/math/vn_asinh_3u5.c @@ -1,7 +1,7 @@ /* * AdvSIMD vector PCS variant of __v_asinh. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "include/mathlib.h" diff --git a/pl/math/vn_asinhf_2u7.c b/pl/math/vn_asinhf_2u7.c index 8efe099..6c8927f 100644 --- a/pl/math/vn_asinhf_2u7.c +++ b/pl/math/vn_asinhf_2u7.c @@ -1,7 +1,7 @@ /* * AdvSIMD vector PCS variant of __v_asinhf. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "include/mathlib.h" diff --git a/pl/math/vn_atan2_3u.c b/pl/math/vn_atan2_3u.c index 7575bff..925b5b4 100644 --- a/pl/math/vn_atan2_3u.c +++ b/pl/math/vn_atan2_3u.c @@ -1,7 +1,7 @@ /* * AdvSIMD vector PCS variant of __v_atan2. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "include/mathlib.h" diff --git a/pl/math/vn_atan2f_3u.c b/pl/math/vn_atan2f_3u.c index b378806..51d33d5 100644 --- a/pl/math/vn_atan2f_3u.c +++ b/pl/math/vn_atan2f_3u.c @@ -1,7 +1,7 @@ /* * AdvSIMD vector PCS variant of __v_atan2f. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "include/mathlib.h" diff --git a/pl/math/vn_atan_2u5.c b/pl/math/vn_atan_2u5.c index 539e61b..ccebce2 100644 --- a/pl/math/vn_atan_2u5.c +++ b/pl/math/vn_atan_2u5.c @@ -1,7 +1,7 @@ /* * AdvSIMD vector PCS variant of __v_atan. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "include/mathlib.h" diff --git a/pl/math/vn_atanf_3u.c b/pl/math/vn_atanf_3u.c index aaeef5b..b879727 100644 --- a/pl/math/vn_atanf_3u.c +++ b/pl/math/vn_atanf_3u.c @@ -1,7 +1,7 @@ /* * AdvSIMD vector PCS variant of __v_atanf. * - * Copyright (c) 2021-2022, Arm Limited. + * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "include/mathlib.h" diff --git a/pl/math/vn_atanh_3u5.c b/pl/math/vn_atanh_3u5.c index 27a5af5..19429b2 100644 --- a/pl/math/vn_atanh_3u5.c +++ b/pl/math/vn_atanh_3u5.c @@ -1,7 +1,7 @@ /* * AdvSIMD vector PCS variant of __v_atanh. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "include/mathlib.h" diff --git a/pl/math/vn_atanhf_3u1.c b/pl/math/vn_atanhf_3u1.c index 32e2c45..7de226d 100644 --- a/pl/math/vn_atanhf_3u1.c +++ b/pl/math/vn_atanhf_3u1.c @@ -1,7 +1,7 @@ /* * AdvSIMD vector PCS variant of __v_atanhf. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "include/mathlib.h" diff --git a/pl/math/vn_cbrt_2u.c b/pl/math/vn_cbrt_2u.c index ccaa085..4cb0dc8 100644 --- a/pl/math/vn_cbrt_2u.c +++ b/pl/math/vn_cbrt_2u.c @@ -1,7 +1,7 @@ /* * AdvSIMD vector PCS variant of __v_cbrt. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "include/mathlib.h" diff --git a/pl/math/vn_cbrtf_1u5.c b/pl/math/vn_cbrtf_1u5.c index 53774cf..40a72d8 100644 --- a/pl/math/vn_cbrtf_1u5.c +++ b/pl/math/vn_cbrtf_1u5.c @@ -1,7 +1,7 @@ /* * AdvSIMD vector PCS variant of __v_cbrtf. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "include/mathlib.h" diff --git a/pl/math/vn_cosh_2u.c b/pl/math/vn_cosh_2u.c index 5950e2d..9bf7f02 100644 --- a/pl/math/vn_cosh_2u.c +++ b/pl/math/vn_cosh_2u.c @@ -1,7 +1,7 @@ /* * AdvSIMD vector PCS variant of __v_cosh. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "include/mathlib.h" diff --git a/pl/math/vn_coshf_2u4.c b/pl/math/vn_coshf_2u4.c index e2fdc13..b149cb3 100644 --- a/pl/math/vn_coshf_2u4.c +++ b/pl/math/vn_coshf_2u4.c @@ -1,7 +1,7 @@ /* * AdvSIMD vector PCS variant of __v_coshf. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "include/mathlib.h" diff --git a/pl/math/vn_erf_2u.c b/pl/math/vn_erf_2u.c index 0ffad52..95bd141 100644 --- a/pl/math/vn_erf_2u.c +++ b/pl/math/vn_erf_2u.c @@ -1,7 +1,7 @@ /* * AdvSIMD vector PCS variant of __v_erf. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "include/mathlib.h" diff --git a/pl/math/vn_erfc_4u.c b/pl/math/vn_erfc_4u.c index 940188a..1cf6546 100644 --- a/pl/math/vn_erfc_4u.c +++ b/pl/math/vn_erfc_4u.c @@ -1,7 +1,7 @@ /* * AdvSIMD vector PCS variant of __v_erfc. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "include/mathlib.h" diff --git a/pl/math/vn_erfcf_1u.c b/pl/math/vn_erfcf_1u.c index 58829b5..ef5a21d 100644 --- a/pl/math/vn_erfcf_1u.c +++ b/pl/math/vn_erfcf_1u.c @@ -1,7 +1,7 @@ /* * AdvSIMD vector PCS variant of __v_erfcf. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "include/mathlib.h" diff --git a/pl/math/vn_erff_1u5.c b/pl/math/vn_erff_1u5.c index f39560e..ee8848e 100644 --- a/pl/math/vn_erff_1u5.c +++ b/pl/math/vn_erff_1u5.c @@ -1,7 +1,7 @@ /* * AdvSIMD vector PCS variant of __v_erff. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "include/mathlib.h" diff --git a/pl/math/vn_exp_tail.c b/pl/math/vn_exp_tail.c index 04b5aaa..52a57fe 100644 --- a/pl/math/vn_exp_tail.c +++ b/pl/math/vn_exp_tail.c @@ -1,7 +1,7 @@ /* * AdvSIMD vector PCS variant of __v_erfc. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "include/mathlib.h" diff --git a/pl/math/vn_expf.c b/pl/math/vn_expf.c index 6e91a94..83e7f0a 100644 --- a/pl/math/vn_expf.c +++ b/pl/math/vn_expf.c @@ -1,7 +1,7 @@ /* * AdvSIMD vector PCS variant of __v_expf. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" diff --git a/pl/math/vn_expm1_2u5.c b/pl/math/vn_expm1_2u5.c index d946808..35111e2 100644 --- a/pl/math/vn_expm1_2u5.c +++ b/pl/math/vn_expm1_2u5.c @@ -1,7 +1,7 @@ /* * AdvSIMD vector PCS variant of __v_expm1. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "include/mathlib.h" diff --git a/pl/math/vn_expm1f_1u6.c b/pl/math/vn_expm1f_1u6.c index 304e0a5..bea491f 100644 --- a/pl/math/vn_expm1f_1u6.c +++ b/pl/math/vn_expm1f_1u6.c @@ -1,7 +1,7 @@ /* * AdvSIMD vector PCS variant of __v_expm1f. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "include/mathlib.h" diff --git a/pl/math/vn_log10_2u5.c b/pl/math/vn_log10_2u5.c index e52285c..5f32c33 100644 --- a/pl/math/vn_log10_2u5.c +++ b/pl/math/vn_log10_2u5.c @@ -1,7 +1,7 @@ /* * AdvSIMD vector PCS variant of __v_log10. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "include/mathlib.h" diff --git a/pl/math/vn_log10f_3u5.c b/pl/math/vn_log10f_3u5.c index 7d6fe25..2673ef5 100644 --- a/pl/math/vn_log10f_3u5.c +++ b/pl/math/vn_log10f_3u5.c @@ -1,7 +1,7 @@ /* * AdvSIMD vector PCS variant of __v_log10f. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "include/mathlib.h" diff --git a/pl/math/vn_log1p_2u5.c b/pl/math/vn_log1p_2u5.c index 7beab12..3f4f8d1 100644 --- a/pl/math/vn_log1p_2u5.c +++ b/pl/math/vn_log1p_2u5.c @@ -1,7 +1,7 @@ /* * AdvSIMD vector PCS variant of __v_log1p. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "include/mathlib.h" diff --git a/pl/math/vn_log1pf_2u1.c b/pl/math/vn_log1pf_2u1.c index f5ebcd8..a319bc9 100644 --- a/pl/math/vn_log1pf_2u1.c +++ b/pl/math/vn_log1pf_2u1.c @@ -1,7 +1,7 @@ /* * AdvSIMD vector PCS variant of __v_log1pf. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "include/mathlib.h" diff --git a/pl/math/vn_log2_3u.c b/pl/math/vn_log2_3u.c index 3a67e03..a870392 100644 --- a/pl/math/vn_log2_3u.c +++ b/pl/math/vn_log2_3u.c @@ -1,7 +1,7 @@ /* * AdvSIMD vector PCS variant of __v_log2. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "include/mathlib.h" diff --git a/pl/math/vn_log2f_2u5.c b/pl/math/vn_log2f_2u5.c index b1e491a..b4a9cb7 100644 --- a/pl/math/vn_log2f_2u5.c +++ b/pl/math/vn_log2f_2u5.c @@ -1,7 +1,7 @@ /* * AdvSIMD vector PCS variant of __v_log2f. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "include/mathlib.h" diff --git a/pl/math/vn_sinh_3u.c b/pl/math/vn_sinh_3u.c index fb42f20..7c881de 100644 --- a/pl/math/vn_sinh_3u.c +++ b/pl/math/vn_sinh_3u.c @@ -1,7 +1,7 @@ /* * AdvSIMD vector PCS variant of __v_sinh. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "include/mathlib.h" diff --git a/pl/math/vn_sinhf_2u3.c b/pl/math/vn_sinhf_2u3.c index 230ee6e..251e732 100644 --- a/pl/math/vn_sinhf_2u3.c +++ b/pl/math/vn_sinhf_2u3.c @@ -1,7 +1,7 @@ /* * AdvSIMD vector PCS variant of __v_sinhf. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "include/mathlib.h" diff --git a/pl/math/vn_tanf_3u2.c b/pl/math/vn_tanf_3u2.c index e37976d..ccdcab6 100644 --- a/pl/math/vn_tanf_3u2.c +++ b/pl/math/vn_tanf_3u2.c @@ -1,7 +1,7 @@ /* * AdvSIMD vector PCS variant of __v_tanf. * - * Copyright (c) 2020-2022, Arm Limited. + * Copyright (c) 2020-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "include/mathlib.h" diff --git a/pl/math/vn_tanhf_2u6.c b/pl/math/vn_tanhf_2u6.c index 86e460c..47f0a7f 100644 --- a/pl/math/vn_tanhf_2u6.c +++ b/pl/math/vn_tanhf_2u6.c @@ -1,7 +1,7 @@ /* * AdvSIMD vector PCS variant of __v_tanhf. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "include/mathlib.h" -- cgit v1.2.3 From 268217a15421eb9dfa42d3f8b8c078dd6de06b61 Mon Sep 17 00:00:00 2001 From: Pierre Blanchard Date: Mon, 9 Jan 2023 08:46:21 +0000 Subject: pl/math: Fix benchmark entries for SVE bivariate functions Variant was wrongly set in structures used to benchmark SVE functions. Before this change only half of the lanes were set as expected. Also reformat for ease of reading. --- pl/math/test/mathbench_funcs.h | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h index bf820bd..e0f6ac7 100644 --- a/pl/math/test/mathbench_funcs.h +++ b/pl/math/test/mathbench_funcs.h @@ -61,26 +61,26 @@ exotic signatures that need wrapping, below. 
*/ {"atan2f", 'f', 0, -10.0, 10.0, {.f = atan2f_wrap}}, -{"atan2", 'd', 0, -10.0, 10.0, {.d = atan2_wrap}}, -{"powi", 'd', 0, 0.01, 11.1, {.d = powi_wrap}}, - -{"__s_atan2f", 'f', 0, -10.0, 10.0, {.f = __s_atan2f_wrap}}, -{"__s_atan2", 'd', 0, -10.0, 10.0, {.d = __s_atan2_wrap}}, -{"__v_atan2f", 'f', 'v', -10.0, 10.0, {.vf = __v_atan2f_wrap}}, -{"__v_atan2", 'd', 'v', -10.0, 10.0, {.vd = __v_atan2_wrap}}, -{"__vn_atan2f", 'f', 'n', -10.0, 10.0, {.vnf = __vn_atan2f_wrap}}, +{"atan2", 'd', 0, -10.0, 10.0, {.d = atan2_wrap}}, +{"powi", 'd', 0, 0.01, 11.1, {.d = powi_wrap}}, + +{"__s_atan2f", 'f', 0, -10.0, 10.0, {.f = __s_atan2f_wrap}}, +{"__s_atan2", 'd', 0, -10.0, 10.0, {.d = __s_atan2_wrap}}, +{"__v_atan2f", 'f', 'v', -10.0, 10.0, {.vf = __v_atan2f_wrap}}, +{"__v_atan2", 'd', 'v', -10.0, 10.0, {.vd = __v_atan2_wrap}}, +{"__vn_atan2f", 'f', 'n', -10.0, 10.0, {.vnf = __vn_atan2f_wrap}}, {"_ZGVnN4vv_atan2f", 'f', 'n', -10.0, 10.0, {.vnf = _Z_atan2f_wrap}}, -{"__vn_atan2", 'd', 'n', -10.0, 10.0, {.vnd = __vn_atan2_wrap}}, -{"_ZGVnN2vv_atan2", 'd', 'n', -10.0, 10.0, {.vnd = _Z_atan2_wrap}}, +{"__vn_atan2", 'd', 'n', -10.0, 10.0, {.vnd = __vn_atan2_wrap}}, +{"_ZGVnN2vv_atan2", 'd', 'n', -10.0, 10.0, {.vnd = _Z_atan2_wrap}}, #if WANT_SVE_MATH -{"__sv_atan2f_x", 'f', 'n', -10.0, 10.0, {.svf = __sv_atan2f_wrap}}, -{"_ZGVsMxvv_atan2f", 'f', 'n', -10.0, 10.0, {.svf = _Z_sv_atan2f_wrap}}, -{"__sv_atan2_x", 'd', 'n', -10.0, 10.0, {.svd = __sv_atan2_wrap}}, -{"_ZGVsM2vv_atan2", 'd', 'n', -10.0, 10.0, {.svd = _Z_sv_atan2_wrap}}, -{"__sv_powif_x", 'f', 'n', -10.0, 10.0, {.svf = __sv_powif_wrap}}, -{"_ZGVsMxvv_powi", 'f', 'n', -10.0, 10.0, {.svf = _Z_sv_powi_wrap}}, -{"__sv_powi_x", 'd', 'n', -10.0, 10.0, {.svd = __sv_powi_wrap}}, -{"_ZGVsMxvv_powk", 'd', 'n', -10.0, 10.0, {.svd = _Z_sv_powk_wrap}}, +{"__sv_atan2f_x", 'f', 's', -10.0, 10.0, {.svf = __sv_atan2f_wrap}}, +{"_ZGVsMxvv_atan2f", 'f', 's', -10.0, 10.0, {.svf = _Z_sv_atan2f_wrap}}, +{"__sv_atan2_x", 'd', 's', -10.0, 10.0, {.svd = __sv_atan2_wrap}}, +{"_ZGVsM2vv_atan2", 'd', 's', -10.0, 10.0, {.svd = _Z_sv_atan2_wrap}}, +{"__sv_powif_x", 'f', 's', -10.0, 10.0, {.svf = __sv_powif_wrap}}, +{"_ZGVsMxvv_powi", 'f', 's', -10.0, 10.0, {.svf = _Z_sv_powi_wrap}}, +{"__sv_powi_x", 'd', 's', -10.0, 10.0, {.svd = __sv_powi_wrap}}, +{"_ZGVsMxvv_powk", 'd', 's', -10.0, 10.0, {.svd = _Z_sv_powk_wrap}}, #endif // clang-format on -- cgit v1.2.3 From 10589b2c95e4d482f09ac6a705918fdc32c8421a Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Tue, 10 Jan 2023 14:24:53 +0000 Subject: string: Improve strrchr-mte Use shrn for narrowing the mask which simplifies code. Unroll the strchr search loop which improves performance on large strings. 
--- string/aarch64/strrchr-mte.S | 52 ++++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/string/aarch64/strrchr-mte.S b/string/aarch64/strrchr-mte.S index e05579f..c451d72 100644 --- a/string/aarch64/strrchr-mte.S +++ b/string/aarch64/strrchr-mte.S @@ -19,7 +19,6 @@ #define src x2 #define tmp x3 -#define wtmp w3 #define synd x3 #define shift x4 #define src_match x4 @@ -31,7 +30,6 @@ #define vhas_nul v2 #define vhas_chr v3 #define vrepmask v4 -#define vrepmask2 v5 #define vend v5 #define dend d5 @@ -47,55 +45,67 @@ ENTRY (__strrchr_aarch64_mte) PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin - mov wtmp, 0x3003 - dup vrepmask.8h, wtmp - tst srcin, 15 - beq L(loop1) - - ld1 {vdata.16b}, [src], 16 + movi vrepmask.16b, 0x33 + ld1 {vdata.16b}, [src] cmeq vhas_nul.16b, vdata.16b, 0 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - mov wtmp, 0xf00f - dup vrepmask2.8h, wtmp bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b + shrn vend.8b, vhas_nul.8h, 4 lsl shift, srcin, 2 fmov synd, dend lsr synd, synd, shift lsl synd, synd, shift ands nul_match, synd, 0xcccccccccccccccc bne L(tail) - cbnz synd, L(loop2) + cbnz synd, L(loop2_start) - .p2align 5 + .p2align 4 L(loop1): - ld1 {vdata.16b}, [src], 16 + ldr q1, [src, 16] + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbnz synd, L(loop1_end) + ldr q1, [src, 32]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov synd, dend cbz synd, L(loop1) - + sub src, src, 16 +L(loop1_end): + add src, src, 16 cmeq vhas_nul.16b, vdata.16b, 0 +#ifdef __AARCH64EB__ + bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b + shrn vend.8b, vhas_nul.8h, 4 + fmov synd, dend + rbit synd, synd +#else bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - bic vhas_nul.8h, 0x0f, lsl 8 - addp vend.16b, vhas_nul.16b, vhas_nul.16b + shrn vend.8b, vhas_nul.8h, 4 fmov synd, dend +#endif ands nul_match, synd, 0xcccccccccccccccc - beq L(loop2) - + beq L(loop2_start) L(tail): sub nul_match, nul_match, 1 and chr_match, synd, 0x3333333333333333 ands chr_match, chr_match, nul_match - sub result, src, 1 + add result, src, 15 clz tmp, chr_match sub result, result, tmp, lsr 2 csel result, result, xzr, ne ret .p2align 4 + nop + nop +L(loop2_start): + add src, src, 16 + bic vrepmask.8h, 0xf0 + L(loop2): cmp synd, 0 csel src_match, src, src_match, ne -- cgit v1.2.3 From 7c1d7a24c9e2ea5ba846a6311d24df2eb0154ea0 Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Tue, 10 Jan 2023 14:27:39 +0000 Subject: string: Optimize strcpy Optimize strcpy main loop - large strings are ~22% faster. 
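In outline, the unrolled main loop behaves like the following C model (a
sketch only, not the committed assembly: memchr stands in for the NEON
cmeq/umaxp/fmov syndrome check, and the real routine uses aligned vector
loads so reading a full 16 bytes ahead of the terminator is safe):

    #include <string.h>

    /* 2x-unrolled copy loop: each iteration commits up to two 16-byte
       blocks already known to be free of the NUL terminator.  */
    static void
    copy_loop_sketch (char *dst, const char *src)
    {
      for (;;)
        {
          if (memchr (src, 0, 16) != NULL)
            break;
          memcpy (dst, src, 16);
          dst += 16;
          src += 16;
          if (memchr (src, 0, 16) != NULL)   /* unrolled second block */
            break;
          memcpy (dst, src, 16);
          dst += 16;
          src += 16;
        }
      strcpy (dst, src);   /* final block holds the NUL */
    }

Unrolling amortises the loop branch and pointer updates over 32 bytes
instead of 16, which is where the gain on large strings comes from.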
--- string/aarch64/strcpy.S | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S index 9aca330..470a865 100644 --- a/string/aarch64/strcpy.S +++ b/string/aarch64/strcpy.S @@ -84,13 +84,10 @@ ENTRY (STRCPY) IFSTPCPY (add result, dstin, len) ret - .p2align 4,,8 L(tail): rbit synd, synd clz len, synd lsr len, len, 2 - - .p2align 4 L(less16): tbz len, 3, L(less8) sub tmp, len, 7 @@ -123,31 +120,37 @@ L(zerobyte): .p2align 4 L(start_loop): - sub len, src, srcin + sub tmp, srcin, dstin ldr dataq2, [srcin] - add dst, dstin, len + sub dst, src, tmp str dataq2, [dstin] - - .p2align 5 L(loop): - str dataq, [dst], 16 - ldr dataq, [src, 16]! + str dataq, [dst], 32 + ldr dataq, [src, 16] + cmeq vhas_nul.16b, vdata.16b, 0 + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbnz synd, L(loopend) + str dataq, [dst, -16] + ldr dataq, [src, 32]! cmeq vhas_nul.16b, vdata.16b, 0 umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov synd, dend cbz synd, L(loop) - + add dst, dst, 16 +L(loopend): shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov synd, dend + sub dst, dst, 31 #ifndef __AARCH64EB__ rbit synd, synd #endif clz len, synd lsr len, len, 2 - sub tmp, len, 15 - ldr dataq, [src, tmp] - str dataq, [dst, tmp] - IFSTPCPY (add result, dst, len) + add dst, dst, len + ldr dataq, [dst, tmp] + str dataq, [dst] + IFSTPCPY (add result, dst, 15) ret END (STRCPY) -- cgit v1.2.3 From cd28f3c4253257027f9834eb7b17db047346a267 Mon Sep 17 00:00:00 2001 From: Jake Weinstein Date: Wed, 21 Dec 2022 09:02:54 +0900 Subject: string: Compile memcpy-sve.S for aarch64 if compiler supports it This is a partial revert of b7e368fb. If SVE assembly is guarded by __ARM_FEATURE_SVE, it cannot build when SVE is not enabled by the build system. This is ok on AOR, but because Android (bionic) uses ifuncs to select the appropriate assembly at runtime, these need to compile regardless of if the target actually supports the instructions. Check for AArch64 and GCC >= 8 or Clang >= 5 so that SVE is not used on compilers that do not support it. This condition will always be true on future builds of Android for AArch64. --- string/aarch64/asmdefs.h | 9 +++++++++ string/aarch64/memcpy-sve.S | 7 +++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/string/aarch64/asmdefs.h b/string/aarch64/asmdefs.h index b5ad6fb..18c331b 100644 --- a/string/aarch64/asmdefs.h +++ b/string/aarch64/asmdefs.h @@ -80,4 +80,13 @@ GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) #define SIZE_ARG(n) #endif +/* Compiler supports SVE instructions */ +#ifndef HAVE_SVE +# if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5) +# define HAVE_SVE 1 +# else +# define HAVE_SVE 0 +# endif +#endif + #endif diff --git a/string/aarch64/memcpy-sve.S b/string/aarch64/memcpy-sve.S index b82510a..f74d4a9 100644 --- a/string/aarch64/memcpy-sve.S +++ b/string/aarch64/memcpy-sve.S @@ -11,10 +11,12 @@ * */ -#if __ARM_FEATURE_SVE - #include "asmdefs.h" +#ifdef HAVE_SVE + +.arch armv8-a+sve + #define dstin x0 #define src x1 #define count x2 @@ -177,4 +179,5 @@ L(return): ret END (__memcpy_aarch64_sve) + #endif -- cgit v1.2.3 From 76c2badba375dd8f785e92079dbd4290038a2750 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Thu, 19 Jan 2023 13:19:38 +0000 Subject: pl/math: Add vector/Neon tan New routine uses a similar technique to the single-precision Neon routine, but with an extra reduction to pi/8 using the double-angle formula. It is accurate to 3.5 ULP. 
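The reconstruction rests on the identity tan(2r) = 2 tan(r) / (1 - tan(r)^2),
combined with tan(t + pi/2) = -1/tan(t) for odd quadrants. A scalar C model
of the final step (illustrative only; p approximates tan(r) with r in
[-pi/8, pi/8], and q is the quadrant count from range reduction):

    /* Assemble tan(x) for x ~= 2r + q*pi/2 from p ~= tan(r):
         q even: tan(2r)        =  2p / (1 - p^2)    = -d / n
         q odd:  tan(2r + pi/2) = -(1 - p^2) / (2p)  =  n / d  */
    static double
    reconstruct_tan (double p, long q)
    {
      double n = p * p - 1.0;   /* -(1 - p^2) */
      double d = 2.0 * p;
      return (q & 1) ? n / d : -d / n;
    }

In the vector code the numerator is formed with a single FMA
(v_fma_f64 (p, p, -1)) and the odd/even test becomes a lane-wise select.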
--- pl/math/include/mathlib.h | 4 ++ pl/math/math_config.h | 5 +++ pl/math/s_tan_3u5.c | 6 +++ pl/math/tools/tan.sollya | 20 +++++++++ pl/math/v_tan_3u5.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++ pl/math/v_tan_data.c | 15 +++++++ pl/math/vn_tan_3u5.c | 12 ++++++ 7 files changed, 164 insertions(+) create mode 100644 pl/math/s_tan_3u5.c create mode 100644 pl/math/tools/tan.sollya create mode 100644 pl/math/v_tan_3u5.c create mode 100644 pl/math/v_tan_data.c create mode 100644 pl/math/vn_tan_3u5.c diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h index 74c3b34..af5f9f9 100644 --- a/pl/math/include/mathlib.h +++ b/pl/math/include/mathlib.h @@ -70,6 +70,7 @@ double __s_log10 (double); double __s_log1p (double); double __s_log2 (double); double __s_sinh (double); +double __s_tan (double); double __s_tanh (double); #if __aarch64__ @@ -113,6 +114,7 @@ __f64x2_t __v_log2 (__f64x2_t); __f32x4_t __v_sinhf (__f32x4_t); __f64x2_t __v_sinh (__f64x2_t); __f32x4_t __v_tanf (__f32x4_t); +__f64x2_t __v_tan (__f64x2_t); __f32x4_t __v_tanhf (__f32x4_t); __f64x2_t __v_tanh (__f64x2_t); @@ -149,6 +151,7 @@ __vpcs __f64x2_t __vn_log2 (__f64x2_t); __vpcs __f32x4_t __vn_sinhf (__f32x4_t); __vpcs __f64x2_t __vn_sinh (__f64x2_t); __vpcs __f32x4_t __vn_tanf (__f32x4_t); +__vpcs __f64x2_t __vn_tan (__f64x2_t); __vpcs __f32x4_t __vn_tanhf (__f32x4_t); __vpcs __f64x2_t __vn_tanh (__f64x2_t); @@ -182,6 +185,7 @@ __vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_tanhf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_tanh (__f64x2_t); diff --git a/pl/math/math_config.h b/pl/math/math_config.h index 03b4ad4..9a7ce96 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -564,4 +564,9 @@ extern const struct cbrt_data double table[5]; } __cbrt_data HIDDEN; +extern const struct v_tan_data +{ + double neg_half_pi_hi, neg_half_pi_lo; + double poly[9]; +} __v_tan_data HIDDEN; #endif diff --git a/pl/math/s_tan_3u5.c b/pl/math/s_tan_3u5.c new file mode 100644 index 0000000..adb807c --- /dev/null +++ b/pl/math/s_tan_3u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_tan_3u5.c" diff --git a/pl/math/tools/tan.sollya b/pl/math/tools/tan.sollya new file mode 100644 index 0000000..bb0bb28 --- /dev/null +++ b/pl/math/tools/tan.sollya @@ -0,0 +1,20 @@ +// polynomial for approximating double precision tan(x) +// +// Copyright (c) 2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 8; + +// interval bounds +a = 0x1.0p-126; +b = pi / 8; + +display = hexadecimal; + +f = (tan(sqrt(x))-sqrt(x))/x^(3/2); +poly = fpminimax(f, deg, [|double ...|], [a*a;b*b]); + +//print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do coeff(poly,i); diff --git a/pl/math/v_tan_3u5.c b/pl/math/v_tan_3u5.c new file mode 100644 index 0000000..f87bacc --- /dev/null +++ b/pl/math/v_tan_3u5.c @@ -0,0 +1,102 @@ +/* + * Double-precision vector tan(x) function. + * + * Copyright (c) 2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "estrin.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define MHalfPiHi v_f64 (__v_tan_data.neg_half_pi_hi) +#define MHalfPiLo v_f64 (__v_tan_data.neg_half_pi_lo) +#define TwoOverPi v_f64 (0x1.45f306dc9c883p-1) +#define Shift v_f64 (0x1.8p52) +#define AbsMask 0x7fffffffffffffff +#define RangeVal 0x4160000000000000 /* asuint64(2^23). */ +#define TinyBound 0x3e50000000000000 /* asuint64(2^-26). */ +#define C(i) v_f64 (__v_tan_data.poly[i]) + +/* Special cases (fall back to scalar calls). */ +VPCS_ATTR +NOINLINE static v_f64_t +specialcase (v_f64_t x) +{ + return v_call_f64 (tan, x, x, v_u64 (-1)); +} + +/* Vector approximation for double-precision tan. + Maximum measured error is 3.48 ULP: + __v_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37 + want -0x1.f6ccd8ecf7deap+37. */ +VPCS_ATTR +v_f64_t V_NAME (tan) (v_f64_t x) +{ + v_u64_t iax = v_as_u64_f64 (x) & AbsMask; + + /* Our argument reduction cannot calculate q with sufficient accuracy for very + large inputs. Fall back to scalar routine for all lanes if any are too + large, or Inf/NaN. If fenv exceptions are expected, also fall back for tiny + input to avoid underflow. Note pl does not supply a scalar double-precision + tan, so the fallback will be statically linked from the system libm. */ +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u64 (iax - TinyBound > RangeVal - TinyBound))) +#else + if (unlikely (v_any_u64 (iax > RangeVal))) +#endif + return specialcase (x); + + /* q = nearest integer to 2 * x / pi. */ + v_f64_t q = v_fma_f64 (x, TwoOverPi, Shift) - Shift; + v_s64_t qi = v_to_s64_f64 (q); + + /* Use q to reduce x to r in [-pi/4, pi/4], by: + r = x - q * pi/2, in extended precision. */ + v_f64_t r = x; + r = v_fma_f64 (q, MHalfPiHi, r); + r = v_fma_f64 (q, MHalfPiLo, r); + /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle + formula. */ + r = r * 0.5; + + /* Approximate tan(r) using order 8 polynomial. + tan(x) is odd, so polynomial has the form: + tan(x) ~= x + C0 * x^3 + C1 * x^5 + C3 * x^7 + ... + Hence we first approximate P(r) = C1 + C2 * r^2 + C3 * r^4 + ... + Then compute the approximation by: + tan(r) ~= r + r^3 * (C0 + r^2 * P(r)). */ + v_f64_t r2 = r * r, r4 = r2 * r2, r8 = r4 * r4; + /* Use offset version of Estrin wrapper to evaluate from C1 onwards. */ + v_f64_t p = ESTRIN_7_ (r2, r4, r8, C, 1); + p = v_fma_f64 (p, r2, C (0)); + p = v_fma_f64 (r2, p * r, r); + + /* Recombination uses double-angle formula: + tan(2x) = 2 * tan(x) / (1 - (tan(x))^2) + and reciprocity around pi/2: + tan(x) = 1 / (tan(pi/2 - x)) + to assemble result using change-of-sign and conditional selection of + numerator/denominator, dependent on odd/even-ness of q (hence quadrant). 
*/ + v_f64_t n = v_fma_f64 (p, p, v_f64 (-1)); + v_f64_t d = p * 2; + + v_u64_t use_recip = v_cond_u64 ((v_as_u64_s64 (qi) & 1) == 0); + + return v_sel_f64 (use_recip, -d, n) / v_sel_f64 (use_recip, n, d); +} +VPCS_ALIAS + +PL_SIG (V, D, 1, tan, -3.1, 3.1) +PL_TEST_ULP (V_NAME (tan), 2.99) +PL_TEST_EXPECT_FENV (V_NAME (tan), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (tan), 0, TinyBound, 5000) +PL_TEST_INTERVAL (V_NAME (tan), TinyBound, RangeVal, 100000) +PL_TEST_INTERVAL (V_NAME (tan), RangeVal, inf, 5000) +PL_TEST_INTERVAL (V_NAME (tan), -0, -TinyBound, 5000) +PL_TEST_INTERVAL (V_NAME (tan), -TinyBound, -RangeVal, 100000) +PL_TEST_INTERVAL (V_NAME (tan), -RangeVal, -inf, 5000) +#endif diff --git a/pl/math/v_tan_data.c b/pl/math/v_tan_data.c new file mode 100644 index 0000000..04e2516 --- /dev/null +++ b/pl/math/v_tan_data.c @@ -0,0 +1,15 @@ +/* + * Coefficients and helpers for double-precision vector tan(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "math_config.h" + +const struct v_tan_data __v_tan_data + = {.neg_half_pi_hi = -0x1.921fb54442d18p0, + .neg_half_pi_lo = -0x1.1a62633145c07p-54, + .poly + = {0x1.5555555555556p-2, 0x1.1111111110a63p-3, 0x1.ba1ba1bb46414p-5, + 0x1.664f47e5b5445p-6, 0x1.226e5e5ecdfa3p-7, 0x1.d6c7ddbf87047p-9, + 0x1.7ea75d05b583ep-10, 0x1.289f22964a03cp-11, 0x1.4e4fd14147622p-12}}; diff --git a/pl/math/vn_tan_3u5.c b/pl/math/vn_tan_3u5.c new file mode 100644 index 0000000..a4efb06 --- /dev/null +++ b/pl/math/vn_tan_3u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_tan. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_tan, _ZGVnN2v_tan) +#include "v_tan_3u5.c" +#endif -- cgit v1.2.3 From a7b6022090c9b43af519dc328c1aeece7258e558 Mon Sep 17 00:00:00 2001 From: Joe Ramsay Date: Mon, 23 Jan 2023 10:01:03 +0000 Subject: pl/math: Reduce order of single-precision tan polynomial For both vector and scalar routines we reduce the order from 6 to 5. For vector routines, this requires reducing RangeVal as for large values the tan polynomial is not quite accurate enough. However the cotan polynomial is used in the inaccurate region in the scalar routine, so this does not need to change. Accuracy of scalar routine is unchanged. Accuracy in both vector routines is now 3.45 ULP, with the same worst-case. 
--- pl/math/horner_wrap.h | 2 +- pl/math/math_config.h | 2 +- pl/math/pairwise_horner_wrap.h | 2 +- pl/math/s_tanf_3u2.c | 6 -- pl/math/s_tanf_3u5.c | 6 ++ pl/math/sv_tanf_3u2.c | 114 ----------------------------------- pl/math/sv_tanf_3u5.c | 112 +++++++++++++++++++++++++++++++++++ pl/math/tanf_3u3.c | 18 ++---- pl/math/tanf_data.c | 27 ++++----- pl/math/tools/tanf.sollya | 14 +---- pl/math/v_tanf_3u2.c | 131 ----------------------------------------- pl/math/v_tanf_3u5.c | 131 +++++++++++++++++++++++++++++++++++++++++ pl/math/vn_tanf_3u2.c | 12 ---- pl/math/vn_tanf_3u5.c | 12 ++++ 14 files changed, 283 insertions(+), 306 deletions(-) delete mode 100644 pl/math/s_tanf_3u2.c create mode 100644 pl/math/s_tanf_3u5.c delete mode 100644 pl/math/sv_tanf_3u2.c create mode 100644 pl/math/sv_tanf_3u5.c delete mode 100644 pl/math/v_tanf_3u2.c create mode 100644 pl/math/v_tanf_3u5.c delete mode 100644 pl/math/vn_tanf_3u2.c create mode 100644 pl/math/vn_tanf_3u5.c diff --git a/pl/math/horner_wrap.h b/pl/math/horner_wrap.h index a254b2d..6478968 100644 --- a/pl/math/horner_wrap.h +++ b/pl/math/horner_wrap.h @@ -6,7 +6,7 @@ */ // clang-format off -#define HORNER_1_(x, c, i) FMA(C(i + 1), x, c(i)) +#define HORNER_1_(x, c, i) FMA(c(i + 1), x, c(i)) #define HORNER_2_(x, c, i) FMA(HORNER_1_ (x, c, i + 1), x, c(i)) #define HORNER_3_(x, c, i) FMA(HORNER_2_ (x, c, i + 1), x, c(i)) #define HORNER_4_(x, c, i) FMA(HORNER_3_ (x, c, i + 1), x, c(i)) diff --git a/pl/math/math_config.h b/pl/math/math_config.h index 9a7ce96..dccb3ce 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -472,7 +472,7 @@ extern const struct log1pf_data float coeffs[LOG1PF_NCOEFFS]; } __log1pf_data HIDDEN; -#define TANF_P_POLY_NCOEFFS 7 +#define TANF_P_POLY_NCOEFFS 6 /* cotan approach needs order 3 on [0, pi/4] to reach <3.5ulps. */ #define TANF_Q_POLY_NCOEFFS 4 extern const struct tanf_poly_data diff --git a/pl/math/pairwise_horner_wrap.h b/pl/math/pairwise_horner_wrap.h index b6efb6f..e56f059 100644 --- a/pl/math/pairwise_horner_wrap.h +++ b/pl/math/pairwise_horner_wrap.h @@ -6,7 +6,7 @@ */ // clang-format off -#define PW_HORNER_1_(x, c, i) FMA(x, C(i + 1), C(i)) +#define PW_HORNER_1_(x, c, i) FMA(x, c(i + 1), c(i)) #define PW_HORNER_3_(x, x2, c, i) FMA(x2, PW_HORNER_1_ (x, c, i + 2), PW_HORNER_1_(x, c, i)) #define PW_HORNER_5_(x, x2, c, i) FMA(x2, PW_HORNER_3_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) #define PW_HORNER_7_(x, x2, c, i) FMA(x2, PW_HORNER_5_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) diff --git a/pl/math/s_tanf_3u2.c b/pl/math/s_tanf_3u2.c deleted file mode 100644 index b5ddf94..0000000 --- a/pl/math/s_tanf_3u2.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_tanf_3u2.c" diff --git a/pl/math/s_tanf_3u5.c b/pl/math/s_tanf_3u5.c new file mode 100644 index 0000000..fa64c8a --- /dev/null +++ b/pl/math/s_tanf_3u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_tanf_3u5.c" diff --git a/pl/math/sv_tanf_3u2.c b/pl/math/sv_tanf_3u2.c deleted file mode 100644 index 78ff480..0000000 --- a/pl/math/sv_tanf_3u2.c +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Single-precision vector tan(x) function. - * - * Copyright (c) 2020-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" - -#if SV_SUPPORTED - -/* Constants. */ -#define NegPio2_1 (sv_f32 (-0x1.921fb6p+0f)) -#define NegPio2_2 (sv_f32 (0x1.777a5cp-25f)) -#define NegPio2_3 (sv_f32 (0x1.ee59dap-50f)) -#define InvPio2 (sv_f32 (0x1.45f306p-1f)) -#define RangeVal (sv_f32 (0x1p17f)) -#define Shift (sv_f32 (0x1.8p+23f)) - -#define poly(i) sv_f32 (__tanf_poly_data.poly_tan[i]) - -/* Use full Estrin's scheme to evaluate polynomial. */ -static inline sv_f32_t -eval_poly (svbool_t pg, sv_f32_t z) -{ - sv_f32_t z2 = svmul_f32_x (pg, z, z); - sv_f32_t z4 = svmul_f32_x (pg, z2, z2); - sv_f32_t y_10 = sv_fma_f32_x (pg, z, poly (1), poly (0)); - sv_f32_t y_32 = sv_fma_f32_x (pg, z, poly (3), poly (2)); - sv_f32_t y_54 = sv_fma_f32_x (pg, z, poly (5), poly (4)); - sv_f32_t y_6_54 = sv_fma_f32_x (pg, z2, poly (6), y_54); - sv_f32_t y_32_10 = sv_fma_f32_x (pg, z2, y_32, y_10); - sv_f32_t y = sv_fma_f32_x (pg, z4, y_6_54, y_32_10); - return y; -} - -static NOINLINE sv_f32_t -__sv_tanf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp) -{ - return sv_call_f32 (tanf, x, y, cmp); -} - -/* Fast implementation of SVE tanf. - The maximum measured errors were located near RangeVal. - Maximum error: 3.121ulps. - svtan_f32(0x1.ff3df8p+16) got -0x1.fbb7b8p-1 - want -0x1.fbb7b2p-1. */ -sv_f32_t -__sv_tanf_x (sv_f32_t x, const svbool_t pg) -{ - /* Determine whether input is too large to perform fast regression. */ - svbool_t cmp = svacge_f32 (pg, x, RangeVal); - svbool_t pred_minuszero = svcmpeq_f32 (pg, x, sv_f32 (-0.0)); - - /* n = rint(x/(pi/2)). */ - sv_f32_t q = sv_fma_f32_x (pg, InvPio2, x, Shift); - sv_f32_t n = svsub_f32_x (pg, q, Shift); - /* n is already a signed integer, simply convert it. */ - sv_s32_t in = sv_to_s32_f32_x (pg, n); - /* Determine if x lives in an interval, where |tan(x)| grows to infinity. */ - sv_s32_t alt = svand_s32_x (pg, in, sv_s32 (1)); - svbool_t pred_alt = svcmpne_s32 (pg, alt, sv_s32 (0)); - - /* r = x - n * (pi/2) (range reduction into 0 .. pi/4). */ - sv_f32_t r; - r = sv_fma_f32_x (pg, NegPio2_1, n, x); - r = sv_fma_f32_x (pg, NegPio2_2, n, r); - r = sv_fma_f32_x (pg, NegPio2_3, n, r); - - /* If x lives in an interval, where |tan(x)| - - is finite, then use a polynomial approximation of the form - tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2). - - grows to infinity then use symmetries of tangent and the identity - tan(r) = cotan(pi/2 - r) to express tan(x) as 1/tan(-r). Finally, use - the same polynomial approximation of tan as above. */ - - /* Perform additional reduction if required. */ - sv_f32_t z = svneg_f32_m (r, pred_alt, r); - - /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4]. */ - sv_f32_t z2 = svmul_f32_x (pg, z, z); - sv_f32_t p = eval_poly (pg, z2); - sv_f32_t y = sv_fma_f32_x (pg, svmul_f32_x (pg, z, z2), p, z); - - /* Transform result back, if necessary. */ - sv_f32_t inv_y = svdiv_f32_x (pg, sv_f32 (1.0f), y); - y = svsel_f32 (pred_alt, inv_y, y); - - /* Fast reduction does not handle the x = -0.0 case well, - therefore it is fixed here. */ - y = svsel_f32 (pred_minuszero, x, y); - - /* No need to pass pg to specialcase here since cmp is a strict subset, - guaranteed by the cmpge above. 
*/ - if (unlikely (svptest_any (pg, cmp))) - return __sv_tanf_specialcase (x, y, cmp); - return y; -} - -PL_ALIAS (__sv_tanf_x, _ZGVsMxv_tanf) - -PL_SIG (SV, F, 1, tan, -3.1, 3.1) -PL_TEST_ULP (__sv_tanf, 2.7) -PL_TEST_INTERVAL (__sv_tanf, -0.0, -0x1p126, 100) -PL_TEST_INTERVAL (__sv_tanf, 0x1p-149, 0x1p-126, 4000) -PL_TEST_INTERVAL (__sv_tanf, 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (__sv_tanf, 0x1p-23, 0.7, 50000) -PL_TEST_INTERVAL (__sv_tanf, 0.7, 1.5, 50000) -PL_TEST_INTERVAL (__sv_tanf, 1.5, 100, 50000) -PL_TEST_INTERVAL (__sv_tanf, 100, 0x1p17, 50000) -PL_TEST_INTERVAL (__sv_tanf, 0x1p17, inf, 50000) -#endif diff --git a/pl/math/sv_tanf_3u5.c b/pl/math/sv_tanf_3u5.c new file mode 100644 index 0000000..cca43bd --- /dev/null +++ b/pl/math/sv_tanf_3u5.c @@ -0,0 +1,112 @@ +/* + * Single-precision vector tan(x) function. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +/* Constants. */ +#define NegPio2_1 (sv_f32 (-0x1.921fb6p+0f)) +#define NegPio2_2 (sv_f32 (0x1.777a5cp-25f)) +#define NegPio2_3 (sv_f32 (0x1.ee59dap-50f)) +#define InvPio2 (sv_f32 (0x1.45f306p-1f)) +#define RangeVal (sv_f32 (0x1p15f)) +#define Shift (sv_f32 (0x1.8p+23f)) + +#define poly(i) sv_f32 (__tanf_poly_data.poly_tan[i]) + +/* Use full Estrin's scheme to evaluate polynomial. */ +static inline sv_f32_t +eval_poly (svbool_t pg, sv_f32_t z) +{ + sv_f32_t z2 = svmul_f32_x (pg, z, z); + sv_f32_t z4 = svmul_f32_x (pg, z2, z2); + sv_f32_t y_10 = sv_fma_f32_x (pg, z, poly (1), poly (0)); + sv_f32_t y_32 = sv_fma_f32_x (pg, z, poly (3), poly (2)); + sv_f32_t y_54 = sv_fma_f32_x (pg, z, poly (5), poly (4)); + sv_f32_t y_32_10 = sv_fma_f32_x (pg, z2, y_32, y_10); + sv_f32_t y = sv_fma_f32_x (pg, z4, y_54, y_32_10); + return y; +} + +static NOINLINE sv_f32_t +__sv_tanf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp) +{ + return sv_call_f32 (tanf, x, y, cmp); +} + +/* Fast implementation of SVE tanf. + Maximum error is 3.45 ULP: + __sv_tanf(-0x1.e5f0cap+13) got 0x1.ff9856p-1 + want 0x1.ff9850p-1. */ +sv_f32_t +__sv_tanf_x (sv_f32_t x, const svbool_t pg) +{ + /* Determine whether input is too large to perform fast regression. */ + svbool_t cmp = svacge_f32 (pg, x, RangeVal); + svbool_t pred_minuszero = svcmpeq_f32 (pg, x, sv_f32 (-0.0)); + + /* n = rint(x/(pi/2)). */ + sv_f32_t q = sv_fma_f32_x (pg, InvPio2, x, Shift); + sv_f32_t n = svsub_f32_x (pg, q, Shift); + /* n is already a signed integer, simply convert it. */ + sv_s32_t in = sv_to_s32_f32_x (pg, n); + /* Determine if x lives in an interval, where |tan(x)| grows to infinity. */ + sv_s32_t alt = svand_s32_x (pg, in, sv_s32 (1)); + svbool_t pred_alt = svcmpne_s32 (pg, alt, sv_s32 (0)); + + /* r = x - n * (pi/2) (range reduction into 0 .. pi/4). */ + sv_f32_t r; + r = sv_fma_f32_x (pg, NegPio2_1, n, x); + r = sv_fma_f32_x (pg, NegPio2_2, n, r); + r = sv_fma_f32_x (pg, NegPio2_3, n, r); + + /* If x lives in an interval, where |tan(x)| + - is finite, then use a polynomial approximation of the form + tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2). + - grows to infinity then use symmetries of tangent and the identity + tan(r) = cotan(pi/2 - r) to express tan(x) as 1/tan(-r). Finally, use + the same polynomial approximation of tan as above. */ + + /* Perform additional reduction if required. */ + sv_f32_t z = svneg_f32_m (r, pred_alt, r); + + /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4]. 
*/ + sv_f32_t z2 = svmul_f32_x (pg, z, z); + sv_f32_t p = eval_poly (pg, z2); + sv_f32_t y = sv_fma_f32_x (pg, svmul_f32_x (pg, z, z2), p, z); + + /* Transform result back, if necessary. */ + sv_f32_t inv_y = svdiv_f32_x (pg, sv_f32 (1.0f), y); + y = svsel_f32 (pred_alt, inv_y, y); + + /* Fast reduction does not handle the x = -0.0 case well, + therefore it is fixed here. */ + y = svsel_f32 (pred_minuszero, x, y); + + /* No need to pass pg to specialcase here since cmp is a strict subset, + guaranteed by the cmpge above. */ + if (unlikely (svptest_any (pg, cmp))) + return __sv_tanf_specialcase (x, y, cmp); + return y; +} + +PL_ALIAS (__sv_tanf_x, _ZGVsMxv_tanf) + +PL_SIG (SV, F, 1, tan, -3.1, 3.1) +PL_TEST_ULP (__sv_tanf, 2.96) +PL_TEST_INTERVAL (__sv_tanf, -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (__sv_tanf, 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (__sv_tanf, 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (__sv_tanf, 0x1p-23, 0.7, 50000) +PL_TEST_INTERVAL (__sv_tanf, 0.7, 1.5, 50000) +PL_TEST_INTERVAL (__sv_tanf, 1.5, 100, 50000) +PL_TEST_INTERVAL (__sv_tanf, 100, 0x1p17, 50000) +PL_TEST_INTERVAL (__sv_tanf, 0x1p17, inf, 50000) +#endif diff --git a/pl/math/tanf_3u3.c b/pl/math/tanf_3u3.c index 0b1617c..ec006dc 100644 --- a/pl/math/tanf_3u3.c +++ b/pl/math/tanf_3u3.c @@ -7,6 +7,7 @@ #include "math_config.h" #include "pl_sig.h" #include "pl_test.h" +#include "pairwise_hornerf.h" /* Useful constants. */ #define NegPio2_1 (-0x1.921fb6p+0f) @@ -21,28 +22,19 @@ /* 2PI * 2^-64. */ #define Pio2p63 (0x1.921FB54442D18p-62) -#define P __tanf_poly_data.poly_tan -#define Q __tanf_poly_data.poly_cotan +#define P(i) __tanf_poly_data.poly_tan[i] +#define Q(i) __tanf_poly_data.poly_cotan[i] static inline float eval_P (float z) { - float z2 = z * z; - float y_10 = fmaf (z, P[1], P[0]); - float y_32 = fmaf (z, P[3], P[2]); - float y_54 = fmaf (z, P[5], P[4]); - float y_6_54 = fmaf (z2, P[6], y_54); - float y_32_10 = fmaf (z2, y_32, y_10); - float y = fmaf (z2, z2 * y_6_54, y_32_10); - return y; + return PAIRWISE_HORNER_5 (z, z * z, P); } static inline float eval_Q (float z) { - float z2 = z * z; - float y = fmaf (z2, fmaf (z, Q[3], Q[2]), fmaf (z, Q[1], Q[0])); - return y; + return PAIRWISE_HORNER_3 (z, z * z, Q); } /* Reduction of the input argument x using Cody-Waite approach, such that x = r diff --git a/pl/math/tanf_data.c b/pl/math/tanf_data.c index 242cfaa..a6b9d51 100644 --- a/pl/math/tanf_data.c +++ b/pl/math/tanf_data.c @@ -10,23 +10,20 @@ const struct tanf_poly_data __tanf_poly_data = { .poly_tan = { /* Coefficients generated using: - remez(f(x) = (tan(sqrt(x)) - sqrt(x)) / (x * sqrt(x)), deg, [a;b], 1, 1e-16, [|dtype ...|]) - optimize each coefficient + poly = fpminimax((tan(sqrt(x))-sqrt(x))/x^(3/2), deg, [|single ...|], [a*a;b*b]); optimize relative error final prec : 23 bits - working prec : 128 bits - deg : 6 - a : 0x1p-126 - b : (pi) / 0x1p2 - dirty rel error : 0x1.df324p-26 - dirty abs error : 0x1.df3244p-26. */ -0x1.555558p-2, /* 0.3333334. */ -0x1.110e1cp-3, /* 0.1333277. */ -0x1.bb0e7p-5, /* 5.408403e-2. */ -0x1.5826c8p-6, /* 2.100534e-2. */ -0x1.8426a6p-7, /* 1.1845428e-2. */ --0x1.7a5adcp-10, /* -1.4433095e-3. */ -0x1.5574dap-8, /* 5.210212e-3. */ + deg : 5 + a : 0x1p-126 ^ 2 + b : ((pi) / 0x1p2) ^ 2 + dirty rel error: 0x1.f7c2e4p-25 + dirty abs error: 0x1.f7c2ecp-25. 
*/ +0x1.55555p-2, +0x1.11166p-3, +0x1.b88a78p-5, +0x1.7b5756p-6, +0x1.4ef4cep-8, +0x1.0e1e74p-7 }, .poly_cotan = { /* Coefficients generated using: diff --git a/pl/math/tools/tanf.sollya b/pl/math/tools/tanf.sollya index 8b2306b..f4b49b4 100644 --- a/pl/math/tools/tanf.sollya +++ b/pl/math/tools/tanf.sollya @@ -6,7 +6,7 @@ dtype = single; mthd = 0; // approximate tan -deg = 6; // poly degree +deg = 5; // poly degree // // Uncomment for cotan // mthd = 1; // approximate cotan @@ -38,19 +38,9 @@ if(mthd==0) then { F = proc(P) { return x + x^3 * P(x^2); }; f = (g(sqrt(x))-sqrt(x))/(x*sqrt(x)); init_poly = 0; - deg_init_poly = -1; // a value such that we actually start by building constant coefficient // Display info print("Approximate g(x) =", g, "as F(x)=", s, "."); - // Remez applied to minimise relative error - approx_remez = proc(func, poly, d) { - return remez(1 - poly / func, deg - d, [a;b], x^d/func(x), 1e-10); - }; - // Iteratively find optimal coeffs - poly = init_poly; - for i from deg_init_poly+1 to deg do { - p = roundcoefficients(approx_remez(f, poly, i), [|dtype ...|]); - poly = poly + x^i * coeff(p,0); - }; + poly = fpminimax(f, deg, [|dtype ...|], [a*a;b*b]); } else if (mthd==1) then { s = "1/x + x * P(x^2)"; diff --git a/pl/math/v_tanf_3u2.c b/pl/math/v_tanf_3u2.c deleted file mode 100644 index a2b1fab..0000000 --- a/pl/math/v_tanf_3u2.c +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Single-precision vector tan(x) function. - * - * Copyright (c) 2021-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#include "estrinf.h" -#include "pl_sig.h" -#include "pl_test.h" - -#if V_SUPPORTED - -/* Constants. */ -#define NegPio2_1 (v_f32 (-0x1.921fb6p+0f)) -#define NegPio2_2 (v_f32 (0x1.777a5cp-25f)) -#define NegPio2_3 (v_f32 (0x1.ee59dap-50f)) -#define InvPio2 (v_f32 (0x1.45f306p-1f)) -#define RangeVal (0x48000000) /* asuint32(0x1p17f). */ -#define TinyBound (0x30000000) /* asuint32 (0x1p-31). */ -#define Shift (v_f32 (0x1.8p+23f)) -#define AbsMask (v_u32 (0x7fffffff)) - -#define poly(i) v_f32 (__tanf_poly_data.poly_tan[i]) - -/* Special cases (fall back to scalar calls). */ -VPCS_ATTR -NOINLINE static v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) -{ - return v_call_f32 (tanf, x, y, cmp); -} - -/* Use a full Estrin scheme to evaluate polynomial. */ -static inline v_f32_t -eval_poly (v_f32_t z) -{ - v_f32_t z2 = z * z; -#if WANT_SIMD_EXCEPT - /* Tiny z (<= 0x1p-31) will underflow when calculating z^4. If fp exceptions - are to be triggered correctly, sidestep this by fixing such lanes to 0. */ - v_u32_t will_uflow = v_cond_u32 ((v_as_u32_f32 (z) & AbsMask) <= TinyBound); - if (unlikely (v_any_u32 (will_uflow))) - z2 = v_sel_f32 (will_uflow, v_f32 (0), z2); -#endif - v_f32_t z4 = z2 * z2; - return ESTRIN_6 (z, z2, z4, poly); -} - -/* Fast implementation of Neon tanf. - Maximum measured error: 3.121ulps. - vtanq_f32(0x1.ff3df8p+16) got -0x1.fbb7b8p-1 - want -0x1.fbb7b2p-1. */ -VPCS_ATTR -v_f32_t V_NAME (tanf) (v_f32_t x) -{ - v_f32_t special_arg = x; - v_u32_t ix = v_as_u32_f32 (x); - v_u32_t iax = ix & AbsMask; - - /* iax >= RangeVal means x, if not inf or NaN, is too large to perform fast - regression. */ -#if WANT_SIMD_EXCEPT - /* If fp exceptions are to be triggered correctly, also special-case tiny - input, as this will load to overflow later. Fix any special lanes to 1 to - prevent any exceptions being triggered. 
*/ - v_u32_t special = v_cond_u32 (iax - TinyBound >= RangeVal - TinyBound); - if (unlikely (v_any_u32 (special))) - x = v_sel_f32 (special, v_f32 (1.0f), x); -#else - /* Otherwise, special-case large and special values. */ - v_u32_t special = v_cond_u32 (iax >= RangeVal); -#endif - - /* n = rint(x/(pi/2)). */ - v_f32_t q = v_fma_f32 (InvPio2, x, Shift); - v_f32_t n = q - Shift; - /* n is representable as a signed integer, simply convert it. */ - v_s32_t in = v_round_s32 (n); - /* Determine if x lives in an interval, where |tan(x)| grows to infinity. */ - v_s32_t alt = in & 1; - v_u32_t pred_alt = (alt != 0); - - /* r = x - n * (pi/2) (range reduction into -pi./4 .. pi/4). */ - v_f32_t r; - r = v_fma_f32 (NegPio2_1, n, x); - r = v_fma_f32 (NegPio2_2, n, r); - r = v_fma_f32 (NegPio2_3, n, r); - - /* If x lives in an interval, where |tan(x)| - - is finite, then use a polynomial approximation of the form - tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2). - - grows to infinity then use symmetries of tangent and the identity - tan(r) = cotan(pi/2 - r) to express tan(x) as 1/tan(-r). Finally, use - the same polynomial approximation of tan as above. */ - - /* Perform additional reduction if required. */ - v_f32_t z = v_sel_f32 (pred_alt, -r, r); - - /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4]. */ - v_f32_t z2 = r * r; - v_f32_t p = eval_poly (z2); - v_f32_t y = v_fma_f32 (z * z2, p, z); - - /* Compute reciprocal and apply if required. */ - v_f32_t inv_y = v_div_f32 (v_f32 (1.0f), y); - y = v_sel_f32 (pred_alt, inv_y, y); - - /* Fast reduction does not handle the x = -0.0 case well, - therefore it is fixed here. */ - y = v_sel_f32 (x == v_f32 (-0.0), x, y); - - if (unlikely (v_any_u32 (special))) - return specialcase (special_arg, y, special); - return y; -} -VPCS_ALIAS - -PL_SIG (V, F, 1, tan, -3.1, 3.1) -PL_TEST_ULP (V_NAME (tanf), 2.7) -PL_TEST_EXPECT_FENV (V_NAME (tanf), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME (tanf), -0.0, -0x1p126, 100) -PL_TEST_INTERVAL (V_NAME (tanf), 0x1p-149, 0x1p-126, 4000) -PL_TEST_INTERVAL (V_NAME (tanf), 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (V_NAME (tanf), 0x1p-23, 0.7, 50000) -PL_TEST_INTERVAL (V_NAME (tanf), 0.7, 1.5, 50000) -PL_TEST_INTERVAL (V_NAME (tanf), 1.5, 100, 50000) -PL_TEST_INTERVAL (V_NAME (tanf), 100, 0x1p17, 50000) -PL_TEST_INTERVAL (V_NAME (tanf), 0x1p17, inf, 50000) -#endif diff --git a/pl/math/v_tanf_3u5.c b/pl/math/v_tanf_3u5.c new file mode 100644 index 0000000..828466b --- /dev/null +++ b/pl/math/v_tanf_3u5.c @@ -0,0 +1,131 @@ +/* + * Single-precision vector tan(x) function. + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "estrinf.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +/* Constants. */ +#define NegPio2_1 (v_f32 (-0x1.921fb6p+0f)) +#define NegPio2_2 (v_f32 (0x1.777a5cp-25f)) +#define NegPio2_3 (v_f32 (0x1.ee59dap-50f)) +#define InvPio2 (v_f32 (0x1.45f306p-1f)) +#define RangeVal (0x47000000) /* asuint32(0x1p15f). */ +#define TinyBound (0x30000000) /* asuint32 (0x1p-31). */ +#define Shift (v_f32 (0x1.8p+23f)) +#define AbsMask (v_u32 (0x7fffffff)) + +#define poly(i) v_f32 (__tanf_poly_data.poly_tan[i]) + +/* Special cases (fall back to scalar calls). */ +VPCS_ATTR +NOINLINE static v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) +{ + return v_call_f32 (tanf, x, y, cmp); +} + +/* Use a full Estrin scheme to evaluate polynomial. 
*/ +static inline v_f32_t +eval_poly (v_f32_t z) +{ + v_f32_t z2 = z * z; +#if WANT_SIMD_EXCEPT + /* Tiny z (<= 0x1p-31) will underflow when calculating z^4. If fp exceptions + are to be triggered correctly, sidestep this by fixing such lanes to 0. */ + v_u32_t will_uflow = v_cond_u32 ((v_as_u32_f32 (z) & AbsMask) <= TinyBound); + if (unlikely (v_any_u32 (will_uflow))) + z2 = v_sel_f32 (will_uflow, v_f32 (0), z2); +#endif + v_f32_t z4 = z2 * z2; + return ESTRIN_5 (z, z2, z4, poly); +} + +/* Fast implementation of Neon tanf. + Maximum error is 3.45 ULP: + __v_tanf(-0x1.e5f0cap+13) got 0x1.ff9856p-1 + want 0x1.ff9850p-1. */ +VPCS_ATTR +v_f32_t V_NAME (tanf) (v_f32_t x) +{ + v_f32_t special_arg = x; + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t iax = ix & AbsMask; + + /* iax >= RangeVal means x, if not inf or NaN, is too large to perform fast + regression. */ +#if WANT_SIMD_EXCEPT + /* If fp exceptions are to be triggered correctly, also special-case tiny + input, as this will load to overflow later. Fix any special lanes to 1 to + prevent any exceptions being triggered. */ + v_u32_t special = v_cond_u32 (iax - TinyBound >= RangeVal - TinyBound); + if (unlikely (v_any_u32 (special))) + x = v_sel_f32 (special, v_f32 (1.0f), x); +#else + /* Otherwise, special-case large and special values. */ + v_u32_t special = v_cond_u32 (iax >= RangeVal); +#endif + + /* n = rint(x/(pi/2)). */ + v_f32_t q = v_fma_f32 (InvPio2, x, Shift); + v_f32_t n = q - Shift; + /* n is representable as a signed integer, simply convert it. */ + v_s32_t in = v_round_s32 (n); + /* Determine if x lives in an interval, where |tan(x)| grows to infinity. */ + v_s32_t alt = in & 1; + v_u32_t pred_alt = (alt != 0); + + /* r = x - n * (pi/2) (range reduction into -pi./4 .. pi/4). */ + v_f32_t r; + r = v_fma_f32 (NegPio2_1, n, x); + r = v_fma_f32 (NegPio2_2, n, r); + r = v_fma_f32 (NegPio2_3, n, r); + + /* If x lives in an interval, where |tan(x)| + - is finite, then use a polynomial approximation of the form + tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2). + - grows to infinity then use symmetries of tangent and the identity + tan(r) = cotan(pi/2 - r) to express tan(x) as 1/tan(-r). Finally, use + the same polynomial approximation of tan as above. */ + + /* Perform additional reduction if required. */ + v_f32_t z = v_sel_f32 (pred_alt, -r, r); + + /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4]. */ + v_f32_t z2 = r * r; + v_f32_t p = eval_poly (z2); + v_f32_t y = v_fma_f32 (z * z2, p, z); + + /* Compute reciprocal and apply if required. */ + v_f32_t inv_y = v_div_f32 (v_f32 (1.0f), y); + y = v_sel_f32 (pred_alt, inv_y, y); + + /* Fast reduction does not handle the x = -0.0 case well, + therefore it is fixed here. 
*/ + y = v_sel_f32 (x == v_f32 (-0.0), x, y); + + if (unlikely (v_any_u32 (special))) + return specialcase (special_arg, y, special); + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, tan, -3.1, 3.1) +PL_TEST_ULP (V_NAME (tanf), 2.96) +PL_TEST_EXPECT_FENV (V_NAME (tanf), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (tanf), -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (V_NAME (tanf), 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (V_NAME (tanf), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (V_NAME (tanf), 0x1p-23, 0.7, 50000) +PL_TEST_INTERVAL (V_NAME (tanf), 0.7, 1.5, 50000) +PL_TEST_INTERVAL (V_NAME (tanf), 1.5, 100, 50000) +PL_TEST_INTERVAL (V_NAME (tanf), 100, 0x1p17, 50000) +PL_TEST_INTERVAL (V_NAME (tanf), 0x1p17, inf, 50000) +#endif diff --git a/pl/math/vn_tanf_3u2.c b/pl/math/vn_tanf_3u2.c deleted file mode 100644 index ccdcab6..0000000 --- a/pl/math/vn_tanf_3u2.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_tanf. - * - * Copyright (c) 2020-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_tanf, _ZGVnN4v_tanf) -#include "v_tanf_3u2.c" -#endif diff --git a/pl/math/vn_tanf_3u5.c b/pl/math/vn_tanf_3u5.c new file mode 100644 index 0000000..a88cb40 --- /dev/null +++ b/pl/math/vn_tanf_3u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_tanf. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_tanf, _ZGVnN4v_tanf) +#include "v_tanf_3u5.c" +#endif -- cgit v1.2.3 From 92864946def299cab01582672cb7510b2e996101 Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Tue, 24 Jan 2023 11:56:38 +0000 Subject: string: Improve SVE memcpy Improve SVE memcpy by copying 2 vectors. This avoids a check on vector length and improves performance of random memcpy. --- string/aarch64/memcpy-sve.S | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/string/aarch64/memcpy-sve.S b/string/aarch64/memcpy-sve.S index f74d4a9..61d36f2 100644 --- a/string/aarch64/memcpy-sve.S +++ b/string/aarch64/memcpy-sve.S @@ -57,14 +57,16 @@ ENTRY (__memcpy_aarch64_sve) cmp count, 128 b.hi L(copy_long) - cmp count, 32 + cntb vlen + cmp count, vlen, lsl 1 b.hi L(copy32_128) whilelo p0.b, xzr, count - cntb vlen - tbnz vlen, 4, L(vlen128) - ld1b z0.b, p0/z, [src] - st1b z0.b, p0, [dstin] + whilelo p1.b, vlen, count + ld1b z0.b, p0/z, [src, 0, mul vl] + ld1b z1.b, p1/z, [src, 1, mul vl] + st1b z0.b, p0, [dstin, 0, mul vl] + st1b z1.b, p1, [dstin, 1, mul vl] ret /* Medium copies: 33..128 bytes. */ @@ -133,14 +135,6 @@ L(copy64_from_end): stp A_q, B_q, [dstend, -32] ret -L(vlen128): - whilelo p1.b, vlen, count - ld1b z0.b, p0/z, [src, 0, mul vl] - ld1b z1.b, p1/z, [src, 1, mul vl] - st1b z0.b, p0, [dstin, 0, mul vl] - st1b z1.b, p1, [dstin, 1, mul vl] - ret - /* Large backwards copy for overlapping copies. Copy 16 bytes and then align srcend to 16-byte alignment. */ L(copy_long_backwards): -- cgit v1.2.3 From 1eb5d7c2ded0691a48e3982211be200f67217019 Mon Sep 17 00:00:00 2001 From: Szabolcs Nagy Date: Tue, 24 Jan 2023 13:24:09 +0000 Subject: Update copyright years Scripted copyright year updates based on git committer date. 
--- Makefile | 2 +- config.mk.dist | 2 +- math/Dir.mk | 2 +- math/logf.c | 2 +- math/test/mathbench.c | 2 +- math/test/mathtest.c | 2 +- math/test/runulp.sh | 2 +- math/test/ulp.c | 2 +- math/v_cos.c | 2 +- math/v_cosf.c | 2 +- math/v_exp.c | 2 +- math/v_exp2f.c | 2 +- math/v_expf.c | 2 +- math/v_math.h | 2 +- math/v_sin.c | 2 +- math/v_sinf.c | 2 +- string/aarch64/__mtag_tag_region.S | 2 +- string/aarch64/__mtag_tag_zero_region.S | 2 +- string/aarch64/asmdefs.h | 2 +- string/aarch64/check-arch.S | 2 +- string/aarch64/memchr-mte.S | 2 +- string/aarch64/memchr-sve.S | 2 +- string/aarch64/memchr.S | 2 +- string/aarch64/memcmp-sve.S | 2 +- string/aarch64/memcmp.S | 2 +- string/aarch64/memcpy-advsimd.S | 2 +- string/aarch64/memcpy-sve.S | 2 +- string/aarch64/memcpy.S | 2 +- string/aarch64/memrchr.S | 2 +- string/aarch64/memset.S | 2 +- string/aarch64/strchr-mte.S | 2 +- string/aarch64/strchr-sve.S | 2 +- string/aarch64/strchr.S | 2 +- string/aarch64/strchrnul-mte.S | 2 +- string/aarch64/strchrnul.S | 2 +- string/aarch64/strcmp-sve.S | 2 +- string/aarch64/strcpy-sve.S | 2 +- string/aarch64/strcpy.S | 2 +- string/aarch64/strlen-mte.S | 2 +- string/aarch64/strlen-sve.S | 2 +- string/aarch64/strlen.S | 2 +- string/aarch64/strncmp-sve.S | 2 +- string/aarch64/strnlen-sve.S | 2 +- string/aarch64/strnlen.S | 2 +- string/aarch64/strrchr-mte.S | 2 +- string/aarch64/strrchr-sve.S | 2 +- string/aarch64/strrchr.S | 2 +- string/arm/check-arch.S | 2 +- string/arm/memchr.S | 2 +- string/arm/memcpy.S | 2 +- string/arm/strcmp-armv6m.S | 2 +- string/arm/strcmp.S | 2 +- string/arm/strlen-armv6t2.S | 2 +- string/test/strlen.c | 2 +- 54 files changed, 54 insertions(+), 54 deletions(-) diff --git a/Makefile b/Makefile index 22323af..c487896 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # Makefile - requires GNU make # -# Copyright (c) 2018-2020, Arm Limited. +# Copyright (c) 2018-2022, Arm Limited. # SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception srcdir = . diff --git a/config.mk.dist b/config.mk.dist index 352136d..7a84975 100644 --- a/config.mk.dist +++ b/config.mk.dist @@ -1,6 +1,6 @@ # Example config.mk # -# Copyright (c) 2018-2020, Arm Limited. +# Copyright (c) 2018-2022, Arm Limited. # SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception # Subprojects to build diff --git a/math/Dir.mk b/math/Dir.mk index a84528d..2a9cad1 100644 --- a/math/Dir.mk +++ b/math/Dir.mk @@ -1,6 +1,6 @@ # Makefile fragment - requires GNU make # -# Copyright (c) 2019, Arm Limited. +# Copyright (c) 2019-2022, Arm Limited. # SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception S := $(srcdir)/math diff --git a/math/logf.c b/math/logf.c index a1cd2d7..820f74c 100644 --- a/math/logf.c +++ b/math/logf.c @@ -1,7 +1,7 @@ /* * Single-precision log function. * - * Copyright (c) 2017-2019, Arm Limited. + * Copyright (c) 2017-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/math/test/mathbench.c b/math/test/mathbench.c index a3093f3..6e18e36 100644 --- a/math/test/mathbench.c +++ b/math/test/mathbench.c @@ -1,7 +1,7 @@ /* * Microbenchmark for math functions. * - * Copyright (c) 2018-2020, Arm Limited. + * Copyright (c) 2018-2022, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/math/test/mathtest.c b/math/test/mathtest.c index 21509b2..3168da4 100644 --- a/math/test/mathtest.c +++ b/math/test/mathtest.c @@ -1,7 +1,7 @@ /* * mathtest.c - test rig for mathlib * - * Copyright (c) 1998-2019, Arm Limited. 
+ * Copyright (c) 1998-2022, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/math/test/runulp.sh b/math/test/runulp.sh index 4793b84..b4000f6 100755 --- a/math/test/runulp.sh +++ b/math/test/runulp.sh @@ -2,7 +2,7 @@ # ULP error check script. # -# Copyright (c) 2019-2020, Arm Limited. +# Copyright (c) 2019-2022, Arm Limited. # SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception #set -x diff --git a/math/test/ulp.c b/math/test/ulp.c index 8589ee9..bb8c3ad 100644 --- a/math/test/ulp.c +++ b/math/test/ulp.c @@ -1,7 +1,7 @@ /* * ULP error checking tool for math functions. * - * Copyright (c) 2019-2020, Arm Limited. + * Copyright (c) 2019-2022, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/math/v_cos.c b/math/v_cos.c index 0a51481..4c8787e 100644 --- a/math/v_cos.c +++ b/math/v_cos.c @@ -1,7 +1,7 @@ /* * Double-precision vector cos function. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2022, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/math/v_cosf.c b/math/v_cosf.c index 55ecbbb..bd677c3 100644 --- a/math/v_cosf.c +++ b/math/v_cosf.c @@ -1,7 +1,7 @@ /* * Single-precision vector cos function. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2022, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/math/v_exp.c b/math/v_exp.c index c25825f..da23fd1 100644 --- a/math/v_exp.c +++ b/math/v_exp.c @@ -1,7 +1,7 @@ /* * Double-precision vector e^x function. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2022, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/math/v_exp2f.c b/math/v_exp2f.c index 22039ca..7f40dba 100644 --- a/math/v_exp2f.c +++ b/math/v_exp2f.c @@ -1,7 +1,7 @@ /* * Single-precision vector 2^x function. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2022, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/math/v_expf.c b/math/v_expf.c index cb4348e..ade23b2 100644 --- a/math/v_expf.c +++ b/math/v_expf.c @@ -1,7 +1,7 @@ /* * Single-precision vector e^x function. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2022, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/math/v_math.h b/math/v_math.h index 5848349..3289916 100644 --- a/math/v_math.h +++ b/math/v_math.h @@ -1,7 +1,7 @@ /* * Vector math abstractions. * - * Copyright (c) 2019-2020, Arm Limited. + * Copyright (c) 2019-2022, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/math/v_sin.c b/math/v_sin.c index af7ccf7..9dbb9de 100644 --- a/math/v_sin.c +++ b/math/v_sin.c @@ -1,7 +1,7 @@ /* * Double-precision vector sin function. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2022, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/math/v_sinf.c b/math/v_sinf.c index ee6ed9a..ce35dac 100644 --- a/math/v_sinf.c +++ b/math/v_sinf.c @@ -1,7 +1,7 @@ /* * Single-precision vector sin function. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2022, Arm Limited. 
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/aarch64/__mtag_tag_region.S b/string/aarch64/__mtag_tag_region.S
index d9b7b64..207e229 100644
--- a/string/aarch64/__mtag_tag_region.S
+++ b/string/aarch64/__mtag_tag_region.S
@@ -1,7 +1,7 @@
 /*
  * __mtag_tag_region - tag memory
  *
- * Copyright (c) 2021, Arm Limited.
+ * Copyright (c) 2021-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/aarch64/__mtag_tag_zero_region.S b/string/aarch64/__mtag_tag_zero_region.S
index 874acf5..44b8e01 100644
--- a/string/aarch64/__mtag_tag_zero_region.S
+++ b/string/aarch64/__mtag_tag_zero_region.S
@@ -1,7 +1,7 @@
 /*
  * __mtag_tag_zero_region - tag memory and fill it with zero bytes
  *
- * Copyright (c) 2021, Arm Limited.
+ * Copyright (c) 2021-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/aarch64/asmdefs.h b/string/aarch64/asmdefs.h
index 18c331b..069b146 100644
--- a/string/aarch64/asmdefs.h
+++ b/string/aarch64/asmdefs.h
@@ -1,7 +1,7 @@
 /*
  * Macros for asm code. AArch64 version.
  *
- * Copyright (c) 2019-2022, Arm Limited.
+ * Copyright (c) 2019-2023, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/aarch64/check-arch.S b/string/aarch64/check-arch.S
index 58f92d9..131b7fa 100644
--- a/string/aarch64/check-arch.S
+++ b/string/aarch64/check-arch.S
@@ -1,7 +1,7 @@
 /*
  * check ARCH setting.
  *
- * Copyright (c) 2020, Arm Limited.
+ * Copyright (c) 2020-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/aarch64/memchr-mte.S b/string/aarch64/memchr-mte.S
index a2870d3..948c3cb 100644
--- a/string/aarch64/memchr-mte.S
+++ b/string/aarch64/memchr-mte.S
@@ -1,7 +1,7 @@
 /*
  * memchr - find a character in a memory zone
  *
- * Copyright (c) 2020, Arm Limited.
+ * Copyright (c) 2020-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/aarch64/memchr-sve.S b/string/aarch64/memchr-sve.S
index 3b358b1..b851cf3 100644
--- a/string/aarch64/memchr-sve.S
+++ b/string/aarch64/memchr-sve.S
@@ -1,7 +1,7 @@
 /*
  * memchr - find a character in a memory zone
  *
- * Copyright (c) 2018-2021, Arm Limited.
+ * Copyright (c) 2018-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S
index 53eadf7..fe6cfe2 100644
--- a/string/aarch64/memchr.S
+++ b/string/aarch64/memchr.S
@@ -1,7 +1,7 @@
 /*
  * memchr - find a character in a memory zone
  *
- * Copyright (c) 2014-2020, Arm Limited.
+ * Copyright (c) 2014-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/aarch64/memcmp-sve.S b/string/aarch64/memcmp-sve.S
index 22e6d2c..d52ce45 100644
--- a/string/aarch64/memcmp-sve.S
+++ b/string/aarch64/memcmp-sve.S
@@ -1,7 +1,7 @@
 /*
  * memcmp - compare memory
  *
- * Copyright (c) 2018-2021, Arm Limited.
+ * Copyright (c) 2018-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S
index aa180e8..35135e7 100644
--- a/string/aarch64/memcmp.S
+++ b/string/aarch64/memcmp.S
@@ -1,6 +1,6 @@
 /* memcmp - compare memory
  *
- * Copyright (c) 2013-2021, Arm Limited.
+ * Copyright (c) 2013-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/aarch64/memcpy-advsimd.S b/string/aarch64/memcpy-advsimd.S
index e86d7a3..e6527d0 100644
--- a/string/aarch64/memcpy-advsimd.S
+++ b/string/aarch64/memcpy-advsimd.S
@@ -1,7 +1,7 @@
 /*
  * memcpy - copy memory area
  *
- * Copyright (c) 2019-2020, Arm Limited.
+ * Copyright (c) 2019-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/aarch64/memcpy-sve.S b/string/aarch64/memcpy-sve.S
index 61d36f2..e8a946d 100644
--- a/string/aarch64/memcpy-sve.S
+++ b/string/aarch64/memcpy-sve.S
@@ -1,7 +1,7 @@
 /*
  * memcpy - copy memory area
  *
- * Copyright (c) 2019-2022, Arm Limited.
+ * Copyright (c) 2019-2023, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S
index 2415bd6..7c0606e 100644
--- a/string/aarch64/memcpy.S
+++ b/string/aarch64/memcpy.S
@@ -1,7 +1,7 @@
 /*
  * memcpy - copy memory area
  *
- * Copyright (c) 2012-2020, Arm Limited.
+ * Copyright (c) 2012-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/aarch64/memrchr.S b/string/aarch64/memrchr.S
index bee71ef..6418bdf 100644
--- a/string/aarch64/memrchr.S
+++ b/string/aarch64/memrchr.S
@@ -1,7 +1,7 @@
 /*
  * memrchr - find last character in a memory zone.
  *
- * Copyright (c) 2020, Arm Limited.
+ * Copyright (c) 2020-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S
index 6bbcedf..553b0fc 100644
--- a/string/aarch64/memset.S
+++ b/string/aarch64/memset.S
@@ -1,7 +1,7 @@
 /*
  * memset - fill memory with a constant byte
  *
- * Copyright (c) 2012-2021, Arm Limited.
+ * Copyright (c) 2012-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/aarch64/strchr-mte.S b/string/aarch64/strchr-mte.S
index 04f269f..6ec08f7 100644
--- a/string/aarch64/strchr-mte.S
+++ b/string/aarch64/strchr-mte.S
@@ -1,7 +1,7 @@
 /*
  * strchr - find a character in a string
  *
- * Copyright (c) 2020, Arm Limited.
+ * Copyright (c) 2020-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/aarch64/strchr-sve.S b/string/aarch64/strchr-sve.S
index e18640c..ff07516 100644
--- a/string/aarch64/strchr-sve.S
+++ b/string/aarch64/strchr-sve.S
@@ -1,7 +1,7 @@
 /*
  * strchr/strchrnul - find a character in a string
  *
- * Copyright (c) 2018-2021, Arm Limited.
+ * Copyright (c) 2018-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S
index a041e57..37193bd 100644
--- a/string/aarch64/strchr.S
+++ b/string/aarch64/strchr.S
@@ -1,7 +1,7 @@
 /*
  * strchr - find a character in a string
  *
- * Copyright (c) 2014-2020, Arm Limited.
+ * Copyright (c) 2014-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/aarch64/strchrnul-mte.S b/string/aarch64/strchrnul-mte.S
index cd67858..543ee88 100644
--- a/string/aarch64/strchrnul-mte.S
+++ b/string/aarch64/strchrnul-mte.S
@@ -1,7 +1,7 @@
 /*
  * strchrnul - find a character or nul in a string
  *
- * Copyright (c) 2020, Arm Limited.
+ * Copyright (c) 2020-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S
index c6b295d..666e8d0 100644
--- a/string/aarch64/strchrnul.S
+++ b/string/aarch64/strchrnul.S
@@ -1,7 +1,7 @@
 /*
  * strchrnul - find a character or nul in a string
  *
- * Copyright (c) 2014-2020, Arm Limited.
+ * Copyright (c) 2014-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/aarch64/strcmp-sve.S b/string/aarch64/strcmp-sve.S
index 4c00463..eaf909a 100644
--- a/string/aarch64/strcmp-sve.S
+++ b/string/aarch64/strcmp-sve.S
@@ -1,7 +1,7 @@
 /*
  * __strcmp_aarch64_sve - compare two strings
  *
- * Copyright (c) 2018-2021, Arm Limited.
+ * Copyright (c) 2018-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/aarch64/strcpy-sve.S b/string/aarch64/strcpy-sve.S
index 803e603..00e72dc 100644
--- a/string/aarch64/strcpy-sve.S
+++ b/string/aarch64/strcpy-sve.S
@@ -1,7 +1,7 @@
 /*
  * strcpy/stpcpy - copy a string returning pointer to start/end.
  *
- * Copyright (c) 2018-2021, Arm Limited.
+ * Copyright (c) 2018-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S
index 470a865..97ae37e 100644
--- a/string/aarch64/strcpy.S
+++ b/string/aarch64/strcpy.S
@@ -1,7 +1,7 @@
 /*
  * strcpy/stpcpy - copy a string returning pointer to start/end.
  *
- * Copyright (c) 2020-2022, Arm Limited.
+ * Copyright (c) 2020-2023, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/aarch64/strlen-mte.S b/string/aarch64/strlen-mte.S
index a83b9b6..7723579 100644
--- a/string/aarch64/strlen-mte.S
+++ b/string/aarch64/strlen-mte.S
@@ -1,7 +1,7 @@
 /*
  * strlen - calculate the length of a string.
  *
- * Copyright (c) 2020, Arm Limited.
+ * Copyright (c) 2020-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/aarch64/strlen-sve.S b/string/aarch64/strlen-sve.S
index 1171558..12ebbdb 100644
--- a/string/aarch64/strlen-sve.S
+++ b/string/aarch64/strlen-sve.S
@@ -1,7 +1,7 @@
 /*
  * __strlen_aarch64_sve - compute the length of a string
  *
- * Copyright (c) 2018-2021, Arm Limited.
+ * Copyright (c) 2018-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S
index f164322..6f6f08f 100644
--- a/string/aarch64/strlen.S
+++ b/string/aarch64/strlen.S
@@ -1,7 +1,7 @@
 /*
  * strlen - calculate the length of a string.
  *
- * Copyright (c) 2020, Arm Limited.
+ * Copyright (c) 2020-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/aarch64/strncmp-sve.S b/string/aarch64/strncmp-sve.S
index 4a7be2d..6a9e9f7 100644
--- a/string/aarch64/strncmp-sve.S
+++ b/string/aarch64/strncmp-sve.S
@@ -1,7 +1,7 @@
 /*
  * strncmp - compare two strings with limit
  *
- * Copyright (c) 2018-2021, Arm Limited.
+ * Copyright (c) 2018-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/aarch64/strnlen-sve.S b/string/aarch64/strnlen-sve.S
index 498a335..6c43dc4 100644
--- a/string/aarch64/strnlen-sve.S
+++ b/string/aarch64/strnlen-sve.S
@@ -1,7 +1,7 @@
 /*
  * strnlen - calculate the length of a string with limit.
  *
- * Copyright (c) 2019-2021, Arm Limited.
+ * Copyright (c) 2019-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S
index 03a4706..f2090a7 100644
--- a/string/aarch64/strnlen.S
+++ b/string/aarch64/strnlen.S
@@ -1,7 +1,7 @@
 /*
  * strnlen - calculate the length of a string with limit.
  *
- * Copyright (c) 2020, Arm Limited.
+ * Copyright (c) 2020-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/aarch64/strrchr-mte.S b/string/aarch64/strrchr-mte.S
index c451d72..bb61ab9 100644
--- a/string/aarch64/strrchr-mte.S
+++ b/string/aarch64/strrchr-mte.S
@@ -1,7 +1,7 @@
 /*
  * strrchr - find last position of a character in a string.
  *
- * Copyright (c) 2020, Arm Limited.
+ * Copyright (c) 2020-2023, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/aarch64/strrchr-sve.S b/string/aarch64/strrchr-sve.S
index fbcd5ba..825a738 100644
--- a/string/aarch64/strrchr-sve.S
+++ b/string/aarch64/strrchr-sve.S
@@ -1,7 +1,7 @@
 /*
  * strrchr - find the last of a character in a string
  *
- * Copyright (c) 2019-2021, Arm Limited.
+ * Copyright (c) 2019-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/aarch64/strrchr.S b/string/aarch64/strrchr.S
index 8f10c96..bf9cb29 100644
--- a/string/aarch64/strrchr.S
+++ b/string/aarch64/strrchr.S
@@ -1,7 +1,7 @@
 /*
  * strrchr - find last position of a character in a string.
  *
- * Copyright (c) 2014-2020, Arm Limited.
+ * Copyright (c) 2014-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/arm/check-arch.S b/string/arm/check-arch.S
index b893f32..9551671 100644
--- a/string/arm/check-arch.S
+++ b/string/arm/check-arch.S
@@ -1,7 +1,7 @@
 /*
  * check ARCH setting.
  *
- * Copyright (c) 2020, Arm Limited.
+ * Copyright (c) 2020-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/arm/memchr.S b/string/arm/memchr.S
index 9b77b75..823d601 100644
--- a/string/arm/memchr.S
+++ b/string/arm/memchr.S
@@ -1,7 +1,7 @@
 /*
  * memchr - scan memory for a character
  *
- * Copyright (c) 2010-2021, Arm Limited.
+ * Copyright (c) 2010-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/arm/memcpy.S b/string/arm/memcpy.S
index c4dfa8a..2423cfd 100644
--- a/string/arm/memcpy.S
+++ b/string/arm/memcpy.S
@@ -1,7 +1,7 @@
 /*
  * memcpy - copy memory area
  *
- * Copyright (c) 2013-2020, Arm Limited.
+ * Copyright (c) 2013-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/arm/strcmp-armv6m.S b/string/arm/strcmp-armv6m.S
index 699fa1b..4d55306 100644
--- a/string/arm/strcmp-armv6m.S
+++ b/string/arm/strcmp-armv6m.S
@@ -1,7 +1,7 @@
 /*
  * strcmp for ARMv6-M (optimized for performance, not size)
  *
- * Copyright (c) 2014-2020, Arm Limited.
+ * Copyright (c) 2014-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/arm/strcmp.S b/string/arm/strcmp.S
index bc6f75f..74b3d23 100644
--- a/string/arm/strcmp.S
+++ b/string/arm/strcmp.S
@@ -1,7 +1,7 @@
 /*
  * strcmp for ARMv7
  *
- * Copyright (c) 2012-2021, Arm Limited.
+ * Copyright (c) 2012-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/arm/strlen-armv6t2.S b/string/arm/strlen-armv6t2.S
index f9f50c0..5eb8671 100644
--- a/string/arm/strlen-armv6t2.S
+++ b/string/arm/strlen-armv6t2.S
@@ -1,7 +1,7 @@
 /*
  * strlen - calculate the length of a string
  *
- * Copyright (c) 2010-2020, Arm Limited.
+ * Copyright (c) 2010-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
diff --git a/string/test/strlen.c b/string/test/strlen.c
index 0c20018..47ef3dc 100644
--- a/string/test/strlen.c
+++ b/string/test/strlen.c
@@ -1,7 +1,7 @@
 /*
  * strlen test.
  *
- * Copyright (c) 2019-2020, Arm Limited.
+ * Copyright (c) 2019-2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
-- cgit v1.2.3

From a1b6ffb361553f7d40cf9491ce017dcbf51a6505 Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy
Date: Tue, 24 Jan 2023 13:33:40 +0000
Subject: pl/math: Fix a copyright notice for consistency

The (c) is not strictly required, but it was missing from only one file.
---
 pl/math/test/testcases/directed/tanh.tst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pl/math/test/testcases/directed/tanh.tst b/pl/math/test/testcases/directed/tanh.tst
index 4a02c55..78776e6 100644
--- a/pl/math/test/testcases/directed/tanh.tst
+++ b/pl/math/test/testcases/directed/tanh.tst
@@ -1,6 +1,6 @@
 ; tanh.tst
 ;
-; Copyright 1999-2023, Arm Limited.
+; Copyright (c) 1999-2023, Arm Limited.
 ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 func=tanh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
-- cgit v1.2.3

From 56e3bf05c19c4e28e1f5edd9093c712f16c5c32a Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy
Date: Tue, 24 Jan 2023 14:07:20 +0000
Subject: v23.01 release

* Project changes
* All files are under a new dual license now (MIT OR Apache-2.0 WITH
  LLVM-exception at the election of the user).
* Added MAINTAINERS file describing who maintains the subdirectories.
* Added README.contributors files documenting contribution requirements.
* Added new pl/ subdirectory for Arm's Performance Library related
  routines.
* String routine changes
* Added memset benchmark.
* Improved strlen and memcpy benchmarks.
* Added SVE memcpy.
* Updated arm string functions to support M-profile PACBTI.
* Merged the MTE and generic versions of strcmp, strncmp, strcpy and
  stpcpy into one implementation.
* Optimized memcmp, memchr-mte, memrchr, strchr-mte, strchrnul-mte,
  strrchr-mte, strlen, strlen-mte, strnlen, strcpy.
* Math routine changes
* Fixed constants in sinf, cosf and sincosf to be computed at compile
  time even with gcc-12 -frounding-math.
* Fixed an invalid shift in logf.
* Support floating-point exceptions in vector math routines when
  WANT_SIMD_EXCEPT is set.
---
 README | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README b/README
index e24746d..a2143a2 100644
--- a/README
+++ b/README
@@ -12,7 +12,7 @@ contribution requirements are documented in README.contributors of the
 appropriate subdirectory.
 
 Regular quarterly releases are tagged as vYY.MM, the latest
-release is v21.02.
+release is v23.01.
 
 Source code layout:
-- cgit v1.2.3

From ffea11cb143d09f08e4d41286484a67061195c11 Mon Sep 17 00:00:00 2001
From: Jake Weinstein
Date: Mon, 14 Feb 2022 14:57:49 +0900
Subject: Build SVE routines.

Test: treehugger
Change-Id: I64eb06a9d17c229abb026439d0cdd36ba646eaf4
---
 Android.bp | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/Android.bp b/Android.bp
index 62b947b..62e26e7 100644
--- a/Android.bp
+++ b/Android.bp
@@ -109,31 +109,42 @@ cc_library_static {
         arm64: {
             srcs: [
                 "string/aarch64/memchr-mte.S",
+                "string/aarch64/memchr-sve.S",
                 "string/aarch64/memchr.S",
+                "string/aarch64/memcmp-sve.S",
                 "string/aarch64/memcmp.S",
                 "string/aarch64/memcpy-advsimd.S",
+                "string/aarch64/memcpy-sve.S",
                 "string/aarch64/memcpy.S",
                 "string/aarch64/memrchr.S",
                 "string/aarch64/memset.S",
+                "string/aarch64/stpcpy-sve.S",
                 "string/aarch64/stpcpy.S",
                 "string/aarch64/strchrnul-mte.S",
+                "string/aarch64/strchrnul-sve.S",
                 "string/aarch64/strchrnul.S",
                 "string/aarch64/strchr-mte.S",
+                "string/aarch64/strchr-sve.S",
                 "string/aarch64/strchr.S",
+                "string/aarch64/strcmp-sve.S",
                 "string/aarch64/strcmp.S",
+                "string/aarch64/strcpy-sve.S",
                 "string/aarch64/strcpy.S",
                 "string/aarch64/strlen-mte.S",
+                "string/aarch64/strlen-sve.S",
                 "string/aarch64/strlen.S",
+                "string/aarch64/strncmp-sve.S",
                 "string/aarch64/strncmp.S",
+                "string/aarch64/strnlen-sve.S",
                 "string/aarch64/strnlen.S",
                 "string/aarch64/strrchr-mte.S",
+                "string/aarch64/strrchr-sve.S",
                 "string/aarch64/strrchr.S",
             ],
             asflags: [
-                "-D__memcmp_aarch64=memcmp",
+                "-march=armv8-a+sve",
                 "-D__memset_aarch64=memset",
                 "-D__memrchr_aarch64=memrchr",
-                "-D__strnlen_aarch64=strnlen",
            ]
         },
     },
-- cgit v1.2.3
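
Editor's note on the patch above: -march=armv8-a+sve only lets the
assembler accept SVE instructions in the *-sve.S sources; it does not
make those entry points safe to call on cores without SVE, so a consumer
linking both variants would normally pick one at runtime. The sketch
below is illustrative only and not part of this patch series:
dispatch_memcpy is a hypothetical wrapper, the __memcpy_aarch64 and
__memcpy_aarch64_sve prototypes follow string/include/stringlib.h, and
the getauxval/HWCAP_SVE check is Linux-specific.

/* Hypothetical runtime dispatch between the base and SVE memcpy
   entry points built above.  Linux-only: uses getauxval.  */
#include <stddef.h>
#include <sys/auxv.h>

#ifndef HWCAP_SVE
/* AArch64 Linux hwcap bit for SVE.  */
# define HWCAP_SVE (1UL << 22)
#endif

void *__memcpy_aarch64 (void *, const void *, size_t);
void *__memcpy_aarch64_sve (void *, const void *, size_t);

void *
dispatch_memcpy (void *dst, const void *src, size_t n)
{
  static int use_sve = -1;
  if (use_sve < 0)
    use_sve = (getauxval (AT_HWCAP) & HWCAP_SVE) != 0;
  return use_sve ? __memcpy_aarch64_sve (dst, src, n)
		 : __memcpy_aarch64 (dst, src, n);
}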