5 files changed, 1158 insertions, 3 deletions
diff --git a/Makefile b/Makefile
index f5b5d3b..9916716 100644
--- a/Makefile
+++ b/Makefile
@@ -19,6 +19,7 @@ ALL_OBJS = $(MATH_OBJS) \
 	$(RTEST_OBJS) \
 	build/test/mathtest.o \
 	build/test/mathbench.o \
+	build/test/ulp.o \
 
 INCLUDES = $(wildcard $(srcdir)/math/include/*.h)
 ALL_INCLUDES = $(INCLUDES:$(srcdir)/math/%=build/%)
@@ -31,6 +32,8 @@ ALL_TOOLS = \
 	build/bin/mathtest \
 	build/bin/mathbench \
 	build/bin/mathbench_libc \
+	build/bin/runulp.sh \
+	build/bin/ulp \
 
 HOST_TOOLS = \
 	build/bin/rtest \
@@ -74,6 +77,8 @@ $(RTEST_OBJS): CFLAGS_ALL = $(HOST_CFLAGS)
 
 build/test/mathtest.o: CFLAGS_ALL += -fmath-errno
 
+build/test/ulp.o: $(srcdir)/test/ulp.h
+
 build/%.o: $(srcdir)/%.S
 	$(CC) $(CFLAGS_ALL) -c -o $@ $<
 
@@ -106,9 +111,15 @@ build/bin/mathbench: build/test/mathbench.o build/lib/libmathlib.a
 build/bin/mathbench_libc: build/test/mathbench.o
 	$(CC) $(CFLAGS_ALL) $(LDFLAGS_ALL) -static -o $@ $^ $(LDLIBS)
 
+build/bin/ulp: build/test/ulp.o build/lib/libmathlib.a
+	$(CC) $(CFLAGS_ALL) $(LDFLAGS_ALL) -static -o $@ $^ $(LDLIBS)
+
 build/include/%.h: $(srcdir)/math/include/%.h
 	cp $< $@
 
+build/bin/%.sh: $(srcdir)/test/%.sh
+	cp $< $@
+
 clean:
 	rm -rf build
 
@@ -141,6 +152,9 @@ check: $(ALL_TOOLS)
 rcheck: $(HOST_TOOLS) $(ALL_TOOLS)
 	cat $(RTESTS) | build/bin/rtest | $(EMULATOR) build/bin/mathtest
 
-check-all: check rcheck
+ucheck: $(ALL_TOOLS)
+	build/bin/runulp.sh $(EMULATOR)
+
+check-all: check rcheck ucheck
 
-.PHONY: all clean distclean install install-tools install-libs install-headers check rcheck check-all
+.PHONY: all clean distclean install install-tools install-libs install-headers check rcheck ucheck check-all
diff --git a/config.mk.dist b/config.mk.dist
index 7cf7393..c549fa4 100644
--- a/config.mk.dist
+++ b/config.mk.dist
@@ -16,7 +16,11 @@ HOST_CFLAGS += -g
 CFLAGS += -g
 
 # Use if the target FPU only supports single precision.
-#CFLAGS += WANT_SINGLEPREC
+#CFLAGS += -DWANT_SINGLEPREC
+
+# Use if mpfr is available on the target for ulp error checking.
+#LDLIBS += -lmpfr -lgmp
+#CFLAGS += -DUSE_MPFR
 
 # Use with gcc.
 CFLAGS += -frounding-math -fexcess-precision=standard -fno-stack-protector
diff --git a/test/runulp.sh b/test/runulp.sh
new file mode 100755
index 0000000..d7d79a4
--- /dev/null
+++ b/test/runulp.sh
@@ -0,0 +1,85 @@
+#!/bin/bash
+
+# ULP error check script.
+#
+# Copyright (c) 2019, Arm Limited.
+# SPDX-License-Identifier: MIT
+
+#set -x
+set -eu
+
+# cd to bin directory.
+cd "${0%/*}"
+
+rmodes='n u d z'
+#rmodes=n
+flags='-q'
+emu="$@"
+
+t() {
+	[ $r = "n" ] && Lt=$L || Lt=$Ldir
+	$emu ./ulp -r $r -e $Lt $flags "$@"
+}
+
+Ldir=0.5
+for r in $rmodes
+do
+L=0.01
+t exp  0 0xffff000000000000 10000
+t exp  0x1p-6     0x1p6     40000
+t exp -0x1p-6    -0x1p6     40000
+t exp  633.3      733.3     10000
+t exp -633.3     -777.3     10000
+
+L=0.01
+t exp2  0 0xffff000000000000 10000
+t exp2  0x1p-6     0x1p6     40000
+t exp2 -0x1p-6    -0x1p6     40000
+t exp2  633.3      733.3     10000
+t exp2 -633.3     -777.3     10000
+
+L=0.05
+t pow  0.5  2.0  x  0  inf 20000
+t pow -0.5 -2.0  x  0  inf 20000
+t pow  0.5  2.0  x -0 -inf 20000
+t pow -0.5 -2.0  x -0 -inf 20000
+t pow  0.5  2.0  x  0x1p-10  0x1p10  40000
+t pow  0.5  2.0  x -0x1p-10 -0x1p10  40000
+t pow  0    inf  x    0.5      2.0   80000
+t pow  0    inf  x   -0.5     -2.0   80000
+t pow  0x1.fp-1   0x1.08p0  x  0x1p8 0x1p17  80000
+t pow  0x1.fp-1   0x1.08p0  x -0x1p8 -0x1p17 80000
+t pow  0         0x1p-1000  x  0 1.0 50000
+t pow  0x1p1000        inf  x  0 1.0 50000
+t pow  0x1.ffffffffffff0p-1  0x1.0000000000008p0 x 0x1p60 0x1p68 50000
+t pow  0x1.ffffffffff000p-1  0x1p0 x 0x1p50 0x1p52 50000
+t pow -0x1.ffffffffff000p-1 -0x1p0 x 0x1p50 0x1p52 50000
+
+L=0.01
+t expf  0    0xffff0000    10000
+t expf  0x1p-14   0x1p8    50000
+t expf -0x1p-14  -0x1p8    50000
+
+L=0.01
+t exp2f  0    0xffff0000   10000
+t exp2f  0x1p-14   0x1p8   50000
+t exp2f -0x1p-14  -0x1p8   50000
+
+L=0.06
+t sinf  0    0xffff0000    10000
+t sinf  0x1p-14  0x1p54    50000
+t sinf -0x1p-14 -0x1p54    50000
+
+L=0.06
+t cosf  0    0xffff0000    10000
+t cosf  0x1p-14  0x1p54    50000
+t cosf -0x1p-14 -0x1p54    50000
+
+L=0.4
+t powf  0x1p-1   0x1p1  x  0x1p-7 0x1p7   50000
+t powf  0x1p-1   0x1p1  x -0x1p-7 -0x1p7  50000
+t powf  0x1p-70 0x1p70  x  0x1p-1 0x1p1   50000
+t powf  0x1p-70 0x1p70  x  -0x1p-1 -0x1p1 50000
+t powf  0x1.ep-1 0x1.1p0 x  0x1p8 0x1p14  50000
+t powf  0x1.ep-1 0x1.1p0 x -0x1p8 -0x1p14 50000
+done
diff --git a/test/ulp.c b/test/ulp.c
new file mode 100644
index 0000000..8de6e5b
--- /dev/null
+++ b/test/ulp.c
@@ -0,0 +1,714 @@
+/*
+ * ULP error checking tool for math functions.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <ctype.h>
+#include <fenv.h>
+#include <float.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* Don't depend on mpfr by default.  */
+#ifndef USE_MPFR
+# define USE_MPFR 0
+#endif
+#if USE_MPFR
+# include <mpfr.h>
+#endif
+
+static inline uint64_t
+asuint64 (double f)
+{
+  union
+  {
+    double f;
+    uint64_t i;
+  } u = {f};
+  return u.i;
+}
+
+static inline double
+asdouble (uint64_t i)
+{
+  union
+  {
+    uint64_t i;
+    double f;
+  } u = {i};
+  return u.f;
+}
+
+static inline uint32_t
+asuint (float f)
+{
+  union
+  {
+    float f;
+    uint32_t i;
+  } u = {f};
+  return u.i;
+}
+
+static inline float
+asfloat (uint32_t i)
+{
+  union
+  {
+    uint32_t i;
+    float f;
+  } u = {i};
+  return u.f;
+}
+
+static uint64_t seed = 0x0123456789abcdef;
+static uint64_t
+rand64 (void)
+{
+  seed = 6364136223846793005ull * seed + 1;
+  return seed ^ (seed >> 32);
+}
+
+/* Uniform random in [0,n].  */
+static uint64_t
+randn (uint64_t n)
+{
+  uint64_t r, m;
+
+  if (n == 0)
+    return 0;
+  n++;
+  if (n == 0)
+    return rand64 ();
+  for (;;)
+    {
+      r = rand64 ();
+      m = r % n;
+      if (r - m <= -n)
+	return m;
+    }
+}
+
+struct gen
+{
+  uint64_t start;
+  uint64_t len;
+  uint64_t start2;
+  uint64_t len2;
+  uint64_t off;
+  uint64_t step;
+  uint64_t cnt;
+};
+
+struct args_f1
+{
+  float x;
+};
+
+struct args_f2
+{
+  float x;
+  float x2;
+};
+
+struct args_d1
+{
+  double x;
+};
+
+struct args_d2
+{
+  double x;
+  double x2;
+};
+
+/* result = y + tail*2^ulpexp.  */
+struct ret_f
+{
+  float y;
+  double tail;
+  int ulpexp;
+  int ex;
+  int ex_may;
+};
+
+struct ret_d
+{
+  double y;
+  double tail;
+  int ulpexp;
+  int ex;
+  int ex_may;
+};
+
+static inline uint64_t
+next1 (struct gen *g)
+{
+  /* For single argument use randomized incremental steps,
+     that produce dense sampling without collisions and allow
+     testing all inputs in a range.  */
+  uint64_t r = g->start + g->off;
+  g->off += g->step + randn (g->step / 2);
+  if (g->off > g->len)
+    g->off -= g->len; /* hack.  */
+  return r;
+}
+
+static inline uint64_t
+next2 (uint64_t *x2, struct gen *g)
+{
+  /* For two arguments use uniform random sampling.  */
+  uint64_t r = g->start + randn (g->len);
+  *x2 = g->start2 + randn (g->len2);
+  return r;
+}
+
+static struct args_f1
+next_f1 (void *g)
+{
+  return (struct args_f1){asfloat (next1 (g))};
+}
+
+static struct args_f2
+next_f2 (void *g)
+{
+  uint64_t x2;
+  uint64_t x = next2 (&x2, g);
+  return (struct args_f2){asfloat (x), asfloat (x2)};
+}
+
+static struct args_d1
+next_d1 (void *g)
+{
+  return (struct args_d1){asdouble (next1 (g))};
+}
+
+static struct args_d2
+next_d2 (void *g)
+{
+  uint64_t x2;
+  uint64_t x = next2 (&x2, g);
+  return (struct args_d2){asdouble (x), asdouble (x2)};
+}
+
+struct conf
+{
+  int r;
+  int rc;
+  int quiet;
+  int mpfr;
+  int fenv;
+  unsigned long long n;
+  double softlim;
+  double errlim;
+};
+
+struct fun
+{
+  const char *name;
+  int arity;
+  int singleprec;
+  union
+  {
+    float (*f1) (float);
+    float (*f2) (float, float);
+    double (*d1) (double);
+    double (*d2) (double, double);
+  } fun;
+  union
+  {
+    double (*f1) (double);
+    double (*f2) (double, double);
+    long double (*d1) (long double);
+    long double (*d2) (long double, long double);
+  } fun_long;
+#if USE_MPFR
+  union
+  {
+    int (*f1) (mpfr_t, const mpfr_t, mpfr_rnd_t);
+    int (*f2) (mpfr_t, const mpfr_t, const mpfr_t, mpfr_rnd_t);
+    int (*d1) (mpfr_t, const mpfr_t, mpfr_rnd_t);
+    int (*d2) (mpfr_t, const mpfr_t, const mpfr_t, mpfr_rnd_t);
+  } fun_mpfr;
+#endif
+};
+
+static const struct fun fun[] = {
+#if USE_MPFR
+# define F(x, x_long, x_mpfr, a, s, t) \
+  {#x, a, s, {.t = x}, {.t = x_long}, {.t = x_mpfr}},
+#else
+# define F(x, x_long, x_mpfr, a, s, t) \
+  {#x, a, s, {.t = x}, {.t = x_long}},
+#endif
+#define F1(x) F (x##f, x, mpfr_##x, 1, 1, f1)
+#define F2(x) F (x##f, x, mpfr_##x, 2, 1, f2)
+#define D1(x) F (x, x##l, mpfr_##x, 1, 0, d1)
+#define D2(x) F (x, x##l, mpfr_##x, 2, 0, d2)
+ F1 (sin)
+ F1 (cos)
+ F1 (exp)
+ F1 (exp2)
+ F1 (log)
+ F1 (log2)
+ F2 (pow)
+ D1 (exp)
+ D1 (exp2)
+ D1 (log)
+ D1 (log2)
+ D2 (pow)
+ {0}};
+
+/* Boilerplate for generic calls.  */
+
+static inline int
+ulpscale_f (float x)
+{
+  int e = asuint (x) >> 23 & 0xff;
+  if (!e)
+    e++;
+  return e - 0x7f - 23;
+}
+static inline int
+ulpscale_d (double x)
+{
+  int e = asuint64 (x) >> 52 & 0x7ff;
+  if (!e)
+    e++;
+  return e - 0x3ff - 52;
+}
+static inline float
+call_f1 (const struct fun *f, struct args_f1 a)
+{
+  return f->fun.f1 (a.x);
+}
+static inline float
+call_f2 (const struct fun *f, struct args_f2 a)
+{
+  return f->fun.f2 (a.x, a.x2);
+}
+
+static inline double
+call_d1 (const struct fun *f, struct args_d1 a)
+{
+  return f->fun.d1 (a.x);
+}
+static inline double
+call_d2 (const struct fun *f, struct args_d2 a)
+{
+  return f->fun.d2 (a.x, a.x2);
+}
+static inline double
+call_long_f1 (const struct fun *f, struct args_f1 a)
+{
+  return f->fun_long.f1 (a.x);
+}
+static inline double
+call_long_f2 (const struct fun *f, struct args_f2 a)
+{
+  return f->fun_long.f2 (a.x, a.x2);
+}
+static inline long double
+call_long_d1 (const struct fun *f, struct args_d1 a)
+{
+  return f->fun_long.d1 (a.x);
+}
+static inline long double
+call_long_d2 (const struct fun *f, struct args_d2 a)
+{
+  return f->fun_long.d2 (a.x, a.x2);
+}
+static inline void
+printcall_f1 (const struct fun *f, struct args_f1 a)
+{
+  printf ("%s(%a)", f->name, a.x);
+}
+static inline void
+printcall_f2 (const struct fun *f, struct args_f2 a)
+{
+  printf ("%s(%a, %a)", f->name, a.x, a.x2);
+}
+static inline void
+printcall_d1 (const struct fun *f, struct args_d1 a)
+{
+  printf ("%s(%a)", f->name, a.x);
+}
+static inline void
+printcall_d2 (const struct fun *f, struct args_d2 a)
+{
+  printf ("%s(%a, %a)", f->name, a.x, a.x2);
+}
+static inline void
+printgen_f1 (const struct fun *f, struct gen *gen)
+{
+  printf ("%s in [%a;%a]", f->name, asfloat (gen->start),
+	  asfloat (gen->start + gen->len));
+}
+static inline void
+printgen_f2 (const struct fun *f, struct gen *gen)
+{
+  printf ("%s in [%a;%a] x [%a;%a]", f->name, asfloat (gen->start),
+	  asfloat (gen->start + gen->len), asfloat (gen->start2),
+	  asfloat (gen->start2 + gen->len2));
+}
+static inline void
+printgen_d1 (const struct fun *f, struct gen *gen)
+{
+  printf ("%s in [%a;%a]", f->name, asdouble (gen->start),
+	  asdouble (gen->start + gen->len));
+}
+static inline void
+printgen_d2 (const struct fun *f, struct gen *gen)
+{
+  printf ("%s in [%a;%a] x [%a;%a]", f->name, asdouble (gen->start),
+	  asdouble (gen->start + gen->len), asdouble (gen->start2),
+	  asdouble (gen->start2 + gen->len2));
+}
+
+#define reduce_f1(a, f, op) (f (a.x))
+#define reduce_f2(a, f, op) (f (a.x) op f (a.x2))
+#define reduce_d1(a, f, op) (f (a.x))
+#define reduce_d2(a, f, op) (f (a.x) op f (a.x2))
+
+#ifndef IEEE_754_2008_SNAN
+# define IEEE_754_2008_SNAN 1
+#endif
+static inline int
+issignaling_f (float x)
+{
+  uint32_t ix = asuint (x);
+  if (!IEEE_754_2008_SNAN)
+    return (ix & 0x7fc00000) == 0x7fc00000;
+  return 2 * (ix ^ 0x00400000) > 2u * 0x7fc00000;
+}
+static inline int
+issignaling_d (double x)
+{
+  uint64_t ix = asuint64 (x);
+  if (!IEEE_754_2008_SNAN)
+    return (ix & 0x7ff8000000000000) == 0x7ff8000000000000;
+  return 2 * (ix ^ 0x0008000000000000) > 2 * 0x7ff8000000000000ULL;
+}
+
+#if USE_MPFR
+static mpfr_rnd_t
+rmap (int r)
+{
+  switch (r)
+    {
+    case FE_TONEAREST:
+      return MPFR_RNDN;
+    case FE_TOWARDZERO:
+      return MPFR_RNDZ;
+    case FE_UPWARD:
+      return MPFR_RNDU;
+    case FE_DOWNWARD:
+      return MPFR_RNDD;
+    }
+  return -1;
+}
+
+#define prec_mpfr_f 50
+#define prec_mpfr_d 80
+#define prec_f 24
+#define prec_d 53
+#define emin_f -148
+#define emin_d -1073
+#define emax_f 128
+#define emax_d 1024
+static inline int
+call_mpfr_f1 (mpfr_t y, const struct fun *f, struct args_f1 a, mpfr_rnd_t r)
+{
+  MPFR_DECL_INIT (x, prec_f);
+  mpfr_set_flt (x, a.x, MPFR_RNDN);
+  return f->fun_mpfr.f1 (y, x, r);
+}
+static inline int
+call_mpfr_f2 (mpfr_t y, const struct fun *f, struct args_f2 a, mpfr_rnd_t r)
+{
+  MPFR_DECL_INIT (x, prec_f);
+  MPFR_DECL_INIT (x2, prec_f);
+  mpfr_set_flt (x, a.x, MPFR_RNDN);
+  mpfr_set_flt (x2, a.x2, MPFR_RNDN);
+  return f->fun_mpfr.f2 (y, x, x2, r);
+}
+static inline int
+call_mpfr_d1 (mpfr_t y, const struct fun *f, struct args_d1 a, mpfr_rnd_t r)
+{
+  MPFR_DECL_INIT (x, prec_d);
+  mpfr_set_d (x, a.x, MPFR_RNDN);
+  return f->fun_mpfr.d1 (y, x, r);
+}
+static inline int
+call_mpfr_d2 (mpfr_t y, const struct fun *f, struct args_d2 a, mpfr_rnd_t r)
+{
+  MPFR_DECL_INIT (x, prec_d);
+  MPFR_DECL_INIT (x2, prec_d);
+  mpfr_set_d (x, a.x, MPFR_RNDN);
+  mpfr_set_d (x2, a.x2, MPFR_RNDN);
+  return f->fun_mpfr.d2 (y, x, x2, r);
+}
+#endif
+
+#define float_f float
+#define double_f double
+#define copysign_f copysignf
+#define nextafter_f nextafterf
+#define fabs_f fabsf
+#define asuint_f asuint
+#define asfloat_f asfloat
+#define scalbn_f scalbnf
+#define lscalbn_f scalbn
+#define halfinf_f 0x1p127f
+#define min_normal_f 0x1p-126f
+
+#define float_d double
+#define double_d long double
+#define copysign_d copysign
+#define nextafter_d nextafter
+#define fabs_d fabs
+#define asuint_d asuint64
+#define asfloat_d asdouble
+#define scalbn_d scalbn
+#define lscalbn_d scalbnl
+#define halfinf_d 0x1p1023
+#define min_normal_d 0x1p-1022
+
+#define NEW_RT
+#define RT(x) x##_f
+#define T(x) x##_f1
+#include "ulp.h"
+#undef T
+#define T(x) x##_f2
+#include "ulp.h"
+#undef T
+#undef RT
+
+#define NEW_RT
+#define RT(x) x##_d
+#define T(x) x##_d1
+#include "ulp.h"
+#undef T
+#define T(x) x##_d2
+#include "ulp.h"
+#undef T
+#undef RT
+
+static void
+usage (void)
+{
+  puts ("./ulp [-q] [-m] [-f] [-r nudz] [-l soft-ulplimit] [-e ulplimit] func "
+	"lo [hi [x lo2 hi2] [count]]");
+  puts ("Compares func against a higher precision implementation in [lo; hi].");
+  puts ("-q: quiet.");
+  puts ("-m: use mpfr even if faster method is available.");
+  puts ("-f: disable fenv testing (rounding modes and exceptions).");
+  puts ("Supported func:");
+  for (const struct fun *f = fun; f->name; f++)
+    printf ("\t%s\n", f->name);
+  exit (1);
+}
+
+static void
+cmp (const struct fun *f, struct gen *gen, const struct conf *conf)
+{
+  if (f->arity == 1 && f->singleprec)
+    cmp_f1 (f, gen, conf);
+  else if (f->arity == 2 && f->singleprec)
+    cmp_f2 (f, gen, conf);
+  else if (f->arity == 1 && !f->singleprec)
+    cmp_d1 (f, gen, conf);
+  else if (f->arity == 2 && !f->singleprec)
+    cmp_d2 (f, gen, conf);
+  else
+    usage ();
+}
+
+static uint64_t
+getnum (const char *s, int singleprec)
+{
+  //	int i;
+  uint64_t sign = 0;
+  //	char buf[12];
+
+  if (s[0] == '+')
+    s++;
+  else if (s[0] == '-')
+    {
+      sign = singleprec ? 1ULL << 31 : 1ULL << 63;
+      s++;
+    }
+  /* 0xXXXX is treated as bit representation, '-' flips the sign bit.  */
+  if (s[0] == '0' && tolower (s[1]) == 'x' && strchr (s, 'p') == 0)
+    return sign ^ strtoull (s, 0, 0);
+  //	/* SNaN, QNaN, NaN, Inf.  */
+  //	for (i=0; s[i] && i < sizeof buf; i++)
+  //		buf[i] = tolower(s[i]);
+  //	buf[i] = 0;
+  //	if (strcmp(buf, "snan") == 0)
+  //		return sign | (singleprec ? 0x7fa00000 : 0x7ff4000000000000);
+  //	if (strcmp(buf, "qnan") == 0 || strcmp(buf, "nan") == 0)
+  //		return sign | (singleprec ? 0x7fc00000 : 0x7ff8000000000000);
+  //	if (strcmp(buf, "inf") == 0 || strcmp(buf, "infinity") == 0)
+  //		return sign | (singleprec ? 0x7f800000 : 0x7ff0000000000000);
+  /* Otherwise assume it's a floating-point literal.  */
+  return sign
+	 | (singleprec ? asuint (strtof (s, 0)) : asuint64 (strtod (s, 0)));
+}
+
+static void
+parsegen (struct gen *g, int argc, char *argv[], const struct fun *f)
+{
+  int singleprec = f->singleprec;
+  int arity = f->arity;
+  uint64_t a, b, a2, b2, n;
+  if (argc < 1)
+    usage ();
+  b = a = getnum (argv[0], singleprec);
+  n = 0;
+  if (argc > 1 && strcmp (argv[1], "x") == 0)
+    {
+      argc -= 2;
+      argv += 2;
+    }
+  else if (argc > 1)
+    {
+      b = getnum (argv[1], singleprec);
+      if (argc > 2 && strcmp (argv[2], "x") == 0)
+	{
+	  argc -= 3;
+	  argv += 3;
+	}
+    }
+  b2 = a2 = getnum (argv[0], singleprec);
+  if (argc > 1)
+    b2 = getnum (argv[1], singleprec);
+  if (argc > 2)
+    n = strtoull (argv[2], 0, 0);
+  if (argc > 3)
+    usage ();
+  //printf("ab %lx %lx ab2 %lx %lx n %lu\n", a, b, a2, b2, n);
+  if (arity == 1)
+    {
+      g->start = a;
+      g->len = b - a;
+      if (n - 1 > b - a)
+	n = b - a + 1;
+      g->off = 0;
+      g->step = n ? (g->len + 1) / n : 1;
+      g->start2 = g->len2 = 0;
+      g->cnt = n;
+    }
+  else if (arity == 2)
+    {
+      g->start = a;
+      g->len = b - a;
+      g->off = g->step = 0;
+      g->start2 = a2;
+      g->len2 = b2 - a2;
+      g->cnt = n;
+    }
+  else
+    usage ();
+}
+
+int
+main (int argc, char *argv[])
+{
+  const struct fun *f;
+  struct gen gen;
+  struct conf conf;
+  conf.rc = 'n';
+  conf.quiet = 0;
+  conf.mpfr = 0;
+  conf.fenv = 1;
+  conf.softlim = 0;
+  conf.errlim = INFINITY;
+  for (;;)
+    {
+      argc--;
+      argv++;
+      if (argc < 1)
+	usage ();
+      if (argv[0][0] != '-')
+	break;
+      switch (argv[0][1])
+	{
+	case 'e':
+	  argc--;
+	  argv++;
+	  if (argc < 1)
+	    usage ();
+	  conf.errlim = strtod (argv[0], 0);
+	  break;
+	case 'f':
+	  conf.fenv = 0;
+	  break;
+	case 'l':
+	  argc--;
+	  argv++;
+	  if (argc < 1)
+	    usage ();
+	  conf.softlim = strtod (argv[0], 0);
+	  break;
+	case 'm':
+	  conf.mpfr = 1;
+	  break;
+	case 'q':
+	  conf.quiet = 1;
+	  break;
+	case 'r':
+	  conf.rc = argv[0][2];
+	  if (!conf.rc)
+	    {
+	      argc--;
+	      argv++;
+	      if (argc < 1)
+		usage ();
+	      conf.rc = argv[0][0];
+	    }
+	  break;
+	default:
+	  usage ();
+	}
+    }
+  switch (conf.rc)
+    {
+    case 'n':
+      conf.r = FE_TONEAREST;
+      break;
+    case 'u':
+      conf.r = FE_UPWARD;
+      break;
+    case 'd':
+      conf.r = FE_DOWNWARD;
+      break;
+    case 'z':
+      conf.r = FE_TOWARDZERO;
+      break;
+    default:
+      usage ();
+    }
+  for (f = fun; f->name; f++)
+    if (strcmp (argv[0], f->name) == 0)
+      break;
+  if (!f->name)
+    usage ();
+  if (!f->singleprec && LDBL_MANT_DIG == DBL_MANT_DIG)
+    conf.mpfr = 1; /* Use mpfr if long double has no extra precision.  */
+  if (!USE_MPFR && conf.mpfr)
+    {
+      puts ("mpfr is not available.");
+      return 1;
+    }
+  argc--;
+  argv++;
+  parsegen (&gen, argc, argv, f);
+  conf.n = gen.cnt;
+  cmp (f, &gen, &conf);
+}
diff --git a/test/ulp.h b/test/ulp.h
new file mode 100644
index 0000000..07b0cb6
--- /dev/null
+++ b/test/ulp.h
@@ -0,0 +1,338 @@
+/*
+ * Generic functions for ULP error estimation.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* For each different math function type,
+   T(x) should add a different suffix to x.
+   RT(x) should add a return type specific suffix to x. */
+
+#ifdef NEW_RT
+#undef NEW_RT
+
+# if USE_MPFR
+static int RT(ulpscale_mpfr) (mpfr_t x, int t)
+{
+  /* TODO: pow of 2 cases.  */
+  if (mpfr_regular_p (x))
+    {
+      mpfr_exp_t e = mpfr_get_exp (x) - RT(prec);
+      if (e < RT(emin))
+	e = RT(emin) - 1;
+      if (e > RT(emax) - RT(prec))
+	e = RT(emax) - RT(prec);
+      return e;
+    }
+  if (mpfr_zero_p (x))
+    return RT(emin) - 1;
+  if (mpfr_inf_p (x))
+    return RT(emax) - RT(prec);
+  /* NaN.  */
+  return 0;
+}
+# endif
+
+/* Difference between exact result and closest real number that
+   gets rounded to got, i.e. error before rounding, for a correctly
+   rounded result the difference is 0.  */
+static double RT(ulperr) (RT(float) got, const struct RT(ret) * p, int r)
+{
+  RT(float) want = p->y;
+  RT(float) d;
+  double e;
+
+  if (RT(asuint) (got) == RT(asuint) (want))
+    return 0.0;
+  if (signbit (got) != signbit (want))
+    /* May have false positives with NaN.  */
+    //return isnan(got) && isnan(want) ? 0 : INFINITY;
+    return INFINITY;
+  if (!isfinite (want) || !isfinite (got))
+    {
+      if (isnan (got) != isnan (want))
+	return INFINITY;
+      if (isnan (want))
+	return 0;
+      if (isinf (got))
+	{
+	  got = RT(copysign) (RT(halfinf), got);
+	  want *= 0.5f;
+	}
+      if (isinf (want))
+	{
+	  want = RT(copysign) (RT(halfinf), want);
+	  got *= 0.5f;
+	}
+    }
+  if (r == FE_TONEAREST)
+    {
+      // TODO: incorrect when got vs want cross a powof2 boundary
+      /* error = got > want
+	      ? got - want - tail ulp - 0.5 ulp
+	      : got - want - tail ulp + 0.5 ulp;  */
+      d = got - want;
+      e = d > 0 ? -p->tail - 0.5 : -p->tail + 0.5;
+    }
+  else
+    {
+      if ((r == FE_DOWNWARD && got < want) || (r == FE_UPWARD && got > want)
+	  || (r == FE_TOWARDZERO && fabs (got) < fabs (want)))
+	got = RT(nextafter) (got, want);
+      d = got - want;
+      e = -p->tail;
+    }
+  return RT(scalbn) (d, -p->ulpexp) + e;
+}
+
+static int RT(isok) (RT(float) ygot, int exgot, RT(float) ywant, int exwant,
+		      int exmay)
+{
+  return RT(asuint) (ygot) == RT(asuint) (ywant)
+	 && ((exgot ^ exwant) & ~exmay) == 0;
+}
+
+static int RT(isok_nofenv) (RT(float) ygot, RT(float) ywant)
+{
+  return RT(asuint) (ygot) == RT(asuint) (ywant);
+}
+#endif
+
+static inline void T(call_fenv) (const struct fun *f, struct T(args) a, int r,
+				  RT(float) * y, int *ex)
+{
+  if (r != FE_TONEAREST)
+    fesetround (r);
+  feclearexcept (FE_ALL_EXCEPT);
+  *y = T(call) (f, a);
+  *ex = fetestexcept (FE_ALL_EXCEPT);
+  if (r != FE_TONEAREST)
+    fesetround (FE_TONEAREST);
+}
+
+static inline void T(call_nofenv) (const struct fun *f, struct T(args) a,
+				    int r, RT(float) * y, int *ex)
+{
+  *y = T(call) (f, a);
+  *ex = 0;
+}
+
+static inline int T(call_long_fenv) (const struct fun *f, struct T(args) a,
+				      int r, struct RT(ret) * p,
+				      RT(float) ygot, int exgot)
+{
+  if (r != FE_TONEAREST)
+    fesetround (r);
+  feclearexcept (FE_ALL_EXCEPT);
+  RT(double) yl = T(call_long) (f, a);
+  p->y = (RT(float)) yl;
+  volatile RT(float) vy = p->y; // TODO: barrier
+  (void) vy;
+  p->ex = fetestexcept (FE_ALL_EXCEPT);
+  if (r != FE_TONEAREST)
+    fesetround (FE_TONEAREST);
+  p->ex_may = FE_INEXACT;
+  if (RT(isok) (ygot, exgot, p->y, p->ex, p->ex_may))
+    return 1;
+  p->ulpexp = RT(ulpscale) (p->y);
+  if (isinf (p->y))
+    p->tail = RT(lscalbn) (yl - (RT(double)) 2 * RT(halfinf), -p->ulpexp);
+  else
+    p->tail = RT(lscalbn) (yl - p->y, -p->ulpexp);
+  if (RT(fabs) (p->y) < RT(min_normal))
+    {
+      /* TODO: subnormal result is treated as undeflow even if it's
+	 exact since call_long may not raise inexact correctly.  */
+      if (p->y != 0 || (p->ex & FE_INEXACT))
+	p->ex |= FE_UNDERFLOW | FE_INEXACT;
+    }
+  return 0;
+}
+static inline int T(call_long_nofenv) (const struct fun *f, struct T(args) a,
+					int r, struct RT(ret) * p,
+					RT(float) ygot, int exgot)
+{
+  RT(double) yl = T(call_long) (f, a);
+  p->y = (RT(float)) yl;
+  if (RT(isok_nofenv) (ygot, p->y))
+    return 1;
+  p->ulpexp = RT(ulpscale) (p->y);
+  if (isinf (p->y))
+    p->tail = RT(lscalbn) (yl - (RT(double)) 2 * RT(halfinf), -p->ulpexp);
+  else
+    p->tail = RT(lscalbn) (yl - p->y, -p->ulpexp);
+  return 0;
+}
+
+/* There are nan input args and all quiet.  */
+static inline int T(qnanpropagation) (struct T(args) a)
+{
+  return T(reduce) (a, isnan, ||) && !T(reduce) (a, RT(issignaling), ||);
+}
+static inline RT(float) T(sum) (struct T(args) a)
+{
+  return T(reduce) (a, , +);
+}
+
+/* returns 1 if the got result is ok.  */
+static inline int T(call_mpfr_fix) (const struct fun *f, struct T(args) a,
+				     int r_fenv, struct RT(ret) * p,
+				     RT(float) ygot, int exgot)
+{
+#if USE_MPFR
+  int t, t2;
+  mpfr_rnd_t r = rmap (r_fenv);
+  MPFR_DECL_INIT(my, RT(prec_mpfr));
+  MPFR_DECL_INIT(mr, RT(prec));
+  MPFR_DECL_INIT(me, RT(prec_mpfr));
+  mpfr_clear_flags ();
+  t = T(call_mpfr) (my, f, a, r);
+  /* Double rounding.  */
+  t2 = mpfr_set (mr, my, r);
+  if (t2)
+    t = t2;
+  mpfr_set_emin (RT(emin));
+  mpfr_set_emax (RT(emax));
+  t = mpfr_check_range (mr, t, r);
+  t = mpfr_subnormalize (mr, t, r);
+  mpfr_set_emax (MPFR_EMAX_DEFAULT);
+  mpfr_set_emin (MPFR_EMIN_DEFAULT);
+  p->y = mpfr_get_d (mr, r);
+  p->ex = t ? FE_INEXACT : 0;
+  p->ex_may = FE_INEXACT;
+  if (mpfr_underflow_p () && (p->ex & FE_INEXACT))
+    /* TODO: handle before and after rounding uflow cases.  */
+    p->ex |= FE_UNDERFLOW;
+  if (mpfr_overflow_p ())
+    p->ex |= FE_OVERFLOW | FE_INEXACT;
+  if (mpfr_divby0_p ())
+    p->ex |= FE_DIVBYZERO;
+  //if (mpfr_erangeflag_p ())
+  //  p->ex |= FE_INVALID;
+  if (!mpfr_nanflag_p () && RT(isok) (ygot, exgot, p->y, p->ex, p->ex_may))
+    return 1;
+  if (mpfr_nanflag_p () && !T(qnanpropagation) (a))
+    p->ex |= FE_INVALID;
+  p->ulpexp = RT(ulpscale_mpfr) (my, t);
+  if (!isfinite (p->y))
+    {
+      p->tail = 0;
+      if (isnan (p->y))
+	{
+	  /* If an input was nan keep its sign.  */
+	  p->y = T(sum) (a);
+	  if (!isnan (p->y))
+	    p->y = (p->y - p->y) / (p->y - p->y);
+	  return RT(isok) (ygot, exgot, p->y, p->ex, p->ex_may);
+	}
+      mpfr_set_si_2exp (mr, signbit (p->y) ? -1 : 1, 1024, MPFR_RNDN);
+      if (mpfr_cmpabs (my, mr) >= 0)
+	return RT(isok) (ygot, exgot, p->y, p->ex, p->ex_may);
+    }
+  mpfr_sub (me, my, mr, MPFR_RNDN);
+  mpfr_mul_2si (me, me, -p->ulpexp, MPFR_RNDN);
+  p->tail = mpfr_get_d (me, MPFR_RNDN);
+  return 0;
+#else
+  abort ();
+#endif
+}
+
+static void T(cmp) (const struct fun *f, struct gen *gen,
+		     const struct conf *conf)
+{
+  double maxerr = 0;
+  uint64_t cnt = 0;
+  uint64_t cnt1 = 0;
+  uint64_t cnt2 = 0;
+  uint64_t cntfail = 0;
+  int r = conf->r;
+  int use_mpfr = conf->mpfr;
+  int fenv = conf->fenv;
+  for (;;)
+    {
+      struct RT(ret) want;
+      struct T(args) a = T(next) (gen);
+      int exgot;
+      RT(float) ygot;
+      if (fenv)
+	T(call_fenv) (f, a, r, &ygot, &exgot);
+      else
+	T(call_nofenv) (f, a, r, &ygot, &exgot);
+      cnt++;
+      int ok = use_mpfr
+		 ? T(call_mpfr_fix) (f, a, r, &want, ygot, exgot)
+		 : (fenv ? T(call_long_fenv) (f, a, r, &want, ygot, exgot)
+			 : T(call_long_nofenv) (f, a, r, &want, ygot, exgot));
+      if (!ok)
+	{
+	  int print = 0;
+	  int fail = 0;
+	  double err = RT(ulperr) (ygot, &want, r);
+	  double abserr = fabs (err);
+	  // TODO: count errors below accuracy limit.
+	  if (abserr > 0)
+	    cnt1++;
+	  if (abserr > 1)
+	    cnt2++;
+	  if (abserr > conf->errlim)
+	    {
+	      print = fail = 1;
+	      cntfail++;
+	    }
+	  if (abserr > maxerr)
+	    {
+	      maxerr = abserr;
+	      if (!conf->quiet && abserr > conf->softlim)
+		print = 1;
+	    }
+	  if (print)
+	    {
+	      T(printcall) (f, a);
+	      // TODO: inf ulp handling
+	      printf (" got %a want %a %+g ulp err %g\n", ygot, want.y,
+		      want.tail, err);
+	    }
+	  int diff = fenv ? exgot ^ want.ex : 0;
+	  if (fenv && (diff & ~want.ex_may))
+	    {
+	      if (!fail)
+		{
+		  fail = 1;
+		  cntfail++;
+		}
+	      T(printcall) (f, a);
+	      printf (" is %a %+g ulp, got except 0x%0x", want.y, want.tail,
+		      exgot);
+	      if (diff & exgot)
+		printf (" wrongly set: 0x%x", diff & exgot);
+	      if (diff & ~exgot)
+		printf (" wrongly clear: 0x%x", diff & ~exgot);
+	      putchar ('\n');
+	    }
+	}
+      if (cnt >= conf->n)
+	break;
+      if (!conf->quiet && cnt % 0x100000 == 0)
+	printf ("progress: %6.3f%% cnt %llu cnt1 %llu cnt2 %llu cntfail %llu "
+		"maxerr %g\n",
+		100.0 * cnt / conf->n, (unsigned long long) cnt,
+		(unsigned long long) cnt1, (unsigned long long) cnt2,
+		(unsigned long long) cntfail, maxerr);
+    }
+  double cc = cnt;
+  if (cntfail)
+    printf ("FAIL ");
+  else
+    printf ("PASS ");
+  T(printgen) (f, gen);
+  printf (" round %c errlim %g maxerr %g %s cnt %llu cnt1 %llu %g%% cnt2 %llu "
+	  "%g%% cntfail %llu %g%%\n",
+	  conf->rc, conf->errlim,
+	  maxerr, conf->r == FE_TONEAREST ? "+0.5" : "+1.0",
+	  (unsigned long long) cnt,
+	  (unsigned long long) cnt1, 100.0 * cnt1 / cc,
+	  (unsigned long long) cnt2, 100.0 * cnt2 / cc,
+	  (unsigned long long) cntfail, 100.0 * cntfail / cc);
+}