From 6a988f6813aa5d929ec7f8aa17946f2d9cf511ce Mon Sep 17 00:00:00 2001
From: Ola Liljedahl
Date: Thu, 27 Feb 2020 15:10:03 +0100
Subject: networking: New subproject.

Add scalar and NEON ones' complement checksumming implementations
for AArch64 and Armv7-A.
---
 networking/Dir.mk                |  76 ++++++++
 networking/aarch64/chksum_simd.c | 146 +++++++++++++++
 networking/arm/chksum_simd.c     | 149 +++++++++++++++
 networking/chksum.c              |  81 +++++++++
 networking/chksum_common.h       | 132 ++++++++++++++
 networking/include/networking.h  |  14 ++
 networking/test/chksum.c         | 381 +++++++++++++++++++++++++++++++++++++++
 7 files changed, 979 insertions(+)
 create mode 100644 networking/Dir.mk
 create mode 100644 networking/aarch64/chksum_simd.c
 create mode 100644 networking/arm/chksum_simd.c
 create mode 100644 networking/chksum.c
 create mode 100644 networking/chksum_common.h
 create mode 100644 networking/include/networking.h
 create mode 100644 networking/test/chksum.c

diff --git a/networking/Dir.mk b/networking/Dir.mk
new file mode 100644
index 0000000..b496103
--- /dev/null
+++ b/networking/Dir.mk
@@ -0,0 +1,76 @@
+# Makefile fragment - requires GNU make
+#
+# Copyright (c) 2019-2020, Arm Limited.
+# SPDX-License-Identifier: MIT
+
+S := $(srcdir)/networking
+B := build/networking
+
+ifeq ($(ARCH),)
+all-networking check-networking install-networking clean-networking:
+	@echo "*** Please set ARCH in config.mk. ***"
+	@exit 1
+else
+
+networking-lib-srcs := $(wildcard $(S)/*.[cS]) $(wildcard $(S)/$(ARCH)/*.[cS])
+networking-test-srcs := $(wildcard $(S)/test/*.c)
+
+networking-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h))
+
+networking-libs := \
+	build/lib/libnetworking.so \
+	build/lib/libnetworking.a \
+
+networking-tools := \
+	build/bin/test/chksum
+
+networking-lib-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(networking-lib-srcs)))
+networking-test-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(networking-test-srcs)))
+
+networking-objs := \
+	$(networking-lib-objs) \
+	$(networking-lib-objs:%.o=%.os) \
+	$(networking-test-objs) \
+
+networking-files := \
+	$(networking-objs) \
+	$(networking-libs) \
+	$(networking-tools) \
+	$(networking-includes) \
+
+all-networking: $(networking-libs) $(networking-tools) $(networking-includes)
+
+$(networking-objs): $(networking-includes)
+$(networking-objs): CFLAGS_ALL += $(networking-cflags)
+
+build/lib/libnetworking.so: $(networking-lib-objs:%.o=%.os)
+	$(CC) $(CFLAGS_ALL) $(LDFLAGS) -shared -o $@ $^
+
+build/lib/libnetworking.a: $(networking-lib-objs)
+	rm -f $@
+	$(AR) rc $@ $^
+	$(RANLIB) $@
+
+build/bin/test/%: $(B)/test/%.o build/lib/libnetworking.a
+	$(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
+
+build/include/%.h: $(S)/include/%.h
+	cp $< $@
+
+build/bin/%.sh: $(S)/test/%.sh
+	cp $< $@
+
+check-networking: $(networking-tools)
+	$(EMULATOR) build/bin/test/chksum -i simple
+	$(EMULATOR) build/bin/test/chksum -i scalar
+	$(EMULATOR) build/bin/test/chksum -i simd || true # simd is not always available
+
+install-networking: \
+	$(networking-libs:build/lib/%=$(DESTDIR)$(libdir)/%) \
+	$(networking-includes:build/include/%=$(DESTDIR)$(includedir)/%)
+
+clean-networking:
+	rm -f $(networking-files)
+endif
+
+.PHONY: all-networking check-networking install-networking clean-networking
diff --git a/networking/aarch64/chksum_simd.c b/networking/aarch64/chksum_simd.c
new file mode 100644
index 0000000..6d5be58
--- /dev/null
+++ b/networking/aarch64/chksum_simd.c
@@ -0,0 +1,146 @@
+/*
+ * AArch64-specific checksum implementation using NEON
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "networking.h"
+#include "../chksum_common.h"
+
+#ifndef __ARM_NEON
+#pragma GCC target("+simd")
+#endif
+
+#include <arm_neon.h>
+
+always_inline
+static inline uint64_t
+slurp_head64(const void **pptr, uint32_t *nbytes)
+{
+    Assert(*nbytes >= 8);
+    uint64_t sum = 0;
+    uint32_t off = (uintptr_t) *pptr % 8;
+    if (likely(off != 0))
+    {
+        /* Get rid of bytes 0..off-1 */
+        const unsigned char *ptr64 = align_ptr(*pptr, 8);
+        uint64_t mask = ALL_ONES << (CHAR_BIT * off);
+        uint64_t val = load64(ptr64) & mask;
+        /* Fold 64-bit sum to 33 bits */
+        sum = val >> 32;
+        sum += (uint32_t) val;
+        *pptr = ptr64 + 8;
+        *nbytes -= 8 - off;
+    }
+    return sum;
+}
+
+always_inline
+static inline uint64_t
+slurp_tail64(uint64_t sum, const void *ptr, uint32_t nbytes)
+{
+    Assert(nbytes < 8);
+    if (likely(nbytes != 0))
+    {
+        /* Get rid of bytes nbytes..7 */
+        uint64_t mask = ALL_ONES >> (CHAR_BIT * (8 - nbytes));
+        Assert(__builtin_popcountl(mask) / CHAR_BIT == nbytes);
+        uint64_t val = load64(ptr) & mask;
+        sum += val >> 32;
+        sum += (uint32_t) val;
+        nbytes = 0;
+    }
+    Assert(nbytes == 0);
+    return sum;
+}
+
+unsigned short
+__chksum_aarch64_simd(const void *ptr, unsigned int nbytes)
+{
+    bool swap = (uintptr_t) ptr & 1;
+    uint64_t sum;
+
+    if (unlikely(nbytes < 50))
+    {
+        sum = slurp_small(ptr, nbytes);
+        swap = false;
+        goto fold;
+    }
+
+    /* 8-byte align pointer */
+    Assert(nbytes >= 8);
+    sum = slurp_head64(&ptr, &nbytes);
+    Assert(((uintptr_t) ptr & 7) == 0);
+
+    const uint32_t *may_alias ptr32 = ptr;
+
+    uint64x2_t vsum0 = { 0, 0 };
+    uint64x2_t vsum1 = { 0, 0 };
+    uint64x2_t vsum2 = { 0, 0 };
+    uint64x2_t vsum3 = { 0, 0 };
+
+    /* Sum groups of 64 bytes */
+    for (uint32_t i = 0; i < nbytes / 64; i++)
+    {
+        uint32x4_t vtmp0 = vld1q_u32(ptr32);
+        uint32x4_t vtmp1 = vld1q_u32(ptr32 + 4);
+        uint32x4_t vtmp2 = vld1q_u32(ptr32 + 8);
+        uint32x4_t vtmp3 = vld1q_u32(ptr32 + 12);
+        vsum0 = vpadalq_u32(vsum0, vtmp0);
+        vsum1 = vpadalq_u32(vsum1, vtmp1);
+        vsum2 = vpadalq_u32(vsum2, vtmp2);
+        vsum3 = vpadalq_u32(vsum3, vtmp3);
+        ptr32 += 16;
+    }
+    nbytes %= 64;
+
+    /* Fold vsum2 and vsum3 into vsum0 and vsum1 */
+    vsum0 = vpadalq_u32(vsum0, vreinterpretq_u32_u64(vsum2));
+    vsum1 = vpadalq_u32(vsum1, vreinterpretq_u32_u64(vsum3));
+
+    /* Add any trailing group of 32 bytes */
+    if (nbytes & 32)
+    {
+        uint32x4_t vtmp0 = vld1q_u32(ptr32);
+        uint32x4_t vtmp1 = vld1q_u32(ptr32 + 4);
+        vsum0 = vpadalq_u32(vsum0, vtmp0);
+        vsum1 = vpadalq_u32(vsum1, vtmp1);
+        ptr32 += 8;
+        nbytes -= 32;
+    }
+    Assert(nbytes < 32);
+
+    /* Fold vsum1 into vsum0 */
+    vsum0 = vpadalq_u32(vsum0, vreinterpretq_u32_u64(vsum1));
+
+    /* Add any trailing group of 16 bytes */
+    if (nbytes & 16)
+    {
+        uint32x4_t vtmp = vld1q_u32(ptr32);
+        vsum0 = vpadalq_u32(vsum0, vtmp);
+        ptr32 += 4;
+        nbytes -= 16;
+    }
+    Assert(nbytes < 16);
+
+    /* Add any trailing group of 8 bytes */
+    if (nbytes & 8)
+    {
+        uint32x2_t vtmp = vld1_u32(ptr32);
+        vsum0 = vaddw_u32(vsum0, vtmp);
+        ptr32 += 2;
+        nbytes -= 8;
+    }
+    Assert(nbytes < 8);
+
+    uint64_t val = vaddlvq_u32(vreinterpretq_u32_u64(vsum0));
+    sum += val >> 32;
+    sum += (uint32_t) val;
+
+    /* Handle any trailing 0..7 bytes */
+    sum = slurp_tail64(sum, ptr32, nbytes);
+
+fold:
+    return fold_and_swap(sum, swap);
+}
diff --git a/networking/arm/chksum_simd.c b/networking/arm/chksum_simd.c
new file mode 100644
index 0000000..7f69adf
--- /dev/null
+++ b/networking/arm/chksum_simd.c
@@ -0,0 +1,149 @@
+/*
+ * Armv7-A specific checksum implementation using NEON
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "networking.h"
+#include "../chksum_common.h"
+
+#ifndef __ARM_NEON
+#pragma GCC target("+simd")
+#endif
+
+#include <arm_neon.h>
+
+unsigned short
+__chksum_arm_simd(const void *ptr, unsigned int nbytes)
+{
+    bool swap = (uintptr_t) ptr & 1;
+    uint64x1_t vsum = { 0 };
+
+    if (unlikely(nbytes < 40))
+    {
+        uint64_t sum = slurp_small(ptr, nbytes);
+        return fold_and_swap(sum, false);
+    }
+
+    /* 8-byte align pointer */
+    /* Inline slurp_head-like code since we use NEON here */
+    Assert(nbytes >= 8);
+    uint32_t off = (uintptr_t) ptr & 7;
+    if (likely(off != 0))
+    {
+        const uint64_t *may_alias ptr64 = align_ptr(ptr, 8);
+        uint64x1_t vword64 = vld1_u64(ptr64);
+        /* Get rid of bytes 0..off-1 */
+        uint64x1_t vmask = vdup_n_u64(ALL_ONES);
+        int64x1_t vshiftl = vdup_n_s64(CHAR_BIT * off);
+        vmask = vshl_u64(vmask, vshiftl);
+        vword64 = vand_u64(vword64, vmask);
+        uint32x2_t vtmp = vreinterpret_u32_u64(vword64);
+        /* Set accumulator */
+        vsum = vpaddl_u32(vtmp);
+        /* Update pointer and remaining size */
+        ptr = (char *) ptr64 + 8;
+        nbytes -= 8 - off;
+    }
+    Assert(((uintptr_t) ptr & 7) == 0);
+
+    /* Sum groups of 64 bytes */
+    uint64x2_t vsum0 = { 0, 0 };
+    uint64x2_t vsum1 = { 0, 0 };
+    uint64x2_t vsum2 = { 0, 0 };
+    uint64x2_t vsum3 = { 0, 0 };
+    const uint32_t *may_alias ptr32 = ptr;
+    for (uint32_t i = 0; i < nbytes / 64; i++)
+    {
+        uint32x4_t vtmp0 = vld1q_u32(ptr32);
+        uint32x4_t vtmp1 = vld1q_u32(ptr32 + 4);
+        uint32x4_t vtmp2 = vld1q_u32(ptr32 + 8);
+        uint32x4_t vtmp3 = vld1q_u32(ptr32 + 12);
+        vsum0 = vpadalq_u32(vsum0, vtmp0);
+        vsum1 = vpadalq_u32(vsum1, vtmp1);
+        vsum2 = vpadalq_u32(vsum2, vtmp2);
+        vsum3 = vpadalq_u32(vsum3, vtmp3);
+        ptr32 += 16;
+    }
+    nbytes %= 64;
+
+    /* Fold vsum1/vsum2/vsum3 into vsum0 */
+    vsum0 = vpadalq_u32(vsum0, vreinterpretq_u32_u64(vsum2));
+    vsum1 = vpadalq_u32(vsum1, vreinterpretq_u32_u64(vsum3));
+    vsum0 = vpadalq_u32(vsum0, vreinterpretq_u32_u64(vsum1));
+
+    /* Add any trailing 16-byte groups */
+    while (likely(nbytes >= 16))
+    {
+        uint32x4_t vtmp0 = vld1q_u32(ptr32);
+        vsum0 = vpadalq_u32(vsum0, vtmp0);
+        ptr32 += 4;
+        nbytes -= 16;
+    }
+    Assert(nbytes < 16);
+
+    /* Fold vsum0 into vsum */
+    {
+        /* 4xu32 (4x32b) -> 2xu64 (2x33b) */
+        vsum0 = vpaddlq_u32(vreinterpretq_u32_u64(vsum0));
+        /* 4xu32 (2x(1b+32b)) -> 2xu64 (2x(0b+32b)) */
+        vsum0 = vpaddlq_u32(vreinterpretq_u32_u64(vsum0));
+        Assert((vgetq_lane_u64(vsum0, 0) >> 32) == 0);
+        Assert((vgetq_lane_u64(vsum0, 1) >> 32) == 0);
+        uint32x2_t vtmp = vmovn_u64(vsum0);
+        /* Add to accumulator */
+        vsum = vpadal_u32(vsum, vtmp);
+    }
+
+    /* Add any trailing group of 8 bytes */
+    if (nbytes & 8)
+    {
+        uint32x2_t vtmp = vld1_u32(ptr32);
+        /* Add to accumulator */
+        vsum = vpadal_u32(vsum, vtmp);
+        ptr32 += 2;
+        nbytes -= 8;
+    }
+    Assert(nbytes < 8);
+
+    /* Handle any trailing 1..7 bytes */
+    if (likely(nbytes != 0))
+    {
+        Assert(((uintptr_t) ptr32 & 7) == 0);
+        Assert(nbytes < 8);
+        uint64x1_t vword64 = vld1_u64((const uint64_t *) ptr32);
+        /* Get rid of bytes nbytes..7 */
+        uint64x1_t vmask = vdup_n_u64(ALL_ONES);
+        int64x1_t vshiftr = vdup_n_s64(-CHAR_BIT * (8 - nbytes));
+        vmask = vshl_u64(vmask, vshiftr); /* Shift right */
+        vword64 = vand_u64(vword64, vmask);
+        /* Fold 64-bit sum to 33 bits */
+        vword64 = vpaddl_u32(vreinterpret_u32_u64(vword64));
+        /* Add to accumulator */
+        vsum = vpadal_u32(vsum, vreinterpret_u32_u64(vword64));
+    }
+
+    /* Fold 64-bit vsum to 32 bits */
+    vsum = vpaddl_u32(vreinterpret_u32_u64(vsum));
+    vsum = vpaddl_u32(vreinterpret_u32_u64(vsum));
+    Assert(vget_lane_u32(vreinterpret_u32_u64(vsum), 1) == 0);
+
+    /* Fold 32-bit vsum to 16 bits */
+    uint32x2_t vsum32 = vreinterpret_u32_u64(vsum);
+    vsum32 = vpaddl_u16(vreinterpret_u16_u32(vsum32));
+    vsum32 = vpaddl_u16(vreinterpret_u16_u32(vsum32));
+    Assert(vget_lane_u16(vreinterpret_u16_u32(vsum32), 1) == 0);
+    Assert(vget_lane_u16(vreinterpret_u16_u32(vsum32), 2) == 0);
+    Assert(vget_lane_u16(vreinterpret_u16_u32(vsum32), 3) == 0);
+
+    /* Convert to 16-bit scalar */
+    uint16_t sum = vget_lane_u16(vreinterpret_u16_u32(vsum32), 0);
+
+    if (unlikely(swap)) /* Odd base pointer is unexpected */
+    {
+        sum = bswap16(sum);
+    }
+    return sum;
+}
diff --git a/networking/chksum.c b/networking/chksum.c
new file mode 100644
index 0000000..95ce5ba
--- /dev/null
+++ b/networking/chksum.c
@@ -0,0 +1,81 @@
+/*
+ * Compute 16-bit sum in ones' complement arithmetic (with end-around carry).
+ * This sum is often used as a simple checksum in networking.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "networking.h"
+#include "chksum_common.h"
+
+always_inline
+static inline uint32_t
+slurp_head32(const void **pptr, uint32_t *nbytes)
+{
+    uint32_t sum = 0;
+    Assert(*nbytes >= 4);
+    uint32_t off = (uintptr_t) *pptr % 4;
+    if (likely(off != 0))
+    {
+        /* Get rid of bytes 0..off-1 */
+        const unsigned char *ptr32 = align_ptr(*pptr, 4);
+        uint32_t mask = ~0U << (CHAR_BIT * off);
+        sum = load32(ptr32) & mask;
+        *pptr = ptr32 + 4;
+        *nbytes -= 4 - off;
+    }
+    return sum;
+}
+
+/* Additional loop unrolling would help when not auto-vectorizing */
+unsigned short
+__chksum(const void *ptr, unsigned int nbytes)
+{
+    bool swap = false;
+    uint64_t sum = 0;
+
+    if (nbytes > 300)
+    {
+        /* 4-byte align pointer */
+        swap = (uintptr_t) ptr & 1;
+        sum = slurp_head32(&ptr, &nbytes);
+    }
+    /* Else benefit of aligning not worth the overhead */
+
+    /* Sum all 16-byte chunks */
+    const char *cptr = ptr;
+    for (uint32_t nquads = nbytes / 16; nquads != 0; nquads--)
+    {
+        uint64_t h0 = load32(cptr + 0);
+        uint64_t h1 = load32(cptr + 4);
+        uint64_t h2 = load32(cptr + 8);
+        uint64_t h3 = load32(cptr + 12);
+        sum += h0 + h1 + h2 + h3;
+        cptr += 16;
+    }
+    nbytes %= 16;
+    Assert(nbytes < 16);
+
+    /* Handle any trailing 4-byte chunks */
+    while (nbytes >= 4)
+    {
+        sum += load32(cptr);
+        cptr += 4;
+        nbytes -= 4;
+    }
+    Assert(nbytes < 4);
+
+    if (nbytes & 2)
+    {
+        sum += load16(cptr);
+        cptr += 2;
+    }
+
+    if (nbytes & 1)
+    {
+        sum += *(uint8_t *) cptr;
+    }
+
+    return fold_and_swap(sum, swap);
+}
diff --git a/networking/chksum_common.h b/networking/chksum_common.h
new file mode 100644
index 0000000..958c8cc
--- /dev/null
+++ b/networking/chksum_common.h
@@ -0,0 +1,132 @@
+/*
+ * Common code for checksum implementations
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef CHKSUM_COMMON_H
+#define CHKSUM_COMMON_H
+
+#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
+#error Only little endian supported
+#endif
+
+#include <limits.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+
+/* Assertions must be explicitly enabled */
+#if WANT_ASSERT
+#undef NDEBUG
+#include <assert.h>
+#define Assert(exp) assert(exp)
+#else
+#define Assert(exp) (void) (exp)
+#endif
+
+#ifdef __GNUC__
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#define may_alias __attribute__((__may_alias__))
+#define always_inline __attribute__((always_inline))
+#ifdef __clang__
+#define no_unroll_loops
+#else
+#define no_unroll_loops __attribute__((optimize("no-unroll-loops")))
+#endif
+#define bswap16(x) __builtin_bswap16((x))
+#else
+#define likely(x) (x)
+#define unlikely(x) (x)
+#define may_alias
+#define always_inline
+#define no_unroll_loops
+#define bswap16(x) ((uint8_t)((x) >> 8) | ((uint8_t)(x) << 8))
+#endif
+
+#define ALL_ONES ~UINT64_C(0)
+
+static inline
+uint64_t load64(const void *ptr)
+{
+    /* GCC will optimise this to a normal load instruction */
+    uint64_t v;
+    memcpy(&v, ptr, sizeof v);
+    return v;
+}
+
+static inline
+uint32_t load32(const void *ptr)
+{
+    /* GCC will optimise this to a normal load instruction */
+    uint32_t v;
+    memcpy(&v, ptr, sizeof v);
+    return v;
+}
+
+static inline
+uint16_t load16(const void *ptr)
+{
+    /* GCC will optimise this to a normal load instruction */
+    uint16_t v;
+    memcpy(&v, ptr, sizeof v);
+    return v;
+}
+
+/* slurp_small() is for small buffers, don't waste cycles on alignment */
+no_unroll_loops
+always_inline
+static inline uint64_t
+slurp_small(const void *ptr, uint32_t nbytes)
+{
+    const unsigned char *cptr = ptr;
+    uint64_t sum = 0;
+    while (nbytes >= 4)
+    {
+        sum += load32(cptr);
+        cptr += 4;
+        nbytes -= 4;
+    }
+    if (nbytes & 2)
+    {
+        sum += load16(cptr);
+        cptr += 2;
+    }
+    if (nbytes & 1)
+    {
+        sum += (uint8_t) *cptr;
+    }
+    return sum;
+}
+
+static inline const void *
+align_ptr(const void *ptr, size_t bytes)
+{
+    return (void *) ((uintptr_t) ptr & -(uintptr_t) bytes);
+}
+
+always_inline
+static inline uint16_t
+fold_and_swap(uint64_t sum, bool swap)
+{
+    /* Fold 64-bit sum to 32 bits */
+    sum = (sum & 0xffffffff) + (sum >> 32);
+    sum = (sum & 0xffffffff) + (sum >> 32);
+    Assert(sum == (uint32_t) sum);
+
+    /* Fold 32-bit sum to 16 bits */
+    sum = (sum & 0xffff) + (sum >> 16);
+    sum = (sum & 0xffff) + (sum >> 16);
+    Assert(sum == (uint16_t) sum);
+
+    if (unlikely(swap)) /* Odd base pointer is unexpected */
+    {
+        sum = bswap16(sum);
+    }
+
+    return (uint16_t) sum;
+}
+
+#endif
diff --git a/networking/include/networking.h b/networking/include/networking.h
new file mode 100644
index 0000000..a88feff
--- /dev/null
+++ b/networking/include/networking.h
@@ -0,0 +1,14 @@
+/*
+ * Public API.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+unsigned short __chksum (const void *, unsigned int);
+#if __aarch64__ && __ARM_NEON
+unsigned short __chksum_aarch64_simd (const void *, unsigned int);
+#endif
+#if __arm__ && __ARM_NEON
+unsigned short __chksum_arm_simd (const void *, unsigned int);
+#endif
diff --git a/networking/test/chksum.c b/networking/test/chksum.c
new file mode 100644
index 0000000..50722a4
--- /dev/null
+++ b/networking/test/chksum.c
@@ -0,0 +1,381 @@
+/*
+ * Ones' complement checksum test & benchmark
+ *
+ * Copyright 2016-2020 ARM Limited
+ * SPDX-License-Identifier: MIT
+ */
+
+#define _GNU_SOURCE
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include "../include/networking.h"
+
+#if WANT_ASSERT
+#undef NDEBUG
+#include <assert.h>
+#define Assert(exp) assert(exp)
+#else
+#define Assert(exp) (void) (exp)
+#endif
+
+#ifdef __GNUC__
+#define may_alias __attribute__((__may_alias__))
+#else
+#define may_alias
+#endif
+
+#define CACHE_LINE 64
+#define ALIGN(x, y) (((x) + (y) - 1) & ~((y) - 1))
+
+/* Reference implementation - do not modify! */
+static uint16_t
+checksum_simple(const void *ptr, uint32_t nbytes)
+{
+    const uint16_t *may_alias hptr = ptr;
+    uint64_t sum = 0; /* Need 64-bit accumulator when nbytes > 64K */
+
+    /* Sum all halfwords, assume misaligned accesses are handled in HW */
+    for (uint32_t nhalfs = nbytes >> 1; nhalfs != 0; nhalfs--)
+    {
+        sum += *hptr++;
+    }
+
+    /* Add any trailing odd byte */
+    if ((nbytes & 0x01) != 0)
+    {
+        sum += *(uint8_t *) hptr;
+    }
+
+    /* Fold 64-bit sum to 32 bits */
+    sum = (sum & 0xffffffff) + (sum >> 32);
+    sum = (sum & 0xffffffff) + (sum >> 32);
+    Assert(sum == (uint32_t) sum);
+
+    /* Fold 32-bit sum to 16 bits */
+    sum = (sum & 0xffff) + (sum >> 16);
+    sum = (sum & 0xffff) + (sum >> 16);
+    Assert(sum == (uint16_t) sum);
+
+    return (uint16_t) sum;
+}
+
+static struct
+{
+    uint16_t (*cksum_fp)(const void *, uint32_t);
+    const char *name;
+} implementations[] =
+{
+    { checksum_simple, "simple" },
+    { __chksum, "scalar" },
+#if __arm__
+    { __chksum_arm_simd, "simd" },
+#elif __aarch64__
+    { __chksum_aarch64_simd, "simd" },
+#endif
+    { NULL, NULL }
+};
+
+static int
+find_impl(const char *name)
+{
+    for (int i = 0; implementations[i].name != NULL; i++)
+    {
+        if (strcmp(implementations[i].name, name) == 0)
+        {
+            return i;
+        }
+    }
+    return -1;
+}
+
+static uint16_t (*CKSUM_FP)(const void *, uint32_t);
+static volatile uint16_t SINK;
+
+static bool
+verify(const void *data, uint32_t offset, uint32_t size)
+{
+    uint16_t csum_expected = checksum_simple(data, size);
+    uint16_t csum_actual = CKSUM_FP(data, size);
+    if (csum_actual != csum_expected)
+    {
+        fprintf(stderr, "\nInvalid checksum for offset %u size %u: "
+                "actual %04x expected %04x (valid)",
+                offset, size, csum_actual, csum_expected);
+        if (size < 65536)
+        {
+            /* Fatal error */
+            exit(EXIT_FAILURE);
+        }
+        /* Else some implementations only support sizes up to 2^16 */
+        return false;
+    }
+    return true;
+}
+
+static uint64_t
+clock_get_ns(void)
+{
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return ts.tv_sec * (uint64_t) 1000000000 + ts.tv_nsec;
+}
+
+static void
+benchmark(const uint8_t *base,
+          size_t poolsize,
+          uint32_t blksize,
+          uint32_t numops,
+          uint64_t cpufreq)
+{
+    printf("%11u ", (unsigned int) blksize); fflush(stdout);
+
+    uint64_t start = clock_get_ns();
+    for (uint32_t i = 0; i < numops; i++)
+    {
+        /* Read a random value from the pool */
+        uint32_t random = ((uint32_t *) base)[i % (poolsize / 4)];
+        /* Generate a random starting address */
+        const void *data = &base[random % (poolsize - blksize)];
+        SINK = CKSUM_FP(data, blksize);
+    }
+    uint64_t end = clock_get_ns();
+
+#define MEGABYTE 1000000 /* Decimal megabyte (MB) */
+    uint64_t elapsed_ns = end - start;
+    uint64_t elapsed_ms = elapsed_ns / 1000000;
+    uint32_t blks_per_s = (uint32_t) ((numops / elapsed_ms) * 1000);
+    uint64_t accbytes = (uint64_t) numops * blksize;
+    printf("%11ju ", (uintmax_t) ((accbytes / elapsed_ms) * 1000) / MEGABYTE);
+    unsigned int cyc_per_blk = cpufreq / blks_per_s;
+    printf("%11u ", cyc_per_blk);
+    if (blksize != 0)
+    {
+        unsigned int cyc_per_byte = 1000 * cyc_per_blk / blksize;
+        printf("%7u.%03u ",
+               cyc_per_byte / 1000, cyc_per_byte % 1000);
+    }
+    printf("\n");
+}
+
+int main(int argc, char *argv[])
+{
+    int c;
+    bool DUMP = false;
+    uint32_t IMPL = 0; /* Simple implementation */
+    uint64_t CPUFREQ = 0;
+    uint32_t BLKSIZE = 0;
+    uint32_t NUMOPS = 1000000;
+    uint32_t POOLSIZE = 512 * 1024; /* Typical ARM L2 cache size */
+
+    setvbuf(stdout, NULL, _IOLBF, 160);
+    while ((c = getopt(argc, argv, "b:df:i:n:p:")) != -1)
+    {
+        switch (c)
+        {
+            case 'b' :
+            {
+                int blksize = atoi(optarg);
+                if (blksize < 1 || blksize > POOLSIZE / 2)
+                {
+                    fprintf(stderr, "Invalid block size %d\n", blksize);
+                    exit(EXIT_FAILURE);
+                }
+                BLKSIZE = (unsigned) blksize;
+                break;
+            }
+            case 'd' :
+                DUMP = true;
+                break;
+            case 'f' :
+            {
+                int64_t cpufreq = atoll(optarg);
+                if (cpufreq < 1)
+                {
+                    fprintf(stderr, "Invalid CPU frequency %"PRId64"\n",
+                            cpufreq);
+                    exit(EXIT_FAILURE);
+                }
+                CPUFREQ = cpufreq;
+                break;
+            }
+            case 'i' :
+            {
+                int impl = find_impl(optarg);
+                if (impl < 0)
+                {
+                    fprintf(stderr, "Invalid implementation %s\n", optarg);
+                    goto usage;
+                }
+                IMPL = (unsigned) impl;
+                break;
+            }
+            case 'n' :
+            {
+                int numops = atoi(optarg);
+                if (numops < 1)
+                {
+                    fprintf(stderr, "Invalid number of operations %d\n", numops);
+                    exit(EXIT_FAILURE);
+                }
+                NUMOPS = (unsigned) numops;
+                break;
+            }
+            case 'p' :
+            {
+                int poolsize = atoi(optarg);
+                /* Apply any K/M suffix before validating the size */
+                char suffix = optarg[strlen(optarg) - 1];
+                if (suffix == 'M')
+                {
+                    POOLSIZE = (unsigned) poolsize * 1024 * 1024;
+                }
+                else if (suffix == 'K')
+                {
+                    POOLSIZE = (unsigned) poolsize * 1024;
+                }
+                else
+                {
+                    POOLSIZE = (unsigned) poolsize;
+                }
+                if (POOLSIZE < 4096)
+                {
+                    fprintf(stderr, "Invalid pool size %s\n", optarg);
+                    exit(EXIT_FAILURE);
+                }
+                break;
+            }
+            default :
+usage :
+                fprintf(stderr, "Usage: checksum <options>\n"
+                        "-b <size>   Block size\n"
+                        "-d          Dump first 96 bytes of data\n"
+                        "-f <freq>   CPU frequency (Hz)\n"
+                        "-i <impl>   Implementation\n"
+                        "-n <num>    Number of operations\n"
+                        "-p <size>   Pool size (K or M suffix)\n"
+                        );
+                fprintf(stderr, "Implementations:");
+                for (int i = 0; implementations[i].name != NULL; i++)
+                {
+                    fprintf(stderr, " %s", implementations[i].name);
+                }
+                fprintf(stderr, "\n");
+                exit(EXIT_FAILURE);
+        }
+    }
+    if (optind < argc)
+    {
+        goto usage;
+    }
+
+    CKSUM_FP = implementations[IMPL].cksum_fp;
+    POOLSIZE = ALIGN(POOLSIZE, CACHE_LINE);
+    uint8_t *base = mmap(0, POOLSIZE, PROT_READ|PROT_WRITE,
+                         MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+    if (base == MAP_FAILED)
+    {
+        perror("mmap"), exit(EXIT_FAILURE);
+    }
+    for (size_t i = 0; i < POOLSIZE / 4; i++)
+    {
+        ((uint32_t *) base)[i] = rand();
+    }
+
+    printf("Implementation: %s\n", implementations[IMPL].name);
+    printf("numops %u, poolsize ", NUMOPS);
+    if (POOLSIZE % (1024 * 1024) == 0)
+    {
+        printf("%uMiB", POOLSIZE / (1024 * 1024));
+    }
+    else if (POOLSIZE % 1024 == 0)
+    {
+        printf("%uKiB", POOLSIZE / 1024);
+    }
+    else
+    {
+        printf("%uB", POOLSIZE);
+    }
+    printf(", blocksize %u, CPU frequency %juMHz\n",
+           BLKSIZE, (uintmax_t) (CPUFREQ / 1000000));
+#if WANT_ASSERT
+    printf("Warning: assertions are enabled\n");
+#endif
+
+    if (DUMP)
+    {
+        /* Print out first 96 bytes of data for human debugging */
+        for (int i = 0; i < 96; i++)
+        {
+            if (i % 8 == 0)
+                printf("%2u:", i);
+            printf(" %02x", base[i]);
+            if (i % 8 == 7)
+                printf("\n");
+        }
+    }
+
+    /* Verify that chosen algorithm handles all combinations of offsets and sizes */
+    printf("Verifying..."); fflush(stdout);
+    bool success = true;
+    /* Check all (relevant) combinations of size and offset */
+    for (int size = 0; size <= 256; size++)
+    {
+        for (int offset = 0; offset < 255; offset++)
+        {
+            /* Check at start of mapped memory */
+            success &= verify(&base[offset], offset, size);
+            /* Check at end of mapped memory */
+            uint8_t *p = base + POOLSIZE - (size + offset);
+            success &= verify(p, (uintptr_t) p % 64, size);
+        }
+    }
+    /* Check increasingly larger sizes */
+    for (size_t size = 1; size < POOLSIZE; size *= 2)
+    {
+        success &= verify(base, 0, size);
+    }
+    /* Check the full size, this can detect accumulator overflows */
+    success &= verify(base, 0, POOLSIZE);
+    printf("%s\n", success ? "OK" : "failure");
+
+    /* Print throughput in decimal megabyte (1000000B) per second */
+    if (CPUFREQ != 0)
+    {
+        printf("%11s %11s %11s %11s\n",
+               "block size", "MB/s", "cycles/blk", "cycles/byte");
+    }
+    else
+    {
+        printf("%11s %11s %11s %11s\n",
+               "block size", "MB/s", "ns/blk", "ns/byte");
+        CPUFREQ = 1000000000;
+    }
+    if (BLKSIZE != 0)
+    {
+        benchmark(base, POOLSIZE, BLKSIZE, NUMOPS, CPUFREQ);
+    }
+    else
+    {
+        static const uint16_t sizes[] =
+            { 20, 42, 102, 250, 612, 1500, 3674, 9000, 0 };
+        for (int i = 0; sizes[i] != 0; i++)
+        {
+            /* Use 64-bit arithmetic, NUMOPS * 10000 overflows 32 bits */
+            uint32_t numops = (uint64_t) NUMOPS * 10000 / (40 + sizes[i]);
+            benchmark(base, POOLSIZE, sizes[i], numops, CPUFREQ);
+        }
+    }
+
+    if (munmap(base, POOLSIZE) != 0)
+    {
+        perror("munmap"), exit(EXIT_FAILURE);
+    }
+
+    return success ? EXIT_SUCCESS : EXIT_FAILURE;
+}
--
cgit v1.2.3
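
For context, the 16-bit ones' complement sum these routines compute is the basis of the Internet checksum (RFC 1071): a protocol header's checksum field holds the bitwise complement of the ones' complement sum of the header taken with that field zeroed. A minimal caller sketch follows, assuming networking.h is on the include path and the program links against the libnetworking built above; ipv4_set_checksum() and the hard-coded byte offset 10 of the IPv4 checksum field (per RFC 791) are illustrative only, and only __chksum() comes from this patch:

#include <stdint.h>
#include <string.h>
#include "networking.h"

/* Compute and store the IPv4 header checksum; ihl is the header
   length in 32-bit words (hypothetical helper, for illustration). */
static uint16_t
ipv4_set_checksum(unsigned char *hdr, unsigned int ihl)
{
    hdr[10] = hdr[11] = 0;                 /* zero the checksum field first */
    uint16_t sum = __chksum(hdr, ihl * 4); /* ones' complement sum, folded to 16 bits */
    uint16_t csum = (uint16_t) ~sum;       /* Internet checksum = complement of the sum */
    memcpy(&hdr[10], &csum, 2);            /* store in the byte order it was summed */
    return csum;
}

Storing the complemented result back in the same (native) byte order the halfwords were summed in yields the correct on-wire bytes; this byte-order independence of the ones' complement sum is also why fold_and_swap() only needs a single bswap16() when the buffer starts on an odd address.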