/* * Ones' complement checksum test & benchmark * * Copyright (c) 2016-2020, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include "../include/networking.h" #if WANT_ASSERT #undef NDEBUG #include #define Assert(exp) assert(exp) #else #define Assert(exp) (void) (exp) #endif #ifdef __GNUC__ #define may_alias __attribute__((__may_alias__)) #else #define may_alias #endif #define CACHE_LINE 64 #define ALIGN(x, y) (((x) + (y) - 1) & ~((y) - 1)) /* Reference implementation - do not modify! */ static uint16_t checksum_simple(const void *ptr, uint32_t nbytes) { const uint16_t *may_alias hptr = ptr; uint64_t sum = 0;/* Need 64-bit accumulator when nbytes > 64K */ /* Sum all halfwords, assume misaligned accesses are handled in HW */ for (uint32_t nhalfs = nbytes >> 1; nhalfs != 0; nhalfs--) { sum += *hptr++; } /* Add any trailing odd byte */ if ((nbytes & 0x01) != 0) { sum += *(uint8_t *) hptr; } /* Fold 64-bit sum to 32 bits */ sum = (sum & 0xffffffff) + (sum >> 32); sum = (sum & 0xffffffff) + (sum >> 32); Assert(sum == (uint32_t) sum); /* Fold 32-bit sum to 16 bits */ sum = (sum & 0xffff) + (sum >> 16); sum = (sum & 0xffff) + (sum >> 16); Assert(sum == (uint16_t) sum); return (uint16_t) sum; } static struct { uint16_t (*cksum_fp)(const void *, uint32_t); const char *name; } implementations[] = { { checksum_simple, "simple"}, { __chksum, "scalar"}, #if __arm__ { __chksum_arm_simd, "simd" }, #elif __aarch64__ { __chksum_aarch64_simd, "simd" }, #endif { NULL, NULL} }; static int find_impl(const char *name) { for (int i = 0; implementations[i].name != NULL; i++) { if (strcmp(implementations[i].name, name) == 0) { return i; } } return -1; } static uint16_t (*CKSUM_FP)(const void *, uint32_t); static volatile uint16_t SINK; static bool verify(const void *data, uint32_t offset, uint32_t size) { uint16_t csum_expected = checksum_simple(data, size); uint16_t csum_actual = CKSUM_FP(data, size); if (csum_actual != csum_expected) { fprintf(stderr, "\nInvalid checksum for offset %u size %u: " "actual %04x expected %04x (valid)", offset, size, csum_actual, csum_expected); if (size < 65536) { /* Fatal error */ exit(EXIT_FAILURE); } /* Else some implementations only support sizes up to 2^16 */ return false; } return true; } static uint64_t clock_get_ns(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return ts.tv_sec * (uint64_t) 1000000000 + ts.tv_nsec; } static void benchmark(const uint8_t *base, size_t poolsize, uint32_t blksize, uint32_t numops, uint64_t cpufreq) { printf("%11u ", (unsigned int) blksize); fflush(stdout); uint64_t start = clock_get_ns(); for (uint32_t i = 0; i < numops; i ++) { /* Read a random value from the pool */ uint32_t random = ((uint32_t *) base)[i % (poolsize / 4)]; /* Generate a random starting address */ const void *data = &base[random % (poolsize - blksize)]; SINK = CKSUM_FP(data, blksize); } uint64_t end = clock_get_ns(); #define MEGABYTE 1000000 /* Decimal megabyte (MB) */ uint64_t elapsed_ns = end - start; uint64_t elapsed_ms = elapsed_ns / 1000000; uint32_t blks_per_s = (uint32_t) ((numops / elapsed_ms) * 1000); uint64_t accbytes = (uint64_t) numops * blksize; printf("%11ju ", (uintmax_t) ((accbytes / elapsed_ms) * 1000) / MEGABYTE); unsigned int cyc_per_blk = cpufreq / blks_per_s; printf("%11u ", cyc_per_blk); if (blksize != 0) { unsigned int cyc_per_byte = 1000 * cyc_per_blk / blksize; printf("%7u.%03u ", cyc_per_byte / 1000, cyc_per_byte % 1000); } printf("\n"); } int main(int argc, char *argv[]) { int c; bool DUMP = false; uint32_t IMPL = 0;/* Simple implementation */ uint64_t CPUFREQ = 0; uint32_t BLKSIZE = 0; uint32_t NUMOPS = 1000000; uint32_t POOLSIZE = 512 * 1024;/* Typical ARM L2 cache size */ setvbuf(stdout, NULL, _IOLBF, 160); while ((c = getopt(argc, argv, "b:df:i:n:p:")) != -1) { switch (c) { case 'b' : { int blksize = atoi(optarg); if (blksize < 1 || blksize > POOLSIZE / 2) { fprintf(stderr, "Invalid block size %d\n", blksize); exit(EXIT_FAILURE); } BLKSIZE = (unsigned) blksize; break; } case 'd' : DUMP = true; break; case 'f' : { int64_t cpufreq = atoll(optarg); if (cpufreq < 1) { fprintf(stderr, "Invalid CPU frequency %"PRId64"\n", cpufreq); exit(EXIT_FAILURE); } CPUFREQ = cpufreq; break; } case 'i' : { int impl = find_impl(optarg); if (impl < 0) { fprintf(stderr, "Invalid implementation %s\n", optarg); goto usage; } IMPL = (unsigned) impl; break; } case 'n' : { int numops = atoi(optarg); if (numops < 1) { fprintf(stderr, "Invalid number of operations %d\n", numops); exit(EXIT_FAILURE); } NUMOPS = (unsigned) numops; break; } case 'p' : { int poolsize = atoi(optarg); if (poolsize < 4096) { fprintf(stderr, "Invalid pool size %d\n", poolsize); exit(EXIT_FAILURE); } char c = optarg[strlen(optarg) - 1]; if (c == 'M') { POOLSIZE = (unsigned) poolsize * 1024 * 1024; } else if (c == 'K') { POOLSIZE = (unsigned) poolsize * 1024; } else { POOLSIZE = (unsigned) poolsize; } break; } default : usage : fprintf(stderr, "Usage: checksum \n" "-b Block size\n" "-d Dump first 96 bytes of data\n" "-f CPU frequency (Hz)\n" "-i Implementation\n" "-n Number of operations\n" "-p Pool size (K or M suffix)\n" ); printf("Implementations:"); for (int i = 0; implementations[i].name != NULL; i++) { printf(" %s", implementations[i].name); } printf("\n"); exit(EXIT_FAILURE); } } if (optind > argc) { goto usage; } CKSUM_FP = implementations[IMPL].cksum_fp; POOLSIZE = ALIGN(POOLSIZE, CACHE_LINE); uint8_t *base = mmap(0, POOLSIZE, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); if (base == MAP_FAILED) { perror("aligned_alloc"), exit(EXIT_FAILURE); } for (size_t i = 0; i < POOLSIZE / 4; i++) { ((uint32_t *) base)[i] = rand(); } printf("Implementation: %s\n", implementations[IMPL].name); printf("numops %u, poolsize ", NUMOPS); if (POOLSIZE % (1024 * 1024) == 0) { printf("%uMiB", POOLSIZE / (1024 * 1024)); } else if (POOLSIZE % 1024 == 0) { printf("%uKiB", POOLSIZE / 1024); } else { printf("%uB", POOLSIZE); } printf(", blocksize %u, CPU frequency %juMHz\n", BLKSIZE, (uintmax_t) (CPUFREQ / 1000000)); #if WANT_ASSERT printf("Warning: assertions are enabled\n"); #endif if (DUMP) { /* Print out first 96 bytes of data for human debugging */ for (int i = 0; i < 96; i++) { if (i % 8 == 0) printf("%2u:", i); printf(" %02x", base[i]); if (i % 8 == 7) printf("\n"); } } /* Verify that chosen algorithm handles all combinations of offsets and sizes */ printf("Verifying..."); fflush(stdout); bool success = true; /* Check all (relevant) combinations of size and offset */ for (int size = 0; size <= 256; size++) { for (int offset = 0; offset < 255; offset++) { /* Check at start of mapped memory */ success &= verify(&base[offset], offset, size); /* Check at end of mapped memory */ uint8_t *p = base + POOLSIZE - (size + offset); success &= verify(p, (uintptr_t) p % 64, size); } } /* Check increasingly larger sizes */ for (size_t size = 1; size < POOLSIZE; size *= 2) { success &= verify(base, 0, size); } /* Check the full size, this can detect accumulator overflows */ success &= verify(base, 0, POOLSIZE); printf("%s\n", success ? "OK" : "failure"); /* Print throughput in decimal megabyte (1000000B) per second */ if (CPUFREQ != 0) { printf("%11s %11s %11s %11s\n", "block size", "MB/s", "cycles/blk", "cycles/byte"); } else { printf("%11s %11s %11s %11s\n", "block size", "MB/s", "ns/blk", "ns/byte"); CPUFREQ = 1000000000; } if (BLKSIZE != 0) { benchmark(base, POOLSIZE, BLKSIZE, NUMOPS, CPUFREQ); } else { static const uint16_t sizes[] = { 20, 42, 102, 250, 612, 1500, 3674, 9000, 0 }; for (int i = 0; sizes[i] != 0; i++) { uint32_t numops = NUMOPS * 10000 / (40 + sizes[i]); benchmark(base, POOLSIZE, sizes[i], numops, CPUFREQ); } } if (munmap(base, POOLSIZE) != 0) { perror("munmap"), exit(EXIT_FAILURE); } return success ? EXIT_SUCCESS : EXIT_FAILURE; }