diff options
Diffstat (limited to 'memcpy-perf')
-rw-r--r-- | memcpy-perf/memcpy-perf.cpp | 121 |
1 file changed, 103 insertions, 18 deletions
diff --git a/memcpy-perf/memcpy-perf.cpp b/memcpy-perf/memcpy-perf.cpp index 20d060b8..2dfd900d 100644 --- a/memcpy-perf/memcpy-perf.cpp +++ b/memcpy-perf/memcpy-perf.cpp @@ -7,14 +7,20 @@ #include <memory> #include <cmath> #include <string> +#include <thread> + +#define CACHE_HIT_SIZE 1 << 17 using namespace std; -const size_t size_start = 64; -const size_t size_end = 16 * (1ull << 20); -const size_t samples = 2048; +size_t size_start = 64; +size_t size_end = 16 * (1ull << 20); +size_t samples = 2048; size_t size_per_test = 64 * (1ull << 20); size_t tot_sum = 0; +size_t delay = 0; +float speed = 0; +bool dummy = false; void __attribute__((noinline)) memcpy_noinline(void *dst, void *src, size_t size); void __attribute__((noinline)) memset_noinline(void *dst, int value, size_t size); @@ -26,21 +32,64 @@ enum BenchType { SumBench, }; +static void usage(char* p) { + printf("Usage: %s <test> <options>\n" + "<test> is one of the following:\n" + " --memcpy\n" + " --memset\n" + " --sum\n" + "<options> are optional and apply to all tests:\n" + " --dummy\n" + " Simulates cpu-only load of a test. Guaranteed to use L2\n" + " instead. 
Not supported on --sum test.\n" + " --delay DELAY_DIVISOR\n" + " --start START_SIZE_MB\n" + " --end END_SIZE_MB (requires start, optional)\n" + " --samples NUM_SAMPLES\n" + , p); +} + int main(int argc, char *argv[]) { - BenchType type; + BenchType type = MemcpyBench; if (argc <= 1) { - cerr << "memcpy_perf [--memcpy|--memset|--sum]" << endl; + usage(argv[0]); return 0; } - if (string(argv[1]) == string("--memcpy")) { - type = MemcpyBench; - } else if (string(argv[1]) == string("--memset")) { - type = MemsetBench; - } else if (string(argv[1]) == string("--sum")) { - type = SumBench; - } else { - type = MemcpyBench; + for (int i = 1; i < argc; i++) { + if (string(argv[i]) == string("--memcpy")) { + type = MemcpyBench; + } else if (string(argv[i]) == string("--memset")) { + type = MemsetBench; + } else if (string(argv[i]) == string("--sum")) { + type = SumBench; + } else if (string(argv[i]) == string("--dummy")) { + dummy = true; + } else if (i + 1 < argc) { + if (string(argv[i]) == string("--delay")) { + delay = atoi(argv[++i]); + } else if (string(argv[i]) == string("--start")) { + size_start = atoi(argv[++i]) * (1ull << 20); + size_end = size_start; + } else if (string(argv[i]) == string("--end")) { + size_t end = atoi(argv[++i]) * (1ull << 20); + if (end > size_start && i > 3 + && string(argv[i-3]) == string("--start")) { + size_end = end; + } else { + printf("Cannot specify --end without --start.\n"); + return 0; + } + } else if (string(argv[i]) == string("--samples")) { + samples = atoi(argv[++i]); + } else { + printf("Unknown argument %s\n", argv[i]); + return 0; + } + } else { + printf("The %s option requires a single argument.\n", argv[i]); + return 0; + } } unique_ptr<uint8_t[]> src(new uint8_t[size_end]); @@ -54,8 +103,10 @@ int main(int argc, char *argv[]) //cout << "src: " << (uintptr_t)src.get() << endl; //cout << "dst: " << (uintptr_t)dst.get() << endl; - for (double cur_pow = start_pow; cur_pow <= end_pow; cur_pow += pow_inc) { - 
chrono::time_point<chrono::high_resolution_clock> copy_start, copy_end; + for (double cur_pow = start_pow; cur_pow <= end_pow && samples > 0; + cur_pow += pow_inc) { + chrono::time_point<chrono::high_resolution_clock> + copy_start, copy_end, pre_wait; size_t cur_size = (size_t)pow(10.0, cur_pow); size_t iter_per_size = size_per_test / cur_size; @@ -65,9 +116,21 @@ int main(int argc, char *argv[]) case MemsetBench: { memcpy_noinline(src.get(), dst.get(), cur_size); memset_noinline(dst.get(), 0xdeadbeef, cur_size); + size_t hit_size = CACHE_HIT_SIZE; copy_start = chrono::high_resolution_clock::now(); for (int i = 0; i < iter_per_size; i++) { - memset_noinline(dst.get(), 0xdeadbeef, cur_size); + if (!dummy) { + memset_noinline(dst.get(), 0xdeadbeef, cur_size); + } else { + while (hit_size < cur_size) { + memset_noinline + (dst.get(), 0xdeadbeef, CACHE_HIT_SIZE); + hit_size += 1 << 17; + } + } + if (delay != 0) + this_thread::sleep_for(chrono + ::nanoseconds(size_per_test / delay)); } copy_end = chrono::high_resolution_clock::now(); break; @@ -75,9 +138,21 @@ int main(int argc, char *argv[]) case MemcpyBench: { memcpy_noinline(dst.get(), src.get(), cur_size); memcpy_noinline(src.get(), dst.get(), cur_size); + size_t hit_size = CACHE_HIT_SIZE; copy_start = chrono::high_resolution_clock::now(); for (int i = 0; i < iter_per_size; i++) { - memcpy_noinline(dst.get(), src.get(), cur_size); + if (!dummy) { + memcpy_noinline(dst.get(), src.get(), cur_size); + } else { + while (hit_size < cur_size) { + memcpy_noinline + (dst.get(), src.get(), CACHE_HIT_SIZE); + hit_size += CACHE_HIT_SIZE; + } + } + if (delay != 0) + this_thread::sleep_for(chrono + ::nanoseconds(size_per_test / delay)); } copy_end = chrono::high_resolution_clock::now(); break; @@ -88,6 +163,9 @@ int main(int argc, char *argv[]) copy_start = chrono::high_resolution_clock::now(); for (int i = 0; i < iter_per_size; i++) { s += sum(src.get(), cur_size); + if (delay != 0) + this_thread::sleep_for(chrono + 
::nanoseconds(size_per_test / delay)); } copy_end = chrono::high_resolution_clock::now(); tot_sum += s; @@ -95,11 +173,18 @@ int main(int argc, char *argv[]) } } + samples--; double ns_per_copy = chrono::duration_cast<chrono::nanoseconds>(copy_end - copy_start).count() / double(iter_per_size); double gb_per_sec = ((double)cur_size / (1ull<<30)) / (ns_per_copy / 1.0E9); if (type == MemcpyBench) gb_per_sec *= 2.0; - cout << "size: " << cur_size << ", perf: " << gb_per_sec << "GB/s, iter: " << iter_per_size << endl; + double percent_waiting = 0; + if (delay != 0) { + percent_waiting = (size_per_test / delay) / ns_per_copy * 100; + } + cout << "size: " << cur_size << ", perf: " << gb_per_sec + << "GB/s, iter: " << iter_per_size << ", \% time spent waiting: " + << percent_waiting << endl; } return 0; } |