summary | refs | log | tree | commit | diff
path: root/memcpy-perf
diff options
context:
space:
mode:
Diffstat (limited to 'memcpy-perf')
-rw-r--r-- memcpy-perf/memcpy-perf.cpp 121
1 files changed, 103 insertions, 18 deletions
diff --git a/memcpy-perf/memcpy-perf.cpp b/memcpy-perf/memcpy-perf.cpp
index 20d060b8..2dfd900d 100644
--- a/memcpy-perf/memcpy-perf.cpp
+++ b/memcpy-perf/memcpy-perf.cpp
@@ -7,14 +7,20 @@
#include <memory>
#include <cmath>
#include <string>
+#include <thread>
+
+// Size in bytes (1 << 17 = 131072) of the chunk that the --dummy paths pass to
+// memset_noinline/memcpy_noinline in place of the full test size.
+// The replacement list is parenthesized so the macro stays correct when it is
+// expanded inside a larger expression (e.g. CACHE_HIT_SIZE * 2).
+#define CACHE_HIT_SIZE (1 << 17)
using namespace std;
-const size_t size_start = 64;
-const size_t size_end = 16 * (1ull << 20);
-const size_t samples = 2048;
+size_t size_start = 64;
+size_t size_end = 16 * (1ull << 20);
+size_t samples = 2048;
size_t size_per_test = 64 * (1ull << 20);
size_t tot_sum = 0;
+size_t delay = 0;
+float speed = 0;
+bool dummy = false;
void __attribute__((noinline)) memcpy_noinline(void *dst, void *src, size_t size);
void __attribute__((noinline)) memset_noinline(void *dst, int value, size_t size);
@@ -26,21 +32,64 @@ enum BenchType {
SumBench,
};
+// Prints the command-line help text to stdout.
+// p is inserted into the "Usage:" line as the program name.
+static void usage(char* p) {
+    // Benchmark selectors.
+    static const char test_help[] =
+        "<test> is one of the following:\n"
+        " --memcpy\n"
+        " --memset\n"
+        " --sum\n";
+    // Modifiers accepted alongside any selector.
+    static const char option_help[] =
+        "<options> are optional and apply to all tests:\n"
+        " --dummy\n"
+        " Simulates cpu-only load of a test. Guaranteed to use L2\n"
+        " instead. Not supported on --sum test.\n"
+        " --delay DELAY_DIVISOR\n"
+        " --start START_SIZE_MB\n"
+        " --end END_SIZE_MB (requires start, optional)\n"
+        " --samples NUM_SAMPLES\n";
+    printf("Usage: %s <test> <options>\n", p);
+    printf("%s%s", test_help, option_help);
+}
+
int main(int argc, char *argv[])
{
- BenchType type;
+ BenchType type = MemcpyBench;
if (argc <= 1) {
- cerr << "memcpy_perf [--memcpy|--memset|--sum]" << endl;
+ usage(argv[0]);
return 0;
}
- if (string(argv[1]) == string("--memcpy")) {
- type = MemcpyBench;
- } else if (string(argv[1]) == string("--memset")) {
- type = MemsetBench;
- } else if (string(argv[1]) == string("--sum")) {
- type = SumBench;
- } else {
- type = MemcpyBench;
+ for (int i = 1; i < argc; i++) {
+ if (string(argv[i]) == string("--memcpy")) {
+ type = MemcpyBench;
+ } else if (string(argv[i]) == string("--memset")) {
+ type = MemsetBench;
+ } else if (string(argv[i]) == string("--sum")) {
+ type = SumBench;
+ } else if (string(argv[i]) == string("--dummy")) {
+ dummy = true;
+ } else if (i + 1 < argc) {
+ if (string(argv[i]) == string("--delay")) {
+ delay = atoi(argv[++i]);
+ } else if (string(argv[i]) == string("--start")) {
+ size_start = atoi(argv[++i]) * (1ull << 20);
+ size_end = size_start;
+ } else if (string(argv[i]) == string("--end")) {
+ size_t end = atoi(argv[++i]) * (1ull << 20);
+ if (end > size_start && i > 3
+ && string(argv[i-3]) == string("--start")) {
+ size_end = end;
+ } else {
+ printf("Cannot specify --end without --start.\n");
+ return 0;
+ }
+ } else if (string(argv[i]) == string("--samples")) {
+ samples = atoi(argv[++i]);
+ } else {
+ printf("Unknown argument %s\n", argv[i]);
+ return 0;
+ }
+ } else {
+ printf("The %s option requires a single argument.\n", argv[i]);
+ return 0;
+ }
}
unique_ptr<uint8_t[]> src(new uint8_t[size_end]);
@@ -54,8 +103,10 @@ int main(int argc, char *argv[])
//cout << "src: " << (uintptr_t)src.get() << endl;
//cout << "dst: " << (uintptr_t)dst.get() << endl;
- for (double cur_pow = start_pow; cur_pow <= end_pow; cur_pow += pow_inc) {
- chrono::time_point<chrono::high_resolution_clock> copy_start, copy_end;
+ for (double cur_pow = start_pow; cur_pow <= end_pow && samples > 0;
+ cur_pow += pow_inc) {
+ chrono::time_point<chrono::high_resolution_clock>
+ copy_start, copy_end, pre_wait;
size_t cur_size = (size_t)pow(10.0, cur_pow);
size_t iter_per_size = size_per_test / cur_size;
@@ -65,9 +116,21 @@ int main(int argc, char *argv[])
case MemsetBench: {
memcpy_noinline(src.get(), dst.get(), cur_size);
memset_noinline(dst.get(), 0xdeadbeef, cur_size);
+ size_t hit_size = CACHE_HIT_SIZE;
copy_start = chrono::high_resolution_clock::now();
for (int i = 0; i < iter_per_size; i++) {
- memset_noinline(dst.get(), 0xdeadbeef, cur_size);
+ if (!dummy) {
+ memset_noinline(dst.get(), 0xdeadbeef, cur_size);
+ } else {
+ while (hit_size < cur_size) {
+ memset_noinline
+ (dst.get(), 0xdeadbeef, CACHE_HIT_SIZE);
+ hit_size += 1 << 17;
+ }
+ }
+ if (delay != 0)
+ this_thread::sleep_for(chrono
+ ::nanoseconds(size_per_test / delay));
}
copy_end = chrono::high_resolution_clock::now();
break;
@@ -75,9 +138,21 @@ int main(int argc, char *argv[])
case MemcpyBench: {
memcpy_noinline(dst.get(), src.get(), cur_size);
memcpy_noinline(src.get(), dst.get(), cur_size);
+ size_t hit_size = CACHE_HIT_SIZE;
copy_start = chrono::high_resolution_clock::now();
for (int i = 0; i < iter_per_size; i++) {
- memcpy_noinline(dst.get(), src.get(), cur_size);
+ if (!dummy) {
+ memcpy_noinline(dst.get(), src.get(), cur_size);
+ } else {
+ while (hit_size < cur_size) {
+ memcpy_noinline
+ (dst.get(), src.get(), CACHE_HIT_SIZE);
+ hit_size += CACHE_HIT_SIZE;
+ }
+ }
+ if (delay != 0)
+ this_thread::sleep_for(chrono
+ ::nanoseconds(size_per_test / delay));
}
copy_end = chrono::high_resolution_clock::now();
break;
@@ -88,6 +163,9 @@ int main(int argc, char *argv[])
copy_start = chrono::high_resolution_clock::now();
for (int i = 0; i < iter_per_size; i++) {
s += sum(src.get(), cur_size);
+ if (delay != 0)
+ this_thread::sleep_for(chrono
+ ::nanoseconds(size_per_test / delay));
}
copy_end = chrono::high_resolution_clock::now();
tot_sum += s;
@@ -95,11 +173,18 @@ int main(int argc, char *argv[])
}
}
+ samples--;
double ns_per_copy = chrono::duration_cast<chrono::nanoseconds>(copy_end - copy_start).count() / double(iter_per_size);
double gb_per_sec = ((double)cur_size / (1ull<<30)) / (ns_per_copy / 1.0E9);
if (type == MemcpyBench)
gb_per_sec *= 2.0;
- cout << "size: " << cur_size << ", perf: " << gb_per_sec << "GB/s, iter: " << iter_per_size << endl;
+ double percent_waiting = 0;
+ if (delay != 0) {
+ percent_waiting = (size_per_test / delay) / ns_per_copy * 100;
+ }
+ cout << "size: " << cur_size << ", perf: " << gb_per_sec
+ << "GB/s, iter: " << iter_per_size << ", \% time spent waiting: "
+ << percent_waiting << endl;
}
return 0;
}