diff options
Diffstat (limited to 'bench_mixers.c')
-rw-r--r-- | bench_mixers.c | 320 |
1 files changed, 264 insertions, 56 deletions
diff --git a/bench_mixers.c b/bench_mixers.c index df80818..5b22b3f 100644 --- a/bench_mixers.c +++ b/bench_mixers.c @@ -22,37 +22,102 @@ #endif #define BENCH_REF_TRIG_FUNC 1 -#define BENCH_OUT_OF_PLACE_ALGOS 1 +#define BENCH_OUT_OF_PLACE_ALGOS 0 #define BENCH_INPLACE_ALGOS 1 +#define SAVE_BY_DEFAULT 0 +#define SAVE_LIMIT_MSPS 16 + +#if 0 + #define BENCH_FILE_SHIFT_MATH_CC "/home/ayguen/WindowsDesktop/mixer_test/A_shift_math_cc.bin" + #define BENCH_FILE_ADD_FAST_CC "/home/ayguen/WindowsDesktop/mixer_test/C_shift_addfast_cc.bin" + #define BENCH_FILE_ADD_FAST_INP_C "/home/ayguen/WindowsDesktop/mixer_test/C_shift_addfast_inp_c.bin" + #define BENCH_FILE_UNROLL_INP_C "/home/ayguen/WindowsDesktop/mixer_test/D_shift_unroll_inp_c.bin" + #define BENCH_FILE_LTD_UNROLL_INP_C "/home/ayguen/WindowsDesktop/mixer_test/E_shift_limited_unroll_inp_c.bin" + #define BENCH_FILE_LTD_UNROLL_A_SSE_INP_C "/home/ayguen/WindowsDesktop/mixer_test/F_shift_limited_unroll_A_sse_inp_c.bin" + #define BENCH_FILE_LTD_UNROLL_B_SSE_INP_C "/home/ayguen/WindowsDesktop/mixer_test/G_shift_limited_unroll_B_sse_inp_c.bin" + #define BENCH_FILE_LTD_UNROLL_C_SSE_INP_C "/home/ayguen/WindowsDesktop/mixer_test/H_shift_limited_unroll_C_sse_inp_c.bin" + #define BENCH_FILE_REC_OSC_CC "" + #define BENCH_FILE_REC_OSC_INP_C "/home/ayguen/WindowsDesktop/mixer_test/I_shift_recursive_osc_inp_c.bin" + #define BENCH_FILE_REC_OSC_SSE_INP_C "/home/ayguen/WindowsDesktop/mixer_test/J_shift_recursive_osc_sse_inp_c.bin" +#else + #define BENCH_FILE_SHIFT_MATH_CC "" + #define BENCH_FILE_ADD_FAST_CC "" + #define BENCH_FILE_ADD_FAST_INP_C "" + #define BENCH_FILE_UNROLL_INP_C "" + #define BENCH_FILE_LTD_UNROLL_INP_C "" + #define BENCH_FILE_LTD_UNROLL_A_SSE_INP_C "" + #define BENCH_FILE_LTD_UNROLL_B_SSE_INP_C "" + #define BENCH_FILE_LTD_UNROLL_C_SSE_INP_C "" + #define BENCH_FILE_REC_OSC_CC "" + #define BENCH_FILE_REC_OSC_INP_C "" + #define BENCH_FILE_REC_OSC_SSE_INP_C "" +#endif + + #if defined(HAVE_SYS_TIMES) static double ttclk = 0.; - static double uclock_sec(void) + static double uclock_sec(int find_start) { - struct tms t; + struct tms t0, t; if (ttclk == 0.) + { ttclk = sysconf(_SC_CLK_TCK); + fprintf(stderr, "sysconf(_SC_CLK_TCK) => %f\n", ttclk); + } times(&t); + if (find_start) + { + t0 = t; + while (t0.tms_utime == t.tms_utime) + times(&t); + } /* use only the user time of this process - not realtime, which depends on OS-scheduler .. */ return ((double)t.tms_utime) / ttclk; } -# else - double uclock_sec(void) + +#elif 0 + // https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-getprocesstimes + double uclock_sec(int find_start) + { + FILETIME a, b, c, d; + if (GetProcessTimes(GetCurrentProcess(), &a, &b, &c, &d) != 0) + { + // Returns total user time. + // Can be tweaked to include kernel times as well. + return + (double)(d.dwLowDateTime | + ((unsigned long long)d.dwHighDateTime << 32)) * 0.0000001; + } + else { + // Handle error + return 0; + } + } + +#else + double uclock_sec(int find_start) { return (double)clock()/(double)CLOCKS_PER_SEC; } #endif void save(complexf * d, int B, int N, const char * fn) { - if (!fn) + if (!fn || !fn[0]) + { + if (! SAVE_BY_DEFAULT) + return; fn = "/dev/shm/bench.bin"; + } FILE * f = fopen(fn, "wb"); if (!f) { fprintf(stderr, "error writing result to %s\n", fn); return; } + if ( N >= SAVE_LIMIT_MSPS * 1024 * 1024 ) + N = SAVE_LIMIT_MSPS * 1024 * 1024; for (int off = 0; off + B <= N; off += B) { fwrite(d+off, sizeof(complexf), B, f); @@ -75,21 +140,22 @@ double bench_shift_math_cc(int B, int N) { iter = 0; off = 0; - t0 = uclock_sec(); + t0 = uclock_sec(1); tstop = t0 + 0.5; /* benchmark duration: 500 ms */ do { // work phase = shift_math_cc(input+off, output+off, B, -0.0009F, phase); off += B; ++iter; - t1 = uclock_sec(); + t1 = uclock_sec(0); } while ( t1 < tstop && off + B < N ); - save(output, B, off, NULL); + save(output, B, off, BENCH_FILE_SHIFT_MATH_CC); + free(input); free(output); - printf("processed %f Msamples\n", off * 1E-6); T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ return (nI / T); /* normalized iterations per second */ } @@ -112,7 +178,7 @@ double bench_shift_table_cc(int B, int N) { iter = 0; off = 0; - t0 = uclock_sec(); + t0 = uclock_sec(1); tstop = t0 + 0.5; /* benchmark duration: 500 ms */ do { // work @@ -120,14 +186,14 @@ double bench_shift_table_cc(int B, int N) { off += B; ++iter; - t1 = uclock_sec(); + t1 = uclock_sec(0); } while ( t1 < tstop && off + B < N ); save(output, B, off, NULL); free(input); free(output); - printf("processed %f Msamples\n", off * 1E-6); T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ return (nI / T); /* normalized iterations per second */ } @@ -136,7 +202,6 @@ double bench_shift_table_cc(int B, int N) { double bench_shift_addfast(int B, int N) { double t0, t1, tstop, T, nI; int iter, off; - int table_size=65536; float phase = 0.0F; complexf *input = (complexf *)malloc(N * sizeof(complexf)); complexf *output = (complexf *)malloc(N * sizeof(complexf)); @@ -149,7 +214,7 @@ double bench_shift_addfast(int B, int N) { iter = 0; off = 0; - t0 = uclock_sec(); + t0 = uclock_sec(1); tstop = t0 + 0.5; /* benchmark duration: 500 ms */ do { // work @@ -157,14 +222,49 @@ double bench_shift_addfast(int B, int N) { off += B; ++iter; - t1 = uclock_sec(); + t1 = uclock_sec(0); } while ( t1 < tstop && off + B < N ); - save(output, B, off, NULL); + save(output, B, off, BENCH_FILE_ADD_FAST_CC); + free(input); free(output); - printf("processed %f Msamples\n", off * 1E-6); T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); + nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ + return (nI / T); /* normalized iterations per second */ +} + +double bench_shift_addfast_inp(int B, int N) { + double t0, t1, tstop, T, nI; + int iter, off; + float phase = 0.0F; + complexf *input = (complexf *)malloc(N * sizeof(complexf)); + shift_recursive_osc_t gen_state; + shift_recursive_osc_conf_t gen_conf; + shift_addfast_data_t state = shift_addfast_init(-0.0009F); + + shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); + gen_recursive_osc_c(input, N, &gen_conf, &gen_state); + + iter = 0; + off = 0; + t0 = uclock_sec(1); + tstop = t0 + 0.5; /* benchmark duration: 500 ms */ + do { + // work + phase = shift_addfast_inp_c(input+off, B, &state, phase); + + off += B; + ++iter; + t1 = uclock_sec(0); + } while ( t1 < tstop && off + B < N ); + + save(input, B, off, BENCH_FILE_ADD_FAST_INP_C); + + free(input); + T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ return (nI / T); /* normalized iterations per second */ } @@ -185,7 +285,7 @@ double bench_shift_unroll_oop(int B, int N) { iter = 0; off = 0; - t0 = uclock_sec(); + t0 = uclock_sec(1); tstop = t0 + 0.5; /* benchmark duration: 500 ms */ do { // work @@ -193,14 +293,14 @@ double bench_shift_unroll_oop(int B, int N) { off += B; ++iter; - t1 = uclock_sec(); + t1 = uclock_sec(0); } while ( t1 < tstop && off + B < N ); save(output, B, off, NULL); free(input); free(output); - printf("processed %f Msamples\n", off * 1E-6); T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ return (nI / T); /* normalized iterations per second */ } @@ -219,7 +319,7 @@ double bench_shift_unroll_inp(int B, int N) { iter = 0; off = 0; - t0 = uclock_sec(); + t0 = uclock_sec(1); tstop = t0 + 0.5; /* benchmark duration: 500 ms */ do { // work @@ -227,13 +327,14 @@ double bench_shift_unroll_inp(int B, int N) { off += B; ++iter; - t1 = uclock_sec(); + t1 = uclock_sec(0); } while ( t1 < tstop && off + B < N ); - save(input, B, off, NULL); + save(input, B, off, BENCH_FILE_UNROLL_INP_C); + free(input); - printf("processed %f Msamples\n", off * 1E-6); T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ return (nI / T); /* normalized iterations per second */ } @@ -254,7 +355,7 @@ double bench_shift_limited_unroll_oop(int B, int N) { iter = 0; off = 0; - t0 = uclock_sec(); + t0 = uclock_sec(1); tstop = t0 + 0.5; /* benchmark duration: 500 ms */ do { // work @@ -262,14 +363,14 @@ double bench_shift_limited_unroll_oop(int B, int N) { off += B; ++iter; - t1 = uclock_sec(); + t1 = uclock_sec(0); } while ( t1 < tstop && off + B < N ); save(output, B, off, NULL); free(input); free(output); - printf("processed %f Msamples\n", off * 1E-6); T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ return (nI / T); /* normalized iterations per second */ } @@ -288,7 +389,7 @@ double bench_shift_limited_unroll_inp(int B, int N) { iter = 0; off = 0; - t0 = uclock_sec(); + t0 = uclock_sec(1); tstop = t0 + 0.5; /* benchmark duration: 500 ms */ do { // work @@ -296,48 +397,121 @@ double bench_shift_limited_unroll_inp(int B, int N) { off += B; ++iter; - t1 = uclock_sec(); + t1 = uclock_sec(0); } while ( t1 < tstop && off + B < N ); - save(input, B, off, NULL); + save(input, B, off, BENCH_FILE_LTD_UNROLL_INP_C); + free(input); - printf("processed %f Msamples\n", off * 1E-6); T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ return (nI / T); /* normalized iterations per second */ } -double bench_shift_limited_unroll_sse_inp(int B, int N) { +double bench_shift_limited_unroll_A_sse_inp(int B, int N) { double t0, t1, tstop, T, nI; int iter, off; complexf *input = (complexf *)malloc(N * sizeof(complexf)); shift_recursive_osc_t gen_state; shift_recursive_osc_conf_t gen_conf; - shift_limited_unroll_sse_data_t *state = malloc(sizeof(shift_limited_unroll_sse_data_t)); + shift_limited_unroll_A_sse_data_t *state = malloc(sizeof(shift_limited_unroll_A_sse_data_t)); - *state = shift_limited_unroll_sse_init(-0.0009F, 0.0F); + *state = shift_limited_unroll_A_sse_init(-0.0009F, 0.0F); shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); gen_recursive_osc_c(input, N, &gen_conf, &gen_state); iter = 0; off = 0; - t0 = uclock_sec(); + t0 = uclock_sec(1); tstop = t0 + 0.5; /* benchmark duration: 500 ms */ do { // work - shift_limited_unroll_sse_inp_c(input+off, B, state); + shift_limited_unroll_A_sse_inp_c(input+off, B, state); off += B; ++iter; - t1 = uclock_sec(); + t1 = uclock_sec(0); } while ( t1 < tstop && off + B < N ); - save(input, B, off, NULL); + save(input, B, off, BENCH_FILE_LTD_UNROLL_A_SSE_INP_C); + free(input); - printf("processed %f Msamples\n", off * 1E-6); T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); + nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ + return (nI / T); /* normalized iterations per second */ +} + +double bench_shift_limited_unroll_B_sse_inp(int B, int N) { + double t0, t1, tstop, T, nI; + int iter, off; + complexf *input = (complexf *)malloc(N * sizeof(complexf)); + shift_recursive_osc_t gen_state; + shift_recursive_osc_conf_t gen_conf; + shift_limited_unroll_B_sse_data_t *state = malloc(sizeof(shift_limited_unroll_B_sse_data_t)); + + *state = shift_limited_unroll_B_sse_init(-0.0009F, 0.0F); + + shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); + //shift_recursive_osc_init(0.0F, 0.0F, &gen_conf, &gen_state); + gen_recursive_osc_c(input, N, &gen_conf, &gen_state); + + iter = 0; + off = 0; + t0 = uclock_sec(1); + tstop = t0 + 0.5; /* benchmark duration: 500 ms */ + do { + // work + shift_limited_unroll_B_sse_inp_c(input+off, B, state); + + off += B; + ++iter; + t1 = uclock_sec(0); + } while ( t1 < tstop && off + B < N ); + + save(input, B, off, BENCH_FILE_LTD_UNROLL_B_SSE_INP_C); + + free(input); + T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); + nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ + return (nI / T); /* normalized iterations per second */ +} + +double bench_shift_limited_unroll_C_sse_inp(int B, int N) { + double t0, t1, tstop, T, nI; + int iter, off; + complexf *input = (complexf *)malloc(N * sizeof(complexf)); + shift_recursive_osc_t gen_state; + shift_recursive_osc_conf_t gen_conf; + shift_limited_unroll_C_sse_data_t *state = malloc(sizeof(shift_limited_unroll_C_sse_data_t)); + + *state = shift_limited_unroll_C_sse_init(-0.0009F, 0.0F); + + shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); + gen_recursive_osc_c(input, N, &gen_conf, &gen_state); + + iter = 0; + off = 0; + t0 = uclock_sec(1); + tstop = t0 + 0.5; /* benchmark duration: 500 ms */ + do { + // work + shift_limited_unroll_C_sse_inp_c(input+off, B, state); + + off += B; + ++iter; + t1 = uclock_sec(0); + } while ( t1 < tstop && off + B < N ); + + save(input, B, off, BENCH_FILE_LTD_UNROLL_C_SSE_INP_C); + + free(input); + T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ return (nI / T); /* normalized iterations per second */ } @@ -358,7 +532,7 @@ double bench_shift_rec_osc_cc_oop(int B, int N) { iter = 0; off = 0; - t0 = uclock_sec(); + t0 = uclock_sec(1); tstop = t0 + 0.5; /* benchmark duration: 500 ms */ do { // work @@ -366,15 +540,16 @@ double bench_shift_rec_osc_cc_oop(int B, int N) { off += B; ++iter; - t1 = uclock_sec(); + t1 = uclock_sec(0); } while ( t1 < tstop && off + B < N ); - //save(input, B, off, "/dev/shm/bench_inp.bin"); + save(input, B, off, BENCH_FILE_REC_OSC_CC); + save(output, B, off, NULL); free(input); free(output); - printf("processed %f Msamples\n", off * 1E-6); T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ return (nI / T); /* normalized iterations per second */ } @@ -394,7 +569,7 @@ double bench_shift_rec_osc_cc_inp(int B, int N) { iter = 0; off = 0; - t0 = uclock_sec(); + t0 = uclock_sec(1); tstop = t0 + 0.5; /* benchmark duration: 500 ms */ do { // work @@ -402,13 +577,13 @@ double bench_shift_rec_osc_cc_inp(int B, int N) { off += B; ++iter; - t1 = uclock_sec(); + t1 = uclock_sec(0); } while ( t1 < tstop && off + B < N ); - save(input, B, off, NULL); + save(input, B, off, BENCH_FILE_REC_OSC_INP_C); free(input); - printf("processed %f Msamples\n", off * 1E-6); T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ return (nI / T); /* normalized iterations per second */ } @@ -432,7 +607,7 @@ double bench_shift_rec_osc_sse_c_inp(int B, int N) { iter = 0; off = 0; - t0 = uclock_sec(); + t0 = uclock_sec(1); tstop = t0 + 0.5; /* benchmark duration: 500 ms */ do { // work @@ -440,13 +615,13 @@ double bench_shift_rec_osc_sse_c_inp(int B, int N) { off += B; ++iter; - t1 = uclock_sec(); + t1 = uclock_sec(0); } while ( t1 < tstop && off + B < N ); - save(input, B, off, NULL); + save(input, B, off, BENCH_FILE_REC_OSC_SSE_INP_C); free(input); - printf("processed %f Msamples\n", off * 1E-6); T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ return (nI / T); /* normalized iterations per second */ } @@ -460,6 +635,26 @@ int main(int argc, char **argv) // process up to 64 MSample (512 MByte) in blocks of 8 kSamples (=64 kByte) int B = 8 * 1024; int N = 64 * 1024 * 1024; + int showUsage = 0; + + if (argc == 1) + showUsage = 1; + + if (1 < argc) + B = atoi(argv[1]); + if (2 < argc) + N = atoi(argv[2]) * 1024 * 1024; + + if ( !B || !N || showUsage ) + { + fprintf(stderr, "%s [<blockLength in samples> [<total # of MSamples>] ]\n", argv[0]); + if ( !B || !N ) + return 0; + } + + fprintf(stderr, "processing up to N = %d MSamples with blocke length of %d samples\n", + N / (1024 * 1024), B ); + #if BENCH_REF_TRIG_FUNC printf("\nstarting bench of shift_math_cc (out-of-place) with trig functions ..\n"); @@ -490,18 +685,31 @@ int main(int argc, char **argv) #endif #if BENCH_INPLACE_ALGOS - printf("starting bench of shift_unroll_cc in-place ..\n"); + + printf("starting bench of shift_addfast_inp_c in-place ..\n"); + rt = bench_shift_addfast_inp(B, N); + printf(" %f MSamples/sec\n\n", rt * 1E-6); + + printf("starting bench of shift_unroll_inp_c in-place ..\n"); rt = bench_shift_unroll_inp(B, N); printf(" %f MSamples/sec\n\n", rt * 1E-6); - printf("starting bench of shift_limited_unroll_cc in-place ..\n"); + printf("starting bench of shift_limited_unroll_inp_c in-place ..\n"); rt = bench_shift_limited_unroll_inp(B, N); printf(" %f MSamples/sec\n\n", rt * 1E-6); if ( have_sse_shift_mixer_impl() ) { - printf("starting bench of shift_limited_unroll_sse_inp_c in-place ..\n"); - rt = bench_shift_limited_unroll_sse_inp(B, N); + printf("starting bench of shift_limited_unroll_A_sse_inp_c in-place ..\n"); + rt = bench_shift_limited_unroll_A_sse_inp(B, N); + printf(" %f MSamples/sec\n\n", rt * 1E-6); + + printf("starting bench of shift_limited_unroll_B_sse_inp_c in-place ..\n"); + rt = bench_shift_limited_unroll_B_sse_inp(B, N); + printf(" %f MSamples/sec\n\n", rt * 1E-6); + + printf("starting bench of shift_limited_unroll_C_sse_inp_c in-place ..\n"); + rt = bench_shift_limited_unroll_C_sse_inp(B, N); printf(" %f MSamples/sec\n\n", rt * 1E-6); } |