aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorhayati ayguen <h_ayguen@web.de>2020-11-14 15:12:19 +0100
committerhayati ayguen <h_ayguen@web.de>2020-11-14 15:12:50 +0100
commitd64eea5c6abe2c81f37b4e5932d15276fe85d09a (patch)
tree53fd06bbe82afe8da2107e35188cfd9db7359f81
parent2587d835de992cabfb653bfed9db80e499df4449 (diff)
downloadpffft-d64eea5c6abe2c81f37b4e5932d15276fe85d09a.tar.gz
added mixer algorithm variants, cleaned up code, enabled unaligned arrays
-rw-r--r--bench_mixers.c320
-rw-r--r--pf_mixer.cpp726
-rw-r--r--pf_mixer.h142
3 files changed, 930 insertions, 258 deletions
diff --git a/bench_mixers.c b/bench_mixers.c
index df80818..5b22b3f 100644
--- a/bench_mixers.c
+++ b/bench_mixers.c
@@ -22,37 +22,102 @@
#endif
#define BENCH_REF_TRIG_FUNC 1
-#define BENCH_OUT_OF_PLACE_ALGOS 1
+#define BENCH_OUT_OF_PLACE_ALGOS 0
#define BENCH_INPLACE_ALGOS 1
+#define SAVE_BY_DEFAULT 0
+#define SAVE_LIMIT_MSPS 16
+
+#if 0
+ #define BENCH_FILE_SHIFT_MATH_CC "/home/ayguen/WindowsDesktop/mixer_test/A_shift_math_cc.bin"
+ #define BENCH_FILE_ADD_FAST_CC "/home/ayguen/WindowsDesktop/mixer_test/C_shift_addfast_cc.bin"
+ #define BENCH_FILE_ADD_FAST_INP_C "/home/ayguen/WindowsDesktop/mixer_test/C_shift_addfast_inp_c.bin"
+ #define BENCH_FILE_UNROLL_INP_C "/home/ayguen/WindowsDesktop/mixer_test/D_shift_unroll_inp_c.bin"
+ #define BENCH_FILE_LTD_UNROLL_INP_C "/home/ayguen/WindowsDesktop/mixer_test/E_shift_limited_unroll_inp_c.bin"
+ #define BENCH_FILE_LTD_UNROLL_A_SSE_INP_C "/home/ayguen/WindowsDesktop/mixer_test/F_shift_limited_unroll_A_sse_inp_c.bin"
+ #define BENCH_FILE_LTD_UNROLL_B_SSE_INP_C "/home/ayguen/WindowsDesktop/mixer_test/G_shift_limited_unroll_B_sse_inp_c.bin"
+ #define BENCH_FILE_LTD_UNROLL_C_SSE_INP_C "/home/ayguen/WindowsDesktop/mixer_test/H_shift_limited_unroll_C_sse_inp_c.bin"
+ #define BENCH_FILE_REC_OSC_CC ""
+ #define BENCH_FILE_REC_OSC_INP_C "/home/ayguen/WindowsDesktop/mixer_test/I_shift_recursive_osc_inp_c.bin"
+ #define BENCH_FILE_REC_OSC_SSE_INP_C "/home/ayguen/WindowsDesktop/mixer_test/J_shift_recursive_osc_sse_inp_c.bin"
+#else
+ #define BENCH_FILE_SHIFT_MATH_CC ""
+ #define BENCH_FILE_ADD_FAST_CC ""
+ #define BENCH_FILE_ADD_FAST_INP_C ""
+ #define BENCH_FILE_UNROLL_INP_C ""
+ #define BENCH_FILE_LTD_UNROLL_INP_C ""
+ #define BENCH_FILE_LTD_UNROLL_A_SSE_INP_C ""
+ #define BENCH_FILE_LTD_UNROLL_B_SSE_INP_C ""
+ #define BENCH_FILE_LTD_UNROLL_C_SSE_INP_C ""
+ #define BENCH_FILE_REC_OSC_CC ""
+ #define BENCH_FILE_REC_OSC_INP_C ""
+ #define BENCH_FILE_REC_OSC_SSE_INP_C ""
+#endif
+
+
#if defined(HAVE_SYS_TIMES)
static double ttclk = 0.;
- static double uclock_sec(void)
+ static double uclock_sec(int find_start)
{
- struct tms t;
+ struct tms t0, t;
if (ttclk == 0.)
+ {
ttclk = sysconf(_SC_CLK_TCK);
+ fprintf(stderr, "sysconf(_SC_CLK_TCK) => %f\n", ttclk);
+ }
times(&t);
+ if (find_start)
+ {
+ t0 = t;
+ while (t0.tms_utime == t.tms_utime)
+ times(&t);
+ }
/* use only the user time of this process - not realtime, which depends on OS-scheduler .. */
return ((double)t.tms_utime) / ttclk;
}
-# else
- double uclock_sec(void)
+
+#elif 0
+ // https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-getprocesstimes
+ double uclock_sec(int find_start)
+ {
+ FILETIME a, b, c, d;
+ if (GetProcessTimes(GetCurrentProcess(), &a, &b, &c, &d) != 0)
+ {
+ // Returns total user time.
+ // Can be tweaked to include kernel times as well.
+ return
+ (double)(d.dwLowDateTime |
+ ((unsigned long long)d.dwHighDateTime << 32)) * 0.0000001;
+ }
+ else {
+ // Handle error
+ return 0;
+ }
+ }
+
+#else
+ double uclock_sec(int find_start)
{ return (double)clock()/(double)CLOCKS_PER_SEC; }
#endif
void save(complexf * d, int B, int N, const char * fn)
{
- if (!fn)
+ if (!fn || !fn[0])
+ {
+ if (! SAVE_BY_DEFAULT)
+ return;
fn = "/dev/shm/bench.bin";
+ }
FILE * f = fopen(fn, "wb");
if (!f) {
fprintf(stderr, "error writing result to %s\n", fn);
return;
}
+ if ( N >= SAVE_LIMIT_MSPS * 1024 * 1024 )
+ N = SAVE_LIMIT_MSPS * 1024 * 1024;
for (int off = 0; off + B <= N; off += B)
{
fwrite(d+off, sizeof(complexf), B, f);
@@ -75,21 +140,22 @@ double bench_shift_math_cc(int B, int N) {
iter = 0;
off = 0;
- t0 = uclock_sec();
+ t0 = uclock_sec(1);
tstop = t0 + 0.5; /* benchmark duration: 500 ms */
do {
// work
phase = shift_math_cc(input+off, output+off, B, -0.0009F, phase);
off += B;
++iter;
- t1 = uclock_sec();
+ t1 = uclock_sec(0);
} while ( t1 < tstop && off + B < N );
- save(output, B, off, NULL);
+ save(output, B, off, BENCH_FILE_SHIFT_MATH_CC);
+
free(input);
free(output);
- printf("processed %f Msamples\n", off * 1E-6);
T = ( t1 - t0 ); /* duration per fft() */
+ printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
return (nI / T); /* normalized iterations per second */
}
@@ -112,7 +178,7 @@ double bench_shift_table_cc(int B, int N) {
iter = 0;
off = 0;
- t0 = uclock_sec();
+ t0 = uclock_sec(1);
tstop = t0 + 0.5; /* benchmark duration: 500 ms */
do {
// work
@@ -120,14 +186,14 @@ double bench_shift_table_cc(int B, int N) {
off += B;
++iter;
- t1 = uclock_sec();
+ t1 = uclock_sec(0);
} while ( t1 < tstop && off + B < N );
save(output, B, off, NULL);
free(input);
free(output);
- printf("processed %f Msamples\n", off * 1E-6);
T = ( t1 - t0 ); /* duration per fft() */
+ printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
return (nI / T); /* normalized iterations per second */
}
@@ -136,7 +202,6 @@ double bench_shift_table_cc(int B, int N) {
double bench_shift_addfast(int B, int N) {
double t0, t1, tstop, T, nI;
int iter, off;
- int table_size=65536;
float phase = 0.0F;
complexf *input = (complexf *)malloc(N * sizeof(complexf));
complexf *output = (complexf *)malloc(N * sizeof(complexf));
@@ -149,7 +214,7 @@ double bench_shift_addfast(int B, int N) {
iter = 0;
off = 0;
- t0 = uclock_sec();
+ t0 = uclock_sec(1);
tstop = t0 + 0.5; /* benchmark duration: 500 ms */
do {
// work
@@ -157,14 +222,49 @@ double bench_shift_addfast(int B, int N) {
off += B;
++iter;
- t1 = uclock_sec();
+ t1 = uclock_sec(0);
} while ( t1 < tstop && off + B < N );
- save(output, B, off, NULL);
+ save(output, B, off, BENCH_FILE_ADD_FAST_CC);
+
free(input);
free(output);
- printf("processed %f Msamples\n", off * 1E-6);
T = ( t1 - t0 ); /* duration per fft() */
+ printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
+ nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
+ return (nI / T); /* normalized iterations per second */
+}
+
+double bench_shift_addfast_inp(int B, int N) {
+ double t0, t1, tstop, T, nI;
+ int iter, off;
+ float phase = 0.0F;
+ complexf *input = (complexf *)malloc(N * sizeof(complexf));
+ shift_recursive_osc_t gen_state;
+ shift_recursive_osc_conf_t gen_conf;
+ shift_addfast_data_t state = shift_addfast_init(-0.0009F);
+
+ shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+ gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+ iter = 0;
+ off = 0;
+ t0 = uclock_sec(1);
+ tstop = t0 + 0.5; /* benchmark duration: 500 ms */
+ do {
+ // work
+ phase = shift_addfast_inp_c(input+off, B, &state, phase);
+
+ off += B;
+ ++iter;
+ t1 = uclock_sec(0);
+ } while ( t1 < tstop && off + B < N );
+
+ save(input, B, off, BENCH_FILE_ADD_FAST_INP_C);
+
+ free(input);
+ T = ( t1 - t0 ); /* duration per fft() */
+ printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
return (nI / T); /* normalized iterations per second */
}
@@ -185,7 +285,7 @@ double bench_shift_unroll_oop(int B, int N) {
iter = 0;
off = 0;
- t0 = uclock_sec();
+ t0 = uclock_sec(1);
tstop = t0 + 0.5; /* benchmark duration: 500 ms */
do {
// work
@@ -193,14 +293,14 @@ double bench_shift_unroll_oop(int B, int N) {
off += B;
++iter;
- t1 = uclock_sec();
+ t1 = uclock_sec(0);
} while ( t1 < tstop && off + B < N );
save(output, B, off, NULL);
free(input);
free(output);
- printf("processed %f Msamples\n", off * 1E-6);
T = ( t1 - t0 ); /* duration per fft() */
+ printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
return (nI / T); /* normalized iterations per second */
}
@@ -219,7 +319,7 @@ double bench_shift_unroll_inp(int B, int N) {
iter = 0;
off = 0;
- t0 = uclock_sec();
+ t0 = uclock_sec(1);
tstop = t0 + 0.5; /* benchmark duration: 500 ms */
do {
// work
@@ -227,13 +327,14 @@ double bench_shift_unroll_inp(int B, int N) {
off += B;
++iter;
- t1 = uclock_sec();
+ t1 = uclock_sec(0);
} while ( t1 < tstop && off + B < N );
- save(input, B, off, NULL);
+ save(input, B, off, BENCH_FILE_UNROLL_INP_C);
+
free(input);
- printf("processed %f Msamples\n", off * 1E-6);
T = ( t1 - t0 ); /* duration per fft() */
+ printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
return (nI / T); /* normalized iterations per second */
}
@@ -254,7 +355,7 @@ double bench_shift_limited_unroll_oop(int B, int N) {
iter = 0;
off = 0;
- t0 = uclock_sec();
+ t0 = uclock_sec(1);
tstop = t0 + 0.5; /* benchmark duration: 500 ms */
do {
// work
@@ -262,14 +363,14 @@ double bench_shift_limited_unroll_oop(int B, int N) {
off += B;
++iter;
- t1 = uclock_sec();
+ t1 = uclock_sec(0);
} while ( t1 < tstop && off + B < N );
save(output, B, off, NULL);
free(input);
free(output);
- printf("processed %f Msamples\n", off * 1E-6);
T = ( t1 - t0 ); /* duration per fft() */
+ printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
return (nI / T); /* normalized iterations per second */
}
@@ -288,7 +389,7 @@ double bench_shift_limited_unroll_inp(int B, int N) {
iter = 0;
off = 0;
- t0 = uclock_sec();
+ t0 = uclock_sec(1);
tstop = t0 + 0.5; /* benchmark duration: 500 ms */
do {
// work
@@ -296,48 +397,121 @@ double bench_shift_limited_unroll_inp(int B, int N) {
off += B;
++iter;
- t1 = uclock_sec();
+ t1 = uclock_sec(0);
} while ( t1 < tstop && off + B < N );
- save(input, B, off, NULL);
+ save(input, B, off, BENCH_FILE_LTD_UNROLL_INP_C);
+
free(input);
- printf("processed %f Msamples\n", off * 1E-6);
T = ( t1 - t0 ); /* duration per fft() */
+ printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
return (nI / T); /* normalized iterations per second */
}
-double bench_shift_limited_unroll_sse_inp(int B, int N) {
+double bench_shift_limited_unroll_A_sse_inp(int B, int N) {
double t0, t1, tstop, T, nI;
int iter, off;
complexf *input = (complexf *)malloc(N * sizeof(complexf));
shift_recursive_osc_t gen_state;
shift_recursive_osc_conf_t gen_conf;
- shift_limited_unroll_sse_data_t *state = malloc(sizeof(shift_limited_unroll_sse_data_t));
+ shift_limited_unroll_A_sse_data_t *state = malloc(sizeof(shift_limited_unroll_A_sse_data_t));
- *state = shift_limited_unroll_sse_init(-0.0009F, 0.0F);
+ *state = shift_limited_unroll_A_sse_init(-0.0009F, 0.0F);
shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
iter = 0;
off = 0;
- t0 = uclock_sec();
+ t0 = uclock_sec(1);
tstop = t0 + 0.5; /* benchmark duration: 500 ms */
do {
// work
- shift_limited_unroll_sse_inp_c(input+off, B, state);
+ shift_limited_unroll_A_sse_inp_c(input+off, B, state);
off += B;
++iter;
- t1 = uclock_sec();
+ t1 = uclock_sec(0);
} while ( t1 < tstop && off + B < N );
- save(input, B, off, NULL);
+ save(input, B, off, BENCH_FILE_LTD_UNROLL_A_SSE_INP_C);
+
free(input);
- printf("processed %f Msamples\n", off * 1E-6);
T = ( t1 - t0 ); /* duration per fft() */
+ printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
+ nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
+ return (nI / T); /* normalized iterations per second */
+}
+
+double bench_shift_limited_unroll_B_sse_inp(int B, int N) {
+ double t0, t1, tstop, T, nI;
+ int iter, off;
+ complexf *input = (complexf *)malloc(N * sizeof(complexf));
+ shift_recursive_osc_t gen_state;
+ shift_recursive_osc_conf_t gen_conf;
+ shift_limited_unroll_B_sse_data_t *state = malloc(sizeof(shift_limited_unroll_B_sse_data_t));
+
+ *state = shift_limited_unroll_B_sse_init(-0.0009F, 0.0F);
+
+ shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+ //shift_recursive_osc_init(0.0F, 0.0F, &gen_conf, &gen_state);
+ gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+ iter = 0;
+ off = 0;
+ t0 = uclock_sec(1);
+ tstop = t0 + 0.5; /* benchmark duration: 500 ms */
+ do {
+ // work
+ shift_limited_unroll_B_sse_inp_c(input+off, B, state);
+
+ off += B;
+ ++iter;
+ t1 = uclock_sec(0);
+ } while ( t1 < tstop && off + B < N );
+
+ save(input, B, off, BENCH_FILE_LTD_UNROLL_B_SSE_INP_C);
+
+ free(input);
+ T = ( t1 - t0 ); /* duration per fft() */
+ printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
+ nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
+ return (nI / T); /* normalized iterations per second */
+}
+
+double bench_shift_limited_unroll_C_sse_inp(int B, int N) {
+ double t0, t1, tstop, T, nI;
+ int iter, off;
+ complexf *input = (complexf *)malloc(N * sizeof(complexf));
+ shift_recursive_osc_t gen_state;
+ shift_recursive_osc_conf_t gen_conf;
+ shift_limited_unroll_C_sse_data_t *state = malloc(sizeof(shift_limited_unroll_C_sse_data_t));
+
+ *state = shift_limited_unroll_C_sse_init(-0.0009F, 0.0F);
+
+ shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+ gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+ iter = 0;
+ off = 0;
+ t0 = uclock_sec(1);
+ tstop = t0 + 0.5; /* benchmark duration: 500 ms */
+ do {
+ // work
+ shift_limited_unroll_C_sse_inp_c(input+off, B, state);
+
+ off += B;
+ ++iter;
+ t1 = uclock_sec(0);
+ } while ( t1 < tstop && off + B < N );
+
+ save(input, B, off, BENCH_FILE_LTD_UNROLL_C_SSE_INP_C);
+
+ free(input);
+ T = ( t1 - t0 ); /* duration per fft() */
+ printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
return (nI / T); /* normalized iterations per second */
}
@@ -358,7 +532,7 @@ double bench_shift_rec_osc_cc_oop(int B, int N) {
iter = 0;
off = 0;
- t0 = uclock_sec();
+ t0 = uclock_sec(1);
tstop = t0 + 0.5; /* benchmark duration: 500 ms */
do {
// work
@@ -366,15 +540,16 @@ double bench_shift_rec_osc_cc_oop(int B, int N) {
off += B;
++iter;
- t1 = uclock_sec();
+ t1 = uclock_sec(0);
} while ( t1 < tstop && off + B < N );
- //save(input, B, off, "/dev/shm/bench_inp.bin");
+ save(input, B, off, BENCH_FILE_REC_OSC_CC);
+
save(output, B, off, NULL);
free(input);
free(output);
- printf("processed %f Msamples\n", off * 1E-6);
T = ( t1 - t0 ); /* duration per fft() */
+ printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
return (nI / T); /* normalized iterations per second */
}
@@ -394,7 +569,7 @@ double bench_shift_rec_osc_cc_inp(int B, int N) {
iter = 0;
off = 0;
- t0 = uclock_sec();
+ t0 = uclock_sec(1);
tstop = t0 + 0.5; /* benchmark duration: 500 ms */
do {
// work
@@ -402,13 +577,13 @@ double bench_shift_rec_osc_cc_inp(int B, int N) {
off += B;
++iter;
- t1 = uclock_sec();
+ t1 = uclock_sec(0);
} while ( t1 < tstop && off + B < N );
- save(input, B, off, NULL);
+ save(input, B, off, BENCH_FILE_REC_OSC_INP_C);
free(input);
- printf("processed %f Msamples\n", off * 1E-6);
T = ( t1 - t0 ); /* duration per fft() */
+ printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
return (nI / T); /* normalized iterations per second */
}
@@ -432,7 +607,7 @@ double bench_shift_rec_osc_sse_c_inp(int B, int N) {
iter = 0;
off = 0;
- t0 = uclock_sec();
+ t0 = uclock_sec(1);
tstop = t0 + 0.5; /* benchmark duration: 500 ms */
do {
// work
@@ -440,13 +615,13 @@ double bench_shift_rec_osc_sse_c_inp(int B, int N) {
off += B;
++iter;
- t1 = uclock_sec();
+ t1 = uclock_sec(0);
} while ( t1 < tstop && off + B < N );
- save(input, B, off, NULL);
+ save(input, B, off, BENCH_FILE_REC_OSC_SSE_INP_C);
free(input);
- printf("processed %f Msamples\n", off * 1E-6);
T = ( t1 - t0 ); /* duration per fft() */
+ printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
return (nI / T); /* normalized iterations per second */
}
@@ -460,6 +635,26 @@ int main(int argc, char **argv)
// process up to 64 MSample (512 MByte) in blocks of 8 kSamples (=64 kByte)
int B = 8 * 1024;
int N = 64 * 1024 * 1024;
+ int showUsage = 0;
+
+ if (argc == 1)
+ showUsage = 1;
+
+ if (1 < argc)
+ B = atoi(argv[1]);
+ if (2 < argc)
+ N = atoi(argv[2]) * 1024 * 1024;
+
+ if ( !B || !N || showUsage )
+ {
+ fprintf(stderr, "%s [<blockLength in samples> [<total # of MSamples>] ]\n", argv[0]);
+ if ( !B || !N )
+ return 0;
+ }
+
+ fprintf(stderr, "processing up to N = %d MSamples with block length of %d samples\n",
+ N / (1024 * 1024), B );
+
#if BENCH_REF_TRIG_FUNC
printf("\nstarting bench of shift_math_cc (out-of-place) with trig functions ..\n");
@@ -490,18 +685,31 @@ int main(int argc, char **argv)
#endif
#if BENCH_INPLACE_ALGOS
- printf("starting bench of shift_unroll_cc in-place ..\n");
+
+ printf("starting bench of shift_addfast_inp_c in-place ..\n");
+ rt = bench_shift_addfast_inp(B, N);
+ printf(" %f MSamples/sec\n\n", rt * 1E-6);
+
+ printf("starting bench of shift_unroll_inp_c in-place ..\n");
rt = bench_shift_unroll_inp(B, N);
printf(" %f MSamples/sec\n\n", rt * 1E-6);
- printf("starting bench of shift_limited_unroll_cc in-place ..\n");
+ printf("starting bench of shift_limited_unroll_inp_c in-place ..\n");
rt = bench_shift_limited_unroll_inp(B, N);
printf(" %f MSamples/sec\n\n", rt * 1E-6);
if ( have_sse_shift_mixer_impl() )
{
- printf("starting bench of shift_limited_unroll_sse_inp_c in-place ..\n");
- rt = bench_shift_limited_unroll_sse_inp(B, N);
+ printf("starting bench of shift_limited_unroll_A_sse_inp_c in-place ..\n");
+ rt = bench_shift_limited_unroll_A_sse_inp(B, N);
+ printf(" %f MSamples/sec\n\n", rt * 1E-6);
+
+ printf("starting bench of shift_limited_unroll_B_sse_inp_c in-place ..\n");
+ rt = bench_shift_limited_unroll_B_sse_inp(B, N);
+ printf(" %f MSamples/sec\n\n", rt * 1E-6);
+
+ printf("starting bench of shift_limited_unroll_C_sse_inp_c in-place ..\n");
+ rt = bench_shift_limited_unroll_C_sse_inp(B, N);
printf(" %f MSamples/sec\n\n", rt * 1E-6);
}
diff --git a/pf_mixer.cpp b/pf_mixer.cpp
index 81c78e4..0f2c310 100644
--- a/pf_mixer.cpp
+++ b/pf_mixer.cpp
@@ -43,6 +43,9 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define iof(complexf_input_p,i) (*(((float*)complexf_input_p)+2*(i)))
#define qof(complexf_input_p,i) (*(((float*)complexf_input_p)+2*(i)+1))
+#define USE_ALIGNED_ADDRESSES 0
+
+
/*
_____ _____ _____ __ _ _
@@ -93,10 +96,11 @@ typedef union v4_union {
float f[4];
} v4_union;
-#define VMUL(a,b) _mm_mul_ps(a,b)
-#define VADD(a,b) _mm_add_ps(a,b)
-#define VSUB(a,b) _mm_sub_ps(a,b)
-#define LD_PS1(s) _mm_set1_ps(s)
+#define VMUL(a,b) _mm_mul_ps(a,b)
+#define VDIV(a,b) _mm_div_ps(a,b)
+#define VADD(a,b) _mm_add_ps(a,b)
+#define VSUB(a,b) _mm_sub_ps(a,b)
+#define LD_PS1(s) _mm_set1_ps(s)
#define VLOAD_UNALIGNED(ptr) _mm_loadu_ps((const float *)(ptr))
#define VLOAD_ALIGNED(ptr) _mm_load_ps((const float *)(ptr))
#define VSTORE_UNALIGNED(ptr, v) _mm_storeu_ps((float*)(ptr), v)
@@ -104,6 +108,14 @@ typedef union v4_union {
#define INTERLEAVE2(in1, in2, out1, out2) { __m128 tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
#define UNINTERLEAVE2(in1, in2, out1, out2) { __m128 tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
+#if USE_ALIGNED_ADDRESSES
+ #define VLOAD(ptr) _mm_load_ps((const float *)(ptr))
+ #define VSTORE(ptr, v) _mm_store_ps((float*)(ptr), v)
+#else
+ #define VLOAD(ptr) _mm_loadu_ps((const float *)(ptr))
+ #define VSTORE(ptr, v) _mm_storeu_ps((float*)(ptr), v)
+#endif
+
int have_sse_shift_mixer_impl()
{
@@ -120,6 +132,12 @@ int have_sse_shift_mixer_impl()
#endif
+/*********************************************************************/
+
+/**************/
+/*** ALGO A ***/
+/**************/
+
PF_TARGET_CLONES
float shift_math_cc(complexf *input, complexf* output, int input_size, float rate, float starting_phase)
{
@@ -144,11 +162,14 @@ float shift_math_cc(complexf *input, complexf* output, int input_size, float rat
return phase;
}
+/*********************************************************************/
+/**************/
+/*** ALGO B ***/
+/**************/
shift_table_data_t shift_table_init(int table_size)
{
- //RTODO
shift_table_data_t output;
output.table=(float*)malloc(sizeof(float)*table_size);
output.table_size=table_size;
@@ -168,7 +189,6 @@ void shift_table_deinit(shift_table_data_t table_data)
PF_TARGET_CLONES
float shift_table_cc(complexf* input, complexf* output, int input_size, float rate, shift_table_data_t table_data, float starting_phase)
{
- //RTODO
rate*=2;
//Shifts the complex spectrum. Basically a complex mixer. This version uses a pre-built sine table.
float phase=starting_phase;
@@ -177,7 +197,6 @@ float shift_table_cc(complexf* input, complexf* output, int input_size, float ra
for(int i=0;i<input_size; i++) //@shift_math_cc
{
int sin_index, cos_index, temp_index, sin_sign, cos_sign;
- //float vphase=fmodf(phase,PI/2); //between 0 and 90deg
int quadrant=phase/(PI/2); //between 0 and 3
float vphase=phase-quadrant*(PI/2);
sin_index=(vphase/(PI/2))*table_data.table_size;
@@ -204,6 +223,112 @@ float shift_table_cc(complexf* input, complexf* output, int input_size, float ra
return phase;
}
+/*********************************************************************/
+
+/**************/
+/*** ALGO C ***/
+/**************/
+
+shift_addfast_data_t shift_addfast_init(float rate)
+{
+ shift_addfast_data_t output;
+ output.phase_increment=2*rate*PI;
+ for(int i=0;i<4;i++)
+ {
+ output.dsin[i]=sin(output.phase_increment*(i+1));
+ output.dcos[i]=cos(output.phase_increment*(i+1));
+ }
+ return output;
+}
+
+#define SADF_L1(j) \
+ cos_vals_ ## j = cos_start * dcos_ ## j - sin_start * dsin_ ## j; \
+ sin_vals_ ## j = sin_start * dcos_ ## j + cos_start * dsin_ ## j;
+#define SADF_L2(j) \
+ iof(output,4*i+j)=(cos_vals_ ## j)*iof(input,4*i+j)-(sin_vals_ ## j)*qof(input,4*i+j); \
+ qof(output,4*i+j)=(sin_vals_ ## j)*iof(input,4*i+j)+(cos_vals_ ## j)*qof(input,4*i+j);
+
+PF_TARGET_CLONES
+float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase)
+{
+ //input_size should be multiple of 4
+ //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
+ float cos_start=cos(starting_phase);
+ float sin_start=sin(starting_phase);
+ float register cos_vals_0, cos_vals_1, cos_vals_2, cos_vals_3,
+ sin_vals_0, sin_vals_1, sin_vals_2, sin_vals_3,
+ dsin_0 = d->dsin[0], dsin_1 = d->dsin[1], dsin_2 = d->dsin[2], dsin_3 = d->dsin[3],
+ dcos_0 = d->dcos[0], dcos_1 = d->dcos[1], dcos_2 = d->dcos[2], dcos_3 = d->dcos[3];
+
+ for(int i=0;i<input_size/4; i++)
+ {
+ SADF_L1(0)
+ SADF_L1(1)
+ SADF_L1(2)
+ SADF_L1(3)
+ SADF_L2(0)
+ SADF_L2(1)
+ SADF_L2(2)
+ SADF_L2(3)
+ cos_start = cos_vals_3;
+ sin_start = sin_vals_3;
+ }
+ starting_phase+=input_size*d->phase_increment;
+ while(starting_phase>PI) starting_phase-=2*PI;
+ while(starting_phase<-PI) starting_phase+=2*PI;
+ return starting_phase;
+}
+
+#undef SADF_L2
+
+
+#define SADF_L2(j) \
+ tmp_inp_cos = iof(in_out,4*i+j); \
+ tmp_inp_sin = qof(in_out,4*i+j); \
+ iof(in_out,4*i+j)=(cos_vals_ ## j)*tmp_inp_cos - (sin_vals_ ## j)*tmp_inp_sin; \
+ qof(in_out,4*i+j)=(sin_vals_ ## j)*tmp_inp_cos + (cos_vals_ ## j)*tmp_inp_sin;
+
+PF_TARGET_CLONES
+float shift_addfast_inp_c(complexf *in_out, int N_cplx, shift_addfast_data_t* d, float starting_phase)
+{
+ //input_size should be multiple of 4
+ //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
+ float cos_start=cos(starting_phase);
+ float sin_start=sin(starting_phase);
+ float register tmp_inp_cos, tmp_inp_sin,
+ cos_vals_0, cos_vals_1, cos_vals_2, cos_vals_3,
+ sin_vals_0, sin_vals_1, sin_vals_2, sin_vals_3,
+ dsin_0 = d->dsin[0], dsin_1 = d->dsin[1], dsin_2 = d->dsin[2], dsin_3 = d->dsin[3],
+ dcos_0 = d->dcos[0], dcos_1 = d->dcos[1], dcos_2 = d->dcos[2], dcos_3 = d->dcos[3];
+
+ for(int i=0;i<N_cplx/4; i++)
+ {
+ SADF_L1(0)
+ SADF_L1(1)
+ SADF_L1(2)
+ SADF_L1(3)
+ SADF_L2(0)
+ SADF_L2(1)
+ SADF_L2(2)
+ SADF_L2(3)
+ cos_start = cos_vals_3;
+ sin_start = sin_vals_3;
+ }
+ starting_phase+=N_cplx*d->phase_increment;
+ while(starting_phase>PI) starting_phase-=2*PI;
+ while(starting_phase<-PI) starting_phase+=2*PI;
+ return starting_phase;
+}
+
+#undef SADF_L1
+#undef SADF_L2
+
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO D ***/
+/**************/
shift_unroll_data_t shift_unroll_init(float rate, int size)
{
@@ -226,16 +351,12 @@ shift_unroll_data_t shift_unroll_init(float rate, int size)
void shift_unroll_deinit(shift_unroll_data_t* d)
{
- if (d && d->dsin)
- {
- free(d->dsin);
- d->dsin = NULL;
- }
- if (d && d->dcos)
- {
- free(d->dcos);
- d->dcos = NULL;
- }
+ if (!d)
+ return;
+ free(d->dsin);
+ free(d->dcos);
+ d->dsin = NULL;
+ d->dcos = NULL;
}
PF_TARGET_CLONES
@@ -245,8 +366,8 @@ float shift_unroll_cc(complexf *input, complexf* output, int input_size, shift_u
//fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
float cos_start = cos(starting_phase);
float sin_start = sin(starting_phase);
- register float cos_val, sin_val;
- for(int i=0;i<input_size; i++) //@shift_unroll_cc
+ register float cos_val = cos_start, sin_val = sin_start;
+ for(int i=0;i<input_size; i++)
{
iof(output,i) = cos_val*iof(input,i) - sin_val*qof(input,i);
qof(output,i) = sin_val*iof(input,i) + cos_val*qof(input,i);
@@ -265,8 +386,8 @@ float shift_unroll_inp_c(complexf* in_out, int size, shift_unroll_data_t* d, flo
{
float cos_start = cos(starting_phase);
float sin_start = sin(starting_phase);
- register float cos_val, sin_val;
- for(int i=0;i<size; i++) //@shift_unroll_inp_c
+ register float cos_val = cos_start, sin_val = sin_start;
+ for(int i=0;i<size; i++)
{
register float inp_i = iof(in_out,i);
register float inp_q = qof(in_out,i);
@@ -283,15 +404,18 @@ float shift_unroll_inp_c(complexf* in_out, int size, shift_unroll_data_t* d, flo
}
-#define SHIFT_UNROLL_SIZE CSDR_SHIFT_LIMITED_UNROLL_SIZE
-#define SHIFT_LIMITED_SIMD_SZ CSDR_SHIFT_LIMITED_SIMD_SZ
+/*********************************************************************/
+
+/**************/
+/*** ALGO E ***/
+/**************/
shift_limited_unroll_data_t shift_limited_unroll_init(float rate)
{
shift_limited_unroll_data_t output;
output.phase_increment=2*rate*PI;
float myphase = 0;
- for(int i=0;i<SHIFT_UNROLL_SIZE;i++)
+ for(int i=0; i < PF_SHIFT_LIMITED_UNROLL_SIZE; i++)
{
myphase += output.phase_increment;
while(myphase>PI) myphase-=2*PI;
@@ -309,87 +433,119 @@ void shift_limited_unroll_cc(const complexf *input, complexf* output, int size,
{
float cos_start = d->complex_phase.i;
float sin_start = d->complex_phase.q;
- register float cos_val = cos_start, sin_val = sin_start;
- while (size > 0) //@shift_limited_unroll_cc
+ register float cos_val = cos_start, sin_val = sin_start, mag;
+ while (size > 0)
{
- int N = (size >= SHIFT_UNROLL_SIZE) ? SHIFT_UNROLL_SIZE : size;
- for(int i=0;i<N/SHIFT_LIMITED_SIMD_SZ; i++ )
+ int N = (size >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : size;
+ for(int i=0;i<N/PF_SHIFT_LIMITED_SIMD_SZ; i++ )
{
- for(int j=0; j<SHIFT_LIMITED_SIMD_SZ; j++)
+ for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
{
- iof(output,SHIFT_LIMITED_SIMD_SZ*i+j) = cos_val*iof(input,SHIFT_LIMITED_SIMD_SZ*i+j) - sin_val*qof(input,SHIFT_LIMITED_SIMD_SZ*i+j);
- qof(output,SHIFT_LIMITED_SIMD_SZ*i+j) = sin_val*iof(input,SHIFT_LIMITED_SIMD_SZ*i+j) + cos_val*qof(input,SHIFT_LIMITED_SIMD_SZ*i+j);
+ iof(output,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = cos_val*iof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j) - sin_val*qof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j);
+ qof(output,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = sin_val*iof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j) + cos_val*qof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j);
// calculate complex phasor for next iteration
- cos_val = cos_start * d->dcos[SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[SHIFT_LIMITED_SIMD_SZ*i+j];
- sin_val = sin_start * d->dcos[SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[SHIFT_LIMITED_SIMD_SZ*i+j];
+ cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
+ sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
}
}
- input += SHIFT_UNROLL_SIZE;
- output += SHIFT_UNROLL_SIZE;
- size -= SHIFT_UNROLL_SIZE;
+ // "starts := vals := vals / |vals|"
+ mag = sqrtf(cos_val * cos_val + sin_val * sin_val);
+ cos_val /= mag;
+ sin_val /= mag;
+ cos_start = cos_val;
+ sin_start = sin_val;
+
+ input += PF_SHIFT_LIMITED_UNROLL_SIZE;
+ output += PF_SHIFT_LIMITED_UNROLL_SIZE;
+ size -= PF_SHIFT_LIMITED_UNROLL_SIZE;
}
d->complex_phase.i = cos_val;
d->complex_phase.q = sin_val;
}
PF_TARGET_CLONES
-void shift_limited_unroll_inp_c(complexf* in_out, int size, shift_limited_unroll_data_t* d)
+void shift_limited_unroll_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_data_t* d)
{
- float inp_i[SHIFT_LIMITED_SIMD_SZ];
- float inp_q[SHIFT_LIMITED_SIMD_SZ];
+ float inp_i[PF_SHIFT_LIMITED_SIMD_SZ];
+ float inp_q[PF_SHIFT_LIMITED_SIMD_SZ];
+ // "vals := starts := phase_state"
float cos_start = d->complex_phase.i;
float sin_start = d->complex_phase.q;
- register float cos_val = cos_start, sin_val = sin_start;
- while (size > 0) //@shift_limited_unroll_inp_c
+ register float cos_val = cos_start, sin_val = sin_start, mag;
+ while (N_cplx)
{
- int N = (size >= SHIFT_UNROLL_SIZE) ? SHIFT_UNROLL_SIZE : size;
- for(int i=0;i<N/SHIFT_LIMITED_SIMD_SZ; i++ )
+ int N = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
+ for(int i=0;i<N/PF_SHIFT_LIMITED_SIMD_SZ; i++ )
{
- for(int j=0; j<SHIFT_LIMITED_SIMD_SZ; j++)
- inp_i[j] = in_out[SHIFT_LIMITED_SIMD_SZ*i+j].i;
- for(int j=0; j<SHIFT_LIMITED_SIMD_SZ; j++)
- inp_q[j] = in_out[SHIFT_LIMITED_SIMD_SZ*i+j].q;
- for(int j=0; j<SHIFT_LIMITED_SIMD_SZ; j++)
+ for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
+ inp_i[j] = in_out[PF_SHIFT_LIMITED_SIMD_SZ*i+j].i;
+ for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
+ inp_q[j] = in_out[PF_SHIFT_LIMITED_SIMD_SZ*i+j].q;
+ for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
{
- iof(in_out,SHIFT_LIMITED_SIMD_SZ*i+j) = cos_val*inp_i[j] - sin_val*inp_q[j];
- qof(in_out,SHIFT_LIMITED_SIMD_SZ*i+j) = sin_val*inp_i[j] + cos_val*inp_q[j];
+ // "out[] = inp[] * vals"
+ iof(in_out,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = cos_val*inp_i[j] - sin_val*inp_q[j];
+ qof(in_out,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = sin_val*inp_i[j] + cos_val*inp_q[j];
// calculate complex phasor for next iteration
- cos_val = cos_start * d->dcos[SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[SHIFT_LIMITED_SIMD_SZ*i+j];
- sin_val = sin_start * d->dcos[SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[SHIFT_LIMITED_SIMD_SZ*i+j];
+ // "vals := d[] * starts"
+ cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
+ sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
}
}
- in_out += SHIFT_UNROLL_SIZE;
- size -= SHIFT_UNROLL_SIZE;
+ // "starts := vals := vals / |vals|"
+ mag = sqrtf(cos_val * cos_val + sin_val * sin_val);
+ cos_val /= mag;
+ sin_val /= mag;
+ cos_start = cos_val;
+ sin_start = sin_val;
+
+ in_out += PF_SHIFT_LIMITED_UNROLL_SIZE;
+ N_cplx -= PF_SHIFT_LIMITED_UNROLL_SIZE;
}
- d->complex_phase.i = cos_val;
- d->complex_phase.q = sin_val;
+ // "phase_state := starts"
+ d->complex_phase.i = cos_start;
+ d->complex_phase.q = sin_start;
}
#ifdef HAVE_SSE_INTRINSICS
-shift_limited_unroll_sse_data_t shift_limited_unroll_sse_init(float relative_freq, float phase_start_rad)
+/*********************************************************************/
+
+/**************/
+/*** ALGO F ***/
+/**************/
+
+shift_limited_unroll_A_sse_data_t shift_limited_unroll_A_sse_init(float relative_freq, float phase_start_rad)
{
- shift_limited_unroll_sse_data_t output;
+ shift_limited_unroll_A_sse_data_t output;
float myphase;
output.phase_increment = 2*relative_freq*PI;
myphase = 0.0F;
- for (int i = 0; i < SHIFT_UNROLL_SIZE + SHIFT_LIMITED_SIMD_SZ; i++)
+ for (int i = 0; i < PF_SHIFT_LIMITED_UNROLL_SIZE + PF_SHIFT_LIMITED_SIMD_SZ; i += PF_SHIFT_LIMITED_SIMD_SZ)
{
+ for (int k = 0; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
+ {
+ myphase += output.phase_increment;
+ while(myphase>PI) myphase-=2*PI;
+ while(myphase<-PI) myphase+=2*PI;
+ }
output.dcos[i] = cos(myphase);
output.dsin[i] = sin(myphase);
- myphase += output.phase_increment;
- while(myphase>PI) myphase-=2*PI;
- while(myphase<-PI) myphase+=2*PI;
+ for (int k = 1; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
+ {
+ output.dcos[i+k] = output.dcos[i];
+ output.dsin[i+k] = output.dsin[i];
+ }
}
output.dcos_blk = 0.0F;
output.dsin_blk = 0.0F;
myphase = phase_start_rad;
- for (int i = 0; i < SHIFT_LIMITED_SIMD_SZ; i++)
+ for (int i = 0; i < PF_SHIFT_LIMITED_SIMD_SZ; i++)
{
output.phase_state_i[i] = cos(myphase);
output.phase_state_q[i] = sin(myphase);
@@ -402,10 +558,11 @@ shift_limited_unroll_sse_data_t shift_limited_unroll_sse_init(float relative_fre
PF_TARGET_CLONES
-void shift_limited_unroll_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_sse_data_t* d)
+void shift_limited_unroll_A_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_A_sse_data_t* d)
{
- const __m128 cos_starts = VLOAD_ALIGNED( &d->phase_state_i[0] );
- const __m128 sin_starts = VLOAD_ALIGNED( &d->phase_state_q[0] );
+ // "vals := starts := phase_state"
+ __m128 cos_starts = VLOAD( &d->phase_state_i[0] );
+ __m128 sin_starts = VLOAD( &d->phase_state_q[0] );
__m128 cos_vals = cos_starts;
__m128 sin_vals = sin_starts;
__m128 inp_re, inp_im;
@@ -417,27 +574,29 @@ void shift_limited_unroll_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_
while (N_cplx)
{
- const int NB = (N_cplx >= CSDR_SHIFT_LIMITED_UNROLL_SIZE) ? CSDR_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
+ const int NB = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
int B = NB;
p_trig_cos_tab = (__m128*)( &d->dcos[0] );
p_trig_sin_tab = (__m128*)( &d->dsin[0] );
- while (B >= 0)
+ while (B)
{
// complex multiplication of 4 complex values from/to in_out[]
// == u[0..3] *= (cos_val[0..3] + i * sin_val[0..3]):
- UNINTERLEAVE2(VLOAD_ALIGNED(u), VLOAD_ALIGNED(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */
+ // "out[] = inp[] * vals"
+ UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */
product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) );
product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) );
INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
- VSTORE_ALIGNED(u, interl_prod_a);
- VSTORE_ALIGNED(u+1, interl_prod_b);
+ VSTORE(u, interl_prod_a);
+ VSTORE(u+1, interl_prod_b);
u += 2;
// calculate complex phasor for next iteration
- // cos_val = cos_start * d->dcos[SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[SHIFT_LIMITED_SIMD_SZ*i+j];
- // sin_val = sin_start * d->dcos[SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[SHIFT_LIMITED_SIMD_SZ*i+j];
+ // cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
+ // sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
// cos_val[]/sin_val[] .. can't fade towards 0 inside this while loop :-)
- inp_re = VLOAD_ALIGNED(p_trig_cos_tab);
- inp_im = VLOAD_ALIGNED(p_trig_sin_tab);
+ // "vals := d[] * starts"
+ inp_re = VLOAD(p_trig_cos_tab);
+ inp_im = VLOAD(p_trig_sin_tab);
cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) );
sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) );
++p_trig_cos_tab;
@@ -447,97 +606,294 @@ void shift_limited_unroll_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_
N_cplx -= NB;
/* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */
/* re-use product_re[]/product_im[] for normalization */
+ // "starts := vals := vals / |vals|"
product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) );
+#if 0
+ // more spikes in spectrum! at PF_SHIFT_LIMITED_UNROLL_SIZE = 64
+ // higher spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 16
product_im = _mm_rsqrt_ps(product_re);
- cos_vals = VMUL(cos_vals, product_im);
- sin_vals = VMUL(sin_vals, product_im);
+ cos_starts = cos_vals = VMUL(cos_vals, product_im);
+ sin_starts = sin_vals = VMUL(sin_vals, product_im);
+#else
+ // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 64 - but slower!
+ // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 128 - fast again
+ product_im = _mm_sqrt_ps(product_re);
+ cos_starts = cos_vals = VDIV(cos_vals, product_im);
+ sin_starts = sin_vals = VDIV(sin_vals, product_im);
+#endif
}
- VSTORE_ALIGNED( &d->phase_state_i[0], cos_vals );
- VSTORE_ALIGNED( &d->phase_state_q[0], sin_vals );
+ // "phase_state := starts"
+ VSTORE( &d->phase_state_i[0], cos_starts );
+ VSTORE( &d->phase_state_q[0], sin_starts );
}
-#else
-int have_sse_shift_mixer_impl()
-{
- return 0;
-}
+/*********************************************************************/
+/**************/
+/*** ALGO G ***/
+/**************/
-shift_limited_unroll_sse_data_t shift_limited_unroll_sse_init(float relative_freq, float phase_start_rad)
+shift_limited_unroll_B_sse_data_t shift_limited_unroll_B_sse_init(float relative_freq, float phase_start_rad)
{
- assert(0);
- shift_limited_unroll_sse_data_t r;
- return r;
+ shift_limited_unroll_B_sse_data_t output;
+ float myphase;
+
+ output.phase_increment = 2*relative_freq*PI;
+
+ myphase = 0.0F;
+ for (int i = 0; i < PF_SHIFT_LIMITED_UNROLL_SIZE + PF_SHIFT_LIMITED_SIMD_SZ; i += PF_SHIFT_LIMITED_SIMD_SZ)
+ {
+ for (int k = 0; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
+ {
+ myphase += output.phase_increment;
+ while(myphase>PI) myphase-=2*PI;
+ while(myphase<-PI) myphase+=2*PI;
+ }
+ output.dtrig[i+0] = cos(myphase);
+ output.dtrig[i+1] = sin(myphase);
+ output.dtrig[i+2] = output.dtrig[i+0];
+ output.dtrig[i+3] = output.dtrig[i+1];
+ }
+
+ output.dcos_blk = 0.0F;
+ output.dsin_blk = 0.0F;
+
+ myphase = phase_start_rad;
+ for (int i = 0; i < PF_SHIFT_LIMITED_SIMD_SZ; i++)
+ {
+ output.phase_state_i[i] = cos(myphase);
+ output.phase_state_q[i] = sin(myphase);
+ myphase += output.phase_increment;
+ while(myphase>PI) myphase-=2*PI;
+ while(myphase<-PI) myphase+=2*PI;
+ }
+ return output;
}
-void shift_limited_unroll_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_sse_data_t* d)
+
+PF_TARGET_CLONES
+void shift_limited_unroll_B_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_B_sse_data_t* d)
{
- assert(0);
-}
+ // "vals := starts := phase_state"
+ __m128 cos_starts = VLOAD( &d->phase_state_i[0] );
+ __m128 sin_starts = VLOAD( &d->phase_state_q[0] );
+ __m128 cos_vals = cos_starts;
+ __m128 sin_vals = sin_starts;
+ __m128 inp_re, inp_im;
+ __m128 product_re, product_im;
+ __m128 interl_prod_a, interl_prod_b;
+ __m128 * RESTRICT p_trig_tab;
+ __m128 * RESTRICT u = (__m128*)in_out;
+ while (N_cplx)
+ {
+ const int NB = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
+ int B = NB;
+ p_trig_tab = (__m128*)( &d->dtrig[0] );
+ while (B)
+ {
+ // complex multiplication of 4 complex values from/to in_out[]
+ // == u[0..3] *= (cos_val[0..3] + i * sin_val[0..3]):
+ // "out[] = inp[] * vals"
+ UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */
+ product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) );
+ product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) );
+ INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
+ VSTORE(u, interl_prod_a);
+ VSTORE(u+1, interl_prod_b);
+ u += 2;
+ // calculate complex phasor for next iteration
+ // cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
+ // sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
+ // cos_val[]/sin_val[] .. can't fade towards 0 inside this while loop :-)
+ // "vals := d[] * starts"
+ product_re = VLOAD(p_trig_tab);
+ UNINTERLEAVE2(product_re, product_re, inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */
+ cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) );
+ sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) );
+ ++p_trig_tab;
+ B -= 4;
+ }
+ N_cplx -= NB;
+ /* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */
+ /* re-use product_re[]/product_im[] for normalization */
+ // "starts := vals := vals / |vals|"
+ product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) );
+#if 0
+ // more spikes in spectrum! at PF_SHIFT_LIMITED_UNROLL_SIZE = 64
+ // higher spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 16
+ product_im = _mm_rsqrt_ps(product_re);
+ cos_starts = cos_vals = VMUL(cos_vals, product_im);
+ sin_starts = sin_vals = VMUL(sin_vals, product_im);
+#else
+ // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 64 - but slower!
+ // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 128 - fast again
+ product_im = _mm_sqrt_ps(product_re);
+ cos_starts = cos_vals = VDIV(cos_vals, product_im);
+ sin_starts = sin_vals = VDIV(sin_vals, product_im);
#endif
+ }
+ // "phase_state := starts"
+ VSTORE( &d->phase_state_i[0], cos_starts );
+ VSTORE( &d->phase_state_q[0], sin_starts );
+}
+/*********************************************************************/
-shift_addfast_data_t shift_addfast_init(float rate)
+
+/**************/
+/*** ALGO H ***/
+/**************/
+
+shift_limited_unroll_C_sse_data_t shift_limited_unroll_C_sse_init(float relative_freq, float phase_start_rad)
{
- shift_addfast_data_t output;
- output.phase_increment=2*rate*PI;
- for(int i=0;i<4;i++)
+ shift_limited_unroll_C_sse_data_t output;
+ float myphase;
+
+ output.phase_increment = 2*relative_freq*PI;
+
+ myphase = 0.0F;
+ for (int i = 0; i < PF_SHIFT_LIMITED_UNROLL_SIZE + PF_SHIFT_LIMITED_SIMD_SZ; i += PF_SHIFT_LIMITED_SIMD_SZ)
{
- output.dsin[i]=sin(output.phase_increment*(i+1));
- output.dcos[i]=cos(output.phase_increment*(i+1));
+ for (int k = 0; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
+ {
+ myphase += output.phase_increment;
+ while(myphase>PI) myphase-=2*PI;
+ while(myphase<-PI) myphase+=2*PI;
+ }
+ output.dinterl_trig[2*i] = cos(myphase);
+ output.dinterl_trig[2*i+4] = sin(myphase);
+ for (int k = 1; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
+ {
+ output.dinterl_trig[2*i+k] = output.dinterl_trig[2*i];
+ output.dinterl_trig[2*i+k+4] = output.dinterl_trig[2*i+4];
+ }
}
- return output;
-}
+ output.dcos_blk = 0.0F;
+ output.dsin_blk = 0.0F;
-#ifndef HAVE_ADDFAST_CC_IMPL
+ myphase = phase_start_rad;
+ for (int i = 0; i < PF_SHIFT_LIMITED_SIMD_SZ; i++)
+ {
+ output.phase_state_i[i] = cos(myphase);
+ output.phase_state_q[i] = sin(myphase);
+ myphase += output.phase_increment;
+ while(myphase>PI) myphase-=2*PI;
+ while(myphase<-PI) myphase+=2*PI;
+ }
+ return output;
+}
-#define SADF_L1(j) \
- cos_vals_ ## j = cos_start * dcos_ ## j - sin_start * dsin_ ## j; \
- sin_vals_ ## j = sin_start * dcos_ ## j + cos_start * dsin_ ## j;
-#define SADF_L2(j) \
- iof(output,4*i+j)=(cos_vals_ ## j)*iof(input,4*i+j)-(sin_vals_ ## j)*qof(input,4*i+j); \
- qof(output,4*i+j)=(sin_vals_ ## j)*iof(input,4*i+j)+(cos_vals_ ## j)*qof(input,4*i+j);
PF_TARGET_CLONES
-float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase)
+void shift_limited_unroll_C_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_C_sse_data_t* d)
{
- //input_size should be multiple of 4
- //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
- float cos_start=cos(starting_phase);
- float sin_start=sin(starting_phase);
- float register cos_vals_0, cos_vals_1, cos_vals_2, cos_vals_3,
- sin_vals_0, sin_vals_1, sin_vals_2, sin_vals_3,
- dsin_0 = d->dsin[0], dsin_1 = d->dsin[1], dsin_2 = d->dsin[2], dsin_3 = d->dsin[3],
- dcos_0 = d->dcos[0], dcos_1 = d->dcos[1], dcos_2 = d->dcos[2], dcos_3 = d->dcos[3];
+ // "vals := starts := phase_state"
+ __m128 cos_starts = VLOAD( &d->phase_state_i[0] );
+ __m128 sin_starts = VLOAD( &d->phase_state_q[0] );
+ __m128 cos_vals = cos_starts;
+ __m128 sin_vals = sin_starts;
+ __m128 inp_re, inp_im;
+ __m128 product_re, product_im;
+ __m128 interl_prod_a, interl_prod_b;
+ __m128 * RESTRICT p_trig_tab;
+ __m128 * RESTRICT u = (__m128*)in_out;
- for(int i=0;i<input_size/4; i++) //@shift_addfast_cc
+ while (N_cplx)
{
- SADF_L1(0)
- SADF_L1(1)
- SADF_L1(2)
- SADF_L1(3)
- SADF_L2(0)
- SADF_L2(1)
- SADF_L2(2)
- SADF_L2(3)
- cos_start = cos_vals_3;
- sin_start = sin_vals_3;
+ const int NB = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
+ int B = NB;
+ p_trig_tab = (__m128*)( &d->dinterl_trig[0] );
+ while (B)
+ {
+ // complex multiplication of 4 complex values from/to in_out[]
+ // == u[0..3] *= (cos_val[0..3] + i * sin_val[0..3]):
+ // "out[] = inp[] * vals"
+ UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */
+ product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) );
+ product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) );
+ INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
+ VSTORE(u, interl_prod_a);
+ VSTORE(u+1, interl_prod_b);
+ u += 2;
+ // calculate complex phasor for next iteration
+ // cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
+ // sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
+ // cos_val[]/sin_val[] .. can't fade towards 0 inside this while loop :-)
+ // "vals := d[] * starts"
+ inp_re = VLOAD(p_trig_tab);
+ inp_im = VLOAD(p_trig_tab+1);
+ cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) );
+ sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) );
+ p_trig_tab += 2;
+ B -= 4;
+ }
+ N_cplx -= NB;
+ /* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */
+ /* re-use product_re[]/product_im[] for normalization */
+ // "starts := vals := vals / |vals|"
+ product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) );
+#if 0
+ // more spikes in spectrum! at PF_SHIFT_LIMITED_UNROLL_SIZE = 64
+ // higher spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 16
+ product_im = _mm_rsqrt_ps(product_re);
+ cos_starts = cos_vals = VMUL(cos_vals, product_im);
+ sin_starts = sin_vals = VMUL(sin_vals, product_im);
+#else
+ // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 64 - but slower!
+ // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 128 - fast again
+ product_im = _mm_sqrt_ps(product_re);
+ cos_starts = cos_vals = VDIV(cos_vals, product_im);
+ sin_starts = sin_vals = VDIV(sin_vals, product_im);
+#endif
}
- starting_phase+=input_size*d->phase_increment;
- while(starting_phase>PI) starting_phase-=2*PI;
- while(starting_phase<-PI) starting_phase+=2*PI;
- return starting_phase;
+ // "phase_state := starts"
+ VSTORE( &d->phase_state_i[0], cos_starts );
+ VSTORE( &d->phase_state_q[0], sin_starts );
+}
+
+
+#else
+
+/*********************************************************************/
+
+shift_limited_unroll_A_sse_data_t shift_limited_unroll_A_sse_init(float relative_freq, float phase_start_rad) {
+ assert(0);
+ shift_limited_unroll_A_sse_data_t r;
+ return r;
+}
+shift_limited_unroll_B_sse_data_t shift_limited_unroll_B_sse_init(float relative_freq, float phase_start_rad) {
+ assert(0);
+ shift_limited_unroll_B_sse_data_t r;
+ return r;
+}
+shift_limited_unroll_C_sse_data_t shift_limited_unroll_C_sse_init(float relative_freq, float phase_start_rad) {
+ assert(0);
+ shift_limited_unroll_C_sse_data_t r;
+ return r;
+}
+
+void shift_limited_unroll_A_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_A_sse_data_t* d) {
+ assert(0);
+}
+void shift_limited_unroll_B_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_B_sse_data_t* d) {
+ assert(0);
+}
+void shift_limited_unroll_C_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_C_sse_data_t* d) {
+ assert(0);
}
#endif
+/*********************************************************************/
-#define SHIFT_REC_SIMD_SZ CSDR_SHIFT_RECURSIVE_SIMD_SZ
+/**************/
+/*** ALGO I ***/
+/**************/
void shift_recursive_osc_update_rate(float rate, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state)
{
@@ -545,7 +901,7 @@ void shift_recursive_osc_update_rate(float rate, shift_recursive_osc_conf_t *con
float phase_increment_s = rate*PI;
float k1 = tan(0.5*phase_increment_s);
float k2 = 2*k1 /(1 + k1 * k1);
- for (int j=1; j<SHIFT_REC_SIMD_SZ; j++)
+ for (int j=1; j<PF_SHIFT_RECURSIVE_SIMD_SZ; j++)
{
float tmp;
state->u_cos[j] = state->u_cos[j-1];
@@ -556,8 +912,8 @@ void shift_recursive_osc_update_rate(float rate, shift_recursive_osc_conf_t *con
state->u_cos[j] = tmp - k1 * state->v_sin[j];
}
- // constants for SHIFT_REC_SIMD_SZ times phase step
- float phase_increment_b = phase_increment_s * SHIFT_REC_SIMD_SZ;
+ // constants for PF_SHIFT_RECURSIVE_SIMD_SZ times phase step
+ float phase_increment_b = phase_increment_s * PF_SHIFT_RECURSIVE_SIMD_SZ;
while(phase_increment_b > PI) phase_increment_b-=2*PI;
while(phase_increment_b < -PI) phase_increment_b+=2*PI;
conf->k1 = tan(0.5*phase_increment_b);
@@ -584,31 +940,31 @@ PF_TARGET_CLONES
void shift_recursive_osc_cc(const complexf *input, complexf* output,
int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state_ext)
{
- float tmp[SHIFT_REC_SIMD_SZ];
- float inp_i[SHIFT_REC_SIMD_SZ];
- float inp_q[SHIFT_REC_SIMD_SZ];
+ float tmp[PF_SHIFT_RECURSIVE_SIMD_SZ];
+ float inp_i[PF_SHIFT_RECURSIVE_SIMD_SZ];
+ float inp_q[PF_SHIFT_RECURSIVE_SIMD_SZ];
shift_recursive_osc_t state = *state_ext;
const float k1 = conf->k1;
const float k2 = conf->k2;
- for(int i=0;i<size/SHIFT_REC_SIMD_SZ; i++) //@shift_recursive_osc_cc
+ for(int i=0;i<size/PF_SHIFT_RECURSIVE_SIMD_SZ; i++) //@shift_recursive_osc_cc
{
//we multiply two complex numbers - similar to shift_math_cc
- for (int j=0;j<SHIFT_REC_SIMD_SZ;j++)
+ for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
{
- inp_i[j] = input[SHIFT_REC_SIMD_SZ*i+j].i;
- inp_q[j] = input[SHIFT_REC_SIMD_SZ*i+j].q;
+ inp_i[j] = input[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].i;
+ inp_q[j] = input[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].q;
}
- for (int j=0;j<SHIFT_REC_SIMD_SZ;j++)
+ for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
{
- iof(output,SHIFT_REC_SIMD_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j];
- qof(output,SHIFT_REC_SIMD_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j];
+ iof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j];
+ qof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j];
}
// update complex phasor - like incrementing phase
- for (int j=0;j<SHIFT_REC_SIMD_SZ;j++)
+ for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
- for (int j=0;j<SHIFT_REC_SIMD_SZ;j++)
+ for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
state.v_sin[j] += k2 * tmp[j];
- for (int j=0;j<SHIFT_REC_SIMD_SZ;j++)
+ for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
}
*state_ext = state;
@@ -618,31 +974,31 @@ PF_TARGET_CLONES
void shift_recursive_osc_inp_c(complexf* in_out,
int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state_ext)
{
- float tmp[SHIFT_REC_SIMD_SZ];
- float inp_i[SHIFT_REC_SIMD_SZ];
- float inp_q[SHIFT_REC_SIMD_SZ];
+ float tmp[PF_SHIFT_RECURSIVE_SIMD_SZ];
+ float inp_i[PF_SHIFT_RECURSIVE_SIMD_SZ];
+ float inp_q[PF_SHIFT_RECURSIVE_SIMD_SZ];
shift_recursive_osc_t state = *state_ext;
const float k1 = conf->k1;
const float k2 = conf->k2;
- for(int i=0;i<size/SHIFT_REC_SIMD_SZ; i++) //@shift_recursive_osc_inp_c
+ for(int i=0;i<size/PF_SHIFT_RECURSIVE_SIMD_SZ; i++) //@shift_recursive_osc_inp_c
{
- for (int j=0;j<SHIFT_REC_SIMD_SZ;j++)
+ for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
{
- inp_i[j] = in_out[SHIFT_REC_SIMD_SZ*i+j].i;
- inp_q[j] = in_out[SHIFT_REC_SIMD_SZ*i+j].q;
+ inp_i[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].i;
+ inp_q[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].q;
}
//we multiply two complex numbers - similar to shift_math_cc
- for (int j=0;j<SHIFT_REC_SIMD_SZ;j++)
+ for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
{
- iof(in_out,SHIFT_REC_SIMD_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j];
- qof(in_out,SHIFT_REC_SIMD_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j];
+ iof(in_out,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j];
+ qof(in_out,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j];
}
// update complex phasor - like incrementing phase
- for (int j=0;j<SHIFT_REC_SIMD_SZ;j++)
+ for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
- for (int j=0;j<SHIFT_REC_SIMD_SZ;j++)
+ for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
state.v_sin[j] += k2 * tmp[j];
- for (int j=0;j<SHIFT_REC_SIMD_SZ;j++)
+ for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
}
*state_ext = state;
@@ -652,24 +1008,24 @@ PF_TARGET_CLONES
void gen_recursive_osc_c(complexf* output,
int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state_ext)
{
- float tmp[SHIFT_REC_SIMD_SZ];
+ float tmp[PF_SHIFT_RECURSIVE_SIMD_SZ];
shift_recursive_osc_t state = *state_ext;
const float k1 = conf->k1;
const float k2 = conf->k2;
- for(int i=0;i<size/SHIFT_REC_SIMD_SZ; i++) //@gen_recursive_osc_c
+ for(int i=0;i<size/PF_SHIFT_RECURSIVE_SIMD_SZ; i++) //@gen_recursive_osc_c
{
// output complex oscillator value
- for (int j=0;j<SHIFT_REC_SIMD_SZ;j++)
+ for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
{
- iof(output,SHIFT_REC_SIMD_SZ*i+j) = state.u_cos[j];
- qof(output,SHIFT_REC_SIMD_SZ*i+j) = state.v_sin[j];
+ iof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.u_cos[j];
+ qof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.v_sin[j];
}
// update complex phasor - like incrementing phase
- for (int j=0;j<SHIFT_REC_SIMD_SZ;j++)
+ for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
- for (int j=0;j<SHIFT_REC_SIMD_SZ;j++)
+ for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
state.v_sin[j] += k2 * tmp[j];
- for (int j=0;j<SHIFT_REC_SIMD_SZ;j++)
+ for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
}
*state_ext = state;
@@ -678,13 +1034,19 @@ void gen_recursive_osc_c(complexf* output,
#ifdef HAVE_SSE_INTRINSICS
+/*********************************************************************/
+
+/**************/
+/*** ALGO J ***/
+/**************/
+
void shift_recursive_osc_sse_update_rate(float rate, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state)
{
// constants for single phase step
float phase_increment_s = rate*PI;
float k1 = tan(0.5*phase_increment_s);
float k2 = 2*k1 /(1 + k1 * k1);
- for (int j=1; j<CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ; j++)
+ for (int j=1; j<PF_SHIFT_RECURSIVE_SIMD_SSE_SZ; j++)
{
float tmp;
state->u_cos[j] = state->u_cos[j-1];
@@ -695,8 +1057,8 @@ void shift_recursive_osc_sse_update_rate(float rate, shift_recursive_osc_sse_con
state->u_cos[j] = tmp - k1 * state->v_sin[j];
}
- // constants for CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ times phase step
- float phase_increment_b = phase_increment_s * CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ;
+ // constants for PF_SHIFT_RECURSIVE_SIMD_SSE_SZ times phase step
+ float phase_increment_b = phase_increment_s * PF_SHIFT_RECURSIVE_SIMD_SSE_SZ;
while(phase_increment_b > PI) phase_increment_b-=2*PI;
while(phase_increment_b < -PI) phase_increment_b+=2*PI;
conf->k1 = tan(0.5*phase_increment_b);
@@ -726,8 +1088,8 @@ void shift_recursive_osc_sse_inp_c(complexf* in_out,
{
const __m128 k1 = LD_PS1( conf->k1 );
const __m128 k2 = LD_PS1( conf->k2 );
- __m128 u_cos = VLOAD_ALIGNED( &state_ext->u_cos[0] );
- __m128 v_sin = VLOAD_ALIGNED( &state_ext->v_sin[0] );
+ __m128 u_cos = VLOAD( &state_ext->u_cos[0] );
+ __m128 v_sin = VLOAD( &state_ext->v_sin[0] );
__m128 inp_re, inp_im;
__m128 product_re, product_im;
__m128 interl_prod_a, interl_prod_b;
@@ -735,18 +1097,18 @@ void shift_recursive_osc_sse_inp_c(complexf* in_out,
while (N_cplx)
{
- //inp_i[j] = in_out[CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j].i;
- //inp_q[j] = in_out[CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j].q;
- UNINTERLEAVE2(VLOAD_ALIGNED(u), VLOAD_ALIGNED(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */
+ //inp_i[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j].i;
+ //inp_q[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j].q;
+ UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */
//we multiply two complex numbers - similar to shift_math_cc
- //iof(in_out,CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j];
- //qof(in_out,CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j];
+ //iof(in_out,PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j];
+ //qof(in_out,PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j];
product_re = VSUB( VMUL(inp_re, u_cos), VMUL(inp_im, v_sin) );
product_im = VADD( VMUL(inp_im, u_cos), VMUL(inp_re, v_sin) );
INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
- VSTORE_ALIGNED(u, interl_prod_a);
- VSTORE_ALIGNED(u+1, interl_prod_b);
+ VSTORE(u, interl_prod_a);
+ VSTORE(u+1, interl_prod_b);
u += 2;
// update complex phasor - like incrementing phase
@@ -759,8 +1121,8 @@ void shift_recursive_osc_sse_inp_c(complexf* in_out,
N_cplx -= 4;
}
- VSTORE_ALIGNED( &state_ext->u_cos[0], u_cos );
- VSTORE_ALIGNED( &state_ext->v_sin[0], v_sin );
+ VSTORE( &state_ext->u_cos[0], u_cos );
+ VSTORE( &state_ext->v_sin[0], v_sin );
}
#else
diff --git a/pf_mixer.h b/pf_mixer.h
index ae21423..f407c21 100644
--- a/pf_mixer.h
+++ b/pf_mixer.h
@@ -55,18 +55,37 @@ typedef struct complexf_s { float i; float q; } complexf;
int have_sse_shift_mixer_impl();
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO A ***/
+/**************/
+
float shift_math_cc(complexf *input, complexf* output, int input_size, float rate, float starting_phase);
+/*********************************************************************/
+
+/**************/
+/*** ALGO B ***/
+/**************/
+
typedef struct shift_table_data_s
{
float* table;
int table_size;
} shift_table_data_t;
+
void shift_table_deinit(shift_table_data_t table_data);
shift_table_data_t shift_table_init(int table_size);
float shift_table_cc(complexf* input, complexf* output, int input_size, float rate, shift_table_data_t table_data, float starting_phase);
+/*********************************************************************/
+
+/**************/
+/*** ALGO C ***/
+/**************/
typedef struct shift_addfast_data_s
{
@@ -74,9 +93,17 @@ typedef struct shift_addfast_data_s
float dcos[4];
float phase_increment;
} shift_addfast_data_t;
+
shift_addfast_data_t shift_addfast_init(float rate);
float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase);
+float shift_addfast_inp_c(complexf *in_out, int N_cplx, shift_addfast_data_t* d, float starting_phase);
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO D ***/
+/**************/
typedef struct shift_unroll_data_s
{
@@ -85,59 +112,128 @@ typedef struct shift_unroll_data_s
float phase_increment;
int size;
} shift_unroll_data_t;
+
shift_unroll_data_t shift_unroll_init(float rate, int size);
void shift_unroll_deinit(shift_unroll_data_t* d);
float shift_unroll_cc(complexf *input, complexf* output, int size, shift_unroll_data_t* d, float starting_phase);
float shift_unroll_inp_c(complexf* in_out, int size, shift_unroll_data_t* d, float starting_phase);
+/*********************************************************************/
+
+/**************/
+/*** ALGO E ***/
+/**************/
+
/* similar to shift_unroll_cc() - but, have fixed and limited precalc size
* idea: smaller cache usage by table
* size must be multiple of CSDR_SHIFT_LIMITED_SIMD (= 4)
*/
-#define CSDR_SHIFT_LIMITED_UNROLL_SIZE 64
-#define CSDR_SHIFT_LIMITED_SIMD_SZ 4
+#define PF_SHIFT_LIMITED_UNROLL_SIZE 128
+#define PF_SHIFT_LIMITED_SIMD_SZ 4
+
typedef struct shift_limited_unroll_data_s
{
- float dcos[CSDR_SHIFT_LIMITED_UNROLL_SIZE];
- float dsin[CSDR_SHIFT_LIMITED_UNROLL_SIZE];
+ float dcos[PF_SHIFT_LIMITED_UNROLL_SIZE];
+ float dsin[PF_SHIFT_LIMITED_UNROLL_SIZE];
complexf complex_phase;
float phase_increment;
} shift_limited_unroll_data_t;
+
shift_limited_unroll_data_t shift_limited_unroll_init(float rate);
-/* size must be multiple of CSDR_SHIFT_LIMITED_SIMD_SZ */
+/* size must be multiple of PF_SHIFT_LIMITED_SIMD_SZ */
/* starting_phase for next call is kept internal in state */
void shift_limited_unroll_cc(const complexf *input, complexf* output, int size, shift_limited_unroll_data_t* d);
void shift_limited_unroll_inp_c(complexf* in_out, int size, shift_limited_unroll_data_t* d);
-typedef struct shift_limited_unroll_sse_data_s
+/*********************************************************************/
+
+/**************/
+/*** ALGO F ***/
+/**************/
+
+typedef struct shift_limited_unroll_A_sse_data_s
+{
+ /* small/limited trig table */
+ float dcos[PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ];
+ float dsin[PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ];
+ /* 4 times complex phase */
+ float phase_state_i[PF_SHIFT_LIMITED_SIMD_SZ];
+ float phase_state_q[PF_SHIFT_LIMITED_SIMD_SZ];
+ /* N_cplx_per_block times increment - for future parallel variants */
+ float dcos_blk;
+ float dsin_blk;
+ /* */
+ float phase_increment;
+} shift_limited_unroll_A_sse_data_t;
+
+shift_limited_unroll_A_sse_data_t shift_limited_unroll_A_sse_init(float relative_freq, float phase_start_rad);
+void shift_limited_unroll_A_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_A_sse_data_t* d);
+
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO G ***/
+/**************/
+
+typedef struct shift_limited_unroll_B_sse_data_s
{
/* small/limited trig table */
- float dcos[CSDR_SHIFT_LIMITED_UNROLL_SIZE+CSDR_SHIFT_LIMITED_SIMD_SZ];
- float dsin[CSDR_SHIFT_LIMITED_UNROLL_SIZE+CSDR_SHIFT_LIMITED_SIMD_SZ];
+ float dtrig[PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ];
/* 4 times complex phase */
- float phase_state_i[CSDR_SHIFT_LIMITED_SIMD_SZ];
- float phase_state_q[CSDR_SHIFT_LIMITED_SIMD_SZ];
+ float phase_state_i[PF_SHIFT_LIMITED_SIMD_SZ];
+ float phase_state_q[PF_SHIFT_LIMITED_SIMD_SZ];
/* N_cplx_per_block times increment - for future parallel variants */
float dcos_blk;
float dsin_blk;
/* */
float phase_increment;
-} shift_limited_unroll_sse_data_t;
-shift_limited_unroll_sse_data_t shift_limited_unroll_sse_init(float relative_freq, float phase_start_rad);
-void shift_limited_unroll_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_sse_data_t* d);
+} shift_limited_unroll_B_sse_data_t;
+
+shift_limited_unroll_B_sse_data_t shift_limited_unroll_B_sse_init(float relative_freq, float phase_start_rad);
+void shift_limited_unroll_B_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_B_sse_data_t* d);
+/*********************************************************************/
+/**************/
+/*** ALGO H ***/
+/**************/
+
+typedef struct shift_limited_unroll_C_sse_data_s
+{
+ /* small/limited trig table - interleaved: 4 cos, 4 sin, 4 cos, .. */
+ float dinterl_trig[2*(PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ)];
+ /* 4 times complex phase */
+ float phase_state_i[PF_SHIFT_LIMITED_SIMD_SZ];
+ float phase_state_q[PF_SHIFT_LIMITED_SIMD_SZ];
+ /* N_cplx_per_block times increment - for future parallel variants */
+ float dcos_blk;
+ float dsin_blk;
+ /* */
+ float phase_increment;
+} shift_limited_unroll_C_sse_data_t;
+
+shift_limited_unroll_C_sse_data_t shift_limited_unroll_C_sse_init(float relative_freq, float phase_start_rad);
+void shift_limited_unroll_C_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_C_sse_data_t* d);
+
+
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO I ***/
+/**************/
/* Recursive Quadrature Oscillator functions "recursive_osc"
* see https://www.vicanek.de/articles/QuadOsc.pdf
*/
-#define CSDR_SHIFT_RECURSIVE_SIMD_SZ 8
+#define PF_SHIFT_RECURSIVE_SIMD_SZ 8
typedef struct shift_recursive_osc_s
{
- float u_cos[CSDR_SHIFT_RECURSIVE_SIMD_SZ];
- float v_sin[CSDR_SHIFT_RECURSIVE_SIMD_SZ];
+ float u_cos[PF_SHIFT_RECURSIVE_SIMD_SZ];
+ float v_sin[PF_SHIFT_RECURSIVE_SIMD_SZ];
} shift_recursive_osc_t;
typedef struct shift_recursive_osc_conf_s
@@ -149,17 +245,23 @@ typedef struct shift_recursive_osc_conf_s
void shift_recursive_osc_init(float rate, float starting_phase, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t *state);
void shift_recursive_osc_update_rate(float rate, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
-/* size must be multiple of CSDR_SHIFT_LIMITED_SIMD_SZ */
+/* size must be multiple of PF_SHIFT_LIMITED_SIMD_SZ */
/* starting_phase for next call is kept internal in state */
void shift_recursive_osc_cc(const complexf *input, complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
void shift_recursive_osc_inp_c(complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
void gen_recursive_osc_c(complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
-#define CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ 4
+/*********************************************************************/
+
+/**************/
+/*** ALGO J ***/
+/**************/
+
+#define PF_SHIFT_RECURSIVE_SIMD_SSE_SZ 4
typedef struct shift_recursive_osc_sse_s
{
- float u_cos[CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ];
- float v_sin[CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ];
+ float u_cos[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ];
+ float v_sin[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ];
} shift_recursive_osc_sse_t;
typedef struct shift_recursive_osc_sse_conf_s