Upgrade pffft to b9819ffacac25e62727fc854f8c78522fb4eb2b3 am: 56310291a5

Original change: https://android-review.googlesource.com/c/platform/external/pffft/+/1536831 MUST ONLY BE SUBMITTED BY AUTOMERGER Change-Id: Ic106385f7187d3c1f7296a15e6752228c88cd21f
author: Haibo Huang <hhb@google.com> 2021-01-06 03:39:50 +0000
committer: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com> 2021-01-06 03:39:50 +0000
commit: f1dc9824465ecb52ba7220a5f4d02b0858f3fcd5 (patch)
tree: 7e5b45d174a2d4b038e8842bb4e0fbed7ec77914
parent: f72b8bc0eac730df148031eea3a5155e1391c869 (diff)
parent: 56310291a52a05dd2b856fcac112a3f91fb5a76d (diff)
download: pffft-f1dc9824465ecb52ba7220a5f4d02b0858f3fcd5.tar.gz
31 files changed, 9671 insertions, 49 deletions
diff --git a/.gitmodules b/.gitmodules
index 701a96c..9ef3633 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,3 +4,6 @@
 [submodule "kissfft"]
 	path = kissfft
 	url = https://github.com/hayguen/kissfft.git
+[submodule "pocketfft"]
+	path = pocketfft
+	url = https://github.com/hayguen/pocketfft.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9b6bc10..7856b75 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,16 +7,19 @@ option(USE_TYPE_DOUBLE "activate 'double' precision float?" ON)
 
 # architecture/optimization options
 option(USE_SIMD        "use SIMD (SSE/AVX/NEON/ALTIVEC) CPU features? - " ON)
+option(DISABLE_SIMD_AVX "disable AVX CPU features? - " OFF)
 option(USE_SIMD_NEON   "force using NEON on ARM? (requires USE_SIMD)" OFF)
 option(USE_SCALAR_VECT "use 4-element vector scalar operations (if no other SIMD)" ON)
 
 # test options
-option(USE_BENCH_FFTW  "use (system-installed) FFTW3 in fft benchmark?" OFF)
-option(USE_BENCH_GREEN "use Green FFT in fft benchmark? - if exists in subdir" ON)
-option(USE_BENCH_KISS  "use KissFFT in fft benchmark? - if exists in subdir" ON)
+option(USE_BENCH_FFTW   "use (system-installed) FFTW3 in fft benchmark?" OFF)
+option(USE_BENCH_GREEN  "use Green FFT in fft benchmark? - if exists in subdir" ON)
+option(USE_BENCH_KISS   "use KissFFT in fft benchmark? - if exists in subdir" ON)
+option(USE_BENCH_POCKET "use PocketFFT in fft benchmark? - if exists in subdir" ON)
 
 option(USE_DEBUG_ASAN  "use GCC's address sanitizer?" OFF)
 
+option(DISABLE_LINK_WITH_M "Disables linking with m library to build with clangCL from MSVC" OFF)
 
 # C90 requires the gcc extensions for function attributes like always_inline
 # C99 provides the function attributes: no gcc extensions required
@@ -61,6 +64,17 @@ if (USE_BENCH_KISS)
   endif()
 endif()
 
+if (USE_BENCH_POCKET)
+  # git submodule add https://github.com/hayguen/pocketfft.git
+  if (EXISTS "${CMAKE_CURRENT_LIST_DIR}/pocketfft/pocketfft_double.c")
+    message(STATUS "found subdir pocketfft")
+    set(PATH_POCKET "${CMAKE_CURRENT_LIST_DIR}/pocketfft")
+    add_subdirectory( "${PATH_POCKET}" )
+  else()
+    message(WARNING "PocketFFT not found in subdir pocketfft")
+  endif()
+endif()
+
 
 ########################################################################
 # select the release build type by default to get optimization flags
@@ -70,13 +84,23 @@ if(NOT CMAKE_BUILD_TYPE)
    message(STATUS "Build type not specified: defaulting to release.")
 endif(NOT CMAKE_BUILD_TYPE)
 
-if ("${CMAKE_C_COMPILER_ID}" STREQUAL "MSVC")
+if ( CMAKE_C_COMPILER_ID MATCHES "MSVC" )
   # using Visual Studio C++
   message(STATUS "INFO: detected MSVC: will not link math lib m")
   set(MATHLIB "")
+
+  add_definitions("/D_CRT_SECURE_NO_WARNINGS")
+
+  set(MSVC_DISABLED_WARNINGS_LIST
+      "C4996"
+  )
+
 else()
-  message(STATUS "INFO: detected NO MSVC: ${CMAKE_C_COMPILER_ID}: will link math lib m")
-  set(MATHLIB "m")
+  if(DISABLE_LINK_WITH_M)
+  else()
+    message(STATUS "INFO: detected NO MSVC: ${CMAKE_C_COMPILER_ID}: will link math lib m")
+    set(MATHLIB "m")
+  endif()
 endif()
 
 set( SIMD_FLOAT_HDRS simd/pf_float.h simd/pf_sse1_float.h simd/pf_altivec_float.h simd/pf_neon_float.h simd/pf_scalar_float.h )
@@ -114,10 +138,17 @@ if (USE_SIMD AND USE_SIMD_NEON)
 endif()
 if (USE_SIMD AND USE_TYPE_DOUBLE)
   if(WIN32)
-    set_property(SOURCE pffft_double.c PROPERTY COMPILE_FLAGS "/arch:AVX")
+    if(DISABLE_SIMD_AVX)
+      set_property(SOURCE pffft_double.c PROPERTY COMPILE_FLAGS "/arch:SSE2")
+    else()
+      set_property(SOURCE pffft_double.c PROPERTY COMPILE_FLAGS "/arch:AVX")
+    endif()
   else()
     set_property(SOURCE pffft_double.c PROPERTY COMPILE_FLAGS "-march=native")
   endif()
+  if(DISABLE_SIMD_AVX)
+    target_compile_definitions(PFFFT PRIVATE PFFFT_AVX_DISABLE=1)
+  endif()
 endif()
 target_link_libraries( PFFFT ${MATHLIB} )
 set_property(TARGET PFFFT APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES
@@ -126,6 +157,25 @@ set_property(TARGET PFFFT APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES
 
 ######################################################
 
+if (USE_TYPE_FLOAT)
+
+  add_library(PFDSP STATIC pf_mixer.cpp pf_mixer.h pf_carrier.cpp pf_carrier.h pf_cic.cpp pf_cic.h fmv.h )
+  target_compile_definitions(PFDSP PRIVATE _USE_MATH_DEFINES)
+  if (USE_DEBUG_ASAN)
+    target_compile_options(PFDSP PRIVATE "-fsanitize=address")
+  endif()
+  if (USE_SIMD AND USE_SIMD_NEON)
+    target_compile_definitions(PFDSP PRIVATE PFFFT_ENABLE_NEON=1)
+    target_compile_options(PFDSP PRIVATE "-march=armv7-a" "-mfpu=neon")
+  endif()
+  target_link_libraries( PFDSP ${MATHLIB} )
+  set_property(TARGET PFDSP APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+  )
+endif()
+
+######################################################
+
 add_library(FFTPACK STATIC fftpack.c fftpack.h)
 target_compile_definitions(FFTPACK PRIVATE _USE_MATH_DEFINES)
 target_link_libraries( FFTPACK ${MATHLIB} )
@@ -236,6 +286,12 @@ if (USE_TYPE_FLOAT)
     target_compile_definitions(bench_pffft_float PRIVATE HAVE_KISS_FFT=1)
     target_link_libraries(bench_pffft_float  KissFFT)
   endif()
+
+  if (PATH_POCKET AND USE_BENCH_POCKET)
+    target_compile_definitions(bench_pffft_float PRIVATE HAVE_POCKET_FFT=1)
+    target_link_libraries(bench_pffft_float  PocketFFT)
+  endif()
+
 endif()
 
 if (USE_TYPE_DOUBLE)
@@ -248,6 +304,22 @@ if (USE_TYPE_DOUBLE)
     target_compile_definitions(bench_pffft_double PRIVATE HAVE_FFTW=1)
     target_link_libraries(bench_pffft_double  fftw3)
   endif()
+
+  if (PATH_POCKET AND USE_BENCH_POCKET)
+    target_compile_definitions(bench_pffft_double PRIVATE HAVE_POCKET_FFT=1)
+    target_link_libraries(bench_pffft_double  PocketFFT)
+  endif()
+endif()
+
+######################################################
+
+if (USE_TYPE_FLOAT)
+  add_executable(bench_pf_mixer_float   bench_mixers.c)
+  target_compile_definitions(bench_pf_mixer_float PRIVATE _USE_MATH_DEFINES)
+  target_compile_definitions(bench_pf_mixer_float PRIVATE PFFFT_ENABLE_FLOAT)
+
+  target_link_libraries( bench_pf_mixer_float  PFDSP ${ASANLIB} )
+
 endif()
 
 ######################################################
diff --git a/METADATA b/METADATA
index 737bc49..f3b9f91 100644
--- a/METADATA
+++ b/METADATA
@@ -9,11 +9,11 @@ third_party {
     type: GIT
     value: "https://github.com/marton78/pffft.git"
   }
-  version: "7ed5e2a45040ae91c0b2fade901a503c77ccf381"
+  version: "b9819ffacac25e62727fc854f8c78522fb4eb2b3"
   license_type: NOTICE
   last_upgrade_date {
     year: 2020
-    month: 7
-    day: 11
+    month: 12
+    day: 29
   }
 }
diff --git a/README.md b/README.md
index 27dd530..f11e32f 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,11 @@ license is BSD-like.
 PFFASTCONV does fast convolution (FIR filtering), of single precision 
 real vectors, utilizing the PFFFT library. The license is BSD-like.
 
+PFDSP contains a few other signal processing functions.
+Currently, mixing and carrier generation functions are contained.
+It is work in progress - also the API!
+The fast convolution from PFFASTCONV might get merged into PFDSP.
+
 
 ## Why does it exist:
 
@@ -98,6 +103,11 @@ and `libPFFASTCONV.a` from the source files, plus the additional
 `libFFTPACK.a` library. Later one's sources are there anyway for the benchmark.
 
 
+## Origin:
+Origin for this code is Julien Pommier's pffft on bitbucket:
+[https://bitbucket.org/jpommier/pffft/](https://bitbucket.org/jpommier/pffft/)
+
+
 ## Comparison with other FFTs:
 
 The idea was not to break speed records, but to get a decently fast
@@ -114,6 +124,27 @@ It is also a bit focused on performing 1D convolutions, that is why it
 provides "unordered" FFTs , and a fourier domain convolution
 operation.
 
+Very interesting is [https://www.nayuki.io/page/free-small-fft-in-multiple-languages](https://www.nayuki.io/page/free-small-fft-in-multiple-languages).
+It shows how small an FFT can be - including the Bluestein algorithm, but it's everything else than fast.
+The whole C++ implementation file is 161 lines, including the Copyright header, see
+[https://github.com/nayuki/Nayuki-web-published-code/blob/master/free-small-fft-in-multiple-languages/FftComplex.cpp](https://github.com/nayuki/Nayuki-web-published-code/blob/master/free-small-fft-in-multiple-languages/FftComplex.cpp)
+
+## Dependencies / Required Linux packages
+
+On Debian/Ubuntu Linux following packages should be installed:
+
+```
+sudo apt-get install build-essential gcc g++ cmake
+```
+
+for benchmarking, you should have additional packages:
+```
+sudo apt-get install libfftw3-dev gnuplot
+```
+
+run the benchmarks with `./bench_all.sh ON` , to include benchmarks of fftw3 ..
+more details in README of [https://github.com/hayguen/pffft_benchmarks](https://github.com/hayguen/pffft_benchmarks)
+
 
 ## Benchmark results
 
diff --git a/bench_all.sh b/bench_all.sh
index 37be94c..b4c6b23 100755
--- a/bench_all.sh
+++ b/bench_all.sh
@@ -32,13 +32,13 @@ else
 fi
 
 
-#cmake -DCMAKE_TOOLCHAIN_FILE=ToolChain.cmake -DUSE_FFTW=${FFTW} -DUSE_SIMD=OFF ${CMAKEOPT} ../
+#cmake -DCMAKE_TOOLCHAIN_FILE=ToolChain.cmake -DUSE_BENCH_FFTW=${FFTW} -DUSE_SIMD=OFF ${CMAKEOPT} ../
 #make clean
 #make
 #echo -e "\n\nrunning without simd (==scalar) .."
 #time ctest -V
 
-cmake -DCMAKE_TOOLCHAIN_FILE=ToolChain.cmake -DUSE_FFTW=${FFTW} -DUSE_SIMD=ON ${CMAKEOPT} ../
+cmake -DCMAKE_TOOLCHAIN_FILE=ToolChain.cmake -DUSE_BENCH_FFTW=${FFTW} -DUSE_SIMD=ON ${CMAKEOPT} ../
 #make clean
 make
 echo -e "\n\nrunning with simd .."
diff --git a/bench_mixers.c b/bench_mixers.c
new file mode 100644
index 0000000..5b22b3f
--- /dev/null
+++ b/bench_mixers.c
@@ -0,0 +1,730 @@
+/*
+  Copyright (c) 2020  Hayati Ayguen ( h_ayguen@web.de )
+
+  bench for mixer algorithm/implementations
+
+ */
+
+#include <pf_mixer.h>
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <assert.h>
+#include <string.h>
+
+#define HAVE_SYS_TIMES
+
+#ifdef HAVE_SYS_TIMES
+#  include <sys/times.h>
+#  include <unistd.h>
+#endif
+
+#define BENCH_REF_TRIG_FUNC       1
+#define BENCH_OUT_OF_PLACE_ALGOS  0
+#define BENCH_INPLACE_ALGOS       1
+
+#define SAVE_BY_DEFAULT  0
+#define SAVE_LIMIT_MSPS           16
+
+#if 0
+  #define BENCH_FILE_SHIFT_MATH_CC           "/home/ayguen/WindowsDesktop/mixer_test/A_shift_math_cc.bin"
+  #define BENCH_FILE_ADD_FAST_CC             "/home/ayguen/WindowsDesktop/mixer_test/C_shift_addfast_cc.bin"
+  #define BENCH_FILE_ADD_FAST_INP_C          "/home/ayguen/WindowsDesktop/mixer_test/C_shift_addfast_inp_c.bin"
+  #define BENCH_FILE_UNROLL_INP_C            "/home/ayguen/WindowsDesktop/mixer_test/D_shift_unroll_inp_c.bin"
+  #define BENCH_FILE_LTD_UNROLL_INP_C        "/home/ayguen/WindowsDesktop/mixer_test/E_shift_limited_unroll_inp_c.bin"
+  #define BENCH_FILE_LTD_UNROLL_A_SSE_INP_C  "/home/ayguen/WindowsDesktop/mixer_test/F_shift_limited_unroll_A_sse_inp_c.bin"
+  #define BENCH_FILE_LTD_UNROLL_B_SSE_INP_C  "/home/ayguen/WindowsDesktop/mixer_test/G_shift_limited_unroll_B_sse_inp_c.bin"
+  #define BENCH_FILE_LTD_UNROLL_C_SSE_INP_C  "/home/ayguen/WindowsDesktop/mixer_test/H_shift_limited_unroll_C_sse_inp_c.bin"
+  #define BENCH_FILE_REC_OSC_CC              ""
+  #define BENCH_FILE_REC_OSC_INP_C           "/home/ayguen/WindowsDesktop/mixer_test/I_shift_recursive_osc_inp_c.bin"
+  #define BENCH_FILE_REC_OSC_SSE_INP_C       "/home/ayguen/WindowsDesktop/mixer_test/J_shift_recursive_osc_sse_inp_c.bin"
+#else
+  #define BENCH_FILE_SHIFT_MATH_CC           ""
+  #define BENCH_FILE_ADD_FAST_CC             ""
+  #define BENCH_FILE_ADD_FAST_INP_C          ""
+  #define BENCH_FILE_UNROLL_INP_C            ""
+  #define BENCH_FILE_LTD_UNROLL_INP_C        ""
+  #define BENCH_FILE_LTD_UNROLL_A_SSE_INP_C  ""
+  #define BENCH_FILE_LTD_UNROLL_B_SSE_INP_C  ""
+  #define BENCH_FILE_LTD_UNROLL_C_SSE_INP_C  ""
+  #define BENCH_FILE_REC_OSC_CC              ""
+  #define BENCH_FILE_REC_OSC_INP_C           ""
+  #define BENCH_FILE_REC_OSC_SSE_INP_C       ""
+#endif
+
+
+
+#if defined(HAVE_SYS_TIMES)
+    static double ttclk = 0.;
+
+    static double uclock_sec(int find_start)
+    {
+        struct tms t0, t;
+        if (ttclk == 0.)
+        {
+            ttclk = sysconf(_SC_CLK_TCK);
+            fprintf(stderr, "sysconf(_SC_CLK_TCK) => %f\n", ttclk);
+        }
+        times(&t);
+        if (find_start)
+        {
+            t0 = t;
+            while (t0.tms_utime == t.tms_utime)
+                times(&t);
+        }
+        /* use only the user time of this process - not realtime, which depends on OS-scheduler .. */
+        return ((double)t.tms_utime) / ttclk;
+    }
+
+#elif 0
+    // https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-getprocesstimes
+    double uclock_sec(int find_start)
+    {
+        FILETIME a, b, c, d;
+        if (GetProcessTimes(GetCurrentProcess(), &a, &b, &c, &d) != 0)
+        {
+            //  Returns total user time.
+            //  Can be tweaked to include kernel times as well.
+            return
+                (double)(d.dwLowDateTime |
+                    ((unsigned long long)d.dwHighDateTime << 32)) * 0.0000001;
+        }
+        else {
+            //  Handle error
+            return 0;
+        }
+    }
+
+#else
+    double uclock_sec(int find_start)
+    { return (double)clock()/(double)CLOCKS_PER_SEC; }
+#endif
+
+
+void save(complexf * d, int B, int N, const char * fn)
+{
+    if (!fn || !fn[0])
+    {
+        if (! SAVE_BY_DEFAULT)
+            return;
+        fn = "/dev/shm/bench.bin";
+    }
+    FILE * f = fopen(fn, "wb");
+    if (!f) {
+        fprintf(stderr, "error writing result to %s\n", fn);
+        return;
+    }
+    if ( N >= SAVE_LIMIT_MSPS * 1024 * 1024 )
+        N = SAVE_LIMIT_MSPS * 1024 * 1024;
+    for (int off = 0; off + B <= N; off += B)
+    {
+        fwrite(d+off, sizeof(complexf), B, f);
+    }
+    fclose(f);
+}
+
+
+double bench_shift_math_cc(int B, int N) {
+    double t0, t1, tstop, T, nI;
+    int iter, off;
+    float phase = 0.0F;
+    complexf *input = (complexf *)malloc(N * sizeof(complexf));
+    complexf *output = (complexf *)malloc(N * sizeof(complexf));
+    shift_recursive_osc_t gen_state;
+    shift_recursive_osc_conf_t gen_conf;
+
+    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+    iter = 0;
+    off = 0;
+    t0 = uclock_sec(1);
+    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
+    do {
+        // work
+        phase = shift_math_cc(input+off, output+off, B, -0.0009F, phase);
+        off += B;
+        ++iter;
+        t1 = uclock_sec(0);
+    } while ( t1 < tstop && off + B < N );
+
+    save(output, B, off, BENCH_FILE_SHIFT_MATH_CC);
+
+    free(input);
+    free(output);
+    T = ( t1 - t0 );  /* duration per fft() */
+    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
+    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
+    return (nI / T);    /* normalized iterations per second */
+}
+
+
+double bench_shift_table_cc(int B, int N) {
+    double t0, t1, tstop, T, nI;
+    int iter, off;
+    int table_size=65536;
+    float phase = 0.0F;
+    complexf *input = (complexf *)malloc(N * sizeof(complexf));
+    complexf *output = (complexf *)malloc(N * sizeof(complexf));
+    shift_recursive_osc_t gen_state;
+    shift_recursive_osc_conf_t gen_conf;
+
+    shift_table_data_t table_data = shift_table_init(table_size);
+
+    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+    iter = 0;
+    off = 0;
+    t0 = uclock_sec(1);
+    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
+    do {
+        // work
+        phase = shift_table_cc(input+off, output+off, B, -0.0009F, table_data, phase);
+
+        off += B;
+        ++iter;
+        t1 = uclock_sec(0);
+    } while ( t1 < tstop && off + B < N );
+
+    save(output, B, off, NULL);
+    free(input);
+    free(output);
+    T = ( t1 - t0 );  /* duration per fft() */
+    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
+    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
+    return (nI / T);    /* normalized iterations per second */
+}
+
+
+double bench_shift_addfast(int B, int N) {
+    double t0, t1, tstop, T, nI;
+    int iter, off;
+    float phase = 0.0F;
+    complexf *input = (complexf *)malloc(N * sizeof(complexf));
+    complexf *output = (complexf *)malloc(N * sizeof(complexf));
+    shift_recursive_osc_t gen_state;
+    shift_recursive_osc_conf_t gen_conf;
+    shift_addfast_data_t state = shift_addfast_init(-0.0009F);
+
+    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+    iter = 0;
+    off = 0;
+    t0 = uclock_sec(1);
+    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
+    do {
+        // work
+        phase = shift_addfast_cc(input+off, output+off, B, &state, phase);
+
+        off += B;
+        ++iter;
+        t1 = uclock_sec(0);
+    } while ( t1 < tstop && off + B < N );
+
+    save(output, B, off, BENCH_FILE_ADD_FAST_CC);
+
+    free(input);
+    free(output);
+    T = ( t1 - t0 );  /* duration per fft() */
+    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
+    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
+    return (nI / T);    /* normalized iterations per second */
+}
+
+double bench_shift_addfast_inp(int B, int N) {
+    double t0, t1, tstop, T, nI;
+    int iter, off;
+    float phase = 0.0F;
+    complexf *input = (complexf *)malloc(N * sizeof(complexf));
+    shift_recursive_osc_t gen_state;
+    shift_recursive_osc_conf_t gen_conf;
+    shift_addfast_data_t state = shift_addfast_init(-0.0009F);
+
+    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+    iter = 0;
+    off = 0;
+    t0 = uclock_sec(1);
+    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
+    do {
+        // work
+        phase = shift_addfast_inp_c(input+off, B, &state, phase);
+
+        off += B;
+        ++iter;
+        t1 = uclock_sec(0);
+    } while ( t1 < tstop && off + B < N );
+
+    save(input, B, off, BENCH_FILE_ADD_FAST_INP_C);
+
+    free(input);
+    T = ( t1 - t0 );  /* duration per fft() */
+    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
+    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
+    return (nI / T);    /* normalized iterations per second */
+}
+
+
+double bench_shift_unroll_oop(int B, int N) {
+    double t0, t1, tstop, T, nI;
+    int iter, off;
+    float phase = 0.0F;
+    complexf *input = (complexf *)malloc(N * sizeof(complexf));
+    complexf *output = (complexf *)malloc(N * sizeof(complexf));
+    shift_recursive_osc_t gen_state;
+    shift_recursive_osc_conf_t gen_conf;
+    shift_unroll_data_t state = shift_unroll_init(-0.0009F, B);
+
+    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+    iter = 0;
+    off = 0;
+    t0 = uclock_sec(1);
+    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
+    do {
+        // work
+        phase = shift_unroll_cc(input+off, output+off, B, &state, phase);
+
+        off += B;
+        ++iter;
+        t1 = uclock_sec(0);
+    } while ( t1 < tstop && off + B < N );
+
+    save(output, B, off, NULL);
+    free(input);
+    free(output);
+    T = ( t1 - t0 );  /* duration per fft() */
+    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
+    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
+    return (nI / T);    /* normalized iterations per second */
+}
+
+double bench_shift_unroll_inp(int B, int N) {
+    double t0, t1, tstop, T, nI;
+    int iter, off;
+    float phase = 0.0F;
+    complexf *input = (complexf *)malloc(N * sizeof(complexf));
+    shift_recursive_osc_t gen_state;
+    shift_recursive_osc_conf_t gen_conf;
+    shift_unroll_data_t state = shift_unroll_init(-0.0009F, B);
+
+    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+    iter = 0;
+    off = 0;
+    t0 = uclock_sec(1);
+    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
+    do {
+        // work
+        phase = shift_unroll_inp_c(input+off, B, &state, phase);
+
+        off += B;
+        ++iter;
+        t1 = uclock_sec(0);
+    } while ( t1 < tstop && off + B < N );
+
+    save(input, B, off, BENCH_FILE_UNROLL_INP_C);
+
+    free(input);
+    T = ( t1 - t0 );  /* duration per fft() */
+    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
+    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
+    return (nI / T);    /* normalized iterations per second */
+}
+
+
+
+double bench_shift_limited_unroll_oop(int B, int N) {
+    double t0, t1, tstop, T, nI;
+    int iter, off;
+    complexf *input = (complexf *)malloc(N * sizeof(complexf));
+    complexf *output = (complexf *)malloc(N * sizeof(complexf));
+    shift_recursive_osc_t gen_state;
+    shift_recursive_osc_conf_t gen_conf;
+    shift_limited_unroll_data_t state = shift_limited_unroll_init(-0.0009F);
+
+    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+    iter = 0;
+    off = 0;
+    t0 = uclock_sec(1);
+    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
+    do {
+        // work
+        shift_limited_unroll_cc(input+off, output+off, B, &state);
+
+        off += B;
+        ++iter;
+        t1 = uclock_sec(0);
+    } while ( t1 < tstop && off + B < N );
+
+    save(output, B, off, NULL);
+    free(input);
+    free(output);
+    T = ( t1 - t0 );  /* duration per fft() */
+    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
+    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
+    return (nI / T);    /* normalized iterations per second */
+}
+
+
+double bench_shift_limited_unroll_inp(int B, int N) {
+    double t0, t1, tstop, T, nI;
+    int iter, off;
+    complexf *input = (complexf *)malloc(N * sizeof(complexf));
+    shift_recursive_osc_t gen_state;
+    shift_recursive_osc_conf_t gen_conf;
+    shift_limited_unroll_data_t state = shift_limited_unroll_init(-0.0009F);
+
+    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+    iter = 0;
+    off = 0;
+    t0 = uclock_sec(1);
+    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
+    do {
+        // work
+        shift_limited_unroll_inp_c(input+off, B, &state);
+
+        off += B;
+        ++iter;
+        t1 = uclock_sec(0);
+    } while ( t1 < tstop && off + B < N );
+
+    save(input, B, off, BENCH_FILE_LTD_UNROLL_INP_C);
+
+    free(input);
+    T = ( t1 - t0 );  /* duration per fft() */
+    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
+    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
+    return (nI / T);    /* normalized iterations per second */
+}
+
+
+double bench_shift_limited_unroll_A_sse_inp(int B, int N) {
+    double t0, t1, tstop, T, nI;
+    int iter, off;
+    complexf *input = (complexf *)malloc(N * sizeof(complexf));
+    shift_recursive_osc_t gen_state;
+    shift_recursive_osc_conf_t gen_conf;
+    shift_limited_unroll_A_sse_data_t *state = malloc(sizeof(shift_limited_unroll_A_sse_data_t));
+
+    *state = shift_limited_unroll_A_sse_init(-0.0009F, 0.0F);
+
+    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+    iter = 0;
+    off = 0;
+    t0 = uclock_sec(1);
+    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
+    do {
+        // work
+        shift_limited_unroll_A_sse_inp_c(input+off, B, state);
+
+        off += B;
+        ++iter;
+        t1 = uclock_sec(0);
+    } while ( t1 < tstop && off + B < N );
+
+    save(input, B, off, BENCH_FILE_LTD_UNROLL_A_SSE_INP_C);
+    
+    free(input);
+    T = ( t1 - t0 );  /* duration per fft() */
+    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
+    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
+    return (nI / T);    /* normalized iterations per second */
+}
+
+double bench_shift_limited_unroll_B_sse_inp(int B, int N) {
+    double t0, t1, tstop, T, nI;
+    int iter, off;
+    complexf *input = (complexf *)malloc(N * sizeof(complexf));
+    shift_recursive_osc_t gen_state;
+    shift_recursive_osc_conf_t gen_conf;
+    shift_limited_unroll_B_sse_data_t *state = malloc(sizeof(shift_limited_unroll_B_sse_data_t));
+
+    *state = shift_limited_unroll_B_sse_init(-0.0009F, 0.0F);
+
+    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+    //shift_recursive_osc_init(0.0F, 0.0F, &gen_conf, &gen_state);
+    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+    iter = 0;
+    off = 0;
+    t0 = uclock_sec(1);
+    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
+    do {
+        // work
+        shift_limited_unroll_B_sse_inp_c(input+off, B, state);
+
+        off += B;
+        ++iter;
+        t1 = uclock_sec(0);
+    } while ( t1 < tstop && off + B < N );
+
+    save(input, B, off, BENCH_FILE_LTD_UNROLL_B_SSE_INP_C);
+    
+    free(input);
+    T = ( t1 - t0 );  /* duration per fft() */
+    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
+    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
+    return (nI / T);    /* normalized iterations per second */
+}
+
+double bench_shift_limited_unroll_C_sse_inp(int B, int N) {
+    double t0, t1, tstop, T, nI;
+    int iter, off;
+    complexf *input = (complexf *)malloc(N * sizeof(complexf));
+    shift_recursive_osc_t gen_state;
+    shift_recursive_osc_conf_t gen_conf;
+    shift_limited_unroll_C_sse_data_t *state = malloc(sizeof(shift_limited_unroll_C_sse_data_t));
+
+    *state = shift_limited_unroll_C_sse_init(-0.0009F, 0.0F);
+
+    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+    iter = 0;
+    off = 0;
+    t0 = uclock_sec(1);
+    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
+    do {
+        // work
+        shift_limited_unroll_C_sse_inp_c(input+off, B, state);
+
+        off += B;
+        ++iter;
+        t1 = uclock_sec(0);
+    } while ( t1 < tstop && off + B < N );
+
+    save(input, B, off, BENCH_FILE_LTD_UNROLL_C_SSE_INP_C);
+    
+    free(input);
+    T = ( t1 - t0 );  /* duration per fft() */
+    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
+    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
+    return (nI / T);    /* normalized iterations per second */
+}
+
+
+double bench_shift_rec_osc_cc_oop(int B, int N) {
+    double t0, t1, tstop, T, nI;
+    int iter, off;
+    float phase = 0.0F;
+    complexf *input = (complexf *)malloc(N * sizeof(complexf));
+    complexf *output = (complexf *)malloc(N * sizeof(complexf));
+    shift_recursive_osc_t gen_state, shift_state;
+    shift_recursive_osc_conf_t gen_conf, shift_conf;
+
+    shift_recursive_osc_init(-0.0009F, 0.0F, &shift_conf, &shift_state);
+    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+    iter = 0;
+    off = 0;
+    t0 = uclock_sec(1);
+    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
+    do {
+        // work
+        shift_recursive_osc_cc(input+off, output+off, B, &shift_conf, &shift_state);
+
+        off += B;
+        ++iter;
+        t1 = uclock_sec(0);
+    } while ( t1 < tstop && off + B < N );
+
+    save(input, B, off, BENCH_FILE_REC_OSC_CC);
+
+    save(output, B, off, NULL);
+    free(input);
+    free(output);
+    T = ( t1 - t0 );  /* duration per fft() */
+    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
+    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
+    return (nI / T);    /* normalized iterations per second */
+}
+
+
+double bench_shift_rec_osc_cc_inp(int B, int N) {
+    double t0, t1, tstop, T, nI;
+    int iter, off;
+    float phase = 0.0F;
+    complexf *input = (complexf *)malloc(N * sizeof(complexf));
+    shift_recursive_osc_t gen_state, shift_state;
+    shift_recursive_osc_conf_t gen_conf, shift_conf;
+
+    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+    shift_recursive_osc_init(-0.0009F, 0.0F, &shift_conf, &shift_state);
+
+    iter = 0;
+    off = 0;
+    t0 = uclock_sec(1);
+    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
+    do {
+        // work
+        shift_recursive_osc_inp_c(input+off, B, &shift_conf, &shift_state);
+
+        off += B;
+        ++iter;
+        t1 = uclock_sec(0);
+    } while ( t1 < tstop && off + B < N );
+
+    save(input, B, off, BENCH_FILE_REC_OSC_INP_C);
+    free(input);
+    T = ( t1 - t0 );  /* duration per fft() */
+    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
+    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
+    return (nI / T);    /* normalized iterations per second */
+}
+
+
+double bench_shift_rec_osc_sse_c_inp(int B, int N) {
+    double t0, t1, tstop, T, nI;
+    int iter, off;
+    float phase = 0.0F;
+    complexf *input = (complexf *)malloc(N * sizeof(complexf));
+    shift_recursive_osc_t gen_state;
+    shift_recursive_osc_conf_t gen_conf;
+
+    shift_recursive_osc_sse_t *shift_state = malloc(sizeof(shift_recursive_osc_sse_t));
+    shift_recursive_osc_sse_conf_t shift_conf;
+
+    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+    shift_recursive_osc_sse_init(-0.0009F, 0.0F, &shift_conf, shift_state);
+
+    iter = 0;
+    off = 0;
+    t0 = uclock_sec(1);
+    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
+    do {
+        // work
+        shift_recursive_osc_sse_inp_c(input+off, B, &shift_conf, shift_state);
+
+        off += B;
+        ++iter;
+        t1 = uclock_sec(0);
+    } while ( t1 < tstop && off + B < N );
+
+    save(input, B, off, BENCH_FILE_REC_OSC_SSE_INP_C);
+    free(input);
+    T = ( t1 - t0 );  /* duration per fft() */
+    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
+    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
+    return (nI / T);    /* normalized iterations per second */
+}
+
+
+
+int main(int argc, char **argv)
+{
+    double rt;
+
+    // process up to 64 MSample (512 MByte) in blocks of 8 kSamples (=64 kByte)
+    int B = 8 * 1024;
+    int N = 64 * 1024 * 1024;
+    int showUsage = 0;
+
+    if (argc == 1)
+        showUsage = 1;
+
+    if (1 < argc)
+        B = atoi(argv[1]);
+    if (2 < argc)
+        N = atoi(argv[2]) * 1024 * 1024;
+
+    if ( !B || !N || showUsage )
+    {
+        fprintf(stderr, "%s [<blockLength in samples> [<total # of MSamples>] ]\n", argv[0]);
+        if ( !B || !N )
+            return 0;
+    }
+
+    fprintf(stderr, "processing up to N = %d MSamples with blocke length of %d samples\n",
+        N / (1024 * 1024), B );
+
+
+#if BENCH_REF_TRIG_FUNC
+    printf("\nstarting bench of shift_math_cc (out-of-place) with trig functions ..\n");
+    rt = bench_shift_math_cc(B, N);
+    printf("  %f MSamples/sec\n\n", rt * 1E-6);
+#endif
+
+#if BENCH_OUT_OF_PLACE_ALGOS
+    printf("starting bench of shift_table_cc (out-of-place) ..\n");
+    rt = bench_shift_table_cc(B, N);
+    printf("  %f MSamples/sec\n\n", rt * 1E-6);
+
+    printf("starting bench of shift_addfast_cc (out-of-place) ..\n");
+    rt = bench_shift_addfast(B, N);
+    printf("  %f MSamples/sec\n\n", rt * 1E-6);
+
+    printf("\nstarting bench of shift_unroll_cc (out-of-place) ..\n");
+    rt = bench_shift_unroll_oop(B, N);
+    printf("  %f MSamples/sec\n\n", rt * 1E-6);
+
+    printf("\nstarting bench of shift_limited_unroll_cc (out-of-place) ..\n");
+    rt = bench_shift_limited_unroll_oop(B, N);
+    printf("  %f MSamples/sec\n\n", rt * 1E-6);
+
+    printf("\nstarting bench of shift_recursive_osc_cc (out-of-place) ..\n");
+    rt = bench_shift_rec_osc_cc_oop(B, N);
+    printf("  %f MSamples/sec\n\n", rt * 1E-6);
+#endif
+
+#if BENCH_INPLACE_ALGOS
+
+    printf("starting bench of shift_addfast_inp_c in-place ..\n");
+    rt = bench_shift_addfast_inp(B, N);
+    printf("  %f MSamples/sec\n\n", rt * 1E-6);
+
+    printf("starting bench of shift_unroll_inp_c in-place ..\n");
+    rt = bench_shift_unroll_inp(B, N);
+    printf("  %f MSamples/sec\n\n", rt * 1E-6);
+
+    printf("starting bench of shift_limited_unroll_inp_c in-place ..\n");
+    rt = bench_shift_limited_unroll_inp(B, N);
+    printf("  %f MSamples/sec\n\n", rt * 1E-6);
+
+    if ( have_sse_shift_mixer_impl() )
+    {
+        printf("starting bench of shift_limited_unroll_A_sse_inp_c in-place ..\n");
+        rt = bench_shift_limited_unroll_A_sse_inp(B, N);
+        printf("  %f MSamples/sec\n\n", rt * 1E-6);
+
+        printf("starting bench of shift_limited_unroll_B_sse_inp_c in-place ..\n");
+        rt = bench_shift_limited_unroll_B_sse_inp(B, N);
+        printf("  %f MSamples/sec\n\n", rt * 1E-6);
+
+        printf("starting bench of shift_limited_unroll_C_sse_inp_c in-place ..\n");
+        rt = bench_shift_limited_unroll_C_sse_inp(B, N);
+        printf("  %f MSamples/sec\n\n", rt * 1E-6);
+    }
+
+    printf("starting bench of shift_recursive_osc_cc in-place ..\n");
+    rt = bench_shift_rec_osc_cc_inp(B, N);
+    printf("  %f MSamples/sec\n\n", rt * 1E-6);
+
+    if ( have_sse_shift_mixer_impl() )
+    {
+        printf("starting bench of shift_recursive_osc_sse_c in-place ..\n");
+        rt = bench_shift_rec_osc_sse_c_inp(B, N);
+        printf("  %f MSamples/sec\n\n", rt * 1E-6);
+    }
+#endif
+
+    return 0;
+}
+
diff --git a/bench_pffft.c b/bench_pffft.c
index 49d4faa..e3a9e9e 100644
--- a/bench_pffft.c
+++ b/bench_pffft.c
@@ -26,6 +26,7 @@
  */
 
 #define CONCAT_TOKENS(A, B)  A ## B
+#define CONCAT_THREE_TOKENS(A, B, C)  A ## B ## C
 
 #ifdef PFFFT_ENABLE_FLOAT
 #include "pffft.h"
@@ -45,7 +46,6 @@ typedef PFFFTD_Setup PFFFT_SETUP;
 #define PFFFT_FUNC(F)  CONCAT_TOKENS(pffftd_, F)
 #endif
 
-
 #ifdef PFFFT_ENABLE_FLOAT
 
 #include "fftpack.h"
@@ -61,6 +61,25 @@ typedef PFFFTD_Setup PFFFT_SETUP;
 
 #endif
 
+#ifdef HAVE_POCKET_FFT
+#include <pocketfft_double.h>
+#include <pocketfft_single.h>
+#endif
+
+#ifdef PFFFT_ENABLE_FLOAT
+  #define POCKFFTR_PRE(R)   CONCAT_TOKENS(rffts, R)
+  #define POCKFFTC_PRE(R)   CONCAT_TOKENS(cffts, R)
+  #define POCKFFTR_MID(L,R) CONCAT_THREE_TOKENS(L, rffts, R)
+  #define POCKFFTC_MID(L,R) CONCAT_THREE_TOKENS(L, cffts, R)
+#else
+  #define POCKFFTR_PRE(R)   CONCAT_TOKENS(rfft, R)
+  #define POCKFFTC_PRE(R)   CONCAT_TOKENS(cfft, R)
+  #define POCKFFTR_MID(L,R) CONCAT_THREE_TOKENS(L, rfft, R)
+  #define POCKFFTC_MID(L,R) CONCAT_THREE_TOKENS(L, cfft, R)
+#endif
+
+
+
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -97,7 +116,7 @@ typedef fftw_complex FFTW_COMPLEX;
 #endif
 
 
-#define NUM_FFT_ALGOS  8
+#define NUM_FFT_ALGOS  9
 enum {
   ALGO_FFTPACK = 0,
   ALGO_VECLIB,
@@ -105,8 +124,9 @@ enum {
   ALGO_FFTW_AUTO,
   ALGO_GREEN,
   ALGO_KISS,
-  ALGO_PFFFT_U, /* = 6 */
-  ALGO_PFFFT_O  /* = 7 */
+  ALGO_POCKET,
+  ALGO_PFFFT_U, /* = 7 */
+  ALGO_PFFFT_O  /* = 8 */
 };
 
 #define NUM_TYPES      7
@@ -128,6 +148,7 @@ const char * algoName[NUM_FFT_ALGOS] = {
   "FFTW F(auto) ",
   "Green        ",
   "Kiss         ",
+  "Pocket       ",
   "PFFFT-U(simd)",  /* unordered */
   "PFFFT (simd) "   /* ordered */
 };
@@ -160,6 +181,11 @@ int compiledInAlgo[NUM_FFT_ALGOS] = {
 #else
   0,
 #endif
+#if defined(HAVE_POCKET_FFT)
+  1, /* "Pocket     " */
+#else
+  0,
+#endif
   1, /* "PFFFT_U    " */
   1  /* "PFFFT_O    " */
 };
@@ -171,6 +197,7 @@ const char * algoTableHeader[NUM_FFT_ALGOS][2] = {
 { "|real FFTWauto ", "|cplx FFTWauto " },
 { "|  real  Green ", "|  cplx  Green " },
 { "|  real   Kiss ", "|  cplx   Kiss " },
+{ "|  real Pocket ", "|  cplx Pocket " },
 { "| real PFFFT-U ", "| cplx PFFFT-U " },
 { "|  real  PFFFT ", "|  cplx  PFFFT " } };
 
@@ -271,8 +298,8 @@ void pffft_validate_N(int N, int cplx) {
     if (pass == 0) {
       float *wrk = malloc(2*Nbytes+15*sizeof(pffft_scalar));
       for (k=0; k < Nfloat; ++k) {
-        ref[k] = in[k] = frand()*2-1; 
-        out[k] = 1e30;
+        ref[k] = in[k] = (float)( frand()*2-1 );
+        out[k] = 1e30F;
       }
       if (!cplx) {
         rffti(N, wrk);
@@ -290,7 +317,7 @@ void pffft_validate_N(int N, int cplx) {
       free(wrk);
     }
 
-    for (k = 0; k < Nfloat; ++k) ref_max = MAX(ref_max, fabs(ref[k]));
+    for (k = 0; k < Nfloat; ++k) ref_max = MAX(ref_max, (float)( fabs(ref[k]) ));
 
       
     /* pass 0 : non canonical ordering of transform coefficients */
@@ -820,6 +847,67 @@ void benchmark_ffts(int N, int cplx, int withFFTWfullMeas, double iterCal, doubl
   }
 #endif
 
+#if defined(HAVE_POCKET_FFT)
+
+  Nmax = (cplx ? nextPow2N*2 : nextPow2N);
+  X[Nmax] = checkVal;
+  if ( 1 || PFFFT_FUNC(is_power_of_two)(N) )
+  {
+    POCKFFTR_PRE(_plan) planr;
+    POCKFFTC_PRE(_plan) planc;
+
+    te = uclock_sec();
+    if (cplx) {
+      planc = POCKFFTC_MID(make_,_plan)(nextPow2N);
+    } else {
+      planr = POCKFFTR_MID(make_,_plan)(nextPow2N);
+    }
+
+    t0 = uclock_sec();
+    tstop = t0 + max_test_duration;
+    max_iter = 0;
+    do {
+      for ( k = 0; k < step_iter; ++k ) {
+        if (cplx) {
+          assert( X[Nmax] == checkVal );
+          memcpy(Y, X, 2*nextPow2N * sizeof(pffft_scalar) );
+          POCKFFTC_PRE(_forward)(planc, Y, 1.);
+          assert( X[Nmax] == checkVal );
+          memcpy(X, Y, 2*nextPow2N * sizeof(pffft_scalar) );
+          POCKFFTC_PRE(_backward)(planc, X, 1./nextPow2N);
+          assert( X[Nmax] == checkVal );
+        } else {
+          assert( X[Nmax] == checkVal );
+          memcpy(Y, X, nextPow2N * sizeof(pffft_scalar) );
+          POCKFFTR_PRE(_forward)(planr, Y, 1.);
+          assert( X[Nmax] == checkVal );
+          memcpy(X, Y, nextPow2N * sizeof(pffft_scalar) );
+          POCKFFTR_PRE(_backward)(planr, X, 1./nextPow2N);
+          assert( X[Nmax] == checkVal );
+        }
+        ++max_iter;
+      }
+      t1 = uclock_sec();
+    } while ( t1 < tstop );
+
+    if (cplx) {
+      POCKFFTC_MID(destroy_,_plan)(planc);
+    } else {
+      POCKFFTR_MID(destroy_,_plan)(planr);
+    }
+
+    flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
+    tmeas[TYPE_ITER][ALGO_POCKET] = max_iter;
+    tmeas[TYPE_MFLOPS][ALGO_POCKET] = flops/1e6/(t1 - t0 + 1e-16);
+    tmeas[TYPE_DUR_TOT][ALGO_POCKET] = t1 - t0;
+    tmeas[TYPE_DUR_NS][ALGO_POCKET] = show_output("Pocket", N, cplx, flops, t0, t1, max_iter, tableFile);
+    tmeas[TYPE_PREP][ALGO_POCKET] = (t0 - te) * 1e3;
+    haveAlgo[ALGO_POCKET] = 1;
+  } else {
+    show_output("Pocket", N, cplx, -1, -1, -1, -1, tableFile);
+  }
+#endif
+
 
   /* PFFFT-U (unordered) benchmark */
   Nmax = (cplx ? pffftPow2N*2 : pffftPow2N);
@@ -977,9 +1065,9 @@ int main(int argc, char **argv) {
     -1 };
 #endif
 
-#define NUMPOW2FFTLENS  21
+#define NUMPOW2FFTLENS  22
 #define MAXNUMFFTLENS MAX( NUMPOW2FFTLENS, NUMNONPOW2LENS )
-  int Npow2[NUMPOW2FFTLENS];  /* exp = 1 .. 20, -1 */
+  int Npow2[NUMPOW2FFTLENS];  /* exp = 1 .. 21, -1 */
   const int *Nvalues = NULL;
   double tmeas[2][MAXNUMFFTLENS][NUM_TYPES][NUM_FFT_ALGOS];
   double iterCalReal, iterCalCplx;
diff --git a/fmv.h b/fmv.h
new file mode 100644
index 0000000..0aa439d
--- /dev/null
+++ b/fmv.h
@@ -0,0 +1,20 @@
+#ifndef FMV_H
+
+#if HAVE_FUNC_ATTRIBUTE_IFUNC
+#if defined(__has_attribute)
+#if __has_attribute(target_clones)
+#if defined(__x86_64)
+
+// see https://gcc.gnu.org/wiki/FunctionMultiVersioning
+#define PF_TARGET_CLONES __attribute__((target_clones("avx","sse4.2","sse3","sse2","sse","default")))
+#define HAVE_PF_TARGET_CLONES  1
+#endif
+#endif
+#endif
+#endif
+
+#ifndef PF_TARGET_CLONES
+#define PF_TARGET_CLONES
+#endif
+
+#endif
diff --git a/pf_carrier.cpp b/pf_carrier.cpp
new file mode 100644
index 0000000..d751a55
--- /dev/null
+++ b/pf_carrier.cpp
@@ -0,0 +1,298 @@
+/*
+This software is part of pffft/pfdsp, a set of simple DSP routines.
+
+Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
+Copyright (c) 2020  Hayati Ayguen <h_ayguen@web.de>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* include own header first, to see missing includes */
+#include "pf_carrier.h"
+#include "fmv.h"
+
+#include <limits.h>
+#include <assert.h>
+
+
+PF_TARGET_CLONES
+void generate_dc_f(float* output, int size)
+{
+    for(int i=0;i<2*size;)
+    {
+        /* exp(i*0) = 1+i*0 */
+        output[i++]=(127.0F / 128.0F);
+        output[i++]=0.0F;
+    }
+}
+
+PF_TARGET_CLONES
+void generate_dc_s16(short* output, int size)
+{
+    for(int i=0;i<2*size;)
+    {
+        /* exp(i*0) = 1+i*0 */
+        output[i++]=SHRT_MAX;
+        output[i++]=0;
+    }
+}
+
+PF_TARGET_CLONES
+void generate_pos_fs4_f(float* output, int size)
+{
+    /* size must be multiple of 4 */
+    assert(!(size&3));
+    for(int i=0;i<2*size;)
+    {
+        /* exp(i*0) = 1+i*0 */
+        output[i++]=(127.0F / 128.0F);
+        output[i++]=0.0F;
+        /* exp(i* +pi/2) = 0+i*1 */
+        output[i++]=0.0F;
+        output[i++]=(127.0F / 128.0F);
+        /* exp(i* +pi) = -1+i*0 */
+        output[i++]=(-127.0F / 128.0F);
+        output[i++]=0.0F;
+        /* exp(i* -pi/2) = 0+i*-1 */
+        output[i++]=0.0F;
+        output[i++]=(-127.0F / 128.0F);
+    }
+}
+
+PF_TARGET_CLONES
+void generate_pos_fs4_s16(short* output, int size)
+{
+    /* size must be multiple of 4 */
+    assert(!(size&3));
+    for(int i=0;i<2*size;)
+    {
+        /* exp(i*0) = 1+i*0 */
+        output[i++]=SHRT_MAX;
+        output[i++]=0;
+        /* exp(i* +pi/2) = 0+i*1 */
+        output[i++]=0;
+        output[i++]=SHRT_MAX;
+        /* exp(i* +pi) = -1+i*0 */
+        output[i++]=-SHRT_MAX;
+        output[i++]=0;
+        /* exp(i* -pi/2) = 0+i*-1 */
+        output[i++]=0;
+        output[i++]=-SHRT_MAX;
+    }
+}
+
+PF_TARGET_CLONES
+void generate_neg_fs4_f(float* output, int size)
+{
+    /* size must be multiple of 4 */
+    assert(!(size&3));
+    for(int i=0;i<2*size;)
+    {
+        /* exp(i*0) = 1+i*0 */
+        output[i++]=(127.0F / 128.0F);
+        output[i++]=0.0F;
+        /* exp(i* -pi/2) = 0+i*-1 */
+        output[i++]=0.0F;
+        output[i++]=(-127.0F / 128.0F);
+        /* exp(i* +pi) = -1+i*0 */
+        output[i++]=(-127.0F / 128.0F);
+        output[i++]=0.0F;
+        /* exp(i* +pi/2) = 0+i*1 */
+        output[i++]=0.0F;
+        output[i++]=(127.0F / 128.0F);
+    }
+}
+
+PF_TARGET_CLONES
+void generate_neg_fs4_s16(short* output, int size)
+{
+    /* size must be multiple of 4 */
+    assert(!(size&3));
+    for(int i=0;i<2*size;)
+    {
+        /* exp(i*0) = 1+i*0 */
+        output[i++]=SHRT_MAX;
+        output[i++]=0;
+        /* exp(i* -pi/2) = 0+i*-1 */
+        output[i++]=0;
+        output[i++]=-SHRT_MAX;
+        /* exp(i* +pi) = -1+i*0 */
+        output[i++]=-SHRT_MAX;
+        output[i++]=0;
+        /* exp(i* +pi/2) = 0+i*1 */
+        output[i++]=0;
+        output[i++]=SHRT_MAX;
+    }
+}
+
+/****************************************************/
+
+PF_TARGET_CLONES
+void generate_dc_pos_fs4_s16(short* output, int size)
+{
+    const int m = SHRT_MAX / 2;
+    /* size must be multiple of 4 */
+    assert(!(size&3));
+    for(int i=0;i<2*size;)
+    {
+        /* exp(i*0) = 1+1+i*0 */
+        output[i++]=m+m;
+        output[i++]=0;
+        /* exp(i* +pi/2) = 1+0+i*1 */
+        output[i++]=m+0;
+        output[i++]=m;
+        /* exp(i* +pi) = 1-1+i*0 */
+        output[i++]=m-m;
+        output[i++]=0;
+        /* exp(i* -pi/2) = 1+0+i*-1 */
+        output[i++]=m;
+        output[i++]=-m;
+    }
+}
+
+PF_TARGET_CLONES
+void generate_dc_neg_fs4_s16(short* output, int size)
+{
+    const int m = SHRT_MAX / 2;
+    /* size must be multiple of 4 */
+    assert(!(size&3));
+    for(int i=0;i<2*size;)
+    {
+        /* exp(i*0) = 1+1+i*0 */
+        output[i++]=m+m;
+        output[i++]=0;
+        /* exp(i* -pi/2) = 1+0+i*-1 */
+        output[i++]=m+0;
+        output[i++]=-m;
+        /* exp(i* +pi) = 1-1+i*0 */
+        output[i++]=m-m;
+        output[i++]=0;
+        /* exp(i* +pi/2) = 1+0+i*1 */
+        output[i++]=m+0;
+        output[i++]=m;
+    }
+}
+
+PF_TARGET_CLONES
+void generate_pos_neg_fs4_s16(short* output, int size)
+{
+    const int m = SHRT_MAX / 2;
+    /* size must be multiple of 4 */
+    assert(!(size&3));
+    for(int i=0;i<2*size;)
+    {
+        /* pos(0) + neg(0) = exp(i*  0   ) + exp(i*  0   ) =  1 +i*  0  +  1 +i*  0 */
+        output[i++]=m;
+        output[i++]=-m;
+
+        /* pos(1) + neg(1) = exp(i* +pi/2) + exp(i* -pi/2) =  0 +i*  1  +  0 +i* -1 */
+        output[i++]=-m;
+        output[i++]=m;
+
+        /* pos(2) + neg(2) = exp(i* +pi  ) + exp(i* +pi  ) = -1 +i*  0  + -1 +i*  0 */
+        output[i++]=-m;
+        output[i++]=m;
+
+        /* pos(3) + neg(3) = exp(i* -pi/2) + exp(i* +pi/2) =  0 +i* -1  +  0 +i*  1 */
+        output[i++]=m;
+        output[i++]=-m;
+    }
+}
+
+PF_TARGET_CLONES
+void generate_dc_pos_neg_fs4_s16(short* output, int size)
+{
+    const int m = SHRT_MAX / 2;
+    /* size must be multiple of 4 */
+    assert(!(size&3));
+    for(int i=0;i<2*size;)
+    {
+        /* dc + pos(0) + neg(0) = dc + exp(i*  0   ) + exp(i*  0   ) =  1 +i*  0  +  1 +i*  0 */
+        output[i++]=m+m;
+        output[i++]=-m;
+
+        /* dc + pos(1) + neg(1) = dc + exp(i* +pi/2) + exp(i* -pi/2) =  0 +i*  1  +  0 +i* -1 */
+        output[i++]=0;
+        output[i++]=m;
+
+        /* dc + pos(2) + neg(2) = dc + exp(i* +pi  ) + exp(i* +pi  ) = -1 +i*  0  + -1 +i*  0 */
+        output[i++]=0;
+        output[i++]=m;
+
+        /* dc + pos(3) + neg(3) = dc + exp(i* -pi/2) + exp(i* +pi/2) =  0 +i* -1  +  0 +i*  1 */
+        output[i++]=m+m;
+        output[i++]=-m;
+    }
+}
+
+
+PF_TARGET_CLONES
+void generate_pos_neg_fs2_s16(short* output, int size)
+{
+    const int m = SHRT_MAX / 2;
+    /* size must be multiple of 4 */
+    assert(!(size&3));
+    for(int i=0;i<2*size;)
+    {
+        /* dc + exp(i* 0 ) = +1 */
+        output[i++]=m;
+        output[i++]=0;
+        /* dc + exp(i* pi) = -1 */
+        output[i++]=-m;
+        output[i++]=0;
+        /* dc + exp(i* 0 ) = +1 */
+        output[i++]=m;
+        output[i++]=0;
+        /* dc + exp(i* pi) = -1 */
+        output[i++]=-m;
+        output[i++]=0;
+    }
+}
+
+PF_TARGET_CLONES
+void generate_dc_pos_neg_fs2_s16(short* output, int size)
+{
+    const int m = SHRT_MAX / 2;
+    /* size must be multiple of 4 */
+    assert(!(size&3));
+    for(int i=0;i<2*size;)
+    {
+        /* with dc = i*1 */
+        /* dc + exp(i* 0 ) = i*1 +1 */
+        output[i++]=m;
+        output[i++]=m;
+        /* dc + exp(i* pi) = i*1 -1 */
+        output[i++]=-m;
+        output[i++]=m;
+        /* dc + exp(i* 0 ) = i*1 +1 */
+        output[i++]=m;
+        output[i++]=m;
+        /* dc + exp(i* pi) = i*1 -1 */
+        output[i++]=-m;
+        output[i++]=m;
+    }
+}
+
+
diff --git a/pf_carrier.h b/pf_carrier.h
new file mode 100644
index 0000000..c328ce0
--- /dev/null
+++ b/pf_carrier.h
@@ -0,0 +1,75 @@
+/*
+This software is part of pffft/pfdsp, a set of simple DSP routines.
+
+Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
+Copyright (c) 2020  Hayati Ayguen <h_ayguen@web.de>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
+#include <stdio.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*
+   _____                      _
+  / ____|                    | |
+ | |     ___  _ __ ___  _ __ | | _____  __
+ | |    / _ \| '_ ` _ \| '_ \| |/ _ \ \/ /
+ | |___| (_) | | | | | | |_) | |  __/>  <
+  \_____\___/|_| |_| |_| .__/|_|\___/_/\_\
+                       | |
+                       |_|
+*/
+
+typedef struct complexf_s { float i; float q; } complexf;
+
+
+/* generation functions */
+void generate_dc_f(float* output, int size);
+void generate_dc_s16(short* output, int size);
+void generate_pos_fs4_f(float* output, int size);
+void generate_pos_fs4_s16(short* output, int size);
+void generate_neg_fs4_f(float* output, int size);
+void generate_neg_fs4_s16(short* output, int size);
+
+void generate_dc_pos_fs4_s16(short* output, int size);
+void generate_dc_neg_fs4_s16(short* output, int size);
+void generate_pos_neg_fs4_s16(short* output, int size);
+void generate_dc_pos_neg_fs4_s16(short* output, int size);
+
+void generate_pos_neg_fs2_s16(short* output, int size);
+void generate_dc_pos_neg_fs2_s16(short* output, int size);
+
+
+#ifdef __cplusplus
+}
+#endif
+
diff --git a/pf_cic.cpp b/pf_cic.cpp
new file mode 100644
index 0000000..34b90c5
--- /dev/null
+++ b/pf_cic.cpp
@@ -0,0 +1,252 @@
+/*
+This software is part of pffft/pfdsp, a set of simple DSP routines.
+
+Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
+Copyright (c) 2020  Hayati Ayguen <h_ayguen@web.de>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* include own header first, to see missing includes */
+#include "pf_cic.h"
+#include "fmv.h"
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+
+/*
+   ____ ___ ____   ____  ____   ____
+  / ___|_ _/ ___| |  _ \|  _ \ / ___|
+ | |    | | |     | | | | | | | |
+ | |___ | | |___  | |_| | |_| | |___
+  \____|___\____| |____/|____/ \____|
+*/
+
+#define SINESHIFT 12
+#define SINESIZE (1<<SINESHIFT)
+typedef int64_t cic_dt; // data type used for integrators and combs
+typedef struct {
+    int factor;
+    uint64_t phase;
+    float gain;
+    cic_dt ig0a, ig0b, ig1a, ig1b;
+    cic_dt comb0a, comb0b, comb1a, comb1b;
+    int16_t *sinetable;
+} cicddc_t;
+
+void *cicddc_init(int factor) {
+    int i;
+    int sinesize2 = SINESIZE * 5/4; // 25% extra to get cosine from the same table
+    cicddc_t *s;
+    s = (cicddc_t *)malloc(sizeof(cicddc_t));
+    memset(s, 0, sizeof(cicddc_t));
+
+    float sineamp = 32767.0f;
+    s->factor = factor;
+    s->gain = 1.0f / SHRT_MAX / sineamp / factor / factor / factor; // compensate for gain of 3 integrators
+
+    s->sinetable = (int16_t *)malloc(sinesize2 * sizeof(*s->sinetable));
+    double f = 2.0*M_PI / (double)SINESIZE;
+    for(i = 0; i < sinesize2; i++) {
+        s->sinetable[i] = sineamp * cos(f * i);
+    }
+    return s;
+}
+
+void cicddc_free(void *state) {
+    cicddc_t *s = (cicddc_t *)state;
+    free(s->sinetable);
+    free(s);
+}
+
+
+PF_TARGET_CLONES
+void cicddc_s16_c(void *state, int16_t *input, complexf *output, int outsize, float rate) {
+    cicddc_t *s = (cicddc_t *)state;
+    int k;
+    int factor = s->factor;
+    cic_dt ig0a = s->ig0a, ig0b = s->ig0b, ig1a = s->ig1a, ig1b = s->ig1b;
+    cic_dt comb0a = s->comb0a, comb0b = s->comb0b, comb1a = s->comb1a, comb1b = s->comb1b;
+    uint64_t phase = s->phase, freq;
+    int16_t *sinetable = s->sinetable;
+    float gain = s->gain;
+
+    freq = rate * ((float)(1ULL << 63) * 2);
+
+    int16_t *inp = input;
+    for(k = 0; k < outsize; k++) {
+        int i;
+        cic_dt out0a, out0b, out1a, out1b;
+        cic_dt ig2a = 0, ig2b = 0; // last integrator and first comb replaced simply by sum
+        for(i = 0; i < factor; i++) {
+            cic_dt in_a, in_b;
+            int sinep = phase >> (64-SINESHIFT);
+            in_a = (int32_t)inp[i] * (int32_t)sinetable[sinep + (1<<(SINESHIFT-2))];
+            in_b = (int32_t)inp[i] * (int32_t)sinetable[sinep];
+            phase += freq;
+            /* integrators:
+            The calculations are ordered so that each integrator
+            takes a result from previous loop iteration
+            to make the code more "pipeline-friendly". */
+            ig2a += ig1a; ig2b += ig1b;
+            ig1a += ig0a; ig1b += ig0b;
+            ig0a += in_a; ig0b += in_b;
+        }
+        inp += factor;
+        // comb filters:
+        out0a  = ig2a - comb0a;  out0b  = ig2b - comb0b;
+        comb0a = ig2a;           comb0b = ig2b;
+        out1a  = out0a - comb1a; out1b  = out0b - comb1b;
+        comb1a = out0a;          comb1b = out0b;
+
+        output[k].i = (float)out1a * gain;
+        output[k].q = (float)out1b * gain;
+    }
+
+    s->ig0a = ig0a; s->ig0b = ig0b;
+    s->ig1a = ig1a; s->ig1b = ig1b;
+    s->comb0a = comb0a; s->comb0b = comb0b;
+    s->comb1a = comb1a; s->comb1b = comb1b;
+    s->phase = phase;
+}
+
+PF_TARGET_CLONES
+void cicddc_cs16_c(void *state, int16_t *input, complexf *output, int outsize, float rate) {
+    cicddc_t *s = (cicddc_t *)state;
+    int k;
+    int factor = s->factor;
+    cic_dt ig0a = s->ig0a, ig0b = s->ig0b, ig1a = s->ig1a, ig1b = s->ig1b;
+    cic_dt comb0a = s->comb0a, comb0b = s->comb0b, comb1a = s->comb1a, comb1b = s->comb1b;
+    uint64_t phase = s->phase, freq;
+    int16_t *sinetable = s->sinetable;
+    float gain = s->gain;
+
+    freq = rate * ((float)(1ULL << 63) * 2);
+
+    int16_t *inp = input;
+    for(k = 0; k < outsize; k++) {
+        int i;
+        cic_dt out0a, out0b, out1a, out1b;
+        cic_dt ig2a = 0, ig2b = 0; // last integrator and first comb replaced simply by sum
+        for(i = 0; i < factor; i++) {
+            cic_dt in_a, in_b;
+            int32_t m_a, m_b, m_c, m_d;
+            int sinep = phase >> (64-SINESHIFT);
+            m_a = inp[2*i];
+            m_b = inp[2*i+1];
+            m_c = (int32_t)sinetable[sinep + (1<<(SINESHIFT-2))];
+            m_d = (int32_t)sinetable[sinep];
+            // complex multiplication:
+            in_a = m_a*m_c - m_b*m_d;
+            in_b = m_a*m_d + m_b*m_c;
+            phase += freq;
+            /* integrators:
+            The calculations are ordered so that each integrator
+            takes a result from previous loop iteration
+            to make the code more "pipeline-friendly". */
+            ig2a += ig1a; ig2b += ig1b;
+            ig1a += ig0a; ig1b += ig0b;
+            ig0a += in_a; ig0b += in_b;
+        }
+        inp += 2*factor;
+        // comb filters:
+        out0a  = ig2a - comb0a;  out0b  = ig2b - comb0b;
+        comb0a = ig2a;           comb0b = ig2b;
+        out1a  = out0a - comb1a; out1b  = out0b - comb1b;
+        comb1a = out0a;          comb1b = out0b;
+
+        output[k].i = (float)out1a * gain;
+        output[k].q = (float)out1b * gain;
+    }
+
+    s->ig0a = ig0a; s->ig0b = ig0b;
+    s->ig1a = ig1a; s->ig1b = ig1b;
+    s->comb0a = comb0a; s->comb0b = comb0b;
+    s->comb1a = comb1a; s->comb1b = comb1b;
+    s->phase = phase;
+}
+
+
+/* This is almost copy paste from cicddc_cs16_c.
+   I'm afraid this is going to be annoying to maintain... */
+PF_TARGET_CLONES
+void cicddc_cu8_c(void *state, uint8_t *input, complexf *output, int outsize, float rate) {
+    cicddc_t *s = (cicddc_t *)state;
+    int k;
+    int factor = s->factor;
+    cic_dt ig0a = s->ig0a, ig0b = s->ig0b, ig1a = s->ig1a, ig1b = s->ig1b;
+    cic_dt comb0a = s->comb0a, comb0b = s->comb0b, comb1a = s->comb1a, comb1b = s->comb1b;
+    uint64_t phase = s->phase, freq;
+    int16_t *sinetable = s->sinetable;
+    float gain = s->gain;
+
+    freq = rate * ((float)(1ULL << 63) * 2);
+
+    uint8_t *inp = input;
+    for(k = 0; k < outsize; k++) {
+        int i;
+        cic_dt out0a, out0b, out1a, out1b;
+        cic_dt ig2a = 0, ig2b = 0; // last integrator and first comb replaced simply by sum
+        for(i = 0; i < factor; i++) {
+            cic_dt in_a, in_b;
+            int32_t m_a, m_b, m_c, m_d;
+            int sinep = phase >> (64-SINESHIFT);
+            // subtract 127.4 (good for rtl-sdr)
+            m_a = (((int32_t)inp[2*i])   << 8) - 32614;
+            m_b = (((int32_t)inp[2*i+1]) << 8) - 32614;
+            m_c = (int32_t)sinetable[sinep + (1<<(SINESHIFT-2))];
+            m_d = (int32_t)sinetable[sinep];
+            // complex multiplication:
+            in_a = m_a*m_c - m_b*m_d;
+            in_b = m_a*m_d + m_b*m_c;
+            phase += freq;
+            /* integrators:
+            The calculations are ordered so that each integrator
+            takes a result from previous loop iteration
+            to make the code more "pipeline-friendly". */
+            ig2a += ig1a; ig2b += ig1b;
+            ig1a += ig0a; ig1b += ig0b;
+            ig0a += in_a; ig0b += in_b;
+        }
+        inp += 2*factor;
+        // comb filters:
+        out0a  = ig2a - comb0a;  out0b  = ig2b - comb0b;
+        comb0a = ig2a;           comb0b = ig2b;
+        out1a  = out0a - comb1a; out1b  = out0b - comb1b;
+        comb1a = out0a;          comb1b = out0b;
+
+        output[k].i = (float)out1a * gain;
+        output[k].q = (float)out1b * gain;
+    }
+
+    s->ig0a = ig0a; s->ig0b = ig0b;
+    s->ig1a = ig1a; s->ig1b = ig1b;
+    s->comb0a = comb0a; s->comb0b = comb0b;
+    s->comb1a = comb1a; s->comb1b = comb1b;
+    s->phase = phase;
+}
+
diff --git a/pf_cic.h b/pf_cic.h
new file mode 100644
index 0000000..681ee4f
--- /dev/null
+++ b/pf_cic.h
@@ -0,0 +1,58 @@
+/*
+This software is part of pffft/pfdsp, a set of simple DSP routines.
+
+Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
+Copyright (c) 2020  Hayati Ayguen <h_ayguen@web.de>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+   ____ ___ ____   ____  ____   ____
+  / ___|_ _/ ___| |  _ \|  _ \ / ___|
+ | |    | | |     | | | | | | | |
+ | |___ | | |___  | |_| | |_| | |___
+  \____|___\____| |____/|____/ \____|
+*/
+
+typedef struct complexf_s { float i; float q; } complexf;
+
+void *cicddc_init(int factor);
+void cicddc_free(void *state);
+void cicddc_s16_c(void *state, int16_t *input, complexf *output, int outsize, float rate);
+void cicddc_cs16_c(void *state, int16_t *input, complexf *output, int outsize, float rate);
+void cicddc_cu8_c(void *state, uint8_t *input, complexf *output, int outsize, float rate);
+
+#ifdef __cplusplus
+}
+#endif
+
diff --git a/pf_mixer.cpp b/pf_mixer.cpp
new file mode 100644
index 0000000..0f2c310
--- /dev/null
+++ b/pf_mixer.cpp
@@ -0,0 +1,1148 @@
+/*
+This software is part of pffft/pfdsp, a set of simple DSP routines.
+
+Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
+Copyright (c) 2020  Hayati Ayguen <h_ayguen@web.de>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* include own header first, to see missing includes */
+#include "pf_mixer.h"
+#include "fmv.h"
+
+#include <math.h>
+#include <stdlib.h>
+#include <assert.h>
+
+//they dropped M_PI in C99, so we define it:
+#define PI ((float)3.14159265358979323846)
+
+//apply to pointers:
+#define iof(complexf_input_p,i) (*(((float*)complexf_input_p)+2*(i)))
+#define qof(complexf_input_p,i) (*(((float*)complexf_input_p)+2*(i)+1))
+
+#define USE_ALIGNED_ADDRESSES  0
+
+
+
+/*
+  _____   _____ _____      __                  _   _
+ |  __ \ / ____|  __ \    / _|                | | (_)
+ | |  | | (___ | |__) |  | |_ _   _ _ __   ___| |_ _  ___  _ __  ___
+ | |  | |\___ \|  ___/   |  _| | | | '_ \ / __| __| |/ _ \| '_ \/ __|
+ | |__| |____) | |       | | | |_| | | | | (__| |_| | (_) | | | \__ \
+ |_____/|_____/|_|       |_|  \__,_|_| |_|\___|\__|_|\___/|_| |_|___/
+
+*/
+
+
+#if defined(__GNUC__)
+#  define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
+#  define RESTRICT __restrict
+#elif defined(_MSC_VER)
+#  define ALWAYS_INLINE(return_type) __forceinline return_type
+#  define RESTRICT __restrict
+#endif
+
+
+#ifndef PFFFT_SIMD_DISABLE
+#if (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))
+  #pragma message "Manual SSE x86/x64 optimizations are ON"
+  #include <xmmintrin.h>
+  #define HAVE_SSE_INTRINSICS 1
+  
+#elif defined(PFFFT_ENABLE_NEON) && defined(__arm__)
+  #pragma message "Manual NEON (arm32) optimizations are ON"
+  #include "sse2neon.h"
+  #define HAVE_SSE_INTRINSICS 1
+  
+#elif defined(PFFFT_ENABLE_NEON) && defined(__aarch64__)
+  #pragma message "Manual NEON (aarch64) optimizations are ON"
+  #include "sse2neon.h"
+  #define HAVE_SSE_INTRINSICS 1
+
+#endif
+#endif
+
+#ifdef HAVE_SSE_INTRINSICS
+
+typedef __m128 v4sf;
+#  define SIMD_SZ 4
+
+typedef union v4_union {
+  __m128  v;
+  float f[4];
+} v4_union;
+
+#define VMUL(a,b)                 _mm_mul_ps(a,b)
+#define VDIV(a,b)                 _mm_div_ps(a,b)
+#define VADD(a,b)                 _mm_add_ps(a,b)
+#define VSUB(a,b)                 _mm_sub_ps(a,b)
+#define LD_PS1(s)                 _mm_set1_ps(s)
+#define VLOAD_UNALIGNED(ptr)      _mm_loadu_ps((const float *)(ptr))
+#define VLOAD_ALIGNED(ptr)        _mm_load_ps((const float *)(ptr))
+#define VSTORE_UNALIGNED(ptr, v)  _mm_storeu_ps((float*)(ptr), v)
+#define VSTORE_ALIGNED(ptr, v)    _mm_store_ps((float*)(ptr), v)
+#define INTERLEAVE2(in1, in2, out1, out2) { __m128 tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
+#define UNINTERLEAVE2(in1, in2, out1, out2) { __m128 tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
+
+#if USE_ALIGNED_ADDRESSES
+  #define VLOAD(ptr)              _mm_load_ps((const float *)(ptr))
+  #define VSTORE(ptr, v)          _mm_store_ps((float*)(ptr), v)
+#else
+  #define VLOAD(ptr)              _mm_loadu_ps((const float *)(ptr))
+  #define VSTORE(ptr, v)          _mm_storeu_ps((float*)(ptr), v)
+#endif
+
+
+int have_sse_shift_mixer_impl()
+{
+    return 1;
+}
+
+#else
+
+int have_sse_shift_mixer_impl()
+{
+    return 0;
+}
+
+#endif
+
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO A ***/
+/**************/
+
+PF_TARGET_CLONES
+float shift_math_cc(complexf *input, complexf* output, int input_size, float rate, float starting_phase)
+{
+    rate*=2;
+    //Shifts the complex spectrum. Basically a complex mixer. This version uses cmath.
+    float phase=starting_phase;
+    float phase_increment=rate*PI;
+    float cosval, sinval;
+    for(int i=0;i<input_size; i++)
+    {
+        cosval=cos(phase);
+        sinval=sin(phase);
+        //we multiply two complex numbers.
+        //how? enter this to maxima (software) for explanation:
+        //   (a+b*%i)*(c+d*%i), rectform;
+        iof(output,i)=cosval*iof(input,i)-sinval*qof(input,i);
+        qof(output,i)=sinval*iof(input,i)+cosval*qof(input,i);
+        phase+=phase_increment;
+        while(phase>2*PI) phase-=2*PI; //@shift_math_cc: normalize phase
+        while(phase<0) phase+=2*PI;
+    }
+    return phase;
+}
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO B ***/
+/**************/
+
+shift_table_data_t shift_table_init(int table_size)
+{
+    shift_table_data_t output;
+    output.table=(float*)malloc(sizeof(float)*table_size);
+    output.table_size=table_size;
+    for(int i=0;i<table_size;i++)
+    {
+        output.table[i]=sin(((float)i/table_size)*(PI/2));
+    }
+    return output;
+}
+
+void shift_table_deinit(shift_table_data_t table_data)
+{
+    free(table_data.table);
+}
+
+
+PF_TARGET_CLONES
+float shift_table_cc(complexf* input, complexf* output, int input_size, float rate, shift_table_data_t table_data, float starting_phase)
+{
+    rate*=2;
+    //Shifts the complex spectrum. Basically a complex mixer. This version uses a pre-built sine table.
+    float phase=starting_phase;
+    float phase_increment=rate*PI;
+    float cosval, sinval;
+    for(int i=0;i<input_size; i++) //@shift_math_cc
+    {
+        int sin_index, cos_index, temp_index, sin_sign, cos_sign;
+        int quadrant=phase/(PI/2); //between 0 and 3
+        float vphase=phase-quadrant*(PI/2);
+        sin_index=(vphase/(PI/2))*table_data.table_size;
+        cos_index=table_data.table_size-1-sin_index;
+        if(quadrant&1) //in quadrant 1 and 3
+        {
+            temp_index=sin_index;
+            sin_index=cos_index;
+            cos_index=temp_index;
+        }
+        sin_sign=(quadrant>1)?-1:1; //in quadrant 2 and 3
+        cos_sign=(quadrant&&quadrant<3)?-1:1; //in quadrant 1 and 2
+        sinval=sin_sign*table_data.table[sin_index];
+        cosval=cos_sign*table_data.table[cos_index];
+        //we multiply two complex numbers.
+        //how? enter this to maxima (software) for explanation:
+        //   (a+b*%i)*(c+d*%i), rectform;
+        iof(output,i)=cosval*iof(input,i)-sinval*qof(input,i);
+        qof(output,i)=sinval*iof(input,i)+cosval*qof(input,i);
+        phase+=phase_increment;
+        while(phase>2*PI) phase-=2*PI; //@shift_math_cc: normalize phase
+        while(phase<0) phase+=2*PI;
+    }
+    return phase;
+}
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO C ***/
+/**************/
+
+shift_addfast_data_t shift_addfast_init(float rate)
+{
+    shift_addfast_data_t output;
+    output.phase_increment=2*rate*PI;
+    for(int i=0;i<4;i++)
+    {
+        output.dsin[i]=sin(output.phase_increment*(i+1));
+        output.dcos[i]=cos(output.phase_increment*(i+1));
+    }
+    return output;
+}
+
+#define SADF_L1(j) \
+    cos_vals_ ## j = cos_start * dcos_ ## j - sin_start * dsin_ ## j; \
+    sin_vals_ ## j = sin_start * dcos_ ## j + cos_start * dsin_ ## j;
+#define SADF_L2(j) \
+    iof(output,4*i+j)=(cos_vals_ ## j)*iof(input,4*i+j)-(sin_vals_ ## j)*qof(input,4*i+j); \
+    qof(output,4*i+j)=(sin_vals_ ## j)*iof(input,4*i+j)+(cos_vals_ ## j)*qof(input,4*i+j);
+
+PF_TARGET_CLONES
+float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase)
+{
+    //input_size should be multiple of 4
+    //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
+    float cos_start=cos(starting_phase);
+    float sin_start=sin(starting_phase);
+    float register cos_vals_0, cos_vals_1, cos_vals_2, cos_vals_3,
+        sin_vals_0, sin_vals_1, sin_vals_2, sin_vals_3,
+        dsin_0 = d->dsin[0], dsin_1 = d->dsin[1], dsin_2 = d->dsin[2], dsin_3 = d->dsin[3],
+        dcos_0 = d->dcos[0], dcos_1 = d->dcos[1], dcos_2 = d->dcos[2], dcos_3 = d->dcos[3];
+
+    for(int i=0;i<input_size/4; i++)
+    {
+        SADF_L1(0)
+        SADF_L1(1)
+        SADF_L1(2)
+        SADF_L1(3)
+        SADF_L2(0)
+        SADF_L2(1)
+        SADF_L2(2)
+        SADF_L2(3)
+        cos_start = cos_vals_3;
+        sin_start = sin_vals_3;
+    }
+    starting_phase+=input_size*d->phase_increment;
+    while(starting_phase>PI) starting_phase-=2*PI;
+    while(starting_phase<-PI) starting_phase+=2*PI;
+    return starting_phase;
+}
+
+#undef SADF_L2
+
+
+#define SADF_L2(j) \
+    tmp_inp_cos = iof(in_out,4*i+j); \
+    tmp_inp_sin = qof(in_out,4*i+j); \
+    iof(in_out,4*i+j)=(cos_vals_ ## j)*tmp_inp_cos - (sin_vals_ ## j)*tmp_inp_sin; \
+    qof(in_out,4*i+j)=(sin_vals_ ## j)*tmp_inp_cos + (cos_vals_ ## j)*tmp_inp_sin;
+
+PF_TARGET_CLONES
+float shift_addfast_inp_c(complexf *in_out, int N_cplx, shift_addfast_data_t* d, float starting_phase)
+{
+    //input_size should be multiple of 4
+    //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
+    float cos_start=cos(starting_phase);
+    float sin_start=sin(starting_phase);
+    float register tmp_inp_cos, tmp_inp_sin,
+        cos_vals_0, cos_vals_1, cos_vals_2, cos_vals_3,
+        sin_vals_0, sin_vals_1, sin_vals_2, sin_vals_3,
+        dsin_0 = d->dsin[0], dsin_1 = d->dsin[1], dsin_2 = d->dsin[2], dsin_3 = d->dsin[3],
+        dcos_0 = d->dcos[0], dcos_1 = d->dcos[1], dcos_2 = d->dcos[2], dcos_3 = d->dcos[3];
+
+    for(int i=0;i<N_cplx/4; i++)
+    {
+        SADF_L1(0)
+        SADF_L1(1)
+        SADF_L1(2)
+        SADF_L1(3)
+        SADF_L2(0)
+        SADF_L2(1)
+        SADF_L2(2)
+        SADF_L2(3)
+        cos_start = cos_vals_3;
+        sin_start = sin_vals_3;
+    }
+    starting_phase+=N_cplx*d->phase_increment;
+    while(starting_phase>PI) starting_phase-=2*PI;
+    while(starting_phase<-PI) starting_phase+=2*PI;
+    return starting_phase;
+}
+
+#undef SADF_L1
+#undef SADF_L2
+
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO D ***/
+/**************/
+
+shift_unroll_data_t shift_unroll_init(float rate, int size)
+{
+    shift_unroll_data_t output;
+    output.phase_increment=2*rate*PI;
+    output.size = size;
+    output.dsin=(float*)malloc(sizeof(float)*size);
+    output.dcos=(float*)malloc(sizeof(float)*size);
+    float myphase = 0;
+    for(int i=0;i<size;i++)
+    {
+        myphase += output.phase_increment;
+        while(myphase>PI) myphase-=2*PI;
+        while(myphase<-PI) myphase+=2*PI;
+        output.dsin[i]=sin(myphase);
+        output.dcos[i]=cos(myphase);
+    }
+    return output;
+}
+
+void shift_unroll_deinit(shift_unroll_data_t* d)
+{
+    if (!d)
+        return;
+    free(d->dsin);
+    free(d->dcos);
+    d->dsin = NULL;
+    d->dcos = NULL;
+}
+
+PF_TARGET_CLONES
+float shift_unroll_cc(complexf *input, complexf* output, int input_size, shift_unroll_data_t* d, float starting_phase)
+{
+    //input_size should be multiple of 4
+    //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
+    float cos_start = cos(starting_phase);
+    float sin_start = sin(starting_phase);
+    register float cos_val = cos_start, sin_val = sin_start;
+    for(int i=0;i<input_size; i++)
+    {
+        iof(output,i) = cos_val*iof(input,i) - sin_val*qof(input,i);
+        qof(output,i) = sin_val*iof(input,i) + cos_val*qof(input,i);
+        // calculate complex phasor for next iteration
+        cos_val = cos_start * d->dcos[i] - sin_start * d->dsin[i];
+        sin_val = sin_start * d->dcos[i] + cos_start * d->dsin[i];
+    }
+    starting_phase+=input_size*d->phase_increment;
+    while(starting_phase>PI) starting_phase-=2*PI;
+    while(starting_phase<-PI) starting_phase+=2*PI;
+    return starting_phase;
+}
+
+PF_TARGET_CLONES
+float shift_unroll_inp_c(complexf* in_out, int size, shift_unroll_data_t* d, float starting_phase)
+{
+    float cos_start = cos(starting_phase);
+    float sin_start = sin(starting_phase);
+    register float cos_val = cos_start, sin_val = sin_start;
+    for(int i=0;i<size; i++)
+    {
+        register float inp_i = iof(in_out,i);
+        register float inp_q = qof(in_out,i);
+        iof(in_out,i) = cos_val*inp_i - sin_val*inp_q;
+        qof(in_out,i) = sin_val*inp_i + cos_val*inp_q;
+        // calculate complex phasor for next iteration
+        cos_val = cos_start * d->dcos[i] - sin_start * d->dsin[i];
+        sin_val = sin_start * d->dcos[i] + cos_start * d->dsin[i];
+    }
+    starting_phase += size * d->phase_increment;
+    while(starting_phase>PI) starting_phase-=2*PI;
+    while(starting_phase<-PI) starting_phase+=2*PI;
+    return starting_phase;
+}
+
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO E ***/
+/**************/
+
+shift_limited_unroll_data_t shift_limited_unroll_init(float rate)
+{
+    shift_limited_unroll_data_t output;
+    output.phase_increment=2*rate*PI;
+    float myphase = 0;
+    for(int i=0; i < PF_SHIFT_LIMITED_UNROLL_SIZE; i++)
+    {
+        myphase += output.phase_increment;
+        while(myphase>PI) myphase-=2*PI;
+        while(myphase<-PI) myphase+=2*PI;
+        output.dcos[i] = cos(myphase);
+        output.dsin[i] = sin(myphase);
+    }
+    output.complex_phase.i = 1.0F;
+    output.complex_phase.q = 0.0F;
+    return output;
+}
+
+PF_TARGET_CLONES
+void shift_limited_unroll_cc(const complexf *input, complexf* output, int size, shift_limited_unroll_data_t* d)
+{
+    float cos_start = d->complex_phase.i;
+    float sin_start = d->complex_phase.q;
+    register float cos_val = cos_start, sin_val = sin_start, mag;
+    while (size > 0)
+    {
+        int N = (size >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : size;
+        for(int i=0;i<N/PF_SHIFT_LIMITED_SIMD_SZ; i++ )
+        {
+            for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
+            {
+                iof(output,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = cos_val*iof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j) - sin_val*qof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j);
+                qof(output,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = sin_val*iof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j) + cos_val*qof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j);
+                // calculate complex phasor for next iteration
+                cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
+                sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
+            }
+        }
+        // "starts := vals := vals / |vals|"
+        mag = sqrtf(cos_val * cos_val + sin_val * sin_val);
+        cos_val /= mag;
+        sin_val /= mag;
+        cos_start = cos_val;
+        sin_start = sin_val;
+
+        input += PF_SHIFT_LIMITED_UNROLL_SIZE;
+        output += PF_SHIFT_LIMITED_UNROLL_SIZE;
+        size -= PF_SHIFT_LIMITED_UNROLL_SIZE;
+    }
+    d->complex_phase.i = cos_val;
+    d->complex_phase.q = sin_val;
+}
+
+PF_TARGET_CLONES
+void shift_limited_unroll_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_data_t* d)
+{
+    float inp_i[PF_SHIFT_LIMITED_SIMD_SZ];
+    float inp_q[PF_SHIFT_LIMITED_SIMD_SZ];
+    // "vals := starts := phase_state"
+    float cos_start = d->complex_phase.i;
+    float sin_start = d->complex_phase.q;
+    register float cos_val = cos_start, sin_val = sin_start, mag;
+    while (N_cplx)
+    {
+        int N = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
+        for(int i=0;i<N/PF_SHIFT_LIMITED_SIMD_SZ; i++ )
+        {
+            for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
+                inp_i[j] = in_out[PF_SHIFT_LIMITED_SIMD_SZ*i+j].i;
+            for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
+                inp_q[j] = in_out[PF_SHIFT_LIMITED_SIMD_SZ*i+j].q;
+            for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
+            {
+                // "out[] = inp[] * vals"
+                iof(in_out,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = cos_val*inp_i[j] - sin_val*inp_q[j];
+                qof(in_out,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = sin_val*inp_i[j] + cos_val*inp_q[j];
+                // calculate complex phasor for next iteration
+                // "vals :=  d[] * starts"
+                cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
+                sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
+            }
+        }
+        // "starts := vals := vals / |vals|"
+        mag = sqrtf(cos_val * cos_val + sin_val * sin_val);
+        cos_val /= mag;
+        sin_val /= mag;
+        cos_start = cos_val;
+        sin_start = sin_val;
+
+        in_out += PF_SHIFT_LIMITED_UNROLL_SIZE;
+        N_cplx -= PF_SHIFT_LIMITED_UNROLL_SIZE;
+    }
+    // "phase_state := starts"
+    d->complex_phase.i = cos_start;
+    d->complex_phase.q = sin_start;
+}
+
+
+#ifdef HAVE_SSE_INTRINSICS
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO F ***/
+/**************/
+
+shift_limited_unroll_A_sse_data_t shift_limited_unroll_A_sse_init(float relative_freq, float phase_start_rad)
+{
+    shift_limited_unroll_A_sse_data_t output;
+    float myphase;
+
+    output.phase_increment = 2*relative_freq*PI;
+
+    myphase = 0.0F;
+    for (int i = 0; i < PF_SHIFT_LIMITED_UNROLL_SIZE + PF_SHIFT_LIMITED_SIMD_SZ; i += PF_SHIFT_LIMITED_SIMD_SZ)
+    {
+        for (int k = 0; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
+        {
+            myphase += output.phase_increment;
+            while(myphase>PI) myphase-=2*PI;
+            while(myphase<-PI) myphase+=2*PI;
+        }
+        output.dcos[i] = cos(myphase);
+        output.dsin[i] = sin(myphase);
+        for (int k = 1; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
+        {
+            output.dcos[i+k] = output.dcos[i];
+            output.dsin[i+k] = output.dsin[i];
+        }
+    }
+
+    output.dcos_blk = 0.0F;
+    output.dsin_blk = 0.0F;
+
+    myphase = phase_start_rad;
+    for (int i = 0; i < PF_SHIFT_LIMITED_SIMD_SZ; i++)
+    {
+        output.phase_state_i[i] = cos(myphase);
+        output.phase_state_q[i] = sin(myphase);
+        myphase += output.phase_increment;
+        while(myphase>PI) myphase-=2*PI;
+        while(myphase<-PI) myphase+=2*PI;
+    }
+    return output;
+}
+
+
+PF_TARGET_CLONES
+void shift_limited_unroll_A_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_A_sse_data_t* d)
+{
+    // "vals := starts := phase_state"
+    __m128 cos_starts = VLOAD( &d->phase_state_i[0] );
+    __m128 sin_starts = VLOAD( &d->phase_state_q[0] );
+    __m128 cos_vals = cos_starts;
+    __m128 sin_vals = sin_starts;
+    __m128 inp_re, inp_im;
+    __m128 product_re, product_im;
+    __m128 interl_prod_a, interl_prod_b;
+    __m128 * RESTRICT p_trig_cos_tab;
+    __m128 * RESTRICT p_trig_sin_tab;
+    __m128 * RESTRICT u = (__m128*)in_out;
+
+    while (N_cplx)
+    {
+        const int NB = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
+        int B = NB;
+        p_trig_cos_tab = (__m128*)( &d->dcos[0] );
+        p_trig_sin_tab = (__m128*)( &d->dsin[0] );
+        while (B)
+        {
+            // complex multiplication of 4 complex values from/to in_out[]
+            // ==  u[0..3] *= (cos_val[0..3] + i * sin_val[0..3]):
+            // "out[] = inp[] * vals"
+            UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im);  /* inp_re = all reals; inp_im = all imags */
+            product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) );
+            product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) );
+            INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
+            VSTORE(u, interl_prod_a);
+            VSTORE(u+1, interl_prod_b);
+            u += 2;
+            // calculate complex phasor for next iteration
+            // cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
+            // sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
+            // cos_val[]/sin_val[] .. can't fade towards 0 inside this while loop :-)
+            // "vals :=  d[] * starts"
+            inp_re = VLOAD(p_trig_cos_tab);
+            inp_im = VLOAD(p_trig_sin_tab);
+            cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) );
+            sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) );
+            ++p_trig_cos_tab;
+            ++p_trig_sin_tab;
+            B -= 4;
+        }
+        N_cplx -= NB;
+        /* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */
+        /* re-use product_re[]/product_im[] for normalization */
+        // "starts := vals := vals / |vals|"
+        product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) );
+#if 0
+        // more spikes in spectrum! at PF_SHIFT_LIMITED_UNROLL_SIZE = 64
+        // higher spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 16
+        product_im = _mm_rsqrt_ps(product_re);
+        cos_starts = cos_vals = VMUL(cos_vals, product_im);
+        sin_starts = sin_vals = VMUL(sin_vals, product_im);
+#else
+        // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 64 - but slower!
+        // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 128 - fast again
+        product_im = _mm_sqrt_ps(product_re);
+        cos_starts = cos_vals = VDIV(cos_vals, product_im);
+        sin_starts = sin_vals = VDIV(sin_vals, product_im);
+#endif
+    }
+    // "phase_state := starts"
+    VSTORE( &d->phase_state_i[0], cos_starts );
+    VSTORE( &d->phase_state_q[0], sin_starts );
+}
+
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO G ***/
+/**************/
+
+shift_limited_unroll_B_sse_data_t shift_limited_unroll_B_sse_init(float relative_freq, float phase_start_rad)
+{
+    shift_limited_unroll_B_sse_data_t output;
+    float myphase;
+
+    output.phase_increment = 2*relative_freq*PI;
+
+    myphase = 0.0F;
+    for (int i = 0; i < PF_SHIFT_LIMITED_UNROLL_SIZE + PF_SHIFT_LIMITED_SIMD_SZ; i += PF_SHIFT_LIMITED_SIMD_SZ)
+    {
+        for (int k = 0; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
+        {
+            myphase += output.phase_increment;
+            while(myphase>PI) myphase-=2*PI;
+            while(myphase<-PI) myphase+=2*PI;
+        }
+        output.dtrig[i+0] = cos(myphase);
+        output.dtrig[i+1] = sin(myphase);
+        output.dtrig[i+2] = output.dtrig[i+0];
+        output.dtrig[i+3] = output.dtrig[i+1];
+    }
+
+    output.dcos_blk = 0.0F;
+    output.dsin_blk = 0.0F;
+
+    myphase = phase_start_rad;
+    for (int i = 0; i < PF_SHIFT_LIMITED_SIMD_SZ; i++)
+    {
+        output.phase_state_i[i] = cos(myphase);
+        output.phase_state_q[i] = sin(myphase);
+        myphase += output.phase_increment;
+        while(myphase>PI) myphase-=2*PI;
+        while(myphase<-PI) myphase+=2*PI;
+    }
+    return output;
+}
+
+
+PF_TARGET_CLONES
+void shift_limited_unroll_B_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_B_sse_data_t* d)
+{
+    // "vals := starts := phase_state"
+    __m128 cos_starts = VLOAD( &d->phase_state_i[0] );
+    __m128 sin_starts = VLOAD( &d->phase_state_q[0] );
+    __m128 cos_vals = cos_starts;
+    __m128 sin_vals = sin_starts;
+    __m128 inp_re, inp_im;
+    __m128 product_re, product_im;
+    __m128 interl_prod_a, interl_prod_b;
+    __m128 * RESTRICT p_trig_tab;
+    __m128 * RESTRICT u = (__m128*)in_out;
+
+    while (N_cplx)
+    {
+        const int NB = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
+        int B = NB;
+        p_trig_tab = (__m128*)( &d->dtrig[0] );
+        while (B)
+        {
+            // complex multiplication of 4 complex values from/to in_out[]
+            // ==  u[0..3] *= (cos_val[0..3] + i * sin_val[0..3]):
+            // "out[] = inp[] * vals"
+            UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im);  /* inp_re = all reals; inp_im = all imags */
+            product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) );
+            product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) );
+            INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
+            VSTORE(u, interl_prod_a);
+            VSTORE(u+1, interl_prod_b);
+            u += 2;
+            // calculate complex phasor for next iteration
+            // cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
+            // sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
+            // cos_val[]/sin_val[] .. can't fade towards 0 inside this while loop :-)
+            // "vals :=  d[] * starts"
+            product_re = VLOAD(p_trig_tab);
+            UNINTERLEAVE2(product_re, product_re, inp_re, inp_im);  /* inp_re = all reals; inp_im = all imags */
+            cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) );
+            sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) );
+            ++p_trig_tab;
+            B -= 4;
+        }
+        N_cplx -= NB;
+        /* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */
+        /* re-use product_re[]/product_im[] for normalization */
+        // "starts := vals := vals / |vals|"
+        product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) );
+#if 0
+        // more spikes in spectrum! at PF_SHIFT_LIMITED_UNROLL_SIZE = 64
+        // higher spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 16
+        product_im = _mm_rsqrt_ps(product_re);
+        cos_starts = cos_vals = VMUL(cos_vals, product_im);
+        sin_starts = sin_vals = VMUL(sin_vals, product_im);
+#else
+        // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 64 - but slower!
+        // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 128 - fast again
+        product_im = _mm_sqrt_ps(product_re);
+        cos_starts = cos_vals = VDIV(cos_vals, product_im);
+        sin_starts = sin_vals = VDIV(sin_vals, product_im);
+#endif
+    }
+    // "phase_state := starts"
+    VSTORE( &d->phase_state_i[0], cos_starts );
+    VSTORE( &d->phase_state_q[0], sin_starts );
+}
+
+
+/*********************************************************************/
+
+
+/**************/
+/*** ALGO H ***/
+/**************/
+
+shift_limited_unroll_C_sse_data_t shift_limited_unroll_C_sse_init(float relative_freq, float phase_start_rad)
+{
+    shift_limited_unroll_C_sse_data_t output;
+    float myphase;
+
+    output.phase_increment = 2*relative_freq*PI;
+
+    myphase = 0.0F;
+    for (int i = 0; i < PF_SHIFT_LIMITED_UNROLL_SIZE + PF_SHIFT_LIMITED_SIMD_SZ; i += PF_SHIFT_LIMITED_SIMD_SZ)
+    {
+        for (int k = 0; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
+        {
+            myphase += output.phase_increment;
+            while(myphase>PI) myphase-=2*PI;
+            while(myphase<-PI) myphase+=2*PI;
+        }
+        output.dinterl_trig[2*i] = cos(myphase);
+        output.dinterl_trig[2*i+4] = sin(myphase);
+        for (int k = 1; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
+        {
+            output.dinterl_trig[2*i+k] = output.dinterl_trig[2*i];
+            output.dinterl_trig[2*i+k+4] = output.dinterl_trig[2*i+4];
+        }
+    }
+
+    output.dcos_blk = 0.0F;
+    output.dsin_blk = 0.0F;
+
+    myphase = phase_start_rad;
+    for (int i = 0; i < PF_SHIFT_LIMITED_SIMD_SZ; i++)
+    {
+        output.phase_state_i[i] = cos(myphase);
+        output.phase_state_q[i] = sin(myphase);
+        myphase += output.phase_increment;
+        while(myphase>PI) myphase-=2*PI;
+        while(myphase<-PI) myphase+=2*PI;
+    }
+    return output;
+}
+
+
+PF_TARGET_CLONES
+void shift_limited_unroll_C_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_C_sse_data_t* d)
+{
+    // "vals := starts := phase_state"
+    __m128 cos_starts = VLOAD( &d->phase_state_i[0] );
+    __m128 sin_starts = VLOAD( &d->phase_state_q[0] );
+    __m128 cos_vals = cos_starts;
+    __m128 sin_vals = sin_starts;
+    __m128 inp_re, inp_im;
+    __m128 product_re, product_im;
+    __m128 interl_prod_a, interl_prod_b;
+    __m128 * RESTRICT p_trig_tab;
+    __m128 * RESTRICT u = (__m128*)in_out;
+
+    while (N_cplx)
+    {
+        const int NB = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
+        int B = NB;
+        p_trig_tab = (__m128*)( &d->dinterl_trig[0] );
+        while (B)
+        {
+            // complex multiplication of 4 complex values from/to in_out[]
+            // ==  u[0..3] *= (cos_val[0..3] + i * sin_val[0..3]):
+            // "out[] = inp[] * vals"
+            UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im);  /* inp_re = all reals; inp_im = all imags */
+            product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) );
+            product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) );
+            INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
+            VSTORE(u, interl_prod_a);
+            VSTORE(u+1, interl_prod_b);
+            u += 2;
+            // calculate complex phasor for next iteration
+            // cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
+            // sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
+            // cos_val[]/sin_val[] .. can't fade towards 0 inside this while loop :-)
+            // "vals :=  d[] * starts"
+            inp_re = VLOAD(p_trig_tab);
+            inp_im = VLOAD(p_trig_tab+1);
+            cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) );
+            sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) );
+            p_trig_tab += 2;
+            B -= 4;
+        }
+        N_cplx -= NB;
+        /* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */
+        /* re-use product_re[]/product_im[] for normalization */
+        // "starts := vals := vals / |vals|"
+        product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) );
+#if 0
+        // more spikes in spectrum! at PF_SHIFT_LIMITED_UNROLL_SIZE = 64
+        // higher spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 16
+        product_im = _mm_rsqrt_ps(product_re);
+        cos_starts = cos_vals = VMUL(cos_vals, product_im);
+        sin_starts = sin_vals = VMUL(sin_vals, product_im);
+#else
+        // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 64 - but slower!
+        // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 128 - fast again
+        product_im = _mm_sqrt_ps(product_re);
+        cos_starts = cos_vals = VDIV(cos_vals, product_im);
+        sin_starts = sin_vals = VDIV(sin_vals, product_im);
+#endif
+    }
+    // "phase_state := starts"
+    VSTORE( &d->phase_state_i[0], cos_starts );
+    VSTORE( &d->phase_state_q[0], sin_starts );
+}
+
+
+#else
+
+/*********************************************************************/
+
+shift_limited_unroll_A_sse_data_t shift_limited_unroll_A_sse_init(float relative_freq, float phase_start_rad) {
+    assert(0);
+    shift_limited_unroll_A_sse_data_t r;
+    return r;
+}
+shift_limited_unroll_B_sse_data_t shift_limited_unroll_B_sse_init(float relative_freq, float phase_start_rad) {
+    assert(0);
+    shift_limited_unroll_B_sse_data_t r;
+    return r;
+}
+shift_limited_unroll_C_sse_data_t shift_limited_unroll_C_sse_init(float relative_freq, float phase_start_rad) {
+    assert(0);
+    shift_limited_unroll_C_sse_data_t r;
+    return r;
+}
+
+void shift_limited_unroll_A_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_A_sse_data_t* d) {
+    assert(0);
+}
+void shift_limited_unroll_B_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_B_sse_data_t* d) {
+    assert(0);
+}
+void shift_limited_unroll_C_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_C_sse_data_t* d) {
+    assert(0);
+}
+
+#endif
+
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO I ***/
+/**************/
+
+void shift_recursive_osc_update_rate(float rate, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state)
+{
+    // constants for single phase step
+    float phase_increment_s = rate*PI;
+    float k1 = tan(0.5*phase_increment_s);
+    float k2 = 2*k1 /(1 + k1 * k1);
+    for (int j=1; j<PF_SHIFT_RECURSIVE_SIMD_SZ; j++)
+    {
+        float tmp;
+        state->u_cos[j] = state->u_cos[j-1];
+        state->v_sin[j] = state->v_sin[j-1];
+        // small steps
+        tmp = state->u_cos[j] - k1 * state->v_sin[j];
+        state->v_sin[j] += k2 * tmp;
+        state->u_cos[j] = tmp - k1 * state->v_sin[j];
+    }
+
+    // constants for PF_SHIFT_RECURSIVE_SIMD_SZ times phase step
+    float phase_increment_b = phase_increment_s * PF_SHIFT_RECURSIVE_SIMD_SZ;
+    while(phase_increment_b > PI) phase_increment_b-=2*PI;
+    while(phase_increment_b < -PI) phase_increment_b+=2*PI;
+    conf->k1 = tan(0.5*phase_increment_b);
+    conf->k2 = 2*conf->k1 / (1 + conf->k1 * conf->k1);
+}
+
+void shift_recursive_osc_init(float rate, float starting_phase, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t *state)
+{
+    if (starting_phase != 0.0F)
+    {
+        state->u_cos[0] = cos(starting_phase);
+        state->v_sin[0] = sin(starting_phase);
+    }
+    else
+    {
+        state->u_cos[0] = 1.0F;
+        state->v_sin[0] = 0.0F;
+    }
+    shift_recursive_osc_update_rate(rate, conf, state);
+}
+
+
+PF_TARGET_CLONES
+void shift_recursive_osc_cc(const complexf *input, complexf* output,
+    int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state_ext)
+{
+    float tmp[PF_SHIFT_RECURSIVE_SIMD_SZ];
+    float inp_i[PF_SHIFT_RECURSIVE_SIMD_SZ];
+    float inp_q[PF_SHIFT_RECURSIVE_SIMD_SZ];
+    shift_recursive_osc_t state = *state_ext;
+    const float k1 = conf->k1;
+    const float k2 = conf->k2;
+    for(int i=0;i<size/PF_SHIFT_RECURSIVE_SIMD_SZ; i++) //@shift_recursive_osc_cc
+    {
+        //we multiply two complex numbers - similar to shift_math_cc
+        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
+        {
+            inp_i[j] = input[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].i;
+            inp_q[j] = input[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].q;
+        }
+        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
+        {
+            iof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j];
+            qof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j];
+        }
+        // update complex phasor - like incrementing phase
+        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
+            tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
+        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
+            state.v_sin[j] += k2 * tmp[j];
+        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
+            state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
+    }
+    *state_ext = state;
+}
+
+PF_TARGET_CLONES
+void shift_recursive_osc_inp_c(complexf* in_out,
+    int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state_ext)
+{
+    float tmp[PF_SHIFT_RECURSIVE_SIMD_SZ];
+    float inp_i[PF_SHIFT_RECURSIVE_SIMD_SZ];
+    float inp_q[PF_SHIFT_RECURSIVE_SIMD_SZ];
+    shift_recursive_osc_t state = *state_ext;
+    const float k1 = conf->k1;
+    const float k2 = conf->k2;
+    for(int i=0;i<size/PF_SHIFT_RECURSIVE_SIMD_SZ; i++) //@shift_recursive_osc_inp_c
+    {
+        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
+        {
+            inp_i[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].i;
+            inp_q[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].q;
+        }
+        //we multiply two complex numbers - similar to shift_math_cc
+        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
+        {
+            iof(in_out,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j];
+            qof(in_out,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j];
+        }
+        // update complex phasor - like incrementing phase
+        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
+            tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
+        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
+            state.v_sin[j] += k2 * tmp[j];
+        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
+            state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
+    }
+    *state_ext = state;
+}
+
+PF_TARGET_CLONES
+void gen_recursive_osc_c(complexf* output,
+    int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state_ext)
+{
+    float tmp[PF_SHIFT_RECURSIVE_SIMD_SZ];
+    shift_recursive_osc_t state = *state_ext;
+    const float k1 = conf->k1;
+    const float k2 = conf->k2;
+    for(int i=0;i<size/PF_SHIFT_RECURSIVE_SIMD_SZ; i++) //@gen_recursive_osc_c
+    {
+        // output complex oscillator value
+        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
+        {
+            iof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.u_cos[j];
+            qof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.v_sin[j];
+        }
+        // update complex phasor - like incrementing phase
+        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
+            tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
+        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
+            state.v_sin[j] += k2 * tmp[j];
+        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
+            state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
+    }
+    *state_ext = state;
+}
+
+
+#ifdef HAVE_SSE_INTRINSICS
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO J ***/
+/**************/
+
+void shift_recursive_osc_sse_update_rate(float rate, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state)
+{
+    // constants for single phase step
+    float phase_increment_s = rate*PI;
+    float k1 = tan(0.5*phase_increment_s);
+    float k2 = 2*k1 /(1 + k1 * k1);
+    for (int j=1; j<PF_SHIFT_RECURSIVE_SIMD_SSE_SZ; j++)
+    {
+        float tmp;
+        state->u_cos[j] = state->u_cos[j-1];
+        state->v_sin[j] = state->v_sin[j-1];
+        // small steps
+        tmp = state->u_cos[j] - k1 * state->v_sin[j];
+        state->v_sin[j] += k2 * tmp;
+        state->u_cos[j] = tmp - k1 * state->v_sin[j];
+    }
+
+    // constants for PF_SHIFT_RECURSIVE_SIMD_SSE_SZ times phase step
+    float phase_increment_b = phase_increment_s * PF_SHIFT_RECURSIVE_SIMD_SSE_SZ;
+    while(phase_increment_b > PI) phase_increment_b-=2*PI;
+    while(phase_increment_b < -PI) phase_increment_b+=2*PI;
+    conf->k1 = tan(0.5*phase_increment_b);
+    conf->k2 = 2*conf->k1 / (1 + conf->k1 * conf->k1);
+}
+
+
+void shift_recursive_osc_sse_init(float rate, float starting_phase, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t *state)
+{
+    if (starting_phase != 0.0F)
+    {
+        state->u_cos[0] = cos(starting_phase);
+        state->v_sin[0] = sin(starting_phase);
+    }
+    else
+    {
+        state->u_cos[0] = 1.0F;
+        state->v_sin[0] = 0.0F;
+    }
+    shift_recursive_osc_sse_update_rate(rate, conf, state);
+}
+
+
+PF_TARGET_CLONES
+void shift_recursive_osc_sse_inp_c(complexf* in_out,
+    int N_cplx, const shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state_ext)
+{
+    const __m128 k1 = LD_PS1( conf->k1 );
+    const __m128 k2 = LD_PS1( conf->k2 );
+    __m128 u_cos = VLOAD( &state_ext->u_cos[0] );
+    __m128 v_sin = VLOAD( &state_ext->v_sin[0] );
+    __m128 inp_re, inp_im;
+    __m128 product_re, product_im;
+    __m128 interl_prod_a, interl_prod_b;
+    __m128 * RESTRICT u = (__m128*)in_out;
+
+    while (N_cplx)
+    {
+        //inp_i[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j].i;
+        //inp_q[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j].q;
+        UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im);  /* inp_re = all reals; inp_im = all imags */
+
+        //we multiply two complex numbers - similar to shift_math_cc
+        //iof(in_out,PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j];
+        //qof(in_out,PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j];
+        product_re = VSUB( VMUL(inp_re, u_cos), VMUL(inp_im, v_sin) );
+        product_im = VADD( VMUL(inp_im, u_cos), VMUL(inp_re, v_sin) );
+        INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
+        VSTORE(u, interl_prod_a);
+        VSTORE(u+1, interl_prod_b);
+        u += 2;
+
+        // update complex phasor - like incrementing phase
+        // tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
+        product_re = VSUB( u_cos, VMUL(k1, v_sin) );
+        // state.v_sin[j] += k2 * tmp[j];
+        v_sin = VADD( v_sin, VMUL(k2, product_re) );
+        // state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
+        u_cos = VSUB( product_re, VMUL(k1, v_sin) );
+
+        N_cplx -= 4;
+    }
+    VSTORE( &state_ext->u_cos[0], u_cos );
+    VSTORE( &state_ext->v_sin[0], v_sin );
+}
+
+#else
+
+void shift_recursive_osc_sse_update_rate(float rate, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state)
+{
+    assert(0);
+}
+
+void shift_recursive_osc_sse_init(float rate, float starting_phase, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t *state)
+{
+    assert(0);
+}
+
+
+void shift_recursive_osc_sse_inp_c(complexf* in_out,
+    int N_cplx, const shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state_ext)
+{
+    assert(0);
+}
+
+#endif
+
diff --git a/pf_mixer.h b/pf_mixer.h
new file mode 100644
index 0000000..f407c21
--- /dev/null
+++ b/pf_mixer.h
@@ -0,0 +1,281 @@
+/*
+This software is part of pffft/pfdsp, a set of simple DSP routines.
+
+Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
+Copyright (c) 2020  Hayati Ayguen <h_ayguen@web.de>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
+#include <stdio.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*
+   _____                      _
+  / ____|                    | |
+ | |     ___  _ __ ___  _ __ | | _____  __
+ | |    / _ \| '_ ` _ \| '_ \| |/ _ \ \/ /
+ | |___| (_) | | | | | | |_) | |  __/>  <
+  \_____\___/|_| |_| |_| .__/|_|\___/_/\_\
+                       | |
+                       |_|
+*/
+
+typedef struct complexf_s { float i; float q; } complexf;
+
+// =================================================================================
+
+int have_sse_shift_mixer_impl();
+
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO A ***/
+/**************/
+
+float shift_math_cc(complexf *input, complexf* output, int input_size, float rate, float starting_phase);
+
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO B ***/
+/**************/
+
+typedef struct shift_table_data_s
+{
+    float* table;
+    int table_size;
+} shift_table_data_t;
+
+void shift_table_deinit(shift_table_data_t table_data);
+shift_table_data_t shift_table_init(int table_size);
+float shift_table_cc(complexf* input, complexf* output, int input_size, float rate, shift_table_data_t table_data, float starting_phase);
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO C ***/
+/**************/
+
+typedef struct shift_addfast_data_s
+{
+    float dsin[4];
+    float dcos[4];
+    float phase_increment;
+} shift_addfast_data_t;
+
+shift_addfast_data_t shift_addfast_init(float rate);
+float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase);
+float shift_addfast_inp_c(complexf *in_out, int N_cplx, shift_addfast_data_t* d, float starting_phase);
+
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO D ***/
+/**************/
+
+typedef struct shift_unroll_data_s
+{
+    float* dsin;
+    float* dcos;
+    float phase_increment;
+    int size;
+} shift_unroll_data_t;
+
+shift_unroll_data_t shift_unroll_init(float rate, int size);
+void shift_unroll_deinit(shift_unroll_data_t* d);
+float shift_unroll_cc(complexf *input, complexf* output, int size, shift_unroll_data_t* d, float starting_phase);
+float shift_unroll_inp_c(complexf* in_out, int size, shift_unroll_data_t* d, float starting_phase);
+
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO E ***/
+/**************/
+
+/* similar to shift_unroll_cc() - but, have fixed and limited precalc size
+ * idea: smaller cache usage by table
+ * size must be multiple of CSDR_SHIFT_LIMITED_SIMD (= 4)
+ */
+#define PF_SHIFT_LIMITED_UNROLL_SIZE  128
+#define PF_SHIFT_LIMITED_SIMD_SZ  4
+
+typedef struct shift_limited_unroll_data_s
+{
+    float dcos[PF_SHIFT_LIMITED_UNROLL_SIZE];
+    float dsin[PF_SHIFT_LIMITED_UNROLL_SIZE];
+    complexf complex_phase;
+    float phase_increment;
+} shift_limited_unroll_data_t;
+
+shift_limited_unroll_data_t shift_limited_unroll_init(float rate);
+/* size must be multiple of PF_SHIFT_LIMITED_SIMD_SZ */
+/* starting_phase for next call is kept internal in state */
+void shift_limited_unroll_cc(const complexf *input, complexf* output, int size, shift_limited_unroll_data_t* d);
+void shift_limited_unroll_inp_c(complexf* in_out, int size, shift_limited_unroll_data_t* d);
+
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO F ***/
+/**************/
+
+typedef struct shift_limited_unroll_A_sse_data_s
+{
+    /* small/limited trig table */
+    float dcos[PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ];
+    float dsin[PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ];
+    /* 4 times complex phase */
+    float phase_state_i[PF_SHIFT_LIMITED_SIMD_SZ];
+    float phase_state_q[PF_SHIFT_LIMITED_SIMD_SZ];
+    /* N_cplx_per_block times increment - for future parallel variants */
+    float dcos_blk;
+    float dsin_blk;
+    /* */
+    float phase_increment;
+} shift_limited_unroll_A_sse_data_t;
+
+shift_limited_unroll_A_sse_data_t shift_limited_unroll_A_sse_init(float relative_freq, float phase_start_rad);
+void shift_limited_unroll_A_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_A_sse_data_t* d);
+
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO G ***/
+/**************/
+
+typedef struct shift_limited_unroll_B_sse_data_s
+{
+    /* small/limited trig table */
+    float dtrig[PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ];
+    /* 4 times complex phase */
+    float phase_state_i[PF_SHIFT_LIMITED_SIMD_SZ];
+    float phase_state_q[PF_SHIFT_LIMITED_SIMD_SZ];
+    /* N_cplx_per_block times increment - for future parallel variants */
+    float dcos_blk;
+    float dsin_blk;
+    /* */
+    float phase_increment;
+} shift_limited_unroll_B_sse_data_t;
+
+shift_limited_unroll_B_sse_data_t shift_limited_unroll_B_sse_init(float relative_freq, float phase_start_rad);
+void shift_limited_unroll_B_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_B_sse_data_t* d);
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO H ***/
+/**************/
+
+typedef struct shift_limited_unroll_C_sse_data_s
+{
+    /* small/limited trig table - interleaved: 4 cos, 4 sin, 4 cos, .. */
+    float dinterl_trig[2*(PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ)];
+    /* 4 times complex phase */
+    float phase_state_i[PF_SHIFT_LIMITED_SIMD_SZ];
+    float phase_state_q[PF_SHIFT_LIMITED_SIMD_SZ];
+    /* N_cplx_per_block times increment - for future parallel variants */
+    float dcos_blk;
+    float dsin_blk;
+    /* */
+    float phase_increment;
+} shift_limited_unroll_C_sse_data_t;
+
+shift_limited_unroll_C_sse_data_t shift_limited_unroll_C_sse_init(float relative_freq, float phase_start_rad);
+void shift_limited_unroll_C_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_C_sse_data_t* d);
+
+
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO I ***/
+/**************/
+
+/* Recursive Quadrature Oscillator functions "recursive_osc"
+ * see https://www.vicanek.de/articles/QuadOsc.pdf
+ */
+#define PF_SHIFT_RECURSIVE_SIMD_SZ  8
+typedef struct shift_recursive_osc_s
+{
+    float u_cos[PF_SHIFT_RECURSIVE_SIMD_SZ];
+    float v_sin[PF_SHIFT_RECURSIVE_SIMD_SZ];
+} shift_recursive_osc_t;
+
+typedef struct shift_recursive_osc_conf_s
+{
+    float k1;
+    float k2;
+} shift_recursive_osc_conf_t;
+
+void shift_recursive_osc_init(float rate, float starting_phase, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t *state);
+void shift_recursive_osc_update_rate(float rate, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
+
+/* size must be multiple of PF_SHIFT_LIMITED_SIMD_SZ */
+/* starting_phase for next call is kept internal in state */
+void shift_recursive_osc_cc(const complexf *input, complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
+void shift_recursive_osc_inp_c(complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
+void gen_recursive_osc_c(complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
+
+/*********************************************************************/
+
+/**************/
+/*** ALGO J ***/
+/**************/
+
+#define PF_SHIFT_RECURSIVE_SIMD_SSE_SZ  4
+typedef struct shift_recursive_osc_sse_s
+{
+    float u_cos[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ];
+    float v_sin[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ];
+} shift_recursive_osc_sse_t;
+
+typedef struct shift_recursive_osc_sse_conf_s
+{
+    float k1;
+    float k2;
+} shift_recursive_osc_sse_conf_t;
+
+void shift_recursive_osc_sse_init(float rate, float starting_phase, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t *state);
+void shift_recursive_osc_sse_update_rate(float rate, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state);
+void shift_recursive_osc_sse_inp_c(complexf* in_out, int N_cplx, const shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state_ext);
+
+
+#ifdef __cplusplus
+}
+#endif
+
diff --git a/pffft.hpp b/pffft.hpp
index ce910f9..8437e05 100644
--- a/pffft.hpp
+++ b/pffft.hpp
@@ -127,7 +127,7 @@ public:
 
   // simple helper to determine next power of 2 - without inexact/rounding floating point operations
   static int nextPowerOfTwo(int N) { return pffft_next_power_of_two(N); }
-  static bool isPowerOfTwo(int N) { return pffft_is_power_of_two(N); }
+  static bool isPowerOfTwo(int N) { return pffft_is_power_of_two(N) ? true : false; }
 
   static int simd_size() { return Types<T>::simd_size(); }
   static const char * simd_arch() { return Types<T>::simd_arch(); }
@@ -383,7 +383,7 @@ inline int nextPowerOfTwo(int N) {
 }
 
 inline bool isPowerOfTwo(int N) {
-  return pffft_is_power_of_two(N);
+  return pffft_is_power_of_two(N) ? true : false;
 }
 
 
diff --git a/pffft_priv_impl.h b/pffft_priv_impl.h
index 8d87561..e1b7b94 100644
--- a/pffft_priv_impl.h
+++ b/pffft_priv_impl.h
@@ -1825,15 +1825,16 @@ static void pffft_assert1( float result, float ref, const char * vartxt, const c
   }
 }
 
-static void pffft_assert4( const v4sf_union V, float a, float b, float c, float d, const char * functxt, int * numErrs, const char * f, int lineNo )
+static void pffft_assert4(  vsfscalar v0, vsfscalar v1, vsfscalar v2, vsfscalar v3,
+  float a, float b, float c, float d, const char * functxt, int * numErrs, const char * f, int lineNo )
 {
-  pffft_assert1( V.f[0], a, "[0]", functxt, numErrs, f, lineNo );
-  pffft_assert1( V.f[1], b, "[1]", functxt, numErrs, f, lineNo );
-  pffft_assert1( V.f[2], c, "[2]", functxt, numErrs, f, lineNo );
-  pffft_assert1( V.f[3], d, "[3]", functxt, numErrs, f, lineNo );
+  pffft_assert1( v0, a, "[0]", functxt, numErrs, f, lineNo );
+  pffft_assert1( v1, b, "[1]", functxt, numErrs, f, lineNo );
+  pffft_assert1( v2, c, "[2]", functxt, numErrs, f, lineNo );
+  pffft_assert1( v3, d, "[3]", functxt, numErrs, f, lineNo );
 }
 
-#define PFFFT_ASSERT4( V, a, b, c, d, FUNCTXT )  pffft_assert4( V, a, b, c, d, FUNCTXT, &numErrs, __FILE__, __LINE__ )
+#define PFFFT_ASSERT4( V, a, b, c, d, FUNCTXT )  pffft_assert4( (V).f[0], (V).f[1], (V).f[2], (V).f[3], a, b, c, d, FUNCTXT, &numErrs, __FILE__, __LINE__ )
 
 
 int FUNC_VALIDATE_SIMD_EX(FILE * DbgOut)
diff --git a/pocketfft b/pocketfft
new file mode 160000
+Subproject 78a0254546e3cd0e042e0216b71f7e1e79f0309
diff --git a/simd/pf_altivec_float.h b/simd/pf_altivec_float.h
index 30e3f18..ef2526d 100644
--- a/simd/pf_altivec_float.h
+++ b/simd/pf_altivec_float.h
@@ -37,7 +37,7 @@
    Altivec support macros
 */
 #if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
-#pragma message __FILE__ ": ALTIVEC float macros are defined"
+#pragma message( __FILE__ ": ALTIVEC float macros are defined" )
 typedef vector float v4sf;
 
 #  define SIMD_SZ 4
diff --git a/simd/pf_avx_double.h b/simd/pf_avx_double.h
index 62c9123..251f0b9 100644
--- a/simd/pf_avx_double.h
+++ b/simd/pf_avx_double.h
@@ -46,8 +46,8 @@
 /*
   AVX support macros
 */
-#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && defined(__AVX__)
-#pragma message __FILE__ ": AVX macros are defined"
+#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && !defined(PFFFT_AVX_DISABLE) && defined(__AVX__)
+#pragma message( __FILE__ ": AVX macros are defined" )
 
 #include <immintrin.h>
 typedef __m256d v4sf;
diff --git a/simd/pf_double.h b/simd/pf_double.h
index c6bac31..1025827 100644
--- a/simd/pf_double.h
+++ b/simd/pf_double.h
@@ -60,10 +60,12 @@
 typedef double vsfscalar;
 
 #include "pf_avx_double.h"
+#include "pf_sse2_double.h"
+#include "pf_neon_double.h"
 
 #ifndef SIMD_SZ
 #  if !defined(PFFFT_SIMD_DISABLE)
-#    warning "building double with simd disabled !\n";
+#    pragma message( "building double with simd disabled !" )
 #    define PFFFT_SIMD_DISABLE /* fallback to scalar code */
 #  endif
 #endif
diff --git a/simd/pf_float.h b/simd/pf_float.h
index 1798194..eab2723 100644
--- a/simd/pf_float.h
+++ b/simd/pf_float.h
@@ -65,7 +65,7 @@ typedef float vsfscalar;
 
 #ifndef SIMD_SZ
 #  if !defined(PFFFT_SIMD_DISABLE)
-#    warning "building float with simd disabled !\n";
+#    pragma message( "building float with simd disabled !" )
 #    define PFFFT_SIMD_DISABLE /* fallback to scalar code */
 #  endif
 #endif
diff --git a/simd/pf_neon_double.h b/simd/pf_neon_double.h
new file mode 100644
index 0000000..e432abc
--- /dev/null
+++ b/simd/pf_neon_double.h
@@ -0,0 +1,203 @@
+/*
+   Copyright (c) 2020  Dario Mambro ( dario.mambro@gmail.com )
+*/
+
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+
+#ifndef PF_NEON_DBL_H
+#define PF_NEON_DBL_H
+
+/*
+  NEON 64bit support macros
+*/
+#if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__aarch64__) || defined(__arm64__))
+
+#pragma message (__FILE__ ": NEON (from AVX) macros are defined" )
+
+#include "pf_neon_double_from_avx.h"
+typedef __m256d v4sf;
+
+/* 4 doubles by simd vector */
+#  define SIMD_SZ 4
+
+typedef union v4sf_union {
+  v4sf  v;
+  double f[SIMD_SZ];
+} v4sf_union;
+
+#  define VARCH "NEON"
+#  define VREQUIRES_ALIGN 1
+#  define VZERO() _mm256_setzero_pd()
+#  define VMUL(a,b) _mm256_mul_pd(a,b)
+#  define VADD(a,b) _mm256_add_pd(a,b)
+#  define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c)
+#  define VSUB(a,b) _mm256_sub_pd(a,b)
+#  define LD_PS1(p) _mm256_set1_pd(p)
+#  define VLOAD_UNALIGNED(ptr)  _mm256_loadu_pd(ptr)
+#  define VLOAD_ALIGNED(ptr)    _mm256_load_pd(ptr)
+
+FORCE_INLINE __m256d _mm256_insertf128_pd_1(__m256d a, __m128d b)
+{
+    __m256d res;
+    res.vect_f64[0] = a.vect_f64[0];
+    res.vect_f64[1] = b;
+    return res;
+}
+
+FORCE_INLINE __m128d _mm_shuffle_pd_00(__m128d a, __m128d b)
+{
+    float64x1_t al = vget_low_f64(a);
+    float64x1_t bl = vget_low_f64(b);
+    return vcombine_f64(al, bl);
+}
+
+FORCE_INLINE __m128d _mm_shuffle_pd_11(__m128d a, __m128d b)
+{
+    float64x1_t ah = vget_high_f64(a);
+    float64x1_t bh = vget_high_f64(b);
+    return vcombine_f64(ah, bh);
+}
+
+FORCE_INLINE __m256d _mm256_shuffle_pd_00(__m256d a, __m256d b)
+{
+    __m256d res;
+    res.vect_f64[0] = _mm_shuffle_pd_00(a.vect_f64[0],b.vect_f64[0]);
+    res.vect_f64[1] = _mm_shuffle_pd_00(a.vect_f64[1],b.vect_f64[1]);
+    return res;
+}
+
+FORCE_INLINE __m256d _mm256_shuffle_pd_11(__m256d a, __m256d b)
+{
+    __m256d res;
+    res.vect_f64[0] = _mm_shuffle_pd_11(a.vect_f64[0],b.vect_f64[0]);
+    res.vect_f64[1] = _mm_shuffle_pd_11(a.vect_f64[1],b.vect_f64[1]);
+    return res;
+}
+
+FORCE_INLINE __m256d _mm256_permute2f128_pd_0x20(__m256d a, __m256d b) {
+    __m256d res;
+    res.vect_f64[0] = a.vect_f64[0];
+    res.vect_f64[1] = b.vect_f64[0];
+    return res;
+}
+
+
+FORCE_INLINE __m256d _mm256_permute2f128_pd_0x31(__m256d a, __m256d b)
+{
+    __m256d res;
+    res.vect_f64[0] = a.vect_f64[1];
+    res.vect_f64[1] = b.vect_f64[1];
+    return res;
+}
+
+FORCE_INLINE __m256d _mm256_reverse(__m256d x)
+{
+    __m256d res;
+    float64x2_t low = x.vect_f64[0];
+    float64x2_t high = x.vect_f64[1];
+    float64x1_t a = vget_low_f64(low);
+    float64x1_t b = vget_high_f64(low);
+    float64x1_t c = vget_low_f64(high);
+    float64x1_t d = vget_high_f64(high);
+    res.vect_f64[0] =  vcombine_f64(d, c);
+    res.vect_f64[1] =  vcombine_f64(b, a);
+    return res;
+}
+
+/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code:
+out1 = [ in1[0], in2[0], in1[1], in2[1] ]
+out2 = [ in1[2], in2[2], in1[3], in2[3] ]
+*/
+#  define INTERLEAVE2(in1, in2, out1, out2) {							\
+	__m128d low1__ = _mm256_castpd256_pd128(in1);						\
+	__m128d low2__ = _mm256_castpd256_pd128(in2);						\
+	__m128d high1__ = _mm256_extractf128_pd(in1, 1);					\
+	__m128d high2__ = _mm256_extractf128_pd(in2, 1);					\
+	__m256d tmp__ = _mm256_insertf128_pd_1(								\
+		_mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, low2__)),		\
+		_mm_shuffle_pd_11(low1__, low2__));								\
+	out2 = _mm256_insertf128_pd_1(										\
+		_mm256_castpd128_pd256(_mm_shuffle_pd_00(high1__, high2__)),	\
+		_mm_shuffle_pd_11(high1__, high2__));							\
+	out1 = tmp__;														\
+}
+
+/*UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
+out1 = [ in1[0], in1[2], in2[0], in2[2] ]
+out2 = [ in1[1], in1[3], in2[1], in2[3] ]
+*/
+#  define UNINTERLEAVE2(in1, in2, out1, out2) {							\
+	__m128d low1__ = _mm256_castpd256_pd128(in1);						\
+	__m128d low2__ = _mm256_castpd256_pd128(in2);						\
+	__m128d high1__ = _mm256_extractf128_pd(in1, 1);					\
+	__m128d high2__ = _mm256_extractf128_pd(in2, 1); 					\
+	__m256d tmp__ = _mm256_insertf128_pd_1(								\
+		_mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, high1__)),		\
+		_mm_shuffle_pd_00(low2__, high2__));							\
+	out2 = _mm256_insertf128_pd_1(										\
+		_mm256_castpd128_pd256(_mm_shuffle_pd_11(low1__, high1__)),		\
+		_mm_shuffle_pd_11(low2__, high2__));							\
+	out1 = tmp__;														\
+}
+
+#  define VTRANSPOSE4(row0, row1, row2, row3) {							\
+        __m256d tmp3, tmp2, tmp1, tmp0;                     			\
+                                                            			\
+        tmp0 = _mm256_shuffle_pd_00((row0),(row1));       				\
+        tmp2 = _mm256_shuffle_pd_11((row0),(row1));       				\
+        tmp1 = _mm256_shuffle_pd_00((row2),(row3));       				\
+        tmp3 = _mm256_shuffle_pd_11((row2),(row3));       				\
+                                                            			\
+        (row0) = _mm256_permute2f128_pd_0x20(tmp0, tmp1);			    \
+        (row1) = _mm256_permute2f128_pd_0x20(tmp2, tmp3); 		        \
+        (row2) = _mm256_permute2f128_pd_0x31(tmp0, tmp1); 		        \
+        (row3) = _mm256_permute2f128_pd_0x31(tmp2, tmp3); 		        \
+    }
+
+/*VSWAPHL(a, b) pseudo code:
+return [ b[0], b[1], a[2], a[3] ]
+*/
+#  define VSWAPHL(a,b)	\
+   _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1))
+
+/* reverse/flip all floats */
+#  define VREV_S(a)   _mm256_reverse(a)
+
+/* reverse/flip complex floats */
+#  define VREV_C(a)    _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a))
+
+#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)
+
+#endif
+
+#endif /* PF_AVX_DBL_H */
+
diff --git a/simd/pf_neon_double_from_avx.h b/simd/pf_neon_double_from_avx.h
new file mode 100644
index 0000000..5cce17e
--- /dev/null
+++ b/simd/pf_neon_double_from_avx.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+
+ * http://www.apache.org/licenses/LICENSE-2.0
+
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+
+ */
+
+//see https://github.com/kunpengcompute/AvxToNeon
+
+#ifndef PF_NEON_DBL_FROM_AVX_H
+#define PF_NEON_DBL_FROM_AVX_H
+#include <arm_neon.h>
+
+
+#if defined(__GNUC__) || defined(__clang__)
+
+#pragma push_macro("FORCE_INLINE")
+#define FORCE_INLINE static inline __attribute__((always_inline))
+
+#else
+
+#error "Macro name collisions may happens with unknown compiler"
+#ifdef FORCE_INLINE
+#undef FORCE_INLINE
+#endif
+
+#define FORCE_INLINE static inline
+
+#endif
+
+typedef struct {
+    float32x4_t vect_f32[2];
+} __m256;
+
+typedef struct {
+    float64x2_t vect_f64[2];
+} __m256d;
+
+typedef float64x2_t __m128d;
+
+FORCE_INLINE __m256d _mm256_setzero_pd(void)
+{
+    __m256d ret;
+    ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(0.0);
+    return ret;
+}
+
+FORCE_INLINE __m256d _mm256_mul_pd(__m256d a, __m256d b)
+{
+    __m256d res_m256d;
+    res_m256d.vect_f64[0] = vmulq_f64(a.vect_f64[0], b.vect_f64[0]);
+    res_m256d.vect_f64[1] = vmulq_f64(a.vect_f64[1], b.vect_f64[1]);
+    return res_m256d;
+}
+
+FORCE_INLINE __m256d _mm256_add_pd(__m256d a, __m256d b)
+{
+    __m256d res_m256d;
+    res_m256d.vect_f64[0] = vaddq_f64(a.vect_f64[0], b.vect_f64[0]);
+    res_m256d.vect_f64[1] = vaddq_f64(a.vect_f64[1], b.vect_f64[1]);
+    return res_m256d;
+}
+
+FORCE_INLINE __m256d _mm256_sub_pd(__m256d a, __m256d b)
+{
+    __m256d res_m256d;
+    res_m256d.vect_f64[0] = vsubq_f64(a.vect_f64[0], b.vect_f64[0]);
+    res_m256d.vect_f64[1] = vsubq_f64(a.vect_f64[1], b.vect_f64[1]);
+    return res_m256d;
+}
+
+FORCE_INLINE __m256d _mm256_set1_pd(double a)
+{
+    __m256d ret;
+    ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(a);
+    return ret;
+}
+
+FORCE_INLINE __m256d _mm256_load_pd (double const * mem_addr)
+{
+    __m256d res;
+    res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
+    res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
+    return res;
+}
+FORCE_INLINE __m256d _mm256_loadu_pd (double const * mem_addr)
+{
+    __m256d res;
+    res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
+    res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
+    return res;
+}
+
+FORCE_INLINE __m128d _mm256_castpd256_pd128(__m256d a)
+{
+    return a.vect_f64[0];
+}
+
+FORCE_INLINE __m128d _mm256_extractf128_pd (__m256d a, const int imm8)
+{
+    assert(imm8 >= 0 && imm8 <= 1);
+    return a.vect_f64[imm8];
+}
+
+FORCE_INLINE __m256d _mm256_castpd128_pd256(__m128d a)
+{
+    __m256d res;
+    res.vect_f64[0] = a;
+    return res;
+}
+
+#endif /* PF_AVX_DBL_H */
+
diff --git a/simd/pf_neon_float.h b/simd/pf_neon_float.h
index 1bdd370..c7a547f 100644
--- a/simd/pf_neon_float.h
+++ b/simd/pf_neon_float.h
@@ -37,7 +37,7 @@
   ARM NEON support macros
 */
 #if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__arm__) || defined(__aarch64__) || defined(__arm64__))
-#pragma message __FILE__ ": ARM NEON macros are defined"
+#pragma message( __FILE__ ": ARM NEON macros are defined" )
 
 #  include <arm_neon.h>
 typedef float32x4_t v4sf;
@@ -80,7 +80,7 @@ typedef union v4sf_union {
 #  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
 
 #else
-/* #pragma message __FILE__ ": ARM NEON macros are not defined" */
+/* #pragma message( __FILE__ ": ARM NEON macros are not defined" ) */
 #endif
 
 #endif /* PF_NEON_FLT_H */
diff --git a/simd/pf_scalar_double.h b/simd/pf_scalar_double.h
index 8c88c05..b7a1cae 100644
--- a/simd/pf_scalar_double.h
+++ b/simd/pf_scalar_double.h
@@ -39,7 +39,7 @@
 */
 
 #if !defined(SIMD_SZ) && defined(PFFFT_SCALVEC_ENABLED)
-#pragma message __FILE__ ": double SCALAR4 macros are defined"
+#pragma message( __FILE__ ": double SCALAR4 macros are defined" )
 
 typedef struct {
   vsfscalar a;
@@ -149,12 +149,12 @@ typedef union v4sf_union {
   }
 
 #else
-/* #pragma message __FILE__ ": double SCALAR4 macros are not defined" */
+/* #pragma message( __FILE__ ": double SCALAR4 macros are not defined" ) */
 #endif
 
 
 #if !defined(SIMD_SZ)
-#pragma message __FILE__ ": float SCALAR1 macros are defined"
+#pragma message( __FILE__ ": float SCALAR1 macros are defined" )
 typedef vsfscalar v4sf;
 
 #  define SIMD_SZ 1
@@ -177,7 +177,7 @@ typedef union v4sf_union {
 #  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(vsfscalar)-1) ) == 0)
 
 #else
-/* #pragma message __FILE__ ": double SCALAR1 macros are not defined" */
+/* #pragma message( __FILE__ ": double SCALAR1 macros are not defined" ) */
 #endif
 
 
diff --git a/simd/pf_scalar_float.h b/simd/pf_scalar_float.h
index 7e5e894..4588588 100644
--- a/simd/pf_scalar_float.h
+++ b/simd/pf_scalar_float.h
@@ -39,7 +39,7 @@
 */
 
 #if !defined(SIMD_SZ) && defined(PFFFT_SCALVEC_ENABLED)
-#pragma message __FILE__ ": float SCALAR4 macros are defined"
+#pragma message( __FILE__ ": float SCALAR4 macros are defined" )
 
 typedef struct {
   vsfscalar a;
@@ -149,12 +149,12 @@ typedef union v4sf_union {
   }
 
 #else
-/* #pragma message __FILE__ ": float SCALAR4 macros are not defined" */
+/* #pragma message( __FILE__ ": float SCALAR4 macros are not defined" ) */
 #endif
 
 
 #if !defined(SIMD_SZ)
-#pragma message __FILE__ ": float SCALAR1 macros are defined"
+#pragma message( __FILE__ ": float SCALAR1 macros are defined" )
 typedef vsfscalar v4sf;
 
 #  define SIMD_SZ 1
@@ -177,7 +177,7 @@ typedef union v4sf_union {
 #  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & (sizeof(vsfscalar)-1) ) == 0)
 
 #else
-/* #pragma message __FILE__ ": float SCALAR1 macros are not defined" */
+/* #pragma message( __FILE__ ": float SCALAR1 macros are not defined" ) */
 #endif
 
 
diff --git a/simd/pf_sse1_float.h b/simd/pf_sse1_float.h
index 808350a..ac649db 100644
--- a/simd/pf_sse1_float.h
+++ b/simd/pf_sse1_float.h
@@ -37,7 +37,7 @@
   SSE1 support macros
 */
 #if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))
-#pragma message __FILE__ ": SSE1 float macros are defined"
+#pragma message( __FILE__ ": SSE1 float macros are defined" )
 
 #include <xmmintrin.h>
 typedef __m128 v4sf;
@@ -75,7 +75,7 @@ typedef union v4sf_union {
 #  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
 
 #else
-/* #pragma message __FILE__ ": SSE1 float macros are not defined" */
+/* #pragma message( __FILE__ ": SSE1 float macros are not defined" ) */
 #endif
 
 #endif /* PF_SSE1_FLT_H */
diff --git a/simd/pf_sse2_double.h b/simd/pf_sse2_double.h
new file mode 100644
index 0000000..6c53e8f
--- /dev/null
+++ b/simd/pf_sse2_double.h
@@ -0,0 +1,281 @@
+/*
+   Copyright (c) 2020  Dario Mambro ( dario.mambro@gmail.com )
+*/
+
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+
+#ifndef PF_NEON_DBL_H
+#define PF_NEON_DBL_H
+
+//detect sse2 support under MSVC
+#if defined ( _M_IX86_FP )
+#  if _M_IX86_FP == 2
+#    if !defined(__SSE2__)
+#      define __SSE2__
+#    endif
+#  endif
+#endif
+
+/*
+  SSE2 64bit support macros
+*/
+#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (defined( __SSE4_2__ ) |  defined( __SSE4_1__ ) || defined( __SSE3__ ) || defined( __SSE2__ ) || defined ( __x86_64__ ))
+#pragma message (__FILE__ ": SSE2 double macros are defined" )
+
+#include <emmintrin.h>
+
+typedef struct {
+    __m128d d128[2];
+} m256d;
+
+typedef m256d v4sf;
+
+#  define SIMD_SZ 4
+
+typedef union v4sf_union {
+  v4sf  v;
+  double f[SIMD_SZ];
+} v4sf_union;
+
+
+#if defined(__GNUC__) || defined(__clang__)
+
+#pragma push_macro("FORCE_INLINE")
+#define FORCE_INLINE static inline __attribute__((always_inline))
+
+#elif defined (_MSC_VER)
+#define FORCE_INLINE static __forceinline
+
+#else
+#error "Macro name collisions may happens with unknown compiler"
+#ifdef FORCE_INLINE
+#undef FORCE_INLINE
+#endif
+#define FORCE_INLINE static inline
+#endif
+
+FORCE_INLINE m256d mm256_setzero_pd(void)
+{
+    m256d ret;
+    ret.d128[0] = ret.d128[1] = _mm_setzero_pd();
+    return ret;
+}
+
+FORCE_INLINE m256d mm256_mul_pd(m256d a, m256d b)
+{
+    m256d ret;
+    ret.d128[0] = _mm_mul_pd(a.d128[0], b.d128[0]);
+    ret.d128[1] = _mm_mul_pd(a.d128[1], b.d128[1]);
+    return ret;
+}
+
+FORCE_INLINE m256d mm256_add_pd(m256d a, m256d b)
+{
+    m256d ret;
+    ret.d128[0] = _mm_add_pd(a.d128[0], b.d128[0]);
+    ret.d128[1] = _mm_add_pd(a.d128[1], b.d128[1]);
+    return ret;
+}
+
+FORCE_INLINE m256d mm256_sub_pd(m256d a, m256d b)
+{
+    m256d ret;
+    ret.d128[0] = _mm_sub_pd(a.d128[0], b.d128[0]);
+    ret.d128[1] = _mm_sub_pd(a.d128[1], b.d128[1]);
+    return ret;
+}
+
+FORCE_INLINE m256d mm256_set1_pd(double a)
+{
+    m256d ret;
+    ret.d128[0] = ret.d128[1] = _mm_set1_pd(a);
+    return ret;
+}
+
+FORCE_INLINE m256d mm256_load_pd (double const * mem_addr)
+{
+    m256d res;
+    res.d128[0] = _mm_load_pd((const double *)mem_addr);
+    res.d128[1] = _mm_load_pd((const double *)mem_addr + 2);
+    return res;
+}
+FORCE_INLINE m256d mm256_loadu_pd (double const * mem_addr)
+{
+    m256d res;
+    res.d128[0] = _mm_loadu_pd((const double *)mem_addr);
+    res.d128[1] = _mm_loadu_pd((const double *)mem_addr + 2);
+    return res;
+}
+
+
+#  define VARCH "SSE2"
+#  define VREQUIRES_ALIGN 1
+#  define VZERO() mm256_setzero_pd()
+#  define VMUL(a,b) mm256_mul_pd(a,b)
+#  define VADD(a,b) mm256_add_pd(a,b)
+#  define VMADD(a,b,c) mm256_add_pd(mm256_mul_pd(a,b), c)
+#  define VSUB(a,b) mm256_sub_pd(a,b)
+#  define LD_PS1(p) mm256_set1_pd(p)
+#  define VLOAD_UNALIGNED(ptr)  mm256_loadu_pd(ptr)
+#  define VLOAD_ALIGNED(ptr)    mm256_load_pd(ptr)
+
+
+FORCE_INLINE __m128d mm256_castpd256_pd128(m256d a)
+{
+    return a.d128[0];
+}
+
+FORCE_INLINE __m128d mm256_extractf128_pd (m256d a, const int imm8)
+{
+    assert(imm8 >= 0 && imm8 <= 1);
+    return a.d128[imm8];
+}
+FORCE_INLINE m256d mm256_insertf128_pd_1(m256d a, __m128d b)
+{
+    m256d res;
+    res.d128[0] = a.d128[0];
+    res.d128[1] = b;
+    return res;
+}
+FORCE_INLINE m256d mm256_castpd128_pd256(__m128d a)
+{
+    m256d res;
+    res.d128[0] = a;
+    return res;
+}
+
+FORCE_INLINE m256d mm256_shuffle_pd_00(m256d a, m256d b)
+{
+    m256d res;
+    res.d128[0] = _mm_shuffle_pd(a.d128[0],b.d128[0],0);
+    res.d128[1] = _mm_shuffle_pd(a.d128[1],b.d128[1],0);
+    return res;
+}
+
+FORCE_INLINE m256d mm256_shuffle_pd_11(m256d a, m256d b)
+{
+    m256d res;
+    res.d128[0] = _mm_shuffle_pd(a.d128[0],b.d128[0], 3);
+    res.d128[1] = _mm_shuffle_pd(a.d128[1],b.d128[1], 3);
+    return res;
+}
+
+FORCE_INLINE m256d mm256_permute2f128_pd_0x20(m256d a, m256d b) {
+    m256d res;
+    res.d128[0] = a.d128[0];
+    res.d128[1] = b.d128[0];
+    return res;
+}
+
+
+FORCE_INLINE m256d mm256_permute2f128_pd_0x31(m256d a, m256d b)
+{
+    m256d res;
+    res.d128[0] = a.d128[1];
+    res.d128[1] = b.d128[1];
+    return res;
+}
+
+FORCE_INLINE m256d mm256_reverse(m256d x)
+{
+    m256d res;
+    res.d128[0] = _mm_shuffle_pd(x.d128[1],x.d128[1],1);
+    res.d128[1] = _mm_shuffle_pd(x.d128[0],x.d128[0],1);
+    return res;
+}
+
+/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code:
+out1 = [ in1[0], in2[0], in1[1], in2[1] ]
+out2 = [ in1[2], in2[2], in1[3], in2[3] ]
+*/
+#  define INTERLEAVE2(in1, in2, out1, out2) {							\
+	__m128d low1__ = mm256_castpd256_pd128(in1);						\
+	__m128d low2__ = mm256_castpd256_pd128(in2);						\
+	__m128d high1__ = mm256_extractf128_pd(in1, 1);					\
+	__m128d high2__ = mm256_extractf128_pd(in2, 1);					\
+	m256d tmp__ = mm256_insertf128_pd_1(								\
+		mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)),		\
+		_mm_shuffle_pd(low1__, low2__, 3));								\
+	out2 = mm256_insertf128_pd_1(										\
+		mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)),	\
+		_mm_shuffle_pd(high1__, high2__, 3));							\
+	out1 = tmp__;														\
+}
+
+/*UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
+out1 = [ in1[0], in1[2], in2[0], in2[2] ]
+out2 = [ in1[1], in1[3], in2[1], in2[3] ]
+*/
+#  define UNINTERLEAVE2(in1, in2, out1, out2) {							\
+	__m128d low1__ = mm256_castpd256_pd128(in1);						\
+	__m128d low2__ = mm256_castpd256_pd128(in2);						\
+	__m128d high1__ = mm256_extractf128_pd(in1, 1);					\
+	__m128d high2__ = mm256_extractf128_pd(in2, 1); 					\
+	m256d tmp__ = mm256_insertf128_pd_1(								\
+		mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)),		\
+		_mm_shuffle_pd(low2__, high2__, 0));							\
+	out2 = mm256_insertf128_pd_1(										\
+		mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)),		\
+		_mm_shuffle_pd(low2__, high2__, 3));							\
+	out1 = tmp__;														\
+}
+
+#  define VTRANSPOSE4(row0, row1, row2, row3) {							\
+        m256d tmp3, tmp2, tmp1, tmp0;                     			\
+                                                            			\
+        tmp0 = mm256_shuffle_pd_00((row0),(row1));       				\
+        tmp2 = mm256_shuffle_pd_11((row0),(row1));       				\
+        tmp1 = mm256_shuffle_pd_00((row2),(row3));       				\
+        tmp3 = mm256_shuffle_pd_11((row2),(row3));       				\
+                                                            			\
+        (row0) = mm256_permute2f128_pd_0x20(tmp0, tmp1);			    \
+        (row1) = mm256_permute2f128_pd_0x20(tmp2, tmp3); 		        \
+        (row2) = mm256_permute2f128_pd_0x31(tmp0, tmp1); 		        \
+        (row3) = mm256_permute2f128_pd_0x31(tmp2, tmp3); 		        \
+    }
+
+/*VSWAPHL(a, b) pseudo code:
+return [ b[0], b[1], a[2], a[3] ]
+*/
+#  define VSWAPHL(a,b)	\
+   mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_castpd256_pd128(b)), mm256_extractf128_pd(a, 1))
+
+/* reverse/flip all floats */
+#  define VREV_S(a)   mm256_reverse(a)
+
+/* reverse/flip complex floats */
+#  define VREV_C(a)    mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_extractf128_pd(a, 1)), mm256_castpd256_pd128(a))
+
+#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)
+
+#endif
+#endif
diff --git a/sse2neon.h b/sse2neon.h
new file mode 100644
index 0000000..b28a797
--- /dev/null
+++ b/sse2neon.h
@@ -0,0 +1,5956 @@
+#ifndef SSE2NEON_H
+#define SSE2NEON_H
+
+// This header file provides a simple API translation layer
+// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions
+//
+// This header file does not yet translate all of the SSE intrinsics.
+//
+// Contributors to this work are:
+//   John W. Ratcliff <jratcliffscarab@gmail.com>
+//   Brandon Rowlett <browlett@nvidia.com>
+//   Ken Fast <kfast@gdeb.com>
+//   Eric van Beurden <evanbeurden@nvidia.com>
+//   Alexander Potylitsin <apotylitsin@nvidia.com>
+//   Hasindu Gamaarachchi <hasindu2008@gmail.com>
+//   Jim Huang <jserv@biilabs.io>
+//   Mark Cheng <marktwtn@biilabs.io>
+//   Malcolm James MacLeod <malcolm@gulden.com>
+//   Devin Hussey (easyaspi314) <husseydevin@gmail.com>
+//   Sebastian Pop <spop@amazon.com>
+//   Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
+//   Danila Kutenin <danilak@google.com>
+//   François Turban (JishinMaster) <francois.turban@gmail.com>
+//   Pei-Hsuan Hung <afcidk@gmail.com>
+//   Yang-Hao Yuan <yanghau@biilabs.io>
+
+/*
+ * sse2neon is freely redistributable under the MIT License.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Tunable configurations */
+
+/* Enable precise implementation of _mm_min_ps and _mm_max_ps
+ * This would slow down the computation a bit, but gives consistent result with
+ * x86 SSE2. (e.g. would solve a hole or NaN pixel in the rendering result)
+ */
+#ifndef SSE2NEON_PRECISE_MINMAX
+#define SSE2NEON_PRECISE_MINMAX (0)
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#pragma push_macro("FORCE_INLINE")
+#pragma push_macro("ALIGN_STRUCT")
+#define FORCE_INLINE static inline __attribute__((always_inline))
+#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
+#else
+#error "Macro name collisions may happen with unsupported compiler."
+#ifdef FORCE_INLINE
+#undef FORCE_INLINE
+#endif
+#define FORCE_INLINE static inline
+#ifndef ALIGN_STRUCT
+#define ALIGN_STRUCT(x) __declspec(align(x))
+#endif
+#endif
+
+#include <stdint.h>
+#include <stdlib.h>
+
+/* Architecture-specific build options */
+/* FIXME: #pragma GCC push_options is only available on GCC */
+#if defined(__GNUC__)
+#if defined(__arm__) && __ARM_ARCH == 7
+/* According to ARM C Language Extensions Architecture specification,
+ * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
+ * architecture supported.
+ */
+#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
+#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
+#endif
+#pragma GCC push_options
+#pragma GCC target("fpu=neon")
+#elif defined(__aarch64__)
+#pragma GCC push_options
+#pragma GCC target("+simd")
+#else
+#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
+#endif
+#endif
+
+#include <arm_neon.h>
+
+/* Rounding functions require either Aarch64 instructions or libm failback */
+#if !defined(__aarch64__)
+#include <math.h>
+#endif
+
+/* "__has_builtin" can be used to query support for built-in functions
+ * provided by gcc/clang and other compilers that support it.
+ */
+#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
+/* Compatibility with gcc <= 9 */
+#if __GNUC__ <= 9
+#define __has_builtin(x) HAS##x
+#define HAS__builtin_popcount 1
+#define HAS__builtin_popcountll 1
+#else
+#define __has_builtin(x) 0
+#endif
+#endif
+
+/**
+ * MACRO for shuffle parameter for _mm_shuffle_ps().
+ * Argument fp3 is a digit[0123] that represents the fp from argument "b"
+ * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
+ * for fp2 in result. fp1 is a digit[0123] that represents the fp from
+ * argument "a" of mm_shuffle_ps that will be places in fp1 of result.
+ * fp0 is the same for fp0 of result.
+ */
+#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
+    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
+
+/* Rounding mode macros. */
+#define _MM_FROUND_TO_NEAREST_INT 0x00
+#define _MM_FROUND_TO_NEG_INF 0x01
+#define _MM_FROUND_TO_POS_INF 0x02
+#define _MM_FROUND_TO_ZERO 0x03
+#define _MM_FROUND_CUR_DIRECTION 0x04
+#define _MM_FROUND_NO_EXC 0x08
+
+/* indicate immediate constant argument in a given range */
+#define __constrange(a, b) const
+
+/* A few intrinsics accept traditional data types like ints or floats, but
+ * most operate on data types that are specific to SSE.
+ * If a vector type ends in d, it contains doubles, and if it does not have
+ * a suffix, it contains floats. An integer vector type can contain any type
+ * of integer, from chars to shorts to unsigned long longs.
+ */
+typedef int64x1_t __m64;
+typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
+// On ARM 32-bit architecture, the float64x2_t is not supported.
+// The data type __m128d should be represented in a different way for related
+// intrinsic conversion.
+#if defined(__aarch64__)
+typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
+#else
+typedef float32x4_t __m128d;
+#endif
+typedef int64x2_t __m128i; /* 128-bit vector containing integers */
+
+/* type-safe casting between types */
+
+#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
+#define vreinterpretq_m128_f32(x) (x)
+#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
+
+#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
+#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
+#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
+#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
+
+#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
+#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
+#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
+#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
+
+#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
+#define vreinterpretq_f32_m128(x) (x)
+#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
+
+#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
+#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
+#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
+#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
+
+#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
+#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
+#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
+#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
+
+#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
+#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
+#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
+#define vreinterpretq_m128i_s64(x) (x)
+
+#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
+#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
+#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
+#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
+
+#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
+#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
+#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
+#define vreinterpretq_s64_m128i(x) (x)
+
+#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
+#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
+#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
+#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
+
+#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
+#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
+#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
+#define vreinterpret_m64_s64(x) (x)
+
+#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
+#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
+#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
+#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
+
+#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
+#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
+#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
+
+#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
+#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
+#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
+#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
+
+#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
+#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
+#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
+#define vreinterpret_s64_m64(x) (x)
+
+#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
+
+#if defined(__aarch64__)
+#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
+#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
+
+#define vreinterpretq_m128d_f64(x) (x)
+
+#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
+
+#define vreinterpretq_f64_m128d(x) (x)
+#else
+#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
+#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
+
+#define vreinterpretq_m128d_f32(x) (x)
+
+#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
+
+#define vreinterpretq_f32_m128d(x) (x)
+#endif
+
+// A struct is defined in this header file called 'SIMDVec' which can be used
+// by applications which attempt to access the contents of an _m128 struct
+// directly.  It is important to note that accessing the __m128 struct directly
+// is bad coding practice by Microsoft: @see:
+// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
+//
+// However, some legacy source code may try to access the contents of an __m128
+// struct directly so the developer can use the SIMDVec as an alias for it.  Any
+// casting must be done manually by the developer, as you cannot cast or
+// otherwise alias the base NEON data type for intrinsic operations.
+//
+// union intended to allow direct access to an __m128 variable using the names
+// that the MSVC compiler provides.  This union should really only be used when
+// trying to access the members of the vector as integer values.  GCC/clang
+// allow native access to the float members through a simple array access
+// operator (in C since 4.6, in C++ since 4.8).
+//
+// Ideally direct accesses to SIMD vectors should not be used since it can cause
+// a performance hit.  If it really is needed however, the original __m128
+// variable can be aliased with a pointer to this union and used to access
+// individual components.  The use of this union should be hidden behind a macro
+// that is used throughout the codebase to access the members instead of always
+// declaring this type of variable.
+typedef union ALIGN_STRUCT(16) SIMDVec {
+    float m128_f32[4];     // as floats - DON'T USE. Added for convenience.
+    int8_t m128_i8[16];    // as signed 8-bit integers.
+    int16_t m128_i16[8];   // as signed 16-bit integers.
+    int32_t m128_i32[4];   // as signed 32-bit integers.
+    int64_t m128_i64[2];   // as signed 64-bit integers.
+    uint8_t m128_u8[16];   // as unsigned 8-bit integers.
+    uint16_t m128_u16[8];  // as unsigned 16-bit integers.
+    uint32_t m128_u32[4];  // as unsigned 32-bit integers.
+    uint64_t m128_u64[2];  // as unsigned 64-bit integers.
+} SIMDVec;
+
+// casting using SIMDVec
+#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
+#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
+#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
+
+/* Backwards compatibility for compilers with lack of specific type support */
+
+// Older gcc does not define vld1q_u8_x4 type
+#if defined(__GNUC__) && !defined(__clang__)
+#if __GNUC__ <= 9
+FORCE_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t *p)
+{
+    uint8x16x4_t ret;
+    ret.val[0] = vld1q_u8(p + 0);
+    ret.val[1] = vld1q_u8(p + 16);
+    ret.val[2] = vld1q_u8(p + 32);
+    ret.val[3] = vld1q_u8(p + 48);
+    return ret;
+}
+#endif
+#endif
+
+/* Function Naming Conventions
+ * The naming convention of SSE intrinsics is straightforward. A generic SSE
+ * intrinsic function is given as follows:
+ *   _mm_<name>_<data_type>
+ *
+ * The parts of this format are given as follows:
+ * 1. <name> describes the operation performed by the intrinsic
+ * 2. <data_type> identifies the data type of the function's primary arguments
+ *
+ * This last part, <data_type>, is a little complicated. It identifies the
+ * content of the input values, and can be set to any of the following values:
+ * + ps - vectors contain floats (ps stands for packed single-precision)
+ * + pd - vectors cantain doubles (pd stands for packed double-precision)
+ * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ *                            signed integers
+ * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ *                            unsigned integers
+ * + si128 - unspecified 128-bit vector or 256-bit vector
+ * + m128/m128i/m128d - identifies input vector types when they are different
+ *                      than the type of the returned vector
+ *
+ * For example, _mm_setzero_ps. The _mm implies that the function returns
+ * a 128-bit vector. The _ps at the end implies that the argument vectors
+ * contain floats.
+ *
+ * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
+ *   // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
+ *   __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ *   // Set packed 8-bit integers
+ *   // 128 bits, 16 chars, per 8 bits
+ *   __m128i v_perm = _mm_setr_epi8(1, 0,  2,  3, 8, 9, 10, 11,
+ *                                  4, 5, 12, 13, 6, 7, 14, 15);
+ *   // Shuffle packed 8-bit integers
+ *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
+ *
+ * Data (Number, Binary, Byte Index):
+    +------+------+-------------+------+------+-------------+
+    |      1      |      2      |      3      |      4      | Number
+    +------+------+------+------+------+------+------+------+
+    | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
+    +------+------+------+------+------+------+------+------+
+    |    0 |    1 |    2 |    3 |    4 |    5 |    6 |    7 | Index
+    +------+------+------+------+------+------+------+------+
+
+    +------+------+------+------+------+------+------+------+
+    |      5      |      6      |      7      |      8      | Number
+    +------+------+------+------+------+------+------+------+
+    | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
+    +------+------+------+------+------+------+------+------+
+    |    8 |    9 |   10 |   11 |   12 |   13 |   14 |   15 | Index
+    +------+------+------+------+------+------+------+------+
+ * Index (Byte Index):
+    +------+------+------+------+------+------+------+------+
+    |    1 |    0 |    2 |    3 |    8 |    9 |   10 |   11 |
+    +------+------+------+------+------+------+------+------+
+
+    +------+------+------+------+------+------+------+------+
+    |    4 |    5 |   12 |   13 |    6 |    7 |   14 |   15 |
+    +------+------+------+------+------+------+------+------+
+ * Result:
+    +------+------+------+------+------+------+------+------+
+    |    1 |    0 |    2 |    3 |    8 |    9 |   10 |   11 | Index
+    +------+------+------+------+------+------+------+------+
+    | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
+    +------+------+------+------+------+------+------+------+
+    |     256     |      2      |      5      |      6      | Number
+    +------+------+------+------+------+------+------+------+
+
+    +------+------+------+------+------+------+------+------+
+    |    4 |    5 |   12 |   13 |    6 |    7 |   14 |   15 | Index
+    +------+------+------+------+------+------+------+------+
+    | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
+    +------+------+------+------+------+------+------+------+
+    |      3      |      7      |      4      |      8      | Number
+    +------+------+------+------+------+------+-------------+
+ */
+
+/* Set/get methods */
+
+/* Constants for use with _mm_prefetch.  */
+enum _mm_hint {
+    _MM_HINT_NTA = 0,  /* load data to L1 and L2 cache, mark it as NTA */
+    _MM_HINT_T0 = 1,   /* load data to L1 and L2 cache */
+    _MM_HINT_T1 = 2,   /* load data to L2 cache only */
+    _MM_HINT_T2 = 3,   /* load data to L2 cache only, mark it as NTA */
+    _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */
+    _MM_HINT_ET0 = 5,  /* exclusive version of _MM_HINT_T0 */
+    _MM_HINT_ET1 = 6,  /* exclusive version of _MM_HINT_T1 */
+    _MM_HINT_ET2 = 7   /* exclusive version of _MM_HINT_T2 */
+};
+
+// Loads one cache line of data from address p to a location closer to the
+// processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
+FORCE_INLINE void _mm_prefetch(const void *p, int i)
+{
+    (void) i;
+    __builtin_prefetch(p);
+}
+
+// Copy the lower single-precision (32-bit) floating-point element of a to dst.
+//
+//   dst[31:0] := a[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
+FORCE_INLINE float _mm_cvtss_f32(__m128 a)
+{
+    return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+}
+
+// Sets the 128-bit value to zero
+// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_setzero_si128(void)
+{
+    return vreinterpretq_m128i_s32(vdupq_n_s32(0));
+}
+
+// Clears the four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_setzero_ps(void)
+{
+    return vreinterpretq_m128_f32(vdupq_n_f32(0));
+}
+
+// Sets the four single-precision, floating-point values to w.
+//
+//   r0 := r1 := r2 := r3 := w
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_set1_ps(float _w)
+{
+    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
+}
+
+// Sets the four single-precision, floating-point values to w.
+// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_set_ps1(float _w)
+{
+    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
+}
+
+// Sets the four single-precision, floating-point values to the four inputs.
+// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
+{
+    float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+}
+
+// Copy single-precision (32-bit) floating-point element a to the lower element
+// of dst, and zero the upper 3 elements.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss
+FORCE_INLINE __m128 _mm_set_ss(float a)
+{
+    float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+}
+
+// Sets the four single-precision, floating-point values to the four inputs in
+// reverse order.
+// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
+{
+    float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+}
+
+// Sets the 8 signed 16-bit integer values in reverse order.
+//
+// Return Value
+//   r0 := w0
+//   r1 := w1
+//   ...
+//   r7 := w7
+FORCE_INLINE __m128i _mm_setr_epi16(short w0,
+                                    short w1,
+                                    short w2,
+                                    short w3,
+                                    short w4,
+                                    short w5,
+                                    short w6,
+                                    short w7)
+{
+    int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
+    return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
+}
+
+// Sets the 4 signed 32-bit integer values in reverse order
+// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
+{
+    int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
+    return vreinterpretq_m128i_s32(vld1q_s32(data));
+}
+
+// Set packed 64-bit integers in dst with the supplied values in reverse order.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64
+FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
+{
+    return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
+}
+
+// Sets the 16 signed 8-bit integer values to b.
+//
+//   r0 := b
+//   r1 := b
+//   ...
+//   r15 := b
+//
+// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
+{
+    return vreinterpretq_m128i_s8(vdupq_n_s8(w));
+}
+
+// Sets the 8 signed 16-bit integer values to w.
+//
+//   r0 := w
+//   r1 := w
+//   ...
+//   r7 := w
+//
+// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_set1_epi16(short w)
+{
+    return vreinterpretq_m128i_s16(vdupq_n_s16(w));
+}
+
+// Sets the 16 signed 8-bit integer values.
+// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
+                                  signed char b14,
+                                  signed char b13,
+                                  signed char b12,
+                                  signed char b11,
+                                  signed char b10,
+                                  signed char b9,
+                                  signed char b8,
+                                  signed char b7,
+                                  signed char b6,
+                                  signed char b5,
+                                  signed char b4,
+                                  signed char b3,
+                                  signed char b2,
+                                  signed char b1,
+                                  signed char b0)
+{
+    int8_t ALIGN_STRUCT(16)
+        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
+                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
+                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
+                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+    return (__m128i) vld1q_s8(data);
+}
+
+// Sets the 8 signed 16-bit integer values.
+// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_set_epi16(short i7,
+                                   short i6,
+                                   short i5,
+                                   short i4,
+                                   short i3,
+                                   short i2,
+                                   short i1,
+                                   short i0)
+{
+    int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
+    return vreinterpretq_m128i_s16(vld1q_s16(data));
+}
+
+// Sets the 16 signed 8-bit integer values in reverse order.
+// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
+                                   signed char b1,
+                                   signed char b2,
+                                   signed char b3,
+                                   signed char b4,
+                                   signed char b5,
+                                   signed char b6,
+                                   signed char b7,
+                                   signed char b8,
+                                   signed char b9,
+                                   signed char b10,
+                                   signed char b11,
+                                   signed char b12,
+                                   signed char b13,
+                                   signed char b14,
+                                   signed char b15)
+{
+    int8_t ALIGN_STRUCT(16)
+        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
+                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
+                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
+                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+    return (__m128i) vld1q_s8(data);
+}
+
+// Sets the 4 signed 32-bit integer values to i.
+//
+//   r0 := i
+//   r1 := i
+//   r2 := i
+//   r3 := I
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_set1_epi32(int _i)
+{
+    return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
+}
+
+// Sets the 2 signed 64-bit integer values to i.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
+FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
+{
+    return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
+}
+
+// Sets the 2 signed 64-bit integer values to i.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x
+FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
+{
+    return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
+}
+
+// Sets the 4 signed 32-bit integer values.
+// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
+{
+    int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
+    return vreinterpretq_m128i_s32(vld1q_s32(data));
+}
+
+// Returns the __m128i structure with its two 64-bit integer values
+// initialized to the values of the two 64-bit integers passed in.
+// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
+FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
+{
+    int64_t ALIGN_STRUCT(16) data[2] = {i2, i1};
+    return vreinterpretq_m128i_s64(vld1q_s64(data));
+}
+
+// Returns the __m128i structure with its two 64-bit integer values
+// initialized to the values of the two 64-bit integers passed in.
+// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
+FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
+{
+    return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
+}
+
+// Set packed double-precision (64-bit) floating-point elements in dst with the
+// supplied values.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd
+FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
+{
+    double ALIGN_STRUCT(16) data[2] = {e0, e1};
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
+#else
+    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
+#endif
+}
+
+// Stores four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
+FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
+{
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
+}
+
+// Stores four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
+FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
+{
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
+}
+
+// Stores four 32-bit integer values as (as a __m128i value) at the address p.
+// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
+FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
+{
+    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
+}
+
+// Stores four 32-bit integer values as (as a __m128i value) at the address p.
+// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
+FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
+{
+    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
+}
+
+// Stores the lower single - precision, floating - point value.
+// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
+FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
+{
+    vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
+}
+
+// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
+// or a general-protection exception may be generated.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd
+FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+    vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
+#else
+    vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
+#endif
+}
+
+// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from a into memory. mem_addr does not need to be aligned on any
+// particular boundary.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd
+FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
+{
+    _mm_store_pd(mem_addr, a);
+}
+
+// Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
+// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
+FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
+{
+    uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
+    uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
+    *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
+}
+
+// Stores the lower two single-precision floating point values of a to the
+// address p.
+//
+//   *p0 := a0
+//   *p1 := a1
+//
+// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
+FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
+{
+    *p = vreinterpret_m64_f32(vget_low_f32(a));
+}
+
+// Stores the upper two single-precision, floating-point values of a to the
+// address p.
+//
+//   *p0 := a2
+//   *p1 := a3
+//
+// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
+FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
+{
+    *p = vreinterpret_m64_f32(vget_high_f32(a));
+}
+
+// Loads a single single-precision, floating-point value, copying it into all
+// four words
+// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_load1_ps(const float *p)
+{
+    return vreinterpretq_m128_f32(vld1q_dup_f32(p));
+}
+
+// Load a single-precision (32-bit) floating-point element from memory into all
+// elements of dst.
+//
+//   dst[31:0] := MEM[mem_addr+31:mem_addr]
+//   dst[63:32] := MEM[mem_addr+31:mem_addr]
+//   dst[95:64] := MEM[mem_addr+31:mem_addr]
+//   dst[127:96] := MEM[mem_addr+31:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1
+#define _mm_load_ps1 _mm_load1_ps
+
+// Sets the lower two single-precision, floating-point values with 64
+// bits of data loaded from the address p; the upper two values are passed
+// through from a.
+//
+// Return Value
+//   r0 := *p0
+//   r1 := *p1
+//   r2 := a2
+//   r3 := a3
+//
+// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
+{
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
+}
+
+// Load 4 single-precision (32-bit) floating-point elements from memory into dst
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+//
+//   dst[31:0] := MEM[mem_addr+127:mem_addr+96]
+//   dst[63:32] := MEM[mem_addr+95:mem_addr+64]
+//   dst[95:64] := MEM[mem_addr+63:mem_addr+32]
+//   dst[127:96] := MEM[mem_addr+31:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps
+FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
+{
+    float32x4_t v = vrev64q_f32(vld1q_f32(p));
+    return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
+}
+
+// Sets the upper two single-precision, floating-point values with 64
+// bits of data loaded from the address p; the lower two values are passed
+// through from a.
+//
+//   r0 := a0
+//   r1 := a1
+//   r2 := *p0
+//   r3 := *p1
+//
+// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
+FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
+{
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
+}
+
+// Loads four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_load_ps(const float *p)
+{
+    return vreinterpretq_m128_f32(vld1q_f32(p));
+}
+
+// Loads four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
+FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
+{
+    // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
+    // equivalent for neon
+    return vreinterpretq_m128_f32(vld1q_f32(p));
+}
+
+// Load unaligned 16-bit integer from memory into the first element of dst.
+//
+//   dst[15:0] := MEM[mem_addr+15:mem_addr]
+//   dst[MAX:16] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16
+FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
+{
+    return vreinterpretq_m128i_s16(
+        vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
+}
+
+// Load unaligned 64-bit integer from memory into the first element of dst.
+//
+//   dst[63:0] := MEM[mem_addr+63:mem_addr]
+//   dst[MAX:64] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64
+FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
+{
+    return vreinterpretq_m128i_s64(
+        vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// lower of dst, and zero the upper element. mem_addr does not need to be
+// aligned on any particular boundary.
+//
+//   dst[63:0] := MEM[mem_addr+63:mem_addr]
+//   dst[127:64] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
+FORCE_INLINE __m128d _mm_load_sd(const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
+#else
+    const float *fp = (const float *) p;
+    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
+    return vreinterpretq_m128d_f32(vld1q_f32(data));
+#endif
+}
+
+// Loads two double-precision from 16-byte aligned memory, floating-point
+// values.
+//
+//   dst[127:0] := MEM[mem_addr+127:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
+FORCE_INLINE __m128d _mm_load_pd(const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vld1q_f64(p));
+#else
+    const float *fp = (const float *) p;
+    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
+    return vreinterpretq_m128d_f32(vld1q_f32(data));
+#endif
+}
+
+// Loads two double-precision from unaligned memory, floating-point values.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
+FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
+{
+    return _mm_load_pd(p);
+}
+
+// Loads an single - precision, floating - point value into the low word and
+// clears the upper three words.
+// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
+FORCE_INLINE __m128 _mm_load_ss(const float *p)
+{
+    return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
+}
+
+FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
+{
+    /* Load the lower 64 bits of the value pointed to by p into the
+     * lower 64 bits of the result, zeroing the upper 64 bits of the result.
+     */
+    return vreinterpretq_m128i_s32(
+        vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// lower element of dst, and copy the upper element from a to dst. mem_addr does
+// not need to be aligned on any particular boundary.
+//
+//   dst[63:0] := MEM[mem_addr+63:mem_addr]
+//   dst[127:64] := a[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
+FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
+#else
+    return vreinterpretq_m128d_f32(
+        vcombine_f32(vld1_f32((const float *) p),
+                     vget_high_f32(vreinterpretq_f32_m128d(a))));
+#endif
+}
+
+// Load 2 double-precision (64-bit) floating-point elements from memory into dst
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+//
+//   dst[63:0] := MEM[mem_addr+127:mem_addr+64]
+//   dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd
+FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
+{
+#if defined(__aarch64__)
+    float64x2_t v = vld1q_f64(p);
+    return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
+#else
+    int64x2_t v = vld1q_s64((const int64_t *) p);
+    return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
+#endif
+}
+
+// Sets the low word to the single-precision, floating-point value of b
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)
+FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
+                       vreinterpretq_f32_m128(a), 0));
+}
+
+// Copy the lower 64-bit integer in a to the lower element of dst, and zero the
+// upper element.
+//
+//   dst[63:0] := a[63:0]
+//   dst[127:64] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64
+FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
+{
+    return vreinterpretq_m128i_s64(
+        vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
+}
+
+// Return vector of type __m128 with undefined elements.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps
+FORCE_INLINE __m128 _mm_undefined_ps(void)
+{
+    __m128 a;
+    return a;
+}
+
+/* Logic/Binary operations */
+
+// Computes the bitwise AND-NOT of the four single-precision, floating-point
+// values of a and b.
+//
+//   r0 := ~a0 & b0
+//   r1 := ~a1 & b1
+//   r2 := ~a2 & b2
+//   r3 := ~a3 & b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_s32(
+        vbicq_s32(vreinterpretq_s32_m128(b),
+                  vreinterpretq_s32_m128(a)));  // *NOTE* argument swap
+}
+
+// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
+// elements in a and then AND with b, and store the results in dst.
+//
+//   FOR j := 0 to 1
+// 	     i := j*64
+// 	     dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd
+FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
+{
+    // *NOTE* argument swap
+    return vreinterpretq_m128d_s64(
+        vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
+}
+
+// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
+// 128-bit value in a.
+//
+//   r := (~a) & b
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vbicq_s32(vreinterpretq_s32_m128i(b),
+                  vreinterpretq_s32_m128i(a)));  // *NOTE* argument swap
+}
+
+// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
+// b.
+//
+//   r := a & b
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Computes the bitwise AND of the four single-precision, floating-point values
+// of a and b.
+//
+//   r0 := a0 & b0
+//   r1 := a1 & b1
+//   r2 := a2 & b2
+//   r3 := a3 & b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_s32(
+        vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
+}
+
+// Compute the bitwise AND of packed double-precision (64-bit) floating-point
+// elements in a and b, and store the results in dst.
+//
+//   FOR j := 0 to 1
+//     i := j*64
+//     dst[i+63:i] := a[i+63:i] AND b[i+63:i]
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd
+FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
+{
+    return vreinterpretq_m128d_s64(
+        vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
+}
+
+// Computes the bitwise OR of the four single-precision, floating-point values
+// of a and b.
+// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_s32(
+        vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
+}
+
+// Computes bitwise EXOR (exclusive-or) of the four single-precision,
+// floating-point values of a and b.
+// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_s32(
+        veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
+}
+
+// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
+// elements in a and b, and store the results in dst.
+//
+//   FOR j := 0 to 1
+//      i := j*64
+//      dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd
+FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
+{
+    return vreinterpretq_m128d_s64(
+        veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
+}
+
+// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
+//
+//   r := a | b
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
+// b.  https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Duplicate odd-indexed single-precision (32-bit) floating-point elements
+// from a, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps
+FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
+{
+#if __has_builtin(__builtin_shufflevector)
+    return vreinterpretq_m128_f32(__builtin_shufflevector(
+        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
+#else
+    float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
+    float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
+    float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+#endif
+}
+
+// Duplicate even-indexed single-precision (32-bit) floating-point elements
+// from a, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps
+FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
+{
+#if __has_builtin(__builtin_shufflevector)
+    return vreinterpretq_m128_f32(__builtin_shufflevector(
+        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
+#else
+    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
+    float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+#endif
+}
+
+// Moves the upper two values of B into the lower two values of A.
+//
+//   r3 := a3
+//   r2 := a2
+//   r1 := b3
+//   r0 := b2
+FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
+{
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
+    return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
+}
+
+// Moves the lower two values of B into the upper two values of A.
+//
+//   r3 := b1
+//   r2 := b0
+//   r1 := a1
+//   r0 := a0
+FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
+}
+
+// Compute the absolute value of packed signed 32-bit integers in a, and store
+// the unsigned results in dst.
+//
+//   FOR j := 0 to 3
+//     i := j*32
+//     dst[i+31:i] := ABS(a[i+31:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32
+FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
+{
+    return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
+}
+
+// Compute the absolute value of packed signed 16-bit integers in a, and store
+// the unsigned results in dst.
+//
+//   FOR j := 0 to 7
+//     i := j*16
+//     dst[i+15:i] := ABS(a[i+15:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16
+FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
+{
+    return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
+}
+
+// Compute the absolute value of packed signed 8-bit integers in a, and store
+// the unsigned results in dst.
+//
+//   FOR j := 0 to 15
+//     i := j*8
+//     dst[i+7:i] := ABS(a[i+7:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8
+FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
+{
+    return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
+}
+
+// Compute the absolute value of packed signed 32-bit integers in a, and store
+// the unsigned results in dst.
+//
+//   FOR j := 0 to 1
+//     i := j*32
+//     dst[i+31:i] := ABS(a[i+31:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32
+FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
+{
+    return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
+}
+
+// Compute the absolute value of packed signed 16-bit integers in a, and store
+// the unsigned results in dst.
+//
+//   FOR j := 0 to 3
+//     i := j*16
+//     dst[i+15:i] := ABS(a[i+15:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16
+FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
+{
+    return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
+}
+
+// Compute the absolute value of packed signed 8-bit integers in a, and store
+// the unsigned results in dst.
+//
+//   FOR j := 0 to 7
+//     i := j*8
+//     dst[i+7:i] := ABS(a[i+7:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8
+FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
+{
+    return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
+}
+
+// Takes the upper 64 bits of a and places it in the low end of the result
+// Takes the lower 64 bits of b and places it into the high end of the result.
+FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
+{
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
+}
+
+// takes the lower two 32-bit values from a and swaps them and places in high
+// end of result takes the higher two 32 bit values from b and swaps them and
+// places in low end of result.
+FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
+{
+    float32x2_t a21 = vget_high_f32(
+        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
+    float32x2_t b03 = vget_low_f32(
+        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
+    return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
+{
+    float32x2_t a03 = vget_low_f32(
+        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
+    float32x2_t b21 = vget_high_f32(
+        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
+    return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
+}
+
+// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the
+// high
+FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
+{
+    float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
+{
+    float32x2_t a22 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
+{
+    float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
+    float32x2_t b22 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
+{
+    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    float32x2_t a22 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
+    float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
+{
+    float32x2_t a33 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
+    float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32_t b2 = vgetq_lane_f32(b, 2);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
+{
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32_t b2 = vgetq_lane_f32(b, 2);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
+}
+
+// NEON does not support a general purpose permute intrinsic
+// Selects four specific single-precision, floating-point values from a and b,
+// based on the mask i.
+//
+// C equivalent:
+//   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
+//                                 __constrange(0, 255) int imm) {
+//       __m128 ret;
+//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
+//       ret[2] = b[(imm >> 4) & 0x03];  ret[3] = b[(imm >> 6) & 0x03];
+//       return ret;
+//   }
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
+#define _mm_shuffle_ps_default(a, b, imm)                                  \
+    __extension__({                                                        \
+        float32x4_t ret;                                                   \
+        ret = vmovq_n_f32(                                                 \
+            vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3)));     \
+        ret = vsetq_lane_f32(                                              \
+            vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
+            ret, 1);                                                       \
+        ret = vsetq_lane_f32(                                              \
+            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
+            ret, 2);                                                       \
+        ret = vsetq_lane_f32(                                              \
+            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
+            ret, 3);                                                       \
+        vreinterpretq_m128_f32(ret);                                       \
+    })
+
+// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
+// int imm)
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shuffle_ps(a, b, imm)                                \
+    __extension__({                                              \
+        float32x4_t _input1 = vreinterpretq_f32_m128(a);         \
+        float32x4_t _input2 = vreinterpretq_f32_m128(b);         \
+        float32x4_t _shuf = __builtin_shufflevector(             \
+            _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
+            (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
+        vreinterpretq_m128_f32(_shuf);                           \
+    })
+#else  // generic
+#define _mm_shuffle_ps(a, b, imm)                          \
+    __extension__({                                        \
+        __m128 ret;                                        \
+        switch (imm) {                                     \
+        case _MM_SHUFFLE(1, 0, 3, 2):                      \
+            ret = _mm_shuffle_ps_1032((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 3, 0, 1):                      \
+            ret = _mm_shuffle_ps_2301((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(0, 3, 2, 1):                      \
+            ret = _mm_shuffle_ps_0321((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 1, 0, 3):                      \
+            ret = _mm_shuffle_ps_2103((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(1, 0, 1, 0):                      \
+            ret = _mm_movelh_ps((a), (b));                 \
+            break;                                         \
+        case _MM_SHUFFLE(1, 0, 0, 1):                      \
+            ret = _mm_shuffle_ps_1001((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(0, 1, 0, 1):                      \
+            ret = _mm_shuffle_ps_0101((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(3, 2, 1, 0):                      \
+            ret = _mm_shuffle_ps_3210((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(0, 0, 1, 1):                      \
+            ret = _mm_shuffle_ps_0011((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(0, 0, 2, 2):                      \
+            ret = _mm_shuffle_ps_0022((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 2, 0, 0):                      \
+            ret = _mm_shuffle_ps_2200((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(3, 2, 0, 2):                      \
+            ret = _mm_shuffle_ps_3202((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(3, 2, 3, 2):                      \
+            ret = _mm_movehl_ps((b), (a));                 \
+            break;                                         \
+        case _MM_SHUFFLE(1, 1, 3, 3):                      \
+            ret = _mm_shuffle_ps_1133((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 0, 1, 0):                      \
+            ret = _mm_shuffle_ps_2010((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 0, 0, 1):                      \
+            ret = _mm_shuffle_ps_2001((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 0, 3, 2):                      \
+            ret = _mm_shuffle_ps_2032((a), (b));           \
+            break;                                         \
+        default:                                           \
+            ret = _mm_shuffle_ps_default((a), (b), (imm)); \
+            break;                                         \
+        }                                                  \
+        ret;                                               \
+    })
+#endif
+
+// Takes the upper 64 bits of a and places it in the low end of the result
+// Takes the lower 64 bits of a and places it into the high end of the result.
+FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
+{
+    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
+}
+
+// takes the lower two 32-bit values from a and swaps them and places in low end
+// of result takes the higher two 32 bit values from a and swaps them and places
+// in high end of result.
+FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
+}
+
+// rotates the least significant 32 bits into the most signficant 32 bits, and
+// shifts the rest down
+FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
+}
+
+// rotates the most significant 32 bits into the least signficant 32 bits, and
+// shifts the rest up
+FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
+}
+
+// gets the lower 64 bits of a, and places it in the upper 64 bits
+// gets the lower 64 bits of a and places it in the lower 64 bits
+FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
+{
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
+}
+
+// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
+// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits
+FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
+}
+
+// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
+// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and
+// places it in the lower 64 bits
+FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
+{
+    int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
+    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
+    return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
+{
+    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
+{
+    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
+    return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
+}
+
+// Shuffle packed 8-bit integers in a according to shuffle control mask in the
+// corresponding 8-bit element of b, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8
+FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
+{
+    int8x16_t tbl = vreinterpretq_s8_m128i(a);   // input a
+    uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
+    uint8x16_t idx_masked =
+        vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
+#elif defined(__GNUC__)
+    int8x16_t ret;
+    // %e and %f represent the even and odd D registers
+    // respectively.
+    __asm__ __volatile__(
+        "vtbl.8  %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
+        "vtbl.8  %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
+        : [ret] "=&w"(ret)
+        : [tbl] "w"(tbl), [idx] "w"(idx_masked));
+    return vreinterpretq_m128i_s8(ret);
+#else
+    // use this line if testing on aarch64
+    int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
+    return vreinterpretq_m128i_s8(
+        vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
+                    vtbl2_s8(a_split, vget_high_u8(idx_masked))));
+#endif
+}
+
+// C equivalent:
+//   __m128i _mm_shuffle_epi32_default(__m128i a,
+//                                     __constrange(0, 255) int imm) {
+//       __m128i ret;
+//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
+//       ret[2] = a[(imm >> 4) & 0x03];  ret[3] = a[(imm >> 6) & 0x03];
+//       return ret;
+//   }
+#define _mm_shuffle_epi32_default(a, imm)                                   \
+    __extension__({                                                         \
+        int32x4_t ret;                                                      \
+        ret = vmovq_n_s32(                                                  \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3)));     \
+        ret = vsetq_lane_s32(                                               \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
+            ret, 1);                                                        \
+        ret = vsetq_lane_s32(                                               \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
+            ret, 2);                                                        \
+        ret = vsetq_lane_s32(                                               \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
+            ret, 3);                                                        \
+        vreinterpretq_m128i_s32(ret);                                       \
+    })
+
+// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
+// int imm)
+#if defined(__aarch64__)
+#define _mm_shuffle_epi32_splat(a, imm)                          \
+    __extension__({                                              \
+        vreinterpretq_m128i_s32(                                 \
+            vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
+    })
+#else
+#define _mm_shuffle_epi32_splat(a, imm)                                      \
+    __extension__({                                                          \
+        vreinterpretq_m128i_s32(                                             \
+            vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
+    })
+#endif
+
+// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
+// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
+// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
+//                                        __constrange(0,255) int imm)
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shuffle_epi32(a, imm)                              \
+    __extension__({                                            \
+        int32x4_t _input = vreinterpretq_s32_m128i(a);         \
+        int32x4_t _shuf = __builtin_shufflevector(             \
+            _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
+            ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3);           \
+        vreinterpretq_m128i_s32(_shuf);                        \
+    })
+#else  // generic
+#define _mm_shuffle_epi32(a, imm)                        \
+    __extension__({                                      \
+        __m128i ret;                                     \
+        switch (imm) {                                   \
+        case _MM_SHUFFLE(1, 0, 3, 2):                    \
+            ret = _mm_shuffle_epi_1032((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(2, 3, 0, 1):                    \
+            ret = _mm_shuffle_epi_2301((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 3, 2, 1):                    \
+            ret = _mm_shuffle_epi_0321((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(2, 1, 0, 3):                    \
+            ret = _mm_shuffle_epi_2103((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(1, 0, 1, 0):                    \
+            ret = _mm_shuffle_epi_1010((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(1, 0, 0, 1):                    \
+            ret = _mm_shuffle_epi_1001((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 1, 0, 1):                    \
+            ret = _mm_shuffle_epi_0101((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(2, 2, 1, 1):                    \
+            ret = _mm_shuffle_epi_2211((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 1, 2, 2):                    \
+            ret = _mm_shuffle_epi_0122((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(3, 3, 3, 2):                    \
+            ret = _mm_shuffle_epi_3332((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 0, 0, 0):                    \
+            ret = _mm_shuffle_epi32_splat((a), 0);       \
+            break;                                       \
+        case _MM_SHUFFLE(1, 1, 1, 1):                    \
+            ret = _mm_shuffle_epi32_splat((a), 1);       \
+            break;                                       \
+        case _MM_SHUFFLE(2, 2, 2, 2):                    \
+            ret = _mm_shuffle_epi32_splat((a), 2);       \
+            break;                                       \
+        case _MM_SHUFFLE(3, 3, 3, 3):                    \
+            ret = _mm_shuffle_epi32_splat((a), 3);       \
+            break;                                       \
+        default:                                         \
+            ret = _mm_shuffle_epi32_default((a), (imm)); \
+            break;                                       \
+        }                                                \
+        ret;                                             \
+    })
+#endif
+
+// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
+// by imm.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
+// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
+//                                                   __constrange(0,255) int
+//                                                   imm)
+#define _mm_shufflelo_epi16_function(a, imm)                                  \
+    __extension__({                                                           \
+        int16x8_t ret = vreinterpretq_s16_m128i(a);                           \
+        int16x4_t lowBits = vget_low_s16(ret);                                \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0);  \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
+                             1);                                              \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
+                             2);                                              \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
+                             3);                                              \
+        vreinterpretq_m128i_s16(ret);                                         \
+    })
+
+// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
+//                                          __constrange(0,255) int imm)
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shufflelo_epi16(a, imm)                                  \
+    __extension__({                                                  \
+        int16x8_t _input = vreinterpretq_s16_m128i(a);               \
+        int16x8_t _shuf = __builtin_shufflevector(                   \
+            _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3),   \
+            (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
+        vreinterpretq_m128i_s16(_shuf);                              \
+    })
+#else  // generic
+#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
+#endif
+
+// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
+// by imm.
+// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
+// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
+//                                                   __constrange(0,255) int
+//                                                   imm)
+#define _mm_shufflehi_epi16_function(a, imm)                                   \
+    __extension__({                                                            \
+        int16x8_t ret = vreinterpretq_s16_m128i(a);                            \
+        int16x4_t highBits = vget_high_s16(ret);                               \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4);  \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
+                             5);                                               \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
+                             6);                                               \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
+                             7);                                               \
+        vreinterpretq_m128i_s16(ret);                                          \
+    })
+
+// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
+//                                          __constrange(0,255) int imm)
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shufflehi_epi16(a, imm)                             \
+    __extension__({                                             \
+        int16x8_t _input = vreinterpretq_s16_m128i(a);          \
+        int16x8_t _shuf = __builtin_shufflevector(              \
+            _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4,    \
+            (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
+            (((imm) >> 6) & 0x3) + 4);                          \
+        vreinterpretq_m128i_s16(_shuf);                         \
+    })
+#else  // generic
+#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
+#endif
+
+// Blend packed 16-bit integers from a and b using control mask imm8, and store
+// the results in dst.
+//
+//   FOR j := 0 to 7
+//       i := j*16
+//       IF imm8[j]
+//           dst[i+15:i] := b[i+15:i]
+//       ELSE
+//           dst[i+15:i] := a[i+15:i]
+//       FI
+//   ENDFOR
+// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
+//                                      __constrange(0,255) int imm)
+#define _mm_blend_epi16(a, b, imm)                                        \
+    __extension__({                                                       \
+        const uint16_t _mask[8] = {((imm) & (1 << 0)) ? 0xFFFF : 0x0000,  \
+                                   ((imm) & (1 << 1)) ? 0xFFFF : 0x0000,  \
+                                   ((imm) & (1 << 2)) ? 0xFFFF : 0x0000,  \
+                                   ((imm) & (1 << 3)) ? 0xFFFF : 0x0000,  \
+                                   ((imm) & (1 << 4)) ? 0xFFFF : 0x0000,  \
+                                   ((imm) & (1 << 5)) ? 0xFFFF : 0x0000,  \
+                                   ((imm) & (1 << 6)) ? 0xFFFF : 0x0000,  \
+                                   ((imm) & (1 << 7)) ? 0xFFFF : 0x0000}; \
+        uint16x8_t _mask_vec = vld1q_u16(_mask);                          \
+        uint16x8_t _a = vreinterpretq_u16_m128i(a);                       \
+        uint16x8_t _b = vreinterpretq_u16_m128i(b);                       \
+        vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a));            \
+    })
+
+// Blend packed 8-bit integers from a and b using mask, and store the results in
+// dst.
+//
+//   FOR j := 0 to 15
+//       i := j*8
+//       IF mask[i+7]
+//           dst[i+7:i] := b[i+7:i]
+//       ELSE
+//           dst[i+7:i] := a[i+7:i]
+//       FI
+//   ENDFOR
+FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
+{
+    // Use a signed shift right to create a mask with the sign bit
+    uint8x16_t mask =
+        vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
+    uint8x16_t a = vreinterpretq_u8_m128i(_a);
+    uint8x16_t b = vreinterpretq_u8_m128i(_b);
+    return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
+}
+
+/* Shifts */
+
+
+// Shift packed 16-bit integers in a right by imm while shifting in sign
+// bits, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16
+FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
+{
+    const int count = (imm & ~15) ? 15 : imm;
+    return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
+}
+
+// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
+// shifting in zeros.
+//
+//   r0 := a0 << count
+//   r1 := a1 << count
+//   ...
+//   r7 := a7 << count
+//
+// https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx
+#define _mm_slli_epi16(a, imm)                                   \
+    __extension__({                                              \
+        __m128i ret;                                             \
+        if ((imm) <= 0) {                                        \
+            ret = a;                                             \
+        } else if ((imm) > 15) {                                 \
+            ret = _mm_setzero_si128();                           \
+        } else {                                                 \
+            ret = vreinterpretq_m128i_s16(                       \
+                vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \
+        }                                                        \
+        ret;                                                     \
+    })
+
+// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
+// shifting in zeros. :
+// https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
+// FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm)
+FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
+{
+    if (imm <= 0) /* TODO: add constant range macro: [0, 255] */
+        return a;
+    if (imm > 31) /* TODO: add unlikely macro */
+        return _mm_setzero_si128();
+    return vreinterpretq_m128i_s32(
+        vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
+}
+
+// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
+// store the results in dst.
+FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
+{
+    if (imm <= 0) /* TODO: add constant range macro: [0, 255] */
+        return a;
+    if (imm > 63) /* TODO: add unlikely macro */
+        return _mm_setzero_si128();
+    return vreinterpretq_m128i_s64(
+        vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
+}
+
+// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
+// store the results in dst.
+//
+//   FOR j := 0 to 7
+//     i := j*16
+//     IF imm8[7:0] > 15
+//       dst[i+15:i] := 0
+//     ELSE
+//       dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])
+//     FI
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16
+#define _mm_srli_epi16(a, imm)                                             \
+    __extension__({                                                        \
+        __m128i ret;                                                       \
+        if ((imm) == 0) {                                                  \
+            ret = a;                                                       \
+        } else if (0 < (imm) && (imm) < 16) {                              \
+            ret = vreinterpretq_m128i_u16(                                 \
+                vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-imm))); \
+        } else {                                                           \
+            ret = _mm_setzero_si128();                                     \
+        }                                                                  \
+        ret;                                                               \
+    })
+
+// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
+// store the results in dst.
+//
+//   FOR j := 0 to 3
+//     i := j*32
+//     IF imm8[7:0] > 31
+//       dst[i+31:i] := 0
+//     ELSE
+//       dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
+//     FI
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
+// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
+#define _mm_srli_epi32(a, imm)                                             \
+    __extension__({                                                        \
+        __m128i ret;                                                       \
+        if ((imm) == 0) {                                                  \
+            ret = a;                                                       \
+        } else if (0 < (imm) && (imm) < 32) {                              \
+            ret = vreinterpretq_m128i_u32(                                 \
+                vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-imm))); \
+        } else {                                                           \
+            ret = _mm_setzero_si128();                                     \
+        }                                                                  \
+        ret;                                                               \
+    })
+
+// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
+// store the results in dst.
+//
+//   FOR j := 0 to 1
+//     i := j*64
+//     IF imm8[7:0] > 63
+//       dst[i+63:i] := 0
+//     ELSE
+//       dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
+//     FI
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
+#define _mm_srli_epi64(a, imm)                                             \
+    __extension__({                                                        \
+        __m128i ret;                                                       \
+        if ((imm) == 0) {                                                  \
+            ret = a;                                                       \
+        } else if (0 < (imm) && (imm) < 64) {                              \
+            ret = vreinterpretq_m128i_u64(                                 \
+                vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-imm))); \
+        } else {                                                           \
+            ret = _mm_setzero_si128();                                     \
+        }                                                                  \
+        ret;                                                               \
+    })
+
+// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
+// and store the results in dst.
+//
+//   FOR j := 0 to 3
+//     i := j*32
+//     IF imm8[7:0] > 31
+//       dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
+//     ELSE
+//       dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
+//     FI
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
+// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
+#define _mm_srai_epi32(a, imm)                                             \
+    __extension__({                                                        \
+        __m128i ret;                                                       \
+        if ((imm) == 0) {                                                  \
+            ret = a;                                                       \
+        } else if (0 < (imm) && (imm) < 32) {                              \
+            ret = vreinterpretq_m128i_s32(                                 \
+                vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm))); \
+        } else {                                                           \
+            ret = vreinterpretq_m128i_s32(                                 \
+                vshrq_n_s32(vreinterpretq_s32_m128i(a), 31));              \
+        }                                                                  \
+        ret;                                                               \
+    })
+
+// Shifts the 128 - bit value in a right by imm bytes while shifting in
+// zeros.imm must be an immediate.
+//
+//   r := srl(a, imm*8)
+//
+// https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
+// FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
+#define _mm_srli_si128(a, imm)                                              \
+    __extension__({                                                         \
+        __m128i ret;                                                        \
+        if ((imm) <= 0) {                                                   \
+            ret = a;                                                        \
+        } else if ((imm) > 15) {                                            \
+            ret = _mm_setzero_si128();                                      \
+        } else {                                                            \
+            ret = vreinterpretq_m128i_s8(                                   \
+                vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \
+        }                                                                   \
+        ret;                                                                \
+    })
+
+// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm
+// must be an immediate.
+//
+//   r := a << (imm * 8)
+//
+// https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
+// FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm)
+#define _mm_slli_si128(a, imm)                                          \
+    __extension__({                                                     \
+        __m128i ret;                                                    \
+        if ((imm) <= 0) {                                               \
+            ret = a;                                                    \
+        } else if ((imm) > 15) {                                        \
+            ret = _mm_setzero_si128();                                  \
+        } else {                                                        \
+            ret = vreinterpretq_m128i_s8(vextq_s8(                      \
+                vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \
+        }                                                               \
+        ret;                                                            \
+    })
+
+// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
+// shifting in zeros.
+//
+//   r0 := a0 << count
+//   r1 := a1 << count
+//   ...
+//   r7 := a7 << count
+//
+// https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (c > 15)
+        return _mm_setzero_si128();
+
+    int16x8_t vc = vdupq_n_s16((int16_t) c);
+    return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
+}
+
+// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
+// shifting in zeros.
+//
+// r0 := a0 << count
+// r1 := a1 << count
+// r2 := a2 << count
+// r3 := a3 << count
+//
+// https://msdn.microsoft.com/en-us/library/6fe5a6s9(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (c > 31)
+        return _mm_setzero_si128();
+
+    int32x4_t vc = vdupq_n_s32((int32_t) c);
+    return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
+}
+
+// Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while
+// shifting in zeros.
+//
+// r0 := a0 << count
+// r1 := a1 << count
+//
+// https://msdn.microsoft.com/en-us/library/6ta9dffd(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (c > 63)
+        return _mm_setzero_si128();
+
+    int64x2_t vc = vdupq_n_s64((int64_t) c);
+    return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
+}
+
+// Shifts the 8 signed or unsigned 16-bit integers in a right by count bits
+// while shifting in zeros.
+//
+// r0 := srl(a0, count)
+// r1 := srl(a1, count)
+// ...
+// r7 := srl(a7, count)
+//
+// https://msdn.microsoft.com/en-us/library/wd5ax830(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (c > 15)
+        return _mm_setzero_si128();
+
+    int16x8_t vc = vdupq_n_s16(-(int16_t) c);
+    return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
+}
+
+// Shifts the 4 signed or unsigned 32-bit integers in a right by count bits
+// while shifting in zeros.
+//
+// r0 := srl(a0, count)
+// r1 := srl(a1, count)
+// r2 := srl(a2, count)
+// r3 := srl(a3, count)
+//
+// https://msdn.microsoft.com/en-us/library/a9cbttf4(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (c > 31)
+        return _mm_setzero_si128();
+
+    int32x4_t vc = vdupq_n_s32(-(int32_t) c);
+    return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
+}
+
+// Shifts the 2 signed or unsigned 64-bit integers in a right by count bits
+// while shifting in zeros.
+//
+// r0 := srl(a0, count)
+// r1 := srl(a1, count)
+//
+// https://msdn.microsoft.com/en-us/library/yf6cf9k8(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (c > 63)
+        return _mm_setzero_si128();
+
+    int64x2_t vc = vdupq_n_s64(-(int64_t) c);
+    return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
+}
+
+// NEON does not provide a version of this function.
+// Creates a 16-bit mask from the most significant bits of the 16 signed or
+// unsigned 8-bit integers in a and zero extends the upper bits.
+// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
+FORCE_INLINE int _mm_movemask_epi8(__m128i a)
+{
+#if defined(__aarch64__)
+    uint8x16_t input = vreinterpretq_u8_m128i(a);
+    const int8_t ALIGN_STRUCT(16)
+        xr[16] = {-7, -6, -5, -4, -3, -2, -1, 0, -7, -6, -5, -4, -3, -2, -1, 0};
+    const uint8x16_t mask_and = vdupq_n_u8(0x80);
+    const int8x16_t mask_shift = vld1q_s8(xr);
+    const uint8x16_t mask_result =
+        vshlq_u8(vandq_u8(input, mask_and), mask_shift);
+    uint8x8_t lo = vget_low_u8(mask_result);
+    uint8x8_t hi = vget_high_u8(mask_result);
+
+    return vaddv_u8(lo) + (vaddv_u8(hi) << 8);
+#else
+    // Use increasingly wide shifts+adds to collect the sign bits
+    // together.
+    // Since the widening shifts would be rather confusing to follow in little
+    // endian, everything will be illustrated in big endian order instead. This
+    // has a different result - the bits would actually be reversed on a big
+    // endian machine.
+
+    // Starting input (only half the elements are shown):
+    // 89 ff 1d c0 00 10 99 33
+    uint8x16_t input = vreinterpretq_u8_m128i(a);
+
+    // Shift out everything but the sign bits with an unsigned shift right.
+    //
+    // Bytes of the vector::
+    // 89 ff 1d c0 00 10 99 33
+    // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
+    //  |  |  |  |  |  |  |  |
+    // 01 01 00 01 00 00 01 00
+    //
+    // Bits of first important lane(s):
+    // 10001001 (89)
+    // \______
+    //        |
+    // 00000001 (01)
+    uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
+
+    // Merge the even lanes together with a 16-bit unsigned shift right + add.
+    // 'xx' represents garbage data which will be ignored in the final result.
+    // In the important bytes, the add functions like a binary OR.
+    //
+    // 01 01 00 01 00 00 01 00
+    //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
+    //    \|    \|    \|    \|
+    // xx 03 xx 01 xx 00 xx 02
+    //
+    // 00000001 00000001 (01 01)
+    //        \_______ |
+    //                \|
+    // xxxxxxxx xxxxxx11 (xx 03)
+    uint32x4_t paired16 =
+        vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
+
+    // Repeat with a wider 32-bit shift + add.
+    // xx 03 xx 01 xx 00 xx 02
+    //     \____ |     \____ |  paired32 = (uint64x1_t)(paired16 + (paired16 >>
+    //     14))
+    //          \|          \|
+    // xx xx xx 0d xx xx xx 02
+    //
+    // 00000011 00000001 (03 01)
+    //        \\_____ ||
+    //         '----.\||
+    // xxxxxxxx xxxx1101 (xx 0d)
+    uint64x2_t paired32 =
+        vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
+
+    // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
+    // lanes. xx xx xx 0d xx xx xx 02
+    //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >>
+    //            28))
+    //                      \|
+    // xx xx xx xx xx xx xx d2
+    //
+    // 00001101 00000010 (0d 02)
+    //     \   \___ |  |
+    //      '---.  \|  |
+    // xxxxxxxx 11010010 (xx d2)
+    uint8x16_t paired64 =
+        vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
+
+    // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
+    // xx xx xx xx xx xx xx d2
+    //                      ||  return paired64[0]
+    //                      d2
+    // Note: Little endian would return the correct value 4b (01001011) instead.
+    return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
+#endif
+}
+
+// Copy the lower 64-bit integer in a to dst.
+//
+//   dst[63:0] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
+FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
+{
+    return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
+}
+
+// Copy the 64-bit integer a to the lower element of dst, and zero the upper
+// element.
+//
+//   dst[63:0] := a[63:0]
+//   dst[127:64] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
+FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
+{
+    return vreinterpretq_m128i_s64(
+        vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
+}
+
+// NEON does not provide this method
+// Creates a 4-bit mask from the most significant bits of the four
+// single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
+FORCE_INLINE int _mm_movemask_ps(__m128 a)
+{
+    uint32x4_t input = vreinterpretq_u32_m128(a);
+#if defined(__aarch64__)
+    static const int32x4_t shift = {0, 1, 2, 3};
+    uint32x4_t tmp = vshrq_n_u32(input, 31);
+    return vaddvq_u32(vshlq_u32(tmp, shift));
+#else
+    // Uses the exact same method as _mm_movemask_epi8, see that for details.
+    // Shift out everything but the sign bits with a 32-bit unsigned shift
+    // right.
+    uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
+    // Merge the two pairs together with a 64-bit unsigned shift right + add.
+    uint8x16_t paired =
+        vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
+    // Extract the result.
+    return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
+#endif
+}
+
+// Compute the bitwise NOT of a and then AND with a 128-bit vector containing
+// all 1's, and return 1 if the result is zero, otherwise return 0.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones
+FORCE_INLINE int _mm_test_all_ones(__m128i a)
+{
+    return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
+           ~(uint64_t) 0;
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and
+// mask, and return 1 if the result is zero, otherwise return 0.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros
+FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
+{
+    int64x2_t a_and_mask =
+        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
+    return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)) ? 0
+                                                                           : 1;
+}
+
+/* Math operations */
+
+// Subtracts the four single-precision, floating-point values of a and b.
+//
+//   r0 := a0 - b0
+//   r1 := a1 - b1
+//   r2 := a2 - b2
+//   r3 := a3 - b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Subtract the lower single-precision (32-bit) floating-point element in b from
+// the lower single-precision (32-bit) floating-point element in a, store the
+// result in the lower element of dst, and copy the upper 3 packed elements from
+// a to the upper elements of dst.
+//
+//   dst[31:0] := a[31:0] - b[31:0]
+//   dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
+FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_sub_ps(a, b));
+}
+
+// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
+// and store the results in dst.
+//    r0 := a0 - b0
+//    r1 := a1 - b1
+FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s64(
+        vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+}
+
+// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
+// unsigned 32-bit integers of a.
+//
+//   r0 := a0 - b0
+//   r1 := a1 - b1
+//   r2 := a2 - b2
+//   r3 := a3 - b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
+//
+//   dst[63:0] := a[63:0] - b[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
+FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s64(
+        vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
+}
+
+// Subtracts the 8 unsigned 16-bit integers of bfrom the 8 unsigned 16-bit
+// integers of a and saturates..
+// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
+// integers of a and saturates.
+//
+//   r0 := UnsignedSaturate(a0 - b0)
+//   r1 := UnsignedSaturate(a1 - b1)
+//   ...
+//   r15 := UnsignedSaturate(a15 - b15)
+//
+// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
+FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
+// of a and saturates.
+//
+//   r0 := SignedSaturate(a0 - b0)
+//   r1 := SignedSaturate(a1 - b1)
+//   ...
+//   r15 := SignedSaturate(a15 - b15)
+//
+// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
+FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
+// of a and saturates.
+//
+//   r0 := SignedSaturate(a0 - b0)
+//   r1 := SignedSaturate(a1 - b1)
+//   ...
+//   r7 := SignedSaturate(a7 - b7)
+//
+// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
+FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Negate packed 8-bit integers in a when the corresponding signed
+// 8-bit integer in b is negative, and store the results in dst.
+// Element in dst are zeroed out when the corresponding element
+// in b is zero.
+//
+//   for i in 0..15
+//     if b[i] < 0
+//       r[i] := -a[i]
+//     else if b[i] == 0
+//       r[i] := 0
+//     else
+//       r[i] := a[i]
+//     fi
+//   done
+FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
+{
+    int8x16_t a = vreinterpretq_s8_m128i(_a);
+    int8x16_t b = vreinterpretq_s8_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFF : 0
+    uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
+
+    // (b == 0) ? 0xFF : 0
+#if defined(__aarch64__)
+    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
+#else
+    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
+#endif
+
+    // bitwise select either a or nagative 'a' (vnegq_s8(a) return nagative 'a')
+    // based on ltMask
+    int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
+    // res = masked & (~zeroMask)
+    int8x16_t res = vbicq_s8(masked, zeroMask);
+
+    return vreinterpretq_m128i_s8(res);
+}
+
+// Negate packed 16-bit integers in a when the corresponding signed
+// 16-bit integer in b is negative, and store the results in dst.
+// Element in dst are zeroed out when the corresponding element
+// in b is zero.
+//
+//   for i in 0..7
+//     if b[i] < 0
+//       r[i] := -a[i]
+//     else if b[i] == 0
+//       r[i] := 0
+//     else
+//       r[i] := a[i]
+//     fi
+//   done
+FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
+{
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFF : 0
+    uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
+    // (b == 0) ? 0xFFFF : 0
+#if defined(__aarch64__)
+    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
+#else
+    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative
+    // 'a') based on ltMask
+    int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
+    // res = masked & (~zeroMask)
+    int16x8_t res = vbicq_s16(masked, zeroMask);
+    return vreinterpretq_m128i_s16(res);
+}
+
+// Negate packed 32-bit integers in a when the corresponding signed
+// 32-bit integer in b is negative, and store the results in dst.
+// Element in dst are zeroed out when the corresponding element
+// in b is zero.
+//
+//   for i in 0..3
+//     if b[i] < 0
+//       r[i] := -a[i]
+//     else if b[i] == 0
+//       r[i] := 0
+//     else
+//       r[i] := a[i]
+//     fi
+//   done
+FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFFFFFF : 0
+    uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
+
+    // (b == 0) ? 0xFFFFFFFF : 0
+#if defined(__aarch64__)
+    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
+#else
+    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative
+    // 'a') based on ltMask
+    int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
+    // res = masked & (~zeroMask)
+    int32x4_t res = vbicq_s32(masked, zeroMask);
+    return vreinterpretq_m128i_s32(res);
+}
+
+// Negate packed 16-bit integers in a when the corresponding signed 16-bit
+// integer in b is negative, and store the results in dst. Element in dst are
+// zeroed out when the corresponding element in b is zero.
+//
+//   FOR j := 0 to 3
+//      i := j*16
+//      IF b[i+15:i] < 0
+//        dst[i+15:i] := -(a[i+15:i])
+//      ELSE IF b[i+15:i] == 0
+//        dst[i+15:i] := 0
+//      ELSE
+//        dst[i+15:i] := a[i+15:i]
+//      FI
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
+FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
+{
+    int16x4_t a = vreinterpret_s16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFF : 0
+    uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
+
+    // (b == 0) ? 0xFFFF : 0
+#if defined(__aarch64__)
+    int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
+#else
+    int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
+#endif
+
+    // bitwise select either a or nagative 'a' (vneg_s16(a) return nagative 'a')
+    // based on ltMask
+    int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
+    // res = masked & (~zeroMask)
+    int16x4_t res = vbic_s16(masked, zeroMask);
+
+    return vreinterpret_m64_s16(res);
+}
+
+// Negate packed 32-bit integers in a when the corresponding signed 32-bit
+// integer in b is negative, and store the results in dst. Element in dst are
+// zeroed out when the corresponding element in b is zero.
+//
+//   FOR j := 0 to 1
+//      i := j*32
+//      IF b[i+31:i] < 0
+//        dst[i+31:i] := -(a[i+31:i])
+//      ELSE IF b[i+31:i] == 0
+//        dst[i+31:i] := 0
+//      ELSE
+//        dst[i+31:i] := a[i+31:i]
+//      FI
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
+FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
+{
+    int32x2_t a = vreinterpret_s32_m64(_a);
+    int32x2_t b = vreinterpret_s32_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFFFFFF : 0
+    uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
+
+    // (b == 0) ? 0xFFFFFFFF : 0
+#if defined(__aarch64__)
+    int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
+#else
+    int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
+#endif
+
+    // bitwise select either a or nagative 'a' (vneg_s32(a) return nagative 'a')
+    // based on ltMask
+    int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
+    // res = masked & (~zeroMask)
+    int32x2_t res = vbic_s32(masked, zeroMask);
+
+    return vreinterpret_m64_s32(res);
+}
+
+// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
+// in b is negative, and store the results in dst. Element in dst are zeroed out
+// when the corresponding element in b is zero.
+//
+//   FOR j := 0 to 7
+//      i := j*8
+//      IF b[i+7:i] < 0
+//        dst[i+7:i] := -(a[i+7:i])
+//      ELSE IF b[i+7:i] == 0
+//        dst[i+7:i] := 0
+//      ELSE
+//        dst[i+7:i] := a[i+7:i]
+//      FI
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
+FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
+{
+    int8x8_t a = vreinterpret_s8_m64(_a);
+    int8x8_t b = vreinterpret_s8_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFF : 0
+    uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
+
+    // (b == 0) ? 0xFF : 0
+#if defined(__aarch64__)
+    int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
+#else
+    int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
+#endif
+
+    // bitwise select either a or nagative 'a' (vneg_s8(a) return nagative 'a')
+    // based on ltMask
+    int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
+    // res = masked & (~zeroMask)
+    int8x8_t res = vbic_s8(masked, zeroMask);
+
+    return vreinterpret_m64_s8(res);
+}
+
+// Average packed unsigned 16-bit integers in a and b, and store the results in
+// dst.
+//
+//   FOR j := 0 to 3
+//     i := j*16
+//     dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
+FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u16(
+        vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
+}
+
+// Average packed unsigned 8-bit integers in a and b, and store the results in
+// dst.
+//
+//   FOR j := 0 to 7
+//     i := j*8
+//     dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
+FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u8(
+        vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Average packed unsigned 8-bit integers in a and b, and store the results in
+// dst.
+//
+//   FOR j := 0 to 7
+//     i := j*8
+//     dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
+#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
+
+// Average packed unsigned 16-bit integers in a and b, and store the results in
+// dst.
+//
+//   FOR j := 0 to 3
+//     i := j*16
+//     dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
+#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
+
+// Computes the average of the 16 unsigned 8-bit integers in a and the 16
+// unsigned 8-bit integers in b and rounds.
+//
+//   r0 := (a0 + b0) / 2
+//   r1 := (a1 + b1) / 2
+//   ...
+//   r15 := (a15 + b15) / 2
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Computes the average of the 8 unsigned 16-bit integers in a and the 8
+// unsigned 16-bit integers in b and rounds.
+//
+//   r0 := (a0 + b0) / 2
+//   r1 := (a1 + b1) / 2
+//   ...
+//   r7 := (a7 + b7) / 2
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
+{
+    return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
+                                 vreinterpretq_u16_m128i(b));
+}
+
+// Adds the four single-precision, floating-point values of a and b.
+//
+//   r0 := a0 + b0
+//   r1 := a1 + b1
+//   r2 := a2 + b2
+//   r3 := a3 + b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Add packed double-precision (64-bit) floating-point elements in a and b, and
+// store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
+FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] + db[0];
+    c[1] = da[1] + db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Add 64-bit integers a and b, and store the result in dst.
+//
+//   dst[63:0] := a[63:0] + b[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
+FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s64(
+        vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
+}
+
+// adds the scalar single-precision floating point values of a and b.
+// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
+{
+    float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+    float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
+    // the upper values in the result must be the remnants of <a>.
+    return vreinterpretq_m128_f32(vaddq_f32(a, value));
+}
+
+// Adds the 4 signed or unsigned 64-bit integers in a to the 4 signed or
+// unsigned 32-bit integers in b.
+// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s64(
+        vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+}
+
+// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
+// unsigned 32-bit integers in b.
+//
+//   r0 := a0 + b0
+//   r1 := a1 + b1
+//   r2 := a2 + b2
+//   r3 := a3 + b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
+// unsigned 16-bit integers in b.
+// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
+// unsigned 8-bit integers in b.
+// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
+FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
+// and saturates.
+//
+//   r0 := SignedSaturate(a0 + b0)
+//   r1 := SignedSaturate(a1 + b1)
+//   ...
+//   r7 := SignedSaturate(a7 + b7)
+//
+// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Add packed signed 8-bit integers in a and b using saturation, and store the
+// results in dst.
+//
+//   FOR j := 0 to 15
+//     i := j*8
+//     dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
+FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
+// b and saturates..
+// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
+// unsigned 16-bit integers from b.
+//
+//   r0 := (a0 * b0)[15:0]
+//   r1 := (a1 * b1)[15:0]
+//   ...
+//   r7 := (a7 * b7)[15:0]
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
+// unsigned 32-bit integers from b.
+// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+//
+//   FOR j := 0 to 3
+//      i := j*16
+//      tmp[31:0] := a[i+15:i] * b[i+15:i]
+//      dst[i+15:i] := tmp[31:16]
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
+#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
+
+// Multiplies the four single-precision, floating-point values of a and b.
+//
+//   r0 := a0 * b0
+//   r1 := a1 * b1
+//   r2 := a2 * b2
+//   r3 := a3 * b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Multiply the lower single-precision (32-bit) floating-point element in a and
+// b, store the result in the lower element of dst, and copy the upper 3 packed
+// elements from a to the upper elements of dst.
+//
+//   dst[31:0] := a[31:0] * b[31:0]
+//   dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
+FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_mul_ps(a, b));
+}
+
+// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
+// a and b, and store the unsigned 64-bit results in dst.
+//
+//   r0 :=  (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
+//   r1 :=  (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
+FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
+{
+    // vmull_u32 upcasts instead of masking, so we downcast.
+    uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
+    uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
+    return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
+}
+
+// Multiply the low unsigned 32-bit integers from a and b, and store the
+// unsigned 64-bit result in dst.
+//
+//   dst[63:0] := a[31:0] * b[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
+FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u64(vget_low_u64(
+        vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
+}
+
+// Multiply the low signed 32-bit integers from each packed 64-bit element in
+// a and b, and store the signed 64-bit results in dst.
+//
+//   r0 :=  (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
+//   r1 :=  (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
+FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
+{
+    // vmull_s32 upcasts instead of masking, so we downcast.
+    int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
+    int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
+    return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
+}
+
+// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
+// integers from b.
+//
+//   r0 := (a0 * b0) + (a1 * b1)
+//   r1 := (a2 * b2) + (a3 * b3)
+//   r2 := (a4 * b4) + (a5 * b5)
+//   r3 := (a6 * b6) + (a7 * b7)
+// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
+{
+    int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
+                              vget_low_s16(vreinterpretq_s16_m128i(b)));
+    int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
+                               vget_high_s16(vreinterpretq_s16_m128i(b)));
+
+    int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
+    int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
+
+    return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
+}
+
+// Multiply packed signed 16-bit integers in a and b, producing intermediate
+// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
+// the packed 16-bit integers in dst.
+//
+//   r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
+//   r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
+//   r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
+//   ...
+//   r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
+FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
+{
+    // Has issues due to saturation
+    // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
+
+    // Multiply
+    int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
+                                 vget_low_s16(vreinterpretq_s16_m128i(b)));
+    int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
+                                 vget_high_s16(vreinterpretq_s16_m128i(b)));
+
+    // Rounding narrowing shift right
+    // narrow = (int16_t)((mul + 16384) >> 15);
+    int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
+    int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
+
+    // Join together
+    return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
+}
+
+// Vertically multiply each unsigned 8-bit integer from a with the corresponding
+// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
+// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
+// and pack the saturated results in dst.
+//
+//   FOR j := 0 to 7
+//      i := j*16
+//      dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
+//      a[i+7:i]*b[i+7:i] )
+//   ENDFOR
+FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
+{
+#if defined(__aarch64__)
+    uint8x16_t a = vreinterpretq_u8_m128i(_a);
+    int8x16_t b = vreinterpretq_s8_m128i(_b);
+    int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
+                             vmovl_s8(vget_low_s8(b)));
+    int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
+                             vmovl_s8(vget_high_s8(b)));
+    return vreinterpretq_m128i_s16(
+        vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
+#else
+    // This would be much simpler if x86 would choose to zero extend OR sign
+    // extend, not both. This could probably be optimized better.
+    uint16x8_t a = vreinterpretq_u16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+
+    // Zero extend a
+    int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
+    int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
+
+    // Sign extend by shifting left then shifting right.
+    int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
+    int16x8_t b_odd = vshrq_n_s16(b, 8);
+
+    // multiply
+    int16x8_t prod1 = vmulq_s16(a_even, b_even);
+    int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
+
+    // saturated add
+    return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
+#endif
+}
+
+// Computes the fused multiple add product of 32-bit floating point numbers.
+//
+// Return Value
+// Multiplies A and B, and adds C to the temporary result before returning it.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd
+FORCE_INLINE __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(c),
+                                            vreinterpretq_f32_m128(b),
+                                            vreinterpretq_f32_m128(a)));
+#else
+    return _mm_add_ps(_mm_mul_ps(a, b), c);
+#endif
+}
+
+// Alternatively add and subtract packed single-precision (32-bit)
+// floating-point elements in a to/from packed elements in b, and store the
+// results in dst.
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps
+FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
+{
+    __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f};
+    return _mm_fmadd_ps(b, mask, a);
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce two
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of 64-bit elements in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8
+FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
+{
+    uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
+    uint16_t r0 = t[0] + t[1] + t[2] + t[3];
+    uint16_t r4 = t[4] + t[5] + t[6] + t[7];
+    uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0);
+    return (__m128i) vsetq_lane_u16(r4, r, 4);
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce four
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
+FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
+{
+    uint16x4_t t =
+        vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+    uint16_t r0 = t[0] + t[1] + t[2] + t[3];
+    return vreinterpret_m64_u16(vset_lane_u16(r0, vdup_n_u16(0), 0));
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce four
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of dst.
+//
+//   FOR j := 0 to 7
+//      i := j*8
+//      tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
+//   ENDFOR
+//   dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] +
+//   tmp[47:40] + tmp[55:48] + tmp[63:56] dst[63:16] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_psadbw
+#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
+
+// Divides the four single-precision, floating-point values of a and b.
+//
+//   r0 := a0 / b0
+//   r1 := a1 / b1
+//   r2 := a2 / b2
+//   r3 := a3 / b3
+//
+// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x4_t recip0 = vrecpeq_f32(vreinterpretq_f32_m128(b));
+    float32x4_t recip1 =
+        vmulq_f32(recip0, vrecpsq_f32(recip0, vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip1));
+#endif
+}
+
+// Divides the scalar single-precision floating point value of a by b.
+// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
+{
+    float32_t value =
+        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+}
+
+// Computes the approximations of reciprocals of the four single-precision,
+// floating-point values of a.
+// https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
+{
+    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
+    return vreinterpretq_m128_f32(recip);
+}
+
+// Compute the approximate reciprocal of the lower single-precision (32-bit)
+// floating-point element in a, store the result in the lower element of dst,
+// and copy the upper 3 packed elements from a to the upper elements of dst. The
+// maximum relative error for this approximation is less than 1.5*2^-12.
+//
+//   dst[31:0] := (1.0 / a[31:0])
+//   dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
+FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
+{
+    return _mm_move_ss(a, _mm_rcp_ps(a));
+}
+
+// Computes the approximations of square roots of the four single-precision,
+// floating-point values of a. First computes reciprocal square roots and then
+// reciprocals of the four values.
+//
+//   r0 := sqrt(a0)
+//   r1 := sqrt(a1)
+//   r2 := sqrt(a2)
+//   r3 := sqrt(a3)
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
+#else
+    float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+    float32x4_t sq = vrecpeq_f32(recipsq);
+    // ??? use step versions of both sqrt and recip for better accuracy?
+    return vreinterpretq_m128_f32(sq);
+#endif
+}
+
+// Computes the approximation of the square root of the scalar single-precision
+// floating point value of in.
+// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
+{
+    float32_t value =
+        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
+}
+
+// Computes the approximations of the reciprocal square roots of the four
+// single-precision floating point values of in.
+// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
+{
+    return vreinterpretq_m128_f32(vrsqrteq_f32(vreinterpretq_f32_m128(in)));
+}
+
+// Compute the approximate reciprocal square root of the lower single-precision
+// (32-bit) floating-point element in a, store the result in the lower element
+// of dst, and copy the upper 3 packed elements from a to the upper elements of
+// dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
+FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
+{
+    return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
+}
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
+//
+//   FOR j := 0 to 3
+//      i := j*16
+//      dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
+FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s16(
+        vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+}
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
+//
+//   FOR j := 0 to 3
+//      i := j*16
+//      dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
+#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
+
+// Computes the maximums of the four single-precision, floating-point values of
+// a and b.
+// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
+{
+#if SSE2NEON_PRECISE_MINMAX
+    float32x4_t _a = vreinterpretq_f32_m128(a);
+    float32x4_t _b = vreinterpretq_f32_m128(b);
+    return vbslq_f32(vcltq_f32(_b, _a), _a, _b);
+#else
+    return vreinterpretq_m128_f32(
+        vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#endif
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
+// values in dst.
+//
+//   FOR j := 0 to 7
+//      i := j*8
+//      dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
+FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u8(
+        vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
+// values in dst.
+//
+//   FOR j := 0 to 7
+//      i := j*8
+//      dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
+#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
+
+// Compare packed signed 16-bit integers in a and b, and store packed minimum
+// values in dst.
+//
+//   FOR j := 0 to 3
+//      i := j*16
+//      dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
+FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s16(
+        vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+}
+
+// Compare packed signed 16-bit integers in a and b, and store packed minimum
+// values in dst.
+//
+//   FOR j := 0 to 3
+//      i := j*16
+//      dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
+#define _m_pminsw(a, b) _mm_min_pi16(a, b)
+
+// Computes the minima of the four single-precision, floating-point values of a
+// and b.
+// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
+{
+#if SSE2NEON_PRECISE_MINMAX
+    float32x4_t _a = vreinterpretq_f32_m128(a);
+    float32x4_t _b = vreinterpretq_f32_m128(b);
+    return vbslq_f32(vcltq_f32(_a, _b), _a, _b);
+#else
+    return vreinterpretq_m128_f32(
+        vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#endif
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
+// values in dst.
+//
+//   FOR j := 0 to 7
+//      i := j*8
+//      dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
+FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u8(
+        vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
+// values in dst.
+//
+//   FOR j := 0 to 7
+//      i := j*8
+//      dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
+#define _m_pminub(a, b) _mm_min_pu8(a, b)
+
+// Computes the maximum of the two lower scalar single-precision floating point
+// values of a and b.
+// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
+{
+    float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+}
+
+// Computes the minimum of the two lower scalar single-precision floating point
+// values of a and b.
+// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
+{
+    float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+}
+
+// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
+// 16 unsigned 8-bit integers from b.
+// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
+// 16 unsigned 8-bit integers from b.
+// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx
+FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
+// signed 16-bit integers from b.
+// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8
+FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
+// signed 16-bit integers from b.
+// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// epi versions of min/max
+// Computes the pariwise maximums of the four signed 32-bit integer values of a
+// and b.
+//
+// A 128-bit parameter that can be defined with the following equations:
+//   r0 := (a0 > b0) ? a0 : b0
+//   r1 := (a1 > b1) ? a1 : b1
+//   r2 := (a2 > b2) ? a2 : b2
+//   r3 := (a3 > b3) ? a3 : b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Computes the pariwise minima of the four signed 32-bit integer values of a
+// and b.
+//
+// A 128-bit parameter that can be defined with the following equations:
+//   r0 := (a0 < b0) ? a0 : b0
+//   r1 := (a1 < b1) ? a1 : b1
+//   r2 := (a2 < b2) ? a2 : b2
+//   r3 := (a3 < b3) ? a3 : b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
+FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+}
+
+// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
+FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+}
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
+FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u16(vshrn_n_u32(
+        vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
+}
+
+// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
+// integers from b.
+//
+//   r0 := (a0 * b0)[31:16]
+//   r1 := (a1 * b1)[31:16]
+//   ...
+//   r7 := (a7 * b7)[31:16]
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
+{
+    /* FIXME: issue with large values because of result saturation */
+    // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
+    // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
+    // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
+    int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
+    int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
+    int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
+    int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
+    uint16x8x2_t r =
+        vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
+    return vreinterpretq_m128i_u16(r.val[1]);
+}
+
+// Computes pairwise add of each argument as single-precision, floating-point
+// values a and b.
+// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
+FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
+#endif
+}
+
+// Computes pairwise add of each argument as a 16-bit signed or unsigned integer
+// values a and b.
+FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
+{
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
+#else
+    return vreinterpretq_m128i_s16(
+        vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
+                     vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
+#endif
+}
+
+// Horizontally substract adjacent pairs of single-precision (32-bit)
+// floating-point elements in a and b, and pack the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
+FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(vsubq_f32(
+        vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)),
+        vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b))));
+#else
+    float32x4x2_t c =
+        vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b));
+    return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
+// signed 16-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
+FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s16(
+        vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+}
+
+// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
+// signed 32-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
+FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s32(
+        vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
+}
+
+// Computes pairwise difference of each argument as a 16-bit signed or unsigned
+// integer values a and b.
+FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|a4|a6|b0|b2|b4|b6]
+    // [a1|a3|a5|a7|b1|b3|b5|b7]
+    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+    // Subtract
+    return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357));
+}
+
+// Computes saturated pairwise sub of each argument as a 16-bit signed
+// integer values a and b.
+FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
+{
+#if defined(__aarch64__)
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+    return vreinterpretq_s64_s16(
+        vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|a4|a6|b0|b2|b4|b6]
+    // [a1|a3|a5|a7|b1|b3|b5|b7]
+    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+    // Saturated add
+    return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
+#endif
+}
+
+// Computes saturated pairwise difference of each argument as a 16-bit signed
+// integer values a and b.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
+FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
+{
+#if defined(__aarch64__)
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+    return vreinterpretq_s64_s16(
+        vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|a4|a6|b0|b2|b4|b6]
+    // [a1|a3|a5|a7|b1|b3|b5|b7]
+    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+    // Saturated subtract
+    return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357));
+#endif
+}
+
+// Computes pairwise add of each argument as a 32-bit signed or unsigned integer
+// values a and b.
+FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    return vreinterpretq_m128i_s32(
+        vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
+                     vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
+}
+
+// Computes pairwise difference of each argument as a 32-bit signed or unsigned
+// integer values a and b.
+FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
+{
+    int64x2_t a = vreinterpretq_s64_m128i(_a);
+    int64x2_t b = vreinterpretq_s64_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|b0|b2]
+    // [a1|a2|b1|b3]
+    int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b));
+    int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32));
+    // Subtract
+    return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13));
+}
+
+// Kahan summation for accurate summation of floating-point numbers.
+// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
+FORCE_INLINE void sse2neon_kadd_f32(float *sum, float *c, float y)
+{
+    y -= *c;
+    float t = *sum + y;
+    *c = (t - *sum) - y;
+    *sum = t;
+}
+
+// Conditionally multiply the packed single-precision (32-bit) floating-point
+// elements in a and b using the high 4 bits in imm8, sum the four products,
+// and conditionally store the sum in dst using the low 4 bits of imm.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps
+FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
+{
+#if defined(__aarch64__)
+    /* shortcuts */
+    if (imm == 0xFF) {
+        return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
+    }
+    if (imm == 0x7F) {
+        float32x4_t m = _mm_mul_ps(a, b);
+        m[3] = 0;
+        return _mm_set1_ps(vaddvq_f32(m));
+    }
+#endif
+
+    float s = 0, c = 0;
+    float32x4_t f32a = vreinterpretq_f32_m128(a);
+    float32x4_t f32b = vreinterpretq_f32_m128(b);
+
+    /* To improve the accuracy of floating-point summation, Kahan algorithm
+     * is used for each operation.
+     */
+    if (imm & (1 << 4))
+        sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
+    if (imm & (1 << 5))
+        sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
+    if (imm & (1 << 6))
+        sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
+    if (imm & (1 << 7))
+        sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
+    s += c;
+
+    float32x4_t res = {
+        (imm & 0x1) ? s : 0,
+        (imm & 0x2) ? s : 0,
+        (imm & 0x4) ? s : 0,
+        (imm & 0x8) ? s : 0,
+    };
+    return vreinterpretq_m128_f32(res);
+}
+
+/* Compare operations */
+
+// Compares for less than
+// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compares for less than
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
+FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmplt_ps(a, b));
+}
+
+// Compares for greater than.
+//
+//   r0 := (a0 > b0) ? 0xffffffff : 0x0
+//   r1 := (a1 > b1) ? 0xffffffff : 0x0
+//   r2 := (a2 > b2) ? 0xffffffff : 0x0
+//   r3 := (a3 > b3) ? 0xffffffff : 0x0
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compares for greater than.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
+}
+
+// Compares for greater than or equal.
+// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compares for greater than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpge_ps(a, b));
+}
+
+// Compares for less than or equal.
+//
+//   r0 := (a0 <= b0) ? 0xffffffff : 0x0
+//   r1 := (a1 <= b1) ? 0xffffffff : 0x0
+//   r2 := (a2 <= b2) ? 0xffffffff : 0x0
+//   r3 := (a3 <= b3) ? 0xffffffff : 0x0
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compares for less than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
+FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmple_ps(a, b));
+}
+
+// Compares for equality.
+// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compares for equality.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
+}
+
+// Compares for inequality.
+// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compares for inequality.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
+}
+
+// Compares for not greater than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
+{
+    return _mm_cmplt_ps(a, b);
+}
+
+// Compares for not greater than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
+{
+    return _mm_cmplt_ss(a, b);
+}
+
+// Compares for not greater than.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
+{
+    return _mm_cmple_ps(a, b);
+}
+
+// Compares for not greater than.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
+{
+    return _mm_cmple_ss(a, b);
+}
+
+// Compares for not less than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
+{
+    return _mm_cmpgt_ps(a, b);
+}
+
+// Compares for not less than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
+{
+    return _mm_cmpgt_ss(a, b);
+}
+
+// Compares for not less than.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
+{
+    return _mm_cmpge_ps(a, b);
+}
+
+// Compares for not less than.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
+{
+    return _mm_cmpge_ss(a, b);
+}
+
+// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
+// unsigned 8-bit integers in b for equality.
+// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
+// unsigned 16-bit integers in b for equality.
+// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed 32-bit integers in a and b for equality, and store the results
+// in dst
+FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed 64-bit integers in a and b for equality, and store the results
+// in dst
+FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_u64(
+        vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
+#else
+    // ARMv7 lacks vceqq_u64
+    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+    uint32x4_t cmp =
+        vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
+#endif
+}
+
+// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
+// in b for lesser than.
+// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
+// in b for greater than.
+//
+//   r0 := (a0 > b0) ? 0xff : 0x0
+//   r1 := (a1 > b1) ? 0xff : 0x0
+//   ...
+//   r15 := (a15 > b15) ? 0xff : 0x0
+//
+// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
+// in b for less than.
+//
+//   r0 := (a0 < b0) ? 0xffff : 0x0
+//   r1 := (a1 < b1) ? 0xffff : 0x0
+//   ...
+//   r7 := (a7 < b7) ? 0xffff : 0x0
+//
+// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
+// in b for greater than.
+//
+//   r0 := (a0 > b0) ? 0xffff : 0x0
+//   r1 := (a1 > b1) ? 0xffff : 0x0
+//   ...
+//   r7 := (a7 > b7) ? 0xffff : 0x0
+//
+// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+
+// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
+// in b for less than.
+// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
+// in b for greater than.
+// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
+// in b for greater than.
+FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_u64(
+        vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+#else
+    // ARMv7 lacks vcgtq_s64.
+    // This is based off of Clang's SSE2 polyfill:
+    // (a > b) -> ((a_hi > b_hi) || (a_lo > b_lo && a_hi == b_hi))
+
+    // Mask the sign bit out since we need a signed AND an unsigned comparison
+    // and it is ugly to try and split them.
+    int32x4_t mask = vreinterpretq_s32_s64(vdupq_n_s64(0x80000000ull));
+    int32x4_t a_mask = veorq_s32(vreinterpretq_s32_m128i(a), mask);
+    int32x4_t b_mask = veorq_s32(vreinterpretq_s32_m128i(b), mask);
+    // Check if a > b
+    int64x2_t greater = vreinterpretq_s64_u32(vcgtq_s32(a_mask, b_mask));
+    // Copy upper mask to lower mask
+    // a_hi > b_hi
+    int64x2_t gt_hi = vshrq_n_s64(greater, 63);
+    // Copy lower mask to upper mask
+    // a_lo > b_lo
+    int64x2_t gt_lo = vsliq_n_s64(greater, greater, 32);
+    // Compare for equality
+    int64x2_t equal = vreinterpretq_s64_u32(vceqq_s32(a_mask, b_mask));
+    // Copy upper mask to lower mask
+    // a_hi == b_hi
+    int64x2_t eq_hi = vshrq_n_s64(equal, 63);
+    // a_hi > b_hi || (a_lo > b_lo && a_hi == b_hi)
+    int64x2_t ret = vorrq_s64(gt_hi, vandq_s64(gt_lo, eq_hi));
+    return vreinterpretq_m128i_s64(ret);
+#endif
+}
+
+// Compares the four 32-bit floats in a and b to check if any values are NaN.
+// Ordered compare between each value returns true for "orderable" and false for
+// "not orderable" (NaN).
+// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
+// also:
+// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
+// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
+FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
+{
+    // Note: NEON does not have ordered compare builtin
+    // Need to compare a eq a and b eq b to check for NaN
+    // Do AND of results to get final
+    uint32x4_t ceqaa =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t ceqbb =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
+}
+
+// Compares for ordered.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpord_ps(a, b));
+}
+
+// Compares for unordered.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
+{
+    uint32x4_t f32a =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t f32b =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
+}
+
+// Compares for unordered.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using a less than operation. :
+// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important
+// note!! The documentation on MSDN is incorrect!  If either of the values is a
+// NAN the docs say you will get a one, but in fact, it will return a zero!!
+FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
+{
+    uint32x4_t a_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+    uint32x4_t a_lt_b =
+        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) != 0) ? 1 : 0;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using a greater than operation. :
+// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
+FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
+{
+    // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a),
+    // vreinterpretq_f32_m128(b)), 0);
+    uint32x4_t a_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+    uint32x4_t a_gt_b =
+        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using a less than or equal operation. :
+// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
+FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
+{
+    // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a),
+    // vreinterpretq_f32_m128(b)), 0);
+    uint32x4_t a_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+    uint32x4_t a_le_b =
+        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) != 0) ? 1 : 0;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using a greater than or equal operation. :
+// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
+FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
+{
+    // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a),
+    // vreinterpretq_f32_m128(b)), 0);
+    uint32x4_t a_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+    uint32x4_t a_ge_b =
+        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using an equality operation. :
+// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
+FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
+{
+    // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
+    // vreinterpretq_f32_m128(b)), 0);
+    uint32x4_t a_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+    uint32x4_t a_eq_b =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) != 0) ? 1 : 0;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using an inequality operation. :
+// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
+FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
+{
+    // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
+    // vreinterpretq_f32_m128(b)), 0);
+    uint32x4_t a_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
+    uint32x4_t a_neq_b = vmvnq_u32(
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+    return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) != 0) ? 1 : 0;
+}
+
+// according to the documentation, these intrinsics behave the same as the
+// non-'u' versions.  We'll just alias them here.
+#define _mm_ucomilt_ss _mm_comilt_ss
+#define _mm_ucomile_ss _mm_comile_ss
+#define _mm_ucomigt_ss _mm_comigt_ss
+#define _mm_ucomige_ss _mm_comige_ss
+#define _mm_ucomieq_ss _mm_comieq_ss
+#define _mm_ucomineq_ss _mm_comineq_ss
+
+/* Conversions */
+
+// Convert packed signed 32-bit integers in b to packed single-precision
+// (32-bit) floating-point elements, store the results in the lower 2 elements
+// of dst, and copy the upper 2 packed elements from a to the upper elements of
+// dst.
+//
+//   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+//   dst[63:32] := Convert_Int32_To_FP32(b[63:32])
+//   dst[95:64] := a[95:64]
+//   dst[127:96] := a[127:96]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
+FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
+{
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
+                     vget_high_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert the signed 32-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+//
+//   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+//   dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
+FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
+{
+    __m128 ret = a;
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(ret), 0));
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer, and store the result in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
+FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)), 0);
+#else
+    float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    float32_t diff = data - floor(data);
+    if (diff > 0.5)
+        return (int32_t) ceil(data);
+    if (diff == 0.5) {
+        int32_t f = (int32_t) floor(data);
+        int32_t c = (int32_t) ceil(data);
+        return c & 1 ? f : c;
+    }
+    return (int32_t) floor(data);
+#endif
+}
+
+// Convert packed 16-bit integers in a to packed single-precision (32-bit)
+// floating-point elements, and store the results in dst.
+//
+//   FOR j := 0 to 3
+//      i := j*16
+//      m := j*32
+//      dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
+FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
+{
+    return vreinterpretq_m128_f32(
+        vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
+}
+
+// Convert packed 32-bit integers in b to packed single-precision (32-bit)
+// floating-point elements, store the results in the lower 2 elements of dst,
+// and copy the upper 2 packed elements from a to the upper elements of dst.
+//
+//   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+//   dst[63:32] := Convert_Int32_To_FP32(b[63:32])
+//   dst[95:64] := a[95:64]
+//   dst[127:96] := a[127:96]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
+FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
+{
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
+                     vget_high_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert packed signed 32-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, store the results in the lower 2 elements
+// of dst, then covert the packed signed 32-bit integers in b to
+// single-precision (32-bit) floating-point element, and store the results in
+// the upper 2 elements of dst.
+//
+//   dst[31:0] := Convert_Int32_To_FP32(a[31:0])
+//   dst[63:32] := Convert_Int32_To_FP32(a[63:32])
+//   dst[95:64] := Convert_Int32_To_FP32(b[31:0])
+//   dst[127:96] := Convert_Int32_To_FP32(b[63:32])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps
+FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
+{
+    return vreinterpretq_m128_f32(vcvtq_f32_s32(
+        vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
+}
+
+// Convert the lower packed 8-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, and store the results in dst.
+//
+//   FOR j := 0 to 3
+//      i := j*8
+//      m := j*32
+//      dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps
+FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
+{
+    return vreinterpretq_m128_f32(vcvtq_f32_s32(
+        vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
+}
+
+// Convert packed unsigned 16-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, and store the results in dst.
+//
+//   FOR j := 0 to 3
+//      i := j*16
+//      m := j*32
+//      dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps
+FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
+{
+    return vreinterpretq_m128_f32(
+        vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
+}
+
+// Convert the lower packed unsigned 8-bit integers in a to packed
+// single-precision (32-bit) floating-point elements, and store the results in
+// dst.
+//
+//   FOR j := 0 to 3
+//      i := j*8
+//      m := j*32
+//      dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps
+FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
+{
+    return vreinterpretq_m128_f32(vcvtq_f32_u32(
+        vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
+}
+
+// Converts the four single-precision, floating-point values of a to signed
+// 32-bit integer values using truncate.
+// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
+{
+    return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
+}
+
+// Converts the four signed 32-bit integer values of a to single-precision,
+// floating-point values
+// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
+{
+    return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
+}
+
+// Converts the four unsigned 8-bit integers in the lower 16 bits to four
+// unsigned 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);    /* xxxx xxxx xxxx DCBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
+    return vreinterpretq_m128i_u16(u16x8);
+}
+
+// Converts the four unsigned 8-bit integers in the lower 32 bits to four
+// unsigned 32-bit integers.
+// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
+FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx DCBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0D0C 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
+    return vreinterpretq_m128i_u32(u32x4);
+}
+
+// Converts the two unsigned 8-bit integers in the lower 16 bits to two
+// unsigned 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx xxBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
+    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_u64(u64x2);
+}
+
+// Converts the four unsigned 8-bit integers in the lower 16 bits to four
+// unsigned 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);    /* xxxx xxxx xxxx DCBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
+    return vreinterpretq_m128i_s16(s16x8);
+}
+
+// Converts the four unsigned 8-bit integers in the lower 32 bits to four
+// unsigned 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx DCBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0D0C 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
+    return vreinterpretq_m128i_s32(s32x4);
+}
+
+// Converts the two signed 8-bit integers in the lower 32 bits to four
+// signed 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx xxBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0x0x 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
+    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_s64(s64x2);
+}
+
+// Converts the four signed 16-bit integers in the lower 64 bits to four signed
+// 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
+}
+
+// Converts the two signed 16-bit integers in the lower 32 bits two signed
+// 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
+{
+    int16x8_t s16x8 = vreinterpretq_s16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
+    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_s64(s64x2);
+}
+
+// Converts the four unsigned 16-bit integers in the lower 64 bits to four
+// unsigned 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
+{
+    return vreinterpretq_m128i_u32(
+        vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
+}
+
+// Converts the two unsigned 16-bit integers in the lower 32 bits to two
+// unsigned 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
+{
+    uint16x8_t u16x8 = vreinterpretq_u16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
+    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_u64(u64x2);
+}
+
+// Converts the two unsigned 32-bit integers in the lower 64 bits to two
+// unsigned 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
+{
+    return vreinterpretq_m128i_u64(
+        vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
+}
+
+// Converts the two signed 32-bit integers in the lower 64 bits to two signed
+// 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
+{
+    return vreinterpretq_m128i_s64(
+        vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
+}
+
+// Converts the four single-precision, floating-point values of a to signed
+// 32-bit integer values.
+//
+//   r0 := (int) a0
+//   r1 := (int) a1
+//   r2 := (int) a2
+//   r3 := (int) a3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
+// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
+// does not support! It is supported on ARMv8-A however.
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
+#else
+    uint32x4_t signmask = vdupq_n_u32(0x80000000);
+    float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
+                                 vdupq_n_f32(0.5f)); /* +/- 0.5 */
+    int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
+        vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
+    int32x4_t r_trunc =
+        vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
+    int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
+        vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
+    int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
+                                 vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
+    float32x4_t delta = vsubq_f32(
+        vreinterpretq_f32_m128(a),
+        vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
+    uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */
+    return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal));
+#endif
+}
+
+// Copy the lower 32-bit integer in a to dst.
+//
+//   dst[31:0] := a[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
+FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
+{
+    return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
+}
+
+// Copy the lower 64-bit integer in a to dst.
+//
+//   dst[63:0] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64
+FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
+{
+    return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
+}
+
+// Copy the lower 64-bit integer in a to dst.
+//
+//   dst[63:0] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
+#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
+
+// Moves 32-bit integer a to the least significant 32 bits of an __m128 object,
+// zero extending the upper bits.
+//
+//   r0 := a
+//   r1 := 0x0
+//   r2 := 0x0
+//   r3 := 0x0
+//
+// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
+{
+    return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
+}
+
+// Moves 64-bit integer a to the least significant 64 bits of an __m128 object,
+// zero extending the upper bits.
+//
+//   r0 := a
+//   r1 := 0x0
+FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
+{
+    return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
+}
+
+// Cast vector of type __m128 to type __m128d. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd
+FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
+{
+    return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
+}
+
+// Applies a type cast to reinterpret four 32-bit floating point values passed
+// in as a 128-bit parameter as packed 32-bit integers.
+// https://msdn.microsoft.com/en-us/library/bb514099.aspx
+FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
+{
+    return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
+}
+
+// Applies a type cast to reinterpret four 32-bit integers passed in as a
+// 128-bit parameter as packed 32-bit floating point values.
+// https://msdn.microsoft.com/en-us/library/bb514029.aspx
+FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
+{
+    return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
+}
+
+// Loads 128-bit value. :
+// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
+FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
+{
+    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+//
+//   dst[63:0] := MEM[mem_addr+63:mem_addr]
+//   dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
+FORCE_INLINE __m128d _mm_load1_pd(const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
+#else
+    return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
+#endif
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// upper element of dst, and copy the lower element from a to dst. mem_addr does
+// not need to be aligned on any particular boundary.
+//
+//   dst[63:0] := a[63:0]
+//   dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
+FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
+#else
+    return vreinterpretq_m128d_f32(vcombine_f32(
+        vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
+#endif
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+//
+//   dst[63:0] := MEM[mem_addr+63:mem_addr]
+//   dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
+#define _mm_load_pd1 _mm_load1_pd
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+//
+//   dst[63:0] := MEM[mem_addr+63:mem_addr]
+//   dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd
+#define _mm_loaddup_pd _mm_load1_pd
+
+// Loads 128-bit value. :
+// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
+{
+    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
+}
+
+// Load unaligned 32-bit integer from memory into the first element of dst.
+//
+//   dst[31:0] := MEM[mem_addr+31:mem_addr]
+//   dst[MAX:32] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
+FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
+{
+    return vreinterpretq_m128i_s32(
+        vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed single-precision (32-bit) floating-point elements, and store the
+// results in dst.
+//
+//   FOR j := 0 to 1
+//     i := 32*j
+//     k := 64*j
+//     dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k])
+//   ENDFOR
+//   dst[127:64] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
+FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
+{
+#if defined(__aarch64__)
+    float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
+    return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
+#else
+    float a0 = (float) ((double *) &a)[0];
+    float a1 = (float) ((double *) &a)[1];
+    return _mm_set_ps(0, 0, a1, a0);
+#endif
+}
+
+// Copy the lower double-precision (64-bit) floating-point element of a to dst.
+//
+//   dst[63:0] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
+FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
+{
+#if defined(__aarch64__)
+    return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
+#else
+    return ((double *) &a)[0];
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed double-precision (64-bit) floating-point elements, and store the
+// results in dst.
+//
+//   FOR j := 0 to 1
+//     i := 64*j
+//     k := 32*j
+//     dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
+FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
+#else
+    double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
+    return _mm_set_pd(a1, a0);
+#endif
+}
+
+// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
+FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
+{
+    return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
+}
+
+// Blend packed single-precision (32-bit) floating-point elements from a and b
+// using mask, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps
+FORCE_INLINE __m128 _mm_blendv_ps(__m128 a, __m128 b, __m128 mask)
+{
+    return vreinterpretq_m128_f32(vbslq_f32(vreinterpretq_u32_m128(mask),
+                                            vreinterpretq_f32_m128(b),
+                                            vreinterpretq_f32_m128(a)));
+}
+
+// Round the packed single-precision (32-bit) floating-point elements in a using
+// the rounding parameter, and store the results as packed single-precision
+// floating-point elements in dst.
+// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
+FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
+{
+#if defined(__aarch64__)
+    switch (rounding) {
+    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
+    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
+    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
+    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
+    default:  //_MM_FROUND_CUR_DIRECTION
+        return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
+    }
+#else
+    float *v_float = (float *) &a;
+    __m128 zero, neg_inf, pos_inf;
+
+    switch (rounding) {
+    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
+        return _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
+    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
+        return (__m128){floorf(v_float[0]), floorf(v_float[1]),
+                        floorf(v_float[2]), floorf(v_float[3])};
+    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
+        return (__m128){ceilf(v_float[0]), ceilf(v_float[1]), ceilf(v_float[2]),
+                        ceilf(v_float[3])};
+    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
+        zero = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f);
+        neg_inf = _mm_set_ps(floorf(v_float[0]), floorf(v_float[1]),
+                             floorf(v_float[2]), floorf(v_float[3]));
+        pos_inf = _mm_set_ps(ceilf(v_float[0]), ceilf(v_float[1]),
+                             ceilf(v_float[2]), ceilf(v_float[3]));
+        return _mm_blendv_ps(pos_inf, neg_inf, _mm_cmple_ps(a, zero));
+    default:  //_MM_FROUND_CUR_DIRECTION
+        return (__m128){roundf(v_float[0]), roundf(v_float[1]),
+                        roundf(v_float[2]), roundf(v_float[3])};
+    }
+#endif
+}
+
+// Round the packed single-precision (32-bit) floating-point elements in a up to
+// an integer value, and store the results as packed single-precision
+// floating-point elements in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps
+FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
+{
+    return _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+}
+
+// Round the packed single-precision (32-bit) floating-point elements in a down
+// to an integer value, and store the results as packed single-precision
+// floating-point elements in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps
+FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
+{
+    return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+}
+
+
+// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
+// may perform better than _mm_loadu_si128 when the data crosses a cache line
+// boundary.
+//
+//   dst[127:0] := MEM[mem_addr+127:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
+#define _mm_lddqu_si128 _mm_loadu_si128
+
+/* Miscellaneous Operations */
+
+// Shifts the 8 signed 16-bit integers in a right by count bits while shifting
+// in the sign bit.
+//
+//   r0 := a0 >> count
+//   r1 := a1 >> count
+//   ...
+//   r7 := a7 >> count
+//
+// https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
+{
+    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
+    if (c > 15)
+        return _mm_cmplt_epi16(a, _mm_setzero_si128());
+    return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
+}
+
+// Shifts the 4 signed 32-bit integers in a right by count bits while shifting
+// in the sign bit.
+//
+//   r0 := a0 >> count
+//   r1 := a1 >> count
+//   r2 := a2 >> count
+//   r3 := a3 >> count
+//
+// https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx
+FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
+{
+    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
+    if (c > 31)
+        return _mm_cmplt_epi32(a, _mm_setzero_si128());
+    return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
+}
+
+// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
+// saturates.
+// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
+                    vqmovn_s16(vreinterpretq_s16_m128i(b))));
+}
+
+// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned
+// integers and saturates.
+//
+//   r0 := UnsignedSaturate(a0)
+//   r1 := UnsignedSaturate(a1)
+//   ...
+//   r7 := UnsignedSaturate(a7)
+//   r8 := UnsignedSaturate(b0)
+//   r9 := UnsignedSaturate(b1)
+//   ...
+//   r15 := UnsignedSaturate(b7)
+//
+// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
+                    vqmovun_s16(vreinterpretq_s16_m128i(b))));
+}
+
+// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
+// and saturates.
+//
+//   r0 := SignedSaturate(a0)
+//   r1 := SignedSaturate(a1)
+//   r2 := SignedSaturate(a2)
+//   r3 := SignedSaturate(a3)
+//   r4 := SignedSaturate(b0)
+//   r5 := SignedSaturate(b1)
+//   r6 := SignedSaturate(b2)
+//   r7 := SignedSaturate(b3)
+//
+// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
+                     vqmovn_s32(vreinterpretq_s32_m128i(b))));
+}
+
+// Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit
+// integers and saturates.
+//
+//   r0 := UnsignedSaturate(a0)
+//   r1 := UnsignedSaturate(a1)
+//   r2 := UnsignedSaturate(a2)
+//   r3 := UnsignedSaturate(a3)
+//   r4 := UnsignedSaturate(b0)
+//   r5 := UnsignedSaturate(b1)
+//   r6 := UnsignedSaturate(b2)
+//   r7 := UnsignedSaturate(b3)
+FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
+                     vqmovun_s32(vreinterpretq_s32_m128i(b))));
+}
+
+// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
+// 8 signed or unsigned 8-bit integers in b.
+//
+//   r0 := a0
+//   r1 := b0
+//   r2 := a1
+//   r3 := b1
+//   ...
+//   r14 := a7
+//   r15 := b7
+//
+// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s8(
+        vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+#else
+    int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
+    int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
+    int8x8x2_t result = vzip_s8(a1, b1);
+    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
+#endif
+}
+
+// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
+// lower 4 signed or unsigned 16-bit integers in b.
+//
+//   r0 := a0
+//   r1 := b0
+//   r2 := a1
+//   r3 := b1
+//   r4 := a2
+//   r5 := b2
+//   r6 := a3
+//   r7 := b3
+//
+// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(
+        vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+#else
+    int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
+    int16x4x2_t result = vzip_s16(a1, b1);
+    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
+#endif
+}
+
+// Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the
+// lower 2 signed or unsigned 32 - bit integers in b.
+//
+//   r0 := a0
+//   r1 := b0
+//   r2 := a1
+//   r3 := b1
+//
+// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s32(
+        vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+#else
+    int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
+    int32x2x2_t result = vzip_s32(a1, b1);
+    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
+#endif
+}
+
+FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
+{
+    int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
+    int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
+    return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
+}
+
+// Selects and interleaves the lower two single-precision, floating-point values
+// from a and b.
+//
+//   r0 := a0
+//   r1 := b0
+//   r2 := a1
+//   r3 := b1
+//
+// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
+FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
+    float32x2x2_t result = vzip_f32(a1, b1);
+    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
+#endif
+}
+
+// Selects and interleaves the upper two single-precision, floating-point values
+// from a and b.
+//
+//   r0 := a2
+//   r1 := b2
+//   r2 := a3
+//   r3 := b3
+//
+// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
+FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
+    float32x2x2_t result = vzip_f32(a1, b1);
+    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
+#endif
+}
+
+// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
+// 8 signed or unsigned 8-bit integers in b.
+//
+//   r0 := a8
+//   r1 := b8
+//   r2 := a9
+//   r3 := b9
+//   ...
+//   r14 := a15
+//   r15 := b15
+//
+// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s8(
+        vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+#else
+    int8x8_t a1 =
+        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
+    int8x8_t b1 =
+        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
+    int8x8x2_t result = vzip_s8(a1, b1);
+    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
+#endif
+}
+
+// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
+// upper 4 signed or unsigned 16-bit integers in b.
+//
+//   r0 := a4
+//   r1 := b4
+//   r2 := a5
+//   r3 := b5
+//   r4 := a6
+//   r5 := b6
+//   r6 := a7
+//   r7 := b7
+//
+// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(
+        vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+#else
+    int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
+    int16x4x2_t result = vzip_s16(a1, b1);
+    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
+#endif
+}
+
+// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
+// upper 2 signed or unsigned 32-bit integers in b.
+// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s32(
+        vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+#else
+    int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
+    int32x2x2_t result = vzip_s32(a1, b1);
+    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
+#endif
+}
+
+// Interleaves the upper signed or unsigned 64-bit integer in a with the
+// upper signed or unsigned 64-bit integer in b.
+//
+//   r0 := a1
+//   r1 := b1
+FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
+{
+    int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
+    int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
+    return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
+}
+
+// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
+// in a, store the minimum and index in dst, and zero the remaining bits in dst.
+//
+//   index[2:0] := 0
+//   min[15:0] := a[15:0]
+//   FOR j := 0 to 7
+//       i := j*16
+//       IF a[i+15:i] < min[15:0]
+//           index[2:0] := j
+//           min[15:0] := a[i+15:i]
+//       FI
+//   ENDFOR
+//   dst[15:0] := min[15:0]
+//   dst[18:16] := index[2:0]
+//   dst[127:19] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16
+FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
+{
+    __m128i dst;
+    uint16_t min, idx = 0;
+    // Find the minimum value
+#if defined(__aarch64__)
+    min = vminvq_u16(vreinterpretq_u16_m128i(a));
+#else
+    __m64 tmp;
+    tmp = vreinterpret_m64_u16(
+        vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
+                 vget_high_u16(vreinterpretq_u16_m128i(a))));
+    tmp = vreinterpret_m64_u16(
+        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
+    tmp = vreinterpret_m64_u16(
+        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
+    min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
+#endif
+    // Get the index of the minimum value
+    int i;
+    for (i = 0; i < 8; i++) {
+        if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
+            idx = (uint16_t) i;
+            break;
+        }
+        a = _mm_srli_si128(a, 2);
+    }
+    // Generate result
+    dst = _mm_setzero_si128();
+    dst = vreinterpretq_m128i_u16(
+        vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
+    dst = vreinterpretq_m128i_u16(
+        vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
+    return dst;
+}
+
+// shift to right
+// https://msdn.microsoft.com/en-us/library/bb514041(v=vs.120).aspx
+// http://blog.csdn.net/hemmingway/article/details/44828303
+// Clang requires a macro here, as it is extremely picky about c being a
+// literal.
+#define _mm_alignr_epi8(a, b, c) \
+    ((__m128i) vextq_s8((int8x16_t)(b), (int8x16_t)(a), (c)))
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
+// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
+// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
+// otherwise set CF to 0. Return the CF value.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128
+FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
+{
+    int64x2_t s64 =
+        vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))),
+                  vreinterpretq_s64_m128i(b));
+    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
+// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
+// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
+// otherwise set CF to 0. Return the ZF value.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128
+FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
+{
+    int64x2_t s64 =
+        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
+    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
+}
+
+// Extracts the selected signed or unsigned 8-bit integer from a and zero
+// extends.
+// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
+#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
+
+// Inserts the least significant 8 bits of b into the selected 8-bit integer
+// of a.
+// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
+//                                      __constrange(0,16) int imm)
+#define _mm_insert_epi8(a, b, imm)                                 \
+    __extension__({                                                \
+        vreinterpretq_m128i_s8(                                    \
+            vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
+    })
+
+// Extracts the selected signed or unsigned 16-bit integer from a and zero
+// extends.
+// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
+// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
+#define _mm_extract_epi16(a, imm) \
+    vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
+
+// Inserts the least significant 16 bits of b into the selected 16-bit integer
+// of a.
+// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
+// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
+//                                       __constrange(0,8) int imm)
+#define _mm_insert_epi16(a, b, imm)                                  \
+    __extension__({                                                  \
+        vreinterpretq_m128i_s16(                                     \
+            vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
+    })
+
+// Extracts the selected signed or unsigned 32-bit integer from a and zero
+// extends.
+// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
+#define _mm_extract_epi32(a, imm) \
+    vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
+
+// Extracts the selected single-precision (32-bit) floating-point from a.
+// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
+#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
+
+// Inserts the least significant 32 bits of b into the selected 32-bit integer
+// of a.
+// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
+//                                       __constrange(0,4) int imm)
+#define _mm_insert_epi32(a, b, imm)                                  \
+    __extension__({                                                  \
+        vreinterpretq_m128i_s32(                                     \
+            vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
+    })
+
+// Extracts the selected signed or unsigned 64-bit integer from a and zero
+// extends.
+// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
+#define _mm_extract_epi64(a, imm) \
+    vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
+
+// Inserts the least significant 64 bits of b into the selected 64-bit integer
+// of a.
+// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
+//                                       __constrange(0,2) int imm)
+#define _mm_insert_epi64(a, b, imm)                                  \
+    __extension__({                                                  \
+        vreinterpretq_m128i_s64(                                     \
+            vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
+    })
+
+// Count the number of bits set to 1 in unsigned 32-bit integer a, and
+// return that count in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
+FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
+{
+#if defined(__aarch64__)
+#if __has_builtin(__builtin_popcount)
+    return __builtin_popcount(a);
+#else
+    return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
+#endif
+#else
+    uint32_t count = 0;
+    uint8x8_t input_val, count8x8_val;
+    uint16x4_t count16x4_val;
+    uint32x2_t count32x2_val;
+
+    input_val = vld1_u8((uint8_t *) &a);
+    count8x8_val = vcnt_u8(input_val);
+    count16x4_val = vpaddl_u8(count8x8_val);
+    count32x2_val = vpaddl_u16(count16x4_val);
+
+    vst1_u32(&count, count32x2_val);
+    return count;
+#endif
+}
+
+// Count the number of bits set to 1 in unsigned 64-bit integer a, and
+// return that count in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
+FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
+{
+#if defined(__aarch64__)
+#if __has_builtin(__builtin_popcountll)
+    return __builtin_popcountll(a);
+#else
+    return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
+#endif
+#else
+    uint64_t count = 0;
+    uint8x8_t input_val, count8x8_val;
+    uint16x4_t count16x4_val;
+    uint32x2_t count32x2_val;
+    uint64x1_t count64x1_val;
+
+    input_val = vld1_u8((uint8_t *) &a);
+    count8x8_val = vcnt_u8(input_val);
+    count16x4_val = vpaddl_u8(count8x8_val);
+    count32x2_val = vpaddl_u16(count16x4_val);
+    count64x1_val = vpaddl_u32(count32x2_val);
+    vst1_u64(&count, count64x1_val);
+    return count;
+#endif
+}
+
+// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
+// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
+// transposed matrix in these vectors (row0 now contains column 0, etc.).
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS
+#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)         \
+    do {                                                  \
+        float32x4x2_t ROW01 = vtrnq_f32(row0, row1);      \
+        float32x4x2_t ROW23 = vtrnq_f32(row2, row3);      \
+        row0 = vcombine_f32(vget_low_f32(ROW01.val[0]),   \
+                            vget_low_f32(ROW23.val[0]));  \
+        row1 = vcombine_f32(vget_low_f32(ROW01.val[1]),   \
+                            vget_low_f32(ROW23.val[1]));  \
+        row2 = vcombine_f32(vget_high_f32(ROW01.val[0]),  \
+                            vget_high_f32(ROW23.val[0])); \
+        row3 = vcombine_f32(vget_high_f32(ROW01.val[1]),  \
+                            vget_high_f32(ROW23.val[1])); \
+    } while (0)
+
+/* Crypto Extensions */
+
+#if defined(__ARM_FEATURE_CRYPTO)
+// Wraps vmull_p64
+FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
+{
+    poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
+    poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
+    return vreinterpretq_u64_p128(vmull_p64(a, b));
+}
+#else  // ARMv7 polyfill
+// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
+//
+// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
+// 64-bit->128-bit polynomial multiply.
+//
+// It needs some work and is somewhat slow, but it is still faster than all
+// known scalar methods.
+//
+// Algorithm adapted to C from
+// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
+// from "Fast Software Polynomial Multiplication on ARM Processors Using the
+// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
+// (https://hal.inria.fr/hal-01506572)
+static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
+{
+    poly8x8_t a = vreinterpret_p8_u64(_a);
+    poly8x8_t b = vreinterpret_p8_u64(_b);
+
+    // Masks
+    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
+                                    vcreate_u8(0x00000000ffffffff));
+    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
+                                    vcreate_u8(0x0000000000000000));
+
+    // Do the multiplies, rotating with vext to get all combinations
+    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));  // D = A0 * B0
+    uint8x16_t e =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
+    uint8x16_t f =
+        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
+    uint8x16_t g =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
+    uint8x16_t h =
+        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
+    uint8x16_t i =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
+    uint8x16_t j =
+        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
+    uint8x16_t k =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // L = A0 * B4
+
+    // Add cross products
+    uint8x16_t l = veorq_u8(e, f);  // L = E + F
+    uint8x16_t m = veorq_u8(g, h);  // M = G + H
+    uint8x16_t n = veorq_u8(i, j);  // N = I + J
+
+    // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
+    // instructions.
+#if defined(__aarch64__)
+    uint8x16_t lm_p0 = vreinterpretq_u8_u64(
+        vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
+    uint8x16_t lm_p1 = vreinterpretq_u8_u64(
+        vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
+    uint8x16_t nk_p0 = vreinterpretq_u8_u64(
+        vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
+    uint8x16_t nk_p1 = vreinterpretq_u8_u64(
+        vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
+#else
+    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
+    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
+    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
+    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
+#endif
+    // t0 = (L) (P0 + P1) << 8
+    // t1 = (M) (P2 + P3) << 16
+    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
+    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
+    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
+
+    // t2 = (N) (P4 + P5) << 24
+    // t3 = (K) (P6 + P7) << 32
+    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
+    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
+    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
+
+    // De-interleave
+#if defined(__aarch64__)
+    uint8x16_t t0 = vreinterpretq_u8_u64(
+        vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
+    uint8x16_t t1 = vreinterpretq_u8_u64(
+        vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
+    uint8x16_t t2 = vreinterpretq_u8_u64(
+        vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
+    uint8x16_t t3 = vreinterpretq_u8_u64(
+        vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
+#else
+    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
+    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
+    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
+    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
+#endif
+    // Shift the cross products
+    uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
+    uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
+    uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
+    uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32
+
+    // Accumulate the products
+    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
+    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
+    uint8x16_t mix = veorq_u8(d, cross1);
+    uint8x16_t r = veorq_u8(mix, cross2);
+    return vreinterpretq_u64_u8(r);
+}
+#endif  // ARMv7 polyfill
+
+FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
+{
+    uint64x2_t a = vreinterpretq_u64_m128i(_a);
+    uint64x2_t b = vreinterpretq_u64_m128i(_b);
+    switch (imm & 0x11) {
+    case 0x00:
+        return vreinterpretq_m128i_u64(
+            _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
+    case 0x01:
+        return vreinterpretq_m128i_u64(
+            _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
+    case 0x10:
+        return vreinterpretq_m128i_u64(
+            _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
+    case 0x11:
+        return vreinterpretq_m128i_u64(
+            _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
+    default:
+        abort();
+    }
+}
+
+#if !defined(__ARM_FEATURE_CRYPTO)
+/* clang-format off */
+#define SSE2NEON_AES_DATA(w)                                           \
+    {                                                                  \
+        w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
+        w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
+        w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
+        w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
+        w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
+        w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
+        w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
+        w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
+        w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
+        w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
+        w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
+        w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
+        w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
+        w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
+        w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
+        w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
+        w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
+        w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
+        w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
+        w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
+        w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
+        w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
+        w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
+        w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
+        w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
+        w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
+        w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
+        w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
+        w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
+        w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
+        w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
+        w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
+        w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
+        w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
+        w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
+        w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
+        w(0xb0), w(0x54), w(0xbb), w(0x16)                             \
+    }
+/* clang-format on */
+
+/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
+#define SSE2NEON_AES_H0(x) (x)
+static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
+#undef SSE2NEON_AES_H0
+
+// In the absence of crypto extensions, implement aesenc using regular neon
+// intrinsics instead. See:
+// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
+// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
+// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
+// for more information Reproduced with permission of the author.
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
+{
+#if defined(__aarch64__)
+    static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
+                                         0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
+                                         0xc, 0x1, 0x6, 0xb};
+    static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
+                                       0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
+
+    uint8x16_t v;
+    uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);
+
+    // shift rows
+    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
+
+    // sub bytes
+    v = vqtbl4q_u8(vld1q_u8_x4(SSE2NEON_sbox), w);
+    v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
+    v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
+    v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);
+
+    // mix columns
+    w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b);
+    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
+    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
+
+    //  add round key
+    return vreinterpretq_m128i_u8(w) ^ RoundKey;
+
+#else /* ARMv7-A NEON implementation */
+#define SSE2NEON_AES_B2W(b0, b1, b2, b3)                                       \
+    (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \
+     (b0))
+#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
+#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
+#define SSE2NEON_AES_U0(p) \
+    SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
+#define SSE2NEON_AES_U1(p) \
+    SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
+#define SSE2NEON_AES_U2(p) \
+    SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
+#define SSE2NEON_AES_U3(p) \
+    SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
+    static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
+        SSE2NEON_AES_DATA(SSE2NEON_AES_U0),
+        SSE2NEON_AES_DATA(SSE2NEON_AES_U1),
+        SSE2NEON_AES_DATA(SSE2NEON_AES_U2),
+        SSE2NEON_AES_DATA(SSE2NEON_AES_U3),
+    };
+#undef SSE2NEON_AES_B2W
+#undef SSE2NEON_AES_F2
+#undef SSE2NEON_AES_F3
+#undef SSE2NEON_AES_U0
+#undef SSE2NEON_AES_U1
+#undef SSE2NEON_AES_U2
+#undef SSE2NEON_AES_U3
+
+    uint32_t x0 = _mm_cvtsi128_si32(EncBlock);
+    uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55));
+    uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA));
+    uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF));
+
+    __m128i out = _mm_set_epi32(
+        (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
+         aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
+        (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
+         aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
+        (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
+         aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
+        (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
+         aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
+
+    return _mm_xor_si128(out, RoundKey);
+#endif
+}
+
+FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
+{
+    /* FIXME: optimized for NEON */
+    uint8_t v[4][4] = {
+        [0] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)],
+               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)],
+               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)],
+               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]},
+        [1] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)],
+               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)],
+               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)],
+               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]},
+        [2] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)],
+               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)],
+               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)],
+               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]},
+        [3] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)],
+               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)],
+               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)],
+               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]},
+    };
+    for (int i = 0; i < 16; i++)
+        vreinterpretq_nth_u8_m128i(a, i) =
+            v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i);
+    return a;
+}
+
+// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
+// This instruction generates a round key for AES encryption. See
+// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
+// for details.
+//
+// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx
+FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
+{
+    uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
+    uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
+    for (int i = 0; i < 4; ++i) {
+        ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]];
+        ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]];
+    }
+    return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
+                         ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
+}
+#undef SSE2NEON_AES_DATA
+
+#else /* __ARM_FEATURE_CRYPTO */
+// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
+// AESMC and then manually applying the real key as an xor operation. This
+// unfortunately means an additional xor op; the compiler should be able to
+// optimize this away for repeated calls however. See
+// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
+// for more details.
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
+        vreinterpretq_u8_m128i(b));
+}
+
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
+FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
+{
+    return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
+                             vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
+                         RoundKey);
+}
+
+FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
+{
+    // AESE does ShiftRows and SubBytes on A
+    uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
+
+    uint8x16_t dest = {
+        // Undo ShiftRows step from AESE and extract X1 and X3
+        u8[0x4], u8[0x1], u8[0xE], u8[0xB],  // SubBytes(X1)
+        u8[0x1], u8[0xE], u8[0xB], u8[0x4],  // ROT(SubBytes(X1))
+        u8[0xC], u8[0x9], u8[0x6], u8[0x3],  // SubBytes(X3)
+        u8[0x9], u8[0x6], u8[0x3], u8[0xC],  // ROT(SubBytes(X3))
+    };
+    uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
+    return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
+}
+#endif
+
+/* Streaming Extensions */
+
+// Guarantees that every preceding store is globally visible before any
+// subsequent store.
+// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
+FORCE_INLINE void _mm_sfence(void)
+{
+    __sync_synchronize();
+}
+
+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
+// point elements) from a into memory using a non-temporal memory hint.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps
+FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+    __builtin_nontemporal_store(a, (float32x4_t *) p);
+#else
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
+#endif
+}
+
+// Stores the data in a to the address p without polluting the caches.  If the
+// cache line containing address p is already in the cache, the cache will be
+// updated.
+// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
+FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+    __builtin_nontemporal_store(a, p);
+#else
+    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
+#endif
+}
+
+// Load 128-bits of integer data from memory into dst using a non-temporal
+// memory hint. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+//
+//   dst[127:0] := MEM[mem_addr+127:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128
+FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+    return __builtin_nontemporal_load(p);
+#else
+    return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
+#endif
+}
+
+// Cache line containing p is flushed and invalidated from all caches in the
+// coherency domain. :
+// https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
+FORCE_INLINE void _mm_clflush(void const *p)
+{
+    (void) p;
+    // no corollary for Neon?
+}
+
+// Allocate aligned blocks of memory.
+// https://software.intel.com/en-us/
+//         cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
+FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
+{
+    void *ptr;
+    if (align == 1)
+        return malloc(size);
+    if (align == 2 || (sizeof(void *) == 8 && align == 4))
+        align = sizeof(void *);
+    if (!posix_memalign(&ptr, align, size))
+        return ptr;
+    return NULL;
+}
+
+FORCE_INLINE void _mm_free(void *addr)
+{
+    free(addr);
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 8-bit integer v.
+// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
+FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
+#else
+    crc ^= v;
+    for (int bit = 0; bit < 8; bit++) {
+        if (crc & 1)
+            crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
+        else
+            crc = (crc >> 1);
+    }
+#endif
+    return crc;
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 16-bit integer v.
+// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
+FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
+#else
+    crc = _mm_crc32_u8(crc, v & 0xff);
+    crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
+#endif
+    return crc;
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 32-bit integer v.
+// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
+FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
+#else
+    crc = _mm_crc32_u16(crc, v & 0xffff);
+    crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
+#endif
+    return crc;
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 64-bit integer v.
+// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
+FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
+#else
+    crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff);
+    crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff);
+#endif
+    return crc;
+}
+
+#if defined(__GNUC__) || defined(__clang__)
+#pragma pop_macro("ALIGN_STRUCT")
+#pragma pop_macro("FORCE_INLINE")
+#endif
+
+#if defined(__GNUC__)
+#pragma GCC pop_options
+#endif
+
+#endif
diff --git a/test_pffft.cpp b/test_pffft.cpp
index 4104a1b..d388563 100644
--- a/test_pffft.cpp
+++ b/test_pffft.cpp
@@ -120,10 +120,10 @@ Ttest(int N, bool useOrdered)
         (m % 4) * 0.125 * M_PI; /* have phi0 < 90 deg to be normalized */
       for (j = 0; j < N; ++j) {
         if (cplx) {
-          Xs[2 * j] = amp * cos(phi);     /* real part */
-          Xs[2 * j + 1] = amp * sin(phi); /* imag part */
+          Xs[2 * j] = (FftScalar)( amp * cos(phi) );     /* real part */
+          Xs[2 * j + 1] = (FftScalar)( amp * sin(phi) ); /* imag part */
         } else
-          Xs[j] = amp * cos(phi); /* only real part */
+          Xs[j] = (FftScalar)( amp * cos(phi) ); /* only real part */
 
         /* phase increment .. stay normalized - cos()/sin() might degrade! */
         phi += dPhi;
diff --git a/use_gcc8.inc b/use_gcc8.inc
index c4535f1..c4535f1 100755..100644
--- a/use_gcc8.inc
+++ b/use_gcc8.inc
author	Haibo Huang <hhb@google.com>	2021-01-06 03:39:50 +0000
committer	Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>	2021-01-06 03:39:50 +0000
commit	f1dc9824465ecb52ba7220a5f4d02b0858f3fcd5 (patch)
tree	7e5b45d174a2d4b038e8842bb4e0fbed7ec77914
parent	f72b8bc0eac730df148031eea3a5155e1391c869 (diff)
parent	56310291a52a05dd2b856fcac112a3f91fb5a76d (diff)
download	pffft-f1dc9824465ecb52ba7220a5f4d02b0858f3fcd5.tar.gz