diff options
author | Julien Pommier <pommier@pianoteq.com> | 2013-01-12 19:28:03 +0100 |
---|---|---|
committer | Julien Pommier <pommier@pianoteq.com> | 2013-01-12 19:28:03 +0100 |
commit | 432b3e85e30eb6c24f10f4ce0df655cbab89cfae (patch) | |
tree | 41235f52c52853476cf3174991a1dca4893fe813 | |
parent | d3b146e5b33cbf39585b08b534a921c8292ebc5e (diff) | |
download | pffft-432b3e85e30eb6c24f10f4ce0df655cbab89cfae.tar.gz |
fixed a bug in real ffts for N multiple of 25; work around a compiler bug with clang 3.2 for arm on linux
-rw-r--r-- | README.txt | 135 | ||||
-rw-r--r-- | fftpack.c | 25 | ||||
-rw-r--r-- | pffft.c | 107 | ||||
-rw-r--r-- | pffft.h | 4 | ||||
-rw-r--r--[-rwxr-xr-x] | test_pffft.c | 51 |
5 files changed, 166 insertions, 156 deletions
@@ -91,33 +91,42 @@ operation. Benchmark results (cpu tested: core i7 2600, core 2 quad, core 1 duo, atom N270, cortex-A9) -- +The benchmark shows the performance of various fft implementations measured in +MFlops, with the number of floating point operations being defined as 5Nlog2(N) +for a length N complex fft, and 2.5*Nlog2(N) for a real fft. +See http://www.fftw.org/speed/method.html for an explanation of these formulas. + MacOS Lion, gcc 4.2, 64-bit, fftw 3.3 on a 3.4 GHz core i7 2600 Built with: gcc-4.2 -o test_pffft -arch x86_64 -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -DHAVE_VECLIB -framework veclib -DHAVE_FFTW -lfftw3f -| N (input length) | real FFTPack | real vDSP | real FFTW | real PFFFT | | cplx FFTPack | cplx vDSP | cplx FFTW | cplx PFFFT | -|------------------+--------------+--------------+--------------+--------------| |--------------+--------------+--------------+--------------| -| 64 | 2887 | 9124 | 7082 | 8140 | | 3416 | 15722 | 15705 | 11222 | -| 96 | 3356 | n/a | 8520 | 8158 | | 4038 | n/a | 16560 | 11141 | -| 128 | 3895 | 12135 | 9613 | 10321 | | 4292 | 17981 | 17054 | 12538 | -| 192 | 3941 | n/a | 10220 | 11167 | | 4388 | n/a | 16226 | 12964 | -| 256 | 4532 | 13669 | 11031 | 12779 | | 4628 | 19905 | 17259 | 14305 | -| 384 | 3512 | n/a | 11013 | 12278 | | 3645 | n/a | 16559 | 13370 | -| 512 | 3716 | 15236 | 11515 | 14376 | | 3737 | 20423 | 17050 | 14746 | -| 768 | 3756 | n/a | 11524 | 13659 | | 3748 | n/a | 16201 | 14891 | -| 1024 | 4060 | 15841 | 10393 | 15732 | | 3828 | 21555 | 15898 | 15883 | -| 2048 | 4646 | 16806 | 11888 | 15752 | | 4323 | 20802 | 15360 | 15219 | -| 4096 | 4794 | 17008 | 11866 | 15785 | | 4167 | 19842 | 14532 | 14723 | -| 8192 | 3887 | 16519 | 11290 | 12854 | | 3738 | 18923 | 12528 | 14164 | -| 9216 | 3924 | n/a | 10980 | 13211 | | 3684 | n/a | 12349 | 14474 | -| 16384 | 3907 | 16045 | 11146 | 13111 | | 3687 | 17628 | 12364 | 14176 | -| 32768 | 4279 | 15169 | 10946 | 11538 | | 3919 | 15179 | 11558 | 11911 | -| 262144 | 3423 | 11792 | 6753 | 9827 | | 2913 | 11989 | 8406 | 10960 | -| 1048576 | 3313 | 10613 | 5478 | 7142 | | 2683 | 8487 | 2826 | 5961 | -|------------------+--------------+--------------+--------------+--------------| |--------------+--------------+--------------+--------------| - +| input len |real FFTPack| real vDSP | real FFTW | real PFFFT | |cplx FFTPack| cplx vDSP | cplx FFTW | cplx PFFFT | +|-----------+------------+------------+------------+------------| |------------+------------+------------+------------| +| 64 | 2816 | 8596 | 7329 | 8187 | | 2887 | 14898 | 14668 | 11108 | +| 96 | 3298 | n/a | 8378 | 7727 | | 3953 | n/a | 15680 | 10878 | +| 128 | 3507 | 11575 | 9266 | 10108 | | 4233 | 17598 | 16427 | 12000 | +| 160 | 3391 | n/a | 9838 | 10711 | | 4220 | n/a | 16653 | 11187 | +| 192 | 3919 | n/a | 9868 | 10956 | | 4297 | n/a | 15770 | 12540 | +| 256 | 4283 | 13179 | 10694 | 13128 | | 4545 | 19550 | 16350 | 13822 | +| 384 | 3136 | n/a | 10810 | 12061 | | 3600 | n/a | 16103 | 13240 | +| 480 | 3477 | n/a | 10632 | 12074 | | 3536 | n/a | 11630 | 12522 | +| 512 | 3783 | 15141 | 11267 | 13838 | | 3649 | 20002 | 16560 | 13580 | +| 640 | 3639 | n/a | 11164 | 13946 | | 3695 | n/a | 15416 | 13890 | +| 768 | 3800 | n/a | 11245 | 13495 | | 3590 | n/a | 15802 | 14552 | +| 800 | 3440 | n/a | 10499 | 13301 | | 3659 | n/a | 12056 | 13268 | +| 1024 | 3924 | 15605 | 11450 | 15339 | | 3769 | 20963 | 13941 | 15467 | +| 2048 | 4518 | 16195 | 11551 | 15532 | | 4258 | 20413 | 13723 | 15042 | +| 2400 | 4294 | n/a | 10685 | 13078 | | 4093 | n/a | 12777 | 13119 | +| 4096 | 4750 | 16596 | 11672 | 15817 | | 4157 | 19662 | 14316 | 14336 | +| 8192 | 3820 | 16227 | 11084 | 12555 | | 3691 | 18132 | 12102 | 13813 | +| 9216 | 3864 | n/a | 10254 | 12870 | | 3586 | n/a | 12119 | 13994 | +| 16384 | 3822 | 15123 | 10454 | 12822 | | 3613 | 16874 | 12370 | 13881 | +| 32768 | 4175 | 14512 | 10662 | 11095 | | 3881 | 14702 | 11619 | 11524 | +| 262144 | 3317 | 11429 | 6269 | 9517 | | 2810 | 11729 | 7757 | 10179 | +| 1048576 | 2913 | 10551 | 4730 | 5867 | | 2661 | 7881 | 3520 | 5350 | +|-----------+------------+------------+------------+------------| |------------+------------+------------+------------| Debian 6, gcc 4.4.5, 64-bit, fftw 3.3.1 on a 3.4 GHz core i7 2600 @@ -263,30 +272,68 @@ cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c -Ubuntu 11.04 on Pandaboard, gcc-4.5.2, 32-bit, with fftw 3.3.1 beta (neon enabled), on a 1GHz ARM Cortex A9 (TI OMAP4430) +Ubuntu 12.04, gcc-4.7.3, 32-bit, with fftw 3.3.3 (built with --enable-neon), on a 1.2GHz ARM Cortex A9 (Tegra 3) Built with: -gcc-4.5 -O3 -DHAVE_FFTW -march=armv7-a -mtune=cortex-a9 -mfloat-abi=softfp -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm -I/usr/local/include/ -L/usr/local/lib/ -lfftw3f +gcc-4.7 -O3 -DHAVE_FFTW -march=armv7-a -mtune=cortex-a9 -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm -I/usr/local/include/ -L/usr/local/lib/ -lfftw3f -I must admit that the performance was a bit disappointing here... +| input len |real FFTPack| real FFTW | real PFFFT | |cplx FFTPack| cplx FFTW | cplx PFFFT | +|-----------+------------+------------+------------| |------------+------------+------------| +| 64 | 549 | 452 | 731 | | 512 | 602 | 640 | +| 96 | 421 | 272 | 702 | | 496 | 571 | 602 | +| 128 | 498 | 512 | 815 | | 597 | 618 | 652 | +| 160 | 521 | 536 | 815 | | 586 | 669 | 625 | +| 192 | 539 | 571 | 883 | | 485 | 597 | 626 | +| 256 | 640 | 539 | 975 | | 569 | 611 | 671 | +| 384 | 499 | 610 | 879 | | 499 | 602 | 637 | +| 480 | 518 | 507 | 877 | | 496 | 661 | 616 | +| 512 | 524 | 591 | 1002 | | 549 | 678 | 668 | +| 640 | 542 | 612 | 955 | | 568 | 663 | 645 | +| 768 | 557 | 613 | 981 | | 491 | 663 | 598 | +| 800 | 514 | 353 | 882 | | 514 | 360 | 574 | +| 1024 | 640 | 640 | 1067 | | 492 | 683 | 602 | +| 2048 | 587 | 640 | 908 | | 486 | 640 | 552 | +| 2400 | 479 | 368 | 777 | | 422 | 376 | 518 | +| 4096 | 511 | 614 | 853 | | 426 | 640 | 534 | +| 8192 | 415 | 584 | 708 | | 386 | 622 | 516 | +| 9216 | 419 | 571 | 687 | | 364 | 586 | 506 | +| 16384 | 426 | 577 | 716 | | 398 | 606 | 530 | +| 32768 | 417 | 572 | 673 | | 399 | 572 | 468 | +| 262144 | 219 | 380 | 293 | | 255 | 431 | 343 | +| 1048576 | 202 | 274 | 237 | | 265 | 282 | 355 | +|-----------+------------+------------+------------| |------------+------------+------------| -| N (input length) | real FFTPack | real FFTW | real PFFFT | | cplx FFTPack | cplx FFTW | cplx PFFFT | -|------------------+--------------+--------------+--------------| |--------------+--------------+--------------| -| 64 | 384 | 614 | 591 | | 404 | 1024 | 549 | -| 96 | 324 | 702 | 562 | | 337 | 864 | 503 | -| 128 | 407 | 717 | 640 | | 407 | 1086 | 543 | -| 192 | 404 | 809 | 693 | | 388 | 903 | 547 | -| 256 | 465 | 788 | 788 | | 427 | 871 | 594 | -| 384 | 392 | 814 | 687 | | 343 | 862 | 543 | -| 512 | 411 | 768 | 794 | | 372 | 940 | 583 | -| 768 | 438 | 818 | 767 | | 383 | 846 | 584 | -| 1024 | 427 | 800 | 883 | | 400 | 883 | 602 | -| 2048 | 414 | 853 | 805 | | 343 | 828 | 477 | -| 4096 | 426 | 768 | 698 | | 341 | 808 | 469 | -| 8192 | 332 | 666 | 594 | | 297 | 765 | 438 | -| 9216 | 335 | 660 | 571 | | 294 | 687 | 432 | -| 16384 | 344 | 675 | 606 | | 314 | 709 | 456 | -| 32768 | 342 | 685 | 564 | | 295 | 634 | 399 | -| 262144 | 143 | 301 | 197 | | 160 | 321 | 251 | -| 1048576 | 138 | 238 | 174 | | 173 | 212 | 253 | -|------------------+--------------+--------------+--------------| |--------------+--------------+--------------| +Same platform as above, but this time pffft and fftpack are built with clang 3.2: + +clang -O3 -DHAVE_FFTW -march=armv7-a -mtune=cortex-a9 -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm -I/usr/local/include/ -L/usr/local/lib/ -lfftw3f + +| input len |real FFTPack| real FFTW | real PFFFT | |cplx FFTPack| cplx FFTW | cplx PFFFT | +|-----------+------------+------------+------------| |------------+------------+------------| +| 64 | 427 | 452 | 853 | | 427 | 602 | 1024 | +| 96 | 351 | 276 | 843 | | 337 | 571 | 963 | +| 128 | 373 | 512 | 996 | | 390 | 618 | 1054 | +| 160 | 426 | 536 | 987 | | 375 | 669 | 914 | +| 192 | 404 | 571 | 1079 | | 388 | 588 | 1079 | +| 256 | 465 | 539 | 1205 | | 445 | 602 | 1170 | +| 384 | 366 | 610 | 1099 | | 343 | 594 | 1099 | +| 480 | 356 | 507 | 1140 | | 335 | 651 | 931 | +| 512 | 411 | 591 | 1213 | | 384 | 649 | 1124 | +| 640 | 398 | 612 | 1193 | | 373 | 654 | 901 | +| 768 | 409 | 613 | 1227 | | 383 | 663 | 1044 | +| 800 | 411 | 348 | 1073 | | 353 | 358 | 809 | +| 1024 | 427 | 640 | 1280 | | 413 | 692 | 1004 | +| 2048 | 414 | 626 | 1126 | | 371 | 640 | 853 | +| 2400 | 399 | 373 | 898 | | 319 | 368 | 653 | +| 4096 | 404 | 602 | 1059 | | 357 | 633 | 778 | +| 8192 | 332 | 584 | 792 | | 308 | 616 | 716 | +| 9216 | 322 | 561 | 783 | | 299 | 586 | 687 | +| 16384 | 344 | 568 | 778 | | 314 | 617 | 745 | +| 32768 | 342 | 564 | 737 | | 314 | 552 | 629 | +| 262144 | 201 | 383 | 313 | | 227 | 435 | 413 | +| 1048576 | 187 | 262 | 251 | | 228 | 281 | 409 | +|-----------+------------+------------+------------| |------------+------------+------------| + +So it looks like, on ARM, gcc 4.7 is the best at scalar floating point +(the fftpack performance numbers are better with gcc), while clang is +the best with neon intrinsics (see how pffft perf has improved with +clang 3.2). @@ -1493,22 +1493,14 @@ static void radf5(integer ido, integer l1, const real *cc, real *ch, for (k = 1; k <= l1; ++k) { for (i = 3; i <= ido; i += 2) { ic = idp2 - i; - dr2 = wa1[i - 2] * cc_ref(i - 1, k, 2) + wa1[i - 1] * - cc_ref(i, k, 2); - di2 = wa1[i - 2] * cc_ref(i, k, 2) - wa1[i - 1] * cc_ref( - i - 1, k, 2); - dr3 = wa2[i - 2] * cc_ref(i - 1, k, 3) + wa2[i - 1] * - cc_ref(i, k, 3); - di3 = wa2[i - 2] * cc_ref(i, k, 3) - wa2[i - 1] * cc_ref( - i - 1, k, 3); - dr4 = wa3[i - 2] * cc_ref(i - 1, k, 4) + wa3[i - 1] * - cc_ref(i, k, 4); - di4 = wa3[i - 2] * cc_ref(i, k, 4) - wa3[i - 1] * cc_ref( - i - 1, k, 4); - dr5 = wa4[i - 2] * cc_ref(i - 1, k, 5) + wa4[i - 1] * - cc_ref(i, k, 5); - di5 = wa4[i - 2] * cc_ref(i, k, 5) - wa4[i - 1] * cc_ref( - i - 1, k, 5); + dr2 = wa1[i - 2] * cc_ref(i - 1, k, 2) + wa1[i - 1] * cc_ref(i, k, 2); + di2 = wa1[i - 2] * cc_ref(i, k, 2) - wa1[i - 1] * cc_ref(i - 1, k, 2); + dr3 = wa2[i - 2] * cc_ref(i - 1, k, 3) + wa2[i - 1] * cc_ref(i, k, 3); + di3 = wa2[i - 2] * cc_ref(i, k, 3) - wa2[i - 1] * cc_ref(i - 1, k, 3); + dr4 = wa3[i - 2] * cc_ref(i - 1, k, 4) + wa3[i - 1] * cc_ref(i, k, 4); + di4 = wa3[i - 2] * cc_ref(i, k, 4) - wa3[i - 1] * cc_ref(i - 1, k, 4); + dr5 = wa4[i - 2] * cc_ref(i - 1, k, 5) + wa4[i - 1] * cc_ref(i, k, 5); + di5 = wa4[i - 2] * cc_ref(i, k, 5) - wa4[i - 1] * cc_ref(i - 1, k, 5); cr2 = dr2 + dr5; ci5 = dr5 - dr2; cr5 = di2 - di5; @@ -2125,6 +2117,7 @@ static void rfftf1(integer n, real *c, real *ch, const real *wa, integer *ifac) ix3 = ix2 + ido; ix4 = ix3 + ido; radf5(ido, l1, na ? ch : c, na ? c : ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]); + break; default: if (ido == 1) { na = 1 - na; @@ -1,4 +1,4 @@ -/* Copyright (c) 2011 Julien Pommier ( pommier@modartt.com ) +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) Based on original fortran 77 code from FFTPACKv4 from NETLIB (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber @@ -71,7 +71,7 @@ #endif #if defined(COMPILER_GCC) -# define ALWAYS_INLINE(return_type) return_type __attribute__ ((always_inline)) +# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline)) # define NEVER_INLINE(return_type) return_type __attribute__ ((noinline)) # define RESTRICT __restrict # define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__]; @@ -160,7 +160,7 @@ typedef float32x4_t v4sf; # define LD_PS1(p) vld1q_dup_f32(&(p)) # define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } # define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } -# define VTRANSPOSE4_(x0,x1,x2,x3) { \ +# define VTRANSPOSE4(x0,x1,x2,x3) { \ float32x4x2_t t0_ = vzipq_f32(x0, x2); \ float32x4x2_t t1_ = vzipq_f32(x1, x3); \ float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \ @@ -168,7 +168,7 @@ typedef float32x4_t v4sf; x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \ } // marginally faster version -# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); } +//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); } # define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a)) # define VALIGNED(ptr) ((((long)(ptr)) & 0x3) == 0) #else @@ -825,14 +825,14 @@ static void radf5_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT for (k = 1; k <= l1; ++k) { for (i = 3; i <= ido; i += 2) { ic = idp2 - i; - dr2 = LD_PS1(wa1[i-2]); di2 = LD_PS1(wa1[i-1]); - dr3 = LD_PS1(wa2[i-2]); di3 = LD_PS1(wa2[i-1]); - dr4 = LD_PS1(wa3[i-2]); di4 = LD_PS1(wa3[i-1]); - dr5 = LD_PS1(wa4[i-2]); di5 = LD_PS1(wa4[i-1]); - VCPLXMUL(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2)); - VCPLXMUL(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3)); - VCPLXMUL(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4)); - VCPLXMUL(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5)); + dr2 = LD_PS1(wa1[i-3]); di2 = LD_PS1(wa1[i-2]); + dr3 = LD_PS1(wa2[i-3]); di3 = LD_PS1(wa2[i-2]); + dr4 = LD_PS1(wa3[i-3]); di4 = LD_PS1(wa3[i-2]); + dr5 = LD_PS1(wa4[i-3]); di5 = LD_PS1(wa4[i-2]); + VCPLXMULCONJ(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2)); + VCPLXMULCONJ(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3)); + VCPLXMULCONJ(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4)); + VCPLXMULCONJ(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5)); cr2 = VADD(dr2, dr5); ci5 = VSUB(dr5, dr2); cr5 = VSUB(di2, di5); @@ -842,21 +842,21 @@ static void radf5_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT cr4 = VSUB(di3, di4); ci3 = VADD(di3, di4); ch_ref(i - 1, 1, k) = VADD(cc_ref(i - 1, k, 1), VADD(cr2, cr3)); - ch_ref(i, 1, k) = VADD(cc_ref(i, k, 1), VADD(ci2, ci3)); + ch_ref(i, 1, k) = VSUB(cc_ref(i, k, 1), VADD(ci2, ci3));// tr2 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3))); - ti2 = VADD(cc_ref(i, k, 1), VADD(SVMUL(tr11, ci2), SVMUL(tr12, ci3))); + ti2 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr11, ci2), SVMUL(tr12, ci3)));// tr3 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3))); - ti3 = VADD(cc_ref(i, k, 1), VADD(SVMUL(tr12, ci2), SVMUL(tr11, ci3))); + ti3 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr12, ci2), SVMUL(tr11, ci3)));// tr5 = VADD(SVMUL(ti11, cr5), SVMUL(ti12, cr4)); ti5 = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4)); tr4 = VSUB(SVMUL(ti12, cr5), SVMUL(ti11, cr4)); ti4 = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4)); - ch_ref(i - 1, 3, k) = VADD(tr2, tr5); - ch_ref(ic - 1, 2, k) = VSUB(tr2, tr5); + ch_ref(i - 1, 3, k) = VSUB(tr2, tr5); + ch_ref(ic - 1, 2, k) = VADD(tr2, tr5); ch_ref(i, 3, k) = VADD(ti2, ti5); ch_ref(ic, 2, k) = VSUB(ti5, ti2); - ch_ref(i - 1, 5, k) = VADD(tr3, tr4); - ch_ref(ic - 1, 4, k) = VSUB(tr3, tr4); + ch_ref(i - 1, 5, k) = VSUB(tr3, tr4); + ch_ref(ic - 1, 4, k) = VADD(tr3, tr4); ch_ref(i, 5, k) = VADD(ti3, ti4); ch_ref(ic, 4, k) = VSUB(ti4, ti3); } @@ -939,10 +939,10 @@ static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch dr2 = VSUB(cr2, ci5); di5 = VSUB(ci2, cr5); di2 = VADD(ci2, cr5); - VCPLXMUL(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1])); - VCPLXMUL(dr3, di3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1])); - VCPLXMUL(dr4, di4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1])); - VCPLXMUL(dr5, di5, LD_PS1(wa4[i-2]), LD_PS1(wa4[i-1])); + VCPLXMUL(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2])); + VCPLXMUL(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2])); + VCPLXMUL(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2])); + VCPLXMUL(dr5, di5, LD_PS1(wa4[i-3]), LD_PS1(wa4[i-2])); ch_ref(i-1, k, 2) = dr2; ch_ref(i, k, 2) = di2; ch_ref(i-1, k, 3) = dr3; ch_ref(i, k, 3) = di3; @@ -1411,7 +1411,7 @@ void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) } -ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in, +static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in, const v4sf *e, v4sf *out) { v4sf r0, i0, r1, i1, r2, i2, r3, i3; v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; @@ -1512,7 +1512,7 @@ static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *o } -ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, +static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, const v4sf *e, v4sf *out, int first) { v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7]; /* @@ -1617,8 +1617,6 @@ void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *fo v4sf *buff[2] = { voutput, scratch ? scratch : scratch_on_stack }; int ib = (nf_odd ^ ordered ? 1 : 0); - if (scratch == 0) scratch = scratch_on_stack; - assert(VALIGNED(finput) && VALIGNED(foutput)); //assert(finput != foutput); @@ -1675,7 +1673,7 @@ void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *fo } void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) { - int i, Ncvec = s->Ncvec; + int Ncvec = s->Ncvec; const v4sf * RESTRICT va = (const v4sf*)a; const v4sf * RESTRICT vb = (const v4sf*)b; v4sf * RESTRICT vab = (v4sf*)ab; @@ -1693,10 +1691,16 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, __builtin_prefetch(va+6); __builtin_prefetch(vb+6); __builtin_prefetch(vab+6); +# ifndef __clang__ +# define ZCONVOLVE_USING_INLINE_NEON_ASM +# endif #endif float ar, ai, br, bi, abr, abi; +#ifndef ZCONVOLVE_USING_INLINE_ASM v4sf vscal = LD_PS1(scaling); + int i; +#endif assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab)); ar = ((v4sf_union*)va)[0].f[0]; @@ -1706,8 +1710,7 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, abr = ((v4sf_union*)vab)[0].f[0]; abi = ((v4sf_union*)vab)[1].f[0]; -#ifdef __arm__ -# if 1 // inline asm version +#ifdef ZCONVOLVE_USING_INLINE_ASM // inline asm version, unfortunately miscompiled by clang 3.2, at least on ubuntu.. so this will be restricted to gcc const float *a_ = a, *b_ = b; float *ab_ = ab; int N = Ncvec; asm volatile("mov r8, %2 \n" @@ -1743,49 +1746,7 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, "subs %3, #2 \n" "bne 1b \n" : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory"); - -# else // neon instrinsics version, 30% slower that the asm one with gcc 4.6 - v4sf a1r, a1i, b1r, b1i; - v4sf a2r, a2i, b2r, b2i; - v4sf ab1r, ab1i, ab2r, ab2i; - for (i=0; i < Ncvec; i += 2) { - __builtin_prefetch(va+8); - __builtin_prefetch(va+10); - - a1r = *va++; a1i = *va++; - a2r = *va++; a2i = *va++; - b1r = *vb++; b1i = *vb++; - b2r = *vb++; b2i = *vb++; - ab1r = vab[0]; ab1i = vab[1]; - ab2r = vab[2]; ab2i = vab[3]; - - v4sf z1r = VMUL(a1r, b1r); - v4sf z2r = VMUL(a2r, b2r); - v4sf z1i = VMUL(a1r, b1i); - v4sf z2i = VMUL(a2r, b2i); - - __builtin_prefetch(vb+4); - __builtin_prefetch(vb+6); - - z1r = vmlsq_f32(z1r, a1i, b1i); - z2r = vmlsq_f32(z2r, a2i, b2i); - z1i = vmlaq_f32(z1i, a1i, b1r); - z2i = vmlaq_f32(z2i, a2i, b2r); - - __builtin_prefetch(vab+4); - __builtin_prefetch(vab+6); - - ab1r = vmlaq_f32(ab1r, z1r, vscal); - ab2r = vmlaq_f32(ab2r, z2r, vscal); - ab1i = vmlaq_f32(ab1i, z1i, vscal); - ab2i = vmlaq_f32(ab2i, z2i, vscal); - - *vab++ = ab1r; *vab++ = ab1i; - *vab++ = ab2r; *vab++ = ab2i; - } -# endif - -#else // not ARM, no need to use a special routine +#else // default routine, works fine for non-arm cpus with current compilers for (i=0; i < Ncvec; i += 2) { v4sf ar, ai, br, bi; ar = va[2*i+0]; ai = va[2*i+1]; @@ -1,4 +1,4 @@ -/* Copyright (c) 2011 Julien Pommier ( pommier@modartt.com ) +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) Based on original fortran 77 code from FFTPACKv4 from NETLIB, authored by Dr Paul Swarztrauber of NCAR, in 1985. @@ -115,7 +115,7 @@ extern "C" { The 'work' pointer should point to an area of N (2*N for complex fft) floats, properly aligned. If 'work' is NULL, then stack will - be used instead (this is probably the beest strategy for small + be used instead (this is probably the best strategy for small FFTs, say for N < 16384). input and output may alias. diff --git a/test_pffft.c b/test_pffft.c index f2c1ef3..512f0ae 100755..100644 --- a/test_pffft.c +++ b/test_pffft.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2011 Julien Pommier. + Copyright (c) 2013 Julien Pommier. Small test & bench for PFFFT, comparing its performance with the scalar FFTPACK, FFTW, and Apple vDSP @@ -86,7 +86,7 @@ void pffft_validate_N(int N, int cplx) { if (pass == 0) { float *wrk = malloc(2*Nbytes+15*sizeof(float)); for (k=0; k < Nfloat; ++k) { - ref[k] = in[k] = frand(); //sqrt((float)(k+1)); + ref[k] = in[k] = frand()*2-1; out[k] = 1e30; } if (!cplx) { @@ -165,24 +165,33 @@ void pffft_validate_N(int N, int cplx) { } // quick test of the circular convolution in fft domain - pffft_zreorder(s, ref, tmp, PFFFT_FORWARD); - memset(out, 0, Nbytes); - pffft_zconvolve_accumulate(s, ref, ref, out, 1.0); - pffft_zreorder(s, out, tmp2, PFFFT_FORWARD); - - for (k=0; k < Nfloat; k += 2) { - float ar = tmp[k], ai=tmp[k+1]; - if (k || cplx) { - tmp[k] = ar*ar - ai*ai; - tmp[k+1] = 2*ar*ai; - } else { - tmp[0] = ar*ar; - tmp[1] = ai*ai; - } - } + { + float conv_err = 0, conv_max = 0; - for (k=0; k < Nfloat; ++k) { - assert(fabs(tmp[k] - tmp2[k]) < 1e-2*ref_max); + pffft_zreorder(s, ref, tmp, PFFFT_FORWARD); + memset(out, 0, Nbytes); + pffft_zconvolve_accumulate(s, ref, ref, out, 1.0); + pffft_zreorder(s, out, tmp2, PFFFT_FORWARD); + + for (k=0; k < Nfloat; k += 2) { + float ar = tmp[k], ai=tmp[k+1]; + if (cplx || k > 0) { + tmp[k] = ar*ar - ai*ai; + tmp[k+1] = 2*ar*ai; + } else { + tmp[0] = ar*ar; + tmp[1] = ai*ai; + } + } + + for (k=0; k < Nfloat; ++k) { + float d = fabs(tmp[k] - tmp2[k]), e = fabs(tmp[k]); + if (d > conv_err) conv_err = d; + if (e > conv_max) conv_max = e; + } + if (conv_err > 1e-5*conv_max) { + printf("zconvolve error ? %g %g\n", conv_err, conv_max); exit(1); + } } } @@ -198,7 +207,7 @@ void pffft_validate_N(int N, int cplx) { } void pffft_validate(int cplx) { - static int Ntest[] = { 16, 32, 64, 96, 128, 160, 192, 256, 288, 384, 5*96, 512, 576, 5*128, 864, 1024, 2048, 2592, 4096, 36864, 0}; + static int Ntest[] = { 16, 32, 64, 96, 128, 160, 192, 256, 288, 384, 5*96, 512, 576, 5*128, 800, 864, 1024, 2048, 2592, 4000, 4096, 12000, 36864, 0}; int k; for (k = 0; Ntest[k]; ++k) { int N = Ntest[k]; @@ -357,7 +366,7 @@ void validate_pffft_simd(); // a small function inside pffft.c that will detect #endif int main(int argc, char **argv) { - int Nvalues[] = { 64, 96, 128, 160, 192, 256, 384, 5*96, 512, 5*128, 3*256, 1024, 2048, 4096, 8192, 9*1024, 16384, 32768, 256*1024, 1024*1024, -1 }; + int Nvalues[] = { 64, 96, 128, 160, 192, 256, 384, 5*96, 512, 5*128, 3*256, 800, 1024, 2048, 2400, 4096, 8192, 9*1024, 16384, 32768, 256*1024, 1024*1024, -1 }; int i; if (argc > 1 && strcmp(argv[1], "--array-format") == 0) { |