aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJulien Pommier <pommier@pianoteq.com>2013-01-12 19:28:03 +0100
committerJulien Pommier <pommier@pianoteq.com>2013-01-12 19:28:03 +0100
commit432b3e85e30eb6c24f10f4ce0df655cbab89cfae (patch)
tree41235f52c52853476cf3174991a1dca4893fe813
parentd3b146e5b33cbf39585b08b534a921c8292ebc5e (diff)
downloadpffft-432b3e85e30eb6c24f10f4ce0df655cbab89cfae.tar.gz
fixed a bug in real ffts for N multiple of 25; work around a compiler bug with clang 3.2 for arm on linux
-rw-r--r--README.txt135
-rw-r--r--fftpack.c25
-rw-r--r--pffft.c107
-rw-r--r--pffft.h4
-rw-r--r--[-rwxr-xr-x]test_pffft.c51
5 files changed, 166 insertions, 156 deletions
diff --git a/README.txt b/README.txt
index 38d3fd7..37770d8 100644
--- a/README.txt
+++ b/README.txt
@@ -91,33 +91,42 @@ operation.
Benchmark results (cpu tested: core i7 2600, core 2 quad, core 1 duo, atom N270, cortex-A9)
--
+The benchmark shows the performance of various fft implementations measured in
+MFlops, with the number of floating point operations being defined as 5Nlog2(N)
+for a length N complex fft, and 2.5*Nlog2(N) for a real fft.
+See http://www.fftw.org/speed/method.html for an explanation of these formulas.
+
MacOS Lion, gcc 4.2, 64-bit, fftw 3.3 on a 3.4 GHz core i7 2600
Built with:
gcc-4.2 -o test_pffft -arch x86_64 -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -DHAVE_VECLIB -framework veclib -DHAVE_FFTW -lfftw3f
-| N (input length) | real FFTPack | real vDSP | real FFTW | real PFFFT | | cplx FFTPack | cplx vDSP | cplx FFTW | cplx PFFFT |
-|------------------+--------------+--------------+--------------+--------------| |--------------+--------------+--------------+--------------|
-| 64 | 2887 | 9124 | 7082 | 8140 | | 3416 | 15722 | 15705 | 11222 |
-| 96 | 3356 | n/a | 8520 | 8158 | | 4038 | n/a | 16560 | 11141 |
-| 128 | 3895 | 12135 | 9613 | 10321 | | 4292 | 17981 | 17054 | 12538 |
-| 192 | 3941 | n/a | 10220 | 11167 | | 4388 | n/a | 16226 | 12964 |
-| 256 | 4532 | 13669 | 11031 | 12779 | | 4628 | 19905 | 17259 | 14305 |
-| 384 | 3512 | n/a | 11013 | 12278 | | 3645 | n/a | 16559 | 13370 |
-| 512 | 3716 | 15236 | 11515 | 14376 | | 3737 | 20423 | 17050 | 14746 |
-| 768 | 3756 | n/a | 11524 | 13659 | | 3748 | n/a | 16201 | 14891 |
-| 1024 | 4060 | 15841 | 10393 | 15732 | | 3828 | 21555 | 15898 | 15883 |
-| 2048 | 4646 | 16806 | 11888 | 15752 | | 4323 | 20802 | 15360 | 15219 |
-| 4096 | 4794 | 17008 | 11866 | 15785 | | 4167 | 19842 | 14532 | 14723 |
-| 8192 | 3887 | 16519 | 11290 | 12854 | | 3738 | 18923 | 12528 | 14164 |
-| 9216 | 3924 | n/a | 10980 | 13211 | | 3684 | n/a | 12349 | 14474 |
-| 16384 | 3907 | 16045 | 11146 | 13111 | | 3687 | 17628 | 12364 | 14176 |
-| 32768 | 4279 | 15169 | 10946 | 11538 | | 3919 | 15179 | 11558 | 11911 |
-| 262144 | 3423 | 11792 | 6753 | 9827 | | 2913 | 11989 | 8406 | 10960 |
-| 1048576 | 3313 | 10613 | 5478 | 7142 | | 2683 | 8487 | 2826 | 5961 |
-|------------------+--------------+--------------+--------------+--------------| |--------------+--------------+--------------+--------------|
-
+| input len |real FFTPack| real vDSP | real FFTW | real PFFFT | |cplx FFTPack| cplx vDSP | cplx FFTW | cplx PFFFT |
+|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
+| 64 | 2816 | 8596 | 7329 | 8187 | | 2887 | 14898 | 14668 | 11108 |
+| 96 | 3298 | n/a | 8378 | 7727 | | 3953 | n/a | 15680 | 10878 |
+| 128 | 3507 | 11575 | 9266 | 10108 | | 4233 | 17598 | 16427 | 12000 |
+| 160 | 3391 | n/a | 9838 | 10711 | | 4220 | n/a | 16653 | 11187 |
+| 192 | 3919 | n/a | 9868 | 10956 | | 4297 | n/a | 15770 | 12540 |
+| 256 | 4283 | 13179 | 10694 | 13128 | | 4545 | 19550 | 16350 | 13822 |
+| 384 | 3136 | n/a | 10810 | 12061 | | 3600 | n/a | 16103 | 13240 |
+| 480 | 3477 | n/a | 10632 | 12074 | | 3536 | n/a | 11630 | 12522 |
+| 512 | 3783 | 15141 | 11267 | 13838 | | 3649 | 20002 | 16560 | 13580 |
+| 640 | 3639 | n/a | 11164 | 13946 | | 3695 | n/a | 15416 | 13890 |
+| 768 | 3800 | n/a | 11245 | 13495 | | 3590 | n/a | 15802 | 14552 |
+| 800 | 3440 | n/a | 10499 | 13301 | | 3659 | n/a | 12056 | 13268 |
+| 1024 | 3924 | 15605 | 11450 | 15339 | | 3769 | 20963 | 13941 | 15467 |
+| 2048 | 4518 | 16195 | 11551 | 15532 | | 4258 | 20413 | 13723 | 15042 |
+| 2400 | 4294 | n/a | 10685 | 13078 | | 4093 | n/a | 12777 | 13119 |
+| 4096 | 4750 | 16596 | 11672 | 15817 | | 4157 | 19662 | 14316 | 14336 |
+| 8192 | 3820 | 16227 | 11084 | 12555 | | 3691 | 18132 | 12102 | 13813 |
+| 9216 | 3864 | n/a | 10254 | 12870 | | 3586 | n/a | 12119 | 13994 |
+| 16384 | 3822 | 15123 | 10454 | 12822 | | 3613 | 16874 | 12370 | 13881 |
+| 32768 | 4175 | 14512 | 10662 | 11095 | | 3881 | 14702 | 11619 | 11524 |
+| 262144 | 3317 | 11429 | 6269 | 9517 | | 2810 | 11729 | 7757 | 10179 |
+| 1048576 | 2913 | 10551 | 4730 | 5867 | | 2661 | 7881 | 3520 | 5350 |
+|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
Debian 6, gcc 4.4.5, 64-bit, fftw 3.3.1 on a 3.4 GHz core i7 2600
@@ -263,30 +272,68 @@ cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
-Ubuntu 11.04 on Pandaboard, gcc-4.5.2, 32-bit, with fftw 3.3.1 beta (neon enabled), on a 1GHz ARM Cortex A9 (TI OMAP4430)
+Ubuntu 12.04, gcc-4.7.3, 32-bit, with fftw 3.3.3 (built with --enable-neon), on a 1.2GHz ARM Cortex A9 (Tegra 3)
Built with:
-gcc-4.5 -O3 -DHAVE_FFTW -march=armv7-a -mtune=cortex-a9 -mfloat-abi=softfp -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm -I/usr/local/include/ -L/usr/local/lib/ -lfftw3f
+gcc-4.7 -O3 -DHAVE_FFTW -march=armv7-a -mtune=cortex-a9 -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm -I/usr/local/include/ -L/usr/local/lib/ -lfftw3f
-I must admit that the performance was a bit disappointing here...
+| input len |real FFTPack| real FFTW | real PFFFT | |cplx FFTPack| cplx FFTW | cplx PFFFT |
+|-----------+------------+------------+------------| |------------+------------+------------|
+| 64 | 549 | 452 | 731 | | 512 | 602 | 640 |
+| 96 | 421 | 272 | 702 | | 496 | 571 | 602 |
+| 128 | 498 | 512 | 815 | | 597 | 618 | 652 |
+| 160 | 521 | 536 | 815 | | 586 | 669 | 625 |
+| 192 | 539 | 571 | 883 | | 485 | 597 | 626 |
+| 256 | 640 | 539 | 975 | | 569 | 611 | 671 |
+| 384 | 499 | 610 | 879 | | 499 | 602 | 637 |
+| 480 | 518 | 507 | 877 | | 496 | 661 | 616 |
+| 512 | 524 | 591 | 1002 | | 549 | 678 | 668 |
+| 640 | 542 | 612 | 955 | | 568 | 663 | 645 |
+| 768 | 557 | 613 | 981 | | 491 | 663 | 598 |
+| 800 | 514 | 353 | 882 | | 514 | 360 | 574 |
+| 1024 | 640 | 640 | 1067 | | 492 | 683 | 602 |
+| 2048 | 587 | 640 | 908 | | 486 | 640 | 552 |
+| 2400 | 479 | 368 | 777 | | 422 | 376 | 518 |
+| 4096 | 511 | 614 | 853 | | 426 | 640 | 534 |
+| 8192 | 415 | 584 | 708 | | 386 | 622 | 516 |
+| 9216 | 419 | 571 | 687 | | 364 | 586 | 506 |
+| 16384 | 426 | 577 | 716 | | 398 | 606 | 530 |
+| 32768 | 417 | 572 | 673 | | 399 | 572 | 468 |
+| 262144 | 219 | 380 | 293 | | 255 | 431 | 343 |
+| 1048576 | 202 | 274 | 237 | | 265 | 282 | 355 |
+|-----------+------------+------------+------------| |------------+------------+------------|
-| N (input length) | real FFTPack | real FFTW | real PFFFT | | cplx FFTPack | cplx FFTW | cplx PFFFT |
-|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
-| 64 | 384 | 614 | 591 | | 404 | 1024 | 549 |
-| 96 | 324 | 702 | 562 | | 337 | 864 | 503 |
-| 128 | 407 | 717 | 640 | | 407 | 1086 | 543 |
-| 192 | 404 | 809 | 693 | | 388 | 903 | 547 |
-| 256 | 465 | 788 | 788 | | 427 | 871 | 594 |
-| 384 | 392 | 814 | 687 | | 343 | 862 | 543 |
-| 512 | 411 | 768 | 794 | | 372 | 940 | 583 |
-| 768 | 438 | 818 | 767 | | 383 | 846 | 584 |
-| 1024 | 427 | 800 | 883 | | 400 | 883 | 602 |
-| 2048 | 414 | 853 | 805 | | 343 | 828 | 477 |
-| 4096 | 426 | 768 | 698 | | 341 | 808 | 469 |
-| 8192 | 332 | 666 | 594 | | 297 | 765 | 438 |
-| 9216 | 335 | 660 | 571 | | 294 | 687 | 432 |
-| 16384 | 344 | 675 | 606 | | 314 | 709 | 456 |
-| 32768 | 342 | 685 | 564 | | 295 | 634 | 399 |
-| 262144 | 143 | 301 | 197 | | 160 | 321 | 251 |
-| 1048576 | 138 | 238 | 174 | | 173 | 212 | 253 |
-|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
+Same platform as above, but this time pffft and fftpack are built with clang 3.2:
+
+clang -O3 -DHAVE_FFTW -march=armv7-a -mtune=cortex-a9 -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm -I/usr/local/include/ -L/usr/local/lib/ -lfftw3f
+
+| input len |real FFTPack| real FFTW | real PFFFT | |cplx FFTPack| cplx FFTW | cplx PFFFT |
+|-----------+------------+------------+------------| |------------+------------+------------|
+| 64 | 427 | 452 | 853 | | 427 | 602 | 1024 |
+| 96 | 351 | 276 | 843 | | 337 | 571 | 963 |
+| 128 | 373 | 512 | 996 | | 390 | 618 | 1054 |
+| 160 | 426 | 536 | 987 | | 375 | 669 | 914 |
+| 192 | 404 | 571 | 1079 | | 388 | 588 | 1079 |
+| 256 | 465 | 539 | 1205 | | 445 | 602 | 1170 |
+| 384 | 366 | 610 | 1099 | | 343 | 594 | 1099 |
+| 480 | 356 | 507 | 1140 | | 335 | 651 | 931 |
+| 512 | 411 | 591 | 1213 | | 384 | 649 | 1124 |
+| 640 | 398 | 612 | 1193 | | 373 | 654 | 901 |
+| 768 | 409 | 613 | 1227 | | 383 | 663 | 1044 |
+| 800 | 411 | 348 | 1073 | | 353 | 358 | 809 |
+| 1024 | 427 | 640 | 1280 | | 413 | 692 | 1004 |
+| 2048 | 414 | 626 | 1126 | | 371 | 640 | 853 |
+| 2400 | 399 | 373 | 898 | | 319 | 368 | 653 |
+| 4096 | 404 | 602 | 1059 | | 357 | 633 | 778 |
+| 8192 | 332 | 584 | 792 | | 308 | 616 | 716 |
+| 9216 | 322 | 561 | 783 | | 299 | 586 | 687 |
+| 16384 | 344 | 568 | 778 | | 314 | 617 | 745 |
+| 32768 | 342 | 564 | 737 | | 314 | 552 | 629 |
+| 262144 | 201 | 383 | 313 | | 227 | 435 | 413 |
+| 1048576 | 187 | 262 | 251 | | 228 | 281 | 409 |
+|-----------+------------+------------+------------| |------------+------------+------------|
+
+So it looks like, on ARM, gcc 4.7 is the best at scalar floating point
+(the fftpack performance numbers are better with gcc), while clang is
+the best with neon intrinsics (see how pffft perf has improved with
+clang 3.2).
diff --git a/fftpack.c b/fftpack.c
index bc2c000..b6375a8 100644
--- a/fftpack.c
+++ b/fftpack.c
@@ -1493,22 +1493,14 @@ static void radf5(integer ido, integer l1, const real *cc, real *ch,
for (k = 1; k <= l1; ++k) {
for (i = 3; i <= ido; i += 2) {
ic = idp2 - i;
- dr2 = wa1[i - 2] * cc_ref(i - 1, k, 2) + wa1[i - 1] *
- cc_ref(i, k, 2);
- di2 = wa1[i - 2] * cc_ref(i, k, 2) - wa1[i - 1] * cc_ref(
- i - 1, k, 2);
- dr3 = wa2[i - 2] * cc_ref(i - 1, k, 3) + wa2[i - 1] *
- cc_ref(i, k, 3);
- di3 = wa2[i - 2] * cc_ref(i, k, 3) - wa2[i - 1] * cc_ref(
- i - 1, k, 3);
- dr4 = wa3[i - 2] * cc_ref(i - 1, k, 4) + wa3[i - 1] *
- cc_ref(i, k, 4);
- di4 = wa3[i - 2] * cc_ref(i, k, 4) - wa3[i - 1] * cc_ref(
- i - 1, k, 4);
- dr5 = wa4[i - 2] * cc_ref(i - 1, k, 5) + wa4[i - 1] *
- cc_ref(i, k, 5);
- di5 = wa4[i - 2] * cc_ref(i, k, 5) - wa4[i - 1] * cc_ref(
- i - 1, k, 5);
+ dr2 = wa1[i - 2] * cc_ref(i - 1, k, 2) + wa1[i - 1] * cc_ref(i, k, 2);
+ di2 = wa1[i - 2] * cc_ref(i, k, 2) - wa1[i - 1] * cc_ref(i - 1, k, 2);
+ dr3 = wa2[i - 2] * cc_ref(i - 1, k, 3) + wa2[i - 1] * cc_ref(i, k, 3);
+ di3 = wa2[i - 2] * cc_ref(i, k, 3) - wa2[i - 1] * cc_ref(i - 1, k, 3);
+ dr4 = wa3[i - 2] * cc_ref(i - 1, k, 4) + wa3[i - 1] * cc_ref(i, k, 4);
+ di4 = wa3[i - 2] * cc_ref(i, k, 4) - wa3[i - 1] * cc_ref(i - 1, k, 4);
+ dr5 = wa4[i - 2] * cc_ref(i - 1, k, 5) + wa4[i - 1] * cc_ref(i, k, 5);
+ di5 = wa4[i - 2] * cc_ref(i, k, 5) - wa4[i - 1] * cc_ref(i - 1, k, 5);
cr2 = dr2 + dr5;
ci5 = dr5 - dr2;
cr5 = di2 - di5;
@@ -2125,6 +2117,7 @@ static void rfftf1(integer n, real *c, real *ch, const real *wa, integer *ifac)
ix3 = ix2 + ido;
ix4 = ix3 + ido;
radf5(ido, l1, na ? ch : c, na ? c : ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
+ break;
default:
if (ido == 1) {
na = 1 - na;
diff --git a/pffft.c b/pffft.c
index fc4e314..0603f2b 100644
--- a/pffft.c
+++ b/pffft.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011 Julien Pommier ( pommier@modartt.com )
+/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
Based on original fortran 77 code from FFTPACKv4 from NETLIB
(http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
@@ -71,7 +71,7 @@
#endif
#if defined(COMPILER_GCC)
-# define ALWAYS_INLINE(return_type) return_type __attribute__ ((always_inline))
+# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
# define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
# define RESTRICT __restrict
# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
@@ -160,7 +160,7 @@ typedef float32x4_t v4sf;
# define LD_PS1(p) vld1q_dup_f32(&(p))
# define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
# define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
-# define VTRANSPOSE4_(x0,x1,x2,x3) { \
+# define VTRANSPOSE4(x0,x1,x2,x3) { \
float32x4x2_t t0_ = vzipq_f32(x0, x2); \
float32x4x2_t t1_ = vzipq_f32(x1, x3); \
float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \
@@ -168,7 +168,7 @@ typedef float32x4_t v4sf;
x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
}
// marginally faster version
-# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
+//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
# define VALIGNED(ptr) ((((long)(ptr)) & 0x3) == 0)
#else
@@ -825,14 +825,14 @@ static void radf5_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT
for (k = 1; k <= l1; ++k) {
for (i = 3; i <= ido; i += 2) {
ic = idp2 - i;
- dr2 = LD_PS1(wa1[i-2]); di2 = LD_PS1(wa1[i-1]);
- dr3 = LD_PS1(wa2[i-2]); di3 = LD_PS1(wa2[i-1]);
- dr4 = LD_PS1(wa3[i-2]); di4 = LD_PS1(wa3[i-1]);
- dr5 = LD_PS1(wa4[i-2]); di5 = LD_PS1(wa4[i-1]);
- VCPLXMUL(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
- VCPLXMUL(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
- VCPLXMUL(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
- VCPLXMUL(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
+ dr2 = LD_PS1(wa1[i-3]); di2 = LD_PS1(wa1[i-2]);
+ dr3 = LD_PS1(wa2[i-3]); di3 = LD_PS1(wa2[i-2]);
+ dr4 = LD_PS1(wa3[i-3]); di4 = LD_PS1(wa3[i-2]);
+ dr5 = LD_PS1(wa4[i-3]); di5 = LD_PS1(wa4[i-2]);
+ VCPLXMULCONJ(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
+ VCPLXMULCONJ(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
+ VCPLXMULCONJ(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
+ VCPLXMULCONJ(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
cr2 = VADD(dr2, dr5);
ci5 = VSUB(dr5, dr2);
cr5 = VSUB(di2, di5);
@@ -842,21 +842,21 @@ static void radf5_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT
cr4 = VSUB(di3, di4);
ci3 = VADD(di3, di4);
ch_ref(i - 1, 1, k) = VADD(cc_ref(i - 1, k, 1), VADD(cr2, cr3));
- ch_ref(i, 1, k) = VADD(cc_ref(i, k, 1), VADD(ci2, ci3));
+ ch_ref(i, 1, k) = VSUB(cc_ref(i, k, 1), VADD(ci2, ci3));//
tr2 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
- ti2 = VADD(cc_ref(i, k, 1), VADD(SVMUL(tr11, ci2), SVMUL(tr12, ci3)));
+ ti2 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr11, ci2), SVMUL(tr12, ci3)));//
tr3 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
- ti3 = VADD(cc_ref(i, k, 1), VADD(SVMUL(tr12, ci2), SVMUL(tr11, ci3)));
+ ti3 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr12, ci2), SVMUL(tr11, ci3)));//
tr5 = VADD(SVMUL(ti11, cr5), SVMUL(ti12, cr4));
ti5 = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
tr4 = VSUB(SVMUL(ti12, cr5), SVMUL(ti11, cr4));
ti4 = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
- ch_ref(i - 1, 3, k) = VADD(tr2, tr5);
- ch_ref(ic - 1, 2, k) = VSUB(tr2, tr5);
+ ch_ref(i - 1, 3, k) = VSUB(tr2, tr5);
+ ch_ref(ic - 1, 2, k) = VADD(tr2, tr5);
ch_ref(i, 3, k) = VADD(ti2, ti5);
ch_ref(ic, 2, k) = VSUB(ti5, ti2);
- ch_ref(i - 1, 5, k) = VADD(tr3, tr4);
- ch_ref(ic - 1, 4, k) = VSUB(tr3, tr4);
+ ch_ref(i - 1, 5, k) = VSUB(tr3, tr4);
+ ch_ref(ic - 1, 4, k) = VADD(tr3, tr4);
ch_ref(i, 5, k) = VADD(ti3, ti4);
ch_ref(ic, 4, k) = VSUB(ti4, ti3);
}
@@ -939,10 +939,10 @@ static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch
dr2 = VSUB(cr2, ci5);
di5 = VSUB(ci2, cr5);
di2 = VADD(ci2, cr5);
- VCPLXMUL(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
- VCPLXMUL(dr3, di3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
- VCPLXMUL(dr4, di4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1]));
- VCPLXMUL(dr5, di5, LD_PS1(wa4[i-2]), LD_PS1(wa4[i-1]));
+ VCPLXMUL(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2]));
+ VCPLXMUL(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2]));
+ VCPLXMUL(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2]));
+ VCPLXMUL(dr5, di5, LD_PS1(wa4[i-3]), LD_PS1(wa4[i-2]));
ch_ref(i-1, k, 2) = dr2; ch_ref(i, k, 2) = di2;
ch_ref(i-1, k, 3) = dr3; ch_ref(i, k, 3) = di3;
@@ -1411,7 +1411,7 @@ void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e)
}
-ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in,
+static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in,
const v4sf *e, v4sf *out) {
v4sf r0, i0, r1, i1, r2, i2, r3, i3;
v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
@@ -1512,7 +1512,7 @@ static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *o
}
-ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in,
+static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in,
const v4sf *e, v4sf *out, int first) {
v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7];
/*
@@ -1617,8 +1617,6 @@ void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *fo
v4sf *buff[2] = { voutput, scratch ? scratch : scratch_on_stack };
int ib = (nf_odd ^ ordered ? 1 : 0);
- if (scratch == 0) scratch = scratch_on_stack;
-
assert(VALIGNED(finput) && VALIGNED(foutput));
//assert(finput != foutput);
@@ -1675,7 +1673,7 @@ void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *fo
}
void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) {
- int i, Ncvec = s->Ncvec;
+ int Ncvec = s->Ncvec;
const v4sf * RESTRICT va = (const v4sf*)a;
const v4sf * RESTRICT vb = (const v4sf*)b;
v4sf * RESTRICT vab = (v4sf*)ab;
@@ -1693,10 +1691,16 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b,
__builtin_prefetch(va+6);
__builtin_prefetch(vb+6);
__builtin_prefetch(vab+6);
+# ifndef __clang__
+# define ZCONVOLVE_USING_INLINE_NEON_ASM
+# endif
#endif
float ar, ai, br, bi, abr, abi;
+#ifndef ZCONVOLVE_USING_INLINE_ASM
v4sf vscal = LD_PS1(scaling);
+ int i;
+#endif
assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
ar = ((v4sf_union*)va)[0].f[0];
@@ -1706,8 +1710,7 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b,
abr = ((v4sf_union*)vab)[0].f[0];
abi = ((v4sf_union*)vab)[1].f[0];
-#ifdef __arm__
-# if 1 // inline asm version
+#ifdef ZCONVOLVE_USING_INLINE_ASM // inline asm version, unfortunately miscompiled by clang 3.2, at least on ubuntu.. so this will be restricted to gcc
const float *a_ = a, *b_ = b; float *ab_ = ab;
int N = Ncvec;
asm volatile("mov r8, %2 \n"
@@ -1743,49 +1746,7 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b,
"subs %3, #2 \n"
"bne 1b \n"
: "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory");
-
-# else // neon instrinsics version, 30% slower that the asm one with gcc 4.6
- v4sf a1r, a1i, b1r, b1i;
- v4sf a2r, a2i, b2r, b2i;
- v4sf ab1r, ab1i, ab2r, ab2i;
- for (i=0; i < Ncvec; i += 2) {
- __builtin_prefetch(va+8);
- __builtin_prefetch(va+10);
-
- a1r = *va++; a1i = *va++;
- a2r = *va++; a2i = *va++;
- b1r = *vb++; b1i = *vb++;
- b2r = *vb++; b2i = *vb++;
- ab1r = vab[0]; ab1i = vab[1];
- ab2r = vab[2]; ab2i = vab[3];
-
- v4sf z1r = VMUL(a1r, b1r);
- v4sf z2r = VMUL(a2r, b2r);
- v4sf z1i = VMUL(a1r, b1i);
- v4sf z2i = VMUL(a2r, b2i);
-
- __builtin_prefetch(vb+4);
- __builtin_prefetch(vb+6);
-
- z1r = vmlsq_f32(z1r, a1i, b1i);
- z2r = vmlsq_f32(z2r, a2i, b2i);
- z1i = vmlaq_f32(z1i, a1i, b1r);
- z2i = vmlaq_f32(z2i, a2i, b2r);
-
- __builtin_prefetch(vab+4);
- __builtin_prefetch(vab+6);
-
- ab1r = vmlaq_f32(ab1r, z1r, vscal);
- ab2r = vmlaq_f32(ab2r, z2r, vscal);
- ab1i = vmlaq_f32(ab1i, z1i, vscal);
- ab2i = vmlaq_f32(ab2i, z2i, vscal);
-
- *vab++ = ab1r; *vab++ = ab1i;
- *vab++ = ab2r; *vab++ = ab2i;
- }
-# endif
-
-#else // not ARM, no need to use a special routine
+#else // default routine, works fine for non-arm cpus with current compilers
for (i=0; i < Ncvec; i += 2) {
v4sf ar, ai, br, bi;
ar = va[2*i+0]; ai = va[2*i+1];
diff --git a/pffft.h b/pffft.h
index 7c4bcc9..2bfa7b3 100644
--- a/pffft.h
+++ b/pffft.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011 Julien Pommier ( pommier@modartt.com )
+/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
Based on original fortran 77 code from FFTPACKv4 from NETLIB,
authored by Dr Paul Swarztrauber of NCAR, in 1985.
@@ -115,7 +115,7 @@ extern "C" {
The 'work' pointer should point to an area of N (2*N for complex
fft) floats, properly aligned. If 'work' is NULL, then stack will
- be used instead (this is probably the beest strategy for small
+ be used instead (this is probably the best strategy for small
FFTs, say for N < 16384).
input and output may alias.
diff --git a/test_pffft.c b/test_pffft.c
index f2c1ef3..512f0ae 100755..100644
--- a/test_pffft.c
+++ b/test_pffft.c
@@ -1,5 +1,5 @@
/*
- Copyright (c) 2011 Julien Pommier.
+ Copyright (c) 2013 Julien Pommier.
Small test & bench for PFFFT, comparing its performance with the scalar FFTPACK, FFTW, and Apple vDSP
@@ -86,7 +86,7 @@ void pffft_validate_N(int N, int cplx) {
if (pass == 0) {
float *wrk = malloc(2*Nbytes+15*sizeof(float));
for (k=0; k < Nfloat; ++k) {
- ref[k] = in[k] = frand(); //sqrt((float)(k+1));
+ ref[k] = in[k] = frand()*2-1;
out[k] = 1e30;
}
if (!cplx) {
@@ -165,24 +165,33 @@ void pffft_validate_N(int N, int cplx) {
}
// quick test of the circular convolution in fft domain
- pffft_zreorder(s, ref, tmp, PFFFT_FORWARD);
- memset(out, 0, Nbytes);
- pffft_zconvolve_accumulate(s, ref, ref, out, 1.0);
- pffft_zreorder(s, out, tmp2, PFFFT_FORWARD);
-
- for (k=0; k < Nfloat; k += 2) {
- float ar = tmp[k], ai=tmp[k+1];
- if (k || cplx) {
- tmp[k] = ar*ar - ai*ai;
- tmp[k+1] = 2*ar*ai;
- } else {
- tmp[0] = ar*ar;
- tmp[1] = ai*ai;
- }
- }
+ {
+ float conv_err = 0, conv_max = 0;
- for (k=0; k < Nfloat; ++k) {
- assert(fabs(tmp[k] - tmp2[k]) < 1e-2*ref_max);
+ pffft_zreorder(s, ref, tmp, PFFFT_FORWARD);
+ memset(out, 0, Nbytes);
+ pffft_zconvolve_accumulate(s, ref, ref, out, 1.0);
+ pffft_zreorder(s, out, tmp2, PFFFT_FORWARD);
+
+ for (k=0; k < Nfloat; k += 2) {
+ float ar = tmp[k], ai=tmp[k+1];
+ if (cplx || k > 0) {
+ tmp[k] = ar*ar - ai*ai;
+ tmp[k+1] = 2*ar*ai;
+ } else {
+ tmp[0] = ar*ar;
+ tmp[1] = ai*ai;
+ }
+ }
+
+ for (k=0; k < Nfloat; ++k) {
+ float d = fabs(tmp[k] - tmp2[k]), e = fabs(tmp[k]);
+ if (d > conv_err) conv_err = d;
+ if (e > conv_max) conv_max = e;
+ }
+ if (conv_err > 1e-5*conv_max) {
+ printf("zconvolve error ? %g %g\n", conv_err, conv_max); exit(1);
+ }
}
}
@@ -198,7 +207,7 @@ void pffft_validate_N(int N, int cplx) {
}
void pffft_validate(int cplx) {
- static int Ntest[] = { 16, 32, 64, 96, 128, 160, 192, 256, 288, 384, 5*96, 512, 576, 5*128, 864, 1024, 2048, 2592, 4096, 36864, 0};
+ static int Ntest[] = { 16, 32, 64, 96, 128, 160, 192, 256, 288, 384, 5*96, 512, 576, 5*128, 800, 864, 1024, 2048, 2592, 4000, 4096, 12000, 36864, 0};
int k;
for (k = 0; Ntest[k]; ++k) {
int N = Ntest[k];
@@ -357,7 +366,7 @@ void validate_pffft_simd(); // a small function inside pffft.c that will detect
#endif
int main(int argc, char **argv) {
- int Nvalues[] = { 64, 96, 128, 160, 192, 256, 384, 5*96, 512, 5*128, 3*256, 1024, 2048, 4096, 8192, 9*1024, 16384, 32768, 256*1024, 1024*1024, -1 };
+ int Nvalues[] = { 64, 96, 128, 160, 192, 256, 384, 5*96, 512, 5*128, 3*256, 800, 1024, 2048, 2400, 4096, 8192, 9*1024, 16384, 32768, 256*1024, 1024*1024, -1 };
int i;
if (argc > 1 && strcmp(argv[1], "--array-format") == 0) {