author    | Frank Barchard <fbarchard@google.com>          | 2022-08-23 18:26:39 -0700
committer | XNNPACK Team <xnnpack-github-robot@google.com> | 2022-08-23 18:27:36 -0700
commit    | f133344411a0fc6ebf1a3b1518321b61e2e759b9 (patch)
tree      | 1137051dff33f67f267d43309611bb6ccc301490
parent    | 7adc837b2c842b16ff5a57100bb99d147a9718ed (diff)
download  | XNNPACK-f133344411a0fc6ebf1a3b1518321b61e2e759b9.tar.gz
fftr microkernel single data pointer
- always used in-place, so use a single data pointer
PiperOrigin-RevId: 469606317
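In short: callers now pass one read/write buffer instead of an input/output pair. A minimal caller sketch (illustrative only, not part of the commit; the helper name is invented, and it assumes the internal <xnnpack/fft.h> declaration updated below, plus the benchmark's buffer sizing of samples * 2 + 2 int16_t elements):

#include <stddef.h>
#include <stdint.h>

#include <xnnpack/fft.h>  // internal header; declares xnn_cs16_fftr_ukernel__scalar_x1

// Hypothetical caller, not from the commit: run the FFTR step in place on a
// 256-sample buffer of interleaved complex int16 values (re, im, re, im, ...).
// data must hold samples * 2 + 2 elements, since the kernel also writes the
// pair at data + samples * 2; twiddle holds samples elements.
static void fftr_256_in_place(int16_t* data, const int16_t* twiddle) {
  const size_t samples = 256;  // kernel requires samples >= 2 and even
  // Before the call, data is the input; after it, the same buffer holds the
  // result. The old (samples, input, output, twiddle) form no longer exists.
  xnn_cs16_fftr_ukernel__scalar_x1(samples, data, twiddle);
}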
-rw-r--r-- | bench/cs16-fftr.cc             |   9
-rw-r--r-- | src/cs16-fftr/gen/scalar-x1.c  |  56
-rw-r--r-- | src/cs16-fftr/gen/scalar-x2.c  |  94
-rw-r--r-- | src/cs16-fftr/gen/scalar-x3.c  | 110
-rw-r--r-- | src/cs16-fftr/gen/scalar-x4.c  | 126
-rw-r--r-- | src/cs16-fftr/scalar.c.in      |  78
-rw-r--r-- | src/xnnpack/fft.h              |   3
-rw-r--r-- | src/xnnpack/microfnptr.h       |   3
-rw-r--r-- | test/cs16-fftr.cc              |  25
-rw-r--r-- | test/fftr-microkernel-tester.h |  23
-rwxr-xr-x | tools/generate-fftr-test.py    |   8
11 files changed, 223 insertions, 312 deletions
diff --git a/bench/cs16-fftr.cc b/bench/cs16-fftr.cc
index b2d63cff4..e6fb60c61 100644
--- a/bench/cs16-fftr.cc
+++ b/bench/cs16-fftr.cc
@@ -33,17 +33,14 @@ void cs16_fftr(
   assert(samples % 2 == 0);
 
   const size_t sample_size = samples * 2 + 2;
-  std::vector<int16_t, AlignedAllocator<int16_t, 64>> input(
-      sample_size + XNN_EXTRA_BYTES / sizeof(int16_t));
-  std::vector<int16_t, AlignedAllocator<int16_t, 64>> output(sample_size);
+  std::vector<int16_t, AlignedAllocator<int16_t, 64>> data(sample_size + XNN_EXTRA_BYTES / sizeof(int16_t));
   std::vector<int16_t, AlignedAllocator<int16_t, 64>> twiddle(samples);
 
-  std::iota(input.begin(), input.end(), 0);
-  std::iota(output.begin(), output.end(), 1);
+  std::iota(data.begin(), data.end(), 0);
   std::iota(twiddle.begin(), twiddle.end(), 2);
 
   for (auto _ : state) {
-    fftr(samples, input.data(), output.data(), twiddle.data());
+    fftr(samples, data.data(), twiddle.data());
   }
 
   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
diff --git a/src/cs16-fftr/gen/scalar-x1.c b/src/cs16-fftr/gen/scalar-x1.c
index c0d3c37fe..2131ed918 100644
--- a/src/cs16-fftr/gen/scalar-x1.c
+++ b/src/cs16-fftr/gen/scalar-x1.c
@@ -4,8 +4,8 @@
 //
 // Copyright 2022 Google LLC
 //
-// This source code is licensed under the BSD-style license found il the
-// LICENSE file il the root directory of this source tree.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 #include <stddef.h>
@@ -17,43 +17,39 @@ void xnn_cs16_fftr_ukernel__scalar_x1(
     size_t samples,
-    const int16_t* input,
-    int16_t* output,
+    int16_t* data,
     const int16_t* twiddle) {
   assert(samples >= 2);
   assert(samples % 2 == 0);
-  assert(input != NULL);
-  assert(output != NULL);
+  assert(data != NULL);
+  assert(data != NULL);
   assert(twiddle != NULL);
 
-  const int16_t* il = input;
-  const int16_t* ir = input + samples * 2;
-  int32_t vdcr = (int32_t) il[0];
-  int32_t vdci = (int32_t) il[1];
-  il += 2;
+  int16_t* dl = data;
+  int16_t* dr = data + samples * 2;
+  int32_t vdcr = (int32_t) dl[0];
+  int32_t vdci = (int32_t) dl[1];
+
   vdcr = math_asr_s32(vdcr * 16383 + 16384, 15);
   vdci = math_asr_s32(vdci * 16383 + 16384, 15);
 
-  int16_t* ol = output;
-  int16_t* or = output + samples * 2;
-  ol[0] = vdcr + vdci;
-  ol[1] = 0;
-  ol += 2;
-  or[0] = vdcr - vdci;
-  or[1] = 0;
+  dl[0] = vdcr + vdci;
+  dl[1] = 0;
+  dl += 2;
+  dr[0] = vdcr - vdci;
+  dr[1] = 0;
 
   samples >>= 1;
 
   if XNN_UNLIKELY(samples != 0) {
     do {
-      int32_t vilr = il[0];
-      int32_t vili = il[1];
-      il += 2;
-      ir -= 2;
-      int32_t virr = (int32_t) ir[0];
-      int32_t viri = -(int32_t) ir[1];
+      dr -= 2;
+      int32_t vilr = dl[0];
+      int32_t vili = dl[1];
+      int32_t virr = (int32_t) dr[0];
+      int32_t viri = -(int32_t) dr[1];
       const int32_t vtwr = twiddle[0];
       const int32_t vtwi = twiddle[1];
       twiddle += 2;
@@ -70,13 +66,11 @@ void xnn_cs16_fftr_ukernel__scalar_x1(
       const int32_t twr = math_asr_s32(vacc2r * vtwr - vacc2i * vtwi + 16384, 15);
       const int32_t twi = math_asr_s32(vacc2r * vtwi + vacc2i * vtwr + 16384, 15);
 
-      ol[0] = math_asr_s32(vacc1r + twr, 1);
-      ol[1] = math_asr_s32(vacc1i + twi, 1);
-      ol += 2;
-      or -= 2;
-      or[0] = math_asr_s32(vacc1r - twr, 1);
-      or[1] = math_asr_s32(twi - vacc1i, 1);
-
+      dl[0] = math_asr_s32(vacc1r + twr, 1);
+      dl[1] = math_asr_s32(vacc1i + twi, 1);
+      dr[0] = math_asr_s32(vacc1r - twr, 1);
+      dr[1] = math_asr_s32(twi - vacc1i, 1);
+      dl += 2;
     } while (--samples != 0);
   }
 }
diff --git a/src/cs16-fftr/gen/scalar-x2.c b/src/cs16-fftr/gen/scalar-x2.c
index 2c602f4c7..83dc7c20c 100644
--- a/src/cs16-fftr/gen/scalar-x2.c
+++ b/src/cs16-fftr/gen/scalar-x2.c
@@ -4,8 +4,8 @@
 //
 // Copyright 2022 Google LLC
 //
-// This source code is licensed under the BSD-style license found il the
-// LICENSE file il the root directory of this source tree.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 #include <stddef.h>
@@ -17,45 +17,41 @@ void xnn_cs16_fftr_ukernel__scalar_x2(
     size_t samples,
-    const int16_t* input,
-    int16_t* output,
+    int16_t* data,
     const int16_t* twiddle) {
   assert(samples >= 2);
   assert(samples % 2 == 0);
-  assert(input != NULL);
-  assert(output != NULL);
+  assert(data != NULL);
+  assert(data != NULL);
   assert(twiddle != NULL);
 
-  const int16_t* il = input;
-  const int16_t* ir = input + samples * 2;
-  int32_t vdcr = (int32_t) il[0];
-  int32_t vdci = (int32_t) il[1];
-  il += 2;
+  int16_t* dl = data;
+  int16_t* dr = data + samples * 2;
+  int32_t vdcr = (int32_t) dl[0];
+  int32_t vdci = (int32_t) dl[1];
+
   vdcr = math_asr_s32(vdcr * 16383 + 16384, 15);
   vdci = math_asr_s32(vdci * 16383 + 16384, 15);
 
-  int16_t* ol = output;
-  int16_t* or = output + samples * 2;
-  ol[0] = vdcr + vdci;
-  ol[1] = 0;
-  ol += 2;
-  or[0] = vdcr - vdci;
-  or[1] = 0;
+  dl[0] = vdcr + vdci;
+  dl[1] = 0;
+  dl += 2;
+  dr[0] = vdcr - vdci;
+  dr[1] = 0;
 
   samples >>= 1;
 
   for (; samples >= 2; samples -= 2) {
-    int32_t vilr0 = il[0];
-    int32_t vili0 = il[1];
-    int32_t vilr1 = il[2];
-    int32_t vili1 = il[3];
-    il += 2 * 2;
-    ir -= 2 * 2;
-    int32_t virr0 = (int32_t) ir[2];
-    int32_t viri0 = -(int32_t) ir[3];
-    int32_t virr1 = (int32_t) ir[0];
-    int32_t viri1 = -(int32_t) ir[1];
+    dr -= 2 * 2;
+    int32_t vilr0 = dl[0];
+    int32_t vili0 = dl[1];
+    int32_t vilr1 = dl[2];
+    int32_t vili1 = dl[3];
+    int32_t virr0 = (int32_t) dr[2];
+    int32_t viri0 = -(int32_t) dr[3];
+    int32_t virr1 = (int32_t) dr[0];
+    int32_t viri1 = -(int32_t) dr[1];
     const int32_t vtwr0 = twiddle[0];
     const int32_t vtwi0 = twiddle[1];
     const int32_t vtwr1 = twiddle[2];
@@ -84,26 +80,24 @@ void xnn_cs16_fftr_ukernel__scalar_x2(
     const int32_t twi0 = math_asr_s32(vacc2r0 * vtwi0 + vacc2i0 * vtwr0 + 16384, 15);
     const int32_t twi1 = math_asr_s32(vacc2r1 * vtwi1 + vacc2i1 * vtwr1 + 16384, 15);
 
-    ol[0] = math_asr_s32(vacc1r0 + twr0, 1);
-    ol[1] = math_asr_s32(vacc1i0 + twi0, 1);
-    ol[2] = math_asr_s32(vacc1r1 + twr1, 1);
-    ol[3] = math_asr_s32(vacc1i1 + twi1, 1);
-    ol += 2 * 2;
-    or -= 2 * 2;
-    or[2] = math_asr_s32(vacc1r0 - twr0, 1);
-    or[3] = math_asr_s32(twi0 - vacc1i0, 1);
-    or[0] = math_asr_s32(vacc1r1 - twr1, 1);
-    or[1] = math_asr_s32(twi1 - vacc1i1, 1);
+    dl[0] = math_asr_s32(vacc1r0 + twr0, 1);
+    dl[1] = math_asr_s32(vacc1i0 + twi0, 1);
+    dl[2] = math_asr_s32(vacc1r1 + twr1, 1);
+    dl[3] = math_asr_s32(vacc1i1 + twi1, 1);
+    dr[2] = math_asr_s32(vacc1r0 - twr0, 1);
+    dr[3] = math_asr_s32(twi0 - vacc1i0, 1);
+    dr[0] = math_asr_s32(vacc1r1 - twr1, 1);
+    dr[1] = math_asr_s32(twi1 - vacc1i1, 1);
+    dl += 2 * 2;
   }
 
   if XNN_UNLIKELY(samples != 0) {
     do {
-      int32_t vilr = il[0];
-      int32_t vili = il[1];
-      il += 2;
-      ir -= 2;
-      int32_t virr = (int32_t) ir[0];
-      int32_t viri = -(int32_t) ir[1];
+      dr -= 2;
+      int32_t vilr = dl[0];
+      int32_t vili = dl[1];
+      int32_t virr = (int32_t) dr[0];
+      int32_t viri = -(int32_t) dr[1];
       const int32_t vtwr = twiddle[0];
       const int32_t vtwi = twiddle[1];
       twiddle += 2;
@@ -120,13 +114,11 @@ void xnn_cs16_fftr_ukernel__scalar_x2(
       const int32_t twr = math_asr_s32(vacc2r * vtwr - vacc2i * vtwi + 16384, 15);
       const int32_t twi = math_asr_s32(vacc2r * vtwi + vacc2i * vtwr + 16384, 15);
 
-      ol[0] = math_asr_s32(vacc1r + twr, 1);
-      ol[1] = math_asr_s32(vacc1i + twi, 1);
-      ol += 2;
-      or -= 2;
-      or[0] = math_asr_s32(vacc1r - twr, 1);
-      or[1] = math_asr_s32(twi - vacc1i, 1);
-
+      dl[0] = math_asr_s32(vacc1r + twr, 1);
+      dl[1] = math_asr_s32(vacc1i + twi, 1);
+      dr[0] = math_asr_s32(vacc1r - twr, 1);
+      dr[1] = math_asr_s32(twi - vacc1i, 1);
+      dl += 2;
    } while (--samples != 0);
   }
 }
diff --git a/src/cs16-fftr/gen/scalar-x3.c b/src/cs16-fftr/gen/scalar-x3.c
index 3d492a1ea..24d016848 100644
--- a/src/cs16-fftr/gen/scalar-x3.c
+++ b/src/cs16-fftr/gen/scalar-x3.c
@@ -4,8 +4,8 @@
 //
 // Copyright 2022 Google LLC
 //
-// This source code is licensed under the BSD-style license found il the
-// LICENSE file il the root directory of this source tree.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 #include <stddef.h>
@@ -17,49 +17,45 @@ void xnn_cs16_fftr_ukernel__scalar_x3(
     size_t samples,
-    const int16_t* input,
-    int16_t* output,
+    int16_t* data,
     const int16_t* twiddle) {
   assert(samples >= 2);
   assert(samples % 2 == 0);
-  assert(input != NULL);
-  assert(output != NULL);
+  assert(data != NULL);
+  assert(data != NULL);
   assert(twiddle != NULL);
 
-  const int16_t* il = input;
-  const int16_t* ir = input + samples * 2;
-  int32_t vdcr = (int32_t) il[0];
-  int32_t vdci = (int32_t) il[1];
-  il += 2;
+  int16_t* dl = data;
+  int16_t* dr = data + samples * 2;
+  int32_t vdcr = (int32_t) dl[0];
+  int32_t vdci = (int32_t) dl[1];
+
   vdcr = math_asr_s32(vdcr * 16383 + 16384, 15);
   vdci = math_asr_s32(vdci * 16383 + 16384, 15);
 
-  int16_t* ol = output;
-  int16_t* or = output + samples * 2;
-  ol[0] = vdcr + vdci;
-  ol[1] = 0;
-  ol += 2;
-  or[0] = vdcr - vdci;
-  or[1] = 0;
+  dl[0] = vdcr + vdci;
+  dl[1] = 0;
+  dl += 2;
+  dr[0] = vdcr - vdci;
+  dr[1] = 0;
 
   samples >>= 1;
 
   for (; samples >= 3; samples -= 3) {
-    int32_t vilr0 = il[0];
-    int32_t vili0 = il[1];
-    int32_t vilr1 = il[2];
-    int32_t vili1 = il[3];
-    int32_t vilr2 = il[4];
-    int32_t vili2 = il[5];
-    il += 3 * 2;
-    ir -= 3 * 2;
-    int32_t virr0 = (int32_t) ir[4];
-    int32_t viri0 = -(int32_t) ir[5];
-    int32_t virr1 = (int32_t) ir[2];
-    int32_t viri1 = -(int32_t) ir[3];
-    int32_t virr2 = (int32_t) ir[0];
-    int32_t viri2 = -(int32_t) ir[1];
+    dr -= 3 * 2;
+    int32_t vilr0 = dl[0];
+    int32_t vili0 = dl[1];
+    int32_t vilr1 = dl[2];
+    int32_t vili1 = dl[3];
+    int32_t vilr2 = dl[4];
+    int32_t vili2 = dl[5];
+    int32_t virr0 = (int32_t) dr[4];
+    int32_t viri0 = -(int32_t) dr[5];
+    int32_t virr1 = (int32_t) dr[2];
+    int32_t viri1 = -(int32_t) dr[3];
+    int32_t virr2 = (int32_t) dr[0];
+    int32_t viri2 = -(int32_t) dr[1];
     const int32_t vtwr0 = twiddle[0];
    const int32_t vtwi0 = twiddle[1];
     const int32_t vtwr1 = twiddle[2];
@@ -100,30 +96,28 @@ void xnn_cs16_fftr_ukernel__scalar_x3(
     const int32_t twi1 = math_asr_s32(vacc2r1 * vtwi1 + vacc2i1 * vtwr1 + 16384, 15);
     const int32_t twi2 = math_asr_s32(vacc2r2 * vtwi2 + vacc2i2 * vtwr2 + 16384, 15);
 
-    ol[0] = math_asr_s32(vacc1r0 + twr0, 1);
-    ol[1] = math_asr_s32(vacc1i0 + twi0, 1);
-    ol[2] = math_asr_s32(vacc1r1 + twr1, 1);
-    ol[3] = math_asr_s32(vacc1i1 + twi1, 1);
-    ol[4] = math_asr_s32(vacc1r2 + twr2, 1);
-    ol[5] = math_asr_s32(vacc1i2 + twi2, 1);
-    ol += 3 * 2;
-    or -= 3 * 2;
-    or[4] = math_asr_s32(vacc1r0 - twr0, 1);
-    or[5] = math_asr_s32(twi0 - vacc1i0, 1);
-    or[2] = math_asr_s32(vacc1r1 - twr1, 1);
-    or[3] = math_asr_s32(twi1 - vacc1i1, 1);
-    or[0] = math_asr_s32(vacc1r2 - twr2, 1);
-    or[1] = math_asr_s32(twi2 - vacc1i2, 1);
+    dl[0] = math_asr_s32(vacc1r0 + twr0, 1);
+    dl[1] = math_asr_s32(vacc1i0 + twi0, 1);
+    dl[2] = math_asr_s32(vacc1r1 + twr1, 1);
+    dl[3] = math_asr_s32(vacc1i1 + twi1, 1);
+    dl[4] = math_asr_s32(vacc1r2 + twr2, 1);
+    dl[5] = math_asr_s32(vacc1i2 + twi2, 1);
+    dr[4] = math_asr_s32(vacc1r0 - twr0, 1);
+    dr[5] = math_asr_s32(twi0 - vacc1i0, 1);
+    dr[2] = math_asr_s32(vacc1r1 - twr1, 1);
+    dr[3] = math_asr_s32(twi1 - vacc1i1, 1);
+    dr[0] = math_asr_s32(vacc1r2 - twr2, 1);
+    dr[1] = math_asr_s32(twi2 - vacc1i2, 1);
+    dl += 3 * 2;
   }
 
   if XNN_UNLIKELY(samples != 0) {
     do {
-      int32_t vilr = il[0];
-      int32_t vili = il[1];
-      il += 2;
-      ir -= 2;
-      int32_t virr = (int32_t) ir[0];
-      int32_t viri = -(int32_t) ir[1];
+      dr -= 2;
+      int32_t vilr = dl[0];
+      int32_t vili = dl[1];
+      int32_t virr = (int32_t) dr[0];
+      int32_t viri = -(int32_t) dr[1];
       const int32_t vtwr = twiddle[0];
       const int32_t vtwi = twiddle[1];
       twiddle += 2;
@@ -140,13 +134,11 @@ void xnn_cs16_fftr_ukernel__scalar_x3(
       const int32_t twr = math_asr_s32(vacc2r * vtwr - vacc2i * vtwi + 16384, 15);
       const int32_t twi = math_asr_s32(vacc2r * vtwi + vacc2i * vtwr + 16384, 15);
 
-      ol[0] = math_asr_s32(vacc1r + twr, 1);
-      ol[1] = math_asr_s32(vacc1i + twi, 1);
-      ol += 2;
-      or -= 2;
-      or[0] = math_asr_s32(vacc1r - twr, 1);
-      or[1] = math_asr_s32(twi - vacc1i, 1);
-
+      dl[0] = math_asr_s32(vacc1r + twr, 1);
+      dl[1] = math_asr_s32(vacc1i + twi, 1);
+      dr[0] = math_asr_s32(vacc1r - twr, 1);
+      dr[1] = math_asr_s32(twi - vacc1i, 1);
+      dl += 2;
     } while (--samples != 0);
   }
 }
diff --git a/src/cs16-fftr/gen/scalar-x4.c b/src/cs16-fftr/gen/scalar-x4.c
index 409920e86..be23e44c1 100644
--- a/src/cs16-fftr/gen/scalar-x4.c
+++ b/src/cs16-fftr/gen/scalar-x4.c
@@ -4,8 +4,8 @@
 //
 // Copyright 2022 Google LLC
 //
-// This source code is licensed under the BSD-style license found il the
-// LICENSE file il the root directory of this source tree.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 #include <stddef.h>
@@ -17,53 +17,49 @@ void xnn_cs16_fftr_ukernel__scalar_x4(
     size_t samples,
-    const int16_t* input,
-    int16_t* output,
+    int16_t* data,
     const int16_t* twiddle) {
   assert(samples >= 2);
   assert(samples % 2 == 0);
-  assert(input != NULL);
-  assert(output != NULL);
+  assert(data != NULL);
+  assert(data != NULL);
   assert(twiddle != NULL);
 
-  const int16_t* il = input;
-  const int16_t* ir = input + samples * 2;
-  int32_t vdcr = (int32_t) il[0];
-  int32_t vdci = (int32_t) il[1];
-  il += 2;
+  int16_t* dl = data;
+  int16_t* dr = data + samples * 2;
+  int32_t vdcr = (int32_t) dl[0];
+  int32_t vdci = (int32_t) dl[1];
+
   vdcr = math_asr_s32(vdcr * 16383 + 16384, 15);
   vdci = math_asr_s32(vdci * 16383 + 16384, 15);
 
-  int16_t* ol = output;
-  int16_t* or = output + samples * 2;
-  ol[0] = vdcr + vdci;
-  ol[1] = 0;
-  ol += 2;
-  or[0] = vdcr - vdci;
-  or[1] = 0;
+  dl[0] = vdcr + vdci;
+  dl[1] = 0;
+  dl += 2;
+  dr[0] = vdcr - vdci;
+  dr[1] = 0;
 
   samples >>= 1;
 
   for (; samples >= 4; samples -= 4) {
-    int32_t vilr0 = il[0];
-    int32_t vili0 = il[1];
-    int32_t vilr1 = il[2];
-    int32_t vili1 = il[3];
-    int32_t vilr2 = il[4];
-    int32_t vili2 = il[5];
-    int32_t vilr3 = il[6];
-    int32_t vili3 = il[7];
-    il += 4 * 2;
-    ir -= 4 * 2;
-    int32_t virr0 = (int32_t) ir[6];
-    int32_t viri0 = -(int32_t) ir[7];
-    int32_t virr1 = (int32_t) ir[4];
-    int32_t viri1 = -(int32_t) ir[5];
-    int32_t virr2 = (int32_t) ir[2];
-    int32_t viri2 = -(int32_t) ir[3];
-    int32_t virr3 = (int32_t) ir[0];
-    int32_t viri3 = -(int32_t) ir[1];
+    dr -= 4 * 2;
+    int32_t vilr0 = dl[0];
+    int32_t vili0 = dl[1];
+    int32_t vilr1 = dl[2];
+    int32_t vili1 = dl[3];
+    int32_t vilr2 = dl[4];
+    int32_t vili2 = dl[5];
+    int32_t vilr3 = dl[6];
+    int32_t vili3 = dl[7];
+    int32_t virr0 = (int32_t) dr[6];
+    int32_t viri0 = -(int32_t) dr[7];
+    int32_t virr1 = (int32_t) dr[4];
+    int32_t viri1 = -(int32_t) dr[5];
+    int32_t virr2 = (int32_t) dr[2];
+    int32_t viri2 = -(int32_t) dr[3];
+    int32_t virr3 = (int32_t) dr[0];
+    int32_t viri3 = -(int32_t) dr[1];
     const int32_t vtwr0 = twiddle[0];
     const int32_t vtwi0 = twiddle[1];
     const int32_t vtwr1 = twiddle[2];
@@ -116,34 +112,32 @@ void xnn_cs16_fftr_ukernel__scalar_x4(
     const int32_t twi2 = math_asr_s32(vacc2r2 * vtwi2 + vacc2i2 * vtwr2 + 16384, 15);
     const int32_t twi3 = math_asr_s32(vacc2r3 * vtwi3 + vacc2i3 * vtwr3 + 16384, 15);
 
-    ol[0] = math_asr_s32(vacc1r0 + twr0, 1);
-    ol[1] = math_asr_s32(vacc1i0 + twi0, 1);
-    ol[2] = math_asr_s32(vacc1r1 + twr1, 1);
-    ol[3] = math_asr_s32(vacc1i1 + twi1, 1);
-    ol[4] = math_asr_s32(vacc1r2 + twr2, 1);
-    ol[5] = math_asr_s32(vacc1i2 + twi2, 1);
-    ol[6] = math_asr_s32(vacc1r3 + twr3, 1);
-    ol[7] = math_asr_s32(vacc1i3 + twi3, 1);
-    ol += 4 * 2;
-    or -= 4 * 2;
-    or[6] = math_asr_s32(vacc1r0 - twr0, 1);
-    or[7] = math_asr_s32(twi0 - vacc1i0, 1);
-    or[4] = math_asr_s32(vacc1r1 - twr1, 1);
-    or[5] = math_asr_s32(twi1 - vacc1i1, 1);
-    or[2] = math_asr_s32(vacc1r2 - twr2, 1);
-    or[3] = math_asr_s32(twi2 - vacc1i2, 1);
-    or[0] = math_asr_s32(vacc1r3 - twr3, 1);
-    or[1] = math_asr_s32(twi3 - vacc1i3, 1);
+    dl[0] = math_asr_s32(vacc1r0 + twr0, 1);
+    dl[1] = math_asr_s32(vacc1i0 + twi0, 1);
+    dl[2] = math_asr_s32(vacc1r1 + twr1, 1);
+    dl[3] = math_asr_s32(vacc1i1 + twi1, 1);
+    dl[4] = math_asr_s32(vacc1r2 + twr2, 1);
+    dl[5] = math_asr_s32(vacc1i2 + twi2, 1);
+    dl[6] = math_asr_s32(vacc1r3 + twr3, 1);
+    dl[7] = math_asr_s32(vacc1i3 + twi3, 1);
+    dr[6] = math_asr_s32(vacc1r0 - twr0, 1);
+    dr[7] = math_asr_s32(twi0 - vacc1i0, 1);
+    dr[4] = math_asr_s32(vacc1r1 - twr1, 1);
+    dr[5] = math_asr_s32(twi1 - vacc1i1, 1);
+    dr[2] = math_asr_s32(vacc1r2 - twr2, 1);
+    dr[3] = math_asr_s32(twi2 - vacc1i2, 1);
+    dr[0] = math_asr_s32(vacc1r3 - twr3, 1);
+    dr[1] = math_asr_s32(twi3 - vacc1i3, 1);
+    dl += 4 * 2;
   }
 
   if XNN_UNLIKELY(samples != 0) {
     do {
-      int32_t vilr = il[0];
-      int32_t vili = il[1];
-      il += 2;
-      ir -= 2;
-      int32_t virr = (int32_t) ir[0];
-      int32_t viri = -(int32_t) ir[1];
+      dr -= 2;
+      int32_t vilr = dl[0];
+      int32_t vili = dl[1];
+      int32_t virr = (int32_t) dr[0];
+      int32_t viri = -(int32_t) dr[1];
       const int32_t vtwr = twiddle[0];
       const int32_t vtwi = twiddle[1];
       twiddle += 2;
@@ -160,13 +154,11 @@ void xnn_cs16_fftr_ukernel__scalar_x4(
      const int32_t twr = math_asr_s32(vacc2r * vtwr - vacc2i * vtwi + 16384, 15);
       const int32_t twi = math_asr_s32(vacc2r * vtwi + vacc2i * vtwr + 16384, 15);
 
-      ol[0] = math_asr_s32(vacc1r + twr, 1);
-      ol[1] = math_asr_s32(vacc1i + twi, 1);
-      ol += 2;
-      or -= 2;
-      or[0] = math_asr_s32(vacc1r - twr, 1);
-      or[1] = math_asr_s32(twi - vacc1i, 1);
-
+      dl[0] = math_asr_s32(vacc1r + twr, 1);
+      dl[1] = math_asr_s32(vacc1i + twi, 1);
+      dr[0] = math_asr_s32(vacc1r - twr, 1);
+      dr[1] = math_asr_s32(twi - vacc1i, 1);
+      dl += 2;
     } while (--samples != 0);
   }
 }
diff --git a/src/cs16-fftr/scalar.c.in b/src/cs16-fftr/scalar.c.in
index 1b592288d..8af6c6c88 100644
--- a/src/cs16-fftr/scalar.c.in
+++ b/src/cs16-fftr/scalar.c.in
@@ -1,7 +1,7 @@
 // Copyright 2022 Google LLC
 //
-// This source code is licensed under the BSD-style license found il the
-// LICENSE file il the root directory of this source tree.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 $assert SAMPLE_TILE >= 1
 #include <assert.h>
@@ -14,44 +14,40 @@ $assert SAMPLE_TILE >= 1
 
 
 void xnn_cs16_fftr_ukernel__scalar_x${SAMPLE_TILE}(
     size_t samples,
-    const int16_t* input,
-    int16_t* output,
+    int16_t* data,
     const int16_t* twiddle) {
   assert(samples >= 2);
   assert(samples % 2 == 0);
-  assert(input != NULL);
-  assert(output != NULL);
+  assert(data != NULL);
+  assert(data != NULL);
   assert(twiddle != NULL);
 
-  const int16_t* il = input;
-  const int16_t* ir = input + samples * 2;
-  int32_t vdcr = (int32_t) il[0];
-  int32_t vdci = (int32_t) il[1];
-  il += 2;
+  int16_t* dl = data;
+  int16_t* dr = data + samples * 2;
+  int32_t vdcr = (int32_t) dl[0];
+  int32_t vdci = (int32_t) dl[1];
+
   vdcr = math_asr_s32(vdcr * 16383 + 16384, 15);
   vdci = math_asr_s32(vdci * 16383 + 16384, 15);
 
-  int16_t* ol = output;
-  int16_t* or = output + samples * 2;
-  ol[0] = vdcr + vdci;
-  ol[1] = 0;
-  ol += 2;
-  or[0] = vdcr - vdci;
-  or[1] = 0;
+  dl[0] = vdcr + vdci;
+  dl[1] = 0;
+  dl += 2;
+  dr[0] = vdcr - vdci;
+  dr[1] = 0;
 
   samples >>= 1;
 
  $if SAMPLE_TILE > 1:
    for (; samples >= ${SAMPLE_TILE}; samples -= ${SAMPLE_TILE}) {
+      dr -= ${SAMPLE_TILE} * 2;
      $for C in range(SAMPLE_TILE):
-        int32_t vilr${C} = il[${C * 2 + 0}];
-        int32_t vili${C} = il[${C * 2 + 1}];
-      il += ${SAMPLE_TILE} * 2;
-      ir -= ${SAMPLE_TILE} * 2;
+        int32_t vilr${C} = dl[${C * 2 + 0}];
+        int32_t vili${C} = dl[${C * 2 + 1}];
      $for C in range(SAMPLE_TILE):
-        int32_t virr${C} = (int32_t) ir[${(SAMPLE_TILE - 1 - C) * 2 + 0}];
-        int32_t viri${C} = -(int32_t) ir[${(SAMPLE_TILE - 1 - C) * 2 + 1}];
+        int32_t virr${C} = (int32_t) dr[${(SAMPLE_TILE - 1 - C) * 2 + 0}];
+        int32_t viri${C} = -(int32_t) dr[${(SAMPLE_TILE - 1 - C) * 2 + 1}];
      $for C in range(SAMPLE_TILE):
        const int32_t vtwr${C} = twiddle[${C * 2 + 0}];
        const int32_t vtwi${C} = twiddle[${C * 2 + 1}];
@@ -76,23 +72,21 @@ void xnn_cs16_fftr_ukernel__scalar_x${SAMPLE_TILE}(
        const int32_t twi${C} = math_asr_s32(vacc2r${C} * vtwi${C} + vacc2i${C} * vtwr${C} + 16384, 15);
 
      $for C in range(SAMPLE_TILE):
-        ol[${C * 2 + 0}] = math_asr_s32(vacc1r${C} + twr${C}, 1);
-        ol[${C * 2 + 1}] = math_asr_s32(vacc1i${C} + twi${C}, 1);
-      ol += ${SAMPLE_TILE} * 2;
-      or -= ${SAMPLE_TILE} * 2;
+        dl[${C * 2 + 0}] = math_asr_s32(vacc1r${C} + twr${C}, 1);
+        dl[${C * 2 + 1}] = math_asr_s32(vacc1i${C} + twi${C}, 1);
      $for C in range(SAMPLE_TILE):
-        or[${(SAMPLE_TILE - 1 - C) * 2 + 0}] = math_asr_s32(vacc1r${C} - twr${C}, 1);
-        or[${(SAMPLE_TILE - 1 - C) * 2 + 1}] = math_asr_s32(twi${C} - vacc1i${C}, 1);
+        dr[${(SAMPLE_TILE - 1 - C) * 2 + 0}] = math_asr_s32(vacc1r${C} - twr${C}, 1);
+        dr[${(SAMPLE_TILE - 1 - C) * 2 + 1}] = math_asr_s32(twi${C} - vacc1i${C}, 1);
+      dl += ${SAMPLE_TILE} * 2;
    }
 
  if XNN_UNLIKELY(samples != 0) {
    do {
-      int32_t vilr = il[0];
-      int32_t vili = il[1];
-      il += 2;
-      ir -= 2;
-      int32_t virr = (int32_t) ir[0];
-      int32_t viri = -(int32_t) ir[1];
+      dr -= 2;
+      int32_t vilr = dl[0];
+      int32_t vili = dl[1];
+      int32_t virr = (int32_t) dr[0];
+      int32_t viri = -(int32_t) dr[1];
      const int32_t vtwr = twiddle[0];
      const int32_t vtwi = twiddle[1];
      twiddle += 2;
@@ -109,13 +103,11 @@ void xnn_cs16_fftr_ukernel__scalar_x${SAMPLE_TILE}(
      const int32_t twr = math_asr_s32(vacc2r * vtwr - vacc2i * vtwi + 16384, 15);
      const int32_t twi = math_asr_s32(vacc2r * vtwi + vacc2i * vtwr + 16384, 15);
 
-      ol[0] = math_asr_s32(vacc1r + twr, 1);
-      ol[1] = math_asr_s32(vacc1i + twi, 1);
-      ol += 2;
-      or -= 2;
-      or[0] = math_asr_s32(vacc1r - twr, 1);
-      or[1] = math_asr_s32(twi - vacc1i, 1);
-
+      dl[0] = math_asr_s32(vacc1r + twr, 1);
+      dl[1] = math_asr_s32(vacc1i + twi, 1);
+      dr[0] = math_asr_s32(vacc1r - twr, 1);
+      dr[1] = math_asr_s32(twi - vacc1i, 1);
+      dl += 2;
    } while (--samples != 0);
  }
 }
diff --git a/src/xnnpack/fft.h b/src/xnnpack/fft.h
index 9aa0afd79..374241136 100644
--- a/src/xnnpack/fft.h
+++ b/src/xnnpack/fft.h
@@ -31,8 +31,7 @@ DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4m1_ukernel__scalar)
 #define DECLARE_CS16_FFTR_UKERNEL_FUNCTION(fn_name) \
   XNN_INTERNAL void fn_name(                        \
       size_t samples,                               \
-      const int16_t* input,                         \
-      int16_t* output,                              \
+      int16_t* data,                                \
       const int16_t* twiddle);
 
 DECLARE_CS16_FFTR_UKERNEL_FUNCTION(xnn_cs16_fftr_ukernel__scalar_x1)
diff --git a/src/xnnpack/microfnptr.h b/src/xnnpack/microfnptr.h
index 93525b6fe..a522c0fe9 100644
--- a/src/xnnpack/microfnptr.h
+++ b/src/xnnpack/microfnptr.h
@@ -1687,8 +1687,7 @@ typedef void (*xnn_cs16_bfly4_ukernel_function)(
 
 typedef void (*xnn_cs16_fftr_ukernel_function)(
     size_t samples,
-    const int16_t* input,
-    int16_t* output,
+    int16_t* data,
     const int16_t* twiddle);
 
diff --git a/test/cs16-fftr.cc b/test/cs16-fftr.cc
index 223e4914f..6e761bfbd 100644
--- a/test/cs16-fftr.cc
+++ b/test/cs16-fftr.cc
@@ -23,12 +23,6 @@ TEST(CS16_FFTR__SCALAR_X1, samples_eq_256) {
     .Test(xnn_cs16_fftr_ukernel__scalar_x1);
 }
 
-TEST(CS16_FFTR__SCALAR_X1, inplace) {
-  FftrMicrokernelTester()
-    .samples(256)
-    .inplace(true)
-    .Test(xnn_cs16_fftr_ukernel__scalar_x1);
-}
 
 TEST(CS16_FFTR__SCALAR_X2, samples_eq_256) {
   FftrMicrokernelTester()
@@ -36,12 +30,6 @@ TEST(CS16_FFTR__SCALAR_X2, samples_eq_256) {
     .samples(256)
     .Test(xnn_cs16_fftr_ukernel__scalar_x2);
 }
 
-TEST(CS16_FFTR__SCALAR_X2, inplace) {
-  FftrMicrokernelTester()
-    .samples(256)
-    .inplace(true)
-    .Test(xnn_cs16_fftr_ukernel__scalar_x2);
-}
 
 TEST(CS16_FFTR__SCALAR_X3, samples_eq_256) {
   FftrMicrokernelTester()
     .samples(256)
@@ -49,22 +37,9 @@ TEST(CS16_FFTR__SCALAR_X3, samples_eq_256) {
     .Test(xnn_cs16_fftr_ukernel__scalar_x3);
 }
 
-TEST(CS16_FFTR__SCALAR_X3, inplace) {
-  FftrMicrokernelTester()
-    .samples(256)
-    .inplace(true)
-    .Test(xnn_cs16_fftr_ukernel__scalar_x3);
-}
 
 TEST(CS16_FFTR__SCALAR_X4, samples_eq_256) {
   FftrMicrokernelTester()
     .samples(256)
     .Test(xnn_cs16_fftr_ukernel__scalar_x4);
 }
-
-TEST(CS16_FFTR__SCALAR_X4, inplace) {
-  FftrMicrokernelTester()
-    .samples(256)
-    .inplace(true)
-    .Test(xnn_cs16_fftr_ukernel__scalar_x4);
-}
\ No newline at end of file
diff --git a/test/fftr-microkernel-tester.h b/test/fftr-microkernel-tester.h
index 6f4fd256d..55dfab769 100644
--- a/test/fftr-microkernel-tester.h
+++ b/test/fftr-microkernel-tester.h
@@ -96,15 +96,6 @@ class FftrMicrokernelTester {
     return this->samples_;
   }
 
-  inline FftrMicrokernelTester& inplace(bool inplace) {
-    this->inplace_ = inplace;
-    return *this;
-  }
-
-  inline bool inplace() const {
-    return this->inplace_;
-  }
-
  inline FftrMicrokernelTester& iterations(size_t iterations) {
    this->iterations_ = iterations;
    return *this;
@@ -120,23 +111,20 @@ class FftrMicrokernelTester {
    auto i16rng = std::bind(std::uniform_int_distribution<int16_t>(), std::ref(rng));
 
    const size_t sample_size = samples() * 2 + 2;
-    std::vector<int16_t> x(sample_size + XNN_EXTRA_BYTES / sizeof(int16_t));
-    std::vector<int16_t> twiddle(samples() + XNN_EXTRA_BYTES / sizeof(int16_t));
-    std::vector<int16_t> y(sample_size + (inplace() ? XNN_EXTRA_BYTES / sizeof(int16_t) : 0));
+    std::vector<int16_t> twiddle(samples());
+    std::vector<int16_t> y(sample_size);
    std::vector<int16_t> y_ref(sample_size);
-    const int16_t* x_data = inplace() ? y.data() : x.data();
 
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
-      std::generate(x.begin(), x.end(), std::ref(i16rng));
      std::generate(twiddle.begin(), twiddle.end(), std::ref(i16rng));
      std::generate(y.begin(), y.end(), std::ref(i16rng));
-      std::generate(y_ref.begin(), y_ref.end(), std::ref(i16rng));
+      std::copy(y.begin(), y.end(), y_ref.begin());
 
      // Compute reference results.
-      xnn_cs16_fftr_reference(samples(), x_data, y_ref.data(), twiddle.data());
+      xnn_cs16_fftr_reference(samples(), y_ref.data(), y_ref.data(), twiddle.data());
 
      // Call optimized micro-kernel.
-      fftr(samples(), x_data, y.data(), twiddle.data());
+      fftr(samples(), y.data(), twiddle.data());
 
      // Verify results.
      for (size_t n = 0; n < sample_size; n++) {
@@ -148,6 +136,5 @@ class FftrMicrokernelTester {
 
  private:
  size_t samples_{256};
-  bool inplace_{false};
  size_t iterations_{15};
 };
diff --git a/tools/generate-fftr-test.py b/tools/generate-fftr-test.py
index 53de31293..fa84f602b 100755
--- a/tools/generate-fftr-test.py
+++ b/tools/generate-fftr-test.py
@@ -44,14 +44,6 @@ TEST(${TEST_NAME}, samples_eq_256) {
     .Test(${", ".join(TEST_ARGS)});
 }
 
-TEST(${TEST_NAME}, inplace) {
-  $if ISA_CHECK:
-    ${ISA_CHECK};
-  FftrMicrokernelTester()
-    .samples(256)
-    .inplace(true)
-    .Test(${", ".join(TEST_ARGS)});
-}
 
 """