diff options
author | Frank Barchard <fbarchard@google.com> | 2022-08-09 17:38:18 -0700 |
---|---|---|
committer | XNNPACK Team <xnnpack-github-robot@google.com> | 2022-08-09 17:39:31 -0700 |
commit | 8e9b46df265561209ef934974b2edbe493bc5291 (patch) | |
tree | cdde41c72bd9c5179b85281162f36a90a47c9cdb /src | |
parent | 4a1afee44a65ddddaca7342f464bd51fe3a40fa9 (diff) | |
download | XNNPACK-8e9b46df265561209ef934974b2edbe493bc5291.tar.gz |
CS16 fftr scalar microkernel
- Scalar C microkernel, test and benchmark
PiperOrigin-RevId: 466519212
Diffstat (limited to 'src')
-rw-r--r-- | src/cs16-fftr/gen/scalar-x1.c | 82 | ||||
-rw-r--r-- | src/cs16-fftr/gen/scalar-x2.c | 132 | ||||
-rw-r--r-- | src/cs16-fftr/gen/scalar-x3.c | 152 | ||||
-rw-r--r-- | src/cs16-fftr/gen/scalar-x4.c | 172 | ||||
-rw-r--r-- | src/cs16-fftr/scalar.c.in | 121 | ||||
-rw-r--r-- | src/xnnpack/fft.h | 12 | ||||
-rw-r--r-- | src/xnnpack/params.h | 6 |
7 files changed, 677 insertions, 0 deletions
diff --git a/src/cs16-fftr/gen/scalar-x1.c b/src/cs16-fftr/gen/scalar-x1.c new file mode 100644 index 000000000..c0d3c37fe --- /dev/null +++ b/src/cs16-fftr/gen/scalar-x1.c @@ -0,0 +1,82 @@ +// Auto-generated file. Do not edit! +// Template: src/cs16-fftr/scalar.c.in +// Generator: tools/xngen +// +// Copyright 2022 Google LLC +// +// This source code is licensed under the BSD-style license found il the +// LICENSE file il the root directory of this source tree. + +#include <assert.h> +#include <stddef.h> +#include <stdint.h> + +#include <xnnpack/math.h> +#include <xnnpack/fft.h> + + +void xnn_cs16_fftr_ukernel__scalar_x1( + size_t samples, + const int16_t* input, + int16_t* output, + const int16_t* twiddle) { + + assert(samples >= 2); + assert(samples % 2 == 0); + assert(input != NULL); + assert(output != NULL); + assert(twiddle != NULL); + + const int16_t* il = input; + const int16_t* ir = input + samples * 2; + int32_t vdcr = (int32_t) il[0]; + int32_t vdci = (int32_t) il[1]; + il += 2; + vdcr = math_asr_s32(vdcr * 16383 + 16384, 15); + vdci = math_asr_s32(vdci * 16383 + 16384, 15); + + int16_t* ol = output; + int16_t* or = output + samples * 2; + ol[0] = vdcr + vdci; + ol[1] = 0; + ol += 2; + or[0] = vdcr - vdci; + or[1] = 0; + + samples >>= 1; + + + if XNN_UNLIKELY(samples != 0) { + do { + int32_t vilr = il[0]; + int32_t vili = il[1]; + il += 2; + ir -= 2; + int32_t virr = (int32_t) ir[0]; + int32_t viri = -(int32_t) ir[1]; + const int32_t vtwr = twiddle[0]; + const int32_t vtwi = twiddle[1]; + twiddle += 2; + + vilr = math_asr_s32(vilr * 16383 + 16384, 15); + vili = math_asr_s32(vili * 16383 + 16384, 15); + virr = math_asr_s32(virr * 16383 + 16384, 15); + viri = math_asr_s32(viri * 16383 + 16384, 15); + const int32_t vacc1r = vilr + virr; + const int32_t vacc1i = vili + viri; + const int32_t vacc2r = vilr - virr; + const int32_t vacc2i = vili - viri; + + const int32_t twr = math_asr_s32(vacc2r * vtwr - vacc2i * vtwi + 16384, 15); + const int32_t twi = 
math_asr_s32(vacc2r * vtwi + vacc2i * vtwr + 16384, 15); + + ol[0] = math_asr_s32(vacc1r + twr, 1); + ol[1] = math_asr_s32(vacc1i + twi, 1); + ol += 2; + or -= 2; + or[0] = math_asr_s32(vacc1r - twr, 1); + or[1] = math_asr_s32(twi - vacc1i, 1); + + } while (--samples != 0); + } +} diff --git a/src/cs16-fftr/gen/scalar-x2.c b/src/cs16-fftr/gen/scalar-x2.c new file mode 100644 index 000000000..2c602f4c7 --- /dev/null +++ b/src/cs16-fftr/gen/scalar-x2.c @@ -0,0 +1,132 @@ +// Auto-generated file. Do not edit! +// Template: src/cs16-fftr/scalar.c.in +// Generator: tools/xngen +// +// Copyright 2022 Google LLC +// +// This source code is licensed under the BSD-style license found il the +// LICENSE file il the root directory of this source tree. + +#include <assert.h> +#include <stddef.h> +#include <stdint.h> + +#include <xnnpack/math.h> +#include <xnnpack/fft.h> + + +void xnn_cs16_fftr_ukernel__scalar_x2( + size_t samples, + const int16_t* input, + int16_t* output, + const int16_t* twiddle) { + + assert(samples >= 2); + assert(samples % 2 == 0); + assert(input != NULL); + assert(output != NULL); + assert(twiddle != NULL); + + const int16_t* il = input; + const int16_t* ir = input + samples * 2; + int32_t vdcr = (int32_t) il[0]; + int32_t vdci = (int32_t) il[1]; + il += 2; + vdcr = math_asr_s32(vdcr * 16383 + 16384, 15); + vdci = math_asr_s32(vdci * 16383 + 16384, 15); + + int16_t* ol = output; + int16_t* or = output + samples * 2; + ol[0] = vdcr + vdci; + ol[1] = 0; + ol += 2; + or[0] = vdcr - vdci; + or[1] = 0; + + samples >>= 1; + + for (; samples >= 2; samples -= 2) { + int32_t vilr0 = il[0]; + int32_t vili0 = il[1]; + int32_t vilr1 = il[2]; + int32_t vili1 = il[3]; + il += 2 * 2; + ir -= 2 * 2; + int32_t virr0 = (int32_t) ir[2]; + int32_t viri0 = -(int32_t) ir[3]; + int32_t virr1 = (int32_t) ir[0]; + int32_t viri1 = -(int32_t) ir[1]; + const int32_t vtwr0 = twiddle[0]; + const int32_t vtwi0 = twiddle[1]; + const int32_t vtwr1 = twiddle[2]; + const int32_t vtwi1 = 
twiddle[3]; + twiddle += 2 * 2; + + vilr0 = math_asr_s32(vilr0 * 16383 + 16384, 15); + virr0 = math_asr_s32(virr0 * 16383 + 16384, 15); + vilr1 = math_asr_s32(vilr1 * 16383 + 16384, 15); + virr1 = math_asr_s32(virr1 * 16383 + 16384, 15); + vili0 = math_asr_s32(vili0 * 16383 + 16384, 15); + viri0 = math_asr_s32(viri0 * 16383 + 16384, 15); + vili1 = math_asr_s32(vili1 * 16383 + 16384, 15); + viri1 = math_asr_s32(viri1 * 16383 + 16384, 15); + const int32_t vacc1r0 = vilr0 + virr0; + const int32_t vacc2r0 = vilr0 - virr0; + const int32_t vacc1r1 = vilr1 + virr1; + const int32_t vacc2r1 = vilr1 - virr1; + const int32_t vacc1i0 = vili0 + viri0; + const int32_t vacc2i0 = vili0 - viri0; + const int32_t vacc1i1 = vili1 + viri1; + const int32_t vacc2i1 = vili1 - viri1; + + const int32_t twr0 = math_asr_s32(vacc2r0 * vtwr0 - vacc2i0 * vtwi0 + 16384, 15); + const int32_t twr1 = math_asr_s32(vacc2r1 * vtwr1 - vacc2i1 * vtwi1 + 16384, 15); + const int32_t twi0 = math_asr_s32(vacc2r0 * vtwi0 + vacc2i0 * vtwr0 + 16384, 15); + const int32_t twi1 = math_asr_s32(vacc2r1 * vtwi1 + vacc2i1 * vtwr1 + 16384, 15); + + ol[0] = math_asr_s32(vacc1r0 + twr0, 1); + ol[1] = math_asr_s32(vacc1i0 + twi0, 1); + ol[2] = math_asr_s32(vacc1r1 + twr1, 1); + ol[3] = math_asr_s32(vacc1i1 + twi1, 1); + ol += 2 * 2; + or -= 2 * 2; + or[2] = math_asr_s32(vacc1r0 - twr0, 1); + or[3] = math_asr_s32(twi0 - vacc1i0, 1); + or[0] = math_asr_s32(vacc1r1 - twr1, 1); + or[1] = math_asr_s32(twi1 - vacc1i1, 1); + } + + if XNN_UNLIKELY(samples != 0) { + do { + int32_t vilr = il[0]; + int32_t vili = il[1]; + il += 2; + ir -= 2; + int32_t virr = (int32_t) ir[0]; + int32_t viri = -(int32_t) ir[1]; + const int32_t vtwr = twiddle[0]; + const int32_t vtwi = twiddle[1]; + twiddle += 2; + + vilr = math_asr_s32(vilr * 16383 + 16384, 15); + vili = math_asr_s32(vili * 16383 + 16384, 15); + virr = math_asr_s32(virr * 16383 + 16384, 15); + viri = math_asr_s32(viri * 16383 + 16384, 15); + const int32_t vacc1r = vilr + virr; + const 
int32_t vacc1i = vili + viri; + const int32_t vacc2r = vilr - virr; + const int32_t vacc2i = vili - viri; + + const int32_t twr = math_asr_s32(vacc2r * vtwr - vacc2i * vtwi + 16384, 15); + const int32_t twi = math_asr_s32(vacc2r * vtwi + vacc2i * vtwr + 16384, 15); + + ol[0] = math_asr_s32(vacc1r + twr, 1); + ol[1] = math_asr_s32(vacc1i + twi, 1); + ol += 2; + or -= 2; + or[0] = math_asr_s32(vacc1r - twr, 1); + or[1] = math_asr_s32(twi - vacc1i, 1); + + } while (--samples != 0); + } +} diff --git a/src/cs16-fftr/gen/scalar-x3.c b/src/cs16-fftr/gen/scalar-x3.c new file mode 100644 index 000000000..3d492a1ea --- /dev/null +++ b/src/cs16-fftr/gen/scalar-x3.c @@ -0,0 +1,152 @@ +// Auto-generated file. Do not edit! +// Template: src/cs16-fftr/scalar.c.in +// Generator: tools/xngen +// +// Copyright 2022 Google LLC +// +// This source code is licensed under the BSD-style license found il the +// LICENSE file il the root directory of this source tree. + +#include <assert.h> +#include <stddef.h> +#include <stdint.h> + +#include <xnnpack/math.h> +#include <xnnpack/fft.h> + + +void xnn_cs16_fftr_ukernel__scalar_x3( + size_t samples, + const int16_t* input, + int16_t* output, + const int16_t* twiddle) { + + assert(samples >= 2); + assert(samples % 2 == 0); + assert(input != NULL); + assert(output != NULL); + assert(twiddle != NULL); + + const int16_t* il = input; + const int16_t* ir = input + samples * 2; + int32_t vdcr = (int32_t) il[0]; + int32_t vdci = (int32_t) il[1]; + il += 2; + vdcr = math_asr_s32(vdcr * 16383 + 16384, 15); + vdci = math_asr_s32(vdci * 16383 + 16384, 15); + + int16_t* ol = output; + int16_t* or = output + samples * 2; + ol[0] = vdcr + vdci; + ol[1] = 0; + ol += 2; + or[0] = vdcr - vdci; + or[1] = 0; + + samples >>= 1; + + for (; samples >= 3; samples -= 3) { + int32_t vilr0 = il[0]; + int32_t vili0 = il[1]; + int32_t vilr1 = il[2]; + int32_t vili1 = il[3]; + int32_t vilr2 = il[4]; + int32_t vili2 = il[5]; + il += 3 * 2; + ir -= 3 * 2; + int32_t virr0 = 
(int32_t) ir[4]; + int32_t viri0 = -(int32_t) ir[5]; + int32_t virr1 = (int32_t) ir[2]; + int32_t viri1 = -(int32_t) ir[3]; + int32_t virr2 = (int32_t) ir[0]; + int32_t viri2 = -(int32_t) ir[1]; + const int32_t vtwr0 = twiddle[0]; + const int32_t vtwi0 = twiddle[1]; + const int32_t vtwr1 = twiddle[2]; + const int32_t vtwi1 = twiddle[3]; + const int32_t vtwr2 = twiddle[4]; + const int32_t vtwi2 = twiddle[5]; + twiddle += 3 * 2; + + vilr0 = math_asr_s32(vilr0 * 16383 + 16384, 15); + virr0 = math_asr_s32(virr0 * 16383 + 16384, 15); + vilr1 = math_asr_s32(vilr1 * 16383 + 16384, 15); + virr1 = math_asr_s32(virr1 * 16383 + 16384, 15); + vilr2 = math_asr_s32(vilr2 * 16383 + 16384, 15); + virr2 = math_asr_s32(virr2 * 16383 + 16384, 15); + vili0 = math_asr_s32(vili0 * 16383 + 16384, 15); + viri0 = math_asr_s32(viri0 * 16383 + 16384, 15); + vili1 = math_asr_s32(vili1 * 16383 + 16384, 15); + viri1 = math_asr_s32(viri1 * 16383 + 16384, 15); + vili2 = math_asr_s32(vili2 * 16383 + 16384, 15); + viri2 = math_asr_s32(viri2 * 16383 + 16384, 15); + const int32_t vacc1r0 = vilr0 + virr0; + const int32_t vacc2r0 = vilr0 - virr0; + const int32_t vacc1r1 = vilr1 + virr1; + const int32_t vacc2r1 = vilr1 - virr1; + const int32_t vacc1r2 = vilr2 + virr2; + const int32_t vacc2r2 = vilr2 - virr2; + const int32_t vacc1i0 = vili0 + viri0; + const int32_t vacc2i0 = vili0 - viri0; + const int32_t vacc1i1 = vili1 + viri1; + const int32_t vacc2i1 = vili1 - viri1; + const int32_t vacc1i2 = vili2 + viri2; + const int32_t vacc2i2 = vili2 - viri2; + + const int32_t twr0 = math_asr_s32(vacc2r0 * vtwr0 - vacc2i0 * vtwi0 + 16384, 15); + const int32_t twr1 = math_asr_s32(vacc2r1 * vtwr1 - vacc2i1 * vtwi1 + 16384, 15); + const int32_t twr2 = math_asr_s32(vacc2r2 * vtwr2 - vacc2i2 * vtwi2 + 16384, 15); + const int32_t twi0 = math_asr_s32(vacc2r0 * vtwi0 + vacc2i0 * vtwr0 + 16384, 15); + const int32_t twi1 = math_asr_s32(vacc2r1 * vtwi1 + vacc2i1 * vtwr1 + 16384, 15); + const int32_t twi2 = 
math_asr_s32(vacc2r2 * vtwi2 + vacc2i2 * vtwr2 + 16384, 15); + + ol[0] = math_asr_s32(vacc1r0 + twr0, 1); + ol[1] = math_asr_s32(vacc1i0 + twi0, 1); + ol[2] = math_asr_s32(vacc1r1 + twr1, 1); + ol[3] = math_asr_s32(vacc1i1 + twi1, 1); + ol[4] = math_asr_s32(vacc1r2 + twr2, 1); + ol[5] = math_asr_s32(vacc1i2 + twi2, 1); + ol += 3 * 2; + or -= 3 * 2; + or[4] = math_asr_s32(vacc1r0 - twr0, 1); + or[5] = math_asr_s32(twi0 - vacc1i0, 1); + or[2] = math_asr_s32(vacc1r1 - twr1, 1); + or[3] = math_asr_s32(twi1 - vacc1i1, 1); + or[0] = math_asr_s32(vacc1r2 - twr2, 1); + or[1] = math_asr_s32(twi2 - vacc1i2, 1); + } + + if XNN_UNLIKELY(samples != 0) { + do { + int32_t vilr = il[0]; + int32_t vili = il[1]; + il += 2; + ir -= 2; + int32_t virr = (int32_t) ir[0]; + int32_t viri = -(int32_t) ir[1]; + const int32_t vtwr = twiddle[0]; + const int32_t vtwi = twiddle[1]; + twiddle += 2; + + vilr = math_asr_s32(vilr * 16383 + 16384, 15); + vili = math_asr_s32(vili * 16383 + 16384, 15); + virr = math_asr_s32(virr * 16383 + 16384, 15); + viri = math_asr_s32(viri * 16383 + 16384, 15); + const int32_t vacc1r = vilr + virr; + const int32_t vacc1i = vili + viri; + const int32_t vacc2r = vilr - virr; + const int32_t vacc2i = vili - viri; + + const int32_t twr = math_asr_s32(vacc2r * vtwr - vacc2i * vtwi + 16384, 15); + const int32_t twi = math_asr_s32(vacc2r * vtwi + vacc2i * vtwr + 16384, 15); + + ol[0] = math_asr_s32(vacc1r + twr, 1); + ol[1] = math_asr_s32(vacc1i + twi, 1); + ol += 2; + or -= 2; + or[0] = math_asr_s32(vacc1r - twr, 1); + or[1] = math_asr_s32(twi - vacc1i, 1); + + } while (--samples != 0); + } +} diff --git a/src/cs16-fftr/gen/scalar-x4.c b/src/cs16-fftr/gen/scalar-x4.c new file mode 100644 index 000000000..409920e86 --- /dev/null +++ b/src/cs16-fftr/gen/scalar-x4.c @@ -0,0 +1,172 @@ +// Auto-generated file. Do not edit! 
+// Template: src/cs16-fftr/scalar.c.in +// Generator: tools/xngen +// +// Copyright 2022 Google LLC +// +// This source code is licensed under the BSD-style license found il the +// LICENSE file il the root directory of this source tree. + +#include <assert.h> +#include <stddef.h> +#include <stdint.h> + +#include <xnnpack/math.h> +#include <xnnpack/fft.h> + + +void xnn_cs16_fftr_ukernel__scalar_x4( + size_t samples, + const int16_t* input, + int16_t* output, + const int16_t* twiddle) { + + assert(samples >= 2); + assert(samples % 2 == 0); + assert(input != NULL); + assert(output != NULL); + assert(twiddle != NULL); + + const int16_t* il = input; + const int16_t* ir = input + samples * 2; + int32_t vdcr = (int32_t) il[0]; + int32_t vdci = (int32_t) il[1]; + il += 2; + vdcr = math_asr_s32(vdcr * 16383 + 16384, 15); + vdci = math_asr_s32(vdci * 16383 + 16384, 15); + + int16_t* ol = output; + int16_t* or = output + samples * 2; + ol[0] = vdcr + vdci; + ol[1] = 0; + ol += 2; + or[0] = vdcr - vdci; + or[1] = 0; + + samples >>= 1; + + for (; samples >= 4; samples -= 4) { + int32_t vilr0 = il[0]; + int32_t vili0 = il[1]; + int32_t vilr1 = il[2]; + int32_t vili1 = il[3]; + int32_t vilr2 = il[4]; + int32_t vili2 = il[5]; + int32_t vilr3 = il[6]; + int32_t vili3 = il[7]; + il += 4 * 2; + ir -= 4 * 2; + int32_t virr0 = (int32_t) ir[6]; + int32_t viri0 = -(int32_t) ir[7]; + int32_t virr1 = (int32_t) ir[4]; + int32_t viri1 = -(int32_t) ir[5]; + int32_t virr2 = (int32_t) ir[2]; + int32_t viri2 = -(int32_t) ir[3]; + int32_t virr3 = (int32_t) ir[0]; + int32_t viri3 = -(int32_t) ir[1]; + const int32_t vtwr0 = twiddle[0]; + const int32_t vtwi0 = twiddle[1]; + const int32_t vtwr1 = twiddle[2]; + const int32_t vtwi1 = twiddle[3]; + const int32_t vtwr2 = twiddle[4]; + const int32_t vtwi2 = twiddle[5]; + const int32_t vtwr3 = twiddle[6]; + const int32_t vtwi3 = twiddle[7]; + twiddle += 4 * 2; + + vilr0 = math_asr_s32(vilr0 * 16383 + 16384, 15); + virr0 = math_asr_s32(virr0 * 16383 + 
16384, 15); + vilr1 = math_asr_s32(vilr1 * 16383 + 16384, 15); + virr1 = math_asr_s32(virr1 * 16383 + 16384, 15); + vilr2 = math_asr_s32(vilr2 * 16383 + 16384, 15); + virr2 = math_asr_s32(virr2 * 16383 + 16384, 15); + vilr3 = math_asr_s32(vilr3 * 16383 + 16384, 15); + virr3 = math_asr_s32(virr3 * 16383 + 16384, 15); + vili0 = math_asr_s32(vili0 * 16383 + 16384, 15); + viri0 = math_asr_s32(viri0 * 16383 + 16384, 15); + vili1 = math_asr_s32(vili1 * 16383 + 16384, 15); + viri1 = math_asr_s32(viri1 * 16383 + 16384, 15); + vili2 = math_asr_s32(vili2 * 16383 + 16384, 15); + viri2 = math_asr_s32(viri2 * 16383 + 16384, 15); + vili3 = math_asr_s32(vili3 * 16383 + 16384, 15); + viri3 = math_asr_s32(viri3 * 16383 + 16384, 15); + const int32_t vacc1r0 = vilr0 + virr0; + const int32_t vacc2r0 = vilr0 - virr0; + const int32_t vacc1r1 = vilr1 + virr1; + const int32_t vacc2r1 = vilr1 - virr1; + const int32_t vacc1r2 = vilr2 + virr2; + const int32_t vacc2r2 = vilr2 - virr2; + const int32_t vacc1r3 = vilr3 + virr3; + const int32_t vacc2r3 = vilr3 - virr3; + const int32_t vacc1i0 = vili0 + viri0; + const int32_t vacc2i0 = vili0 - viri0; + const int32_t vacc1i1 = vili1 + viri1; + const int32_t vacc2i1 = vili1 - viri1; + const int32_t vacc1i2 = vili2 + viri2; + const int32_t vacc2i2 = vili2 - viri2; + const int32_t vacc1i3 = vili3 + viri3; + const int32_t vacc2i3 = vili3 - viri3; + + const int32_t twr0 = math_asr_s32(vacc2r0 * vtwr0 - vacc2i0 * vtwi0 + 16384, 15); + const int32_t twr1 = math_asr_s32(vacc2r1 * vtwr1 - vacc2i1 * vtwi1 + 16384, 15); + const int32_t twr2 = math_asr_s32(vacc2r2 * vtwr2 - vacc2i2 * vtwi2 + 16384, 15); + const int32_t twr3 = math_asr_s32(vacc2r3 * vtwr3 - vacc2i3 * vtwi3 + 16384, 15); + const int32_t twi0 = math_asr_s32(vacc2r0 * vtwi0 + vacc2i0 * vtwr0 + 16384, 15); + const int32_t twi1 = math_asr_s32(vacc2r1 * vtwi1 + vacc2i1 * vtwr1 + 16384, 15); + const int32_t twi2 = math_asr_s32(vacc2r2 * vtwi2 + vacc2i2 * vtwr2 + 16384, 15); + const int32_t twi3 = 
math_asr_s32(vacc2r3 * vtwi3 + vacc2i3 * vtwr3 + 16384, 15); + + ol[0] = math_asr_s32(vacc1r0 + twr0, 1); + ol[1] = math_asr_s32(vacc1i0 + twi0, 1); + ol[2] = math_asr_s32(vacc1r1 + twr1, 1); + ol[3] = math_asr_s32(vacc1i1 + twi1, 1); + ol[4] = math_asr_s32(vacc1r2 + twr2, 1); + ol[5] = math_asr_s32(vacc1i2 + twi2, 1); + ol[6] = math_asr_s32(vacc1r3 + twr3, 1); + ol[7] = math_asr_s32(vacc1i3 + twi3, 1); + ol += 4 * 2; + or -= 4 * 2; + or[6] = math_asr_s32(vacc1r0 - twr0, 1); + or[7] = math_asr_s32(twi0 - vacc1i0, 1); + or[4] = math_asr_s32(vacc1r1 - twr1, 1); + or[5] = math_asr_s32(twi1 - vacc1i1, 1); + or[2] = math_asr_s32(vacc1r2 - twr2, 1); + or[3] = math_asr_s32(twi2 - vacc1i2, 1); + or[0] = math_asr_s32(vacc1r3 - twr3, 1); + or[1] = math_asr_s32(twi3 - vacc1i3, 1); + } + + if XNN_UNLIKELY(samples != 0) { + do { + int32_t vilr = il[0]; + int32_t vili = il[1]; + il += 2; + ir -= 2; + int32_t virr = (int32_t) ir[0]; + int32_t viri = -(int32_t) ir[1]; + const int32_t vtwr = twiddle[0]; + const int32_t vtwi = twiddle[1]; + twiddle += 2; + + vilr = math_asr_s32(vilr * 16383 + 16384, 15); + vili = math_asr_s32(vili * 16383 + 16384, 15); + virr = math_asr_s32(virr * 16383 + 16384, 15); + viri = math_asr_s32(viri * 16383 + 16384, 15); + const int32_t vacc1r = vilr + virr; + const int32_t vacc1i = vili + viri; + const int32_t vacc2r = vilr - virr; + const int32_t vacc2i = vili - viri; + + const int32_t twr = math_asr_s32(vacc2r * vtwr - vacc2i * vtwi + 16384, 15); + const int32_t twi = math_asr_s32(vacc2r * vtwi + vacc2i * vtwr + 16384, 15); + + ol[0] = math_asr_s32(vacc1r + twr, 1); + ol[1] = math_asr_s32(vacc1i + twi, 1); + ol += 2; + or -= 2; + or[0] = math_asr_s32(vacc1r - twr, 1); + or[1] = math_asr_s32(twi - vacc1i, 1); + + } while (--samples != 0); + } +} diff --git a/src/cs16-fftr/scalar.c.in b/src/cs16-fftr/scalar.c.in new file mode 100644 index 000000000..1b592288d --- /dev/null +++ b/src/cs16-fftr/scalar.c.in @@ -0,0 +1,121 @@ +// Copyright 2022 Google LLC 
+// +// This source code is licensed under the BSD-style license found il the +// LICENSE file il the root directory of this source tree. + +$assert SAMPLE_TILE >= 1 +#include <assert.h> +#include <stddef.h> +#include <stdint.h> + +#include <xnnpack/math.h> +#include <xnnpack/fft.h> + + +void xnn_cs16_fftr_ukernel__scalar_x${SAMPLE_TILE}( + size_t samples, + const int16_t* input, + int16_t* output, + const int16_t* twiddle) { + + assert(samples >= 2); + assert(samples % 2 == 0); + assert(input != NULL); + assert(output != NULL); + assert(twiddle != NULL); + + const int16_t* il = input; + const int16_t* ir = input + samples * 2; + int32_t vdcr = (int32_t) il[0]; + int32_t vdci = (int32_t) il[1]; + il += 2; + vdcr = math_asr_s32(vdcr * 16383 + 16384, 15); + vdci = math_asr_s32(vdci * 16383 + 16384, 15); + + int16_t* ol = output; + int16_t* or = output + samples * 2; + ol[0] = vdcr + vdci; + ol[1] = 0; + ol += 2; + or[0] = vdcr - vdci; + or[1] = 0; + + samples >>= 1; + + $if SAMPLE_TILE > 1: + for (; samples >= ${SAMPLE_TILE}; samples -= ${SAMPLE_TILE}) { + $for C in range(SAMPLE_TILE): + int32_t vilr${C} = il[${C * 2 + 0}]; + int32_t vili${C} = il[${C * 2 + 1}]; + il += ${SAMPLE_TILE} * 2; + ir -= ${SAMPLE_TILE} * 2; + $for C in range(SAMPLE_TILE): + int32_t virr${C} = (int32_t) ir[${(SAMPLE_TILE - 1 - C) * 2 + 0}]; + int32_t viri${C} = -(int32_t) ir[${(SAMPLE_TILE - 1 - C) * 2 + 1}]; + $for C in range(SAMPLE_TILE): + const int32_t vtwr${C} = twiddle[${C * 2 + 0}]; + const int32_t vtwi${C} = twiddle[${C * 2 + 1}]; + twiddle += ${SAMPLE_TILE} * 2; + + $for C in range(SAMPLE_TILE): + vilr${C} = math_asr_s32(vilr${C} * 16383 + 16384, 15); + virr${C} = math_asr_s32(virr${C} * 16383 + 16384, 15); + $for C in range(SAMPLE_TILE): + vili${C} = math_asr_s32(vili${C} * 16383 + 16384, 15); + viri${C} = math_asr_s32(viri${C} * 16383 + 16384, 15); + $for C in range(SAMPLE_TILE): + const int32_t vacc1r${C} = vilr${C} + virr${C}; + const int32_t vacc2r${C} = vilr${C} - virr${C}; + 
$for C in range(SAMPLE_TILE): + const int32_t vacc1i${C} = vili${C} + viri${C}; + const int32_t vacc2i${C} = vili${C} - viri${C}; + + $for C in range(SAMPLE_TILE): + const int32_t twr${C} = math_asr_s32(vacc2r${C} * vtwr${C} - vacc2i${C} * vtwi${C} + 16384, 15); + $for C in range(SAMPLE_TILE): + const int32_t twi${C} = math_asr_s32(vacc2r${C} * vtwi${C} + vacc2i${C} * vtwr${C} + 16384, 15); + + $for C in range(SAMPLE_TILE): + ol[${C * 2 + 0}] = math_asr_s32(vacc1r${C} + twr${C}, 1); + ol[${C * 2 + 1}] = math_asr_s32(vacc1i${C} + twi${C}, 1); + ol += ${SAMPLE_TILE} * 2; + or -= ${SAMPLE_TILE} * 2; + $for C in range(SAMPLE_TILE): + or[${(SAMPLE_TILE - 1 - C) * 2 + 0}] = math_asr_s32(vacc1r${C} - twr${C}, 1); + or[${(SAMPLE_TILE - 1 - C) * 2 + 1}] = math_asr_s32(twi${C} - vacc1i${C}, 1); + } + + if XNN_UNLIKELY(samples != 0) { + do { + int32_t vilr = il[0]; + int32_t vili = il[1]; + il += 2; + ir -= 2; + int32_t virr = (int32_t) ir[0]; + int32_t viri = -(int32_t) ir[1]; + const int32_t vtwr = twiddle[0]; + const int32_t vtwi = twiddle[1]; + twiddle += 2; + + vilr = math_asr_s32(vilr * 16383 + 16384, 15); + vili = math_asr_s32(vili * 16383 + 16384, 15); + virr = math_asr_s32(virr * 16383 + 16384, 15); + viri = math_asr_s32(viri * 16383 + 16384, 15); + const int32_t vacc1r = vilr + virr; + const int32_t vacc1i = vili + viri; + const int32_t vacc2r = vilr - virr; + const int32_t vacc2i = vili - viri; + + const int32_t twr = math_asr_s32(vacc2r * vtwr - vacc2i * vtwi + 16384, 15); + const int32_t twi = math_asr_s32(vacc2r * vtwi + vacc2i * vtwr + 16384, 15); + + ol[0] = math_asr_s32(vacc1r + twr, 1); + ol[1] = math_asr_s32(vacc1i + twi, 1); + ol += 2; + or -= 2; + or[0] = math_asr_s32(vacc1r - twr, 1); + or[1] = math_asr_s32(twi - vacc1i, 1); + + } while (--samples != 0); + } +} diff --git a/src/xnnpack/fft.h b/src/xnnpack/fft.h index 5cb9369e0..1b673bafd 100644 --- a/src/xnnpack/fft.h +++ b/src/xnnpack/fft.h @@ -27,6 +27,18 @@ 
/* ---- src/xnnpack/fft.h (hunk @@ -27,6 +27,18 @@) ---- */

// Existing CS16 radix-4 butterfly microkernel declarations (context lines).
DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4_ukernel__scalar_x2)
DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4_ukernel__scalar_x3)
DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4_ukernel__scalar_x4)

// Declares an XNNPACK-internal CS16 fftr (real-FFT recombination) microkernel
// with the shared (samples, input, output, twiddle) signature.
#define DECLARE_CS16_FFTR_UKERNEL_FUNCTION(fn_name) \
  XNN_INTERNAL void fn_name(                        \
      size_t samples,                               \
      const int16_t* input,                         \
      int16_t* output,                              \
      const int16_t* twiddle);

DECLARE_CS16_FFTR_UKERNEL_FUNCTION(xnn_cs16_fftr_ukernel__scalar_x1)
DECLARE_CS16_FFTR_UKERNEL_FUNCTION(xnn_cs16_fftr_ukernel__scalar_x2)
DECLARE_CS16_FFTR_UKERNEL_FUNCTION(xnn_cs16_fftr_ukernel__scalar_x3)
DECLARE_CS16_FFTR_UKERNEL_FUNCTION(xnn_cs16_fftr_ukernel__scalar_x4)

#ifdef __cplusplus
}  // extern "C"
#endif

/* ---- src/xnnpack/params.h (hunk @@ -1614,6 +1614,12 @@) ---- */

// Tail of the existing bfly4 function-pointer typedef (context lines).
typedef void (*xnn_cs16_bfly4_ukernel_function)(
    size_t stride,
    const int16_t* twiddle);

// Function-pointer type for CS16 fftr microkernels; must match the
// DECLARE_CS16_FFTR_UKERNEL_FUNCTION signature in fft.h.
typedef void (*xnn_cs16_fftr_ukernel_function)(
    size_t samples,
    const int16_t* input,
    int16_t* output,
    const int16_t* twiddle);

// Reduce-Add Extended ("mantissa" + "exponent") Exponentials

typedef void (*xnn_f32_raddextexp_ukernel_function)(