diff options
Diffstat (limited to 'src/cs16-fftr/gen/scalar-x4.c')
-rw-r--r-- | src/cs16-fftr/gen/scalar-x4.c | 126 |
1 files changed, 59 insertions, 67 deletions
diff --git a/src/cs16-fftr/gen/scalar-x4.c b/src/cs16-fftr/gen/scalar-x4.c
index 409920e86..be23e44c1 100644
--- a/src/cs16-fftr/gen/scalar-x4.c
+++ b/src/cs16-fftr/gen/scalar-x4.c
@@ -4,8 +4,8 @@
 //
 // Copyright 2022 Google LLC
 //
-// This source code is licensed under the BSD-style license found il the
-// LICENSE file il the root directory of this source tree.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 #include <stddef.h>
@@ -17,53 +17,49 @@ void xnn_cs16_fftr_ukernel__scalar_x4(
     size_t samples,
-    const int16_t* input,
-    int16_t* output,
+    int16_t* data,
     const int16_t* twiddle) {
 
   assert(samples >= 2);
   assert(samples % 2 == 0);
-  assert(input != NULL);
-  assert(output != NULL);
+  assert(data != NULL);
+  assert(data != NULL);
   assert(twiddle != NULL);
 
-  const int16_t* il = input;
-  const int16_t* ir = input + samples * 2;
-  int32_t vdcr = (int32_t) il[0];
-  int32_t vdci = (int32_t) il[1];
-  il += 2;
+  int16_t* dl = data;
+  int16_t* dr = data + samples * 2;
+  int32_t vdcr = (int32_t) dl[0];
+  int32_t vdci = (int32_t) dl[1];
+
   vdcr = math_asr_s32(vdcr * 16383 + 16384, 15);
   vdci = math_asr_s32(vdci * 16383 + 16384, 15);
 
-  int16_t* ol = output;
-  int16_t* or = output + samples * 2;
-  ol[0] = vdcr + vdci;
-  ol[1] = 0;
-  ol += 2;
-  or[0] = vdcr - vdci;
-  or[1] = 0;
+  dl[0] = vdcr + vdci;
+  dl[1] = 0;
+  dl += 2;
+  dr[0] = vdcr - vdci;
+  dr[1] = 0;
 
   samples >>= 1;
 
   for (; samples >= 4; samples -= 4) {
-    int32_t vilr0 = il[0];
-    int32_t vili0 = il[1];
-    int32_t vilr1 = il[2];
-    int32_t vili1 = il[3];
-    int32_t vilr2 = il[4];
-    int32_t vili2 = il[5];
-    int32_t vilr3 = il[6];
-    int32_t vili3 = il[7];
-    il += 4 * 2;
-    ir -= 4 * 2;
-    int32_t virr0 = (int32_t) ir[6];
-    int32_t viri0 = -(int32_t) ir[7];
-    int32_t virr1 = (int32_t) ir[4];
-    int32_t viri1 = -(int32_t) ir[5];
-    int32_t virr2 = (int32_t) ir[2];
-    int32_t viri2 = -(int32_t) ir[3];
-    int32_t virr3 = (int32_t) ir[0];
-    int32_t viri3 = -(int32_t) ir[1];
+    dr -= 4 * 2;
+    int32_t vilr0 = dl[0];
+    int32_t vili0 = dl[1];
+    int32_t vilr1 = dl[2];
+    int32_t vili1 = dl[3];
+    int32_t vilr2 = dl[4];
+    int32_t vili2 = dl[5];
+    int32_t vilr3 = dl[6];
+    int32_t vili3 = dl[7];
+    int32_t virr0 = (int32_t) dr[6];
+    int32_t viri0 = -(int32_t) dr[7];
+    int32_t virr1 = (int32_t) dr[4];
+    int32_t viri1 = -(int32_t) dr[5];
+    int32_t virr2 = (int32_t) dr[2];
+    int32_t viri2 = -(int32_t) dr[3];
+    int32_t virr3 = (int32_t) dr[0];
+    int32_t viri3 = -(int32_t) dr[1];
     const int32_t vtwr0 = twiddle[0];
     const int32_t vtwi0 = twiddle[1];
     const int32_t vtwr1 = twiddle[2];
@@ -116,34 +112,32 @@ void xnn_cs16_fftr_ukernel__scalar_x4(
     const int32_t twi2 = math_asr_s32(vacc2r2 * vtwi2 + vacc2i2 * vtwr2 + 16384, 15);
     const int32_t twi3 = math_asr_s32(vacc2r3 * vtwi3 + vacc2i3 * vtwr3 + 16384, 15);
 
-    ol[0] = math_asr_s32(vacc1r0 + twr0, 1);
-    ol[1] = math_asr_s32(vacc1i0 + twi0, 1);
-    ol[2] = math_asr_s32(vacc1r1 + twr1, 1);
-    ol[3] = math_asr_s32(vacc1i1 + twi1, 1);
-    ol[4] = math_asr_s32(vacc1r2 + twr2, 1);
-    ol[5] = math_asr_s32(vacc1i2 + twi2, 1);
-    ol[6] = math_asr_s32(vacc1r3 + twr3, 1);
-    ol[7] = math_asr_s32(vacc1i3 + twi3, 1);
-    ol += 4 * 2;
-    or -= 4 * 2;
-    or[6] = math_asr_s32(vacc1r0 - twr0, 1);
-    or[7] = math_asr_s32(twi0 - vacc1i0, 1);
-    or[4] = math_asr_s32(vacc1r1 - twr1, 1);
-    or[5] = math_asr_s32(twi1 - vacc1i1, 1);
-    or[2] = math_asr_s32(vacc1r2 - twr2, 1);
-    or[3] = math_asr_s32(twi2 - vacc1i2, 1);
-    or[0] = math_asr_s32(vacc1r3 - twr3, 1);
-    or[1] = math_asr_s32(twi3 - vacc1i3, 1);
+    dl[0] = math_asr_s32(vacc1r0 + twr0, 1);
+    dl[1] = math_asr_s32(vacc1i0 + twi0, 1);
+    dl[2] = math_asr_s32(vacc1r1 + twr1, 1);
+    dl[3] = math_asr_s32(vacc1i1 + twi1, 1);
+    dl[4] = math_asr_s32(vacc1r2 + twr2, 1);
+    dl[5] = math_asr_s32(vacc1i2 + twi2, 1);
+    dl[6] = math_asr_s32(vacc1r3 + twr3, 1);
+    dl[7] = math_asr_s32(vacc1i3 + twi3, 1);
+    dr[6] = math_asr_s32(vacc1r0 - twr0, 1);
+    dr[7] = math_asr_s32(twi0 - vacc1i0, 1);
+    dr[4] = math_asr_s32(vacc1r1 - twr1, 1);
+    dr[5] = math_asr_s32(twi1 - vacc1i1, 1);
+    dr[2] = math_asr_s32(vacc1r2 - twr2, 1);
+    dr[3] = math_asr_s32(twi2 - vacc1i2, 1);
+    dr[0] = math_asr_s32(vacc1r3 - twr3, 1);
+    dr[1] = math_asr_s32(twi3 - vacc1i3, 1);
+    dl += 4 * 2;
   }
 
   if XNN_UNLIKELY(samples != 0) {
     do {
-      int32_t vilr = il[0];
-      int32_t vili = il[1];
-      il += 2;
-      ir -= 2;
-      int32_t virr = (int32_t) ir[0];
-      int32_t viri = -(int32_t) ir[1];
+      dr -= 2;
+      int32_t vilr = dl[0];
+      int32_t vili = dl[1];
+      int32_t virr = (int32_t) dr[0];
+      int32_t viri = -(int32_t) dr[1];
       const int32_t vtwr = twiddle[0];
       const int32_t vtwi = twiddle[1];
       twiddle += 2;
@@ -160,13 +154,11 @@ void xnn_cs16_fftr_ukernel__scalar_x4(
      const int32_t twr = math_asr_s32(vacc2r * vtwr - vacc2i * vtwi + 16384, 15);
       const int32_t twi = math_asr_s32(vacc2r * vtwi + vacc2i * vtwr + 16384, 15);
 
-      ol[0] = math_asr_s32(vacc1r + twr, 1);
-      ol[1] = math_asr_s32(vacc1i + twi, 1);
-      ol += 2;
-      or -= 2;
-      or[0] = math_asr_s32(vacc1r - twr, 1);
-      or[1] = math_asr_s32(twi - vacc1i, 1);
-
+      dl[0] = math_asr_s32(vacc1r + twr, 1);
+      dl[1] = math_asr_s32(vacc1i + twi, 1);
+      dr[0] = math_asr_s32(vacc1r - twr, 1);
+      dr[1] = math_asr_s32(twi - vacc1i, 1);
+      dl += 2;
    } while (--samples != 0);
  }
 }
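
Note (not part of the commit above): the change switches xnn_cs16_fftr_ukernel__scalar_x4 from separate input/output pointers to a single in-place data pointer. The caller sketched below is hypothetical and only illustrates the new calling convention; the function signature is copied from the diff, the buffer-size remark is derived from the kernel body (dr starts at data + samples * 2 and dr[0]/dr[1] are written), and the helper name fftr_inplace_example is made up.

#include <stddef.h>
#include <stdint.h>

// Signature after this commit (taken from the diff above).
void xnn_cs16_fftr_ukernel__scalar_x4(
    size_t samples,
    int16_t* data,
    const int16_t* twiddle);

// Hypothetical caller, for illustration only.
static void fftr_inplace_example(size_t samples, int16_t* data, const int16_t* twiddle) {
  // Before this change the kernel took (samples, input, output, twiddle) and
  // wrote its result to a separate buffer. Now `data` is both read and
  // overwritten. The kernel touches data[0] .. data[2 * samples + 1], so the
  // buffer needs at least 2 * samples + 2 int16_t elements (interleaved re/im).
  xnn_cs16_fftr_ukernel__scalar_x4(samples, data, twiddle);
}

In-place operation stays safe because the dl and dr cursors move toward each other and every loop iteration loads both blocks before storing to them, so the kernel never reads an element it has already overwritten.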