diff options
author | Frank Barchard <fbarchard@google.com> | 2022-08-22 12:23:23 -0700 |
---|---|---|
committer | XNNPACK Team <xnnpack-github-robot@google.com> | 2022-08-22 12:24:04 -0700 |
commit | 93b6949eff75612d35f743ef1a8c22ad0f04c30e (patch) | |
tree | 95eaecd2f708bd9cbe1738f2e3a8594520a19519 | |
parent | 1ad54d1f839712bb1d0ae48d4b7bdc5ced9f2694 (diff) | |
download | XNNPACK-93b6949eff75612d35f743ef1a8c22ad0f04c30e.tar.gz |
bfly4m1 remove multiplies by 1 and 0.
PiperOrigin-RevId: 469254069
-rw-r--r-- | BUILD.bazel | 2 | ||||
-rwxr-xr-x | CMakeLists.txt | 2 | ||||
-rw-r--r-- | bench/cs16-bfly4.cc | 2 | ||||
-rwxr-xr-x | scripts/generate-cs16-bfly4.sh | 3 | ||||
-rw-r--r-- | src/cs16-bfly4/gen/scalar-m1-x1.c | 91 | ||||
-rw-r--r-- | src/cs16-bfly4/scalar-m1.c | 71 | ||||
-rw-r--r-- | src/xnnpack/fft.h | 2 | ||||
-rw-r--r-- | test/bfly4-microkernel-tester.h | 2 | ||||
-rw-r--r-- | test/cs16-bfly4.cc | 4 | ||||
-rw-r--r-- | test/cs16-bfly4.yaml | 2 | ||||
-rwxr-xr-x | tools/generate-bfly4-test.py | 6 |
11 files changed, 83 insertions, 104 deletions
diff --git a/BUILD.bazel b/BUILD.bazel index d081abdb4..5e9dbbc16 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -618,7 +618,7 @@ PROD_SCALAR_RISCV_MICROKERNEL_SRCS = [ ] ALL_SCALAR_MICROKERNEL_SRCS = [ - "src/cs16-bfly4/gen/scalar-m1-x1.c", + "src/cs16-bfly4/scalar-m1.c", "src/cs16-bfly4/gen/scalar-x1.c", "src/cs16-bfly4/gen/scalar-x2.c", "src/cs16-bfly4/gen/scalar-x3.c", diff --git a/CMakeLists.txt b/CMakeLists.txt index 903c48cf5..a3f336a9d 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -606,7 +606,7 @@ SET(PROD_SCALAR_RISCV_MICROKERNEL_SRCS src/x32-zip/x4-scalar.c) SET(ALL_SCALAR_MICROKERNEL_SRCS - src/cs16-bfly4/gen/scalar-m1-x1.c + src/cs16-bfly4/scalar-m1.c src/cs16-bfly4/gen/scalar-x1.c src/cs16-bfly4/gen/scalar-x2.c src/cs16-bfly4/gen/scalar-x3.c diff --git a/bench/cs16-bfly4.cc b/bench/cs16-bfly4.cc index 43b25150b..e4c46d2d9 100644 --- a/bench/cs16-bfly4.cc +++ b/bench/cs16-bfly4.cc @@ -74,7 +74,7 @@ static void BenchmarkM1KernelSize(benchmark::internal::Benchmark* b) b->Args({1024, 1, 256}); } -BENCHMARK_CAPTURE(cs16_bfly4, cs16_scalar_m1_x1, xnn_cs16_bfly4m1_ukernel__scalar_x1)->Apply(BenchmarkM1KernelSize)->UseRealTime(); +BENCHMARK_CAPTURE(cs16_bfly4, cs16_scalar_m1, xnn_cs16_bfly4m1_ukernel__scalar)->Apply(BenchmarkM1KernelSize)->UseRealTime(); BENCHMARK_CAPTURE(cs16_bfly4, cs16_scalar_x1, xnn_cs16_bfly4_ukernel__scalar_x1)->Apply(BenchmarkKernelSize)->UseRealTime(); BENCHMARK_CAPTURE(cs16_bfly4, cs16_scalar_x2, xnn_cs16_bfly4_ukernel__scalar_x2)->Apply(BenchmarkKernelSize)->UseRealTime(); BENCHMARK_CAPTURE(cs16_bfly4, cs16_scalar_x3, xnn_cs16_bfly4_ukernel__scalar_x3)->Apply(BenchmarkKernelSize)->UseRealTime(); diff --git a/scripts/generate-cs16-bfly4.sh b/scripts/generate-cs16-bfly4.sh index 185c0fd13..992f61162 100755 --- a/scripts/generate-cs16-bfly4.sh +++ b/scripts/generate-cs16-bfly4.sh @@ -10,9 +10,6 @@ tools/xngen src/cs16-bfly4/scalar.c.in -D SAMPLE_TILE=2 -D M=0 -o src/cs16-bfly4 tools/xngen src/cs16-bfly4/scalar.c.in -D SAMPLE_TILE=3 -D M=0 -o src/cs16-bfly4/gen/scalar-x3.c & tools/xngen src/cs16-bfly4/scalar.c.in -D SAMPLE_TILE=4 -D M=0 -o src/cs16-bfly4/gen/scalar-x4.c & -tools/xngen src/cs16-bfly4/scalar.c.in -D SAMPLE_TILE=1 -D M=1 -o src/cs16-bfly4/gen/scalar-m1-x1.c & - - ################################## Unit tests ################################# tools/generate-bfly4-test.py --spec test/cs16-bfly4.yaml --output test/cs16-bfly4.cc & diff --git a/src/cs16-bfly4/gen/scalar-m1-x1.c b/src/cs16-bfly4/gen/scalar-m1-x1.c deleted file mode 100644 index 09c47ff40..000000000 --- a/src/cs16-bfly4/gen/scalar-m1-x1.c +++ /dev/null @@ -1,91 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/cs16-bfly4/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> -#include <stddef.h> -#include <stdint.h> - -#include <xnnpack/math.h> -#include <xnnpack/fft.h> - - -void xnn_cs16_bfly4m1_ukernel__scalar_x1( - size_t samples, - int16_t* data, - const size_t stride, - const int16_t* twiddle) { - - int16_t* out0 = data; - - assert(samples == 1); - assert(data != NULL); - assert(stride != 0); - assert(twiddle != NULL); - - - if XNN_UNLIKELY(samples != 0) { - do { - int32_t vout0r = (int32_t) out0[0]; - int32_t vout0i = (int32_t) out0[1]; - int32_t vout1r = (int32_t) out0[2]; - int32_t vout1i = (int32_t) out0[3]; - int32_t vout2r = (int32_t) out0[4]; - int32_t vout2i = (int32_t) out0[5]; - int32_t vout3r = (int32_t) out0[6]; - int32_t vout3i = (int32_t) out0[7]; - - - // Note 32767 / 4 = 8191. Should be 8192. - vout0r = math_asr_s32(vout0r * 8191 + 16384, 15); - vout0i = math_asr_s32(vout0i * 8191 + 16384, 15); - vout1r = math_asr_s32(vout1r * 8191 + 16384, 15); - vout1i = math_asr_s32(vout1i * 8191 + 16384, 15); - vout2r = math_asr_s32(vout2r * 8191 + 16384, 15); - vout2i = math_asr_s32(vout2i * 8191 + 16384, 15); - vout3r = math_asr_s32(vout3r * 8191 + 16384, 15); - vout3i = math_asr_s32(vout3i * 8191 + 16384, 15); - - // Note 32767 should be 32768 representing a multiply by 1. - const int32_t vtmp0r = math_asr_s32(vout1r * 32767 + 16384, 15); - const int32_t vtmp0i = math_asr_s32(vout1i * 32767 + 16384, 15); - const int32_t vtmp1r = math_asr_s32(vout2r * 32767 + 16384, 15); - const int32_t vtmp1i = math_asr_s32(vout2i * 32767 + 16384, 15); - const int32_t vtmp2r = math_asr_s32(vout3r * 32767 + 16384, 15); - const int32_t vtmp2i = math_asr_s32(vout3i * 32767 + 16384, 15); - - const int32_t vtmp5r = vout0r - vtmp1r; - const int32_t vtmp5i = vout0i - vtmp1i; - vout0r += vtmp1r; - vout0i += vtmp1i; - const int32_t vtmp3r = vtmp0r + vtmp2r; - const int32_t vtmp3i = vtmp0i + vtmp2i; - const int32_t vtmp4r = vtmp0r - vtmp2r; - const int32_t vtmp4i = vtmp0i - vtmp2i; - vout2r = vout0r - vtmp3r; - vout2i = vout0i - vtmp3i; - - vout0r += vtmp3r; - vout0i += vtmp3i; - - vout1r = vtmp5r + vtmp4i; - vout1i = vtmp5i - vtmp4r; - vout3r = vtmp5r - vtmp4i; - vout3i = vtmp5i + vtmp4r; - - out0[0] = (int16_t) vout0r; - out0[1] = (int16_t) vout0i; - out0[2] = (int16_t) vout1r; - out0[3] = (int16_t) vout1i; - out0[4] = (int16_t) vout2r; - out0[5] = (int16_t) vout2i; - out0[6] = (int16_t) vout3r; - out0[7] = (int16_t) vout3i; - } while(--samples != 0); - } -} diff --git a/src/cs16-bfly4/scalar-m1.c b/src/cs16-bfly4/scalar-m1.c new file mode 100644 index 000000000..d0f2af509 --- /dev/null +++ b/src/cs16-bfly4/scalar-m1.c @@ -0,0 +1,71 @@ +// Copyright 2022 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include <assert.h> +#include <stddef.h> +#include <stdint.h> + +#include <xnnpack/math.h> +#include <xnnpack/fft.h> + + +void xnn_cs16_bfly4m1_ukernel__scalar( + size_t samples, + int16_t* data, + const size_t stride, + const int16_t* twiddle) { + + assert(samples == 1); + assert(data != NULL); + assert(stride != 0); + assert(twiddle != NULL); + + int32_t vout0r = (int32_t) data[0]; + int32_t vout0i = (int32_t) data[1]; + int32_t vout1r = (int32_t) data[2]; + int32_t vout1i = (int32_t) data[3]; + int32_t vout2r = (int32_t) data[4]; + int32_t vout2i = (int32_t) data[5]; + int32_t vout3r = (int32_t) data[6]; + int32_t vout3i = (int32_t) data[7]; + + // Note 32767 / 4 = 8191. Should be 8192. + vout0r = math_asr_s32(vout0r * 8191 + 16384, 15); + vout0i = math_asr_s32(vout0i * 8191 + 16384, 15); + vout1r = math_asr_s32(vout1r * 8191 + 16384, 15); + vout1i = math_asr_s32(vout1i * 8191 + 16384, 15); + vout2r = math_asr_s32(vout2r * 8191 + 16384, 15); + vout2i = math_asr_s32(vout2i * 8191 + 16384, 15); + vout3r = math_asr_s32(vout3r * 8191 + 16384, 15); + vout3i = math_asr_s32(vout3i * 8191 + 16384, 15); + + const int32_t vtmp5r = vout0r - vout2r; + const int32_t vtmp5i = vout0i - vout2i; + vout0r += vout2r; + vout0i += vout2i; + const int32_t vtmp3r = vout1r + vout3r; + const int32_t vtmp3i = vout1i + vout3i; + const int32_t vtmp4r = vout1r - vout3r; + const int32_t vtmp4i = vout1i - vout3i; + vout2r = vout0r - vtmp3r; + vout2i = vout0i - vtmp3i; + + vout0r += vtmp3r; + vout0i += vtmp3i; + + vout1r = vtmp5r + vtmp4i; + vout1i = vtmp5i - vtmp4r; + vout3r = vtmp5r - vtmp4i; + vout3i = vtmp5i + vtmp4r; + + data[0] = (int16_t) vout0r; + data[1] = (int16_t) vout0i; + data[2] = (int16_t) vout1r; + data[3] = (int16_t) vout1i; + data[4] = (int16_t) vout2r; + data[5] = (int16_t) vout2i; + data[6] = (int16_t) vout3r; + data[7] = (int16_t) vout3i; +} diff --git a/src/xnnpack/fft.h b/src/xnnpack/fft.h index 50bb7a2e9..9aa0afd79 100644 --- a/src/xnnpack/fft.h +++ b/src/xnnpack/fft.h @@ -26,7 +26,7 @@ DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4_ukernel__scalar_x1) DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4_ukernel__scalar_x2) DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4_ukernel__scalar_x3) DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4_ukernel__scalar_x4) -DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4m1_ukernel__scalar_x1) +DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4m1_ukernel__scalar) #define DECLARE_CS16_FFTR_UKERNEL_FUNCTION(fn_name) \ XNN_INTERNAL void fn_name( \ diff --git a/test/bfly4-microkernel-tester.h b/test/bfly4-microkernel-tester.h index 32bf9bb77..ae3182b4a 100644 --- a/test/bfly4-microkernel-tester.h +++ b/test/bfly4-microkernel-tester.h @@ -215,7 +215,7 @@ class BFly4MicrokernelTester { std::random_device random_device; auto rng = std::mt19937(random_device()); auto i16rng = std::bind(std::uniform_int_distribution<int16_t>(), std::ref(rng)); - const size_t fft_size = samples() * stride() * 4; // 4 for bfly4. + const size_t fft_size = samples() == 1 ? 1 : (samples() * stride()) * 4; // 4 for bfly4. // 256 complex numbers = fft_size * 2 = 512 std::vector<int16_t> y(fft_size * 2 + XNN_EXTRA_BYTES / sizeof(int16_t)); diff --git a/test/cs16-bfly4.cc b/test/cs16-bfly4.cc index 4815f35f8..a3217ec78 100644 --- a/test/cs16-bfly4.cc +++ b/test/cs16-bfly4.cc @@ -133,9 +133,9 @@ TEST(CS16_BFLY4__SCALAR_X4, samples_eq_64) { } -TEST(CS16_BFLY4M1__SCALAR_X1, samples_eq_1) { +TEST(CS16_BFLY4M1__SCALAR, samples_eq_1) { BFly4MicrokernelTester() .samples(1) .stride(64) - .Test(xnn_cs16_bfly4m1_ukernel__scalar_x1); + .Test(xnn_cs16_bfly4m1_ukernel__scalar); } diff --git a/test/cs16-bfly4.yaml b/test/cs16-bfly4.yaml index a8d2f129b..a9ddc8e58 100644 --- a/test/cs16-bfly4.yaml +++ b/test/cs16-bfly4.yaml @@ -9,4 +9,4 @@ - name: xnn_cs16_bfly4_ukernel__scalar_x2 - name: xnn_cs16_bfly4_ukernel__scalar_x3 - name: xnn_cs16_bfly4_ukernel__scalar_x4 -- name: xnn_cs16_bfly4m1_ukernel__scalar_x1 +- name: xnn_cs16_bfly4m1_ukernel__scalar diff --git a/tools/generate-bfly4-test.py b/tools/generate-bfly4-test.py index 3160bbf6a..1bd2a31a7 100755 --- a/tools/generate-bfly4-test.py +++ b/tools/generate-bfly4-test.py @@ -28,11 +28,13 @@ parser.set_defaults(defines=list()) def split_ukernel_name(name): m = 0 - match = re.fullmatch(r"xnn_cs16_bfly4(m(\d+))?_ukernel__(.+)_x(\d+)", name) + samples_tile = 1 + match = re.fullmatch(r"xnn_cs16_bfly4(m(\d+))?_ukernel__(.+)(_x(\d+))?", name) assert match is not None if match.group(2): m = int(match.group(2)) - samples_tile = int(match.group(4)) + if match.group(5): + samples_tile = int(match.group(5)) arch, isa = xnncommon.parse_target_name(target_name=match.group(3)) return m, samples_tile, arch, isa |