aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Frank Barchard <fbarchard@google.com> 2022-08-22 12:23:23 -0700
committer: XNNPACK Team <xnnpack-github-robot@google.com> 2022-08-22 12:24:04 -0700
commit: 93b6949eff75612d35f743ef1a8c22ad0f04c30e (patch)
tree: 95eaecd2f708bd9cbe1738f2e3a8594520a19519
parent: 1ad54d1f839712bb1d0ae48d4b7bdc5ced9f2694 (diff)
download: XNNPACK-93b6949eff75612d35f743ef1a8c22ad0f04c30e.tar.gz
bfly4m1 remove multiplies by 1 and 0.
PiperOrigin-RevId: 469254069
-rw-r--r-- BUILD.bazel 2
-rwxr-xr-x CMakeLists.txt 2
-rw-r--r-- bench/cs16-bfly4.cc 2
-rwxr-xr-x scripts/generate-cs16-bfly4.sh 3
-rw-r--r-- src/cs16-bfly4/gen/scalar-m1-x1.c 91
-rw-r--r-- src/cs16-bfly4/scalar-m1.c 71
-rw-r--r-- src/xnnpack/fft.h 2
-rw-r--r-- test/bfly4-microkernel-tester.h 2
-rw-r--r-- test/cs16-bfly4.cc 4
-rw-r--r-- test/cs16-bfly4.yaml 2
-rwxr-xr-x tools/generate-bfly4-test.py 6
11 files changed, 83 insertions, 104 deletions
diff --git a/BUILD.bazel b/BUILD.bazel
index d081abdb4..5e9dbbc16 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -618,7 +618,7 @@ PROD_SCALAR_RISCV_MICROKERNEL_SRCS = [
]
ALL_SCALAR_MICROKERNEL_SRCS = [
- "src/cs16-bfly4/gen/scalar-m1-x1.c",
+ "src/cs16-bfly4/scalar-m1.c",
"src/cs16-bfly4/gen/scalar-x1.c",
"src/cs16-bfly4/gen/scalar-x2.c",
"src/cs16-bfly4/gen/scalar-x3.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 903c48cf5..a3f336a9d 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -606,7 +606,7 @@ SET(PROD_SCALAR_RISCV_MICROKERNEL_SRCS
src/x32-zip/x4-scalar.c)
SET(ALL_SCALAR_MICROKERNEL_SRCS
- src/cs16-bfly4/gen/scalar-m1-x1.c
+ src/cs16-bfly4/scalar-m1.c
src/cs16-bfly4/gen/scalar-x1.c
src/cs16-bfly4/gen/scalar-x2.c
src/cs16-bfly4/gen/scalar-x3.c
diff --git a/bench/cs16-bfly4.cc b/bench/cs16-bfly4.cc
index 43b25150b..e4c46d2d9 100644
--- a/bench/cs16-bfly4.cc
+++ b/bench/cs16-bfly4.cc
@@ -74,7 +74,7 @@ static void BenchmarkM1KernelSize(benchmark::internal::Benchmark* b)
b->Args({1024, 1, 256});
}
-BENCHMARK_CAPTURE(cs16_bfly4, cs16_scalar_m1_x1, xnn_cs16_bfly4m1_ukernel__scalar_x1)->Apply(BenchmarkM1KernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(cs16_bfly4, cs16_scalar_m1, xnn_cs16_bfly4m1_ukernel__scalar)->Apply(BenchmarkM1KernelSize)->UseRealTime();
BENCHMARK_CAPTURE(cs16_bfly4, cs16_scalar_x1, xnn_cs16_bfly4_ukernel__scalar_x1)->Apply(BenchmarkKernelSize)->UseRealTime();
BENCHMARK_CAPTURE(cs16_bfly4, cs16_scalar_x2, xnn_cs16_bfly4_ukernel__scalar_x2)->Apply(BenchmarkKernelSize)->UseRealTime();
BENCHMARK_CAPTURE(cs16_bfly4, cs16_scalar_x3, xnn_cs16_bfly4_ukernel__scalar_x3)->Apply(BenchmarkKernelSize)->UseRealTime();
diff --git a/scripts/generate-cs16-bfly4.sh b/scripts/generate-cs16-bfly4.sh
index 185c0fd13..992f61162 100755
--- a/scripts/generate-cs16-bfly4.sh
+++ b/scripts/generate-cs16-bfly4.sh
@@ -10,9 +10,6 @@ tools/xngen src/cs16-bfly4/scalar.c.in -D SAMPLE_TILE=2 -D M=0 -o src/cs16-bfly4
tools/xngen src/cs16-bfly4/scalar.c.in -D SAMPLE_TILE=3 -D M=0 -o src/cs16-bfly4/gen/scalar-x3.c &
tools/xngen src/cs16-bfly4/scalar.c.in -D SAMPLE_TILE=4 -D M=0 -o src/cs16-bfly4/gen/scalar-x4.c &
-tools/xngen src/cs16-bfly4/scalar.c.in -D SAMPLE_TILE=1 -D M=1 -o src/cs16-bfly4/gen/scalar-m1-x1.c &
-
-
################################## Unit tests #################################
tools/generate-bfly4-test.py --spec test/cs16-bfly4.yaml --output test/cs16-bfly4.cc &
diff --git a/src/cs16-bfly4/gen/scalar-m1-x1.c b/src/cs16-bfly4/gen/scalar-m1-x1.c
deleted file mode 100644
index 09c47ff40..000000000
--- a/src/cs16-bfly4/gen/scalar-m1-x1.c
+++ /dev/null
@@ -1,91 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/cs16-bfly4/scalar.c.in
-// Generator: tools/xngen
-//
-// Copyright 2022 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-#include <stddef.h>
-#include <stdint.h>
-
-#include <xnnpack/math.h>
-#include <xnnpack/fft.h>
-
-
-void xnn_cs16_bfly4m1_ukernel__scalar_x1(
- size_t samples,
- int16_t* data,
- const size_t stride,
- const int16_t* twiddle) {
-
- int16_t* out0 = data;
-
- assert(samples == 1);
- assert(data != NULL);
- assert(stride != 0);
- assert(twiddle != NULL);
-
-
- if XNN_UNLIKELY(samples != 0) {
- do {
- int32_t vout0r = (int32_t) out0[0];
- int32_t vout0i = (int32_t) out0[1];
- int32_t vout1r = (int32_t) out0[2];
- int32_t vout1i = (int32_t) out0[3];
- int32_t vout2r = (int32_t) out0[4];
- int32_t vout2i = (int32_t) out0[5];
- int32_t vout3r = (int32_t) out0[6];
- int32_t vout3i = (int32_t) out0[7];
-
-
- // Note 32767 / 4 = 8191. Should be 8192.
- vout0r = math_asr_s32(vout0r * 8191 + 16384, 15);
- vout0i = math_asr_s32(vout0i * 8191 + 16384, 15);
- vout1r = math_asr_s32(vout1r * 8191 + 16384, 15);
- vout1i = math_asr_s32(vout1i * 8191 + 16384, 15);
- vout2r = math_asr_s32(vout2r * 8191 + 16384, 15);
- vout2i = math_asr_s32(vout2i * 8191 + 16384, 15);
- vout3r = math_asr_s32(vout3r * 8191 + 16384, 15);
- vout3i = math_asr_s32(vout3i * 8191 + 16384, 15);
-
- // Note 32767 should be 32768 representing a multiply by 1.
- const int32_t vtmp0r = math_asr_s32(vout1r * 32767 + 16384, 15);
- const int32_t vtmp0i = math_asr_s32(vout1i * 32767 + 16384, 15);
- const int32_t vtmp1r = math_asr_s32(vout2r * 32767 + 16384, 15);
- const int32_t vtmp1i = math_asr_s32(vout2i * 32767 + 16384, 15);
- const int32_t vtmp2r = math_asr_s32(vout3r * 32767 + 16384, 15);
- const int32_t vtmp2i = math_asr_s32(vout3i * 32767 + 16384, 15);
-
- const int32_t vtmp5r = vout0r - vtmp1r;
- const int32_t vtmp5i = vout0i - vtmp1i;
- vout0r += vtmp1r;
- vout0i += vtmp1i;
- const int32_t vtmp3r = vtmp0r + vtmp2r;
- const int32_t vtmp3i = vtmp0i + vtmp2i;
- const int32_t vtmp4r = vtmp0r - vtmp2r;
- const int32_t vtmp4i = vtmp0i - vtmp2i;
- vout2r = vout0r - vtmp3r;
- vout2i = vout0i - vtmp3i;
-
- vout0r += vtmp3r;
- vout0i += vtmp3i;
-
- vout1r = vtmp5r + vtmp4i;
- vout1i = vtmp5i - vtmp4r;
- vout3r = vtmp5r - vtmp4i;
- vout3i = vtmp5i + vtmp4r;
-
- out0[0] = (int16_t) vout0r;
- out0[1] = (int16_t) vout0i;
- out0[2] = (int16_t) vout1r;
- out0[3] = (int16_t) vout1i;
- out0[4] = (int16_t) vout2r;
- out0[5] = (int16_t) vout2i;
- out0[6] = (int16_t) vout3r;
- out0[7] = (int16_t) vout3i;
- } while(--samples != 0);
- }
-}
diff --git a/src/cs16-bfly4/scalar-m1.c b/src/cs16-bfly4/scalar-m1.c
new file mode 100644
index 000000000..d0f2af509
--- /dev/null
+++ b/src/cs16-bfly4/scalar-m1.c
@@ -0,0 +1,71 @@
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/math.h>
+#include <xnnpack/fft.h>
+
+
+void xnn_cs16_bfly4m1_ukernel__scalar(  // In-place 4-point butterfly on data[0..7], read as four (real, imag) int16 pairs.
+ size_t samples,  // Asserted to equal 1; not otherwise used.
+ int16_t* data,  // Read and then overwritten at indices 0..7.
+ const size_t stride,  // Only asserted non-zero; not otherwise used.
+ const int16_t* twiddle) {  // Only asserted non-NULL; never dereferenced in this kernel.
+
+ assert(samples == 1);
+ assert(data != NULL);
+ assert(stride != 0);
+ assert(twiddle != NULL);
+
+ int32_t vout0r = (int32_t) data[0];  // Widen every component to int32 before the multiplies below.
+ int32_t vout0i = (int32_t) data[1];
+ int32_t vout1r = (int32_t) data[2];
+ int32_t vout1i = (int32_t) data[3];
+ int32_t vout2r = (int32_t) data[4];
+ int32_t vout2i = (int32_t) data[5];
+ int32_t vout3r = (int32_t) data[6];
+ int32_t vout3i = (int32_t) data[7];
+
+ // Scale every component by 8191 / 32768 (about 1/4). Note 32767 / 4 = 8191; an exact 1/4 would use 8192.
+ vout0r = math_asr_s32(vout0r * 8191 + 16384, 15);  // 16384 is half of 1 << 15; math_asr_s32 (xnnpack/math.h, not shown) is presumably an arithmetic right shift.
+ vout0i = math_asr_s32(vout0i * 8191 + 16384, 15);
+ vout1r = math_asr_s32(vout1r * 8191 + 16384, 15);
+ vout1i = math_asr_s32(vout1i * 8191 + 16384, 15);
+ vout2r = math_asr_s32(vout2r * 8191 + 16384, 15);
+ vout2i = math_asr_s32(vout2i * 8191 + 16384, 15);
+ vout3r = math_asr_s32(vout3r * 8191 + 16384, 15);
+ vout3i = math_asr_s32(vout3i * 8191 + 16384, 15);
+
+ const int32_t vtmp5r = vout0r - vout2r;  // vtmp5 = in0 - in2 (scaled inputs).
+ const int32_t vtmp5i = vout0i - vout2i;
+ vout0r += vout2r;  // vout0 now holds in0 + in2.
+ vout0i += vout2i;
+ const int32_t vtmp3r = vout1r + vout3r;  // vtmp3 = in1 + in3.
+ const int32_t vtmp3i = vout1i + vout3i;
+ const int32_t vtmp4r = vout1r - vout3r;  // vtmp4 = in1 - in3.
+ const int32_t vtmp4i = vout1i - vout3i;
+ vout2r = vout0r - vtmp3r;  // out2 = (in0 + in2) - (in1 + in3).
+ vout2i = vout0i - vtmp3i;
+
+ vout0r += vtmp3r;  // out0 = (in0 + in2) + (in1 + in3).
+ vout0i += vtmp3i;
+
+ vout1r = vtmp5r + vtmp4i;  // out1 = vtmp5 - j * vtmp4.
+ vout1i = vtmp5i - vtmp4r;
+ vout3r = vtmp5r - vtmp4i;  // out3 = vtmp5 + j * vtmp4.
+ vout3i = vtmp5i + vtmp4r;
+
+ data[0] = (int16_t) vout0r;  // Results are narrowed back to int16 with a plain cast.
+ data[1] = (int16_t) vout0i;
+ data[2] = (int16_t) vout1r;
+ data[3] = (int16_t) vout1i;
+ data[4] = (int16_t) vout2r;
+ data[5] = (int16_t) vout2i;
+ data[6] = (int16_t) vout3r;
+ data[7] = (int16_t) vout3i;
+}
diff --git a/src/xnnpack/fft.h b/src/xnnpack/fft.h
index 50bb7a2e9..9aa0afd79 100644
--- a/src/xnnpack/fft.h
+++ b/src/xnnpack/fft.h
@@ -26,7 +26,7 @@ DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4_ukernel__scalar_x1)
DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4_ukernel__scalar_x2)
DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4_ukernel__scalar_x3)
DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4_ukernel__scalar_x4)
-DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4m1_ukernel__scalar_x1)
+DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4m1_ukernel__scalar)
#define DECLARE_CS16_FFTR_UKERNEL_FUNCTION(fn_name) \
XNN_INTERNAL void fn_name( \
diff --git a/test/bfly4-microkernel-tester.h b/test/bfly4-microkernel-tester.h
index 32bf9bb77..ae3182b4a 100644
--- a/test/bfly4-microkernel-tester.h
+++ b/test/bfly4-microkernel-tester.h
@@ -215,7 +215,7 @@ class BFly4MicrokernelTester {
std::random_device random_device;
auto rng = std::mt19937(random_device());
auto i16rng = std::bind(std::uniform_int_distribution<int16_t>(), std::ref(rng));
- const size_t fft_size = samples() * stride() * 4; // 4 for bfly4.
+ const size_t fft_size = samples() == 1 ? 1 : (samples() * stride()) * 4; // 4 for bfly4.
// 256 complex numbers = fft_size * 2 = 512
std::vector<int16_t> y(fft_size * 2 + XNN_EXTRA_BYTES / sizeof(int16_t));
diff --git a/test/cs16-bfly4.cc b/test/cs16-bfly4.cc
index 4815f35f8..a3217ec78 100644
--- a/test/cs16-bfly4.cc
+++ b/test/cs16-bfly4.cc
@@ -133,9 +133,9 @@ TEST(CS16_BFLY4__SCALAR_X4, samples_eq_64) {
}
-TEST(CS16_BFLY4M1__SCALAR_X1, samples_eq_1) {
+TEST(CS16_BFLY4M1__SCALAR, samples_eq_1) {
BFly4MicrokernelTester()
.samples(1)
.stride(64)
- .Test(xnn_cs16_bfly4m1_ukernel__scalar_x1);
+ .Test(xnn_cs16_bfly4m1_ukernel__scalar);
}
diff --git a/test/cs16-bfly4.yaml b/test/cs16-bfly4.yaml
index a8d2f129b..a9ddc8e58 100644
--- a/test/cs16-bfly4.yaml
+++ b/test/cs16-bfly4.yaml
@@ -9,4 +9,4 @@
- name: xnn_cs16_bfly4_ukernel__scalar_x2
- name: xnn_cs16_bfly4_ukernel__scalar_x3
- name: xnn_cs16_bfly4_ukernel__scalar_x4
-- name: xnn_cs16_bfly4m1_ukernel__scalar_x1
+- name: xnn_cs16_bfly4m1_ukernel__scalar
diff --git a/tools/generate-bfly4-test.py b/tools/generate-bfly4-test.py
index 3160bbf6a..1bd2a31a7 100755
--- a/tools/generate-bfly4-test.py
+++ b/tools/generate-bfly4-test.py
@@ -28,11 +28,13 @@ parser.set_defaults(defines=list())
def split_ukernel_name(name):
m = 0
- match = re.fullmatch(r"xnn_cs16_bfly4(m(\d+))?_ukernel__(.+)_x(\d+)", name)
+ samples_tile = 1
+ match = re.fullmatch(r"xnn_cs16_bfly4(m(\d+))?_ukernel__(.+?)(_x(\d+))?", name)
assert match is not None
if match.group(2):
m = int(match.group(2))
- samples_tile = int(match.group(4))
+ if match.group(5):
+ samples_tile = int(match.group(5))
arch, isa = xnncommon.parse_target_name(target_name=match.group(3))
return m, samples_tile, arch, isa