author    XNNPACK Team <xnnpack-github-robot@google.com>  2020-10-23 21:10:15 -0700
committer XNNPACK Team <xnnpack-github-robot@google.com>  2020-10-23 21:10:56 -0700
commit    965272bc1d5d060c7b9a4c3c47f3ac7e96dde3af (patch)
tree      2a36cff9a2ced646dae8c052530c91220b065dca
parent    bf715f9159179086d3027bc74b625281efd20889 (diff)
download  XNNPACK-965272bc1d5d060c7b9a4c3c47f3ac7e96dde3af.tar.gz
Add WebAssembly SIMD IBILINEAR microkernels for CHW layout
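
These microkernels evaluate bilinear interpolation for tensors in CHW layout, iterating over channels and vectorizing across output pixels within each channel (4 or 8 pixels per step). A minimal sketch of the per-pixel computation the kernels vectorize (names are illustrative only, not the XNNPACK API):

  static inline float ibilinear_chw_pixel(
      float top_left, float top_right, float bottom_left, float bottom_right,
      float alpha_h, float alpha_v)
  {
    // Interpolate vertically along the left and right edges, then horizontally between them.
    const float left  = top_left  + alpha_v * (bottom_left  - top_left);
    const float right = top_right + alpha_v * (bottom_right - top_right);
    return left + alpha_h * (right - left);
  }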
PiperOrigin-RevId: 338792392
-rw-r--r--  BUILD.bazel                              |   2
-rw-r--r--  include/xnnpack.h                        |   2
-rwxr-xr-x  scripts/generate-f32-ibilinear-chw.sh    |  10
-rw-r--r--  src/f32-ibilinear-chw/gen/wasmsimd-p4.c  | 173
-rw-r--r--  src/f32-ibilinear-chw/gen/wasmsimd-p8.c  | 243
-rw-r--r--  src/f32-ibilinear-chw/scalar.c.in        |   3
-rw-r--r--  src/f32-ibilinear-chw/wasmsimd.c.in      | 210
-rw-r--r--  src/xnnpack/ibilinear.h                  |   2
-rw-r--r--  test/f32-ibilinear-chw.cc                | 193
-rw-r--r--  test/f32-ibilinear-chw.yaml              |   2
-rwxr-xr-x  tools/generate-ibilinear-chw-test.py     |  22
11 files changed, 842 insertions, 20 deletions
diff --git a/BUILD.bazel b/BUILD.bazel
index 62542a2c0..4a984f17e 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -775,6 +775,8 @@ WASMSIMD_UKERNELS = [
"src/f32-hswish/gen/hswish-wasmsimd-x16.c",
"src/f32-ibilinear/gen/wasmsimd-c4.c",
"src/f32-ibilinear/gen/wasmsimd-c8.c",
+ "src/f32-ibilinear-chw/gen/wasmsimd-p4.c",
+ "src/f32-ibilinear-chw/gen/wasmsimd-p8.c",
"src/f32-igemm/gen/1x8-minmax-wasmsimd-loadsplat-arm.c",
"src/f32-igemm/gen/1x8-minmax-wasmsimd-loadsplat-x86.c",
"src/f32-igemm/gen/1x8-minmax-wasmsimd-splat-arm.c",
diff --git a/include/xnnpack.h b/include/xnnpack.h
index 806f985d4..355533a9f 100644
--- a/include/xnnpack.h
+++ b/include/xnnpack.h
@@ -19,7 +19,7 @@ extern "C" {
#endif
/// The number of bytes XNNPACK may read beyond array bounds.
-/// The caller must allocate at this this many extra bytes after the tensor data passed to XNNPACK.
+/// The caller must allocate at least this many extra bytes after the tensor data passed to XNNPACK.
///
/// Note: XNNPACK reads, but never writes beyond array bounds.
#define XNN_EXTRA_BYTES 16
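A minimal sketch of the allocation contract described in that comment (illustrative only; the buffer and size names are made up, not an XNNPACK API):

  // Reserve the tensor payload plus XNN_EXTRA_BYTES so XNNPACK's over-reads stay in bounds.
  float* input_data = (float*) malloc(num_elements * sizeof(float) + XNN_EXTRA_BYTES);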
diff --git a/scripts/generate-f32-ibilinear-chw.sh b/scripts/generate-f32-ibilinear-chw.sh
index 159130498..845a71cdd 100755
--- a/scripts/generate-f32-ibilinear-chw.sh
+++ b/scripts/generate-f32-ibilinear-chw.sh
@@ -5,9 +5,13 @@
# LICENSE file in the root directory of this source tree.
#################################### Scalar ###################################
-tools/xngen src/f32-ibilinear-chw/scalar.c.in -D CHANNEL_TILE=1 -D PIXEL_TILE=1 -o src/f32-ibilinear-chw/gen/scalar-p1.c
-tools/xngen src/f32-ibilinear-chw/scalar.c.in -D CHANNEL_TILE=1 -D PIXEL_TILE=2 -o src/f32-ibilinear-chw/gen/scalar-p2.c
-tools/xngen src/f32-ibilinear-chw/scalar.c.in -D CHANNEL_TILE=1 -D PIXEL_TILE=4 -o src/f32-ibilinear-chw/gen/scalar-p4.c
+tools/xngen src/f32-ibilinear-chw/scalar.c.in -D PIXEL_TILE=1 -o src/f32-ibilinear-chw/gen/scalar-p1.c
+tools/xngen src/f32-ibilinear-chw/scalar.c.in -D PIXEL_TILE=2 -o src/f32-ibilinear-chw/gen/scalar-p2.c
+tools/xngen src/f32-ibilinear-chw/scalar.c.in -D PIXEL_TILE=4 -o src/f32-ibilinear-chw/gen/scalar-p4.c
+
+############################### WebAssembly SIMD ##############################
+tools/xngen src/f32-ibilinear-chw/wasmsimd.c.in -D PIXEL_TILE=4 -o src/f32-ibilinear-chw/gen/wasmsimd-p4.c
+tools/xngen src/f32-ibilinear-chw/wasmsimd.c.in -D PIXEL_TILE=8 -o src/f32-ibilinear-chw/gen/wasmsimd-p8.c
################################## Unit tests #################################
tools/generate-ibilinear-chw-test.py --spec test/f32-ibilinear-chw.yaml --output test/f32-ibilinear-chw.cc
diff --git a/src/f32-ibilinear-chw/gen/wasmsimd-p4.c b/src/f32-ibilinear-chw/gen/wasmsimd-p4.c
new file mode 100644
index 000000000..a08772d1b
--- /dev/null
+++ b/src/f32-ibilinear-chw/gen/wasmsimd-p4.c
@@ -0,0 +1,173 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-ibilinear-chw/wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/ibilinear.h>
+
+
+void xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4(
+ size_t output_pixels,
+ size_t channels,
+ const float**restrict input,
+ size_t input_offset,
+ const float*restrict weights,
+ float*restrict output,
+ size_t input_increment) XNN_DISABLE_TSAN
+{
+ assert(output_pixels != 0);
+ assert(channels != 0);
+ assert(input_increment % sizeof(float) == 0);
+
+ do {
+ const float** i = input;
+
+ const float* w = weights;
+
+ // The code is best read starting from the bottom (i.e. the scalar case).
+ // Please read the comments there first; only the differences are explained in vectorized versions.
+
+ size_t p = output_pixels;
+
+ for (; p >= 4; p -= 4) {
+ // Process quadruples of output pixels, each of which requires reading four input pixels.
+
+ // Separate the alternating weights for 4 pixels into two registers.
+ const v128_t vw0 = wasm_v128_load(w);
+ const v128_t vw1 = wasm_v128_load(w + 4);
+ const v128_t valphah = wasm_v32x4_shuffle(vw0, vw1, 0, 2, 4, 6);
+ const v128_t valphav = wasm_v32x4_shuffle(vw0, vw1, 1, 3, 5, 7);
+ w += 2 * 4;
+
+ // Read out pairs of (top-left, top-right) and (bottom-left, bottom-right) pixels
+ // into separate registers as in the scalar case.
+ const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
+ const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
+ const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
+ const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
+ const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset);
+ const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset);
+ const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset);
+ const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset);
+ i += 2 * 4;
+
+ const v128_t vtltr01 = wasm_f64x2_make(*(const double*) itl0, *(const double*) itl1);
+ const v128_t vblbr01 = wasm_f64x2_make(*(const double*) ibl0, *(const double*) ibl1);
+ const v128_t vtltr23 = wasm_f64x2_make(*(const double*) itl2, *(const double*) itl3);
+ const v128_t vblbr23 = wasm_f64x2_make(*(const double*) ibl2, *(const double*) ibl3);
+
+ const v128_t vldrd01 = wasm_f32x4_sub(vblbr01, vtltr01);
+ const v128_t vldrd23 = wasm_f32x4_sub(vblbr23, vtltr23);
+
+ // Shuffle to isolate `left_diff` and `right_diff`, packed in a single `v128` for all 4 pixels.
+ const v128_t vld = wasm_v32x4_shuffle(vldrd01, vldrd23, 0, 2, 4, 6);
+ const v128_t vrd = wasm_v32x4_shuffle(vldrd01, vldrd23, 1, 3, 5, 7);
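+    // Lane layout after the shuffles: vldrd01 = [ld0, rd0, ld1, rd1] and vldrd23 = [ld2, rd2, ld3, rd3],
+    // so vld = [ld0, ld1, ld2, ld3] and vrd = [rd0, rd1, rd2, rd3], where ldN/rdN are the
+    // left/right differences for output pixel N.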
+
+ // Shuffle to isolate `top_left` and `top_right`, packed in a single `v128` for all 4 pixels.
+ const v128_t vtl = wasm_v32x4_shuffle(vtltr01, vtltr23, 0, 2, 4, 6);
+ const v128_t vtr = wasm_v32x4_shuffle(vtltr01, vtltr23, 1, 3, 5, 7);
+
+ // Compute `left` from the equations (*).
+ const v128_t vl = wasm_f32x4_add(vtl, wasm_f32x4_mul(vld, valphav));
+ // Compute `right` from the equations (*).
+ const v128_t vr = wasm_f32x4_add(vtr, wasm_f32x4_mul(vrd, valphav));
+
+ // Compute the result according to (*).
+ const v128_t vd = wasm_f32x4_sub(vr, vl);
+ const v128_t vo = wasm_f32x4_add(vl, wasm_f32x4_mul(vd, valphah));
+
+ wasm_v128_store(output, vo);
+ output += 4;
+ }
+
+ if XNN_UNLIKELY(p != 0) {
+ if (p & 2) {
+ // This can be understood as a truncated version of the 4-pixel case above.
+
+ const v128_t vw = wasm_v128_load(w);
+ w += 2 * 2;
+
+ const v128_t valphah = wasm_v32x4_shuffle(vw, vw, 0, 2, 0, 2);
+ const v128_t valphav = wasm_v32x4_shuffle(vw, vw, 1, 3, 1, 3);
+
+ const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
+ const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
+ const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
+ const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
+ i += 2 * 2;
+
+ const v128_t vtltr = wasm_f64x2_make(*(const double*) itl0, *(const double*) itl1);
+ const v128_t vblbr = wasm_f64x2_make(*(const double*) ibl0, *(const double*) ibl1);
+
+ const v128_t vldrd = wasm_f32x4_sub(vblbr, vtltr);
+ const v128_t vld = wasm_v32x4_shuffle(vldrd, vldrd, 0, 2, 0, 2);
+ const v128_t vrd = wasm_v32x4_shuffle(vldrd, vldrd, 1, 3, 1, 3);
+
+ const v128_t vtl = wasm_v32x4_shuffle(vtltr, vtltr, 0, 2, 0, 2);
+ const v128_t vtr = wasm_v32x4_shuffle(vtltr, vtltr, 1, 3, 1, 3);
+
+ const v128_t vl = wasm_f32x4_add(vtl, wasm_f32x4_mul(vld, valphav));
+ const v128_t vr = wasm_f32x4_add(vtr, wasm_f32x4_mul(vrd, valphav));
+
+ const v128_t vd = wasm_f32x4_sub(vr, vl);
+ const v128_t vo = wasm_f32x4_add(vl, wasm_f32x4_mul(vd, valphah));
+
+ *((double*) output) = wasm_f64x2_extract_lane(vo, 0);
+ output += 2;
+ }
+
+ if (p & 1) {
+ // We are computing the following formula:
+ // result = (1 - alpha_h) * (1 - alpha_v) * top_left +
+ // alpha_h * (1 - alpha_v) * top_right +
+ // (1 - alpha_h) * alpha_v * bottom_left +
+ // alpha_h * alpha_v * bottom_right.
+ // Rearranging gives (*):
+ // result = left + alpha_h * (right - left),
+ // where
+ // left = top_left + alpha_v * (bottom_left - top_left),
+ // right = top_right + alpha_v * (bottom_right - top_right).
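+      // For example, with alpha_h = 0.25, alpha_v = 0.5, top_left = 0, top_right = 4,
+      // bottom_left = 2, bottom_right = 6: left = 0 + 0.5 * (2 - 0) = 1,
+      // right = 4 + 0.5 * (6 - 4) = 5, and result = 1 + 0.25 * (5 - 1) = 2,
+      // which matches the direct four-term formula above.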
+
+ const v128_t vw = wasm_v64x2_load_splat((const double*) w);
+ w += 2;
+
+ const float alphah = wasm_f32x4_extract_lane(vw, 0);
+ const v128_t valphav = wasm_v32x4_shuffle(vw, vw, 1, 1, 1, 1);
+
+ // Read adjacent top-left and top-right pixels into one register,
+ // and bottom-left and bottom-right into another.
+
+ const float* itl = (const float*) ((uintptr_t) i[0] + input_offset);
+ const float* ibl = (const float*) ((uintptr_t) i[1] + input_offset);
+ i += 2;
+
+ const v128_t vtltr = wasm_v64x2_load_splat(itl);
+ const v128_t vblbr = wasm_v64x2_load_splat(ibl);
+
+ // Compute at once (**):
+ // left_diff = bottom_left - top_left
+ // right_diff = bottom_right - top_right
+ const v128_t vldrd = wasm_f32x4_sub(vblbr, vtltr);
+
+ // Compute at once `left` and `right` from the equations.
+ const v128_t vlr = wasm_f32x4_add(vtltr, wasm_f32x4_mul(vldrd, valphav));
+
+ // Extract them and compute the result.
+ const float l = wasm_f32x4_extract_lane(vlr, 0);
+ const float r = wasm_f32x4_extract_lane(vlr, 1);
+
+ *output++ = l + alphah * (r - l);
+ }
+ }
+
+ input_offset += input_increment;
+ } while (--channels != 0);
+}
diff --git a/src/f32-ibilinear-chw/gen/wasmsimd-p8.c b/src/f32-ibilinear-chw/gen/wasmsimd-p8.c
new file mode 100644
index 000000000..da60589de
--- /dev/null
+++ b/src/f32-ibilinear-chw/gen/wasmsimd-p8.c
@@ -0,0 +1,243 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-ibilinear-chw/wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/ibilinear.h>
+
+
+void xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8(
+ size_t output_pixels,
+ size_t channels,
+ const float**restrict input,
+ size_t input_offset,
+ const float*restrict weights,
+ float*restrict output,
+ size_t input_increment) XNN_DISABLE_TSAN
+{
+ assert(output_pixels != 0);
+ assert(channels != 0);
+ assert(input_increment % sizeof(float) == 0);
+
+ do {
+ const float** i = input;
+
+ const float* w = weights;
+
+ // The code is best read starting from the bottom (i.e. the scalar case).
+ // Please read the comments there first; only the differences are explained in vectorized versions.
+
+ size_t p = output_pixels;
+ for (; p >= 8; p -= 8) {
+ // This is just an unrolled loop for `PIXEL_TILE` of 4.
+
+ const v128_t vw0_0123 = wasm_v128_load(w + 0);
+ const v128_t vw1_0123 = wasm_v128_load(w + 4);
+ const v128_t valphah0123 = wasm_v32x4_shuffle(vw0_0123, vw1_0123, 0, 2, 4, 6);
+ const v128_t valphav0123 = wasm_v32x4_shuffle(vw0_0123, vw1_0123, 1, 3, 5, 7);
+ const v128_t vw0_4567 = wasm_v128_load(w + 8);
+ const v128_t vw1_4567 = wasm_v128_load(w + 12);
+ const v128_t valphah4567 = wasm_v32x4_shuffle(vw0_4567, vw1_4567, 0, 2, 4, 6);
+ const v128_t valphav4567 = wasm_v32x4_shuffle(vw0_4567, vw1_4567, 1, 3, 5, 7);
+ w += 2 * 8;
+
+ const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
+ const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
+ const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
+ const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
+ const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset);
+ const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset);
+ const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset);
+ const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset);
+ const float* itl4 = (const float*) ((uintptr_t) i[8] + input_offset);
+ const float* ibl4 = (const float*) ((uintptr_t) i[9] + input_offset);
+ const float* itl5 = (const float*) ((uintptr_t) i[10] + input_offset);
+ const float* ibl5 = (const float*) ((uintptr_t) i[11] + input_offset);
+ const float* itl6 = (const float*) ((uintptr_t) i[12] + input_offset);
+ const float* ibl6 = (const float*) ((uintptr_t) i[13] + input_offset);
+ const float* itl7 = (const float*) ((uintptr_t) i[14] + input_offset);
+ const float* ibl7 = (const float*) ((uintptr_t) i[15] + input_offset);
+ i += 2 * 8;
+
+ const v128_t vtltr01 = wasm_f64x2_make(*(const double*) itl0, *(const double*) itl1);
+ const v128_t vblbr01 = wasm_f64x2_make(*(const double*) ibl0, *(const double*) ibl1);
+ const v128_t vtltr23 = wasm_f64x2_make(*(const double*) itl2, *(const double*) itl3);
+ const v128_t vblbr23 = wasm_f64x2_make(*(const double*) ibl2, *(const double*) ibl3);
+ const v128_t vtltr45 = wasm_f64x2_make(*(const double*) itl4, *(const double*) itl5);
+ const v128_t vblbr45 = wasm_f64x2_make(*(const double*) ibl4, *(const double*) ibl5);
+ const v128_t vtltr67 = wasm_f64x2_make(*(const double*) itl6, *(const double*) itl7);
+ const v128_t vblbr67 = wasm_f64x2_make(*(const double*) ibl6, *(const double*) ibl7);
+
+ const v128_t vldrd01 = wasm_f32x4_sub(vblbr01, vtltr01);
+ const v128_t vldrd23 = wasm_f32x4_sub(vblbr23, vtltr23);
+ const v128_t vldrd45 = wasm_f32x4_sub(vblbr45, vtltr45);
+ const v128_t vldrd67 = wasm_f32x4_sub(vblbr67, vtltr67);
+
+ const v128_t vld0123 = wasm_v32x4_shuffle(vldrd01, vldrd23, 0, 2, 4, 6);
+ const v128_t vrd0123 = wasm_v32x4_shuffle(vldrd01, vldrd23, 1, 3, 5, 7);
+ const v128_t vld4567 = wasm_v32x4_shuffle(vldrd45, vldrd67, 0, 2, 4, 6);
+ const v128_t vrd4567 = wasm_v32x4_shuffle(vldrd45, vldrd67, 1, 3, 5, 7);
+
+ const v128_t vtl0123 = wasm_v32x4_shuffle(vtltr01, vtltr23, 0, 2, 4, 6);
+ const v128_t vtr0123 = wasm_v32x4_shuffle(vtltr01, vtltr23, 1, 3, 5, 7);
+ const v128_t vtl4567 = wasm_v32x4_shuffle(vtltr45, vtltr67, 0, 2, 4, 6);
+ const v128_t vtr4567 = wasm_v32x4_shuffle(vtltr45, vtltr67, 1, 3, 5, 7);
+
+ const v128_t vl0123 = wasm_f32x4_add(vtl0123, wasm_f32x4_mul(vld0123, valphav0123));
+ const v128_t vr0123 = wasm_f32x4_add(vtr0123, wasm_f32x4_mul(vrd0123, valphav0123));
+ const v128_t vl4567 = wasm_f32x4_add(vtl4567, wasm_f32x4_mul(vld4567, valphav4567));
+ const v128_t vr4567 = wasm_f32x4_add(vtr4567, wasm_f32x4_mul(vrd4567, valphav4567));
+
+ const v128_t vd0123 = wasm_f32x4_sub(vr0123, vl0123);
+ const v128_t vd4567 = wasm_f32x4_sub(vr4567, vl4567);
+
+ const v128_t vo0123 = wasm_f32x4_add(vl0123, wasm_f32x4_mul(vd0123, valphah0123));
+ const v128_t vo4567 = wasm_f32x4_add(vl4567, wasm_f32x4_mul(vd4567, valphah4567));
+
+ wasm_v128_store(output + 0, vo0123);
+ wasm_v128_store(output + 4, vo4567);
+ output += 8;
+ }
+
+ for (; p >= 4; p -= 4) {
+ // Process quadruples of output pixels, each of which requires reading four input pixels.
+
+ // Separate the alternating weights for 4 pixels into two registers.
+ const v128_t vw0 = wasm_v128_load(w);
+ const v128_t vw1 = wasm_v128_load(w + 4);
+ const v128_t valphah = wasm_v32x4_shuffle(vw0, vw1, 0, 2, 4, 6);
+ const v128_t valphav = wasm_v32x4_shuffle(vw0, vw1, 1, 3, 5, 7);
+ w += 2 * 4;
+
+ // Read out pairs of (top-left, top-right) and (bottom-left, bottom-right) pixels
+ // into separate registers as in the scalar case.
+ const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
+ const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
+ const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
+ const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
+ const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset);
+ const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset);
+ const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset);
+ const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset);
+ i += 2 * 4;
+
+ const v128_t vtltr01 = wasm_f64x2_make(*(const double*) itl0, *(const double*) itl1);
+ const v128_t vblbr01 = wasm_f64x2_make(*(const double*) ibl0, *(const double*) ibl1);
+ const v128_t vtltr23 = wasm_f64x2_make(*(const double*) itl2, *(const double*) itl3);
+ const v128_t vblbr23 = wasm_f64x2_make(*(const double*) ibl2, *(const double*) ibl3);
+
+ const v128_t vldrd01 = wasm_f32x4_sub(vblbr01, vtltr01);
+ const v128_t vldrd23 = wasm_f32x4_sub(vblbr23, vtltr23);
+
+ // Shuffle to isolate `left_diff` and `right_diff`, packed in a single `v128` for all 4 pixels.
+ const v128_t vld = wasm_v32x4_shuffle(vldrd01, vldrd23, 0, 2, 4, 6);
+ const v128_t vrd = wasm_v32x4_shuffle(vldrd01, vldrd23, 1, 3, 5, 7);
+
+ // Shuffle to isolate `top_left` and `top_right`, packed in a single `v128` for all 4 pixels.
+ const v128_t vtl = wasm_v32x4_shuffle(vtltr01, vtltr23, 0, 2, 4, 6);
+ const v128_t vtr = wasm_v32x4_shuffle(vtltr01, vtltr23, 1, 3, 5, 7);
+
+ // Compute `left` from the equations (*).
+ const v128_t vl = wasm_f32x4_add(vtl, wasm_f32x4_mul(vld, valphav));
+ // Compute `right` from the equations (*).
+ const v128_t vr = wasm_f32x4_add(vtr, wasm_f32x4_mul(vrd, valphav));
+
+ // Compute the result according to (*).
+ const v128_t vd = wasm_f32x4_sub(vr, vl);
+ const v128_t vo = wasm_f32x4_add(vl, wasm_f32x4_mul(vd, valphah));
+
+ wasm_v128_store(output, vo);
+ output += 4;
+ }
+
+ if XNN_UNLIKELY(p != 0) {
+ if (p & 2) {
+ // This can be understood as a truncated version of the 4-pixel case above.
+
+ const v128_t vw = wasm_v128_load(w);
+ w += 2 * 2;
+
+ const v128_t valphah = wasm_v32x4_shuffle(vw, vw, 0, 2, 0, 2);
+ const v128_t valphav = wasm_v32x4_shuffle(vw, vw, 1, 3, 1, 3);
+
+ const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
+ const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
+ const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
+ const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
+ i += 2 * 2;
+
+ const v128_t vtltr = wasm_f64x2_make(*(const double*) itl0, *(const double*) itl1);
+ const v128_t vblbr = wasm_f64x2_make(*(const double*) ibl0, *(const double*) ibl1);
+
+ const v128_t vldrd = wasm_f32x4_sub(vblbr, vtltr);
+ const v128_t vld = wasm_v32x4_shuffle(vldrd, vldrd, 0, 2, 0, 2);
+ const v128_t vrd = wasm_v32x4_shuffle(vldrd, vldrd, 1, 3, 1, 3);
+
+ const v128_t vtl = wasm_v32x4_shuffle(vtltr, vtltr, 0, 2, 0, 2);
+ const v128_t vtr = wasm_v32x4_shuffle(vtltr, vtltr, 1, 3, 1, 3);
+
+ const v128_t vl = wasm_f32x4_add(vtl, wasm_f32x4_mul(vld, valphav));
+ const v128_t vr = wasm_f32x4_add(vtr, wasm_f32x4_mul(vrd, valphav));
+
+ const v128_t vd = wasm_f32x4_sub(vr, vl);
+ const v128_t vo = wasm_f32x4_add(vl, wasm_f32x4_mul(vd, valphah));
+
+ *((double*) output) = wasm_f64x2_extract_lane(vo, 0);
+ output += 2;
+ }
+
+ if (p & 1) {
+ // We are computing the following formula:
+ // result = (1 - alpha_h) * (1 - alpha_v) * top_left +
+ // alpha_h * (1 - alpha_v) * top_right +
+ // (1 - alpha_h) * alpha_v * bottom_left +
+ // alpha_h * alpha_v * bottom_right.
+ // Rearranging gives (*):
+ // result = left + alpha_h * (right - left),
+ // where
+ // left = top_left + alpha_v * (bottom_left - top_left),
+ // right = top_right + alpha_v * (bottom_right - top_right).
+
+ const v128_t vw = wasm_v64x2_load_splat((const double*) w);
+ w += 2;
+
+ const float alphah = wasm_f32x4_extract_lane(vw, 0);
+ const v128_t valphav = wasm_v32x4_shuffle(vw, vw, 1, 1, 1, 1);
+
+ // Read adjacent top-left and top-right pixels into one register,
+ // and bottom-left and bottom-right into another.
+
+ const float* itl = (const float*) ((uintptr_t) i[0] + input_offset);
+ const float* ibl = (const float*) ((uintptr_t) i[1] + input_offset);
+ i += 2;
+
+ const v128_t vtltr = wasm_v64x2_load_splat(itl);
+ const v128_t vblbr = wasm_v64x2_load_splat(ibl);
+
+ // Compute at once (**):
+ // left_diff = bottom_left - top_left
+ // right_diff = bottom_right - top_right
+ const v128_t vldrd = wasm_f32x4_sub(vblbr, vtltr);
+
+ // Compute at once `left` and `right` from the equations.
+ const v128_t vlr = wasm_f32x4_add(vtltr, wasm_f32x4_mul(vldrd, valphav));
+
+ // Extract them and compute the result.
+ const float l = wasm_f32x4_extract_lane(vlr, 0);
+ const float r = wasm_f32x4_extract_lane(vlr, 1);
+
+ *output++ = l + alphah * (r - l);
+ }
+ }
+
+ input_offset += input_increment;
+ } while (--channels != 0);
+}
diff --git a/src/f32-ibilinear-chw/scalar.c.in b/src/f32-ibilinear-chw/scalar.c.in
index 0ebeef93d..12adfa4c0 100644
--- a/src/f32-ibilinear-chw/scalar.c.in
+++ b/src/f32-ibilinear-chw/scalar.c.in
@@ -3,7 +3,6 @@
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
-$assert CHANNEL_TILE == 1
$assert PIXEL_TILE >= 1
$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
#include <assert.h>
@@ -11,7 +10,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
#include <xnnpack/ibilinear.h>
-void xnn_f32_ibilinear_chw_ukernel__scalar_p${PIXEL_TILE}${"" if CHANNEL_TILE == 1 else "x%d" % CHANNEL_TILE}(
+void xnn_f32_ibilinear_chw_ukernel__scalar_p${PIXEL_TILE}(
size_t output_pixels,
size_t channels,
const float**restrict input,
diff --git a/src/f32-ibilinear-chw/wasmsimd.c.in b/src/f32-ibilinear-chw/wasmsimd.c.in
new file mode 100644
index 000000000..b5b034c38
--- /dev/null
+++ b/src/f32-ibilinear-chw/wasmsimd.c.in
@@ -0,0 +1,210 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert PIXEL_TILE >= 1
+$assert PIXEL_TILE % 4 == 0
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/ibilinear.h>
+
+
+void xnn_f32_ibilinear_chw_ukernel__wasmsimd_p${PIXEL_TILE}(
+ size_t output_pixels,
+ size_t channels,
+ const float**restrict input,
+ size_t input_offset,
+ const float*restrict weights,
+ float*restrict output,
+ size_t input_increment) XNN_DISABLE_TSAN
+{
+ assert(output_pixels != 0);
+ assert(channels != 0);
+ assert(input_increment % sizeof(float) == 0);
+
+ do {
+ const float** i = input;
+
+ const float* w = weights;
+
+ // The code is best read starting from the bottom (i.e. the scalar case).
+ // Please read the comments there first; only the differences are explained in vectorized versions.
+
+ size_t p = output_pixels;
+ $if PIXEL_TILE > 4:
+ for (; p >= ${PIXEL_TILE}; p -= ${PIXEL_TILE}) {
+ // This is just an unrolled loop for `PIXEL_TILE` of 4.
+
+ $for P in range(0, PIXEL_TILE, 4):
+ const v128_t vw0_${ABC[P:P+4]} = wasm_v128_load(w + ${2 * P});
+ const v128_t vw1_${ABC[P:P+4]} = wasm_v128_load(w + ${2 * P + 4});
+ const v128_t valphah${ABC[P:P+4]} = wasm_v32x4_shuffle(vw0_${ABC[P:P+4]}, vw1_${ABC[P:P+4]}, 0, 2, 4, 6);
+ const v128_t valphav${ABC[P:P+4]} = wasm_v32x4_shuffle(vw0_${ABC[P:P+4]}, vw1_${ABC[P:P+4]}, 1, 3, 5, 7);
+ w += 2 * ${PIXEL_TILE};
+
+ $for P in range(PIXEL_TILE):
+ const float* itl${ABC[P]} = (const float*) ((uintptr_t) i[${2 * P}] + input_offset);
+ const float* ibl${ABC[P]} = (const float*) ((uintptr_t) i[${2 * P + 1}] + input_offset);
+ i += 2 * ${PIXEL_TILE};
+
+ $for P in range(0, PIXEL_TILE, 2):
+ const v128_t vtltr${ABC[P:P+2]} = wasm_f64x2_make(*(const double*) itl${ABC[P]}, *(const double*) itl${ABC[P+1]});
+ const v128_t vblbr${ABC[P:P+2]} = wasm_f64x2_make(*(const double*) ibl${ABC[P]}, *(const double*) ibl${ABC[P+1]});
+
+ $for P in range(0, PIXEL_TILE, 2):
+ const v128_t vldrd${ABC[P:P+2]} = wasm_f32x4_sub(vblbr${ABC[P:P+2]}, vtltr${ABC[P:P+2]});
+
+ $for P in range(0, PIXEL_TILE, 4):
+ const v128_t vld${ABC[P:P+4]} = wasm_v32x4_shuffle(vldrd${ABC[P:P+2]}, vldrd${ABC[P+2:P+4]}, 0, 2, 4, 6);
+ const v128_t vrd${ABC[P:P+4]} = wasm_v32x4_shuffle(vldrd${ABC[P:P+2]}, vldrd${ABC[P+2:P+4]}, 1, 3, 5, 7);
+
+ $for P in range(0, PIXEL_TILE, 4):
+ const v128_t vtl${ABC[P:P+4]} = wasm_v32x4_shuffle(vtltr${ABC[P:P+2]}, vtltr${ABC[P+2:P+4]}, 0, 2, 4, 6);
+ const v128_t vtr${ABC[P:P+4]} = wasm_v32x4_shuffle(vtltr${ABC[P:P+2]}, vtltr${ABC[P+2:P+4]}, 1, 3, 5, 7);
+
+ $for P in range(0, PIXEL_TILE, 4):
+ const v128_t vl${ABC[P:P+4]} = wasm_f32x4_add(vtl${ABC[P:P+4]}, wasm_f32x4_mul(vld${ABC[P:P+4]}, valphav${ABC[P:P+4]}));
+ const v128_t vr${ABC[P:P+4]} = wasm_f32x4_add(vtr${ABC[P:P+4]}, wasm_f32x4_mul(vrd${ABC[P:P+4]}, valphav${ABC[P:P+4]}));
+
+ $for P in range(0, PIXEL_TILE, 4):
+ const v128_t vd${ABC[P:P+4]} = wasm_f32x4_sub(vr${ABC[P:P+4]}, vl${ABC[P:P+4]});
+
+ $for P in range(0, PIXEL_TILE, 4):
+ const v128_t vo${ABC[P:P+4]} = wasm_f32x4_add(vl${ABC[P:P+4]}, wasm_f32x4_mul(vd${ABC[P:P+4]}, valphah${ABC[P:P+4]}));
+
+ $for P in range(0, PIXEL_TILE, 4):
+ wasm_v128_store(output + ${P}, vo${ABC[P:P+4]});
+ output += ${PIXEL_TILE};
+ }
+
+ for (; p >= 4; p -= 4) {
+ // Process quadruples of output pixels, each of which requires reading four input pixels.
+
+ // Separate the alternating weights for 4 pixels into two registers.
+ const v128_t vw0 = wasm_v128_load(w);
+ const v128_t vw1 = wasm_v128_load(w + 4);
+ const v128_t valphah = wasm_v32x4_shuffle(vw0, vw1, 0, 2, 4, 6);
+ const v128_t valphav = wasm_v32x4_shuffle(vw0, vw1, 1, 3, 5, 7);
+ w += 2 * 4;
+
+ // Read out pairs of (top-left, top-right) and (bottom-left, bottom-right) pixels
+ // into separate registers as in the scalar case.
+ $for P in range(4):
+ const float* itl${P} = (const float*) ((uintptr_t) i[${2 * P}] + input_offset);
+ const float* ibl${P} = (const float*) ((uintptr_t) i[${2 * P + 1}] + input_offset);
+ i += 2 * 4;
+
+ $for P in range(0, 4, 2):
+ const v128_t vtltr${ABC[P:P+2]} = wasm_f64x2_make(*(const double*) itl${P}, *(const double*) itl${P+1});
+ const v128_t vblbr${ABC[P:P+2]} = wasm_f64x2_make(*(const double*) ibl${P}, *(const double*) ibl${P+1});
+
+ $for P in range(0, 4, 2):
+ const v128_t vldrd${ABC[P:P+2]} = wasm_f32x4_sub(vblbr${ABC[P:P+2]}, vtltr${ABC[P:P+2]});
+
+ // Shuffle to isolate `left_diff` and `right_diff`, packed in a single `v128` for all 4 pixels.
+ const v128_t vld = wasm_v32x4_shuffle(vldrd01, vldrd23, 0, 2, 4, 6);
+ const v128_t vrd = wasm_v32x4_shuffle(vldrd01, vldrd23, 1, 3, 5, 7);
+
+ // Shuffle to isolate `top_left` and `top_right`, packed in a single `v128` for all 4 pixels.
+ const v128_t vtl = wasm_v32x4_shuffle(vtltr01, vtltr23, 0, 2, 4, 6);
+ const v128_t vtr = wasm_v32x4_shuffle(vtltr01, vtltr23, 1, 3, 5, 7);
+
+ // Compute `left` from the equations (*).
+ const v128_t vl = wasm_f32x4_add(vtl, wasm_f32x4_mul(vld, valphav));
+ // Compute `right` from the equations (*).
+ const v128_t vr = wasm_f32x4_add(vtr, wasm_f32x4_mul(vrd, valphav));
+
+ // Compute the result according to (*).
+ const v128_t vd = wasm_f32x4_sub(vr, vl);
+ const v128_t vo = wasm_f32x4_add(vl, wasm_f32x4_mul(vd, valphah));
+
+ wasm_v128_store(output, vo);
+ output += 4;
+ }
+
+ if XNN_UNLIKELY(p != 0) {
+ if (p & 2) {
+ // This can be understood as a truncated version of the 4-pixel case above.
+
+ const v128_t vw = wasm_v128_load(w);
+ w += 2 * 2;
+
+ const v128_t valphah = wasm_v32x4_shuffle(vw, vw, 0, 2, 0, 2);
+ const v128_t valphav = wasm_v32x4_shuffle(vw, vw, 1, 3, 1, 3);
+
+ $for P in range(2):
+ const float* itl${P} = (const float*) ((uintptr_t) i[${2 * P}] + input_offset);
+ const float* ibl${P} = (const float*) ((uintptr_t) i[${2 * P + 1}] + input_offset);
+ i += 2 * 2;
+
+ const v128_t vtltr = wasm_f64x2_make(*(const double*) itl0, *(const double*) itl1);
+ const v128_t vblbr = wasm_f64x2_make(*(const double*) ibl0, *(const double*) ibl1);
+
+ const v128_t vldrd = wasm_f32x4_sub(vblbr, vtltr);
+ const v128_t vld = wasm_v32x4_shuffle(vldrd, vldrd, 0, 2, 0, 2);
+ const v128_t vrd = wasm_v32x4_shuffle(vldrd, vldrd, 1, 3, 1, 3);
+
+ const v128_t vtl = wasm_v32x4_shuffle(vtltr, vtltr, 0, 2, 0, 2);
+ const v128_t vtr = wasm_v32x4_shuffle(vtltr, vtltr, 1, 3, 1, 3);
+
+ const v128_t vl = wasm_f32x4_add(vtl, wasm_f32x4_mul(vld, valphav));
+ const v128_t vr = wasm_f32x4_add(vtr, wasm_f32x4_mul(vrd, valphav));
+
+ const v128_t vd = wasm_f32x4_sub(vr, vl);
+ const v128_t vo = wasm_f32x4_add(vl, wasm_f32x4_mul(vd, valphah));
+
+ *((double*) output) = wasm_f64x2_extract_lane(vo, 0);
+ output += 2;
+ }
+
+ if (p & 1) {
+ // We are computing the following formula:
+ // result = (1 - alpha_h) * (1 - alpha_v) * top_left +
+ // alpha_h * (1 - alpha_v) * top_right +
+ // (1 - alpha_h) * alpha_v * bottom_left +
+ // alpha_h * alpha_v * bottom_right.
+ // Rearranging gives (*):
+ // result = left + alpha_h * (right - left),
+ // where
+ // left = top_left + alpha_v * (bottom_left - top_left),
+ // right = top_right + alpha_v * (bottom_right - top_right).
+
+ const v128_t vw = wasm_v64x2_load_splat((const double*) w);
+ w += 2;
+
+ const float alphah = wasm_f32x4_extract_lane(vw, 0);
+ const v128_t valphav = wasm_v32x4_shuffle(vw, vw, 1, 1, 1, 1);
+
+ // Read adjacent top-left and top-right pixels into one register,
+ // and bottom-left and bottom-right into another.
+
+ const float* itl = (const float*) ((uintptr_t) i[0] + input_offset);
+ const float* ibl = (const float*) ((uintptr_t) i[1] + input_offset);
+ i += 2;
+
+ const v128_t vtltr = wasm_v64x2_load_splat(itl);
+ const v128_t vblbr = wasm_v64x2_load_splat(ibl);
+
+ // Compute at once (**):
+ // left_diff = bottom_left - top_left
+ // right_diff = bottom_right - top_right
+ const v128_t vldrd = wasm_f32x4_sub(vblbr, vtltr);
+
+ // Compute at once `left` and `right` from the equations.
+ const v128_t vlr = wasm_f32x4_add(vtltr, wasm_f32x4_mul(vldrd, valphav));
+
+ // Extract them and compute the result.
+ const float l = wasm_f32x4_extract_lane(vlr, 0);
+ const float r = wasm_f32x4_extract_lane(vlr, 1);
+
+ *output++ = l + alphah * (r - l);
+ }
+ }
+
+ input_offset += input_increment;
+ } while (--channels != 0);
+}
diff --git a/src/xnnpack/ibilinear.h b/src/xnnpack/ibilinear.h
index 5c4252da0..5af6b8c6c 100644
--- a/src/xnnpack/ibilinear.h
+++ b/src/xnnpack/ibilinear.h
@@ -56,6 +56,8 @@ DECLARE_F32_IBILINEAR_CHW_UKERNEL_FUNCTION(xnn_f32_ibilinear_chw_ukernel__scalar
DECLARE_F32_IBILINEAR_CHW_UKERNEL_FUNCTION(xnn_f32_ibilinear_chw_ukernel__scalar_p2)
DECLARE_F32_IBILINEAR_CHW_UKERNEL_FUNCTION(xnn_f32_ibilinear_chw_ukernel__scalar_p4)
+DECLARE_F32_IBILINEAR_CHW_UKERNEL_FUNCTION(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4)
+DECLARE_F32_IBILINEAR_CHW_UKERNEL_FUNCTION(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8)
#ifdef __cplusplus
} // extern "C"
diff --git a/test/f32-ibilinear-chw.cc b/test/f32-ibilinear-chw.cc
index a41440c49..e4939e29b 100644
--- a/test/f32-ibilinear-chw.cc
+++ b/test/f32-ibilinear-chw.cc
@@ -33,6 +33,15 @@ TEST(F32_IBILINEAR_CHW__SCALAR_P1, pixels_gt_1) {
}
}
+TEST(F32_IBILINEAR_CHW__SCALAR_P1, channels_eq_1) {
+ for (size_t pixels = 1; pixels <= 5; pixels += 1) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(1)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p1);
+ }
+}
+
TEST(F32_IBILINEAR_CHW__SCALAR_P1, channels_gt_1) {
for (size_t channels = 2; channels < 3; channels++) {
for (size_t pixels = 1; pixels <= 5; pixels += 1) {
@@ -103,7 +112,16 @@ TEST(F32_IBILINEAR_CHW__SCALAR_P2, pixels_gt_2) {
}
}
-TEST(F32_IBILINEAR_CHW__SCALAR_P2, channels_gt_2) {
+TEST(F32_IBILINEAR_CHW__SCALAR_P2, channels_eq_1) {
+ for (size_t pixels = 1; pixels <= 10; pixels += 1) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(1)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p2);
+ }
+}
+
+TEST(F32_IBILINEAR_CHW__SCALAR_P2, channels_gt_1) {
for (size_t channels = 2; channels < 3; channels++) {
for (size_t pixels = 1; pixels <= 10; pixels += 1) {
IBilinearMicrokernelTester()
@@ -173,7 +191,16 @@ TEST(F32_IBILINEAR_CHW__SCALAR_P4, pixels_gt_4) {
}
}
-TEST(F32_IBILINEAR_CHW__SCALAR_P4, channels_gt_4) {
+TEST(F32_IBILINEAR_CHW__SCALAR_P4, channels_eq_1) {
+ for (size_t pixels = 1; pixels <= 20; pixels += 3) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(1)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p4);
+ }
+}
+
+TEST(F32_IBILINEAR_CHW__SCALAR_P4, channels_gt_1) {
for (size_t channels = 2; channels < 3; channels++) {
for (size_t pixels = 1; pixels <= 20; pixels += 3) {
IBilinearMicrokernelTester()
@@ -207,3 +234,165 @@ TEST(F32_IBILINEAR_CHW__SCALAR_P4, input_stride) {
}
}
}
+
+
+#if XNN_ARCH_WASMSIMD
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P4, pixels_eq_4) {
+ IBilinearMicrokernelTester()
+ .pixels(4)
+ .channels(1)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4);
+ }
+
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P4, pixels_div_4) {
+ for (size_t pixels = 8; pixels < 40; pixels += 4) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(1)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4);
+ }
+ }
+
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P4, pixels_lt_4) {
+ for (size_t pixels = 1; pixels < 4; pixels++) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(1)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4);
+ }
+ }
+
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P4, pixels_gt_4) {
+ for (size_t pixels = 5; pixels < 8; pixels++) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(1)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4);
+ }
+ }
+
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P4, channels_eq_1) {
+ for (size_t pixels = 1; pixels <= 20; pixels += 3) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(1)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4);
+ }
+ }
+
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P4, channels_gt_1) {
+ for (size_t channels = 2; channels < 3; channels++) {
+ for (size_t pixels = 1; pixels <= 20; pixels += 3) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(channels)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4);
+ }
+ }
+ }
+
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P4, input_offset) {
+ for (size_t pixels = 1; pixels < 20; pixels += 3) {
+ for (size_t channels = 1; channels <= 5; channels += 1) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(channels)
+ .input_offset(7)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4);
+ }
+ }
+ }
+
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P4, input_stride) {
+ for (size_t pixels = 1; pixels < 20; pixels += 3) {
+ for (size_t channels = 1; channels <= 5; channels += 1) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(channels)
+ .input_stride(83)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4);
+ }
+ }
+ }
+#endif // XNN_ARCH_WASMSIMD
+
+
+#if XNN_ARCH_WASMSIMD
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P8, pixels_eq_8) {
+ IBilinearMicrokernelTester()
+ .pixels(8)
+ .channels(1)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8);
+ }
+
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P8, pixels_div_8) {
+ for (size_t pixels = 16; pixels < 80; pixels += 8) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(1)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8);
+ }
+ }
+
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P8, pixels_lt_8) {
+ for (size_t pixels = 1; pixels < 8; pixels++) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(1)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8);
+ }
+ }
+
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P8, pixels_gt_8) {
+ for (size_t pixels = 9; pixels < 16; pixels++) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(1)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8);
+ }
+ }
+
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P8, channels_eq_1) {
+ for (size_t pixels = 1; pixels <= 40; pixels += 7) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(1)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8);
+ }
+ }
+
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P8, channels_gt_1) {
+ for (size_t channels = 2; channels < 3; channels++) {
+ for (size_t pixels = 1; pixels <= 40; pixels += 7) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(channels)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8);
+ }
+ }
+ }
+
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P8, input_offset) {
+ for (size_t pixels = 1; pixels < 40; pixels += 7) {
+ for (size_t channels = 1; channels <= 5; channels += 1) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(channels)
+ .input_offset(7)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8);
+ }
+ }
+ }
+
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P8, input_stride) {
+ for (size_t pixels = 1; pixels < 40; pixels += 7) {
+ for (size_t channels = 1; channels <= 5; channels += 1) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(channels)
+ .input_stride(163)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8);
+ }
+ }
+ }
+#endif // XNN_ARCH_WASMSIMD
diff --git a/test/f32-ibilinear-chw.yaml b/test/f32-ibilinear-chw.yaml
index 15ca0d156..dcc5eeb1e 100644
--- a/test/f32-ibilinear-chw.yaml
+++ b/test/f32-ibilinear-chw.yaml
@@ -5,3 +5,5 @@
- name: xnn_f32_ibilinear_chw_ukernel__scalar_p1
- name: xnn_f32_ibilinear_chw_ukernel__scalar_p2
- name: xnn_f32_ibilinear_chw_ukernel__scalar_p4
+- name: xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4
+- name: xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8
diff --git a/tools/generate-ibilinear-chw-test.py b/tools/generate-ibilinear-chw-test.py
index f1a1efbcf..fb3fa4b16 100755
--- a/tools/generate-ibilinear-chw-test.py
+++ b/tools/generate-ibilinear-chw-test.py
@@ -94,20 +94,18 @@ $if CHANNEL_TILE > 1:
}
}
- TEST(${TEST_NAME}, channels_lt_${PIXEL_TILE}) {
- $if ISA_CHECK:
- ${ISA_CHECK};
- for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
- for (size_t pixels = 1; pixels <= ${PIXEL_TILE * 5}; pixels += ${max(1, PIXEL_TILE - 1)}) {
- IBilinearMicrokernelTester()
- .pixels(pixels)
- .channels(channels)
- .TestCHW(${TEST_FUNC});
- }
- }
+TEST(${TEST_NAME}, channels_eq_1) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pixels = 1; pixels <= ${PIXEL_TILE * 5}; pixels += ${max(1, PIXEL_TILE - 1)}) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(1)
+ .TestCHW(${TEST_FUNC});
}
+}
-TEST(${TEST_NAME}, channels_gt_${PIXEL_TILE}) {
+TEST(${TEST_NAME}, channels_gt_1) {
$if ISA_CHECK:
${ISA_CHECK};
for (size_t channels = ${CHANNEL_TILE+1}; channels < ${max(CHANNEL_TILE*2, 3)}; channels++) {