author    XNNPACK Team <xnnpack-github-robot@google.com>  2020-10-23 21:10:15 -0700
committer XNNPACK Team <xnnpack-github-robot@google.com>  2020-10-23 21:10:56 -0700
commit    965272bc1d5d060c7b9a4c3c47f3ac7e96dde3af (patch)
tree      2a36cff9a2ced646dae8c052530c91220b065dca
parent    bf715f9159179086d3027bc74b625281efd20889 (diff)
download  XNNPACK-965272bc1d5d060c7b9a4c3c47f3ac7e96dde3af.tar.gz
Add WebAssembly SIMD IBILINEAR microkernels for CHW layout
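
These microkernels evaluate bilinear interpolation for tensors in CHW layout, iterating over channels and vectorizing across output pixels within each channel (4 or 8 pixels per step). A minimal sketch of the per-pixel computation the kernels vectorize (names are illustrative only, not the XNNPACK API):

  static inline float ibilinear_chw_pixel(
      float top_left, float top_right, float bottom_left, float bottom_right,
      float alpha_h, float alpha_v)
  {
    // Interpolate vertically along the left and right edges, then horizontally between them.
    const float left  = top_left  + alpha_v * (bottom_left  - top_left);
    const float right = top_right + alpha_v * (bottom_right - top_right);
    return left + alpha_h * (right - left);
  }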
PiperOrigin-RevId: 338792392
-rw-r--r--  BUILD.bazel                              |   2
-rw-r--r--  include/xnnpack.h                        |   2
-rwxr-xr-x  scripts/generate-f32-ibilinear-chw.sh    |  10
-rw-r--r--  src/f32-ibilinear-chw/gen/wasmsimd-p4.c  | 173
-rw-r--r--  src/f32-ibilinear-chw/gen/wasmsimd-p8.c  | 243
-rw-r--r--  src/f32-ibilinear-chw/scalar.c.in        |   3
-rw-r--r--  src/f32-ibilinear-chw/wasmsimd.c.in      | 210
-rw-r--r--  src/xnnpack/ibilinear.h                  |   2
-rw-r--r--  test/f32-ibilinear-chw.cc                | 193
-rw-r--r--  test/f32-ibilinear-chw.yaml              |   2
-rwxr-xr-x  tools/generate-ibilinear-chw-test.py     |  22
11 files changed, 842 insertions, 20 deletions
diff --git a/BUILD.bazel b/BUILD.bazel
index 62542a2c0..4a984f17e 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -775,6 +775,8 @@ WASMSIMD_UKERNELS = [
"src/f32-hswish/gen/hswish-wasmsimd-x16.c",
"src/f32-ibilinear/gen/wasmsimd-c4.c",
"src/f32-ibilinear/gen/wasmsimd-c8.c",
+ "src/f32-ibilinear-chw/gen/wasmsimd-p4.c",
+ "src/f32-ibilinear-chw/gen/wasmsimd-p8.c",
"src/f32-igemm/gen/1x8-minmax-wasmsimd-loadsplat-arm.c",
"src/f32-igemm/gen/1x8-minmax-wasmsimd-loadsplat-x86.c",
"src/f32-igemm/gen/1x8-minmax-wasmsimd-splat-arm.c",
diff --git a/include/xnnpack.h b/include/xnnpack.h
index 806f985d4..355533a9f 100644
--- a/include/xnnpack.h
+++ b/include/xnnpack.h
@@ -19,7 +19,7 @@ extern "C" {
#endif
/// The number of bytes XNNPACK may read beyond array bounds.
-/// The caller must allocate at this this many extra bytes after the tensor data passed to XNNPACK.
+/// The caller must allocate at least this many extra bytes after the tensor data passed to XNNPACK.
///
/// Note: XNNPACK reads, but never writes beyond array bounds.
#define XNN_EXTRA_BYTES 16
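A minimal sketch of the allocation contract described in that comment (illustrative only; the buffer and size names are made up, not an XNNPACK API):

  // Reserve the tensor payload plus XNN_EXTRA_BYTES so XNNPACK's over-reads stay in bounds.
  float* input_data = (float*) malloc(num_elements * sizeof(float) + XNN_EXTRA_BYTES);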
diff --git a/scripts/generate-f32-ibilinear-chw.sh b/scripts/generate-f32-ibilinear-chw.sh
index 159130498..845a71cdd 100755
--- a/scripts/generate-f32-ibilinear-chw.sh
+++ b/scripts/generate-f32-ibilinear-chw.sh
@@ -5,9 +5,13 @@
# LICENSE file in the root directory of this source tree.
#################################### Scalar ###################################
-tools/xngen src/f32-ibilinear-chw/scalar.c.in -D CHANNEL_TILE=1 -D PIXEL_TILE=1 -o src/f32-ibilinear-chw/gen/scalar-p1.c
-tools/xngen src/f32-ibilinear-chw/scalar.c.in -D CHANNEL_TILE=1 -D PIXEL_TILE=2 -o src/f32-ibilinear-chw/gen/scalar-p2.c
-tools/xngen src/f32-ibilinear-chw/scalar.c.in -D CHANNEL_TILE=1 -D PIXEL_TILE=4 -o src/f32-ibilinear-chw/gen/scalar-p4.c
+tools/xngen src/f32-ibilinear-chw/scalar.c.in -D PIXEL_TILE=1 -o src/f32-ibilinear-chw/gen/scalar-p1.c
+tools/xngen src/f32-ibilinear-chw/scalar.c.in -D PIXEL_TILE=2 -o src/f32-ibilinear-chw/gen/scalar-p2.c
+tools/xngen src/f32-ibilinear-chw/scalar.c.in -D PIXEL_TILE=4 -o src/f32-ibilinear-chw/gen/scalar-p4.c
+
+############################### WebAssembly SIMD ##############################
+tools/xngen src/f32-ibilinear-chw/wasmsimd.c.in -D PIXEL_TILE=4 -o src/f32-ibilinear-chw/gen/wasmsimd-p4.c
+tools/xngen src/f32-ibilinear-chw/wasmsimd.c.in -D PIXEL_TILE=8 -o src/f32-ibilinear-chw/gen/wasmsimd-p8.c
################################## Unit tests #################################
tools/generate-ibilinear-chw-test.py --spec test/f32-ibilinear-chw.yaml --output test/f32-ibilinear-chw.cc
diff --git a/src/f32-ibilinear-chw/gen/wasmsimd-p4.c b/src/f32-ibilinear-chw/gen/wasmsimd-p4.c
new file mode 100644
index 000000000..a08772d1b
--- /dev/null
+++ b/src/f32-ibilinear-chw/gen/wasmsimd-p4.c
@@ -0,0 +1,173 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-ibilinear-chw/wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/ibilinear.h>
+
+
+void xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4(
+ size_t output_pixels,
+ size_t channels,
+ const float**restrict input,
+ size_t input_offset,
+ const float*restrict weights,
+ float*restrict output,
+ size_t input_increment) XNN_DISABLE_TSAN
+{
+ assert(output_pixels != 0);
+ assert(channels != 0);
+ assert(input_increment % sizeof(float) == 0);
+
+ do {
+ const float** i = input;
+
+ const float* w = weights;
+
+ // The code is best read starting from the bottom (i.e. the scalar case).
+ // Please read the comments there first; only the differences are explained in vectorized versions.
+
+ size_t p = output_pixels;
+
+ for (; p >= 4; p -= 4) {
+ // Process quadruples of output pixels, each of which requires reading four input pixels.
+
+ // Separate the alternating weights for 4 pixels into two registers.
+ const v128_t vw0 = wasm_v128_load(w);
+ const v128_t vw1 = wasm_v128_load(w + 4);
+ const v128_t valphah = wasm_v32x4_shuffle(vw0, vw1, 0, 2, 4, 6);
+ const v128_t valphav = wasm_v32x4_shuffle(vw0, vw1, 1, 3, 5, 7);
+ w += 2 * 4;
+
+ // Read out pairs of (top-left, top-right) and (bottom-left, bottom-right) pixels
+ // into separate registers as in the scalar case.
+ const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
+ const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
+ const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
+ const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
+ const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset);
+ const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset);
+ const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset);
+ const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset);
+ i += 2 * 4;
+
+ const v128_t vtltr01 = wasm_f64x2_make(*(const double*) itl0, *(const double*) itl1);
+ const v128_t vblbr01 = wasm_f64x2_make(*(const double*) ibl0, *(const double*) ibl1);
+ const v128_t vtltr23 = wasm_f64x2_make(*(const double*) itl2, *(const double*) itl3);
+ const v128_t vblbr23 = wasm_f64x2_make(*(const double*) ibl2, *(const double*) ibl3);
+
+ const v128_t vldrd01 = wasm_f32x4_sub(vblbr01, vtltr01);
+ const v128_t vldrd23 = wasm_f32x4_sub(vblbr23, vtltr23);
+
+ // Shuffle to isolate `left_diff` and `right_diff`, packed in a single `v128` for all 4 pixels.
+ const v128_t vld = wasm_v32x4_shuffle(vldrd01, vldrd23, 0, 2, 4, 6);
+ const v128_t vrd = wasm_v32x4_shuffle(vldrd01, vldrd23, 1, 3, 5, 7);
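+    // Lane layout after the shuffles: vldrd01 = [ld0, rd0, ld1, rd1] and vldrd23 = [ld2, rd2, ld3, rd3],
+    // so vld = [ld0, ld1, ld2, ld3] and vrd = [rd0, rd1, rd2, rd3], where ldN/rdN are the
+    // left/right differences for output pixel N.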
+
+ // Shuffle to isolate `top_left` and `top_right`, packed in a single `v128` for all 4 pixels.
+ const v128_t vtl = wasm_v32x4_shuffle(vtltr01, vtltr23, 0, 2, 4, 6);
+ const v128_t vtr = wasm_v32x4_shuffle(vtltr01, vtltr23, 1, 3, 5, 7);
+
+ // Compute `left` from the equations (*).
+ const v128_t vl = wasm_f32x4_add(vtl, wasm_f32x4_mul(vld, valphav));
+ // Compute `right` from the equations (*).
+ const v128_t vr = wasm_f32x4_add(vtr, wasm_f32x4_mul(vrd, valphav));
+
+ // Compute the result according to (*).
+ const v128_t vd = wasm_f32x4_sub(vr, vl);
+ const v128_t vo = wasm_f32x4_add(vl, wasm_f32x4_mul(vd, valphah));
+
+ wasm_v128_store(output, vo);
+ output += 4;
+ }
+
+ if XNN_UNLIKELY(p != 0) {
+ if (p & 2) {
+ // This can be understood as a truncated version of the 4-pixel case above.
+
+ const v128_t vw = wasm_v128_load(w);
+ w += 2 * 2;
+
+ const v128_t valphah = wasm_v32x4_shuffle(vw, vw, 0, 2, 0, 2);
+ const v128_t valphav = wasm_v32x4_shuffle(vw, vw, 1, 3, 1, 3);
+
+ const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
+ const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
+ const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
+ const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
+ i += 2 * 2;
+
+ const v128_t vtltr = wasm_f64x2_make(*(const double*) itl0, *(const double*) itl1);
+ const v128_t vblbr = wasm_f64x2_make(*(const double*) ibl0, *(const double*) ibl1);
+
+ const v128_t vldrd = wasm_f32x4_sub(vblbr, vtltr);
+ const v128_t vld = wasm_v32x4_shuffle(vldrd, vldrd, 0, 2, 0, 2);
+ const v128_t vrd = wasm_v32x4_shuffle(vldrd, vldrd, 1, 3, 1, 3);
+
+ const v128_t vtl = wasm_v32x4_shuffle(vtltr, vtltr, 0, 2, 0, 2);
+ const v128_t vtr = wasm_v32x4_shuffle(vtltr, vtltr, 1, 3, 1, 3);
+
+ const v128_t vl = wasm_f32x4_add(vtl, wasm_f32x4_mul(vld, valphav));
+ const v128_t vr = wasm_f32x4_add(vtr, wasm_f32x4_mul(vrd, valphav));
+
+ const v128_t vd = wasm_f32x4_sub(vr, vl);
+ const v128_t vo = wasm_f32x4_add(vl, wasm_f32x4_mul(vd, valphah));
+
+ *((double*) output) = wasm_f64x2_extract_lane(vo, 0);
+ output += 2;
+ }
+
+ if (p & 1) {
+ // We are computing the following formula:
+ // result = (1 - alpha_h) * (1 - alpha_v) * top_left +
+ // alpha_h * (1 - alpha_v) * top_right +
+ // (1 - alpha_h) * alpha_v * bottom_left +
+ // alpha_h * alpha_v * bottom_right.
+ // Rearranging gives (*):
+ // result = left + alpha_h * (right - left),
+ // where
+ // left = top_left + alpha_v * (bottom_left - top_left),
+ // right = top_right + alpha_v * (bottom_right - top_right).
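+      // For example, with alpha_h = 0.25, alpha_v = 0.5, top_left = 0, top_right = 4,
+      // bottom_left = 2, bottom_right = 6: left = 0 + 0.5 * (2 - 0) = 1,
+      // right = 4 + 0.5 * (6 - 4) = 5, and result = 1 + 0.25 * (5 - 1) = 2,
+      // which matches the direct four-term formula above.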
+
+ const v128_t vw = wasm_v64x2_load_splat((const double*) w);
+ w += 2;
+
+ const float alphah = wasm_f32x4_extract_lane(vw, 0);
+ const v128_t valphav = wasm_v32x4_shuffle(vw, vw, 1, 1, 1, 1);
+
+ // Read adjacent top-left and top-right pixels into one register,
+ // and bottom-left and bottom-right into another.
+
+ const float* itl = (const float*) ((uintptr_t) i[0] + input_offset);
+ const float* ibl = (const float*) ((uintptr_t) i[1] + input_offset);
+ i += 2;
+
+ const v128_t vtltr = wasm_v64x2_load_splat(itl);
+ const v128_t vblbr = wasm_v64x2_load_splat(ibl);
+
+ // Compute at once (**):
+ // left_diff = bottom_left - top_left
+ // right_diff = bottom_right - top_right
+ const v128_t vldrd = wasm_f32x4_sub(vblbr, vtltr);
+
+ // Compute at once `left` and `right` from the equations.
+ const v128_t vlr = wasm_f32x4_add(vtltr, wasm_f32x4_mul(vldrd, valphav));
+
+ // Extract them and compute the result.
+ const float l = wasm_f32x4_extract_lane(vlr, 0);
+ const float r = wasm_f32x4_extract_lane(vlr, 1);
+
+ *output++ = l + alphah * (r - l);
+ }
+ }
+
+ input_offset += input_increment;
+ } while (--channels != 0);
+}
diff --git a/src/f32-ibilinear-chw/gen/wasmsimd-p8.c b/src/f32-ibilinear-chw/gen/wasmsimd-p8.c
new file mode 100644
index 000000000..da60589de
--- /dev/null
+++ b/src/f32-ibilinear-chw/gen/wasmsimd-p8.c
@@ -0,0 +1,243 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-ibilinear-chw/wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/ibilinear.h>
+
+
+void xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8(
+ size_t output_pixels,
+ size_t channels,
+ const float**restrict input,
+ size_t input_offset,
+ const float*restrict weights,
+ float*restrict output,
+ size_t input_increment) XNN_DISABLE_TSAN
+{
+ assert(output_pixels != 0);
+ assert(channels != 0);
+ assert(input_increment % sizeof(float) == 0);
+
+ do {
+ const float** i = input;
+
+ const float* w = weights;
+
+ // The code is best read starting from the bottom (i.e. the scalar case).
+ // Please read the comments there first; only the differences are explained in vectorized versions.
+
+ size_t p = output_pixels;
+ for (; p >= 8; p -= 8) {
+ // This is just an unrolled loop for `PIXEL_TILE` of 4.
+
+ const v128_t vw0_0123 = wasm_v128_load(w + 0);
+ const v128_t vw1_0123 = wasm_v128_load(w + 4);
+ const v128_t valphah0123 = wasm_v32x4_shuffle(vw0_0123, vw1_0123, 0, 2, 4, 6);
+ const v128_t valphav0123 = wasm_v32x4_shuffle(vw0_0123, vw1_0123, 1, 3, 5, 7);
+ const v128_t vw0_4567 = wasm_v128_load(w + 8);
+ const v128_t vw1_4567 = wasm_v128_load(w + 12);
+ const v128_t valphah4567 = wasm_v32x4_shuffle(vw0_4567, vw1_4567, 0, 2, 4, 6);
+ const v128_t valphav4567 = wasm_v32x4_shuffle(vw0_4567, vw1_4567, 1, 3, 5, 7);
+ w += 2 * 8;
+
+ const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
+ const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
+ const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
+ const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
+ const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset);
+ const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset);
+ const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset);
+ const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset);
+ const float* itl4 = (const float*) ((uintptr_t) i[8] + input_offset);
+ const float* ibl4 = (const float*) ((uintptr_t) i[9] + input_offset);
+ const float* itl5 = (const float*) ((uintptr_t) i[10] + input_offset);
+ const float* ibl5 = (const float*) ((uintptr_t) i[11] + input_offset);
+ const float* itl6 = (const float*) ((uintptr_t) i[12] + input_offset);
+ const float* ibl6 = (const float*) ((uintptr_t) i[13] + input_offset);
+ const float* itl7 = (const float*) ((uintptr_t) i[14] + input_offset);
+ const float* ibl7 = (const float*) ((uintptr_t) i[15] + input_offset);
+ i += 2 * 8;
+
+ const v128_t vtltr01 = wasm_f64x2_make(*(const double*) itl0, *(const double*) itl1);
+ const v128_t vblbr01 = wasm_f64x2_make(*(const double*) ibl0, *(const double*) ibl1);
+ const v128_t vtltr23 = wasm_f64x2_make(*(const double*) itl2, *(const double*) itl3);
+ const v128_t vblbr23 = wasm_f64x2_make(*(const double*) ibl2, *(const double*) ibl3);
+ const v128_t vtltr45 = wasm_f64x2_make(*(const double*) itl4, *(const double*) itl5);
+ const v128_t vblbr45 = wasm_f64x2_make(*(const double*) ibl4, *(const double*) ibl5);
+ const v128_t vtltr67 = wasm_f64x2_make(*(const double*) itl6, *(const double*) itl7);
+ const v128_t vblbr67 = wasm_f64x2_make(*(const double*) ibl6, *(const double*) ibl7);
+
+ const v128_t vldrd01 = wasm_f32x4_sub(vblbr01, vtltr01);
+ const v128_t vldrd23 = wasm_f32x4_sub(vblbr23, vtltr23);
+ const v128_t vldrd45 = wasm_f32x4_sub(vblbr45, vtltr45);
+ const v128_t vldrd67 = wasm_f32x4_sub(vblbr67, vtltr67);
+
+ const v128_t vld0123 = wasm_v32x4_shuffle(vldrd01, vldrd23, 0, 2, 4, 6);
+ const v128_t vrd0123 = wasm_v32x4_shuffle(vldrd01, vldrd23, 1, 3, 5, 7);
+ const v128_t vld4567 = wasm_v32x4_shuffle(vldrd45, vldrd67, 0, 2, 4, 6);
+ const v128_t vrd4567 = wasm_v32x4_shuffle(vldrd45, vldrd67, 1, 3, 5, 7);
+
+ const v128_t vtl0123 = wasm_v32x4_shuffle(vtltr01, vtltr23, 0, 2, 4, 6);
+ const v128_t vtr0123 = wasm_v32x4_shuffle(vtltr01, vtltr23, 1, 3, 5, 7);
+ const v128_t vtl4567 = wasm_v32x4_shuffle(vtltr45, vtltr67, 0, 2, 4, 6);
+ const v128_t vtr4567 = wasm_v32x4_shuffle(vtltr45, vtltr67, 1, 3, 5, 7);
+
+ const v128_t vl0123 = wasm_f32x4_add(vtl0123, wasm_f32x4_mul(vld0123, valphav0123));
+ const v128_t vr0123 = wasm_f32x4_add(vtr0123, wasm_f32x4_mul(vrd0123, valphav0123));
+ const v128_t vl4567 = wasm_f32x4_add(vtl4567, wasm_f32x4_mul(vld4567, valphav4567));
+ const v128_t vr4567 = wasm_f32x4_add(vtr4567, wasm_f32x4_mul(vrd4567, valphav4567));
+
+ const v128_t vd0123 = wasm_f32x4_sub(vr0123, vl0123);
+ const v128_t vd4567 = wasm_f32x4_sub(vr4567, vl4567);
+
+ const v128_t vo0123 = wasm_f32x4_add(vl0123, wasm_f32x4_mul(vd0123, valphah0123));
+ const v128_t vo4567 = wasm_f32x4_add(vl4567, wasm_f32x4_mul(vd4567, valphah4567));
+
+ wasm_v128_store(output + 0, vo0123);
+ wasm_v128_store(output + 4, vo4567);
+ output += 8;
+ }
+
+ for (; p >= 4; p -= 4) {
+ // Process quadruples of output pixels, each of which requires reading four input pixels.
+
+ // Separate the alternating weights for 4 pixels into two registers.
+ const v128_t vw0 = wasm_v128_load(w);
+ const v128_t vw1 = wasm_v128_load(w + 4);
+ const v128_t valphah = wasm_v32x4_shuffle(vw0, vw1, 0, 2, 4, 6);
+ const v128_t valphav = wasm_v32x4_shuffle(vw0, vw1, 1, 3, 5, 7);
+ w += 2 * 4;
+
+ // Read out pairs of (top-left, top-right) and (bottom-left, bottom-right) pixels
+ // into separate registers as in the scalar case.
+ const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
+ const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
+ const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
+ const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
+ const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset);
+ const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset);
+ const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset);
+ const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset);
+ i += 2 * 4;
+
+ const v128_t vtltr01 = wasm_f64x2_make(*(const double*) itl0, *(const double*) itl1);
+ const v128_t vblbr01 = wasm_f64x2_make(*(const double*) ibl0, *(const double*) ibl1);
+ const v128_t vtltr23 = wasm_f64x2_make(*(const double*) itl2, *(const double*) itl3);
+ const v128_t vblbr23 = wasm_f64x2_make(*(const double*) ibl2, *(const double*) ibl3);
+
+ const v128_t vldrd01 = wasm_f32x4_sub(vblbr01, vtltr01);
+ const v128_t vldrd23 = wasm_f32x4_sub(vblbr23, vtltr23);
+
+ // Shuffle to isolate `left_diff` and `right_diff`, packed in a single `v128` for all 4 pixels.
+ const v128_t vld = wasm_v32x4_shuffle(vldrd01, vldrd23, 0, 2, 4, 6);
+ const v128_t vrd = wasm_v32x4_shuffle(vldrd01, vldrd23, 1, 3, 5, 7);
+
+ // Shuffle to isolate `top_left` and `top_right`, packed in a single `v128` for all 4 pixels.
+ const v128_t vtl = wasm_v32x4_shuffle(vtltr01, vtltr23, 0, 2, 4, 6);
+ const v128_t vtr = wasm_v32x4_shuffle(vtltr01, vtltr23, 1, 3, 5, 7);
+
+ // Compute `left` from the equations (*).
+ const v128_t vl = wasm_f32x4_add(vtl, wasm_f32x4_mul(vld, valphav));
+ // Compute `right` from the equations (*).
+ const v128_t vr = wasm_f32x4_add(vtr, wasm_f32x4_mul(vrd, valphav));
+
+ // Compute the result according to (*).
+ const v128_t vd = wasm_f32x4_sub(vr, vl);
+ const v128_t vo = wasm_f32x4_add(vl, wasm_f32x4_mul(vd, valphah));
+
+ wasm_v128_store(output, vo);
+ output += 4;
+ }
+
+ if XNN_UNLIKELY(p != 0) {
+ if (p & 2) {
+ // This can be understood as a truncated version of the 4-pixel case above.
+
+ const v128_t vw = wasm_v128_load(w);
+ w += 2 * 2;
+
+ const v128_t valphah = wasm_v32x4_shuffle(vw, vw, 0, 2, 0, 2);
+ const v128_t valphav = wasm_v32x4_shuffle(vw, vw, 1, 3, 1, 3);
+
+ const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
+ const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
+ const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
+ const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
+ i += 2 * 2;
+
+ const v128_t vtltr = wasm_f64x2_make(*(const double*) itl0, *(const double*) itl1);
+ const v128_t vblbr = wasm_f64x2_make(*(const double*) ibl0, *(const double*) ibl1);
+
+ const v128_t vldrd = wasm_f32x4_sub(vblbr, vtltr);
+ const v128_t vld = wasm_v32x4_shuffle(vldrd, vldrd, 0, 2, 0, 2);
+ const v128_t vrd = wasm_v32x4_shuffle(vldrd, vldrd, 1, 3, 1, 3);
+
+ const v128_t vtl = wasm_v32x4_shuffle(vtltr, vtltr, 0, 2, 0, 2);
+ const v128_t vtr = wasm_v32x4_shuffle(vtltr, vtltr, 1, 3, 1, 3);
+
+ const v128_t vl = wasm_f32x4_add(vtl, wasm_f32x4_mul(vld, valphav));
+ const v128_t vr = wasm_f32x4_add(vtr, wasm_f32x4_mul(vrd, valphav));
+
+ const v128_t vd = wasm_f32x4_sub(vr, vl);
+ const v128_t vo = wasm_f32x4_add(vl, wasm_f32x4_mul(vd, valphah));
+
+ *((double*) output) = wasm_f64x2_extract_lane(vo, 0);
+ output += 2;
+ }
+
+ if (p & 1) {
+ // We are computing the following formula:
+ // result = (1 - alpha_h) * (1 - alpha_v) * top_left +
+ // alpha_h * (1 - alpha_v) * top_right +
+ // (1 - alpha_h) * alpha_v * bottom_left +
+ // alpha_h * alpha_v * bottom_right.
+ // Rearranging gives (*):
+ // result = left + alpha_h * (right - left),
+ // where
+ // left = top_left + alpha_v * (bottom_left - top_left),
+ // right = top_right + alpha_v * (bottom_right - top_right).
+
+ const v128_t vw = wasm_v64x2_load_splat((const double*) w);
+ w += 2;
+
+ const float alphah = wasm_f32x4_extract_lane(vw, 0);
+ const v128_t valphav = wasm_v32x4_shuffle(vw, vw, 1, 1, 1, 1);
+
+ // Read adjacent top-left and top-right pixels into one register,
+ // and bottom-left and bottom-right into another.
+
+ const float* itl = (const float*) ((uintptr_t) i[0] + input_offset);
+ const float* ibl = (const float*) ((uintptr_t) i[1] + input_offset);
+ i += 2;
+
+ const v128_t vtltr = wasm_v64x2_load_splat(itl);
+ const v128_t vblbr = wasm_v64x2_load_splat(ibl);
+
+ // Compute at once (**):
+ // left_diff = bottom_left - top_left
+ // right_diff = bottom_right - top_right
+ const v128_t vldrd = wasm_f32x4_sub(vblbr, vtltr);
+
+ // Compute at once `left` and `right` from the equations.
+ const v128_t vlr = wasm_f32x4_add(vtltr, wasm_f32x4_mul(vldrd, valphav));
+
+ // Extract them and compute the result.
+ const float l = wasm_f32x4_extract_lane(vlr, 0);
+ const float r = wasm_f32x4_extract_lane(vlr, 1);
+
+ *output++ = l + alphah * (r - l);
+ }
+ }
+
+ input_offset += input_increment;
+ } while (--channels != 0);
+}
diff --git a/src/f32-ibilinear-chw/scalar.c.in b/src/f32-ibilinear-chw/scalar.c.in
index 0ebeef93d..12adfa4c0 100644
--- a/src/f32-ibilinear-chw/scalar.c.in
+++ b/src/f32-ibilinear-chw/scalar.c.in
@@ -3,7 +3,6 @@
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
-$assert CHANNEL_TILE == 1
$assert PIXEL_TILE >= 1
$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
#include <assert.h>
@@ -11,7 +10,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
#include <xnnpack/ibilinear.h>
-void xnn_f32_ibilinear_chw_ukernel__scalar_p${PIXEL_TILE}${"" if CHANNEL_TILE == 1 else "x%d" % CHANNEL_TILE}(
+void xnn_f32_ibilinear_chw_ukernel__scalar_p${PIXEL_TILE}(
size_t output_pixels,
size_t channels,
const float**restrict input,
diff --git a/src/f32-ibilinear-chw/wasmsimd.c.in b/src/f32-ibilinear-chw/wasmsimd.c.in
new file mode 100644
index 000000000..b5b034c38
--- /dev/null
+++ b/src/f32-ibilinear-chw/wasmsimd.c.in
@@ -0,0 +1,210 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert PIXEL_TILE >= 1
+$assert PIXEL_TILE % 4 == 0
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/ibilinear.h>
+
+
+void xnn_f32_ibilinear_chw_ukernel__wasmsimd_p${PIXEL_TILE}(
+ size_t output_pixels,
+ size_t channels,
+ const float**restrict input,
+ size_t input_offset,
+ const float*restrict weights,
+ float*restrict output,
+ size_t input_increment) XNN_DISABLE_TSAN
+{
+ assert(output_pixels != 0);
+ assert(channels != 0);
+ assert(input_increment % sizeof(float) == 0);
+
+ do {
+ const float** i = input;
+
+ const float* w = weights;
+
+ // The code is best read starting from the bottom (i.e. the scalar case).
+ // Please read the comments there first; only the differences are explained in vectorized versions.
+
+ size_t p = output_pixels;
+ $if PIXEL_TILE > 4:
+ for (; p >= ${PIXEL_TILE}; p -= ${PIXEL_TILE}) {
+ // This is just an unrolled loop for `PIXEL_TILE` of 4.
+
+ $for P in range(0, PIXEL_TILE, 4):
+ const v128_t vw0_${ABC[P:P+4]} = wasm_v128_load(w + ${2 * P});
+ const v128_t vw1_${ABC[P:P+4]} = wasm_v128_load(w + ${2 * P + 4});
+ const v128_t valphah${ABC[P:P+4]} = wasm_v32x4_shuffle(vw0_${ABC[P:P+4]}, vw1_${ABC[P:P+4]}, 0, 2, 4, 6);
+ const v128_t valphav${ABC[P:P+4]} = wasm_v32x4_shuffle(vw0_${ABC[P:P+4]}, vw1_${ABC[P:P+4]}, 1, 3, 5, 7);
+ w += 2 * ${PIXEL_TILE};
+
+ $for P in range(PIXEL_TILE):
+ const float* itl${ABC[P]} = (const float*) ((uintptr_t) i[${2 * P}] + input_offset);
+ const float* ibl${ABC[P]} = (const float*) ((uintptr_t) i[${2 * P + 1}] + input_offset);
+ i += 2 * ${PIXEL_TILE};
+
+ $for P in range(0, PIXEL_TILE, 2):
+ const v128_t vtltr${ABC[P:P+2]} = wasm_f64x2_make(*(const double*) itl${ABC[P]}, *(const double*) itl${ABC[P+1]});
+ const v128_t vblbr${ABC[P:P+2]} = wasm_f64x2_make(*(const double*) ibl${ABC[P]}, *(const double*) ibl${ABC[P+1]});
+
+ $for P in range(0, PIXEL_TILE, 2):
+ const v128_t vldrd${ABC[P:P+2]} = wasm_f32x4_sub(vblbr${ABC[P:P+2]}, vtltr${ABC[P:P+2]});
+
+ $for P in range(0, PIXEL_TILE, 4):
+ const v128_t vld${ABC[P:P+4]} = wasm_v32x4_shuffle(vldrd${ABC[P:P+2]}, vldrd${ABC[P+2:P+4]}, 0, 2, 4, 6);
+ const v128_t vrd${ABC[P:P+4]} = wasm_v32x4_shuffle(vldrd${ABC[P:P+2]}, vldrd${ABC[P+2:P+4]}, 1, 3, 5, 7);
+
+ $for P in range(0, PIXEL_TILE, 4):
+ const v128_t vtl${ABC[P:P+4]} = wasm_v32x4_shuffle(vtltr${ABC[P:P+2]}, vtltr${ABC[P+2:P+4]}, 0, 2, 4, 6);
+ const v128_t vtr${ABC[P:P+4]} = wasm_v32x4_shuffle(vtltr${ABC[P:P+2]}, vtltr${ABC[P+2:P+4]}, 1, 3, 5, 7);
+
+ $for P in range(0, PIXEL_TILE, 4):
+ const v128_t vl${ABC[P:P+4]} = wasm_f32x4_add(vtl${ABC[P:P+4]}, wasm_f32x4_mul(vld${ABC[P:P+4]}, valphav${ABC[P:P+4]}));
+ const v128_t vr${ABC[P:P+4]} = wasm_f32x4_add(vtr${ABC[P:P+4]}, wasm_f32x4_mul(vrd${ABC[P:P+4]}, valphav${ABC[P:P+4]}));
+
+ $for P in range(0, PIXEL_TILE, 4):
+ const v128_t vd${ABC[P:P+4]} = wasm_f32x4_sub(vr${ABC[P:P+4]}, vl${ABC[P:P+4]});
+
+ $for P in range(0, PIXEL_TILE, 4):
+ const v128_t vo${ABC[P:P+4]} = wasm_f32x4_add(vl${ABC[P:P+4]}, wasm_f32x4_mul(vd${ABC[P:P+4]}, valphah${ABC[P:P+4]}));
+
+ $for P in range(0, PIXEL_TILE, 4):
+ wasm_v128_store(output + ${P}, vo${ABC[P:P+4]});
+ output += ${PIXEL_TILE};
+ }
+
+ for (; p >= 4; p -= 4) {
+ // Process quadruples of output pixels, each of which requires reading four input pixels.
+
+ // Separate the alternating weights for 4 pixels into two registers.
+ const v128_t vw0 = wasm_v128_load(w);
+ const v128_t vw1 = wasm_v128_load(w + 4);
+ const v128_t valphah = wasm_v32x4_shuffle(vw0, vw1, 0, 2, 4, 6);
+ const v128_t valphav = wasm_v32x4_shuffle(vw0, vw1, 1, 3, 5, 7);
+ w += 2 * 4;
+
+ // Read out pairs of (top-left, top-right) and (bottom-left, bottom-right) pixels
+ // into separate registers as in the scalar case.
+ $for P in range(4):
+ const float* itl${P} = (const float*) ((uintptr_t) i[${2 * P}] + input_offset);
+ const float* ibl${P} = (const float*) ((uintptr_t) i[${2 * P + 1}] + input_offset);
+ i += 2 * 4;
+
+ $for P in range(0, 4, 2):
+ const v128_t vtltr${ABC[P:P+2]} = wasm_f64x2_make(*(const double*) itl${P}, *(const double*) itl${P+1});
+ const v128_t vblbr${ABC[P:P+2]} = wasm_f64x2_make(*(const double*) ibl${P}, *(const double*) ibl${P+1});
+
+ $for P in range(0, 4, 2):
+ const v128_t vldrd${ABC[P:P+2]} = wasm_f32x4_sub(vblbr${ABC[P:P+2]}, vtltr${ABC[P:P+2]});
+
+ // Shuffle to isolate `left_diff` and `right_diff`, packed in a single `v128` for all 4 pixels.
+ const v128_t vld = wasm_v32x4_shuffle(vldrd01, vldrd23, 0, 2, 4, 6);
+ const v128_t vrd = wasm_v32x4_shuffle(vldrd01, vldrd23, 1, 3, 5, 7);
+
+ // Shuffle to isolate `top_left` and `top_right`, packed in a single `v128` for all 4 pixels.
+ const v128_t vtl = wasm_v32x4_shuffle(vtltr01, vtltr23, 0, 2, 4, 6);
+ const v128_t vtr = wasm_v32x4_shuffle(vtltr01, vtltr23, 1, 3, 5, 7);
+
+ // Compute `left` from the equations (*).
+ const v128_t vl = wasm_f32x4_add(vtl, wasm_f32x4_mul(vld, valphav));
+ // Compute `right` from the equations (*).
+ const v128_t vr = wasm_f32x4_add(vtr, wasm_f32x4_mul(vrd, valphav));
+
+ // Compute the result according to (*).
+ const v128_t vd = wasm_f32x4_sub(vr, vl);
+ const v128_t vo = wasm_f32x4_add(vl, wasm_f32x4_mul(vd, valphah));
+
+ wasm_v128_store(output, vo);
+ output += 4;
+ }
+
+ if XNN_UNLIKELY(p != 0) {
+ if (p & 2) {
+ // This can be understood as a truncated version of the 4-pixel case above.
+
+ const v128_t vw = wasm_v128_load(w);
+ w += 2 * 2;
+
+ const v128_t valphah = wasm_v32x4_shuffle(vw, vw, 0, 2, 0, 2);
+ const v128_t valphav = wasm_v32x4_shuffle(vw, vw, 1, 3, 1, 3);
+
+ $for P in range(2):
+ const float* itl${P} = (const float*) ((uintptr_t) i[${2 * P}] + input_offset);
+ const float* ibl${P} = (const float*) ((uintptr_t) i[${2 * P + 1}] + input_offset);
+ i += 2 * 2;
+
+ const v128_t vtltr = wasm_f64x2_make(*(const double*) itl0, *(const double*) itl1);
+ const v128_t vblbr = wasm_f64x2_make(*(const double*) ibl0, *(const double*) ibl1);
+
+ const v128_t vldrd = wasm_f32x4_sub(vblbr, vtltr);
+ const v128_t vld = wasm_v32x4_shuffle(vldrd, vldrd, 0, 2, 0, 2);
+ const v128_t vrd = wasm_v32x4_shuffle(vldrd, vldrd, 1, 3, 1, 3);
+
+ const v128_t vtl = wasm_v32x4_shuffle(vtltr, vtltr, 0, 2, 0, 2);
+ const v128_t vtr = wasm_v32x4_shuffle(vtltr, vtltr, 1, 3, 1, 3);
+
+ const v128_t vl = wasm_f32x4_add(vtl, wasm_f32x4_mul(vld, valphav));
+ const v128_t vr = wasm_f32x4_add(vtr, wasm_f32x4_mul(vrd, valphav));
+
+ const v128_t vd = wasm_f32x4_sub(vr, vl);
+ const v128_t vo = wasm_f32x4_add(vl, wasm_f32x4_mul(vd, valphah));
+
+ *((double*) output) = wasm_f64x2_extract_lane(vo, 0);
+ output += 2;
+ }
+
+ if (p & 1) {
+ // We are computing the following formula:
+ // result = (1 - alpha_h) * (1 - alpha_v) * top_left +
+ // alpha_h * (1 - alpha_v) * top_right +
+ // (1 - alpha_h) * alpha_v * bottom_left +
+ // alpha_h * alpha_v * bottom_right.
+ // Rearranging gives (*):
+ // result = left + alpha_h * (right - left),
+ // where
+ // left = top_left + alpha_v * (bottom_left - top_left),
+ // right = top_right + alpha_v * (bottom_right - top_right).
+
+ const v128_t vw = wasm_v64x2_load_splat((const double*) w);
+ w += 2;
+
+ const float alphah = wasm_f32x4_extract_lane(vw, 0);
+ const v128_t valphav = wasm_v32x4_shuffle(vw, vw, 1, 1, 1, 1);
+
+ // Read adjacent top-left and top-right pixels into one register,
+ // and bottom-left and bottom-right into another.
+
+ const float* itl = (const float*) ((uintptr_t) i[0] + input_offset);
+ const float* ibl = (const float*) ((uintptr_t) i[1] + input_offset);
+ i += 2;
+
+ const v128_t vtltr = wasm_v64x2_load_splat(itl);
+ const v128_t vblbr = wasm_v64x2_load_splat(ibl);
+
+ // Compute at once (**):
+ // left_diff = bottom_left - top_left
+ // right_diff = bottom_right - top_right
+ const v128_t vldrd = wasm_f32x4_sub(vblbr, vtltr);
+
+ // Compute at once `left` and `right` from the equations.
+ const v128_t vlr = wasm_f32x4_add(vtltr, wasm_f32x4_mul(vldrd, valphav));
+
+ // Extract them and compute the result.
+ const float l = wasm_f32x4_extract_lane(vlr, 0);
+ const float r = wasm_f32x4_extract_lane(vlr, 1);
+
+ *output++ = l + alphah * (r - l);
+ }
+ }
+
+ input_offset += input_increment;
+ } while (--channels != 0);
+}
diff --git a/src/xnnpack/ibilinear.h b/src/xnnpack/ibilinear.h
index 5c4252da0..5af6b8c6c 100644
--- a/src/xnnpack/ibilinear.h
+++ b/src/xnnpack/ibilinear.h
@@ -56,6 +56,8 @@ DECLARE_F32_IBILINEAR_CHW_UKERNEL_FUNCTION(xnn_f32_ibilinear_chw_ukernel__scalar
DECLARE_F32_IBILINEAR_CHW_UKERNEL_FUNCTION(xnn_f32_ibilinear_chw_ukernel__scalar_p2)
DECLARE_F32_IBILINEAR_CHW_UKERNEL_FUNCTION(xnn_f32_ibilinear_chw_ukernel__scalar_p4)
+DECLARE_F32_IBILINEAR_CHW_UKERNEL_FUNCTION(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4)
+DECLARE_F32_IBILINEAR_CHW_UKERNEL_FUNCTION(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8)
#ifdef __cplusplus
} // extern "C"
diff --git a/test/f32-ibilinear-chw.cc b/test/f32-ibilinear-chw.cc
index a41440c49..e4939e29b 100644
--- a/test/f32-ibilinear-chw.cc
+++ b/test/f32-ibilinear-chw.cc
@@ -33,6 +33,15 @@ TEST(F32_IBILINEAR_CHW__SCALAR_P1, pixels_gt_1) {
}
}
+TEST(F32_IBILINEAR_CHW__SCALAR_P1, channels_eq_1) {
+ for (size_t pixels = 1; pixels <= 5; pixels += 1) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(1)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p1);
+ }
+}
+
TEST(F32_IBILINEAR_CHW__SCALAR_P1, channels_gt_1) {
for (size_t channels = 2; channels < 3; channels++) {
for (size_t pixels = 1; pixels <= 5; pixels += 1) {
@@ -103,7 +112,16 @@ TEST(F32_IBILINEAR_CHW__SCALAR_P2, pixels_gt_2) {
}
}
-TEST(F32_IBILINEAR_CHW__SCALAR_P2, channels_gt_2) {
+TEST(F32_IBILINEAR_CHW__SCALAR_P2, channels_eq_1) {
+ for (size_t pixels = 1; pixels <= 10; pixels += 1) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(1)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p2);
+ }
+}
+
+TEST(F32_IBILINEAR_CHW__SCALAR_P2, channels_gt_1) {
for (size_t channels = 2; channels < 3; channels++) {
for (size_t pixels = 1; pixels <= 10; pixels += 1) {
IBilinearMicrokernelTester()
@@ -173,7 +191,16 @@ TEST(F32_IBILINEAR_CHW__SCALAR_P4, pixels_gt_4) {
}
}
-TEST(F32_IBILINEAR_CHW__SCALAR_P4, channels_gt_4) {
+TEST(F32_IBILINEAR_CHW__SCALAR_P4, channels_eq_1) {
+ for (size_t pixels = 1; pixels <= 20; pixels += 3) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(1)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p4);
+ }
+}
+
+TEST(F32_IBILINEAR_CHW__SCALAR_P4, channels_gt_1) {
for (size_t channels = 2; channels < 3; channels++) {
for (size_t pixels = 1; pixels <= 20; pixels += 3) {
IBilinearMicrokernelTester()
@@ -207,3 +234,165 @@ TEST(F32_IBILINEAR_CHW__SCALAR_P4, input_stride) {
}
}
}
+
+
+#if XNN_ARCH_WASMSIMD
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P4, pixels_eq_4) {
+ IBilinearMicrokernelTester()
+ .pixels(4)
+ .channels(1)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4);
+ }
+
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P4, pixels_div_4) {
+ for (size_t pixels = 8; pixels < 40; pixels += 4) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(1)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4);
+ }
+ }
+
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P4, pixels_lt_4) {
+ for (size_t pixels = 1; pixels < 4; pixels++) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(1)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4);
+ }
+ }
+
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P4, pixels_gt_4) {
+ for (size_t pixels = 5; pixels < 8; pixels++) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(1)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4);
+ }
+ }
+
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P4, channels_eq_1) {
+ for (size_t pixels = 1; pixels <= 20; pixels += 3) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(1)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4);
+ }
+ }
+
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P4, channels_gt_1) {
+ for (size_t channels = 2; channels < 3; channels++) {
+ for (size_t pixels = 1; pixels <= 20; pixels += 3) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(channels)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4);
+ }
+ }
+ }
+
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P4, input_offset) {
+ for (size_t pixels = 1; pixels < 20; pixels += 3) {
+ for (size_t channels = 1; channels <= 5; channels += 1) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(channels)
+ .input_offset(7)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4);
+ }
+ }
+ }
+
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P4, input_stride) {
+ for (size_t pixels = 1; pixels < 20; pixels += 3) {
+ for (size_t channels = 1; channels <= 5; channels += 1) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(channels)
+ .input_stride(83)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4);
+ }
+ }
+ }
+#endif // XNN_ARCH_WASMSIMD
+
+
+#if XNN_ARCH_WASMSIMD
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P8, pixels_eq_8) {
+ IBilinearMicrokernelTester()
+ .pixels(8)
+ .channels(1)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8);
+ }
+
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P8, pixels_div_8) {
+ for (size_t pixels = 16; pixels < 80; pixels += 8) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(1)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8);
+ }
+ }
+
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P8, pixels_lt_8) {
+ for (size_t pixels = 1; pixels < 8; pixels++) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(1)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8);
+ }
+ }
+
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P8, pixels_gt_8) {
+ for (size_t pixels = 9; pixels < 16; pixels++) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(1)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8);
+ }
+ }
+
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P8, channels_eq_1) {
+ for (size_t pixels = 1; pixels <= 40; pixels += 7) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(1)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8);
+ }
+ }
+
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P8, channels_gt_1) {
+ for (size_t channels = 2; channels < 3; channels++) {
+ for (size_t pixels = 1; pixels <= 40; pixels += 7) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(channels)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8);
+ }
+ }
+ }
+
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P8, input_offset) {
+ for (size_t pixels = 1; pixels < 40; pixels += 7) {
+ for (size_t channels = 1; channels <= 5; channels += 1) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(channels)
+ .input_offset(7)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8);
+ }
+ }
+ }
+
+ TEST(F32_IBILINEAR_CHW__WASMSIMD_P8, input_stride) {
+ for (size_t pixels = 1; pixels < 40; pixels += 7) {
+ for (size_t channels = 1; channels <= 5; channels += 1) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(channels)
+ .input_stride(163)
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8);
+ }
+ }
+ }
+#endif // XNN_ARCH_WASMSIMD
diff --git a/test/f32-ibilinear-chw.yaml b/test/f32-ibilinear-chw.yaml
index 15ca0d156..dcc5eeb1e 100644
--- a/test/f32-ibilinear-chw.yaml
+++ b/test/f32-ibilinear-chw.yaml
@@ -5,3 +5,5 @@
- name: xnn_f32_ibilinear_chw_ukernel__scalar_p1
- name: xnn_f32_ibilinear_chw_ukernel__scalar_p2
- name: xnn_f32_ibilinear_chw_ukernel__scalar_p4
+- name: xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4
+- name: xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8
diff --git a/tools/generate-ibilinear-chw-test.py b/tools/generate-ibilinear-chw-test.py
index f1a1efbcf..fb3fa4b16 100755
--- a/tools/generate-ibilinear-chw-test.py
+++ b/tools/generate-ibilinear-chw-test.py
@@ -94,20 +94,18 @@ $if CHANNEL_TILE > 1:
}
}
- TEST(${TEST_NAME}, channels_lt_${PIXEL_TILE}) {
- $if ISA_CHECK:
- ${ISA_CHECK};
- for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
- for (size_t pixels = 1; pixels <= ${PIXEL_TILE * 5}; pixels += ${max(1, PIXEL_TILE - 1)}) {
- IBilinearMicrokernelTester()
- .pixels(pixels)
- .channels(channels)
- .TestCHW(${TEST_FUNC});
- }
- }
+TEST(${TEST_NAME}, channels_eq_1) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pixels = 1; pixels <= ${PIXEL_TILE * 5}; pixels += ${max(1, PIXEL_TILE - 1)}) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(1)
+ .TestCHW(${TEST_FUNC});
}
+}
-TEST(${TEST_NAME}, channels_gt_${PIXEL_TILE}) {
+TEST(${TEST_NAME}, channels_gt_1) {
$if ISA_CHECK:
${ISA_CHECK};
for (size_t channels = ${CHANNEL_TILE+1}; channels < ${max(CHANNEL_TILE*2, 3)}; channels++) {