author     XNNPACK Team <xnnpack-github-robot@google.com>  2020-10-23 21:10:15 -0700
committer  XNNPACK Team <xnnpack-github-robot@google.com>  2020-10-23 21:10:56 -0700
commit     965272bc1d5d060c7b9a4c3c47f3ac7e96dde3af
tree       2a36cff9a2ced646dae8c052530c91220b065dca
parent     bf715f9159179086d3027bc74b625281efd20889
download   XNNPACK-965272bc1d5d060c7b9a4c3c47f3ac7e96dde3af.tar.gz
Add WebAssembly SIMD IBILINEAR microkernels for CHW layout
PiperOrigin-RevId: 338792392
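
Every kernel in this patch evaluates one bilinear interpolation per output pixel, factored into two vertical lerps followed by one horizontal lerp (the "(*)" rearrangement quoted in the kernel comments below). As a plain scalar C sketch of that math, illustrative only and not part of this commit:

    // One output pixel from its four neighbors; alpha_h/alpha_v are the
    // horizontal/vertical blend fractions in [0, 1].
    static float bilinear(float top_left, float top_right,
                          float bottom_left, float bottom_right,
                          float alpha_h, float alpha_v) {
      const float left  = top_left  + alpha_v * (bottom_left  - top_left);
      const float right = top_right + alpha_v * (bottom_right - top_right);
      return left + alpha_h * (right - left);
    }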
 BUILD.bazel                              |   2
 include/xnnpack.h                        |   2
 scripts/generate-f32-ibilinear-chw.sh    |  10
 src/f32-ibilinear-chw/gen/wasmsimd-p4.c  | 173
 src/f32-ibilinear-chw/gen/wasmsimd-p8.c  | 243
 src/f32-ibilinear-chw/scalar.c.in        |   3
 src/f32-ibilinear-chw/wasmsimd.c.in      | 210
 src/xnnpack/ibilinear.h                  |   2
 test/f32-ibilinear-chw.cc                | 193
 test/f32-ibilinear-chw.yaml              |   2
 tools/generate-ibilinear-chw-test.py     |  22
 11 files changed, 842 insertions(+), 20 deletions(-)
diff --git a/BUILD.bazel b/BUILD.bazel
index 62542a2c0..4a984f17e 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -775,6 +775,8 @@ WASMSIMD_UKERNELS = [
     "src/f32-hswish/gen/hswish-wasmsimd-x16.c",
     "src/f32-ibilinear/gen/wasmsimd-c4.c",
     "src/f32-ibilinear/gen/wasmsimd-c8.c",
+    "src/f32-ibilinear-chw/gen/wasmsimd-p4.c",
+    "src/f32-ibilinear-chw/gen/wasmsimd-p8.c",
     "src/f32-igemm/gen/1x8-minmax-wasmsimd-loadsplat-arm.c",
     "src/f32-igemm/gen/1x8-minmax-wasmsimd-loadsplat-x86.c",
     "src/f32-igemm/gen/1x8-minmax-wasmsimd-splat-arm.c",
diff --git a/include/xnnpack.h b/include/xnnpack.h
index 806f985d4..355533a9f 100644
--- a/include/xnnpack.h
+++ b/include/xnnpack.h
@@ -19,7 +19,7 @@ extern "C" {
 #endif
 
 /// The number of bytes XNNPACK may read beyond array bounds.
-/// The caller must allocate at this this many extra bytes after the tensor data passed to XNNPACK.
+/// The caller must allocate at least this many extra bytes after the tensor data passed to XNNPACK.
 ///
 /// Note: XNNPACK reads, but never writes beyond array bounds.
 #define XNN_EXTRA_BYTES 16
diff --git a/scripts/generate-f32-ibilinear-chw.sh b/scripts/generate-f32-ibilinear-chw.sh
index 159130498..845a71cdd 100755
--- a/scripts/generate-f32-ibilinear-chw.sh
+++ b/scripts/generate-f32-ibilinear-chw.sh
@@ -5,9 +5,13 @@
 # LICENSE file in the root directory of this source tree.
 
 #################################### Scalar ###################################
-tools/xngen src/f32-ibilinear-chw/scalar.c.in -D CHANNEL_TILE=1 -D PIXEL_TILE=1 -o src/f32-ibilinear-chw/gen/scalar-p1.c
-tools/xngen src/f32-ibilinear-chw/scalar.c.in -D CHANNEL_TILE=1 -D PIXEL_TILE=2 -o src/f32-ibilinear-chw/gen/scalar-p2.c
-tools/xngen src/f32-ibilinear-chw/scalar.c.in -D CHANNEL_TILE=1 -D PIXEL_TILE=4 -o src/f32-ibilinear-chw/gen/scalar-p4.c
+tools/xngen src/f32-ibilinear-chw/scalar.c.in -D PIXEL_TILE=1 -o src/f32-ibilinear-chw/gen/scalar-p1.c
+tools/xngen src/f32-ibilinear-chw/scalar.c.in -D PIXEL_TILE=2 -o src/f32-ibilinear-chw/gen/scalar-p2.c
+tools/xngen src/f32-ibilinear-chw/scalar.c.in -D PIXEL_TILE=4 -o src/f32-ibilinear-chw/gen/scalar-p4.c
+
+############################### WebAssembly SIMD ##############################
+tools/xngen src/f32-ibilinear-chw/wasmsimd.c.in -D PIXEL_TILE=4 -o src/f32-ibilinear-chw/gen/wasmsimd-p4.c
+tools/xngen src/f32-ibilinear-chw/wasmsimd.c.in -D PIXEL_TILE=8 -o src/f32-ibilinear-chw/gen/wasmsimd-p8.c
 
 ################################## Unit tests #################################
 tools/generate-ibilinear-chw-test.py --spec test/f32-ibilinear-chw.yaml --output test/f32-ibilinear-chw.cc
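
The include/xnnpack.h wording fix above is relevant to these kernels: the vector paths read horizontally adjacent float pairs with 8-byte loads, so the last pixel of a row may be read 4 bytes past its end, which the XNN_EXTRA_BYTES slack absorbs. A minimal sketch of a conforming allocation (illustrative; `allocate_tensor` is not an XNNPACK helper):

    #include <stdlib.h>
    #include <xnnpack.h>

    // Allocate tensor data with the documented slack so XNNPACK may read
    // (but never write) up to XNN_EXTRA_BYTES past the last element.
    static float* allocate_tensor(size_t elements) {
      return (float*) malloc(elements * sizeof(float) + XNN_EXTRA_BYTES);
    }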
diff --git a/src/f32-ibilinear-chw/gen/wasmsimd-p4.c b/src/f32-ibilinear-chw/gen/wasmsimd-p4.c
new file mode 100644
index 000000000..a08772d1b
--- /dev/null
+++ b/src/f32-ibilinear-chw/gen/wasmsimd-p4.c
@@ -0,0 +1,173 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-ibilinear-chw/wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/ibilinear.h>
+
+
+void xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4(
+    size_t output_pixels,
+    size_t channels,
+    const float**restrict input,
+    size_t input_offset,
+    const float*restrict weights,
+    float*restrict output,
+    size_t input_increment) XNN_DISABLE_TSAN
+{
+  assert(output_pixels != 0);
+  assert(channels != 0);
+  assert(input_increment % sizeof(float) == 0);
+
+  do {
+    const float** i = input;
+
+    const float* w = weights;
+
+    // The code is best read starting from the bottom (i.e. the scalar case).
+    // Please read the comments there first; only the differences are explained in vectorized versions.
+
+    size_t p = output_pixels;
+
+    for (; p >= 4; p -= 4) {
+      // Process quadruples of output pixels, each of which requires reading four input pixels.
+
+      // Separate the alternating weights for 4 pixels into two registers.
+      const v128_t vw0 = wasm_v128_load(w);
+      const v128_t vw1 = wasm_v128_load(w + 4);
+      const v128_t valphah = wasm_v32x4_shuffle(vw0, vw1, 0, 2, 4, 6);
+      const v128_t valphav = wasm_v32x4_shuffle(vw0, vw1, 1, 3, 5, 7);
+      w += 2 * 4;
+
+      // Read out pairs of (top-left, top-right) and (bottom-left, bottom-right) pixels
+      // into separate registers as in the scalar case.
+      const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
+      const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
+      const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
+      const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
+      const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset);
+      const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset);
+      const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset);
+      const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset);
+      i += 2 * 4;
+
+      const v128_t vtltr01 = wasm_f64x2_make(*(const double*) itl0, *(const double*) itl1);
+      const v128_t vblbr01 = wasm_f64x2_make(*(const double*) ibl0, *(const double*) ibl1);
+      const v128_t vtltr23 = wasm_f64x2_make(*(const double*) itl2, *(const double*) itl3);
+      const v128_t vblbr23 = wasm_f64x2_make(*(const double*) ibl2, *(const double*) ibl3);
+
+      const v128_t vldrd01 = wasm_f32x4_sub(vblbr01, vtltr01);
+      const v128_t vldrd23 = wasm_f32x4_sub(vblbr23, vtltr23);
+
+      // Shuffle to isolate `left_diff` and `right_diff`, packed in a single `v128` for all 4 pixels.
+      const v128_t vld = wasm_v32x4_shuffle(vldrd01, vldrd23, 0, 2, 4, 6);
+      const v128_t vrd = wasm_v32x4_shuffle(vldrd01, vldrd23, 1, 3, 5, 7);
+
+      // Shuffle to isolate `top_left` and `top_right`, packed in a single `v128` for all 4 pixels.
+      const v128_t vtl = wasm_v32x4_shuffle(vtltr01, vtltr23, 0, 2, 4, 6);
+      const v128_t vtr = wasm_v32x4_shuffle(vtltr01, vtltr23, 1, 3, 5, 7);
+
+      // Compute `left` from the equations (*).
+      const v128_t vl = wasm_f32x4_add(vtl, wasm_f32x4_mul(vld, valphav));
+      // Compute `right` from the equations (*).
+      const v128_t vr = wasm_f32x4_add(vtr, wasm_f32x4_mul(vrd, valphav));
+
+      // Compute the result according to (*).
+      const v128_t vd = wasm_f32x4_sub(vr, vl);
+      const v128_t vo = wasm_f32x4_add(vl, wasm_f32x4_mul(vd, valphah));
+
+      wasm_v128_store(output, vo);
+      output += 4;
+    }
+
+    if XNN_UNLIKELY(p != 0) {
+      if (p & 2) {
+        // This can be understood as a truncated version of the 4-pixel case above.
+
+        const v128_t vw = wasm_v128_load(w);
+        w += 2 * 2;
+
+        const v128_t valphah = wasm_v32x4_shuffle(vw, vw, 0, 2, 0, 2);
+        const v128_t valphav = wasm_v32x4_shuffle(vw, vw, 1, 3, 1, 3);
+
+        const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
+        const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
+        const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
+        const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
+        i += 2 * 2;
+
+        const v128_t vtltr = wasm_f64x2_make(*(const double*) itl0, *(const double*) itl1);
+        const v128_t vblbr = wasm_f64x2_make(*(const double*) ibl0, *(const double*) ibl1);
+
+        const v128_t vldrd = wasm_f32x4_sub(vblbr, vtltr);
+        const v128_t vld = wasm_v32x4_shuffle(vldrd, vldrd, 0, 2, 0, 2);
+        const v128_t vrd = wasm_v32x4_shuffle(vldrd, vldrd, 1, 3, 1, 3);
+
+        const v128_t vtl = wasm_v32x4_shuffle(vtltr, vtltr, 0, 2, 0, 2);
+        const v128_t vtr = wasm_v32x4_shuffle(vtltr, vtltr, 1, 3, 1, 3);
+
+        const v128_t vl = wasm_f32x4_add(vtl, wasm_f32x4_mul(vld, valphav));
+        const v128_t vr = wasm_f32x4_add(vtr, wasm_f32x4_mul(vrd, valphav));
+
+        const v128_t vd = wasm_f32x4_sub(vr, vl);
+        const v128_t vo = wasm_f32x4_add(vl, wasm_f32x4_mul(vd, valphah));
+
+        *((double*) output) = wasm_f64x2_extract_lane(vo, 0);
+        output += 2;
+      }
+
+      if (p & 1) {
+        // We are computing the following formula:
+        //   result = (1 - alpha_h) * (1 - alpha_v) * top_left +
+        //            alpha_h * (1 - alpha_v) * top_right +
+        //            (1 - alpha_h) * alpha_v * bottom_left +
+        //            alpha_h * alpha_v * bottom_right.
+        // Rearranging gives (*):
+        //   result = left + alpha_h * (right - left),
+        // where
+        //   left = top_left + alpha_v * (bottom_left - top_left),
+        //   right = top_right + alpha_v * (bottom_right - top_right).
+
+        const v128_t vw = wasm_v64x2_load_splat((const double*) w);
+        w += 2;
+
+        const float alphah = wasm_f32x4_extract_lane(vw, 0);
+        const v128_t valphav = wasm_v32x4_shuffle(vw, vw, 1, 1, 1, 1);
+
+        // Read adjacent top-left and top-right pixels into one register,
+        // and bottom-left and bottom-right into another.
+
+        const float* itl = (const float*) ((uintptr_t) i[0] + input_offset);
+        const float* ibl = (const float*) ((uintptr_t) i[1] + input_offset);
+        i += 2;
+
+        const v128_t vtltr = wasm_v64x2_load_splat(itl);
+        const v128_t vblbr = wasm_v64x2_load_splat(ibl);
+
+        // Compute at once (**):
+        //   left_diff = bottom_left - top_left
+        //   right_diff = bottom_right - top_right
+        const v128_t vldrd = wasm_f32x4_sub(vblbr, vtltr);
+
+        // Compute at once `left` and `right` from the equations.
+        const v128_t vlr = wasm_f32x4_add(vtltr, wasm_f32x4_mul(vldrd, valphav));
+
+        // Extract them and compute the result.
+        const float l = wasm_f32x4_extract_lane(vlr, 0);
+        const float r = wasm_f32x4_extract_lane(vlr, 1);
+
+        *output++ = l + alphah * (r - l);
+      }
+    }
+
+    input_offset += input_increment;
+  } while (--channels != 0);
+}
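
The recurring idiom in the kernel above is even/odd lane deinterleaving: `wasm_v32x4_shuffle` with indices 0, 2, 4, 6 gathers the even lanes of two vectors and 1, 3, 5, 7 the odd ones, which splits interleaved (alpha_h, alpha_v) or (left, right) pairs into planar vectors. A standalone sketch of the same pattern (illustrative):

    #include <wasm_simd128.h>

    // Split two vectors of interleaved (a, b) pairs into an all-a vector and
    // an all-b vector -- the shuffle pattern used for weights and pixels above.
    static void deinterleave_pairs(v128_t lo, v128_t hi,
                                   v128_t* evens, v128_t* odds) {
      *evens = wasm_v32x4_shuffle(lo, hi, 0, 2, 4, 6);  // a0 a1 a2 a3
      *odds  = wasm_v32x4_shuffle(lo, hi, 1, 3, 5, 7);  // b0 b1 b2 b3
    }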
diff --git a/src/f32-ibilinear-chw/gen/wasmsimd-p8.c b/src/f32-ibilinear-chw/gen/wasmsimd-p8.c
new file mode 100644
index 000000000..da60589de
--- /dev/null
+++ b/src/f32-ibilinear-chw/gen/wasmsimd-p8.c
@@ -0,0 +1,243 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-ibilinear-chw/wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/ibilinear.h>
+
+
+void xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8(
+    size_t output_pixels,
+    size_t channels,
+    const float**restrict input,
+    size_t input_offset,
+    const float*restrict weights,
+    float*restrict output,
+    size_t input_increment) XNN_DISABLE_TSAN
+{
+  assert(output_pixels != 0);
+  assert(channels != 0);
+  assert(input_increment % sizeof(float) == 0);
+
+  do {
+    const float** i = input;
+
+    const float* w = weights;
+
+    // The code is best read starting from the bottom (i.e. the scalar case).
+    // Please read the comments there first; only the differences are explained in vectorized versions.
+
+    size_t p = output_pixels;
+    for (; p >= 8; p -= 8) {
+      // This is just an unrolled loop for `PIXEL_TILE` of 4.
+
+      const v128_t vw0_0123 = wasm_v128_load(w + 0);
+      const v128_t vw1_0123 = wasm_v128_load(w + 4);
+      const v128_t valphah0123 = wasm_v32x4_shuffle(vw0_0123, vw1_0123, 0, 2, 4, 6);
+      const v128_t valphav0123 = wasm_v32x4_shuffle(vw0_0123, vw1_0123, 1, 3, 5, 7);
+      const v128_t vw0_4567 = wasm_v128_load(w + 8);
+      const v128_t vw1_4567 = wasm_v128_load(w + 12);
+      const v128_t valphah4567 = wasm_v32x4_shuffle(vw0_4567, vw1_4567, 0, 2, 4, 6);
+      const v128_t valphav4567 = wasm_v32x4_shuffle(vw0_4567, vw1_4567, 1, 3, 5, 7);
+      w += 2 * 8;
+
+      const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
+      const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
+      const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
+      const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
+      const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset);
+      const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset);
+      const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset);
+      const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset);
+      const float* itl4 = (const float*) ((uintptr_t) i[8] + input_offset);
+      const float* ibl4 = (const float*) ((uintptr_t) i[9] + input_offset);
+      const float* itl5 = (const float*) ((uintptr_t) i[10] + input_offset);
+      const float* ibl5 = (const float*) ((uintptr_t) i[11] + input_offset);
+      const float* itl6 = (const float*) ((uintptr_t) i[12] + input_offset);
+      const float* ibl6 = (const float*) ((uintptr_t) i[13] + input_offset);
+      const float* itl7 = (const float*) ((uintptr_t) i[14] + input_offset);
+      const float* ibl7 = (const float*) ((uintptr_t) i[15] + input_offset);
+      i += 2 * 8;
+
+      const v128_t vtltr01 = wasm_f64x2_make(*(const double*) itl0, *(const double*) itl1);
+      const v128_t vblbr01 = wasm_f64x2_make(*(const double*) ibl0, *(const double*) ibl1);
+      const v128_t vtltr23 = wasm_f64x2_make(*(const double*) itl2, *(const double*) itl3);
+      const v128_t vblbr23 = wasm_f64x2_make(*(const double*) ibl2, *(const double*) ibl3);
+      const v128_t vtltr45 = wasm_f64x2_make(*(const double*) itl4, *(const double*) itl5);
+      const v128_t vblbr45 = wasm_f64x2_make(*(const double*) ibl4, *(const double*) ibl5);
+      const v128_t vtltr67 = wasm_f64x2_make(*(const double*) itl6, *(const double*) itl7);
+      const v128_t vblbr67 = wasm_f64x2_make(*(const double*) ibl6, *(const double*) ibl7);
+
+      const v128_t vldrd01 = wasm_f32x4_sub(vblbr01, vtltr01);
+      const v128_t vldrd23 = wasm_f32x4_sub(vblbr23, vtltr23);
+      const v128_t vldrd45 = wasm_f32x4_sub(vblbr45, vtltr45);
+      const v128_t vldrd67 = wasm_f32x4_sub(vblbr67, vtltr67);
+
+      const v128_t vld0123 = wasm_v32x4_shuffle(vldrd01, vldrd23, 0, 2, 4, 6);
+      const v128_t vrd0123 = wasm_v32x4_shuffle(vldrd01, vldrd23, 1, 3, 5, 7);
+      const v128_t vld4567 = wasm_v32x4_shuffle(vldrd45, vldrd67, 0, 2, 4, 6);
+      const v128_t vrd4567 = wasm_v32x4_shuffle(vldrd45, vldrd67, 1, 3, 5, 7);
+
+      const v128_t vtl0123 = wasm_v32x4_shuffle(vtltr01, vtltr23, 0, 2, 4, 6);
+      const v128_t vtr0123 = wasm_v32x4_shuffle(vtltr01, vtltr23, 1, 3, 5, 7);
+      const v128_t vtl4567 = wasm_v32x4_shuffle(vtltr45, vtltr67, 0, 2, 4, 6);
+      const v128_t vtr4567 = wasm_v32x4_shuffle(vtltr45, vtltr67, 1, 3, 5, 7);
+
+      const v128_t vl0123 = wasm_f32x4_add(vtl0123, wasm_f32x4_mul(vld0123, valphav0123));
+      const v128_t vr0123 = wasm_f32x4_add(vtr0123, wasm_f32x4_mul(vrd0123, valphav0123));
+      const v128_t vl4567 = wasm_f32x4_add(vtl4567, wasm_f32x4_mul(vld4567, valphav4567));
+      const v128_t vr4567 = wasm_f32x4_add(vtr4567, wasm_f32x4_mul(vrd4567, valphav4567));
+
+      const v128_t vd0123 = wasm_f32x4_sub(vr0123, vl0123);
+      const v128_t vd4567 = wasm_f32x4_sub(vr4567, vl4567);
+
+      const v128_t vo0123 = wasm_f32x4_add(vl0123, wasm_f32x4_mul(vd0123, valphah0123));
+      const v128_t vo4567 = wasm_f32x4_add(vl4567, wasm_f32x4_mul(vd4567, valphah4567));
+
+      wasm_v128_store(output + 0, vo0123);
+      wasm_v128_store(output + 4, vo4567);
+      output += 8;
+    }
+
+    for (; p >= 4; p -= 4) {
+      // Process quadruples of output pixels, each of which requires reading four input pixels.
+
+      // Separate the alternating weights for 4 pixels into two registers.
+      const v128_t vw0 = wasm_v128_load(w);
+      const v128_t vw1 = wasm_v128_load(w + 4);
+      const v128_t valphah = wasm_v32x4_shuffle(vw0, vw1, 0, 2, 4, 6);
+      const v128_t valphav = wasm_v32x4_shuffle(vw0, vw1, 1, 3, 5, 7);
+      w += 2 * 4;
+
+      // Read out pairs of (top-left, top-right) and (bottom-left, bottom-right) pixels
+      // into separate registers as in the scalar case.
+      const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
+      const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
+      const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
+      const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
+      const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset);
+      const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset);
+      const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset);
+      const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset);
+      i += 2 * 4;
+
+      const v128_t vtltr01 = wasm_f64x2_make(*(const double*) itl0, *(const double*) itl1);
+      const v128_t vblbr01 = wasm_f64x2_make(*(const double*) ibl0, *(const double*) ibl1);
+      const v128_t vtltr23 = wasm_f64x2_make(*(const double*) itl2, *(const double*) itl3);
+      const v128_t vblbr23 = wasm_f64x2_make(*(const double*) ibl2, *(const double*) ibl3);
+
+      const v128_t vldrd01 = wasm_f32x4_sub(vblbr01, vtltr01);
+      const v128_t vldrd23 = wasm_f32x4_sub(vblbr23, vtltr23);
+
+      // Shuffle to isolate `left_diff` and `right_diff`, packed in a single `v128` for all 4 pixels.
+      const v128_t vld = wasm_v32x4_shuffle(vldrd01, vldrd23, 0, 2, 4, 6);
+      const v128_t vrd = wasm_v32x4_shuffle(vldrd01, vldrd23, 1, 3, 5, 7);
+
+      // Shuffle to isolate `top_left` and `top_right`, packed in a single `v128` for all 4 pixels.
+      const v128_t vtl = wasm_v32x4_shuffle(vtltr01, vtltr23, 0, 2, 4, 6);
+      const v128_t vtr = wasm_v32x4_shuffle(vtltr01, vtltr23, 1, 3, 5, 7);
+
+      // Compute `left` from the equations (*).
+      const v128_t vl = wasm_f32x4_add(vtl, wasm_f32x4_mul(vld, valphav));
+      // Compute `right` from the equations (*).
+      const v128_t vr = wasm_f32x4_add(vtr, wasm_f32x4_mul(vrd, valphav));
+
+      // Compute the result according to (*).
+      const v128_t vd = wasm_f32x4_sub(vr, vl);
+      const v128_t vo = wasm_f32x4_add(vl, wasm_f32x4_mul(vd, valphah));
+
+      wasm_v128_store(output, vo);
+      output += 4;
+    }
+
+    if XNN_UNLIKELY(p != 0) {
+      if (p & 2) {
+        // This can be understood as a truncated version of the 4-pixel case above.
+
+        const v128_t vw = wasm_v128_load(w);
+        w += 2 * 2;
+
+        const v128_t valphah = wasm_v32x4_shuffle(vw, vw, 0, 2, 0, 2);
+        const v128_t valphav = wasm_v32x4_shuffle(vw, vw, 1, 3, 1, 3);
+
+        const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
+        const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
+        const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
+        const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
+        i += 2 * 2;
+
+        const v128_t vtltr = wasm_f64x2_make(*(const double*) itl0, *(const double*) itl1);
+        const v128_t vblbr = wasm_f64x2_make(*(const double*) ibl0, *(const double*) ibl1);
+
+        const v128_t vldrd = wasm_f32x4_sub(vblbr, vtltr);
+        const v128_t vld = wasm_v32x4_shuffle(vldrd, vldrd, 0, 2, 0, 2);
+        const v128_t vrd = wasm_v32x4_shuffle(vldrd, vldrd, 1, 3, 1, 3);
+
+        const v128_t vtl = wasm_v32x4_shuffle(vtltr, vtltr, 0, 2, 0, 2);
+        const v128_t vtr = wasm_v32x4_shuffle(vtltr, vtltr, 1, 3, 1, 3);
+
+        const v128_t vl = wasm_f32x4_add(vtl, wasm_f32x4_mul(vld, valphav));
+        const v128_t vr = wasm_f32x4_add(vtr, wasm_f32x4_mul(vrd, valphav));
+
+        const v128_t vd = wasm_f32x4_sub(vr, vl);
+        const v128_t vo = wasm_f32x4_add(vl, wasm_f32x4_mul(vd, valphah));
+
+        *((double*) output) = wasm_f64x2_extract_lane(vo, 0);
+        output += 2;
+      }
+
+      if (p & 1) {
+        // We are computing the following formula:
+        //   result = (1 - alpha_h) * (1 - alpha_v) * top_left +
+        //            alpha_h * (1 - alpha_v) * top_right +
+        //            (1 - alpha_h) * alpha_v * bottom_left +
+        //            alpha_h * alpha_v * bottom_right.
+        // Rearranging gives (*):
+        //   result = left + alpha_h * (right - left),
+        // where
+        //   left = top_left + alpha_v * (bottom_left - top_left),
+        //   right = top_right + alpha_v * (bottom_right - top_right).
+
+        const v128_t vw = wasm_v64x2_load_splat((const double*) w);
+        w += 2;
+
+        const float alphah = wasm_f32x4_extract_lane(vw, 0);
+        const v128_t valphav = wasm_v32x4_shuffle(vw, vw, 1, 1, 1, 1);
+
+        // Read adjacent top-left and top-right pixels into one register,
+        // and bottom-left and bottom-right into another.
+
+        const float* itl = (const float*) ((uintptr_t) i[0] + input_offset);
+        const float* ibl = (const float*) ((uintptr_t) i[1] + input_offset);
+        i += 2;
+
+        const v128_t vtltr = wasm_v64x2_load_splat(itl);
+        const v128_t vblbr = wasm_v64x2_load_splat(ibl);
+
+        // Compute at once (**):
+        //   left_diff = bottom_left - top_left
+        //   right_diff = bottom_right - top_right
+        const v128_t vldrd = wasm_f32x4_sub(vblbr, vtltr);
+
+        // Compute at once `left` and `right` from the equations.
+        const v128_t vlr = wasm_f32x4_add(vtltr, wasm_f32x4_mul(vldrd, valphav));
+
+        // Extract them and compute the result.
+        const float l = wasm_f32x4_extract_lane(vlr, 0);
+        const float r = wasm_f32x4_extract_lane(vlr, 1);
+
+        *output++ = l + alphah * (r - l);
+      }
+    }
+
+    input_offset += input_increment;
+  } while (--channels != 0);
+}
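
Both generated kernels load a horizontally adjacent (left, right) float pair as a single 64-bit value, so `wasm_f64x2_make` can pack two pixels' pairs into one register with just two loads. The kernels cast through `const double*`; the sketch below spells the same 8-byte load with `memcpy`, the strictly aliasing-safe form (illustrative, not part of this commit):

    #include <string.h>
    #include <wasm_simd128.h>

    // Pack (p0[0], p0[1]) into lane 0 and (p1[0], p1[1]) into lane 1.
    // Reading two floats as one 64-bit unit is what makes the row-end
    // over-read (covered by XNN_EXTRA_BYTES) possible.
    static v128_t load_two_float_pairs(const float* p0, const float* p1) {
      double d0, d1;
      memcpy(&d0, p0, sizeof(d0));
      memcpy(&d1, p1, sizeof(d1));
      return wasm_f64x2_make(d0, d1);
    }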
diff --git a/src/f32-ibilinear-chw/scalar.c.in b/src/f32-ibilinear-chw/scalar.c.in
index 0ebeef93d..12adfa4c0 100644
--- a/src/f32-ibilinear-chw/scalar.c.in
+++ b/src/f32-ibilinear-chw/scalar.c.in
@@ -3,7 +3,6 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-$assert CHANNEL_TILE == 1
 $assert PIXEL_TILE >= 1
 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 #include <assert.h>
@@ -11,7 +10,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 
 #include <xnnpack/ibilinear.h>
 
-void xnn_f32_ibilinear_chw_ukernel__scalar_p${PIXEL_TILE}${"" if CHANNEL_TILE == 1 else "x%d" % CHANNEL_TILE}(
+void xnn_f32_ibilinear_chw_ukernel__scalar_p${PIXEL_TILE}(
     size_t output_pixels,
     size_t channels,
     const float**restrict input,
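
The template below drives both generated files above. xngen lines beginning with `$` are evaluated at generation time; everything else is emitted verbatim with `${...}` substituted. For example, under PIXEL_TILE=4 the fragment `$for P in range(0, PIXEL_TILE, 2):` over the `vtltr${ABC[P:P+2]}` line expands to exactly the two lines seen in wasmsimd-p4.c (hand-expanded here for illustration, not generator output):

    const v128_t vtltr01 = wasm_f64x2_make(*(const double*) itl0, *(const double*) itl1);
    const v128_t vtltr23 = wasm_f64x2_make(*(const double*) itl2, *(const double*) itl3);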
diff --git a/src/f32-ibilinear-chw/wasmsimd.c.in b/src/f32-ibilinear-chw/wasmsimd.c.in
new file mode 100644
index 000000000..b5b034c38
--- /dev/null
+++ b/src/f32-ibilinear-chw/wasmsimd.c.in
@@ -0,0 +1,210 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert PIXEL_TILE >= 1
+$assert PIXEL_TILE % 4 == 0
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/ibilinear.h>
+
+
+void xnn_f32_ibilinear_chw_ukernel__wasmsimd_p${PIXEL_TILE}(
+    size_t output_pixels,
+    size_t channels,
+    const float**restrict input,
+    size_t input_offset,
+    const float*restrict weights,
+    float*restrict output,
+    size_t input_increment) XNN_DISABLE_TSAN
+{
+  assert(output_pixels != 0);
+  assert(channels != 0);
+  assert(input_increment % sizeof(float) == 0);
+
+  do {
+    const float** i = input;
+
+    const float* w = weights;
+
+    // The code is best read starting from the bottom (i.e. the scalar case).
+    // Please read the comments there first; only the differences are explained in vectorized versions.
+
+    size_t p = output_pixels;
+    $if PIXEL_TILE > 4:
+      for (; p >= ${PIXEL_TILE}; p -= ${PIXEL_TILE}) {
+        // This is just an unrolled loop for `PIXEL_TILE` of 4.
+
+        $for P in range(0, PIXEL_TILE, 4):
+          const v128_t vw0_${ABC[P:P+4]} = wasm_v128_load(w + ${2 * P});
+          const v128_t vw1_${ABC[P:P+4]} = wasm_v128_load(w + ${2 * P + 4});
+          const v128_t valphah${ABC[P:P+4]} = wasm_v32x4_shuffle(vw0_${ABC[P:P+4]}, vw1_${ABC[P:P+4]}, 0, 2, 4, 6);
+          const v128_t valphav${ABC[P:P+4]} = wasm_v32x4_shuffle(vw0_${ABC[P:P+4]}, vw1_${ABC[P:P+4]}, 1, 3, 5, 7);
+        w += 2 * ${PIXEL_TILE};
+
+        $for P in range(PIXEL_TILE):
+          const float* itl${ABC[P]} = (const float*) ((uintptr_t) i[${2 * P}] + input_offset);
+          const float* ibl${ABC[P]} = (const float*) ((uintptr_t) i[${2 * P + 1}] + input_offset);
+        i += 2 * ${PIXEL_TILE};
+
+        $for P in range(0, PIXEL_TILE, 2):
+          const v128_t vtltr${ABC[P:P+2]} = wasm_f64x2_make(*(const double*) itl${ABC[P]}, *(const double*) itl${ABC[P+1]});
+          const v128_t vblbr${ABC[P:P+2]} = wasm_f64x2_make(*(const double*) ibl${ABC[P]}, *(const double*) ibl${ABC[P+1]});
+
+        $for P in range(0, PIXEL_TILE, 2):
+          const v128_t vldrd${ABC[P:P+2]} = wasm_f32x4_sub(vblbr${ABC[P:P+2]}, vtltr${ABC[P:P+2]});
+
+        $for P in range(0, PIXEL_TILE, 4):
+          const v128_t vld${ABC[P:P+4]} = wasm_v32x4_shuffle(vldrd${ABC[P:P+2]}, vldrd${ABC[P+2:P+4]}, 0, 2, 4, 6);
+          const v128_t vrd${ABC[P:P+4]} = wasm_v32x4_shuffle(vldrd${ABC[P:P+2]}, vldrd${ABC[P+2:P+4]}, 1, 3, 5, 7);
+
+        $for P in range(0, PIXEL_TILE, 4):
+          const v128_t vtl${ABC[P:P+4]} = wasm_v32x4_shuffle(vtltr${ABC[P:P+2]}, vtltr${ABC[P+2:P+4]}, 0, 2, 4, 6);
+          const v128_t vtr${ABC[P:P+4]} = wasm_v32x4_shuffle(vtltr${ABC[P:P+2]}, vtltr${ABC[P+2:P+4]}, 1, 3, 5, 7);
+
+        $for P in range(0, PIXEL_TILE, 4):
+          const v128_t vl${ABC[P:P+4]} = wasm_f32x4_add(vtl${ABC[P:P+4]}, wasm_f32x4_mul(vld${ABC[P:P+4]}, valphav${ABC[P:P+4]}));
+          const v128_t vr${ABC[P:P+4]} = wasm_f32x4_add(vtr${ABC[P:P+4]}, wasm_f32x4_mul(vrd${ABC[P:P+4]}, valphav${ABC[P:P+4]}));
+
+        $for P in range(0, PIXEL_TILE, 4):
+          const v128_t vd${ABC[P:P+4]} = wasm_f32x4_sub(vr${ABC[P:P+4]}, vl${ABC[P:P+4]});
+
+        $for P in range(0, PIXEL_TILE, 4):
+          const v128_t vo${ABC[P:P+4]} = wasm_f32x4_add(vl${ABC[P:P+4]}, wasm_f32x4_mul(vd${ABC[P:P+4]}, valphah${ABC[P:P+4]}));
+
+        $for P in range(0, PIXEL_TILE, 4):
+          wasm_v128_store(output + ${P}, vo${ABC[P:P+4]});
+        output += ${PIXEL_TILE};
+      }
+
+    for (; p >= 4; p -= 4) {
+      // Process quadruples of output pixels, each of which requires reading four input pixels.
+
+      // Separate the alternating weights for 4 pixels into two registers.
+      const v128_t vw0 = wasm_v128_load(w);
+      const v128_t vw1 = wasm_v128_load(w + 4);
+      const v128_t valphah = wasm_v32x4_shuffle(vw0, vw1, 0, 2, 4, 6);
+      const v128_t valphav = wasm_v32x4_shuffle(vw0, vw1, 1, 3, 5, 7);
+      w += 2 * 4;
+
+      // Read out pairs of (top-left, top-right) and (bottom-left, bottom-right) pixels
+      // into separate registers as in the scalar case.
+      $for P in range(4):
+        const float* itl${P} = (const float*) ((uintptr_t) i[${2 * P}] + input_offset);
+        const float* ibl${P} = (const float*) ((uintptr_t) i[${2 * P + 1}] + input_offset);
+      i += 2 * 4;
+
+      $for P in range(0, 4, 2):
+        const v128_t vtltr${ABC[P:P+2]} = wasm_f64x2_make(*(const double*) itl${P}, *(const double*) itl${P+1});
+        const v128_t vblbr${ABC[P:P+2]} = wasm_f64x2_make(*(const double*) ibl${P}, *(const double*) ibl${P+1});
+
+      $for P in range(0, 4, 2):
+        const v128_t vldrd${ABC[P:P+2]} = wasm_f32x4_sub(vblbr${ABC[P:P+2]}, vtltr${ABC[P:P+2]});
+
+      // Shuffle to isolate `left_diff` and `right_diff`, packed in a single `v128` for all 4 pixels.
+      const v128_t vld = wasm_v32x4_shuffle(vldrd01, vldrd23, 0, 2, 4, 6);
+      const v128_t vrd = wasm_v32x4_shuffle(vldrd01, vldrd23, 1, 3, 5, 7);
+
+      // Shuffle to isolate `top_left` and `top_right`, packed in a single `v128` for all 4 pixels.
+      const v128_t vtl = wasm_v32x4_shuffle(vtltr01, vtltr23, 0, 2, 4, 6);
+      const v128_t vtr = wasm_v32x4_shuffle(vtltr01, vtltr23, 1, 3, 5, 7);
+
+      // Compute `left` from the equations (*).
+      const v128_t vl = wasm_f32x4_add(vtl, wasm_f32x4_mul(vld, valphav));
+      // Compute `right` from the equations (*).
+      const v128_t vr = wasm_f32x4_add(vtr, wasm_f32x4_mul(vrd, valphav));
+
+      // Compute the result according to (*).
+      const v128_t vd = wasm_f32x4_sub(vr, vl);
+      const v128_t vo = wasm_f32x4_add(vl, wasm_f32x4_mul(vd, valphah));
+
+      wasm_v128_store(output, vo);
+      output += 4;
+    }
+
+    if XNN_UNLIKELY(p != 0) {
+      if (p & 2) {
+        // This can be understood as a truncated version of the 4-pixel case above.
+
+        const v128_t vw = wasm_v128_load(w);
+        w += 2 * 2;
+
+        const v128_t valphah = wasm_v32x4_shuffle(vw, vw, 0, 2, 0, 2);
+        const v128_t valphav = wasm_v32x4_shuffle(vw, vw, 1, 3, 1, 3);
+
+        $for P in range(2):
+          const float* itl${P} = (const float*) ((uintptr_t) i[${2 * P}] + input_offset);
+          const float* ibl${P} = (const float*) ((uintptr_t) i[${2 * P + 1}] + input_offset);
+        i += 2 * 2;
+
+        const v128_t vtltr = wasm_f64x2_make(*(const double*) itl0, *(const double*) itl1);
+        const v128_t vblbr = wasm_f64x2_make(*(const double*) ibl0, *(const double*) ibl1);
+
+        const v128_t vldrd = wasm_f32x4_sub(vblbr, vtltr);
+        const v128_t vld = wasm_v32x4_shuffle(vldrd, vldrd, 0, 2, 0, 2);
+        const v128_t vrd = wasm_v32x4_shuffle(vldrd, vldrd, 1, 3, 1, 3);
+
+        const v128_t vtl = wasm_v32x4_shuffle(vtltr, vtltr, 0, 2, 0, 2);
+        const v128_t vtr = wasm_v32x4_shuffle(vtltr, vtltr, 1, 3, 1, 3);
+
+        const v128_t vl = wasm_f32x4_add(vtl, wasm_f32x4_mul(vld, valphav));
+        const v128_t vr = wasm_f32x4_add(vtr, wasm_f32x4_mul(vrd, valphav));
+
+        const v128_t vd = wasm_f32x4_sub(vr, vl);
+        const v128_t vo = wasm_f32x4_add(vl, wasm_f32x4_mul(vd, valphah));
+
+        *((double*) output) = wasm_f64x2_extract_lane(vo, 0);
+        output += 2;
+      }
+
+      if (p & 1) {
+        // We are computing the following formula:
+        //   result = (1 - alpha_h) * (1 - alpha_v) * top_left +
+        //            alpha_h * (1 - alpha_v) * top_right +
+        //            (1 - alpha_h) * alpha_v * bottom_left +
+        //            alpha_h * alpha_v * bottom_right.
+        // Rearranging gives (*):
+        //   result = left + alpha_h * (right - left),
+        // where
+        //   left = top_left + alpha_v * (bottom_left - top_left),
+        //   right = top_right + alpha_v * (bottom_right - top_right).
+
+        const v128_t vw = wasm_v64x2_load_splat((const double*) w);
+        w += 2;
+
+        const float alphah = wasm_f32x4_extract_lane(vw, 0);
+        const v128_t valphav = wasm_v32x4_shuffle(vw, vw, 1, 1, 1, 1);
+
+        // Read adjacent top-left and top-right pixels into one register,
+        // and bottom-left and bottom-right into another.
+
+        const float* itl = (const float*) ((uintptr_t) i[0] + input_offset);
+        const float* ibl = (const float*) ((uintptr_t) i[1] + input_offset);
+        i += 2;
+
+        const v128_t vtltr = wasm_v64x2_load_splat(itl);
+        const v128_t vblbr = wasm_v64x2_load_splat(ibl);
+
+        // Compute at once (**):
+        //   left_diff = bottom_left - top_left
+        //   right_diff = bottom_right - top_right
+        const v128_t vldrd = wasm_f32x4_sub(vblbr, vtltr);
+
+        // Compute at once `left` and `right` from the equations.
+        const v128_t vlr = wasm_f32x4_add(vtltr, wasm_f32x4_mul(vldrd, valphav));
+
+        // Extract them and compute the result.
+        const float l = wasm_f32x4_extract_lane(vlr, 0);
+        const float r = wasm_f32x4_extract_lane(vlr, 1);
+
+        *output++ = l + alphah * (r - l);
+      }
+    }
+
+    input_offset += input_increment;
+  } while (--channels != 0);
+}
diff --git a/src/xnnpack/ibilinear.h b/src/xnnpack/ibilinear.h
index 5c4252da0..5af6b8c6c 100644
--- a/src/xnnpack/ibilinear.h
+++ b/src/xnnpack/ibilinear.h
@@ -56,6 +56,8 @@ DECLARE_F32_IBILINEAR_CHW_UKERNEL_FUNCTION(xnn_f32_ibilinear_chw_ukernel__scalar
 DECLARE_F32_IBILINEAR_CHW_UKERNEL_FUNCTION(xnn_f32_ibilinear_chw_ukernel__scalar_p2)
 DECLARE_F32_IBILINEAR_CHW_UKERNEL_FUNCTION(xnn_f32_ibilinear_chw_ukernel__scalar_p4)
 
+DECLARE_F32_IBILINEAR_CHW_UKERNEL_FUNCTION(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4)
+DECLARE_F32_IBILINEAR_CHW_UKERNEL_FUNCTION(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8)
+
 #ifdef __cplusplus
 }  // extern "C"
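
The declarations above fix the ukernel contract: `input` holds two pointers per output pixel (the top and bottom source rows at the pixel's left x coordinate), `weights` holds an interleaved (alpha_h, alpha_v) pair per pixel, and `input_increment` advances to the next channel plane. A hypothetical driver for one CHW image (names, layout, and edge handling are assumptions for illustration; in XNNPACK the operator code builds this indirection buffer itself):

    // image: channels * in_h * in_w floats, CHW layout; one top-left
    // coordinate (x0, y0) and fraction pair (ax, ay) per output pixel.
    static void run_chw_resize(const float* image, size_t channels,
                               size_t in_w, size_t in_h, size_t out_pixels,
                               const size_t* x0, const size_t* y0,
                               const float* ax, const float* ay,
                               const float** input, float* weights, float* output) {
      for (size_t p = 0; p < out_pixels; p++) {
        input[2 * p + 0] = image + y0[p] * in_w + x0[p];        // top-left pixel
        input[2 * p + 1] = image + (y0[p] + 1) * in_w + x0[p];  // bottom-left pixel
        weights[2 * p + 0] = ax[p];
        weights[2 * p + 1] = ay[p];
      }
      xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8(
          out_pixels, channels, input, /*input_offset=*/0, weights, output,
          /*input_increment=*/in_h * in_w * sizeof(float));  // next channel plane
    }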
diff --git a/test/f32-ibilinear-chw.cc b/test/f32-ibilinear-chw.cc
index a41440c49..e4939e29b 100644
--- a/test/f32-ibilinear-chw.cc
+++ b/test/f32-ibilinear-chw.cc
@@ -33,6 +33,15 @@ TEST(F32_IBILINEAR_CHW__SCALAR_P1, pixels_gt_1) {
   }
 }
 
+TEST(F32_IBILINEAR_CHW__SCALAR_P1, channels_eq_1) {
+  for (size_t pixels = 1; pixels <= 5; pixels += 1) {
+    IBilinearMicrokernelTester()
+      .pixels(pixels)
+      .channels(1)
+      .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p1);
+  }
+}
+
 TEST(F32_IBILINEAR_CHW__SCALAR_P1, channels_gt_1) {
   for (size_t channels = 2; channels < 3; channels++) {
     for (size_t pixels = 1; pixels <= 5; pixels += 1) {
@@ -103,7 +112,16 @@ TEST(F32_IBILINEAR_CHW__SCALAR_P2, pixels_gt_2) {
   }
 }
 
-TEST(F32_IBILINEAR_CHW__SCALAR_P2, channels_gt_2) {
+TEST(F32_IBILINEAR_CHW__SCALAR_P2, channels_eq_1) {
+  for (size_t pixels = 1; pixels <= 10; pixels += 1) {
+    IBilinearMicrokernelTester()
+      .pixels(pixels)
+      .channels(1)
+      .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p2);
+  }
+}
+
+TEST(F32_IBILINEAR_CHW__SCALAR_P2, channels_gt_1) {
   for (size_t channels = 2; channels < 3; channels++) {
     for (size_t pixels = 1; pixels <= 10; pixels += 1) {
       IBilinearMicrokernelTester()
@@ -173,7 +191,16 @@ TEST(F32_IBILINEAR_CHW__SCALAR_P4, pixels_gt_4) {
   }
 }
 
-TEST(F32_IBILINEAR_CHW__SCALAR_P4, channels_gt_4) {
+TEST(F32_IBILINEAR_CHW__SCALAR_P4, channels_eq_1) {
+  for (size_t pixels = 1; pixels <= 20; pixels += 3) {
+    IBilinearMicrokernelTester()
+      .pixels(pixels)
+      .channels(1)
+      .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p4);
+  }
+}
+
+TEST(F32_IBILINEAR_CHW__SCALAR_P4, channels_gt_1) {
   for (size_t channels = 2; channels < 3; channels++) {
     for (size_t pixels = 1; pixels <= 20; pixels += 3) {
       IBilinearMicrokernelTester()
@@ -207,3 +234,165 @@ TEST(F32_IBILINEAR_CHW__SCALAR_P4, input_stride) {
     }
   }
 }
+
+
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_IBILINEAR_CHW__WASMSIMD_P4, pixels_eq_4) {
+    IBilinearMicrokernelTester()
+      .pixels(4)
+      .channels(1)
+      .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4);
+  }
+
+  TEST(F32_IBILINEAR_CHW__WASMSIMD_P4, pixels_div_4) {
+    for (size_t pixels = 8; pixels < 40; pixels += 4) {
+      IBilinearMicrokernelTester()
+        .pixels(pixels)
+        .channels(1)
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4);
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__WASMSIMD_P4, pixels_lt_4) {
+    for (size_t pixels = 1; pixels < 4; pixels++) {
+      IBilinearMicrokernelTester()
+        .pixels(pixels)
+        .channels(1)
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4);
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__WASMSIMD_P4, pixels_gt_4) {
+    for (size_t pixels = 5; pixels < 8; pixels++) {
+      IBilinearMicrokernelTester()
+        .pixels(pixels)
+        .channels(1)
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4);
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__WASMSIMD_P4, channels_eq_1) {
+    for (size_t pixels = 1; pixels <= 20; pixels += 3) {
+      IBilinearMicrokernelTester()
+        .pixels(pixels)
+        .channels(1)
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4);
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__WASMSIMD_P4, channels_gt_1) {
+    for (size_t channels = 2; channels < 3; channels++) {
+      for (size_t pixels = 1; pixels <= 20; pixels += 3) {
+        IBilinearMicrokernelTester()
+          .pixels(pixels)
+          .channels(channels)
+          .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4);
+      }
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__WASMSIMD_P4, input_offset) {
+    for (size_t pixels = 1; pixels < 20; pixels += 3) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        IBilinearMicrokernelTester()
+          .pixels(pixels)
+          .channels(channels)
+          .input_offset(7)
+          .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4);
+      }
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__WASMSIMD_P4, input_stride) {
+    for (size_t pixels = 1; pixels < 20; pixels += 3) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        IBilinearMicrokernelTester()
+          .pixels(pixels)
+          .channels(channels)
+          .input_stride(83)
+          .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4);
+      }
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_IBILINEAR_CHW__WASMSIMD_P8, pixels_eq_8) {
+    IBilinearMicrokernelTester()
+      .pixels(8)
+      .channels(1)
+      .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8);
+  }
+
+  TEST(F32_IBILINEAR_CHW__WASMSIMD_P8, pixels_div_8) {
+    for (size_t pixels = 16; pixels < 80; pixels += 8) {
+      IBilinearMicrokernelTester()
+        .pixels(pixels)
+        .channels(1)
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8);
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__WASMSIMD_P8, pixels_lt_8) {
+    for (size_t pixels = 1; pixels < 8; pixels++) {
+      IBilinearMicrokernelTester()
+        .pixels(pixels)
+        .channels(1)
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8);
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__WASMSIMD_P8, pixels_gt_8) {
+    for (size_t pixels = 9; pixels < 16; pixels++) {
+      IBilinearMicrokernelTester()
+        .pixels(pixels)
+        .channels(1)
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8);
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__WASMSIMD_P8, channels_eq_1) {
+    for (size_t pixels = 1; pixels <= 40; pixels += 7) {
+      IBilinearMicrokernelTester()
+        .pixels(pixels)
+        .channels(1)
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8);
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__WASMSIMD_P8, channels_gt_1) {
+    for (size_t channels = 2; channels < 3; channels++) {
+      for (size_t pixels = 1; pixels <= 40; pixels += 7) {
+        IBilinearMicrokernelTester()
+          .pixels(pixels)
+          .channels(channels)
+          .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8);
+      }
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__WASMSIMD_P8, input_offset) {
+    for (size_t pixels = 1; pixels < 40; pixels += 7) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        IBilinearMicrokernelTester()
+          .pixels(pixels)
+          .channels(channels)
+          .input_offset(7)
+          .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8);
+      }
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__WASMSIMD_P8, input_stride) {
+    for (size_t pixels = 1; pixels < 40; pixels += 7) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        IBilinearMicrokernelTester()
+          .pixels(pixels)
+          .channels(channels)
+          .input_stride(163)
+          .TestCHW(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8);
+      }
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
diff --git a/test/f32-ibilinear-chw.yaml b/test/f32-ibilinear-chw.yaml
index 15ca0d156..dcc5eeb1e 100644
--- a/test/f32-ibilinear-chw.yaml
+++ b/test/f32-ibilinear-chw.yaml
@@ -5,3 +5,5 @@
 - name: xnn_f32_ibilinear_chw_ukernel__scalar_p1
 - name: xnn_f32_ibilinear_chw_ukernel__scalar_p2
 - name: xnn_f32_ibilinear_chw_ukernel__scalar_p4
+- name: xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4
+- name: xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8
diff --git a/tools/generate-ibilinear-chw-test.py b/tools/generate-ibilinear-chw-test.py
index f1a1efbcf..fb3fa4b16 100755
--- a/tools/generate-ibilinear-chw-test.py
+++ b/tools/generate-ibilinear-chw-test.py
@@ -94,20 +94,18 @@ $if CHANNEL_TILE > 1:
     }
   }
 
-  TEST(${TEST_NAME}, channels_lt_${PIXEL_TILE}) {
-    $if ISA_CHECK:
-      ${ISA_CHECK};
-    for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
-      for (size_t pixels = 1; pixels <= ${PIXEL_TILE * 5}; pixels += ${max(1, PIXEL_TILE - 1)}) {
-        IBilinearMicrokernelTester()
-          .pixels(pixels)
-          .channels(channels)
-          .TestCHW(${TEST_FUNC});
-      }
-    }
+TEST(${TEST_NAME}, channels_eq_1) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t pixels = 1; pixels <= ${PIXEL_TILE * 5}; pixels += ${max(1, PIXEL_TILE - 1)}) {
+    IBilinearMicrokernelTester()
+      .pixels(pixels)
+      .channels(1)
+      .TestCHW(${TEST_FUNC});
   }
+}
 
-TEST(${TEST_NAME}, channels_gt_${PIXEL_TILE}) {
+TEST(${TEST_NAME}, channels_gt_1) {
   $if ISA_CHECK:
     ${ISA_CHECK};
   for (size_t channels = ${CHANNEL_TILE+1}; channels < ${max(CHANNEL_TILE*2, 3)}; channels++) {