 BUILD.bazel                               |  12 ++
 CMakeLists.txt                            |  11 ++
 scripts/generate-f16-ibilinear.sh         |  14 ++
 src/f16-ibilinear/gen/neonfp16arith-c16.c | 122 ++++++++++++
 src/f16-ibilinear/gen/neonfp16arith-c8.c  |  93 ++++++++
 src/f16-ibilinear/neonfp16arith.c.in      | 118 +++++++++++
 src/xnnpack/ibilinear.h                   |  16 ++
 src/xnnpack/params.h                      |   9 ++
 test/f16-ibilinear.cc                     | 175 +++++++++++++++
 test/f16-ibilinear.yaml                   |  12 ++
 test/ibilinear-microkernel-tester.h       |  57 ++++++
 11 files changed, 639 insertions(+), 0 deletions(-)
diff --git a/BUILD.bazel b/BUILD.bazel
index f70e0441e..6442aad77 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -4491,6 +4491,8 @@ ALL_AARCH64_NEONFP16ARITH_MICROKERNEL_SRCS = [
"src/f16-gemm/gen/6x16-minmax-neonfp16arith-ld64.c",
"src/f16-gemm/gen/8x8-minmax-neonfp16arith-ld64.c",
"src/f16-gemm/gen/8x16-minmax-neonfp16arith-ld64.c",
+ "src/f16-ibilinear/gen/neonfp16arith-c8.c",
+ "src/f16-ibilinear/gen/neonfp16arith-c16.c",
"src/f16-igemm/gen/1x8-minmax-neonfp16arith-ld64.c",
"src/f16-igemm/gen/1x16-minmax-neonfp16arith-ld64.c",
"src/f16-igemm/gen/4x8-minmax-neonfp16arith-ld64.c",
@@ -11054,6 +11056,16 @@ xnnpack_unit_test(
)
xnnpack_unit_test(
+ name = "f16_ibilinear_test",
+ srcs = [
+ "test/f16-ibilinear.cc",
+ "test/ibilinear-microkernel-tester.h",
+ "src/xnnpack/AlignedAllocator.h",
+ ] + MICROKERNEL_TEST_HDRS,
+ deps = MICROKERNEL_TEST_DEPS,
+)
+
+xnnpack_unit_test(
name = "f16_igemm_minmax_test",
srcs = [
"test/f16-igemm-minmax.cc",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a8ebcab42..46bfd777f 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3244,6 +3244,8 @@ SET(ALL_AARCH64_NEONFP16ARITH_MICROKERNEL_SRCS
src/f16-gemm/gen/6x16-minmax-neonfp16arith-ld64.c
src/f16-gemm/gen/8x8-minmax-neonfp16arith-ld64.c
src/f16-gemm/gen/8x16-minmax-neonfp16arith-ld64.c
+ src/f16-ibilinear/gen/neonfp16arith-c8.c
+ src/f16-ibilinear/gen/neonfp16arith-c16.c
src/f16-igemm/gen/1x8-minmax-neonfp16arith-ld64.c
src/f16-igemm/gen/1x16-minmax-neonfp16arith-ld64.c
src/f16-igemm/gen/4x8-minmax-neonfp16arith-ld64.c
@@ -7107,6 +7109,15 @@ IF(XNNPACK_BUILD_TESTS)
TARGET_LINK_LIBRARIES(f16-gemm-minmax-test PRIVATE cpuinfo fp16 pthreadpool gtest gtest_main gemm-microkernel-tester)
ADD_TEST(f16-gemm-minmax-test f16-gemm-minmax-test)
+ ADD_EXECUTABLE(f16-ibilinear-test test/f16-ibilinear.cc $<TARGET_OBJECTS:all_microkernels>)
+ SET_TARGET_PROPERTIES(f16-ibilinear-test PROPERTIES
+ CXX_STANDARD 11
+ CXX_STANDARD_REQUIRED YES
+ CXX_EXTENSIONS YES)
+ TARGET_INCLUDE_DIRECTORIES(f16-ibilinear-test PRIVATE include src test)
+ TARGET_LINK_LIBRARIES(f16-ibilinear-test PRIVATE cpuinfo fp16 pthreadpool gtest gtest_main)
+ ADD_TEST(f16-ibilinear-test f16-ibilinear-test)
+
ADD_EXECUTABLE(f16-igemm-minmax-test test/f16-igemm-minmax.cc $<TARGET_OBJECTS:all_microkernels> $<TARGET_OBJECTS:packing>)
SET_TARGET_PROPERTIES(f16-igemm-minmax-test PROPERTIES
CXX_STANDARD 11
diff --git a/scripts/generate-f16-ibilinear.sh b/scripts/generate-f16-ibilinear.sh
new file mode 100755
index 000000000..d84e6cd1b
--- /dev/null
+++ b/scripts/generate-f16-ibilinear.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+# Copyright 2022 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+################################### ARM NEON ##################################
+tools/xngen src/f16-ibilinear/neonfp16arith.c.in -D CHANNEL_TILE=8 -D PIXEL_TILE=1 -o src/f16-ibilinear/gen/neonfp16arith-c8.c &
+tools/xngen src/f16-ibilinear/neonfp16arith.c.in -D CHANNEL_TILE=16 -D PIXEL_TILE=1 -o src/f16-ibilinear/gen/neonfp16arith-c16.c &
+
+################################## Unit tests #################################
+tools/generate-ibilinear-test.py --spec test/f16-ibilinear.yaml --output test/f16-ibilinear.cc &
+
+wait
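Note (editorial, not part of the patch): the two xngen invocations above instantiate the same template at CHANNEL_TILE 8 and 16. As a concrete illustration of the expansion, a templated load in src/f16-ibilinear/neonfp16arith.c.in such as

    $for C in range(0, CHANNEL_TILE, 8):
      const float16x8_t vtl${ABC[C:C+8]} = vld1q_f16(i0); i0 += 8;

expands, for CHANNEL_TILE=16 with ABC = "456789ABCDEFGHIJ...", into the pair of loads seen in the generated c16 file below:

    const float16x8_t vtl456789AB = vld1q_f16(i0); i0 += 8;
    const float16x8_t vtlCDEFGHIJ = vld1q_f16(i0); i0 += 8;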
diff --git a/src/f16-ibilinear/gen/neonfp16arith-c16.c b/src/f16-ibilinear/gen/neonfp16arith-c16.c
new file mode 100644
index 000000000..ea26fc592
--- /dev/null
+++ b/src/f16-ibilinear/gen/neonfp16arith-c16.c
@@ -0,0 +1,122 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-ibilinear/neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/ibilinear.h>
+
+
+void xnn_f16_ibilinear_ukernel__neonfp16arith_c16(
+ size_t output_pixels,
+ size_t channels,
+ const void**restrict input,
+ size_t input_offset,
+ const void*restrict weights,
+ void*restrict output,
+ size_t output_increment) XNN_OOB_READS
+{
+ assert(output_pixels != 0);
+ assert(channels != 0);
+ assert(channels % sizeof(__fp16) == 0);
+
+ __fp16* o = (__fp16*) output;
+ do {
+ const __fp16* i0 = (const __fp16*) ((uintptr_t) input[0] + input_offset);
+ const __fp16* i1 = (const __fp16*) ((uintptr_t) input[1] + input_offset);
+ const __fp16* i2 = (const __fp16*) ((uintptr_t) input[2] + input_offset);
+ const __fp16* i3 = (const __fp16*) ((uintptr_t) input[3] + input_offset);
+ input += 4;
+
+ const float16x8_t valphah = vld1q_dup_f16(weights); weights = (const __fp16*) weights + 1;
+ const float16x8_t valphav = vld1q_dup_f16(weights); weights = (const __fp16*) weights + 1;
+
+ size_t c = channels;
+ for (; c >= 16 * sizeof(__fp16); c -= 16 * sizeof(__fp16)) {
+ const float16x8_t vtl456789AB = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vtr456789AB = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vbl456789AB = vld1q_f16(i2); i2 += 8;
+ const float16x8_t vbr456789AB = vld1q_f16(i3); i3 += 8;
+ const float16x8_t vtlCDEFGHIJ = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vtrCDEFGHIJ = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vblCDEFGHIJ = vld1q_f16(i2); i2 += 8;
+ const float16x8_t vbrCDEFGHIJ = vld1q_f16(i3); i3 += 8;
+
+ const float16x8_t vtd456789AB = vsubq_f16(vtr456789AB, vtl456789AB);
+ const float16x8_t vbd456789AB = vsubq_f16(vbr456789AB, vbl456789AB);
+ const float16x8_t vtdCDEFGHIJ = vsubq_f16(vtrCDEFGHIJ, vtlCDEFGHIJ);
+ const float16x8_t vbdCDEFGHIJ = vsubq_f16(vbrCDEFGHIJ, vblCDEFGHIJ);
+
+ const float16x8_t vt456789AB = vfmaq_f16(vtl456789AB, vtd456789AB, valphah);
+ const float16x8_t vb456789AB = vfmaq_f16(vbl456789AB, vbd456789AB, valphah);
+ const float16x8_t vtCDEFGHIJ = vfmaq_f16(vtlCDEFGHIJ, vtdCDEFGHIJ, valphah);
+ const float16x8_t vbCDEFGHIJ = vfmaq_f16(vblCDEFGHIJ, vbdCDEFGHIJ, valphah);
+
+ const float16x8_t vd456789AB = vsubq_f16(vb456789AB, vt456789AB);
+ const float16x8_t vdCDEFGHIJ = vsubq_f16(vbCDEFGHIJ, vtCDEFGHIJ);
+
+ const float16x8_t vo456789AB = vfmaq_f16(vt456789AB, vd456789AB, valphav);
+ const float16x8_t voCDEFGHIJ = vfmaq_f16(vtCDEFGHIJ, vdCDEFGHIJ, valphav);
+
+ vst1q_f16(o, vo456789AB); o += 8;
+ vst1q_f16(o, voCDEFGHIJ); o += 8;
+ }
+ for (; c >= 8 * sizeof(__fp16); c -= 8 * sizeof(__fp16)) {
+ const float16x8_t vtl = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vtr = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vbl = vld1q_f16(i2); i2 += 8;
+ const float16x8_t vbr = vld1q_f16(i3); i3 += 8;
+
+ const float16x8_t vtd = vsubq_f16(vtr, vtl);
+ const float16x8_t vbd = vsubq_f16(vbr, vbl);
+
+ const float16x8_t vt = vfmaq_f16(vtl, vtd, valphah);
+ const float16x8_t vb = vfmaq_f16(vbl, vbd, valphah);
+
+ const float16x8_t vd = vsubq_f16(vb, vt);
+
+ const float16x8_t vo = vfmaq_f16(vt, vd, valphav);
+
+ vst1q_f16(o, vo); o += 8;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ const float16x8_t vtl = vld1q_f16(i0);
+ const float16x8_t vtr = vld1q_f16(i1);
+ const float16x8_t vbl = vld1q_f16(i2);
+ const float16x8_t vbr = vld1q_f16(i3);
+
+ const float16x8_t vtd = vsubq_f16(vtr, vtl);
+ const float16x8_t vbd = vsubq_f16(vbr, vbl);
+
+ const float16x8_t vt = vfmaq_f16(vtl, vtd, valphah);
+ const float16x8_t vb = vfmaq_f16(vbl, vbd, valphah);
+
+ const float16x8_t vd = vsubq_f16(vb, vt);
+
+ float16x8_t vo = vfmaq_f16(vt, vd, valphav);
+
+ float16x4_t vo_lo = vget_low_f16(vo);
+ if (c & (4 * sizeof(__fp16))) {
+ vst1_f16(o, vo_lo); o += 4;
+ vo_lo = vget_high_f16(vo);
+ }
+ if (c & (2 * sizeof(__fp16))) {
+      vst1_lane_u32((void*) o, vreinterpret_u32_f16(vo_lo), 0); o += 2;
+ vo_lo = vext_f16(vo_lo, vo_lo, 2);
+ }
+ if (c & (1 * sizeof(__fp16))) {
+ vst1_lane_f16(o, vo_lo, 0); o += 1;
+ }
+ }
+
+ o = (__fp16*) ((uintptr_t) o + output_increment);
+ } while (--output_pixels != 0);
+}
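Note (editorial, not part of the patch): the kernel evaluates bilinear interpolation in factored form, two horizontal lerps followed by one vertical lerp, so each 8-channel block costs three vsubq_f16 and three vfmaq_f16. A scalar sketch of the same arithmetic:

    // Factored bilinear interpolation as computed by the kernel above.
    static float ibilinear_scalar(float tl, float tr, float bl, float br,
                                  float alphah, float alphav) {
      const float t = tl + alphah * (tr - tl);  // top edge lerp
      const float b = bl + alphah * (br - bl);  // bottom edge lerp
      return t + alphav * (b - t);              // vertical lerp
    }

Algebraically this equals the expanded form tl*(1-ah)*(1-av) + tr*ah*(1-av) + bl*(1-ah)*av + br*ah*av used by the unit-test reference further down.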
diff --git a/src/f16-ibilinear/gen/neonfp16arith-c8.c b/src/f16-ibilinear/gen/neonfp16arith-c8.c
new file mode 100644
index 000000000..33db81fba
--- /dev/null
+++ b/src/f16-ibilinear/gen/neonfp16arith-c8.c
@@ -0,0 +1,93 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-ibilinear/neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/ibilinear.h>
+
+
+void xnn_f16_ibilinear_ukernel__neonfp16arith_c8(
+ size_t output_pixels,
+ size_t channels,
+ const void**restrict input,
+ size_t input_offset,
+ const void*restrict weights,
+ void*restrict output,
+ size_t output_increment) XNN_OOB_READS
+{
+ assert(output_pixels != 0);
+ assert(channels != 0);
+ assert(channels % sizeof(__fp16) == 0);
+
+ __fp16* o = (__fp16*) output;
+ do {
+ const __fp16* i0 = (const __fp16*) ((uintptr_t) input[0] + input_offset);
+ const __fp16* i1 = (const __fp16*) ((uintptr_t) input[1] + input_offset);
+ const __fp16* i2 = (const __fp16*) ((uintptr_t) input[2] + input_offset);
+ const __fp16* i3 = (const __fp16*) ((uintptr_t) input[3] + input_offset);
+ input += 4;
+
+ const float16x8_t valphah = vld1q_dup_f16(weights); weights = (const __fp16*) weights + 1;
+ const float16x8_t valphav = vld1q_dup_f16(weights); weights = (const __fp16*) weights + 1;
+
+ size_t c = channels;
+ for (; c >= 8 * sizeof(__fp16); c -= 8 * sizeof(__fp16)) {
+ const float16x8_t vtl = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vtr = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vbl = vld1q_f16(i2); i2 += 8;
+ const float16x8_t vbr = vld1q_f16(i3); i3 += 8;
+
+ const float16x8_t vtd = vsubq_f16(vtr, vtl);
+ const float16x8_t vbd = vsubq_f16(vbr, vbl);
+
+ const float16x8_t vt = vfmaq_f16(vtl, vtd, valphah);
+ const float16x8_t vb = vfmaq_f16(vbl, vbd, valphah);
+
+ const float16x8_t vd = vsubq_f16(vb, vt);
+
+ const float16x8_t vo = vfmaq_f16(vt, vd, valphav);
+
+ vst1q_f16(o, vo); o += 8;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ const float16x8_t vtl = vld1q_f16(i0);
+ const float16x8_t vtr = vld1q_f16(i1);
+ const float16x8_t vbl = vld1q_f16(i2);
+ const float16x8_t vbr = vld1q_f16(i3);
+
+ const float16x8_t vtd = vsubq_f16(vtr, vtl);
+ const float16x8_t vbd = vsubq_f16(vbr, vbl);
+
+ const float16x8_t vt = vfmaq_f16(vtl, vtd, valphah);
+ const float16x8_t vb = vfmaq_f16(vbl, vbd, valphah);
+
+ const float16x8_t vd = vsubq_f16(vb, vt);
+
+ float16x8_t vo = vfmaq_f16(vt, vd, valphav);
+
+ float16x4_t vo_lo = vget_low_f16(vo);
+ if (c & (4 * sizeof(__fp16))) {
+ vst1_f16(o, vo_lo); o += 4;
+ vo_lo = vget_high_f16(vo);
+ }
+ if (c & (2 * sizeof(__fp16))) {
+      vst1_lane_u32((void*) o, vreinterpret_u32_f16(vo_lo), 0); o += 2;
+ vo_lo = vext_f16(vo_lo, vo_lo, 2);
+ }
+ if (c & (1 * sizeof(__fp16))) {
+ vst1_lane_f16(o, vo_lo, 0); o += 1;
+ }
+ }
+
+ o = (__fp16*) ((uintptr_t) o + output_increment);
+ } while (--output_pixels != 0);
+}
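Note (editorial, not part of the patch): a minimal, hypothetical driver for the c8 kernel, to make the argument conventions concrete; all names and sizes here are illustrative. channels is given in bytes, each output pixel consumes four indirection pointers (top-left, top-right, bottom-left, bottom-right) plus two packed __fp16 weights (alphah, alphav), and output_increment is the byte gap appended after each pixel's channels are written.

    #include <arm_neon.h>
    #include <xnnpack/ibilinear.h>

    // Hypothetical driver: 2 output pixels, 8 channels each, packed output.
    void run_f16_ibilinear_example(const void* corners[2 * 4],
                                   const __fp16 weights[2 * 2],
                                   __fp16 out[2 * 8]) {
      xnn_f16_ibilinear_ukernel__neonfp16arith_c8(
          /*output_pixels=*/2,
          /*channels=*/8 * sizeof(__fp16),  // channel count in bytes
          corners,                          // 4 corner pointers per pixel
          /*input_offset=*/0,               // byte offset added to each pointer
          weights,                          // {alphah, alphav} per pixel
          out,
          /*output_increment=*/0);          // 0 = pixels tightly packed
    }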
diff --git a/src/f16-ibilinear/neonfp16arith.c.in b/src/f16-ibilinear/neonfp16arith.c.in
new file mode 100644
index 000000000..4262b4b5a
--- /dev/null
+++ b/src/f16-ibilinear/neonfp16arith.c.in
@@ -0,0 +1,118 @@
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert CHANNEL_TILE % 4 == 0
+$assert CHANNEL_TILE >= 4
+$assert PIXEL_TILE == 1
+$ABC = "456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/ibilinear.h>
+
+
+void xnn_f16_ibilinear_ukernel__neonfp16arith_c${CHANNEL_TILE}${"" if PIXEL_TILE == 1 else "x%d" % PIXEL_TILE}(
+ size_t output_pixels,
+ size_t channels,
+ const void**restrict input,
+ size_t input_offset,
+ const void*restrict weights,
+ void*restrict output,
+ size_t output_increment) XNN_OOB_READS
+{
+ assert(output_pixels != 0);
+ assert(channels != 0);
+ assert(channels % sizeof(__fp16) == 0);
+
+ __fp16* o = (__fp16*) output;
+ do {
+ const __fp16* i0 = (const __fp16*) ((uintptr_t) input[0] + input_offset);
+ const __fp16* i1 = (const __fp16*) ((uintptr_t) input[1] + input_offset);
+ const __fp16* i2 = (const __fp16*) ((uintptr_t) input[2] + input_offset);
+ const __fp16* i3 = (const __fp16*) ((uintptr_t) input[3] + input_offset);
+ input += 4;
+
+ const float16x8_t valphah = vld1q_dup_f16(weights); weights = (const __fp16*) weights + 1;
+ const float16x8_t valphav = vld1q_dup_f16(weights); weights = (const __fp16*) weights + 1;
+
+ size_t c = channels;
+ $if CHANNEL_TILE > 8:
+ for (; c >= ${CHANNEL_TILE} * sizeof(__fp16); c -= ${CHANNEL_TILE} * sizeof(__fp16)) {
+ $for C in range(0, CHANNEL_TILE, 8):
+ const float16x8_t vtl${ABC[C:C+8]} = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vtr${ABC[C:C+8]} = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vbl${ABC[C:C+8]} = vld1q_f16(i2); i2 += 8;
+ const float16x8_t vbr${ABC[C:C+8]} = vld1q_f16(i3); i3 += 8;
+
+ $for C in range(0, CHANNEL_TILE, 8):
+ const float16x8_t vtd${ABC[C:C+8]} = vsubq_f16(vtr${ABC[C:C+8]}, vtl${ABC[C:C+8]});
+ const float16x8_t vbd${ABC[C:C+8]} = vsubq_f16(vbr${ABC[C:C+8]}, vbl${ABC[C:C+8]});
+
+ $for C in range(0, CHANNEL_TILE, 8):
+ const float16x8_t vt${ABC[C:C+8]} = vfmaq_f16(vtl${ABC[C:C+8]}, vtd${ABC[C:C+8]}, valphah);
+ const float16x8_t vb${ABC[C:C+8]} = vfmaq_f16(vbl${ABC[C:C+8]}, vbd${ABC[C:C+8]}, valphah);
+
+ $for C in range(0, CHANNEL_TILE, 8):
+ const float16x8_t vd${ABC[C:C+8]} = vsubq_f16(vb${ABC[C:C+8]}, vt${ABC[C:C+8]});
+
+ $for C in range(0, CHANNEL_TILE, 8):
+ const float16x8_t vo${ABC[C:C+8]} = vfmaq_f16(vt${ABC[C:C+8]}, vd${ABC[C:C+8]}, valphav);
+
+ $for C in range(0, CHANNEL_TILE, 8):
+ vst1q_f16(o, vo${ABC[C:C+8]}); o += 8;
+ }
+ for (; c >= 8 * sizeof(__fp16); c -= 8 * sizeof(__fp16)) {
+ const float16x8_t vtl = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vtr = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vbl = vld1q_f16(i2); i2 += 8;
+ const float16x8_t vbr = vld1q_f16(i3); i3 += 8;
+
+ const float16x8_t vtd = vsubq_f16(vtr, vtl);
+ const float16x8_t vbd = vsubq_f16(vbr, vbl);
+
+ const float16x8_t vt = vfmaq_f16(vtl, vtd, valphah);
+ const float16x8_t vb = vfmaq_f16(vbl, vbd, valphah);
+
+ const float16x8_t vd = vsubq_f16(vb, vt);
+
+ const float16x8_t vo = vfmaq_f16(vt, vd, valphav);
+
+ vst1q_f16(o, vo); o += 8;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ const float16x8_t vtl = vld1q_f16(i0);
+ const float16x8_t vtr = vld1q_f16(i1);
+ const float16x8_t vbl = vld1q_f16(i2);
+ const float16x8_t vbr = vld1q_f16(i3);
+
+ const float16x8_t vtd = vsubq_f16(vtr, vtl);
+ const float16x8_t vbd = vsubq_f16(vbr, vbl);
+
+ const float16x8_t vt = vfmaq_f16(vtl, vtd, valphah);
+ const float16x8_t vb = vfmaq_f16(vbl, vbd, valphah);
+
+ const float16x8_t vd = vsubq_f16(vb, vt);
+
+ float16x8_t vo = vfmaq_f16(vt, vd, valphav);
+
+ float16x4_t vo_lo = vget_low_f16(vo);
+ if (c & (4 * sizeof(__fp16))) {
+ vst1_f16(o, vo_lo); o += 4;
+ vo_lo = vget_high_f16(vo);
+ }
+ if (c & (2 * sizeof(__fp16))) {
+        vst1_lane_u32((void*) o, vreinterpret_u32_f16(vo_lo), 0); o += 2;
+ vo_lo = vext_f16(vo_lo, vo_lo, 2);
+ }
+ if (c & (1 * sizeof(__fp16))) {
+ vst1_lane_f16(o, vo_lo, 0); o += 1;
+ }
+ }
+
+ o = (__fp16*) ((uintptr_t) o + output_increment);
+ } while (--output_pixels != 0);
+}
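Note (editorial, not part of the patch): in the remainder path above, the loads are still full 8-lane vld1q_f16 — this is what the XNN_OOB_READS annotation licenses, reading up to 14 bytes past the last valid halfword. Correctness rests on the stores, which write exactly the c remaining bytes by peeling 4, then 2, then 1 halfwords off the bits of c. A scalar model of that store sequence:

    #include <stddef.h>
    #include <string.h>

    // Scalar model of the lane-masked tail store; c is the remaining
    // byte count (1..7 halfwords) and v stands in for the 8-lane vo.
    static void store_tail(__fp16* o, const __fp16* v, size_t c) {
      if (c & (4 * sizeof(__fp16))) { memcpy(o, v, 8); o += 4; v += 4; }
      if (c & (2 * sizeof(__fp16))) { memcpy(o, v, 4); o += 2; v += 2; }
      if (c & (1 * sizeof(__fp16))) { *o = *v; }
    }

The NEON code keeps the remainder in vo_lo and rotates it with vext_f16 rather than advancing a source pointer, but the lane accounting is identical.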
diff --git a/src/xnnpack/ibilinear.h b/src/xnnpack/ibilinear.h
index 739bb9a64..b39bedcc4 100644
--- a/src/xnnpack/ibilinear.h
+++ b/src/xnnpack/ibilinear.h
@@ -16,6 +16,20 @@ extern "C" {
#endif
+#define DECLARE_F16_IBILINEAR_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t output_pixels, \
+ size_t channels, \
+ const void** input, \
+ size_t input_offset, \
+ const void* weights, \
+ void* output, \
+ size_t output_increment);
+
+DECLARE_F16_IBILINEAR_UKERNEL_FUNCTION(xnn_f16_ibilinear_ukernel__neonfp16arith_c8)
+DECLARE_F16_IBILINEAR_UKERNEL_FUNCTION(xnn_f16_ibilinear_ukernel__neonfp16arith_c16)
+
+
#define DECLARE_F32_IBILINEAR_UKERNEL_FUNCTION(fn_name) \
XNN_INTERNAL void fn_name( \
size_t output_pixels, \
@@ -42,6 +56,7 @@ DECLARE_F32_IBILINEAR_UKERNEL_FUNCTION(xnn_f32_ibilinear_ukernel__sse_c8)
DECLARE_F32_IBILINEAR_UKERNEL_FUNCTION(xnn_f32_ibilinear_ukernel__wasmsimd_c4)
DECLARE_F32_IBILINEAR_UKERNEL_FUNCTION(xnn_f32_ibilinear_ukernel__wasmsimd_c8)
+
#define DECLARE_S8_IBILINEAR_UKERNEL_FUNCTION(fn_name) \
XNN_INTERNAL void fn_name( \
size_t output_pixels, \
@@ -81,6 +96,7 @@ DECLARE_S8_IBILINEAR_UKERNEL_FUNCTION(xnn_s8_ibilinear_ukernel__wasmsimd_mul32_c
uint8_t* output, \
size_t output_increment);
+
DECLARE_U8_IBILINEAR_UKERNEL_FUNCTION(xnn_u8_ibilinear_ukernel__scalar_c1)
DECLARE_U8_IBILINEAR_UKERNEL_FUNCTION(xnn_u8_ibilinear_ukernel__scalar_c2)
DECLARE_U8_IBILINEAR_UKERNEL_FUNCTION(xnn_u8_ibilinear_ukernel__scalar_c4)
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index c164fc70d..471176a78 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -2657,6 +2657,15 @@ typedef void (*xnn_dwconv_multipass_ukernel_function)(
const void* zero,
const void* params);
+typedef void (*xnn_f16_ibilinear_ukernel_function)(
+ size_t output_pixels,
+ size_t channels,
+ const void** input,
+ size_t input_offset,
+ const void* weights,
+ void* output,
+ size_t output_increment);
+
typedef void (*xnn_f32_ibilinear_ukernel_function)(
size_t output_pixels,
size_t channels,
diff --git a/test/f16-ibilinear.cc b/test/f16-ibilinear.cc
new file mode 100644
index 000000000..0e9f88e79
--- /dev/null
+++ b/test/f16-ibilinear.cc
@@ -0,0 +1,175 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+//
+// Auto-generated file. Do not edit!
+// Specification: test/f16-ibilinear.yaml
+// Generator: tools/generate-ibilinear-test.py
+
+
+#include <gtest/gtest.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/isa-checks.h>
+
+#include <xnnpack/ibilinear.h>
+#include "ibilinear-microkernel-tester.h"
+
+
+#if XNN_ARCH_ARM64
+ TEST(F16_IBILINEAR__NEONFP16ARITH_C8, channels_eq_8) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ IBilinearMicrokernelTester()
+ .pixels(1)
+ .channels(8)
+ .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c8);
+ }
+
+ TEST(F16_IBILINEAR__NEONFP16ARITH_C8, channels_div_8) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 16; channels < 80; channels += 8) {
+ IBilinearMicrokernelTester()
+ .pixels(1)
+ .channels(channels)
+ .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c8);
+ }
+ }
+
+ TEST(F16_IBILINEAR__NEONFP16ARITH_C8, channels_lt_8) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 8; channels++) {
+ IBilinearMicrokernelTester()
+ .pixels(1)
+ .channels(channels)
+ .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c8);
+ }
+ }
+
+ TEST(F16_IBILINEAR__NEONFP16ARITH_C8, channels_gt_8) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 9; channels < 16; channels++) {
+ IBilinearMicrokernelTester()
+ .pixels(1)
+ .channels(channels)
+ .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c8);
+ }
+ }
+
+ TEST(F16_IBILINEAR__NEONFP16ARITH_C8, pixels_gt_1) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t pixels = 2; pixels < 3; pixels++) {
+ for (size_t channels = 1; channels <= 40; channels += 7) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(channels)
+ .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c8);
+ }
+ }
+ }
+
+ TEST(F16_IBILINEAR__NEONFP16ARITH_C8, input_offset) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t pixels = 1; pixels < 5; pixels += 1) {
+ for (size_t channels = 1; channels <= 40; channels += 7) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(channels)
+ .input_offset(43)
+ .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c8);
+ }
+ }
+ }
+
+ TEST(F16_IBILINEAR__NEONFP16ARITH_C8, output_stride) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t pixels = 1; pixels < 5; pixels += 1) {
+ for (size_t channels = 1; channels <= 40; channels += 7) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(channels)
+ .output_stride(43)
+ .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c8);
+ }
+ }
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(F16_IBILINEAR__NEONFP16ARITH_C16, channels_eq_16) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ IBilinearMicrokernelTester()
+ .pixels(1)
+ .channels(16)
+ .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c16);
+ }
+
+ TEST(F16_IBILINEAR__NEONFP16ARITH_C16, channels_div_16) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 32; channels < 160; channels += 16) {
+ IBilinearMicrokernelTester()
+ .pixels(1)
+ .channels(channels)
+ .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c16);
+ }
+ }
+
+ TEST(F16_IBILINEAR__NEONFP16ARITH_C16, channels_lt_16) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 16; channels++) {
+ IBilinearMicrokernelTester()
+ .pixels(1)
+ .channels(channels)
+ .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c16);
+ }
+ }
+
+ TEST(F16_IBILINEAR__NEONFP16ARITH_C16, channels_gt_16) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 17; channels < 32; channels++) {
+ IBilinearMicrokernelTester()
+ .pixels(1)
+ .channels(channels)
+ .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c16);
+ }
+ }
+
+ TEST(F16_IBILINEAR__NEONFP16ARITH_C16, pixels_gt_1) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t pixels = 2; pixels < 3; pixels++) {
+ for (size_t channels = 1; channels <= 80; channels += 15) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(channels)
+ .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c16);
+ }
+ }
+ }
+
+ TEST(F16_IBILINEAR__NEONFP16ARITH_C16, input_offset) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t pixels = 1; pixels < 5; pixels += 1) {
+ for (size_t channels = 1; channels <= 80; channels += 15) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(channels)
+ .input_offset(83)
+ .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c16);
+ }
+ }
+ }
+
+ TEST(F16_IBILINEAR__NEONFP16ARITH_C16, output_stride) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t pixels = 1; pixels < 5; pixels += 1) {
+ for (size_t channels = 1; channels <= 80; channels += 15) {
+ IBilinearMicrokernelTester()
+ .pixels(pixels)
+ .channels(channels)
+ .output_stride(83)
+ .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c16);
+ }
+ }
+ }
+#endif // XNN_ARCH_ARM64
diff --git a/test/f16-ibilinear.yaml b/test/f16-ibilinear.yaml
new file mode 100644
index 000000000..5231fe70c
--- /dev/null
+++ b/test/f16-ibilinear.yaml
@@ -0,0 +1,12 @@
+# Copyright 2022 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# ARM NEON
+- name: xnn_f16_ibilinear_ukernel__neonfp16arith_c8
+ arch:
+ - aarch64
+- name: xnn_f16_ibilinear_ukernel__neonfp16arith_c16
+ arch:
+ - aarch64
diff --git a/test/ibilinear-microkernel-tester.h b/test/ibilinear-microkernel-tester.h
index 5857285c8..85bae73c4 100644
--- a/test/ibilinear-microkernel-tester.h
+++ b/test/ibilinear-microkernel-tester.h
@@ -16,6 +16,8 @@
#include <random>
#include <vector>
+#include <fp16.h>
+
#include <xnnpack.h>
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/math.h>
@@ -92,6 +94,61 @@ class IBilinearMicrokernelTester {
}
}
+ void Test(xnn_f16_ibilinear_ukernel_function ibilinear) const {
+ std::random_device random_device;
+ auto rng = std::mt19937(random_device());
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(0.01f, 1.0f), std::ref(rng));
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+ std::vector<const uint16_t*> indirection(pixels() * 4);
+ std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) + indirection.size() * channels());
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> packed_weights(pixels() * 2);
+ std::vector<uint16_t> output((pixels() - 1) * output_stride() + channels());
+ std::vector<float> output_ref(pixels() * channels());
+
+ for (size_t iteration = 0; iteration < iterations(); iteration++) {
+ std::generate(input.begin(), input.end(), std::ref(f16rng));
+ std::generate(packed_weights.begin(), packed_weights.end(), std::ref(f16rng));
+ std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
+
+ for (size_t i = 0; i < indirection.size(); i++) {
+ indirection[i] = input.data() + i * channels() - input_offset();
+ }
+ std::shuffle(indirection.begin(), indirection.end(), rng);
+
+ // Compute reference results.
+ for (size_t i = 0; i < pixels(); i++) {
+ for (size_t c = 0; c < channels(); c++) {
+ const float alpha_h = fp16_ieee_to_fp32_value(packed_weights[i * 2 + 0]);
+ const float alpha_v = fp16_ieee_to_fp32_value(packed_weights[i * 2 + 1]);
+ output_ref[i * channels() + c] =
+ fp16_ieee_to_fp32_value(indirection[i * 4 + 0][c + input_offset()]) * (1.0f - alpha_h) * (1.0f - alpha_v) +
+ fp16_ieee_to_fp32_value(indirection[i * 4 + 1][c + input_offset()]) * alpha_h * (1.0f - alpha_v) +
+ fp16_ieee_to_fp32_value(indirection[i * 4 + 2][c + input_offset()]) * (1.0f - alpha_h) * alpha_v +
+ fp16_ieee_to_fp32_value(indirection[i * 4 + 3][c + input_offset()]) * alpha_h * alpha_v;
+ }
+ }
+
+ // Call optimized micro-kernel.
+ ibilinear(
+ pixels(), channels() * sizeof(uint16_t),
+ reinterpret_cast<const void**>(indirection.data()), input_offset() * sizeof(uint16_t),
+ packed_weights.data(), output.data(),
+ (output_stride() - channels()) * sizeof(uint16_t));
+
+ // Verify results.
+ for (size_t i = 0; i < pixels(); i++) {
+ for (size_t c = 0; c < channels(); c++) {
+ ASSERT_NEAR(
+ fp16_ieee_to_fp32_value(output[i * output_stride() + c]),
+ output_ref[i * channels() + c],
+ std::abs(output_ref[i * channels() + c]) * 1.0e-2f)
+ << "pixel " << i << " / " << pixels() << ", channel " << c << " / " << channels();
+ }
+ }
+ }
+ }
+
void Test(xnn_f32_ibilinear_ukernel_function ibilinear) const {
std::random_device random_device;
auto rng = std::mt19937(random_device());
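Note (editorial, not part of the patch): the 1.0e-2f relative bound in the f16 Test above is deliberately loose. Half precision carries a 10-bit mantissa, so one ulp near 1.0 is 2^-10 ≈ 9.8e-4; the kernel rounds to f16 after every subtract and FMA, while the reference accumulates the expanded formula in f32, so several ulps of disagreement are expected.

    // One f16 ulp at |x| ~= 1:  2^-10 = 0.0009765625
    // Tester bound:             |out - ref| <= |ref| * 1.0e-2  (~10 ulp)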