-rw-r--r--   BUILD.bazel                                 |  12
-rwxr-xr-x   CMakeLists.txt                              |  11
-rwxr-xr-x   scripts/generate-f16-ibilinear.sh           |  14
-rw-r--r--   src/f16-ibilinear/gen/neonfp16arith-c16.c   | 122
-rw-r--r--   src/f16-ibilinear/gen/neonfp16arith-c8.c    |  93
-rw-r--r--   src/f16-ibilinear/neonfp16arith.c.in        | 118
-rw-r--r--   src/xnnpack/ibilinear.h                     |  16
-rw-r--r--   src/xnnpack/params.h                        |   9
-rw-r--r--   test/f16-ibilinear.cc                       | 175
-rw-r--r--   test/f16-ibilinear.yaml                     |  12
-rw-r--r--   test/ibilinear-microkernel-tester.h         |  57
11 files changed, 639 insertions(+), 0 deletions(-)
diff --git a/BUILD.bazel b/BUILD.bazel
index f70e0441e..6442aad77 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -4491,6 +4491,8 @@ ALL_AARCH64_NEONFP16ARITH_MICROKERNEL_SRCS = [
     "src/f16-gemm/gen/6x16-minmax-neonfp16arith-ld64.c",
     "src/f16-gemm/gen/8x8-minmax-neonfp16arith-ld64.c",
     "src/f16-gemm/gen/8x16-minmax-neonfp16arith-ld64.c",
+    "src/f16-ibilinear/gen/neonfp16arith-c8.c",
+    "src/f16-ibilinear/gen/neonfp16arith-c16.c",
     "src/f16-igemm/gen/1x8-minmax-neonfp16arith-ld64.c",
     "src/f16-igemm/gen/1x16-minmax-neonfp16arith-ld64.c",
     "src/f16-igemm/gen/4x8-minmax-neonfp16arith-ld64.c",
@@ -11054,6 +11056,16 @@ xnnpack_unit_test(
 )
 
 xnnpack_unit_test(
+    name = "f16_ibilinear_test",
+    srcs = [
+        "test/f16-ibilinear.cc",
+        "test/ibilinear-microkernel-tester.h",
+        "src/xnnpack/AlignedAllocator.h",
+    ] + MICROKERNEL_TEST_HDRS,
+    deps = MICROKERNEL_TEST_DEPS,
+)
+
+xnnpack_unit_test(
     name = "f16_igemm_minmax_test",
     srcs = [
         "test/f16-igemm-minmax.cc",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a8ebcab42..46bfd777f 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3244,6 +3244,8 @@ SET(ALL_AARCH64_NEONFP16ARITH_MICROKERNEL_SRCS
   src/f16-gemm/gen/6x16-minmax-neonfp16arith-ld64.c
   src/f16-gemm/gen/8x8-minmax-neonfp16arith-ld64.c
   src/f16-gemm/gen/8x16-minmax-neonfp16arith-ld64.c
+  src/f16-ibilinear/gen/neonfp16arith-c8.c
+  src/f16-ibilinear/gen/neonfp16arith-c16.c
   src/f16-igemm/gen/1x8-minmax-neonfp16arith-ld64.c
   src/f16-igemm/gen/1x16-minmax-neonfp16arith-ld64.c
   src/f16-igemm/gen/4x8-minmax-neonfp16arith-ld64.c
@@ -7107,6 +7109,15 @@ IF(XNNPACK_BUILD_TESTS)
   TARGET_LINK_LIBRARIES(f16-gemm-minmax-test PRIVATE cpuinfo fp16 pthreadpool gtest gtest_main gemm-microkernel-tester)
   ADD_TEST(f16-gemm-minmax-test f16-gemm-minmax-test)
 
+  ADD_EXECUTABLE(f16-ibilinear-test test/f16-ibilinear.cc $<TARGET_OBJECTS:all_microkernels>)
+  SET_TARGET_PROPERTIES(f16-ibilinear-test PROPERTIES
+    CXX_STANDARD 11
+    CXX_STANDARD_REQUIRED YES
+    CXX_EXTENSIONS YES)
+  TARGET_INCLUDE_DIRECTORIES(f16-ibilinear-test PRIVATE include src test)
+  TARGET_LINK_LIBRARIES(f16-ibilinear-test PRIVATE cpuinfo fp16 pthreadpool gtest gtest_main)
+  ADD_TEST(f16-ibilinear-test f16-ibilinear-test)
+
   ADD_EXECUTABLE(f16-igemm-minmax-test test/f16-igemm-minmax.cc $<TARGET_OBJECTS:all_microkernels> $<TARGET_OBJECTS:packing>)
   SET_TARGET_PROPERTIES(f16-igemm-minmax-test PROPERTIES
     CXX_STANDARD 11
diff --git a/scripts/generate-f16-ibilinear.sh b/scripts/generate-f16-ibilinear.sh
new file mode 100755
index 000000000..d84e6cd1b
--- /dev/null
+++ b/scripts/generate-f16-ibilinear.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+# Copyright 2022 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+################################### ARM NEON ##################################
+tools/xngen src/f16-ibilinear/neonfp16arith.c.in -D CHANNEL_TILE=8 -D PIXEL_TILE=1 -o src/f16-ibilinear/gen/neonfp16arith-c8.c &
+tools/xngen src/f16-ibilinear/neonfp16arith.c.in -D CHANNEL_TILE=16 -D PIXEL_TILE=1 -o src/f16-ibilinear/gen/neonfp16arith-c16.c &
+
+################################## Unit tests #################################
+tools/generate-ibilinear-test.py --spec test/f16-ibilinear.yaml --output test/f16-ibilinear.cc &
+
+wait
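For orientation, the generated kernels below compute a standard per-channel bilinear interpolation: a horizontal lerp between the top pair and the bottom pair of samples, then a vertical lerp between those two results, with each lerp folded into a fused multiply-add. The scalar sketch below (plain float C, not part of the commit; the function name and parameters are illustrative) shows the same computation the NEON code performs eight or sixteen channels at a time:

    #include <stddef.h>

    // Scalar model of one output pixel: i0..i3 point at the top-left, top-right,
    // bottom-left and bottom-right input rows; alpha_h/alpha_v are the per-pixel
    // interpolation weights read from the packed weights array.
    static void ibilinear_pixel_ref(
        size_t channels,
        const float* i0, const float* i1, const float* i2, const float* i3,
        float alpha_h, float alpha_v,
        float* output)
    {
      for (size_t c = 0; c < channels; c++) {
        const float t = i0[c] + (i1[c] - i0[c]) * alpha_h;  // top lerp (one FMA)
        const float b = i2[c] + (i3[c] - i2[c]) * alpha_h;  // bottom lerp (one FMA)
        output[c] = t + (b - t) * alpha_v;                  // vertical lerp (one FMA)
      }
    }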
diff --git a/src/f16-ibilinear/gen/neonfp16arith-c16.c b/src/f16-ibilinear/gen/neonfp16arith-c16.c
new file mode 100644
index 000000000..ea26fc592
--- /dev/null
+++ b/src/f16-ibilinear/gen/neonfp16arith-c16.c
@@ -0,0 +1,122 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f16-ibilinear/neonfp16arith.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/ibilinear.h>
+
+
+void xnn_f16_ibilinear_ukernel__neonfp16arith_c16(
+    size_t output_pixels,
+    size_t channels,
+    const void**restrict input,
+    size_t input_offset,
+    const void*restrict weights,
+    void*restrict output,
+    size_t output_increment) XNN_OOB_READS
+{
+  assert(output_pixels != 0);
+  assert(channels != 0);
+  assert(channels % sizeof(__fp16) == 0);
+
+  __fp16* o = (__fp16*) output;
+  do {
+    const __fp16* i0 = (const __fp16*) ((uintptr_t) input[0] + input_offset);
+    const __fp16* i1 = (const __fp16*) ((uintptr_t) input[1] + input_offset);
+    const __fp16* i2 = (const __fp16*) ((uintptr_t) input[2] + input_offset);
+    const __fp16* i3 = (const __fp16*) ((uintptr_t) input[3] + input_offset);
+    input += 4;
+
+    const float16x8_t valphah = vld1q_dup_f16(weights); weights = (const __fp16*) weights + 1;
+    const float16x8_t valphav = vld1q_dup_f16(weights); weights = (const __fp16*) weights + 1;
+
+    size_t c = channels;
+    for (; c >= 16 * sizeof(__fp16); c -= 16 * sizeof(__fp16)) {
+      const float16x8_t vtl456789AB = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vtr456789AB = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vbl456789AB = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vbr456789AB = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vtlCDEFGHIJ = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vtrCDEFGHIJ = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vblCDEFGHIJ = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vbrCDEFGHIJ = vld1q_f16(i3); i3 += 8;
+
+      const float16x8_t vtd456789AB = vsubq_f16(vtr456789AB, vtl456789AB);
+      const float16x8_t vbd456789AB = vsubq_f16(vbr456789AB, vbl456789AB);
+      const float16x8_t vtdCDEFGHIJ = vsubq_f16(vtrCDEFGHIJ, vtlCDEFGHIJ);
+      const float16x8_t vbdCDEFGHIJ = vsubq_f16(vbrCDEFGHIJ, vblCDEFGHIJ);
+
+      const float16x8_t vt456789AB = vfmaq_f16(vtl456789AB, vtd456789AB, valphah);
+      const float16x8_t vb456789AB = vfmaq_f16(vbl456789AB, vbd456789AB, valphah);
+      const float16x8_t vtCDEFGHIJ = vfmaq_f16(vtlCDEFGHIJ, vtdCDEFGHIJ, valphah);
+      const float16x8_t vbCDEFGHIJ = vfmaq_f16(vblCDEFGHIJ, vbdCDEFGHIJ, valphah);
+
+      const float16x8_t vd456789AB = vsubq_f16(vb456789AB, vt456789AB);
+      const float16x8_t vdCDEFGHIJ = vsubq_f16(vbCDEFGHIJ, vtCDEFGHIJ);
+
+      const float16x8_t vo456789AB = vfmaq_f16(vt456789AB, vd456789AB, valphav);
+      const float16x8_t voCDEFGHIJ = vfmaq_f16(vtCDEFGHIJ, vdCDEFGHIJ, valphav);
+
+      vst1q_f16(o, vo456789AB); o += 8;
+      vst1q_f16(o, voCDEFGHIJ); o += 8;
+    }
+    for (; c >= 8 * sizeof(__fp16); c -= 8 * sizeof(__fp16)) {
+      const float16x8_t vtl = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vtr = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vbl = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vbr = vld1q_f16(i3); i3 += 8;
+
+      const float16x8_t vtd = vsubq_f16(vtr, vtl);
+      const float16x8_t vbd = vsubq_f16(vbr, vbl);
+
+      const float16x8_t vt = vfmaq_f16(vtl, vtd, valphah);
+      const float16x8_t vb = vfmaq_f16(vbl, vbd, valphah);
+
+      const float16x8_t vd = vsubq_f16(vb, vt);
+
+      const float16x8_t vo = vfmaq_f16(vt, vd, valphav);
+
+      vst1q_f16(o, vo); o += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      const float16x8_t vtl = vld1q_f16(i0);
+      const float16x8_t vtr = vld1q_f16(i1);
+      const float16x8_t vbl = vld1q_f16(i2);
+      const float16x8_t vbr = vld1q_f16(i3);
+
+      const float16x8_t vtd = vsubq_f16(vtr, vtl);
+      const float16x8_t vbd = vsubq_f16(vbr, vbl);
+
+      const float16x8_t vt = vfmaq_f16(vtl, vtd, valphah);
+      const float16x8_t vb = vfmaq_f16(vbl, vbd, valphah);
+
+      const float16x8_t vd = vsubq_f16(vb, vt);
+
+      float16x8_t vo = vfmaq_f16(vt, vd, valphav);
+
+      float16x4_t vo_lo = vget_low_f16(vo);
+      if (c & (4 * sizeof(__fp16))) {
+        vst1_f16(o, vo_lo); o += 4;
+        vo_lo = vget_high_f16(vo);
+      }
+      if (c & (2 * sizeof(__fp16))) {
+        vst1_lane_u32(o, vreinterpret_u32_f16(vo_lo), 0); o += 2;
+        vo_lo = vext_f16(vo_lo, vo_lo, 2);
+      }
+      if (c & (1 * sizeof(__fp16))) {
+        vst1_lane_f16(o, vo_lo, 0); o += 1;
+      }
+    }
+
+    o = (__fp16*) ((uintptr_t) o + output_increment);
+  } while (--output_pixels != 0);
+}
diff --git a/src/f16-ibilinear/gen/neonfp16arith-c8.c b/src/f16-ibilinear/gen/neonfp16arith-c8.c
new file mode 100644
index 000000000..33db81fba
--- /dev/null
+++ b/src/f16-ibilinear/gen/neonfp16arith-c8.c
@@ -0,0 +1,93 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f16-ibilinear/neonfp16arith.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/ibilinear.h>
+
+
+void xnn_f16_ibilinear_ukernel__neonfp16arith_c8(
+    size_t output_pixels,
+    size_t channels,
+    const void**restrict input,
+    size_t input_offset,
+    const void*restrict weights,
+    void*restrict output,
+    size_t output_increment) XNN_OOB_READS
+{
+  assert(output_pixels != 0);
+  assert(channels != 0);
+  assert(channels % sizeof(__fp16) == 0);
+
+  __fp16* o = (__fp16*) output;
+  do {
+    const __fp16* i0 = (const __fp16*) ((uintptr_t) input[0] + input_offset);
+    const __fp16* i1 = (const __fp16*) ((uintptr_t) input[1] + input_offset);
+    const __fp16* i2 = (const __fp16*) ((uintptr_t) input[2] + input_offset);
+    const __fp16* i3 = (const __fp16*) ((uintptr_t) input[3] + input_offset);
+    input += 4;
+
+    const float16x8_t valphah = vld1q_dup_f16(weights); weights = (const __fp16*) weights + 1;
+    const float16x8_t valphav = vld1q_dup_f16(weights); weights = (const __fp16*) weights + 1;
+
+    size_t c = channels;
+    for (; c >= 8 * sizeof(__fp16); c -= 8 * sizeof(__fp16)) {
+      const float16x8_t vtl = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vtr = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vbl = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vbr = vld1q_f16(i3); i3 += 8;
+
+      const float16x8_t vtd = vsubq_f16(vtr, vtl);
+      const float16x8_t vbd = vsubq_f16(vbr, vbl);
+
+      const float16x8_t vt = vfmaq_f16(vtl, vtd, valphah);
+      const float16x8_t vb = vfmaq_f16(vbl, vbd, valphah);
+
+      const float16x8_t vd = vsubq_f16(vb, vt);
+
+      const float16x8_t vo = vfmaq_f16(vt, vd, valphav);
+
+      vst1q_f16(o, vo); o += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      const float16x8_t vtl = vld1q_f16(i0);
+      const float16x8_t vtr = vld1q_f16(i1);
+      const float16x8_t vbl = vld1q_f16(i2);
+      const float16x8_t vbr = vld1q_f16(i3);
+
+      const float16x8_t vtd = vsubq_f16(vtr, vtl);
+      const float16x8_t vbd = vsubq_f16(vbr, vbl);
+
+      const float16x8_t vt = vfmaq_f16(vtl, vtd, valphah);
+      const float16x8_t vb = vfmaq_f16(vbl, vbd, valphah);
+
+      const float16x8_t vd = vsubq_f16(vb, vt);
+
+      float16x8_t vo = vfmaq_f16(vt, vd, valphav);
+
+      float16x4_t vo_lo = vget_low_f16(vo);
+      if (c & (4 * sizeof(__fp16))) {
+        vst1_f16(o, vo_lo); o += 4;
+        vo_lo = vget_high_f16(vo);
+      }
+      if (c & (2 * sizeof(__fp16))) {
+        vst1_lane_u32(o, vreinterpret_u32_f16(vo_lo), 0); o += 2;
+        vo_lo = vext_f16(vo_lo, vo_lo, 2);
+      }
+      if (c & (1 * sizeof(__fp16))) {
+        vst1_lane_f16(o, vo_lo, 0); o += 1;
+      }
+    }
+
+    o = (__fp16*) ((uintptr_t) o + output_increment);
+  } while (--output_pixels != 0);
+}
diff --git a/src/f16-ibilinear/neonfp16arith.c.in b/src/f16-ibilinear/neonfp16arith.c.in
new file mode 100644
index 000000000..4262b4b5a
--- /dev/null
+++ b/src/f16-ibilinear/neonfp16arith.c.in
@@ -0,0 +1,118 @@
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert CHANNEL_TILE % 4 == 0
+$assert CHANNEL_TILE >= 4
+$assert PIXEL_TILE == 1
+$ABC = "456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/ibilinear.h>
+
+
+void xnn_f16_ibilinear_ukernel__neonfp16arith_c${CHANNEL_TILE}${"" if PIXEL_TILE == 1 else "x%d" % PIXEL_TILE}(
+    size_t output_pixels,
+    size_t channels,
+    const void**restrict input,
+    size_t input_offset,
+    const void*restrict weights,
+    void*restrict output,
+    size_t output_increment) XNN_OOB_READS
+{
+  assert(output_pixels != 0);
+  assert(channels != 0);
+  assert(channels % sizeof(__fp16) == 0);
+
+  __fp16* o = (__fp16*) output;
+  do {
+    const __fp16* i0 = (const __fp16*) ((uintptr_t) input[0] + input_offset);
+    const __fp16* i1 = (const __fp16*) ((uintptr_t) input[1] + input_offset);
+    const __fp16* i2 = (const __fp16*) ((uintptr_t) input[2] + input_offset);
+    const __fp16* i3 = (const __fp16*) ((uintptr_t) input[3] + input_offset);
+    input += 4;
+
+    const float16x8_t valphah = vld1q_dup_f16(weights); weights = (const __fp16*) weights + 1;
+    const float16x8_t valphav = vld1q_dup_f16(weights); weights = (const __fp16*) weights + 1;
+
+    size_t c = channels;
+    $if CHANNEL_TILE > 8:
+      for (; c >= ${CHANNEL_TILE} * sizeof(__fp16); c -= ${CHANNEL_TILE} * sizeof(__fp16)) {
+        $for C in range(0, CHANNEL_TILE, 8):
+          const float16x8_t vtl${ABC[C:C+8]} = vld1q_f16(i0); i0 += 8;
+          const float16x8_t vtr${ABC[C:C+8]} = vld1q_f16(i1); i1 += 8;
+          const float16x8_t vbl${ABC[C:C+8]} = vld1q_f16(i2); i2 += 8;
+          const float16x8_t vbr${ABC[C:C+8]} = vld1q_f16(i3); i3 += 8;
+
+        $for C in range(0, CHANNEL_TILE, 8):
+          const float16x8_t vtd${ABC[C:C+8]} = vsubq_f16(vtr${ABC[C:C+8]}, vtl${ABC[C:C+8]});
+          const float16x8_t vbd${ABC[C:C+8]} = vsubq_f16(vbr${ABC[C:C+8]}, vbl${ABC[C:C+8]});
+
+        $for C in range(0, CHANNEL_TILE, 8):
+          const float16x8_t vt${ABC[C:C+8]} = vfmaq_f16(vtl${ABC[C:C+8]}, vtd${ABC[C:C+8]}, valphah);
+          const float16x8_t vb${ABC[C:C+8]} = vfmaq_f16(vbl${ABC[C:C+8]}, vbd${ABC[C:C+8]}, valphah);
+
+        $for C in range(0, CHANNEL_TILE, 8):
+          const float16x8_t vd${ABC[C:C+8]} = vsubq_f16(vb${ABC[C:C+8]}, vt${ABC[C:C+8]});
+
+        $for C in range(0, CHANNEL_TILE, 8):
+          const float16x8_t vo${ABC[C:C+8]} = vfmaq_f16(vt${ABC[C:C+8]}, vd${ABC[C:C+8]}, valphav);
+
+        $for C in range(0, CHANNEL_TILE, 8):
+          vst1q_f16(o, vo${ABC[C:C+8]}); o += 8;
+      }
+    for (; c >= 8 * sizeof(__fp16); c -= 8 * sizeof(__fp16)) {
+      const float16x8_t vtl = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vtr = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vbl = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vbr = vld1q_f16(i3); i3 += 8;
+
+      const float16x8_t vtd = vsubq_f16(vtr, vtl);
+      const float16x8_t vbd = vsubq_f16(vbr, vbl);
+
+      const float16x8_t vt = vfmaq_f16(vtl, vtd, valphah);
+      const float16x8_t vb = vfmaq_f16(vbl, vbd, valphah);
+
+      const float16x8_t vd = vsubq_f16(vb, vt);
+
+      const float16x8_t vo = vfmaq_f16(vt, vd, valphav);
+
+      vst1q_f16(o, vo); o += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      const float16x8_t vtl = vld1q_f16(i0);
+      const float16x8_t vtr = vld1q_f16(i1);
+      const float16x8_t vbl = vld1q_f16(i2);
+      const float16x8_t vbr = vld1q_f16(i3);
+
+      const float16x8_t vtd = vsubq_f16(vtr, vtl);
+      const float16x8_t vbd = vsubq_f16(vbr, vbl);
+
+      const float16x8_t vt = vfmaq_f16(vtl, vtd, valphah);
+      const float16x8_t vb = vfmaq_f16(vbl, vbd, valphah);
+
+      const float16x8_t vd = vsubq_f16(vb, vt);
+
+      float16x8_t vo = vfmaq_f16(vt, vd, valphav);
+
+      float16x4_t vo_lo = vget_low_f16(vo);
+      if (c & (4 * sizeof(__fp16))) {
+        vst1_f16(o, vo_lo); o += 4;
+        vo_lo = vget_high_f16(vo);
+      }
+      if (c & (2 * sizeof(__fp16))) {
+        vst1_lane_u32(o, vreinterpret_u32_f16(vo_lo), 0); o += 2;
+        vo_lo = vext_f16(vo_lo, vo_lo, 2);
+      }
+      if (c & (1 * sizeof(__fp16))) {
+        vst1_lane_f16(o, vo_lo, 0); o += 1;
+      }
+    }
+
+    o = (__fp16*) ((uintptr_t) o + output_increment);
+  } while (--output_pixels != 0);
+}
diff --git a/src/xnnpack/ibilinear.h b/src/xnnpack/ibilinear.h
index 739bb9a64..b39bedcc4 100644
--- a/src/xnnpack/ibilinear.h
+++ b/src/xnnpack/ibilinear.h
@@ -16,6 +16,20 @@
 extern "C" {
 #endif
 
+#define DECLARE_F16_IBILINEAR_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                            \
+    size_t output_pixels,                               \
+    size_t channels,                                    \
+    const void** input,                                 \
+    size_t input_offset,                                \
+    const void* weights,                                \
+    void* output,                                       \
+    size_t output_increment);
+
+DECLARE_F16_IBILINEAR_UKERNEL_FUNCTION(xnn_f16_ibilinear_ukernel__neonfp16arith_c8)
+DECLARE_F16_IBILINEAR_UKERNEL_FUNCTION(xnn_f16_ibilinear_ukernel__neonfp16arith_c16)
+
+
 #define DECLARE_F32_IBILINEAR_UKERNEL_FUNCTION(fn_name) \
   XNN_INTERNAL void fn_name(                            \
     size_t output_pixels,                               \
@@ -42,6 +56,7 @@ DECLARE_F32_IBILINEAR_UKERNEL_FUNCTION(xnn_f32_ibilinear_ukernel__sse_c8)
 DECLARE_F32_IBILINEAR_UKERNEL_FUNCTION(xnn_f32_ibilinear_ukernel__wasmsimd_c4)
 DECLARE_F32_IBILINEAR_UKERNEL_FUNCTION(xnn_f32_ibilinear_ukernel__wasmsimd_c8)
 
+
 #define DECLARE_S8_IBILINEAR_UKERNEL_FUNCTION(fn_name) \
   XNN_INTERNAL void fn_name(                           \
     size_t output_pixels,                              \
@@ -81,6 +96,7 @@ DECLARE_S8_IBILINEAR_UKERNEL_FUNCTION(xnn_s8_ibilinear_ukernel__wasmsimd_mul32_c
     uint8_t* output,                                   \
     size_t output_increment);
 
+
 DECLARE_U8_IBILINEAR_UKERNEL_FUNCTION(xnn_u8_ibilinear_ukernel__scalar_c1)
 DECLARE_U8_IBILINEAR_UKERNEL_FUNCTION(xnn_u8_ibilinear_ukernel__scalar_c2)
 DECLARE_U8_IBILINEAR_UKERNEL_FUNCTION(xnn_u8_ibilinear_ukernel__scalar_c4)
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index c164fc70d..471176a78 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -2657,6 +2657,15 @@ typedef void (*xnn_dwconv_multipass_ukernel_function)(
     const void* zero,
     const void* params);
 
+typedef void (*xnn_f16_ibilinear_ukernel_function)(
+    size_t output_pixels,
+    size_t channels,
+    const void** input,
+    size_t input_offset,
+    const void* weights,
+    void* output,
+    size_t output_increment);
+
 typedef void (*xnn_f32_ibilinear_ukernel_function)(
     size_t output_pixels,
     size_t channels,
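The new xnn_f16_ibilinear_ukernel_function type follows the same calling convention as the existing ibilinear kernels: channels, input_offset and output_increment are all expressed in bytes, the indirection buffer supplies four input pointers per output pixel, and the weights buffer holds one (alpha_h, alpha_v) pair of half-precision values per pixel. A minimal call for a single pixel with 8 channels could look like the sketch below (illustrative only, not part of the commit; the wrapper function and the bare prototype are assumptions, and it requires an AArch64 compiler with __fp16 support):

    #include <stddef.h>

    // Bare declaration of the kernel added in this commit, for illustration.
    void xnn_f16_ibilinear_ukernel__neonfp16arith_c8(
        size_t, size_t, const void**, size_t, const void*, void*, size_t);

    void run_one_pixel(const __fp16* top_left, const __fp16* top_right,
                       const __fp16* bottom_left, const __fp16* bottom_right,
                       const __fp16 weights[2] /* alpha_h, alpha_v */,
                       __fp16 output[8])
    {
      const void* indirection[4] = { top_left, top_right, bottom_left, bottom_right };
      xnn_f16_ibilinear_ukernel__neonfp16arith_c8(
          /*output_pixels=*/1,
          /*channels=*/8 * sizeof(__fp16),  // channel count is passed in bytes
          indirection,
          /*input_offset=*/0,               // byte offset added to each input pointer
          weights,                          // one (alpha_h, alpha_v) pair per pixel
          output,
          /*output_increment=*/0);          // bytes to skip after each pixel's channels
    }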
diff --git a/test/f16-ibilinear.cc b/test/f16-ibilinear.cc
new file mode 100644
index 000000000..0e9f88e79
--- /dev/null
+++ b/test/f16-ibilinear.cc
@@ -0,0 +1,175 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+//
+// Auto-generated file. Do not edit!
+//   Specification: test/f16-ibilinear.yaml
+//   Generator: tools/generate-ibilinear-test.py
+
+
+#include <gtest/gtest.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/isa-checks.h>
+
+#include <xnnpack/ibilinear.h>
+#include "ibilinear-microkernel-tester.h"
+
+
+#if XNN_ARCH_ARM64
+  TEST(F16_IBILINEAR__NEONFP16ARITH_C8, channels_eq_8) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    IBilinearMicrokernelTester()
+      .pixels(1)
+      .channels(8)
+      .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c8);
+  }
+
+  TEST(F16_IBILINEAR__NEONFP16ARITH_C8, channels_div_8) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 16; channels < 80; channels += 8) {
+      IBilinearMicrokernelTester()
+        .pixels(1)
+        .channels(channels)
+        .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c8);
+    }
+  }
+
+  TEST(F16_IBILINEAR__NEONFP16ARITH_C8, channels_lt_8) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 8; channels++) {
+      IBilinearMicrokernelTester()
+        .pixels(1)
+        .channels(channels)
+        .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c8);
+    }
+  }
+
+  TEST(F16_IBILINEAR__NEONFP16ARITH_C8, channels_gt_8) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 9; channels < 16; channels++) {
+      IBilinearMicrokernelTester()
+        .pixels(1)
+        .channels(channels)
+        .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c8);
+    }
+  }
+
+  TEST(F16_IBILINEAR__NEONFP16ARITH_C8, pixels_gt_1) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t pixels = 2; pixels < 3; pixels++) {
+      for (size_t channels = 1; channels <= 40; channels += 7) {
+        IBilinearMicrokernelTester()
+          .pixels(pixels)
+          .channels(channels)
+          .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c8);
+      }
+    }
+  }
+
+  TEST(F16_IBILINEAR__NEONFP16ARITH_C8, input_offset) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t pixels = 1; pixels < 5; pixels += 1) {
+      for (size_t channels = 1; channels <= 40; channels += 7) {
+        IBilinearMicrokernelTester()
+          .pixels(pixels)
+          .channels(channels)
+          .input_offset(43)
+          .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c8);
+      }
+    }
+  }
+
+  TEST(F16_IBILINEAR__NEONFP16ARITH_C8, output_stride) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t pixels = 1; pixels < 5; pixels += 1) {
+      for (size_t channels = 1; channels <= 40; channels += 7) {
+        IBilinearMicrokernelTester()
+          .pixels(pixels)
+          .channels(channels)
+          .output_stride(43)
+          .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c8);
+      }
+    }
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+  TEST(F16_IBILINEAR__NEONFP16ARITH_C16, channels_eq_16) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    IBilinearMicrokernelTester()
+      .pixels(1)
+      .channels(16)
+      .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c16);
+  }
+
+  TEST(F16_IBILINEAR__NEONFP16ARITH_C16, channels_div_16) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 32; channels < 160; channels += 16) {
+      IBilinearMicrokernelTester()
+        .pixels(1)
+        .channels(channels)
+        .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c16);
+    }
+  }
+
+  TEST(F16_IBILINEAR__NEONFP16ARITH_C16, channels_lt_16) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 16; channels++) {
+      IBilinearMicrokernelTester()
+        .pixels(1)
+        .channels(channels)
+        .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c16);
+    }
+  }
+
+  TEST(F16_IBILINEAR__NEONFP16ARITH_C16, channels_gt_16) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 17; channels < 32; channels++) {
+      IBilinearMicrokernelTester()
+        .pixels(1)
+        .channels(channels)
+        .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c16);
+    }
+  }
+
+  TEST(F16_IBILINEAR__NEONFP16ARITH_C16, pixels_gt_1) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t pixels = 2; pixels < 3; pixels++) {
+      for (size_t channels = 1; channels <= 80; channels += 15) {
+        IBilinearMicrokernelTester()
+          .pixels(pixels)
+          .channels(channels)
+          .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c16);
+      }
+    }
+  }
+
+  TEST(F16_IBILINEAR__NEONFP16ARITH_C16, input_offset) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t pixels = 1; pixels < 5; pixels += 1) {
+      for (size_t channels = 1; channels <= 80; channels += 15) {
+        IBilinearMicrokernelTester()
+          .pixels(pixels)
+          .channels(channels)
+          .input_offset(83)
+          .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c16);
+      }
+    }
+  }
+
+  TEST(F16_IBILINEAR__NEONFP16ARITH_C16, output_stride) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t pixels = 1; pixels < 5; pixels += 1) {
+      for (size_t channels = 1; channels <= 80; channels += 15) {
+        IBilinearMicrokernelTester()
+          .pixels(pixels)
+          .channels(channels)
+          .output_stride(83)
+          .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c16);
+      }
+    }
+  }
+#endif  // XNN_ARCH_ARM64
diff --git a/test/f16-ibilinear.yaml b/test/f16-ibilinear.yaml
new file mode 100644
index 000000000..5231fe70c
--- /dev/null
+++ b/test/f16-ibilinear.yaml
@@ -0,0 +1,12 @@
+# Copyright 2022 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# ARM NEON
+- name: xnn_f16_ibilinear_ukernel__neonfp16arith_c8
+  arch:
+    - aarch64
+- name: xnn_f16_ibilinear_ukernel__neonfp16arith_c16
+  arch:
+    - aarch64
diff --git a/test/ibilinear-microkernel-tester.h b/test/ibilinear-microkernel-tester.h
index 5857285c8..85bae73c4 100644
--- a/test/ibilinear-microkernel-tester.h
+++ b/test/ibilinear-microkernel-tester.h
@@ -16,6 +16,8 @@
 #include <random>
 #include <vector>
 
+#include <fp16.h>
+
 #include <xnnpack.h>
 #include <xnnpack/AlignedAllocator.h>
 #include <xnnpack/math.h>
@@ -92,6 +94,61 @@ class IBilinearMicrokernelTester {
     }
   }
 
+  void Test(xnn_f16_ibilinear_ukernel_function ibilinear) const {
+    std::random_device random_device;
+    auto rng = std::mt19937(random_device());
+    auto f32rng = std::bind(std::uniform_real_distribution<float>(0.01f, 1.0f), std::ref(rng));
+    auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+    std::vector<const uint16_t*> indirection(pixels() * 4);
+    std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) + indirection.size() * channels());
+    std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> packed_weights(pixels() * 2);
+    std::vector<uint16_t> output((pixels() - 1) * output_stride() + channels());
+    std::vector<float> output_ref(pixels() * channels());
+
+    for (size_t iteration = 0; iteration < iterations(); iteration++) {
+      std::generate(input.begin(), input.end(), std::ref(f16rng));
+      std::generate(packed_weights.begin(), packed_weights.end(), std::ref(f16rng));
+      std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
+
+      for (size_t i = 0; i < indirection.size(); i++) {
+        indirection[i] = input.data() + i * channels() - input_offset();
+      }
+      std::shuffle(indirection.begin(), indirection.end(), rng);
+
+      // Compute reference results.
+      for (size_t i = 0; i < pixels(); i++) {
+        for (size_t c = 0; c < channels(); c++) {
+          const float alpha_h = fp16_ieee_to_fp32_value(packed_weights[i * 2 + 0]);
+          const float alpha_v = fp16_ieee_to_fp32_value(packed_weights[i * 2 + 1]);
+          output_ref[i * channels() + c] =
+            fp16_ieee_to_fp32_value(indirection[i * 4 + 0][c + input_offset()]) * (1.0f - alpha_h) * (1.0f - alpha_v) +
+            fp16_ieee_to_fp32_value(indirection[i * 4 + 1][c + input_offset()]) * alpha_h * (1.0f - alpha_v) +
+            fp16_ieee_to_fp32_value(indirection[i * 4 + 2][c + input_offset()]) * (1.0f - alpha_h) * alpha_v +
+            fp16_ieee_to_fp32_value(indirection[i * 4 + 3][c + input_offset()]) * alpha_h * alpha_v;
+        }
+      }
+
+      // Call optimized micro-kernel.
+      ibilinear(
+        pixels(), channels() * sizeof(uint16_t),
+        reinterpret_cast<const void**>(indirection.data()), input_offset() * sizeof(uint16_t),
+        packed_weights.data(), output.data(),
+        (output_stride() - channels()) * sizeof(uint16_t));
+
+      // Verify results.
+      for (size_t i = 0; i < pixels(); i++) {
+        for (size_t c = 0; c < channels(); c++) {
+          ASSERT_NEAR(
+              fp16_ieee_to_fp32_value(output[i * output_stride() + c]),
+              output_ref[i * channels() + c],
+              std::abs(output_ref[i * channels() + c]) * 1.0e-2f)
+            << "pixel " << i << " / " << pixels() << ", channel " << c << " / " << channels();
+        }
+      }
+    }
+  }
+
   void Test(xnn_f32_ibilinear_ukernel_function ibilinear) const {
     std::random_device random_device;
     auto rng = std::mt19937(random_device());
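A brief note on the reference check above: the tester computes the expanded four-weight form of bilinear interpolation, while the kernels use the nested-lerp form. In exact arithmetic the two are identical:

    t   = (1 - alpha_h) * tl + alpha_h * tr
    b   = (1 - alpha_h) * bl + alpha_h * br
    out = (1 - alpha_v) * t  + alpha_v * b
        = (1 - alpha_h) * (1 - alpha_v) * tl + alpha_h * (1 - alpha_v) * tr
        + (1 - alpha_h) * alpha_v       * bl + alpha_h * alpha_v       * br

In half precision the two orderings round differently, which is presumably why the test accepts a 1% relative error (ASSERT_NEAR with a tolerance of std::abs(output_ref) * 1.0e-2f) rather than exact equality.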