-rw-r--r--   BUILD.bazel                                 |  12
-rwxr-xr-x   CMakeLists.txt                              |  11
-rwxr-xr-x   scripts/generate-f16-ibilinear.sh           |  14
-rw-r--r--   src/f16-ibilinear/gen/neonfp16arith-c16.c   | 122
-rw-r--r--   src/f16-ibilinear/gen/neonfp16arith-c8.c    |  93
-rw-r--r--   src/f16-ibilinear/neonfp16arith.c.in        | 118
-rw-r--r--   src/xnnpack/ibilinear.h                     |  16
-rw-r--r--   src/xnnpack/params.h                        |   9
-rw-r--r--   test/f16-ibilinear.cc                       | 175
-rw-r--r--   test/f16-ibilinear.yaml                     |  12
-rw-r--r--   test/ibilinear-microkernel-tester.h         |  57
11 files changed, 639 insertions(+), 0 deletions(-)
diff --git a/BUILD.bazel b/BUILD.bazel
index f70e0441e..6442aad77 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -4491,6 +4491,8 @@ ALL_AARCH64_NEONFP16ARITH_MICROKERNEL_SRCS = [
     "src/f16-gemm/gen/6x16-minmax-neonfp16arith-ld64.c",
     "src/f16-gemm/gen/8x8-minmax-neonfp16arith-ld64.c",
     "src/f16-gemm/gen/8x16-minmax-neonfp16arith-ld64.c",
+    "src/f16-ibilinear/gen/neonfp16arith-c8.c",
+    "src/f16-ibilinear/gen/neonfp16arith-c16.c",
     "src/f16-igemm/gen/1x8-minmax-neonfp16arith-ld64.c",
     "src/f16-igemm/gen/1x16-minmax-neonfp16arith-ld64.c",
     "src/f16-igemm/gen/4x8-minmax-neonfp16arith-ld64.c",
@@ -11054,6 +11056,16 @@ xnnpack_unit_test(
 )
 
 xnnpack_unit_test(
+    name = "f16_ibilinear_test",
+    srcs = [
+        "test/f16-ibilinear.cc",
+        "test/ibilinear-microkernel-tester.h",
+        "src/xnnpack/AlignedAllocator.h",
+    ] + MICROKERNEL_TEST_HDRS,
+    deps = MICROKERNEL_TEST_DEPS,
+)
+
+xnnpack_unit_test(
     name = "f16_igemm_minmax_test",
     srcs = [
         "test/f16-igemm-minmax.cc",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a8ebcab42..46bfd777f 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3244,6 +3244,8 @@ SET(ALL_AARCH64_NEONFP16ARITH_MICROKERNEL_SRCS
   src/f16-gemm/gen/6x16-minmax-neonfp16arith-ld64.c
   src/f16-gemm/gen/8x8-minmax-neonfp16arith-ld64.c
   src/f16-gemm/gen/8x16-minmax-neonfp16arith-ld64.c
+  src/f16-ibilinear/gen/neonfp16arith-c8.c
+  src/f16-ibilinear/gen/neonfp16arith-c16.c
   src/f16-igemm/gen/1x8-minmax-neonfp16arith-ld64.c
   src/f16-igemm/gen/1x16-minmax-neonfp16arith-ld64.c
   src/f16-igemm/gen/4x8-minmax-neonfp16arith-ld64.c
@@ -7107,6 +7109,15 @@ IF(XNNPACK_BUILD_TESTS)
   TARGET_LINK_LIBRARIES(f16-gemm-minmax-test PRIVATE cpuinfo fp16 pthreadpool gtest gtest_main gemm-microkernel-tester)
   ADD_TEST(f16-gemm-minmax-test f16-gemm-minmax-test)
 
+  ADD_EXECUTABLE(f16-ibilinear-test test/f16-ibilinear.cc $<TARGET_OBJECTS:all_microkernels>)
+  SET_TARGET_PROPERTIES(f16-ibilinear-test PROPERTIES
+    CXX_STANDARD 11
+    CXX_STANDARD_REQUIRED YES
+    CXX_EXTENSIONS YES)
+  TARGET_INCLUDE_DIRECTORIES(f16-ibilinear-test PRIVATE include src test)
+  TARGET_LINK_LIBRARIES(f16-ibilinear-test PRIVATE cpuinfo fp16 pthreadpool gtest gtest_main)
+  ADD_TEST(f16-ibilinear-test f16-ibilinear-test)
+
   ADD_EXECUTABLE(f16-igemm-minmax-test test/f16-igemm-minmax.cc $<TARGET_OBJECTS:all_microkernels> $<TARGET_OBJECTS:packing>)
   SET_TARGET_PROPERTIES(f16-igemm-minmax-test PROPERTIES
     CXX_STANDARD 11
diff --git a/scripts/generate-f16-ibilinear.sh b/scripts/generate-f16-ibilinear.sh
new file mode 100755
index 000000000..d84e6cd1b
--- /dev/null
+++ b/scripts/generate-f16-ibilinear.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+# Copyright 2022 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+################################### ARM NEON ##################################
+tools/xngen src/f16-ibilinear/neonfp16arith.c.in -D CHANNEL_TILE=8 -D PIXEL_TILE=1 -o src/f16-ibilinear/gen/neonfp16arith-c8.c &
+tools/xngen src/f16-ibilinear/neonfp16arith.c.in -D CHANNEL_TILE=16 -D PIXEL_TILE=1 -o src/f16-ibilinear/gen/neonfp16arith-c16.c &
+
+################################## Unit tests #################################
+tools/generate-ibilinear-test.py --spec test/f16-ibilinear.yaml --output test/f16-ibilinear.cc &
+
+wait
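For orientation, the generated kernels below compute a standard per-channel bilinear interpolation: a horizontal lerp between the top pair and the bottom pair of samples, then a vertical lerp between those two results, with each lerp folded into a fused multiply-add. The scalar sketch below (plain float C, not part of the commit; the function name and parameters are illustrative) shows the same computation the NEON code performs eight or sixteen channels at a time:

    #include <stddef.h>

    // Scalar model of one output pixel: i0..i3 point at the top-left, top-right,
    // bottom-left and bottom-right input rows; alpha_h/alpha_v are the per-pixel
    // interpolation weights read from the packed weights array.
    static void ibilinear_pixel_ref(
        size_t channels,
        const float* i0, const float* i1, const float* i2, const float* i3,
        float alpha_h, float alpha_v,
        float* output)
    {
      for (size_t c = 0; c < channels; c++) {
        const float t = i0[c] + (i1[c] - i0[c]) * alpha_h;  // top lerp (one FMA)
        const float b = i2[c] + (i3[c] - i2[c]) * alpha_h;  // bottom lerp (one FMA)
        output[c] = t + (b - t) * alpha_v;                  // vertical lerp (one FMA)
      }
    }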
diff --git a/src/f16-ibilinear/gen/neonfp16arith-c16.c b/src/f16-ibilinear/gen/neonfp16arith-c16.c
new file mode 100644
index 000000000..ea26fc592
--- /dev/null
+++ b/src/f16-ibilinear/gen/neonfp16arith-c16.c
@@ -0,0 +1,122 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f16-ibilinear/neonfp16arith.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/ibilinear.h>
+
+
+void xnn_f16_ibilinear_ukernel__neonfp16arith_c16(
+    size_t output_pixels,
+    size_t channels,
+    const void**restrict input,
+    size_t input_offset,
+    const void*restrict weights,
+    void*restrict output,
+    size_t output_increment) XNN_OOB_READS
+{
+  assert(output_pixels != 0);
+  assert(channels != 0);
+  assert(channels % sizeof(__fp16) == 0);
+
+  __fp16* o = (__fp16*) output;
+  do {
+    const __fp16* i0 = (const __fp16*) ((uintptr_t) input[0] + input_offset);
+    const __fp16* i1 = (const __fp16*) ((uintptr_t) input[1] + input_offset);
+    const __fp16* i2 = (const __fp16*) ((uintptr_t) input[2] + input_offset);
+    const __fp16* i3 = (const __fp16*) ((uintptr_t) input[3] + input_offset);
+    input += 4;
+
+    const float16x8_t valphah = vld1q_dup_f16(weights); weights = (const __fp16*) weights + 1;
+    const float16x8_t valphav = vld1q_dup_f16(weights); weights = (const __fp16*) weights + 1;
+
+    size_t c = channels;
+    for (; c >= 16 * sizeof(__fp16); c -= 16 * sizeof(__fp16)) {
+      const float16x8_t vtl456789AB = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vtr456789AB = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vbl456789AB = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vbr456789AB = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vtlCDEFGHIJ = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vtrCDEFGHIJ = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vblCDEFGHIJ = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vbrCDEFGHIJ = vld1q_f16(i3); i3 += 8;
+
+      const float16x8_t vtd456789AB = vsubq_f16(vtr456789AB, vtl456789AB);
+      const float16x8_t vbd456789AB = vsubq_f16(vbr456789AB, vbl456789AB);
+      const float16x8_t vtdCDEFGHIJ = vsubq_f16(vtrCDEFGHIJ, vtlCDEFGHIJ);
+      const float16x8_t vbdCDEFGHIJ = vsubq_f16(vbrCDEFGHIJ, vblCDEFGHIJ);
+
+      const float16x8_t vt456789AB = vfmaq_f16(vtl456789AB, vtd456789AB, valphah);
+      const float16x8_t vb456789AB = vfmaq_f16(vbl456789AB, vbd456789AB, valphah);
+      const float16x8_t vtCDEFGHIJ = vfmaq_f16(vtlCDEFGHIJ, vtdCDEFGHIJ, valphah);
+      const float16x8_t vbCDEFGHIJ = vfmaq_f16(vblCDEFGHIJ, vbdCDEFGHIJ, valphah);
+
+      const float16x8_t vd456789AB = vsubq_f16(vb456789AB, vt456789AB);
+      const float16x8_t vdCDEFGHIJ = vsubq_f16(vbCDEFGHIJ, vtCDEFGHIJ);
+
+      const float16x8_t vo456789AB = vfmaq_f16(vt456789AB, vd456789AB, valphav);
+      const float16x8_t voCDEFGHIJ = vfmaq_f16(vtCDEFGHIJ, vdCDEFGHIJ, valphav);
+
+      vst1q_f16(o, vo456789AB); o += 8;
+      vst1q_f16(o, voCDEFGHIJ); o += 8;
+    }
+    for (; c >= 8 * sizeof(__fp16); c -= 8 * sizeof(__fp16)) {
+      const float16x8_t vtl = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vtr = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vbl = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vbr = vld1q_f16(i3); i3 += 8;
+
+      const float16x8_t vtd = vsubq_f16(vtr, vtl);
+      const float16x8_t vbd = vsubq_f16(vbr, vbl);
+
+      const float16x8_t vt = vfmaq_f16(vtl, vtd, valphah);
+      const float16x8_t vb = vfmaq_f16(vbl, vbd, valphah);
+
+      const float16x8_t vd = vsubq_f16(vb, vt);
+
+      const float16x8_t vo = vfmaq_f16(vt, vd, valphav);
+
+      vst1q_f16(o, vo); o += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      const float16x8_t vtl = vld1q_f16(i0);
+      const float16x8_t vtr = vld1q_f16(i1);
+      const float16x8_t vbl = vld1q_f16(i2);
+      const float16x8_t vbr = vld1q_f16(i3);
+
+      const float16x8_t vtd = vsubq_f16(vtr, vtl);
+      const float16x8_t vbd = vsubq_f16(vbr, vbl);
+
+      const float16x8_t vt = vfmaq_f16(vtl, vtd, valphah);
+      const float16x8_t vb = vfmaq_f16(vbl, vbd, valphah);
+
+      const float16x8_t vd = vsubq_f16(vb, vt);
+
+      float16x8_t vo = vfmaq_f16(vt, vd, valphav);
+
+      float16x4_t vo_lo = vget_low_f16(vo);
+      if (c & (4 * sizeof(__fp16))) {
+        vst1_f16(o, vo_lo); o += 4;
+        vo_lo = vget_high_f16(vo);
+      }
+      if (c & (2 * sizeof(__fp16))) {
+        vst1_lane_u32(o, vreinterpret_u32_f16(vo_lo), 0); o += 2;
+        vo_lo = vext_f16(vo_lo, vo_lo, 2);
+      }
+      if (c & (1 * sizeof(__fp16))) {
+        vst1_lane_f16(o, vo_lo, 0); o += 1;
+      }
+    }
+
+    o = (__fp16*) ((uintptr_t) o + output_increment);
+  } while (--output_pixels != 0);
+}
diff --git a/src/f16-ibilinear/gen/neonfp16arith-c8.c b/src/f16-ibilinear/gen/neonfp16arith-c8.c
new file mode 100644
index 000000000..33db81fba
--- /dev/null
+++ b/src/f16-ibilinear/gen/neonfp16arith-c8.c
@@ -0,0 +1,93 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f16-ibilinear/neonfp16arith.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/ibilinear.h>
+
+
+void xnn_f16_ibilinear_ukernel__neonfp16arith_c8(
+    size_t output_pixels,
+    size_t channels,
+    const void**restrict input,
+    size_t input_offset,
+    const void*restrict weights,
+    void*restrict output,
+    size_t output_increment) XNN_OOB_READS
+{
+  assert(output_pixels != 0);
+  assert(channels != 0);
+  assert(channels % sizeof(__fp16) == 0);
+
+  __fp16* o = (__fp16*) output;
+  do {
+    const __fp16* i0 = (const __fp16*) ((uintptr_t) input[0] + input_offset);
+    const __fp16* i1 = (const __fp16*) ((uintptr_t) input[1] + input_offset);
+    const __fp16* i2 = (const __fp16*) ((uintptr_t) input[2] + input_offset);
+    const __fp16* i3 = (const __fp16*) ((uintptr_t) input[3] + input_offset);
+    input += 4;
+
+    const float16x8_t valphah = vld1q_dup_f16(weights); weights = (const __fp16*) weights + 1;
+    const float16x8_t valphav = vld1q_dup_f16(weights); weights = (const __fp16*) weights + 1;
+
+    size_t c = channels;
+    for (; c >= 8 * sizeof(__fp16); c -= 8 * sizeof(__fp16)) {
+      const float16x8_t vtl = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vtr = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vbl = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vbr = vld1q_f16(i3); i3 += 8;
+
+      const float16x8_t vtd = vsubq_f16(vtr, vtl);
+      const float16x8_t vbd = vsubq_f16(vbr, vbl);
+
+      const float16x8_t vt = vfmaq_f16(vtl, vtd, valphah);
+      const float16x8_t vb = vfmaq_f16(vbl, vbd, valphah);
+
+      const float16x8_t vd = vsubq_f16(vb, vt);
+
+      const float16x8_t vo = vfmaq_f16(vt, vd, valphav);
+
+      vst1q_f16(o, vo); o += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      const float16x8_t vtl = vld1q_f16(i0);
+      const float16x8_t vtr = vld1q_f16(i1);
+      const float16x8_t vbl = vld1q_f16(i2);
+      const float16x8_t vbr = vld1q_f16(i3);
+
+      const float16x8_t vtd = vsubq_f16(vtr, vtl);
+      const float16x8_t vbd = vsubq_f16(vbr, vbl);
+
+      const float16x8_t vt = vfmaq_f16(vtl, vtd, valphah);
+      const float16x8_t vb = vfmaq_f16(vbl, vbd, valphah);
+
+      const float16x8_t vd = vsubq_f16(vb, vt);
+
+      float16x8_t vo = vfmaq_f16(vt, vd, valphav);
+
+      float16x4_t vo_lo = vget_low_f16(vo);
+      if (c & (4 * sizeof(__fp16))) {
+        vst1_f16(o, vo_lo); o += 4;
+        vo_lo = vget_high_f16(vo);
+      }
+      if (c & (2 * sizeof(__fp16))) {
+        vst1_lane_u32(o, vreinterpret_u32_f16(vo_lo), 0); o += 2;
+        vo_lo = vext_f16(vo_lo, vo_lo, 2);
+      }
+      if (c & (1 * sizeof(__fp16))) {
+        vst1_lane_f16(o, vo_lo, 0); o += 1;
+      }
+    }
+
+    o = (__fp16*) ((uintptr_t) o + output_increment);
+  } while (--output_pixels != 0);
+}
diff --git a/src/f16-ibilinear/neonfp16arith.c.in b/src/f16-ibilinear/neonfp16arith.c.in
new file mode 100644
index 000000000..4262b4b5a
--- /dev/null
+++ b/src/f16-ibilinear/neonfp16arith.c.in
@@ -0,0 +1,118 @@
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert CHANNEL_TILE % 4 == 0
+$assert CHANNEL_TILE >= 4
+$assert PIXEL_TILE == 1
+$ABC = "456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/ibilinear.h>
+
+
+void xnn_f16_ibilinear_ukernel__neonfp16arith_c${CHANNEL_TILE}${"" if PIXEL_TILE == 1 else "x%d" % PIXEL_TILE}(
+    size_t output_pixels,
+    size_t channels,
+    const void**restrict input,
+    size_t input_offset,
+    const void*restrict weights,
+    void*restrict output,
+    size_t output_increment) XNN_OOB_READS
+{
+  assert(output_pixels != 0);
+  assert(channels != 0);
+  assert(channels % sizeof(__fp16) == 0);
+
+  __fp16* o = (__fp16*) output;
+  do {
+    const __fp16* i0 = (const __fp16*) ((uintptr_t) input[0] + input_offset);
+    const __fp16* i1 = (const __fp16*) ((uintptr_t) input[1] + input_offset);
+    const __fp16* i2 = (const __fp16*) ((uintptr_t) input[2] + input_offset);
+    const __fp16* i3 = (const __fp16*) ((uintptr_t) input[3] + input_offset);
+    input += 4;
+
+    const float16x8_t valphah = vld1q_dup_f16(weights); weights = (const __fp16*) weights + 1;
+    const float16x8_t valphav = vld1q_dup_f16(weights); weights = (const __fp16*) weights + 1;
+
+    size_t c = channels;
+    $if CHANNEL_TILE > 8:
+      for (; c >= ${CHANNEL_TILE} * sizeof(__fp16); c -= ${CHANNEL_TILE} * sizeof(__fp16)) {
+        $for C in range(0, CHANNEL_TILE, 8):
+          const float16x8_t vtl${ABC[C:C+8]} = vld1q_f16(i0); i0 += 8;
+          const float16x8_t vtr${ABC[C:C+8]} = vld1q_f16(i1); i1 += 8;
+          const float16x8_t vbl${ABC[C:C+8]} = vld1q_f16(i2); i2 += 8;
+          const float16x8_t vbr${ABC[C:C+8]} = vld1q_f16(i3); i3 += 8;
+
+        $for C in range(0, CHANNEL_TILE, 8):
+          const float16x8_t vtd${ABC[C:C+8]} = vsubq_f16(vtr${ABC[C:C+8]}, vtl${ABC[C:C+8]});
+          const float16x8_t vbd${ABC[C:C+8]} = vsubq_f16(vbr${ABC[C:C+8]}, vbl${ABC[C:C+8]});
+
+        $for C in range(0, CHANNEL_TILE, 8):
+          const float16x8_t vt${ABC[C:C+8]} = vfmaq_f16(vtl${ABC[C:C+8]}, vtd${ABC[C:C+8]}, valphah);
+          const float16x8_t vb${ABC[C:C+8]} = vfmaq_f16(vbl${ABC[C:C+8]}, vbd${ABC[C:C+8]}, valphah);
+
+        $for C in range(0, CHANNEL_TILE, 8):
+          const float16x8_t vd${ABC[C:C+8]} = vsubq_f16(vb${ABC[C:C+8]}, vt${ABC[C:C+8]});
+
+        $for C in range(0, CHANNEL_TILE, 8):
+          const float16x8_t vo${ABC[C:C+8]} = vfmaq_f16(vt${ABC[C:C+8]}, vd${ABC[C:C+8]}, valphav);
+
+        $for C in range(0, CHANNEL_TILE, 8):
+          vst1q_f16(o, vo${ABC[C:C+8]}); o += 8;
+      }
+    for (; c >= 8 * sizeof(__fp16); c -= 8 * sizeof(__fp16)) {
+      const float16x8_t vtl = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vtr = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vbl = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vbr = vld1q_f16(i3); i3 += 8;
+
+      const float16x8_t vtd = vsubq_f16(vtr, vtl);
+      const float16x8_t vbd = vsubq_f16(vbr, vbl);
+
+      const float16x8_t vt = vfmaq_f16(vtl, vtd, valphah);
+      const float16x8_t vb = vfmaq_f16(vbl, vbd, valphah);
+
+      const float16x8_t vd = vsubq_f16(vb, vt);
+
+      const float16x8_t vo = vfmaq_f16(vt, vd, valphav);
+
+      vst1q_f16(o, vo); o += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      const float16x8_t vtl = vld1q_f16(i0);
+      const float16x8_t vtr = vld1q_f16(i1);
+      const float16x8_t vbl = vld1q_f16(i2);
+      const float16x8_t vbr = vld1q_f16(i3);
+
+      const float16x8_t vtd = vsubq_f16(vtr, vtl);
+      const float16x8_t vbd = vsubq_f16(vbr, vbl);
+
+      const float16x8_t vt = vfmaq_f16(vtl, vtd, valphah);
+      const float16x8_t vb = vfmaq_f16(vbl, vbd, valphah);
+
+      const float16x8_t vd = vsubq_f16(vb, vt);
+
+      float16x8_t vo = vfmaq_f16(vt, vd, valphav);
+
+      float16x4_t vo_lo = vget_low_f16(vo);
+      if (c & (4 * sizeof(__fp16))) {
+        vst1_f16(o, vo_lo); o += 4;
+        vo_lo = vget_high_f16(vo);
+      }
+      if (c & (2 * sizeof(__fp16))) {
+        vst1_lane_u32(o, vreinterpret_u32_f16(vo_lo), 0); o += 2;
+        vo_lo = vext_f16(vo_lo, vo_lo, 2);
+      }
+      if (c & (1 * sizeof(__fp16))) {
+        vst1_lane_f16(o, vo_lo, 0); o += 1;
+      }
+    }
+
+    o = (__fp16*) ((uintptr_t) o + output_increment);
+  } while (--output_pixels != 0);
+}
diff --git a/src/xnnpack/ibilinear.h b/src/xnnpack/ibilinear.h
index 739bb9a64..b39bedcc4 100644
--- a/src/xnnpack/ibilinear.h
+++ b/src/xnnpack/ibilinear.h
@@ -16,6 +16,20 @@
 extern "C" {
 #endif
 
+#define DECLARE_F16_IBILINEAR_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                            \
+    size_t output_pixels,                               \
+    size_t channels,                                    \
+    const void** input,                                 \
+    size_t input_offset,                                \
+    const void* weights,                                \
+    void* output,                                       \
+    size_t output_increment);
+
+DECLARE_F16_IBILINEAR_UKERNEL_FUNCTION(xnn_f16_ibilinear_ukernel__neonfp16arith_c8)
+DECLARE_F16_IBILINEAR_UKERNEL_FUNCTION(xnn_f16_ibilinear_ukernel__neonfp16arith_c16)
+
+
 #define DECLARE_F32_IBILINEAR_UKERNEL_FUNCTION(fn_name) \
   XNN_INTERNAL void fn_name(                            \
     size_t output_pixels,                               \
@@ -42,6 +56,7 @@ DECLARE_F32_IBILINEAR_UKERNEL_FUNCTION(xnn_f32_ibilinear_ukernel__sse_c8)
 DECLARE_F32_IBILINEAR_UKERNEL_FUNCTION(xnn_f32_ibilinear_ukernel__wasmsimd_c4)
 DECLARE_F32_IBILINEAR_UKERNEL_FUNCTION(xnn_f32_ibilinear_ukernel__wasmsimd_c8)
 
+
 #define DECLARE_S8_IBILINEAR_UKERNEL_FUNCTION(fn_name) \
   XNN_INTERNAL void fn_name(                           \
     size_t output_pixels,                              \
@@ -81,6 +96,7 @@ DECLARE_S8_IBILINEAR_UKERNEL_FUNCTION(xnn_s8_ibilinear_ukernel__wasmsimd_mul32_c
     uint8_t* output,                                   \
     size_t output_increment);
 
+
 DECLARE_U8_IBILINEAR_UKERNEL_FUNCTION(xnn_u8_ibilinear_ukernel__scalar_c1)
 DECLARE_U8_IBILINEAR_UKERNEL_FUNCTION(xnn_u8_ibilinear_ukernel__scalar_c2)
 DECLARE_U8_IBILINEAR_UKERNEL_FUNCTION(xnn_u8_ibilinear_ukernel__scalar_c4)
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index c164fc70d..471176a78 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -2657,6 +2657,15 @@ typedef void (*xnn_dwconv_multipass_ukernel_function)(
     const void* zero,
     const void* params);
 
+typedef void (*xnn_f16_ibilinear_ukernel_function)(
+    size_t output_pixels,
+    size_t channels,
+    const void** input,
+    size_t input_offset,
+    const void* weights,
+    void* output,
+    size_t output_increment);
+
 typedef void (*xnn_f32_ibilinear_ukernel_function)(
     size_t output_pixels,
     size_t channels,
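The new xnn_f16_ibilinear_ukernel_function type follows the same calling convention as the existing ibilinear kernels: channels, input_offset and output_increment are all expressed in bytes, the indirection buffer supplies four input pointers per output pixel, and the weights buffer holds one (alpha_h, alpha_v) pair of half-precision values per pixel. A minimal call for a single pixel with 8 channels could look like the sketch below (illustrative only, not part of the commit; the wrapper function and the bare prototype are assumptions, and it requires an AArch64 compiler with __fp16 support):

    #include <stddef.h>

    // Bare declaration of the kernel added in this commit, for illustration.
    void xnn_f16_ibilinear_ukernel__neonfp16arith_c8(
        size_t, size_t, const void**, size_t, const void*, void*, size_t);

    void run_one_pixel(const __fp16* top_left, const __fp16* top_right,
                       const __fp16* bottom_left, const __fp16* bottom_right,
                       const __fp16 weights[2] /* alpha_h, alpha_v */,
                       __fp16 output[8])
    {
      const void* indirection[4] = { top_left, top_right, bottom_left, bottom_right };
      xnn_f16_ibilinear_ukernel__neonfp16arith_c8(
          /*output_pixels=*/1,
          /*channels=*/8 * sizeof(__fp16),  // channel count is passed in bytes
          indirection,
          /*input_offset=*/0,               // byte offset added to each input pointer
          weights,                          // one (alpha_h, alpha_v) pair per pixel
          output,
          /*output_increment=*/0);          // bytes to skip after each pixel's channels
    }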
diff --git a/test/f16-ibilinear.cc b/test/f16-ibilinear.cc
new file mode 100644
index 000000000..0e9f88e79
--- /dev/null
+++ b/test/f16-ibilinear.cc
@@ -0,0 +1,175 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+//
+// Auto-generated file. Do not edit!
+//   Specification: test/f16-ibilinear.yaml
+//   Generator: tools/generate-ibilinear-test.py
+
+
+#include <gtest/gtest.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/isa-checks.h>
+
+#include <xnnpack/ibilinear.h>
+#include "ibilinear-microkernel-tester.h"
+
+
+#if XNN_ARCH_ARM64
+  TEST(F16_IBILINEAR__NEONFP16ARITH_C8, channels_eq_8) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    IBilinearMicrokernelTester()
+      .pixels(1)
+      .channels(8)
+      .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c8);
+  }
+
+  TEST(F16_IBILINEAR__NEONFP16ARITH_C8, channels_div_8) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 16; channels < 80; channels += 8) {
+      IBilinearMicrokernelTester()
+        .pixels(1)
+        .channels(channels)
+        .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c8);
+    }
+  }
+
+  TEST(F16_IBILINEAR__NEONFP16ARITH_C8, channels_lt_8) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 8; channels++) {
+      IBilinearMicrokernelTester()
+        .pixels(1)
+        .channels(channels)
+        .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c8);
+    }
+  }
+
+  TEST(F16_IBILINEAR__NEONFP16ARITH_C8, channels_gt_8) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 9; channels < 16; channels++) {
+      IBilinearMicrokernelTester()
+        .pixels(1)
+        .channels(channels)
+        .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c8);
+    }
+  }
+
+  TEST(F16_IBILINEAR__NEONFP16ARITH_C8, pixels_gt_1) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t pixels = 2; pixels < 3; pixels++) {
+      for (size_t channels = 1; channels <= 40; channels += 7) {
+        IBilinearMicrokernelTester()
+          .pixels(pixels)
+          .channels(channels)
+          .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c8);
+      }
+    }
+  }
+
+  TEST(F16_IBILINEAR__NEONFP16ARITH_C8, input_offset) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t pixels = 1; pixels < 5; pixels += 1) {
+      for (size_t channels = 1; channels <= 40; channels += 7) {
+        IBilinearMicrokernelTester()
+          .pixels(pixels)
+          .channels(channels)
+          .input_offset(43)
+          .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c8);
+      }
+    }
+  }
+
+  TEST(F16_IBILINEAR__NEONFP16ARITH_C8, output_stride) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t pixels = 1; pixels < 5; pixels += 1) {
+      for (size_t channels = 1; channels <= 40; channels += 7) {
+        IBilinearMicrokernelTester()
+          .pixels(pixels)
+          .channels(channels)
+          .output_stride(43)
+          .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c8);
+      }
+    }
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+  TEST(F16_IBILINEAR__NEONFP16ARITH_C16, channels_eq_16) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    IBilinearMicrokernelTester()
+      .pixels(1)
+      .channels(16)
+      .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c16);
+  }
+
+  TEST(F16_IBILINEAR__NEONFP16ARITH_C16, channels_div_16) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 32; channels < 160; channels += 16) {
+      IBilinearMicrokernelTester()
+        .pixels(1)
+        .channels(channels)
+        .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c16);
+    }
+  }
+
+  TEST(F16_IBILINEAR__NEONFP16ARITH_C16, channels_lt_16) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 16; channels++) {
+      IBilinearMicrokernelTester()
+        .pixels(1)
+        .channels(channels)
+        .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c16);
+    }
+  }
+
+  TEST(F16_IBILINEAR__NEONFP16ARITH_C16, channels_gt_16) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 17; channels < 32; channels++) {
+      IBilinearMicrokernelTester()
+        .pixels(1)
+        .channels(channels)
+        .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c16);
+    }
+  }
+
+  TEST(F16_IBILINEAR__NEONFP16ARITH_C16, pixels_gt_1) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t pixels = 2; pixels < 3; pixels++) {
+      for (size_t channels = 1; channels <= 80; channels += 15) {
+        IBilinearMicrokernelTester()
+          .pixels(pixels)
+          .channels(channels)
+          .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c16);
+      }
+    }
+  }
+
+  TEST(F16_IBILINEAR__NEONFP16ARITH_C16, input_offset) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t pixels = 1; pixels < 5; pixels += 1) {
+      for (size_t channels = 1; channels <= 80; channels += 15) {
+        IBilinearMicrokernelTester()
+          .pixels(pixels)
+          .channels(channels)
+          .input_offset(83)
+          .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c16);
+      }
+    }
+  }
+
+  TEST(F16_IBILINEAR__NEONFP16ARITH_C16, output_stride) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t pixels = 1; pixels < 5; pixels += 1) {
+      for (size_t channels = 1; channels <= 80; channels += 15) {
+        IBilinearMicrokernelTester()
+          .pixels(pixels)
+          .channels(channels)
+          .output_stride(83)
+          .Test(xnn_f16_ibilinear_ukernel__neonfp16arith_c16);
+      }
+    }
+  }
+#endif  // XNN_ARCH_ARM64
diff --git a/test/f16-ibilinear.yaml b/test/f16-ibilinear.yaml
new file mode 100644
index 000000000..5231fe70c
--- /dev/null
+++ b/test/f16-ibilinear.yaml
@@ -0,0 +1,12 @@
+# Copyright 2022 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# ARM NEON
+- name: xnn_f16_ibilinear_ukernel__neonfp16arith_c8
+  arch:
+    - aarch64
+- name: xnn_f16_ibilinear_ukernel__neonfp16arith_c16
+  arch:
+    - aarch64
diff --git a/test/ibilinear-microkernel-tester.h b/test/ibilinear-microkernel-tester.h
index 5857285c8..85bae73c4 100644
--- a/test/ibilinear-microkernel-tester.h
+++ b/test/ibilinear-microkernel-tester.h
@@ -16,6 +16,8 @@
 #include <random>
 #include <vector>
 
+#include <fp16.h>
+
 #include <xnnpack.h>
 #include <xnnpack/AlignedAllocator.h>
 #include <xnnpack/math.h>
@@ -92,6 +94,61 @@ class IBilinearMicrokernelTester {
     }
   }
 
+  void Test(xnn_f16_ibilinear_ukernel_function ibilinear) const {
+    std::random_device random_device;
+    auto rng = std::mt19937(random_device());
+    auto f32rng = std::bind(std::uniform_real_distribution<float>(0.01f, 1.0f), std::ref(rng));
+    auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+    std::vector<const uint16_t*> indirection(pixels() * 4);
+    std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) + indirection.size() * channels());
+    std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> packed_weights(pixels() * 2);
+    std::vector<uint16_t> output((pixels() - 1) * output_stride() + channels());
+    std::vector<float> output_ref(pixels() * channels());
+
+    for (size_t iteration = 0; iteration < iterations(); iteration++) {
+      std::generate(input.begin(), input.end(), std::ref(f16rng));
+      std::generate(packed_weights.begin(), packed_weights.end(), std::ref(f16rng));
+      std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
+
+      for (size_t i = 0; i < indirection.size(); i++) {
+        indirection[i] = input.data() + i * channels() - input_offset();
+      }
+      std::shuffle(indirection.begin(), indirection.end(), rng);
+
+      // Compute reference results.
+      for (size_t i = 0; i < pixels(); i++) {
+        for (size_t c = 0; c < channels(); c++) {
+          const float alpha_h = fp16_ieee_to_fp32_value(packed_weights[i * 2 + 0]);
+          const float alpha_v = fp16_ieee_to_fp32_value(packed_weights[i * 2 + 1]);
+          output_ref[i * channels() + c] =
+            fp16_ieee_to_fp32_value(indirection[i * 4 + 0][c + input_offset()]) * (1.0f - alpha_h) * (1.0f - alpha_v) +
+            fp16_ieee_to_fp32_value(indirection[i * 4 + 1][c + input_offset()]) * alpha_h * (1.0f - alpha_v) +
+            fp16_ieee_to_fp32_value(indirection[i * 4 + 2][c + input_offset()]) * (1.0f - alpha_h) * alpha_v +
+            fp16_ieee_to_fp32_value(indirection[i * 4 + 3][c + input_offset()]) * alpha_h * alpha_v;
+        }
+      }
+
+      // Call optimized micro-kernel.
+      ibilinear(
+        pixels(), channels() * sizeof(uint16_t),
+        reinterpret_cast<const void**>(indirection.data()), input_offset() * sizeof(uint16_t),
+        packed_weights.data(), output.data(),
+        (output_stride() - channels()) * sizeof(uint16_t));
+
+      // Verify results.
+      for (size_t i = 0; i < pixels(); i++) {
+        for (size_t c = 0; c < channels(); c++) {
+          ASSERT_NEAR(
+              fp16_ieee_to_fp32_value(output[i * output_stride() + c]),
+              output_ref[i * channels() + c],
+              std::abs(output_ref[i * channels() + c]) * 1.0e-2f)
+            << "pixel " << i << " / " << pixels() << ", channel " << c << " / " << channels();
+        }
+      }
+    }
+  }
+
   void Test(xnn_f32_ibilinear_ukernel_function ibilinear) const {
     std::random_device random_device;
     auto rng = std::mt19937(random_device());
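A brief note on the reference check above: the tester computes the expanded four-weight form of bilinear interpolation, while the kernels use the nested-lerp form. In exact arithmetic the two are identical:

    t   = (1 - alpha_h) * tl + alpha_h * tr
    b   = (1 - alpha_h) * bl + alpha_h * br
    out = (1 - alpha_v) * t  + alpha_v * b
        = (1 - alpha_h) * (1 - alpha_v) * tl + alpha_h * (1 - alpha_v) * tr
        + (1 - alpha_h) * alpha_v       * bl + alpha_h * alpha_v       * br

In half precision the two orderings round differently, which is presumably why the test accepts a 1% relative error (ASSERT_NEAR with a tolerance of std::abs(output_ref) * 1.0e-2f) rather than exact equality.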