author | Marat Dukhan <maratek@google.com> | 2022-02-04 01:55:53 -0800
---|---|---
committer | XNNPACK Team <xnnpack-github-robot@google.com> | 2022-02-04 01:57:03 -0800
commit | 5756a927fc5044bdcfebe57d4bd84408ca0a0975 (patch) |
tree | 18e40a963d466dff13bb2a35340f3b032f047c0c |
parent | af1671ab277a0742f8c7251b6b4ad1a16aba79bd (diff) |
download | XNNPACK-5756a927fc5044bdcfebe57d4bd84408ca0a0975.tar.gz |
F16 Max Pooling NHWC operator
PiperOrigin-RevId: 426345207
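
The new public entry points (declared in include/xnnpack.h in the diff below) mirror the existing F32 API. A minimal usage sketch — not part of the commit itself — assuming the caller supplies the buffer sizes and NHWC FP16 data stored as `uint16_t`, with return-value checks elided:

```c
#include <math.h>    // INFINITY
#include <stddef.h>  // size_t
#include <stdint.h>  // uint16_t

#include <xnnpack.h>

// Sketch: 2x2 max pooling with stride 2 over an NHWC FP16 tensor.
// Status checks are elided here; see the tests below for full handling.
void max_pool_2x2_f16(const uint16_t* input, uint16_t* output,
                      size_t batch_size, size_t input_height,
                      size_t input_width, size_t channels) {
  xnn_initialize(NULL /* allocator */);

  xnn_operator_t max_pooling_op = NULL;
  xnn_create_max_pooling2d_nhwc_f16(
      /*input_padding_top=*/0, /*input_padding_right=*/0,
      /*input_padding_bottom=*/0, /*input_padding_left=*/0,
      /*pooling_height=*/2, /*pooling_width=*/2,
      /*stride_height=*/2, /*stride_width=*/2,
      /*dilation_height=*/1, /*dilation_width=*/1,
      channels, /*input_pixel_stride=*/channels,
      /*output_pixel_stride=*/channels,
      /*output_min=*/-INFINITY, /*output_max=*/+INFINITY,
      /*flags=*/0, &max_pooling_op);

  xnn_setup_max_pooling2d_nhwc_f16(
      max_pooling_op, batch_size, input_height, input_width,
      input, output, /*threadpool=*/NULL);
  xnn_run_operator(max_pooling_op, /*threadpool=*/NULL);
  xnn_delete_operator(max_pooling_op);
}
```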
-rw-r--r-- | BUILD.bazel | 2
-rwxr-xr-x | CMakeLists.txt | 2
-rw-r--r-- | include/xnnpack.h | 28
-rw-r--r-- | src/amalgam/f16c.c | 262
-rw-r--r-- | src/init.c | 14
-rw-r--r-- | src/operator-strings.c | 2
-rw-r--r-- | src/operators/max-pooling-nhwc.c | 89
-rw-r--r-- | src/xnnpack/operator.h | 1
-rw-r--r-- | src/xnnpack/params.h | 2
-rw-r--r-- | test/max-pooling-nhwc.cc | 819
-rw-r--r-- | test/max-pooling-operator-tester.h | 287
11 files changed, 1508 insertions, 0 deletions
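
The kernels added here use XNNPACK's 9p8x schedule (registered with `mr = 9`, `qr = 8` in src/init.c below): a first pass reduces up to 9 pooling rows directly into the output, then each remainder pass folds up to 8 more rows into the partial result, aliasing out-of-range row pointers back to row 0, which is harmless under `max`. The following scalar analogue is an illustration only — plain `float` instead of 8-wide FP16 lanes, and clamping folded to the end rather than applied at every store as the real NEONFP16ARITH and F16C kernels do (equivalent for max pooling):

```c
#include <math.h>     // fmaxf, fminf
#include <stddef.h>   // size_t, ptrdiff_t

// Scalar analogue of the 9p8x max-pooling schedule (illustrative only).
static void maxpool_9p8x_sketch(
    size_t kernel_elements, size_t channels,
    const float** input, float* output,
    float output_min, float output_max) {
  // First pass: reduce up to 9 rows directly into the output row.
  // Rows beyond kernel_elements alias row 0; max() makes this a no-op.
  for (size_t c = 0; c < channels; c++) {
    float vmax = input[0][c];
    for (size_t k = 1; k < 9; k++) {
      vmax = fmaxf(vmax, input[k < kernel_elements ? k : 0][c]);
    }
    output[c] = vmax;
  }
  input += 9;
  // Remainder passes: fold up to 8 more rows into the partial result.
  for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
    for (size_t c = 0; c < channels; c++) {
      float vmax = output[c];
      for (ptrdiff_t j = 0; j < 8; j++) {
        vmax = fmaxf(vmax, input[j < k ? j : 0][c]);
      }
      output[c] = vmax;
    }
    input += 8;
  }
  // Final clamp to [output_min, output_max].
  for (size_t c = 0; c < channels; c++) {
    output[c] = fmaxf(fminf(output[c], output_max), output_min);
  }
}
```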
diff --git a/BUILD.bazel b/BUILD.bazel index b9156846d..72fe1a218 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -4425,6 +4425,7 @@ PROD_AARCH64_NEONFP16ARITH_MICROKERNEL_SRCS = [ "src/f16-gemm/gen/6x16-minmax-neonfp16arith-ld64.c", "src/f16-igemm/gen/1x16-minmax-neonfp16arith-ld64.c", "src/f16-igemm/gen/6x16-minmax-neonfp16arith-ld64.c", + "src/f16-maxpool/9p8x-minmax-neonfp16arith-c8.c", "src/f16-prelu/gen/neonfp16arith-2x16.c", "src/f16-vbinary/gen/vadd-minmax-neonfp16arith-x16.c", "src/f16-vbinary/gen/vaddc-minmax-neonfp16arith-x16.c", @@ -6017,6 +6018,7 @@ PROD_F16C_MICROKERNEL_SRCS = [ "src/f16-f32-vcvt/gen/vcvt-f16c-x16.c", "src/f16-gavgpool/gen/7p7x-minmax-f16c-c8.c", "src/f16-gavgpool/gen/7x-minmax-f16c-c8.c", + "src/f16-maxpool/9p8x-minmax-f16c-c8.c", "src/f16-prelu/gen/f16c-2x16.c", "src/f16-vbinary/gen/vadd-minmax-f16c-x16.c", "src/f16-vbinary/gen/vaddc-minmax-f16c-x16.c", diff --git a/CMakeLists.txt b/CMakeLists.txt index 8ef91efc0..936d7b9b5 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3181,6 +3181,7 @@ SET(PROD_AARCH64_NEONFP16ARITH_MICROKERNEL_SRCS src/f16-gemm/gen/6x16-minmax-neonfp16arith-ld64.c src/f16-igemm/gen/1x16-minmax-neonfp16arith-ld64.c src/f16-igemm/gen/6x16-minmax-neonfp16arith-ld64.c + src/f16-maxpool/9p8x-minmax-neonfp16arith-c8.c src/f16-prelu/gen/neonfp16arith-2x16.c src/f16-vbinary/gen/vadd-minmax-neonfp16arith-x16.c src/f16-vbinary/gen/vaddc-minmax-neonfp16arith-x16.c @@ -4757,6 +4758,7 @@ SET(PROD_F16C_MICROKERNEL_SRCS src/f16-f32-vcvt/gen/vcvt-f16c-x16.c src/f16-gavgpool/gen/7p7x-minmax-f16c-c8.c src/f16-gavgpool/gen/7x-minmax-f16c-c8.c + src/f16-maxpool/9p8x-minmax-f16c-c8.c src/f16-prelu/gen/neonfp16arith-2x16.c src/f16-vbinary/gen/vadd-minmax-f16c-x16.c src/f16-vbinary/gen/vaddc-minmax-f16c-x16.c diff --git a/include/xnnpack.h b/include/xnnpack.h index c29899a9d..f8a581c09 100644 --- a/include/xnnpack.h +++ b/include/xnnpack.h @@ -1996,6 +1996,34 @@ enum xnn_status xnn_setup_hardswish_nc_f16( void* output, pthreadpool_t threadpool); +enum xnn_status xnn_create_max_pooling2d_nhwc_f16( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t pooling_height, + uint32_t pooling_width, + uint32_t stride_height, + uint32_t stride_width, + uint32_t dilation_height, + uint32_t dilation_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* max_pooling_op_out); + +enum xnn_status xnn_setup_max_pooling2d_nhwc_f16( + xnn_operator_t max_pooling_op, + size_t batch_size, + size_t input_height, + size_t input_width, + const void* input, + void* output, + pthreadpool_t threadpool); + enum xnn_status xnn_create_multiply_nd_f16( float output_min, float output_max, diff --git a/src/amalgam/f16c.c b/src/amalgam/f16c.c index 1e41f3aab..d28728b8b 100644 --- a/src/amalgam/f16c.c +++ b/src/amalgam/f16c.c @@ -11,6 +11,7 @@ #include <xnnpack/gavgpool.h> #include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/math.h> +#include <xnnpack/maxpool.h> #include <xnnpack/prelu.h> #include <xnnpack/vbinary.h> #include <xnnpack/vcvt.h> @@ -358,6 +359,267 @@ void xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8( } } +void xnn_f16_maxpool_minmax_ukernel_9p8x__f16c_c8( + size_t output_pixels, + size_t kernel_elements, + size_t channels, + const void** input, + size_t input_offset, + void* output, + size_t input_increment, + size_t output_increment, + const union xnn_f16_minmax_params params[restrict 
XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(output_pixels != 0); + assert(kernel_elements != 0); + assert(channels != 0); + + const __m256 voutput_min = _mm256_load_ps(params->avx.min); + const __m256 voutput_max = _mm256_load_ps(params->avx.max); + do { + uint16_t* o = output; + { + const uint16_t* i0 = *input++; + const uint16_t* i1 = *input++; + const uint16_t* i2 = *input++; + const uint16_t* i3 = *input++; + const uint16_t* i4 = *input++; + const uint16_t* i5 = *input++; + const uint16_t* i6 = *input++; + const uint16_t* i7 = *input++; + const uint16_t* i8 = *input++; + i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset); + i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset); + i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset); + i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset); + i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset); + i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset); + i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset); + i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset); + i8 = (const uint16_t*) ((uintptr_t) i8 + input_offset); + if (kernel_elements < 2) { + i1 = i0; + } + if (kernel_elements <= 2) { + i2 = i0; + } + if (kernel_elements < 4) { + i3 = i0; + } + if (kernel_elements <= 4) { + i4 = i0; + } + if (kernel_elements < 6) { + i5 = i0; + } + if (kernel_elements <= 6) { + i6 = i0; + } + if (kernel_elements < 8) { + i7 = i0; + } + if (kernel_elements <= 8) { + i8 = i0; + } + + size_t c = channels; + for (; c >= 8; c -= 8) { + const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); + i0 += 8; + const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); + i1 += 8; + const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); + i2 += 8; + const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); + i3 += 8; + const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); + i4 += 8; + const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); + i5 += 8; + const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); + i6 += 8; + const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7)); + i7 += 8; + const __m256 vi8 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8)); + i8 += 8; + + const __m256 vmax018 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vi8); + const __m256 vmax23 = _mm256_max_ps(vi2, vi3); + const __m256 vmax45 = _mm256_max_ps(vi4, vi5); + const __m256 vmax67 = _mm256_max_ps(vi6, vi7); + + const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45); + const __m256 vmax01678 = _mm256_max_ps(vmax018, vmax67); + const __m256 vmax = _mm256_max_ps(vmax2345, vmax01678); + const __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min); + + _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC)); + o += 8; + } + if (c != 0) { + const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); + i0 += 8; + const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); + i1 += 8; + const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); + i2 += 8; + const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); + i3 += 8; + const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); + i4 += 8; + const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); + i5 += 8; + const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); + i6 += 8; + const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const 
__m128i*) i7)); + i7 += 8; + const __m256 vi8 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8)); + i8 += 8; + + const __m256 vmax018 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vi8); + const __m256 vmax23 = _mm256_max_ps(vi2, vi3); + const __m256 vmax45 = _mm256_max_ps(vi4, vi5); + const __m256 vmax67 = _mm256_max_ps(vi6, vi7); + + const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45); + const __m256 vmax01678 = _mm256_max_ps(vmax018, vmax67); + const __m256 vmax = _mm256_max_ps(vmax2345, vmax01678); + __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min); + + __m128i vh = _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC); + if (c & 4) { + _mm_storel_epi64((__m128i*) o, vh); + vh = _mm_unpackhi_epi64(vh, vh); + o += 4; + } + if (c & 2) { + *((uint32_t*) o) = (uint32_t) _mm_cvtsi128_si32(vh); + vh = _mm_srli_epi64(vh, 32); + o += 2; + } + if (c & 1) { + *o = _mm_extract_epi16(vh, 0); + o += 1; + } + } + } + + for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) { + const uint16_t* i0 = *input++; + const uint16_t* i1 = *input++; + const uint16_t* i2 = *input++; + const uint16_t* i3 = *input++; + const uint16_t* i4 = *input++; + const uint16_t* i5 = *input++; + const uint16_t* i6 = *input++; + const uint16_t* i7 = *input++; + i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset); + i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset); + i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset); + i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset); + i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset); + i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset); + i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset); + i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset); + if (k < 2) { + i1 = i0; + } + if (k <= 2) { + i2 = i0; + } + if (k < 4) { + i3 = i0; + } + if (k <= 4) { + i4 = i0; + } + if (k < 6) { + i5 = i0; + } + if (k <= 6) { + i6 = i0; + } + if (k < 8) { + i7 = i0; + } + + o = output; + size_t c = channels; + for (; c >= 8; c -= 8) { + const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); + i0 += 8; + const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); + i1 += 8; + const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); + i2 += 8; + const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); + i3 += 8; + const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); + i4 += 8; + const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); + i5 += 8; + const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); + i6 += 8; + const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7)); + i7 += 8; + const __m256 vo = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) o)); + + const __m256 vmax01 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vo); + const __m256 vmax23 = _mm256_max_ps(vi2, vi3); + const __m256 vmax45 = _mm256_max_ps(vi4, vi5); + const __m256 vmax67 = _mm256_max_ps(vi6, vi7); + + const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45); + const __m256 vmax0167 = _mm256_max_ps(vmax01, vmax67); + const __m256 vmax = _mm256_max_ps(vmax2345, vmax0167); + const __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min); + + _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC)); + o += 8; + } + if (c != 0) { + const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); + const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); + const __m256 vi2 = 
_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); + const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); + const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); + const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); + const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); + const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7)); + const __m256 vo = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) o)); + + const __m256 vmax01 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vo); + const __m256 vmax23 = _mm256_max_ps(vi2, vi3); + const __m256 vmax45 = _mm256_max_ps(vi4, vi5); + const __m256 vmax67 = _mm256_max_ps(vi6, vi7); + + const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45); + const __m256 vmax0167 = _mm256_max_ps(vmax01, vmax67); + const __m256 vmax = _mm256_max_ps(vmax2345, vmax0167); + __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min); + + __m128i vh = _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC); + if (c & 4) { + _mm_storel_epi64((__m128i*) o, vh); + vh = _mm_unpackhi_epi64(vh, vh); + o += 4; + } + if (c & 2) { + *((uint32_t*) o) = (uint32_t) _mm_cvtsi128_si32(vh); + vh = _mm_srli_epi64(vh, 32); + o += 2; + } + if (c & 1) { + *o = _mm_extract_epi16(vh, 0); + o += 1; + } + } + } + input = (const void**) ((uintptr_t) input + input_increment); + output = (uint16_t*) ((uintptr_t) o + output_increment); + } while (--output_pixels != 0); +} + void xnn_f16_prelu_ukernel__f16c_2x16( size_t rows, size_t channels, diff --git a/src/init.c b/src/init.c index fc167e891..8a1f66601 100644 --- a/src/init.c +++ b/src/init.c @@ -2440,6 +2440,13 @@ static void init(void) { .channel_tile = 8, }; + xnn_params.f16.maxpool = (struct maxpool_parameters) { + .ukernel = (xnn_maxpool_ukernel_function) xnn_f16_maxpool_minmax_ukernel_9p8x__neonfp16arith_c8, + .init.f16 = xnn_init_f16_minmax_neon_params, + .mr = 9, + .qr = 8, + }; + xnn_params.f16.prelu = (struct prelu_parameters) { .ukernel = (xnn_prelu_ukernel_function) xnn_f16_prelu_ukernel__neonfp16arith_2x16, .row_tile = 2, @@ -3665,6 +3672,13 @@ static void init(void) { .channel_tile = 8, }; + xnn_params.f16.maxpool = (struct maxpool_parameters) { + .ukernel = (xnn_maxpool_ukernel_function) xnn_f16_maxpool_minmax_ukernel_9p8x__f16c_c8, + .init.f16 = xnn_init_f16_minmax_avx_params, + .mr = 9, + .qr = 8, + }; + xnn_params.f16.prelu = (struct prelu_parameters) { .ukernel = (xnn_prelu_ukernel_function) xnn_f16_prelu_ukernel__f16c_2x16, .row_tile = 2, diff --git a/src/operator-strings.c b/src/operator-strings.c index 9bb524bed..dce9c4671 100644 --- a/src/operator-strings.c +++ b/src/operator-strings.c @@ -124,6 +124,8 @@ const char* xnn_operator_type_to_string(enum xnn_operator_type type) { return "Leaky ReLU (NC, F32)"; case xnn_operator_type_leaky_relu_nc_qu8: return "Leaky ReLU (NC, QU8)"; + case xnn_operator_type_max_pooling_nhwc_f16: + return "Max Pooling (NHWC, F16)"; case xnn_operator_type_max_pooling_nhwc_f32: return "Max Pooling (NHWC, F32)"; case xnn_operator_type_max_pooling_nhwc_s8: diff --git a/src/operators/max-pooling-nhwc.c b/src/operators/max-pooling-nhwc.c index 1ec409fdb..8e13b2c47 100644 --- a/src/operators/max-pooling-nhwc.c +++ b/src/operators/max-pooling-nhwc.c @@ -14,6 +14,8 @@ #include <stdlib.h> #include <string.h> +#include <fp16.h> + #include <xnnpack.h> #include <xnnpack/allocator.h> #include <xnnpack/common.h> @@ -455,6 +457,66 @@ enum xnn_status xnn_create_max_pooling2d_nhwc_f32( max_pooling_op_out); } +enum 
xnn_status xnn_create_max_pooling2d_nhwc_f16( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t pooling_height, + uint32_t pooling_width, + uint32_t stride_height, + uint32_t stride_width, + uint32_t dilation_height, + uint32_t dilation_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* max_pooling_op_out) +{ + if (isnan(output_min)) { + xnn_log_error( + "failed to create %s with NaN output lower bound: lower bound must be non-NaN", + xnn_operator_type_to_string(xnn_operator_type_max_pooling_nhwc_f16)); + return xnn_status_invalid_parameter; + } + + if (isnan(output_max)) { + xnn_log_error( + "failed to create %s with NaN output upper bound: upper bound must be non-NaN", + xnn_operator_type_to_string(xnn_operator_type_max_pooling_nhwc_f16)); + return xnn_status_invalid_parameter; + } + + const uint16_t output_min_as_half = fp16_ieee_from_fp32_value(output_min); + const uint16_t output_max_as_half = fp16_ieee_from_fp32_value(output_max); + output_min = fp16_ieee_to_fp32_value(output_min_as_half); + output_max = fp16_ieee_to_fp32_value(output_max_as_half); + if (output_min >= output_max) { + xnn_log_error( + "failed to create %s operator with [%.7g, %.7g] output range: lower bound must be below upper bound", + xnn_operator_type_to_string(xnn_operator_type_max_pooling_nhwc_f16), output_min, output_max); + return xnn_status_invalid_parameter; + } + + union xnn_f16_minmax_params params; + if (xnn_params.f16.maxpool.init.f16 != NULL) { + xnn_params.f16.maxpool.init.f16(¶ms, output_min_as_half, output_max_as_half); + } + return create_max_pooling2d_nhwc( + input_padding_top, input_padding_right, input_padding_bottom, input_padding_left, + pooling_height, pooling_width, + stride_height, stride_width, + dilation_height, dilation_width, + channels, input_pixel_stride, output_pixel_stride, + flags, + ¶ms, sizeof(params), XNN_INIT_FLAG_F16, + xnn_operator_type_max_pooling_nhwc_f16, + max_pooling_op_out); +} + enum xnn_status xnn_setup_max_pooling2d_nhwc_s8( xnn_operator_t max_pooling_op, size_t batch_size, @@ -509,6 +571,33 @@ enum xnn_status xnn_setup_max_pooling2d_nhwc_u8( pthreadpool_get_threads_count(threadpool)); } +enum xnn_status xnn_setup_max_pooling2d_nhwc_f16( + xnn_operator_t max_pooling_op, + size_t batch_size, + size_t input_height, + size_t input_width, + const void* input, + void* output, + pthreadpool_t threadpool) +{ + if (max_pooling_op->type != xnn_operator_type_max_pooling_nhwc_f16) { + xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)", + xnn_operator_type_to_string(xnn_operator_type_max_pooling_nhwc_f16), + xnn_operator_type_to_string(max_pooling_op->type)); + return xnn_status_invalid_parameter; + } + + return setup_max_pooling2d_nhwc( + max_pooling_op, + batch_size, input_height, input_width, + input, output, + 1 /* log2(sizeof(input element)) = log2(sizeof(uint16_t)) */, + 1 /* log2(sizeof(output element)) = log2(sizeof(uint16_t)) */, + &xnn_params.f16.maxpool, + &max_pooling_op->params.f16_minmax, sizeof(max_pooling_op->params.f16_minmax), + pthreadpool_get_threads_count(threadpool)); +} + enum xnn_status xnn_setup_max_pooling2d_nhwc_f32( xnn_operator_t max_pooling_op, size_t batch_size, diff --git a/src/xnnpack/operator.h b/src/xnnpack/operator.h index b41a4ebee..7f4429f3f 100644 --- a/src/xnnpack/operator.h +++ b/src/xnnpack/operator.h @@ -86,6 
+86,7 @@ enum xnn_operator_type { xnn_operator_type_hardswish_nc_f32, xnn_operator_type_leaky_relu_nc_f32, xnn_operator_type_leaky_relu_nc_qu8, + xnn_operator_type_max_pooling_nhwc_f16, xnn_operator_type_max_pooling_nhwc_f32, xnn_operator_type_max_pooling_nhwc_s8, xnn_operator_type_max_pooling_nhwc_u8, diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h index 53383c44f..90a145260 100644 --- a/src/xnnpack/params.h +++ b/src/xnnpack/params.h @@ -3943,6 +3943,7 @@ struct maxpool_parameters { xnn_init_s8_minmax_params_fn s8; xnn_init_u8_minmax_params_fn u8; xnn_init_f32_minmax_params_fn f32; + xnn_init_f16_minmax_params_fn f16; } init; uint8_t mr; uint8_t qr; @@ -4096,6 +4097,7 @@ struct xnn_parameters { struct gemm_parameters gemm; struct gemm_parameters gemm2; struct dwconv_parameters dwconv[XNN_MAX_F16_DWCONV_UKERNELS]; + struct maxpool_parameters maxpool; struct vunary_parameters hswish; struct prelu_parameters prelu; struct vbinary_parameters vadd; diff --git a/test/max-pooling-nhwc.cc b/test/max-pooling-nhwc.cc index 327dc970f..b25774471 100644 --- a/test/max-pooling-nhwc.cc +++ b/test/max-pooling-nhwc.cc @@ -829,6 +829,7 @@ TEST(MAX_POOLING_NHWC_S8, setup_swap_height_and_width) { .TestSetupS8(); } + TEST(MAX_POOLING_NHWC_U8, unit_batch_small_1xM_pool) { ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); for (size_t channels = 1; channels <= 100; channels += 15) { @@ -1645,6 +1646,824 @@ TEST(MAX_POOLING_NHWC_U8, setup_swap_height_and_width) { .TestSetupU8(); } + +TEST(MAX_POOLING_NHWC_F16, unit_batch_small_1xM_pool) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(pool_size + 2) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_small_1xM_pool_with_padding) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 3; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + for (size_t padding_left = 0; padding_left <= 1; padding_left++) { + for (size_t padding_right = 0; padding_right <= 1; padding_right++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(pool_size + 2) + .padding_left(padding_left) + .padding_right(padding_right) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .TestF16(); + } + } + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_small_1xM_pool_with_tf_same_padding) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 3; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + for (size_t input_width = pool_size; input_width <= pool_size * 2; input_width++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(input_width) + .padding_tf_same(true) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .TestF16(); + } + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_small_1xM_pool_with_stride) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 2; pool_size <= 
xnn_params.f32.maxpool.mr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(pool_size + 4) + .pooling_height(1) + .pooling_width(pool_size) + .stride_width(2) + .channels(channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_small_1xM_pool_with_dilation) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(2 * pool_size + 1) + .padding_left(1) + .padding_right(1) + .pooling_height(1) + .pooling_width(pool_size) + .dilation_width(2) + .channels(channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_small_Mx1_pool) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_small_Mx1_pool_with_padding) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + for (size_t padding_top = 0; padding_top <= 1; padding_top++) { + for (size_t padding_bottom = 0; padding_bottom <= 1; padding_bottom++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(pool_size + 1) + .input_width(3) + .padding_top(padding_top) + .padding_bottom(padding_bottom) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .TestF16(); + } + } + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_small_Mx1_pool_with_tf_same_padding) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + for (size_t input_height = pool_size; input_height <= pool_size * 2; input_height++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(input_height) + .input_width(3) + .padding_tf_same(true) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .TestF16(); + } + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_small_Mx1_pool_with_stride) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(pool_size + 3) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .stride_height(2) + .channels(channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_small_Mx1_pool_with_dilation) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2 * pool_size) + .input_width(3) + .padding_top(1) + .padding_bottom(1) + .pooling_height(pool_size) + 
.pooling_width(1) + .dilation_height(2) + .channels(channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_small_pool_with_input_stride) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .input_pixel_stride(5 * channels) + .TestF16(); + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(pool_size + 2) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .input_pixel_stride(5 * channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_small_pool_with_output_stride) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .output_pixel_stride(5 * channels) + .TestF16(); + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(pool_size + 2) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .output_pixel_stride(5 * channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_small_pool_with_qmin) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .qmin(192) + .TestF16(); + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(pool_size + 2) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .qmin(192) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_small_pool_with_qmax) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .qmax(192) + .TestF16(); + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(pool_size + 2) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .qmax(192) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_large_1xM_pool) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(pool_size + 2) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_large_1xM_pool_with_padding) { + 
ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + for (size_t padding_left = 0; padding_left <= 1; padding_left++) { + for (size_t padding_right = 0; padding_right <= 1; padding_right++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(pool_size + 2) + .padding_left(padding_left) + .padding_right(padding_right) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .TestF16(); + } + } + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_large_1xM_pool_with_tf_same_padding) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + for (size_t input_width = pool_size; input_width <= pool_size * 2; input_width++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(input_width) + .padding_tf_same(true) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .TestF16(); + } + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_large_1xM_pool_with_stride) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(pool_size + 4) + .pooling_height(1) + .pooling_width(pool_size) + .stride_width(2) + .channels(channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_large_1xM_pool_with_dilation) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(2 * pool_size + 1) + .padding_left(1) + .padding_right(1) + .pooling_height(1) + .pooling_width(pool_size) + .dilation_width(2) + .channels(channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_large_Mx1_pool) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_large_Mx1_pool_with_padding) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + for (size_t padding_top = 0; padding_top <= 1; padding_top++) { + for (size_t padding_bottom = 0; padding_bottom <= 1; padding_bottom++) { + MaxPoolingOperatorTester() + 
.batch_size(1) + .input_height(pool_size + 1) + .input_width(3) + .padding_top(padding_top) + .padding_bottom(padding_bottom) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .TestF16(); + } + } + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_large_Mx1_pool_with_tf_same_padding) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + for (size_t input_height = pool_size; input_height <= pool_size * 2; input_height++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(input_height) + .input_width(3) + .padding_tf_same(true) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .TestF16(); + } + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_large_Mx1_pool_with_stride) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(pool_size + 3) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .stride_height(2) + .channels(channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_large_Mx1_pool_with_dilation) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2 * pool_size) + .input_width(3) + .padding_top(1) + .padding_bottom(1) + .pooling_height(pool_size) + .pooling_width(1) + .dilation_height(2) + .channels(channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_large_pool_with_input_stride) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .input_pixel_stride(5 * channels) + .TestF16(); + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(pool_size + 2) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .input_pixel_stride(5 * channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_large_pool_with_output_stride) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .output_pixel_stride(5 * channels) + .TestF16(); + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(pool_size + 2) + .pooling_height(1) + 
.pooling_width(pool_size) + .channels(channels) + .output_pixel_stride(5 * channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_large_pool_with_qmin) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .qmin(192) + .TestF16(); + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(pool_size + 2) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .qmin(192) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_large_pool_with_qmax) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .qmax(192) + .TestF16(); + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(pool_size + 2) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .qmax(192) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, small_batch_small_pool) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .TestF16(); + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(2) + .input_width(pool_size + 2) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, small_batch_small_pool_with_input_stride) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .input_pixel_stride(5 * channels) + .TestF16(); + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(2) + .input_width(pool_size + 2) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .input_pixel_stride(5 * channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, small_batch_small_pool_with_output_stride) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .output_pixel_stride(5 * channels) + .TestF16(); + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(2) + 
.input_width(pool_size + 2) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .output_pixel_stride(5 * channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, small_batch_large_pool) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .TestF16(); + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(2) + .input_width(pool_size + 2) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, small_batch_large_pool_with_input_stride) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .input_pixel_stride(5 * channels) + .TestF16(); + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(2) + .input_width(pool_size + 2) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .input_pixel_stride(5 * channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, small_batch_large_pool_with_output_stride) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .output_pixel_stride(5 * channels) + .TestF16(); + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(2) + .input_width(pool_size + 2) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .output_pixel_stride(5 * channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, setup_increasing_batch) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + MaxPoolingOperatorTester() + .batch_size(3) + .next_batch_size(5) + .input_height(8) + .input_width(8) + .pooling_height(5) + .pooling_width(3) + .channels(24) + .TestSetupF16(); +} + +TEST(MAX_POOLING_NHWC_F16, setup_decreasing_batch) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + MaxPoolingOperatorTester() + .batch_size(5) + .next_batch_size(3) + .input_height(8) + .input_width(8) + .pooling_height(5) + .pooling_width(3) + .channels(24) + .TestSetupF16(); +} + +TEST(MAX_POOLING_NHWC_F16, setup_changing_height) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(8) + .input_width(8) + .next_input_height(9) + .pooling_height(5) + .pooling_width(3) + .channels(24) + .TestSetupF16(); + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(8) + .input_width(8) + .next_input_height(7) + .pooling_height(5) + .pooling_width(3) + 
.channels(24) + .TestSetupF16(); +} + +TEST(MAX_POOLING_NHWC_F16, setup_changing_width) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(8) + .input_width(8) + .next_input_width(9) + .pooling_height(5) + .pooling_width(3) + .channels(24) + .TestSetupF16(); + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(8) + .input_width(8) + .next_input_width(7) + .pooling_height(5) + .pooling_width(3) + .channels(24) + .TestSetupF16(); +} + +TEST(MAX_POOLING_NHWC_F16, setup_swap_height_and_width) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(9) + .input_width(8) + .next_input_height(8) + .next_input_width(9) + .pooling_height(5) + .pooling_width(3) + .channels(24) + .TestSetupF16(); +} + + TEST(MAX_POOLING_NHWC_F32, unit_batch_small_1xM_pool) { ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); for (size_t channels = 1; channels <= 100; channels += 15) { diff --git a/test/max-pooling-operator-tester.h b/test/max-pooling-operator-tester.h index 349c16f30..106150603 100644 --- a/test/max-pooling-operator-tester.h +++ b/test/max-pooling-operator-tester.h @@ -10,6 +10,8 @@ #include <gtest/gtest.h> +#include <fp16.h> + #include <algorithm> #include <cassert> #include <cstddef> @@ -615,6 +617,119 @@ class MaxPoolingOperatorTester { } } + void TestF16() const { + std::random_device random_device; + auto rng = std::mt19937(random_device()); + // Note: we need to avoid FP16 denormals in the generated tensor because they might be processed differently in + // native vs emulated arithmetics, and we use exact comparison to verify the results against reference. + auto f32rng = std::bind(std::uniform_real_distribution<float>(0.001f, 1.0f), rng); + auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng); + + std::vector<uint16_t> input((batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + channels() + XNN_EXTRA_BYTES / sizeof(uint16_t)); + std::vector<uint16_t> output((batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + channels() + XNN_EXTRA_BYTES / sizeof(uint16_t)); + std::vector<float> output_ref(batch_size() * output_height() * output_width() * channels()); + for (size_t iteration = 0; iteration < iterations(); iteration++) { + std::generate(input.begin(), input.end(), std::ref(f16rng)); + std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */); + + // Compute reference results, without clamping. + for (size_t i = 0; i < batch_size(); i++) { + for (size_t oy = 0; oy < output_height(); oy++) { + for (size_t ox = 0; ox < output_width(); ox++) { + for (size_t c = 0; c < channels(); c++) { + float max_value = -std::numeric_limits<float>::infinity(); + for (size_t py = 0; py < pooling_height(); py++) { + const size_t iy = oy * stride_height() + py * dilation_height() - padding_top(); + for (size_t px = 0; px < pooling_width(); px++) { + const size_t ix = ox * stride_width() + px * dilation_width() - padding_left(); + if (ix < input_width() && iy < input_height()) { + max_value = std::max(max_value, + fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + c])); + } + } + } + output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] = max_value; + } + } + } + } + + // Compute clamping parameters. 
+ const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend()); + const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend()); + const float accumulated_range = accumulated_max - accumulated_min; + float output_min = accumulated_min + accumulated_range / 255.0f * float(qmin()); + float output_max = accumulated_max - accumulated_range / 255.0f * float(255 - qmax()); + output_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_min)); + output_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_max)); + if (accumulated_range == 0.0f) { + output_min = -std::numeric_limits<float>::infinity(); + output_max = +std::numeric_limits<float>::infinity(); + } + if (qmin() == std::numeric_limits<uint8_t>::min()) { + output_min = -std::numeric_limits<float>::infinity(); + } + if (qmax() == std::numeric_limits<uint8_t>::max()) { + output_max = +std::numeric_limits<float>::infinity(); + } + + // Clamp reference results. + for (float& value : output_ref) { + value = std::max(std::min(value, output_max), output_min); + } + + // Create, setup, run, and destroy Max Pooling operator. + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + xnn_operator_t max_pooling_op = nullptr; + + const xnn_status status = xnn_create_max_pooling2d_nhwc_f16( + padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(), + padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(), + pooling_height(), pooling_width(), + stride_height(), stride_width(), + dilation_height(), dilation_width(), + channels(), input_pixel_stride(), output_pixel_stride(), + output_min, output_max, + padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0, + &max_pooling_op); + if (status == xnn_status_unsupported_hardware) { + GTEST_SKIP(); + } + ASSERT_EQ(xnn_status_success, status); + ASSERT_NE(nullptr, max_pooling_op); + + // Smart pointer to automatically delete max_pooling_op. + std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_max_pooling_op(max_pooling_op, xnn_delete_operator); + + ASSERT_EQ(xnn_status_success, + xnn_setup_max_pooling2d_nhwc_f16( + max_pooling_op, + batch_size(), input_height(), input_width(), + input.data(), output.data(), + nullptr /* thread pool */)); + + ASSERT_EQ(xnn_status_success, + xnn_run_operator(max_pooling_op, nullptr /* thread pool */)); + + // Verify results. 
+ for (size_t i = 0; i < batch_size(); i++) { + for (size_t y = 0; y < output_height(); y++) { + for (size_t x = 0; x < output_width(); x++) { + for (size_t c = 0; c < channels(); c++) { + ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), output_max); + ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), output_min); + ASSERT_EQ( + fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), + output_ref[((i * output_height() + y) * output_width() + x) * channels() + c]) << + "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c + << ", min = " << output_min << ", max = " << output_max; + } + } + } + } + } + } + void TestF32() const { std::random_device random_device; auto rng = std::mt19937(random_device()); @@ -995,6 +1110,178 @@ class MaxPoolingOperatorTester { } } + void TestSetupF16() const { + std::random_device random_device; + auto rng = std::mt19937(random_device()); + // Note: we need to avoid FP16 denormals in the generated tensor because they might be processed differently in + // native vs emulated arithmetics, and we use exact comparison to verify the results against reference. + auto f32rng = std::bind(std::uniform_real_distribution<float>(0.001f, 1.0f), rng); + auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng); + + std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) + std::max( + (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + channels(), + (next_batch_size() * next_input_height() * next_input_width() - 1) * input_pixel_stride() + channels())); + std::vector<uint16_t> output(XNN_EXTRA_BYTES / sizeof(uint16_t) + std::max( + (batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + channels(), + (next_batch_size() * next_output_height() * next_output_width() - 1) * output_pixel_stride() + channels())); + std::vector<float> output_ref(batch_size() * output_height() * output_width() * channels()); + std::vector<float> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * channels()); + for (size_t iteration = 0; iteration < iterations(); iteration++) { + std::generate(input.begin(), input.end(), std::ref(f16rng)); + std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */); + + // Compute reference results, without clamping. + for (size_t i = 0; i < batch_size(); i++) { + for (size_t oy = 0; oy < output_height(); oy++) { + for (size_t ox = 0; ox < output_width(); ox++) { + for (size_t c = 0; c < channels(); c++) { + float max_value = -std::numeric_limits<float>::infinity(); + for (size_t py = 0; py < pooling_height(); py++) { + const size_t iy = oy * stride_height() + py * dilation_height() - padding_top(); + for (size_t px = 0; px < pooling_width(); px++) { + const size_t ix = ox * stride_width() + px * dilation_width() - padding_left(); + if (ix < input_width() && iy < input_height()) { + max_value = std::max(max_value, + fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + c])); + } + } + } + output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] = max_value; + } + } + } + } + + // Compute clamping parameters. 
+ const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend()); + const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend()); + const float accumulated_range = accumulated_max - accumulated_min; + float output_min = accumulated_min + accumulated_range / 255.0f * float(qmin()); + float output_max = accumulated_max - accumulated_range / 255.0f * float(255 - qmax()); + output_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_min)); + output_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_max)); + if (accumulated_range == 0.0f) { + output_min = -std::numeric_limits<float>::infinity(); + output_max = +std::numeric_limits<float>::infinity(); + } + if (qmin() == std::numeric_limits<uint8_t>::min()) { + output_min = -std::numeric_limits<float>::infinity(); + } + if (qmax() == std::numeric_limits<uint8_t>::max()) { + output_max = +std::numeric_limits<float>::infinity(); + } + + // Clamp reference results. + for (float& value : output_ref) { + value = std::max(std::min(value, output_max), output_min); + } + + // Create, setup, and run Max Pooling operator once. + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + xnn_operator_t max_pooling_op = nullptr; + + const xnn_status status = xnn_create_max_pooling2d_nhwc_f16( + padding_top(), padding_right(), padding_bottom(), padding_left(), + pooling_height(), pooling_width(), + stride_height(), stride_width(), + dilation_height(), dilation_width(), + channels(), input_pixel_stride(), output_pixel_stride(), + output_min, output_max, + 0, &max_pooling_op); + if (status == xnn_status_unsupported_hardware) { + GTEST_SKIP(); + } + ASSERT_EQ(xnn_status_success, status); + ASSERT_NE(nullptr, max_pooling_op); + + // Smart pointer to automatically delete max_pooling_op. + std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_max_pooling_op(max_pooling_op, xnn_delete_operator); + + ASSERT_EQ(xnn_status_success, + xnn_setup_max_pooling2d_nhwc_f16( + max_pooling_op, + batch_size(), input_height(), input_width(), + input.data(), output.data(), + nullptr /* thread pool */)); + + ASSERT_EQ(xnn_status_success, + xnn_run_operator(max_pooling_op, nullptr /* thread pool */)); + + // Verify results of the first run. + for (size_t i = 0; i < batch_size(); i++) { + for (size_t y = 0; y < output_height(); y++) { + for (size_t x = 0; x < output_width(); x++) { + for (size_t c = 0; c < channels(); c++) { + ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), output_max); + ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), output_min); + ASSERT_EQ( + fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), + output_ref[((i * output_height() + y) * output_width() + x) * channels() + c]) << + "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c + << ", min = " << output_min << ", max = " << output_max; + } + } + } + } + + // Re-generate data for the second run. + std::generate(input.begin(), input.end(), std::ref(f16rng)); + std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */); + + // Compute reference results for the second run, including clamping. 
+ for (size_t i = 0; i < next_batch_size(); i++) { + for (size_t oy = 0; oy < next_output_height(); oy++) { + for (size_t ox = 0; ox < next_output_width(); ox++) { + for (size_t c = 0; c < channels(); c++) { + float max_value = -std::numeric_limits<float>::infinity(); + for (size_t py = 0; py < pooling_height(); py++) { + const size_t iy = oy * stride_height() + py * dilation_height() - padding_top(); + for (size_t px = 0; px < pooling_width(); px++) { + const size_t ix = ox * stride_width() + px * dilation_width() - padding_left(); + if (ix < next_input_width() && iy < next_input_height()) { + max_value = std::max(max_value, + fp16_ieee_to_fp32_value(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_pixel_stride() + c])); + } + } + } + max_value = std::min(max_value, output_max); + max_value = std::max(max_value, output_min); + next_output_ref[((i * next_output_height() + oy) * next_output_width() + ox) * channels() + c] = max_value; + } + } + } + } + + // Setup and run Max Pooling operator the second time, and destroy the operator. + ASSERT_EQ(xnn_status_success, + xnn_setup_max_pooling2d_nhwc_f16( + max_pooling_op, + next_batch_size(), next_input_height(), next_input_width(), + input.data(), output.data(), + nullptr /* thread pool */)); + + ASSERT_EQ(xnn_status_success, + xnn_run_operator(max_pooling_op, nullptr /* thread pool */)); + + // Verify results of the second run. + for (size_t i = 0; i < next_batch_size(); i++) { + for (size_t y = 0; y < next_output_height(); y++) { + for (size_t x = 0; x < next_output_width(); x++) { + for (size_t c = 0; c < channels(); c++) { + ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c]), output_max); + ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c]), output_min); + ASSERT_EQ( + fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c]), + next_output_ref[((i * next_output_height() + y) * next_output_width() + x) * channels() + c]) << + "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c + << ", min = " << output_min << ", max = " << output_max; + } + } + } + } + } + } + void TestSetupF32() const { std::random_device random_device; auto rng = std::mt19937(random_device()); |
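
A detail shared by `xnn_create_max_pooling2d_nhwc_f16` and the tester above: caller-supplied `float` clamp bounds are round-tripped through FP16 (`fp16_ieee_from_fp32_value` / `fp16_ieee_to_fp32_value`) before validation and before computing reference results, so the operator and the exact-comparison tests agree on the representable bound values. A standalone illustration, assuming the same `<fp16.h>` header the commit includes:

```c
#include <stdint.h>
#include <stdio.h>

#include <fp16.h>  // fp16_ieee_from_fp32_value, fp16_ieee_to_fp32_value

int main(void) {
  // 0.1f is not exactly representable in FP16: the operator validates and
  // applies the rounded bound, not the float the caller passed.
  const float requested_min = 0.1f;
  const uint16_t min_as_half = fp16_ieee_from_fp32_value(requested_min);
  const float effective_min = fp16_ieee_to_fp32_value(min_as_half);
  printf("requested %.9g -> effective %.9g\n",
         (double) requested_min, (double) effective_min);
  return 0;
}
```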