author    Marat Dukhan <maratek@google.com>  2022-02-04 01:55:53 -0800
committer XNNPACK Team <xnnpack-github-robot@google.com>  2022-02-04 01:57:03 -0800
commit    5756a927fc5044bdcfebe57d4bd84408ca0a0975 (patch)
tree      18e40a963d466dff13bb2a35340f3b032f047c0c
parent    af1671ab277a0742f8c7251b6b4ad1a16aba79bd (diff)
download  XNNPACK-5756a927fc5044bdcfebe57d4bd84408ca0a0975.tar.gz
F16 Max Pooling NHWC operator
PiperOrigin-RevId: 426345207
-rw-r--r--  BUILD.bazel                           2
-rwxr-xr-x  CMakeLists.txt                        2
-rw-r--r--  include/xnnpack.h                    28
-rw-r--r--  src/amalgam/f16c.c                  262
-rw-r--r--  src/init.c                           14
-rw-r--r--  src/operator-strings.c                2
-rw-r--r--  src/operators/max-pooling-nhwc.c     89
-rw-r--r--  src/xnnpack/operator.h                1
-rw-r--r--  src/xnnpack/params.h                  2
-rw-r--r--  test/max-pooling-nhwc.cc            819
-rw-r--r--  test/max-pooling-operator-tester.h  287
11 files changed, 1508 insertions(+), 0 deletions(-)
diff --git a/BUILD.bazel b/BUILD.bazel
index b9156846d..72fe1a218 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -4425,6 +4425,7 @@ PROD_AARCH64_NEONFP16ARITH_MICROKERNEL_SRCS = [
"src/f16-gemm/gen/6x16-minmax-neonfp16arith-ld64.c",
"src/f16-igemm/gen/1x16-minmax-neonfp16arith-ld64.c",
"src/f16-igemm/gen/6x16-minmax-neonfp16arith-ld64.c",
+ "src/f16-maxpool/9p8x-minmax-neonfp16arith-c8.c",
"src/f16-prelu/gen/neonfp16arith-2x16.c",
"src/f16-vbinary/gen/vadd-minmax-neonfp16arith-x16.c",
"src/f16-vbinary/gen/vaddc-minmax-neonfp16arith-x16.c",
@@ -6017,6 +6018,7 @@ PROD_F16C_MICROKERNEL_SRCS = [
"src/f16-f32-vcvt/gen/vcvt-f16c-x16.c",
"src/f16-gavgpool/gen/7p7x-minmax-f16c-c8.c",
"src/f16-gavgpool/gen/7x-minmax-f16c-c8.c",
+ "src/f16-maxpool/9p8x-minmax-f16c-c8.c",
"src/f16-prelu/gen/f16c-2x16.c",
"src/f16-vbinary/gen/vadd-minmax-f16c-x16.c",
"src/f16-vbinary/gen/vaddc-minmax-f16c-x16.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8ef91efc0..936d7b9b5 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3181,6 +3181,7 @@ SET(PROD_AARCH64_NEONFP16ARITH_MICROKERNEL_SRCS
src/f16-gemm/gen/6x16-minmax-neonfp16arith-ld64.c
src/f16-igemm/gen/1x16-minmax-neonfp16arith-ld64.c
src/f16-igemm/gen/6x16-minmax-neonfp16arith-ld64.c
+ src/f16-maxpool/9p8x-minmax-neonfp16arith-c8.c
src/f16-prelu/gen/neonfp16arith-2x16.c
src/f16-vbinary/gen/vadd-minmax-neonfp16arith-x16.c
src/f16-vbinary/gen/vaddc-minmax-neonfp16arith-x16.c
@@ -4757,6 +4758,7 @@ SET(PROD_F16C_MICROKERNEL_SRCS
src/f16-f32-vcvt/gen/vcvt-f16c-x16.c
src/f16-gavgpool/gen/7p7x-minmax-f16c-c8.c
src/f16-gavgpool/gen/7x-minmax-f16c-c8.c
+ src/f16-maxpool/9p8x-minmax-f16c-c8.c
src/f16-prelu/gen/neonfp16arith-2x16.c
src/f16-vbinary/gen/vadd-minmax-f16c-x16.c
src/f16-vbinary/gen/vaddc-minmax-f16c-x16.c
diff --git a/include/xnnpack.h b/include/xnnpack.h
index c29899a9d..f8a581c09 100644
--- a/include/xnnpack.h
+++ b/include/xnnpack.h
@@ -1996,6 +1996,34 @@ enum xnn_status xnn_setup_hardswish_nc_f16(
void* output,
pthreadpool_t threadpool);
+enum xnn_status xnn_create_max_pooling2d_nhwc_f16(
+ uint32_t input_padding_top,
+ uint32_t input_padding_right,
+ uint32_t input_padding_bottom,
+ uint32_t input_padding_left,
+ uint32_t pooling_height,
+ uint32_t pooling_width,
+ uint32_t stride_height,
+ uint32_t stride_width,
+ uint32_t dilation_height,
+ uint32_t dilation_width,
+ size_t channels,
+ size_t input_pixel_stride,
+ size_t output_pixel_stride,
+ float output_min,
+ float output_max,
+ uint32_t flags,
+ xnn_operator_t* max_pooling_op_out);
+
+enum xnn_status xnn_setup_max_pooling2d_nhwc_f16(
+ xnn_operator_t max_pooling_op,
+ size_t batch_size,
+ size_t input_height,
+ size_t input_width,
+ const void* input,
+ void* output,
+ pthreadpool_t threadpool);
+
enum xnn_status xnn_create_multiply_nd_f16(
float output_min,
float output_max,
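These declarations mirror the existing F32 entry points; FP16 tensors travel through the void* buffers as IEEE half-precision values in uint16_t storage. A minimal usage sketch of the new API (the shape, zero-filled input, and error handling are illustrative, not part of the commit):

#include <math.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <xnnpack.h>

int main(void) {
  if (xnn_initialize(/*allocator=*/NULL) != xnn_status_success) {
    return 1;
  }

  // Illustrative shape: batch 1, 4x4 input, 8 channels, 2x2 pooling with
  // stride 2 -> 2x2 output. FP16 values are carried as uint16_t.
  enum { kChannels = 8 };
  uint16_t input[1 * 4 * 4 * kChannels];
  uint16_t output[1 * 2 * 2 * kChannels];
  memset(input, 0, sizeof(input));  // fill with real FP16 data in practice

  xnn_operator_t max_pooling_op = NULL;
  enum xnn_status status = xnn_create_max_pooling2d_nhwc_f16(
      /*input_padding_top=*/0, /*input_padding_right=*/0,
      /*input_padding_bottom=*/0, /*input_padding_left=*/0,
      /*pooling_height=*/2, /*pooling_width=*/2,
      /*stride_height=*/2, /*stride_width=*/2,
      /*dilation_height=*/1, /*dilation_width=*/1,
      /*channels=*/kChannels,
      /*input_pixel_stride=*/kChannels, /*output_pixel_stride=*/kChannels,
      /*output_min=*/-INFINITY, /*output_max=*/+INFINITY,
      /*flags=*/0, &max_pooling_op);
  if (status != xnn_status_success) {
    return 1;  // e.g. xnn_status_unsupported_hardware on targets without FP16
  }

  status = xnn_setup_max_pooling2d_nhwc_f16(
      max_pooling_op,
      /*batch_size=*/1, /*input_height=*/4, /*input_width=*/4,
      input, output, /*threadpool=*/NULL);
  if (status == xnn_status_success) {
    status = xnn_run_operator(max_pooling_op, /*threadpool=*/NULL);
  }
  xnn_delete_operator(max_pooling_op);
  return status == xnn_status_success ? 0 : 1;
}

Creation fails with xnn_status_unsupported_hardware on targets without NEON FP16 arithmetic or F16C support, which is also how the tests below decide to skip.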
diff --git a/src/amalgam/f16c.c b/src/amalgam/f16c.c
index 1e41f3aab..d28728b8b 100644
--- a/src/amalgam/f16c.c
+++ b/src/amalgam/f16c.c
@@ -11,6 +11,7 @@
#include <xnnpack/gavgpool.h>
#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/math.h>
+#include <xnnpack/maxpool.h>
#include <xnnpack/prelu.h>
#include <xnnpack/vbinary.h>
#include <xnnpack/vcvt.h>
@@ -358,6 +359,267 @@ void xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8(
}
}
+void xnn_f16_maxpool_minmax_ukernel_9p8x__f16c_c8(
+ size_t output_pixels,
+ size_t kernel_elements,
+ size_t channels,
+ const void** input,
+ size_t input_offset,
+ void* output,
+ size_t input_increment,
+ size_t output_increment,
+ const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(output_pixels != 0);
+ assert(kernel_elements != 0);
+ assert(channels != 0);
+
+ const __m256 voutput_min = _mm256_load_ps(params->avx.min);
+ const __m256 voutput_max = _mm256_load_ps(params->avx.max);
+ do {
+ uint16_t* o = output;
+ {
+ const uint16_t* i0 = *input++;
+ const uint16_t* i1 = *input++;
+ const uint16_t* i2 = *input++;
+ const uint16_t* i3 = *input++;
+ const uint16_t* i4 = *input++;
+ const uint16_t* i5 = *input++;
+ const uint16_t* i6 = *input++;
+ const uint16_t* i7 = *input++;
+ const uint16_t* i8 = *input++;
+ i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
+ i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
+ i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
+ i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
+ i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
+ i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
+ i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
+ i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
+ i8 = (const uint16_t*) ((uintptr_t) i8 + input_offset);
+ if (kernel_elements < 2) {
+ i1 = i0;
+ }
+ if (kernel_elements <= 2) {
+ i2 = i0;
+ }
+ if (kernel_elements < 4) {
+ i3 = i0;
+ }
+ if (kernel_elements <= 4) {
+ i4 = i0;
+ }
+ if (kernel_elements < 6) {
+ i5 = i0;
+ }
+ if (kernel_elements <= 6) {
+ i6 = i0;
+ }
+ if (kernel_elements < 8) {
+ i7 = i0;
+ }
+ if (kernel_elements <= 8) {
+ i8 = i0;
+ }
+
+ size_t c = channels;
+ for (; c >= 8; c -= 8) {
+ const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
+ i0 += 8;
+ const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
+ i1 += 8;
+ const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
+ i2 += 8;
+ const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
+ i3 += 8;
+ const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
+ i4 += 8;
+ const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
+ i5 += 8;
+ const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
+ i6 += 8;
+ const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
+ i7 += 8;
+ const __m256 vi8 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
+ i8 += 8;
+
+ const __m256 vmax018 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vi8);
+ const __m256 vmax23 = _mm256_max_ps(vi2, vi3);
+ const __m256 vmax45 = _mm256_max_ps(vi4, vi5);
+ const __m256 vmax67 = _mm256_max_ps(vi6, vi7);
+
+ const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45);
+ const __m256 vmax01678 = _mm256_max_ps(vmax018, vmax67);
+ const __m256 vmax = _mm256_max_ps(vmax2345, vmax01678);
+ const __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min);
+
+ _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC));
+ o += 8;
+ }
+ if (c != 0) {
+ const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
+ i0 += 8;
+ const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
+ i1 += 8;
+ const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
+ i2 += 8;
+ const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
+ i3 += 8;
+ const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
+ i4 += 8;
+ const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
+ i5 += 8;
+ const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
+ i6 += 8;
+ const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
+ i7 += 8;
+ const __m256 vi8 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
+ i8 += 8;
+
+ const __m256 vmax018 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vi8);
+ const __m256 vmax23 = _mm256_max_ps(vi2, vi3);
+ const __m256 vmax45 = _mm256_max_ps(vi4, vi5);
+ const __m256 vmax67 = _mm256_max_ps(vi6, vi7);
+
+ const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45);
+ const __m256 vmax01678 = _mm256_max_ps(vmax018, vmax67);
+ const __m256 vmax = _mm256_max_ps(vmax2345, vmax01678);
+ __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min);
+
+ __m128i vh = _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC);
+ if (c & 4) {
+ _mm_storel_epi64((__m128i*) o, vh);
+ vh = _mm_unpackhi_epi64(vh, vh);
+ o += 4;
+ }
+ if (c & 2) {
+ *((uint32_t*) o) = (uint32_t) _mm_cvtsi128_si32(vh);
+ vh = _mm_srli_epi64(vh, 32);
+ o += 2;
+ }
+ if (c & 1) {
+ *o = _mm_extract_epi16(vh, 0);
+ o += 1;
+ }
+ }
+ }
+
+ for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
+ const uint16_t* i0 = *input++;
+ const uint16_t* i1 = *input++;
+ const uint16_t* i2 = *input++;
+ const uint16_t* i3 = *input++;
+ const uint16_t* i4 = *input++;
+ const uint16_t* i5 = *input++;
+ const uint16_t* i6 = *input++;
+ const uint16_t* i7 = *input++;
+ i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
+ i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
+ i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
+ i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
+ i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
+ i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
+ i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
+ i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
+ if (k < 2) {
+ i1 = i0;
+ }
+ if (k <= 2) {
+ i2 = i0;
+ }
+ if (k < 4) {
+ i3 = i0;
+ }
+ if (k <= 4) {
+ i4 = i0;
+ }
+ if (k < 6) {
+ i5 = i0;
+ }
+ if (k <= 6) {
+ i6 = i0;
+ }
+ if (k < 8) {
+ i7 = i0;
+ }
+
+ o = output;
+ size_t c = channels;
+ for (; c >= 8; c -= 8) {
+ const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
+ i0 += 8;
+ const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
+ i1 += 8;
+ const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
+ i2 += 8;
+ const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
+ i3 += 8;
+ const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
+ i4 += 8;
+ const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
+ i5 += 8;
+ const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
+ i6 += 8;
+ const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
+ i7 += 8;
+ const __m256 vo = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) o));
+
+ const __m256 vmax01 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vo);
+ const __m256 vmax23 = _mm256_max_ps(vi2, vi3);
+ const __m256 vmax45 = _mm256_max_ps(vi4, vi5);
+ const __m256 vmax67 = _mm256_max_ps(vi6, vi7);
+
+ const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45);
+ const __m256 vmax0167 = _mm256_max_ps(vmax01, vmax67);
+ const __m256 vmax = _mm256_max_ps(vmax2345, vmax0167);
+ const __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min);
+
+ _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC));
+ o += 8;
+ }
+ if (c != 0) {
+ const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
+ const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
+ const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
+ const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
+ const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
+ const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
+ const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
+ const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
+ const __m256 vo = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) o));
+
+ const __m256 vmax01 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vo);
+ const __m256 vmax23 = _mm256_max_ps(vi2, vi3);
+ const __m256 vmax45 = _mm256_max_ps(vi4, vi5);
+ const __m256 vmax67 = _mm256_max_ps(vi6, vi7);
+
+ const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45);
+ const __m256 vmax0167 = _mm256_max_ps(vmax01, vmax67);
+ const __m256 vmax = _mm256_max_ps(vmax2345, vmax0167);
+ __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min);
+
+ __m128i vh = _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC);
+ if (c & 4) {
+ _mm_storel_epi64((__m128i*) o, vh);
+ vh = _mm_unpackhi_epi64(vh, vh);
+ o += 4;
+ }
+ if (c & 2) {
+ *((uint32_t*) o) = (uint32_t) _mm_cvtsi128_si32(vh);
+ vh = _mm_srli_epi64(vh, 32);
+ o += 2;
+ }
+ if (c & 1) {
+ *o = _mm_extract_epi16(vh, 0);
+ o += 1;
+ }
+ }
+ }
+ input = (const void**) ((uintptr_t) input + input_increment);
+ output = (uint16_t*) ((uintptr_t) o + output_increment);
+ } while (--output_pixels != 0);
+}
+
void xnn_f16_prelu_ukernel__f16c_2x16(
size_t rows,
size_t channels,
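As the 9p8x name suggests, the kernel above reduces up to 9 pooling elements per output pixel in a first pass (shorter windows duplicate i0 into the unused row pointers, which is harmless for a max reduction), then folds up to 8 more per remainder pass by re-reading the partially reduced output vo; the c & 4 / c & 2 / c & 1 stores handle channel tails that are not multiples of 8. A scalar reference sketch of the same contract, with the multipass structure flattened into one loop (fp16_to_fp32 / fp32_to_fp16 are hypothetical stand-ins for the F16C conversions):

#include <math.h>
#include <stddef.h>
#include <stdint.h>

// Hypothetical scalar conversions standing in for _mm256_cvtph_ps / _mm256_cvtps_ph.
extern float fp16_to_fp32(uint16_t h);
extern uint16_t fp32_to_fp16(float f);

static void f16_maxpool_reference(
    size_t output_pixels, size_t kernel_elements, size_t channels,
    const void** input, size_t input_offset,
    uint16_t* output, size_t input_increment, size_t output_increment,
    float output_min, float output_max)
{
  do {
    for (size_t c = 0; c < channels; c++) {
      float vmax = -INFINITY;
      for (size_t k = 0; k < kernel_elements; k++) {
        const uint16_t* i = (const uint16_t*) ((uintptr_t) input[k] + input_offset);
        vmax = fmaxf(vmax, fp16_to_fp32(i[c]));
      }
      // Same clamp order as the vector kernel: min against the upper bound first.
      output[c] = fp32_to_fp16(fmaxf(fminf(vmax, output_max), output_min));
    }
    // The real kernel's increments account for the pointers consumed per pass;
    // here they are taken at face value.
    input = (const void**) ((uintptr_t) input + input_increment);
    output = (uint16_t*) ((uintptr_t) (output + channels) + output_increment);
  } while (--output_pixels != 0);
}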
diff --git a/src/init.c b/src/init.c
index fc167e891..8a1f66601 100644
--- a/src/init.c
+++ b/src/init.c
@@ -2440,6 +2440,13 @@ static void init(void) {
.channel_tile = 8,
};
+ xnn_params.f16.maxpool = (struct maxpool_parameters) {
+ .ukernel = (xnn_maxpool_ukernel_function) xnn_f16_maxpool_minmax_ukernel_9p8x__neonfp16arith_c8,
+ .init.f16 = xnn_init_f16_minmax_neon_params,
+ .mr = 9,
+ .qr = 8,
+ };
+
xnn_params.f16.prelu = (struct prelu_parameters) {
.ukernel = (xnn_prelu_ukernel_function) xnn_f16_prelu_ukernel__neonfp16arith_2x16,
.row_tile = 2,
@@ -3665,6 +3672,13 @@ static void init(void) {
.channel_tile = 8,
};
+ xnn_params.f16.maxpool = (struct maxpool_parameters) {
+ .ukernel = (xnn_maxpool_ukernel_function) xnn_f16_maxpool_minmax_ukernel_9p8x__f16c_c8,
+ .init.f16 = xnn_init_f16_minmax_avx_params,
+ .mr = 9,
+ .qr = 8,
+ };
+
xnn_params.f16.prelu = (struct prelu_parameters) {
.ukernel = (xnn_prelu_ukernel_function) xnn_f16_prelu_ukernel__f16c_2x16,
.row_tile = 2,
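Both registrations use mr = 9 and qr = 8 to match the 9p8x ukernels: the first pass consumes up to mr pooling elements per output pixel, and each remainder pass up to qr more. A small sketch of the resulting pass count (the helper name is illustrative):

#include <stddef.h>

// Passes needed for a pooling window of kernel_elements elements, given the
// first-pass tile mr and remainder-pass tile qr (9 and 8 for f16 maxpool).
static size_t maxpool_pass_count(size_t kernel_elements, size_t mr, size_t qr) {
  if (kernel_elements <= mr) {
    return 1;  // the first pass covers the whole window
  }
  return 1 + (kernel_elements - mr + qr - 1) / qr;  // plus rounded-up remainder passes
}

For example, a 5x5 pooling window (25 elements) takes 1 + ceil((25 - 9) / 8) = 3 passes.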
diff --git a/src/operator-strings.c b/src/operator-strings.c
index 9bb524bed..dce9c4671 100644
--- a/src/operator-strings.c
+++ b/src/operator-strings.c
@@ -124,6 +124,8 @@ const char* xnn_operator_type_to_string(enum xnn_operator_type type) {
return "Leaky ReLU (NC, F32)";
case xnn_operator_type_leaky_relu_nc_qu8:
return "Leaky ReLU (NC, QU8)";
+ case xnn_operator_type_max_pooling_nhwc_f16:
+ return "Max Pooling (NHWC, F16)";
case xnn_operator_type_max_pooling_nhwc_f32:
return "Max Pooling (NHWC, F32)";
case xnn_operator_type_max_pooling_nhwc_s8:
diff --git a/src/operators/max-pooling-nhwc.c b/src/operators/max-pooling-nhwc.c
index 1ec409fdb..8e13b2c47 100644
--- a/src/operators/max-pooling-nhwc.c
+++ b/src/operators/max-pooling-nhwc.c
@@ -14,6 +14,8 @@
#include <stdlib.h>
#include <string.h>
+#include <fp16.h>
+
#include <xnnpack.h>
#include <xnnpack/allocator.h>
#include <xnnpack/common.h>
@@ -455,6 +457,66 @@ enum xnn_status xnn_create_max_pooling2d_nhwc_f32(
max_pooling_op_out);
}
+enum xnn_status xnn_create_max_pooling2d_nhwc_f16(
+ uint32_t input_padding_top,
+ uint32_t input_padding_right,
+ uint32_t input_padding_bottom,
+ uint32_t input_padding_left,
+ uint32_t pooling_height,
+ uint32_t pooling_width,
+ uint32_t stride_height,
+ uint32_t stride_width,
+ uint32_t dilation_height,
+ uint32_t dilation_width,
+ size_t channels,
+ size_t input_pixel_stride,
+ size_t output_pixel_stride,
+ float output_min,
+ float output_max,
+ uint32_t flags,
+ xnn_operator_t* max_pooling_op_out)
+{
+ if (isnan(output_min)) {
+ xnn_log_error(
+ "failed to create %s with NaN output lower bound: lower bound must be non-NaN",
+ xnn_operator_type_to_string(xnn_operator_type_max_pooling_nhwc_f16));
+ return xnn_status_invalid_parameter;
+ }
+
+ if (isnan(output_max)) {
+ xnn_log_error(
+ "failed to create %s with NaN output upper bound: upper bound must be non-NaN",
+ xnn_operator_type_to_string(xnn_operator_type_max_pooling_nhwc_f16));
+ return xnn_status_invalid_parameter;
+ }
+
+ const uint16_t output_min_as_half = fp16_ieee_from_fp32_value(output_min);
+ const uint16_t output_max_as_half = fp16_ieee_from_fp32_value(output_max);
+ output_min = fp16_ieee_to_fp32_value(output_min_as_half);
+ output_max = fp16_ieee_to_fp32_value(output_max_as_half);
+ if (output_min >= output_max) {
+ xnn_log_error(
+ "failed to create %s operator with [%.7g, %.7g] output range: lower bound must be below upper bound",
+ xnn_operator_type_to_string(xnn_operator_type_max_pooling_nhwc_f16), output_min, output_max);
+ return xnn_status_invalid_parameter;
+ }
+
+ union xnn_f16_minmax_params params;
+ if (xnn_params.f16.maxpool.init.f16 != NULL) {
+ xnn_params.f16.maxpool.init.f16(&params, output_min_as_half, output_max_as_half);
+ }
+ return create_max_pooling2d_nhwc(
+ input_padding_top, input_padding_right, input_padding_bottom, input_padding_left,
+ pooling_height, pooling_width,
+ stride_height, stride_width,
+ dilation_height, dilation_width,
+ channels, input_pixel_stride, output_pixel_stride,
+ flags,
+ &params, sizeof(params), XNN_INIT_FLAG_F16,
+ xnn_operator_type_max_pooling_nhwc_f16,
+ max_pooling_op_out);
+}
+
enum xnn_status xnn_setup_max_pooling2d_nhwc_s8(
xnn_operator_t max_pooling_op,
size_t batch_size,
@@ -509,6 +571,33 @@ enum xnn_status xnn_setup_max_pooling2d_nhwc_u8(
pthreadpool_get_threads_count(threadpool));
}
+enum xnn_status xnn_setup_max_pooling2d_nhwc_f16(
+ xnn_operator_t max_pooling_op,
+ size_t batch_size,
+ size_t input_height,
+ size_t input_width,
+ const void* input,
+ void* output,
+ pthreadpool_t threadpool)
+{
+ if (max_pooling_op->type != xnn_operator_type_max_pooling_nhwc_f16) {
+ xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
+ xnn_operator_type_to_string(xnn_operator_type_max_pooling_nhwc_f16),
+ xnn_operator_type_to_string(max_pooling_op->type));
+ return xnn_status_invalid_parameter;
+ }
+
+ return setup_max_pooling2d_nhwc(
+ max_pooling_op,
+ batch_size, input_height, input_width,
+ input, output,
+ 1 /* log2(sizeof(input element)) = log2(sizeof(uint16_t)) */,
+ 1 /* log2(sizeof(output element)) = log2(sizeof(uint16_t)) */,
+ &xnn_params.f16.maxpool,
+ &max_pooling_op->params.f16_minmax, sizeof(max_pooling_op->params.f16_minmax),
+ pthreadpool_get_threads_count(threadpool));
+}
+
enum xnn_status xnn_setup_max_pooling2d_nhwc_f32(
xnn_operator_t max_pooling_op,
size_t batch_size,
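Note that the create path rounds output_min and output_max through half precision before the range check: two bounds that are distinct in FP32 can collapse to the same FP16 value and must then be rejected as a degenerate range. A small demonstration with the same fp16 library calls used above (values chosen for illustration):

#include <stdio.h>

#include <fp16.h>

int main(void) {
  // 1.0f and 1.0001f differ in FP32, but the FP16 spacing near 1.0 is about
  // 0.00098, so both round to the same half-precision value.
  const float lo = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(1.0f));
  const float hi = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(1.0001f));
  printf("lo=%.7g hi=%.7g degenerate=%d\n", lo, hi, lo >= hi);
  // With such bounds, xnn_create_max_pooling2d_nhwc_f16 returns
  // xnn_status_invalid_parameter ("lower bound must be below upper bound").
  return 0;
}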
diff --git a/src/xnnpack/operator.h b/src/xnnpack/operator.h
index b41a4ebee..7f4429f3f 100644
--- a/src/xnnpack/operator.h
+++ b/src/xnnpack/operator.h
@@ -86,6 +86,7 @@ enum xnn_operator_type {
xnn_operator_type_hardswish_nc_f32,
xnn_operator_type_leaky_relu_nc_f32,
xnn_operator_type_leaky_relu_nc_qu8,
+ xnn_operator_type_max_pooling_nhwc_f16,
xnn_operator_type_max_pooling_nhwc_f32,
xnn_operator_type_max_pooling_nhwc_s8,
xnn_operator_type_max_pooling_nhwc_u8,
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index 53383c44f..90a145260 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -3943,6 +3943,7 @@ struct maxpool_parameters {
xnn_init_s8_minmax_params_fn s8;
xnn_init_u8_minmax_params_fn u8;
xnn_init_f32_minmax_params_fn f32;
+ xnn_init_f16_minmax_params_fn f16;
} init;
uint8_t mr;
uint8_t qr;
@@ -4096,6 +4097,7 @@ struct xnn_parameters {
struct gemm_parameters gemm;
struct gemm_parameters gemm2;
struct dwconv_parameters dwconv[XNN_MAX_F16_DWCONV_UKERNELS];
+ struct maxpool_parameters maxpool;
struct vunary_parameters hswish;
struct prelu_parameters prelu;
struct vbinary_parameters vadd;
diff --git a/test/max-pooling-nhwc.cc b/test/max-pooling-nhwc.cc
index 327dc970f..b25774471 100644
--- a/test/max-pooling-nhwc.cc
+++ b/test/max-pooling-nhwc.cc
@@ -829,6 +829,7 @@ TEST(MAX_POOLING_NHWC_S8, setup_swap_height_and_width) {
.TestSetupS8();
}
+
TEST(MAX_POOLING_NHWC_U8, unit_batch_small_1xM_pool) {
ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
for (size_t channels = 1; channels <= 100; channels += 15) {
@@ -1645,6 +1646,824 @@ TEST(MAX_POOLING_NHWC_U8, setup_swap_height_and_width) {
.TestSetupU8();
}
+
+TEST(MAX_POOLING_NHWC_F16, unit_batch_small_1xM_pool) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) {
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(2)
+ .input_width(pool_size + 2)
+ .pooling_height(1)
+ .pooling_width(pool_size)
+ .channels(channels)
+ .TestF16();
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, unit_batch_small_1xM_pool_with_padding) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = 3; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) {
+ for (size_t padding_left = 0; padding_left <= 1; padding_left++) {
+ for (size_t padding_right = 0; padding_right <= 1; padding_right++) {
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(2)
+ .input_width(pool_size + 2)
+ .padding_left(padding_left)
+ .padding_right(padding_right)
+ .pooling_height(1)
+ .pooling_width(pool_size)
+ .channels(channels)
+ .TestF16();
+ }
+ }
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, unit_batch_small_1xM_pool_with_tf_same_padding) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = 3; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) {
+ for (size_t input_width = pool_size; input_width <= pool_size * 2; input_width++) {
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(2)
+ .input_width(input_width)
+ .padding_tf_same(true)
+ .pooling_height(1)
+ .pooling_width(pool_size)
+ .channels(channels)
+ .TestF16();
+ }
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, unit_batch_small_1xM_pool_with_stride) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) {
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(2)
+ .input_width(pool_size + 4)
+ .pooling_height(1)
+ .pooling_width(pool_size)
+ .stride_width(2)
+ .channels(channels)
+ .TestF16();
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, unit_batch_small_1xM_pool_with_dilation) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) {
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(2)
+ .input_width(2 * pool_size + 1)
+ .padding_left(1)
+ .padding_right(1)
+ .pooling_height(1)
+ .pooling_width(pool_size)
+ .dilation_width(2)
+ .channels(channels)
+ .TestF16();
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, unit_batch_small_Mx1_pool) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) {
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(pool_size + 1)
+ .input_width(3)
+ .pooling_height(pool_size)
+ .pooling_width(1)
+ .channels(channels)
+ .TestF16();
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, unit_batch_small_Mx1_pool_with_padding) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) {
+ for (size_t padding_top = 0; padding_top <= 1; padding_top++) {
+ for (size_t padding_bottom = 0; padding_bottom <= 1; padding_bottom++) {
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(pool_size + 1)
+ .input_width(3)
+ .padding_top(padding_top)
+ .padding_bottom(padding_bottom)
+ .pooling_height(pool_size)
+ .pooling_width(1)
+ .channels(channels)
+ .TestF16();
+ }
+ }
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, unit_batch_small_Mx1_pool_with_tf_same_padding) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) {
+ for (size_t input_height = pool_size; input_height <= pool_size * 2; input_height++) {
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(input_height)
+ .input_width(3)
+ .padding_tf_same(true)
+ .pooling_height(pool_size)
+ .pooling_width(1)
+ .channels(channels)
+ .TestF16();
+ }
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, unit_batch_small_Mx1_pool_with_stride) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) {
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(pool_size + 3)
+ .input_width(3)
+ .pooling_height(pool_size)
+ .pooling_width(1)
+ .stride_height(2)
+ .channels(channels)
+ .TestF16();
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, unit_batch_small_Mx1_pool_with_dilation) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) {
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(2 * pool_size)
+ .input_width(3)
+ .padding_top(1)
+ .padding_bottom(1)
+ .pooling_height(pool_size)
+ .pooling_width(1)
+ .dilation_height(2)
+ .channels(channels)
+ .TestF16();
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, unit_batch_small_pool_with_input_stride) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) {
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(pool_size + 1)
+ .input_width(3)
+ .pooling_height(pool_size)
+ .pooling_width(1)
+ .channels(channels)
+ .input_pixel_stride(5 * channels)
+ .TestF16();
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(2)
+ .input_width(pool_size + 2)
+ .pooling_height(1)
+ .pooling_width(pool_size)
+ .channels(channels)
+ .input_pixel_stride(5 * channels)
+ .TestF16();
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, unit_batch_small_pool_with_output_stride) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) {
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(pool_size + 1)
+ .input_width(3)
+ .pooling_height(pool_size)
+ .pooling_width(1)
+ .channels(channels)
+ .output_pixel_stride(5 * channels)
+ .TestF16();
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(2)
+ .input_width(pool_size + 2)
+ .pooling_height(1)
+ .pooling_width(pool_size)
+ .channels(channels)
+ .output_pixel_stride(5 * channels)
+ .TestF16();
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, unit_batch_small_pool_with_qmin) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) {
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(pool_size + 1)
+ .input_width(3)
+ .pooling_height(pool_size)
+ .pooling_width(1)
+ .channels(channels)
+ .qmin(192)
+ .TestF16();
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(2)
+ .input_width(pool_size + 2)
+ .pooling_height(1)
+ .pooling_width(pool_size)
+ .channels(channels)
+ .qmin(192)
+ .TestF16();
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, unit_batch_small_pool_with_qmax) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) {
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(pool_size + 1)
+ .input_width(3)
+ .pooling_height(pool_size)
+ .pooling_width(1)
+ .channels(channels)
+ .qmax(192)
+ .TestF16();
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(2)
+ .input_width(pool_size + 2)
+ .pooling_height(1)
+ .pooling_width(pool_size)
+ .channels(channels)
+ .qmax(192)
+ .TestF16();
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, unit_batch_large_1xM_pool) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) {
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(2)
+ .input_width(pool_size + 2)
+ .pooling_height(1)
+ .pooling_width(pool_size)
+ .channels(channels)
+ .TestF16();
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, unit_batch_large_1xM_pool_with_padding) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) {
+ for (size_t padding_left = 0; padding_left <= 1; padding_left++) {
+ for (size_t padding_right = 0; padding_right <= 1; padding_right++) {
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(2)
+ .input_width(pool_size + 2)
+ .padding_left(padding_left)
+ .padding_right(padding_right)
+ .pooling_height(1)
+ .pooling_width(pool_size)
+ .channels(channels)
+ .TestF16();
+ }
+ }
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, unit_batch_large_1xM_pool_with_tf_same_padding) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) {
+ for (size_t input_width = pool_size; input_width <= pool_size * 2; input_width++) {
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(2)
+ .input_width(input_width)
+ .padding_tf_same(true)
+ .pooling_height(1)
+ .pooling_width(pool_size)
+ .channels(channels)
+ .TestF16();
+ }
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, unit_batch_large_1xM_pool_with_stride) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) {
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(2)
+ .input_width(pool_size + 4)
+ .pooling_height(1)
+ .pooling_width(pool_size)
+ .stride_width(2)
+ .channels(channels)
+ .TestF16();
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, unit_batch_large_1xM_pool_with_dilation) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) {
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(2)
+ .input_width(2 * pool_size + 1)
+ .padding_left(1)
+ .padding_right(1)
+ .pooling_height(1)
+ .pooling_width(pool_size)
+ .dilation_width(2)
+ .channels(channels)
+ .TestF16();
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, unit_batch_large_Mx1_pool) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) {
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(pool_size + 1)
+ .input_width(3)
+ .pooling_height(pool_size)
+ .pooling_width(1)
+ .channels(channels)
+ .TestF16();
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, unit_batch_large_Mx1_pool_with_padding) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) {
+ for (size_t padding_top = 0; padding_top <= 1; padding_top++) {
+ for (size_t padding_bottom = 0; padding_bottom <= 1; padding_bottom++) {
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(pool_size + 1)
+ .input_width(3)
+ .padding_top(padding_top)
+ .padding_bottom(padding_bottom)
+ .pooling_height(pool_size)
+ .pooling_width(1)
+ .channels(channels)
+ .TestF16();
+ }
+ }
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, unit_batch_large_Mx1_pool_with_tf_same_padding) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) {
+ for (size_t input_height = pool_size; input_height <= pool_size * 2; input_height++) {
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(input_height)
+ .input_width(3)
+ .padding_tf_same(true)
+ .pooling_height(pool_size)
+ .pooling_width(1)
+ .channels(channels)
+ .TestF16();
+ }
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, unit_batch_large_Mx1_pool_with_stride) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) {
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(pool_size + 3)
+ .input_width(3)
+ .pooling_height(pool_size)
+ .pooling_width(1)
+ .stride_height(2)
+ .channels(channels)
+ .TestF16();
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, unit_batch_large_Mx1_pool_with_dilation) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) {
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(2 * pool_size)
+ .input_width(3)
+ .padding_top(1)
+ .padding_bottom(1)
+ .pooling_height(pool_size)
+ .pooling_width(1)
+ .dilation_height(2)
+ .channels(channels)
+ .TestF16();
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, unit_batch_large_pool_with_input_stride) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) {
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(pool_size + 1)
+ .input_width(3)
+ .pooling_height(pool_size)
+ .pooling_width(1)
+ .channels(channels)
+ .input_pixel_stride(5 * channels)
+ .TestF16();
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(2)
+ .input_width(pool_size + 2)
+ .pooling_height(1)
+ .pooling_width(pool_size)
+ .channels(channels)
+ .input_pixel_stride(5 * channels)
+ .TestF16();
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, unit_batch_large_pool_with_output_stride) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) {
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(pool_size + 1)
+ .input_width(3)
+ .pooling_height(pool_size)
+ .pooling_width(1)
+ .channels(channels)
+ .output_pixel_stride(5 * channels)
+ .TestF16();
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(2)
+ .input_width(pool_size + 2)
+ .pooling_height(1)
+ .pooling_width(pool_size)
+ .channels(channels)
+ .output_pixel_stride(5 * channels)
+ .TestF16();
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, unit_batch_large_pool_with_qmin) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) {
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(pool_size + 1)
+ .input_width(3)
+ .pooling_height(pool_size)
+ .pooling_width(1)
+ .channels(channels)
+ .qmin(192)
+ .TestF16();
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(2)
+ .input_width(pool_size + 2)
+ .pooling_height(1)
+ .pooling_width(pool_size)
+ .channels(channels)
+ .qmin(192)
+ .TestF16();
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, unit_batch_large_pool_with_qmax) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) {
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(pool_size + 1)
+ .input_width(3)
+ .pooling_height(pool_size)
+ .pooling_width(1)
+ .channels(channels)
+ .qmax(192)
+ .TestF16();
+ MaxPoolingOperatorTester()
+ .batch_size(1)
+ .input_height(2)
+ .input_width(pool_size + 2)
+ .pooling_height(1)
+ .pooling_width(pool_size)
+ .channels(channels)
+ .qmax(192)
+ .TestF16();
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, small_batch_small_pool) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) {
+ MaxPoolingOperatorTester()
+ .batch_size(3)
+ .input_height(pool_size + 1)
+ .input_width(3)
+ .pooling_height(pool_size)
+ .pooling_width(1)
+ .channels(channels)
+ .TestF16();
+ MaxPoolingOperatorTester()
+ .batch_size(3)
+ .input_height(2)
+ .input_width(pool_size + 2)
+ .pooling_height(1)
+ .pooling_width(pool_size)
+ .channels(channels)
+ .TestF16();
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, small_batch_small_pool_with_input_stride) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) {
+ MaxPoolingOperatorTester()
+ .batch_size(3)
+ .input_height(pool_size + 1)
+ .input_width(3)
+ .pooling_height(pool_size)
+ .pooling_width(1)
+ .channels(channels)
+ .input_pixel_stride(5 * channels)
+ .TestF16();
+ MaxPoolingOperatorTester()
+ .batch_size(3)
+ .input_height(2)
+ .input_width(pool_size + 2)
+ .pooling_height(1)
+ .pooling_width(pool_size)
+ .channels(channels)
+ .input_pixel_stride(5 * channels)
+ .TestF16();
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, small_batch_small_pool_with_output_stride) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) {
+ MaxPoolingOperatorTester()
+ .batch_size(3)
+ .input_height(pool_size + 1)
+ .input_width(3)
+ .pooling_height(pool_size)
+ .pooling_width(1)
+ .channels(channels)
+ .output_pixel_stride(5 * channels)
+ .TestF16();
+ MaxPoolingOperatorTester()
+ .batch_size(3)
+ .input_height(2)
+ .input_width(pool_size + 2)
+ .pooling_height(1)
+ .pooling_width(pool_size)
+ .channels(channels)
+ .output_pixel_stride(5 * channels)
+ .TestF16();
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, small_batch_large_pool) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) {
+ MaxPoolingOperatorTester()
+ .batch_size(3)
+ .input_height(pool_size + 1)
+ .input_width(3)
+ .pooling_height(pool_size)
+ .pooling_width(1)
+ .channels(channels)
+ .TestF16();
+ MaxPoolingOperatorTester()
+ .batch_size(3)
+ .input_height(2)
+ .input_width(pool_size + 2)
+ .pooling_height(1)
+ .pooling_width(pool_size)
+ .channels(channels)
+ .TestF16();
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, small_batch_large_pool_with_input_stride) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) {
+ MaxPoolingOperatorTester()
+ .batch_size(3)
+ .input_height(pool_size + 1)
+ .input_width(3)
+ .pooling_height(pool_size)
+ .pooling_width(1)
+ .channels(channels)
+ .input_pixel_stride(5 * channels)
+ .TestF16();
+ MaxPoolingOperatorTester()
+ .batch_size(3)
+ .input_height(2)
+ .input_width(pool_size + 2)
+ .pooling_height(1)
+ .pooling_width(pool_size)
+ .channels(channels)
+ .input_pixel_stride(5 * channels)
+ .TestF16();
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, small_batch_large_pool_with_output_stride) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ for (size_t channels = 1; channels <= 100; channels += 15) {
+ for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) {
+ MaxPoolingOperatorTester()
+ .batch_size(3)
+ .input_height(pool_size + 1)
+ .input_width(3)
+ .pooling_height(pool_size)
+ .pooling_width(1)
+ .channels(channels)
+ .output_pixel_stride(5 * channels)
+ .TestF16();
+ MaxPoolingOperatorTester()
+ .batch_size(3)
+ .input_height(2)
+ .input_width(pool_size + 2)
+ .pooling_height(1)
+ .pooling_width(pool_size)
+ .channels(channels)
+ .output_pixel_stride(5 * channels)
+ .TestF16();
+ }
+ }
+}
+
+TEST(MAX_POOLING_NHWC_F16, setup_increasing_batch) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ MaxPoolingOperatorTester()
+ .batch_size(3)
+ .next_batch_size(5)
+ .input_height(8)
+ .input_width(8)
+ .pooling_height(5)
+ .pooling_width(3)
+ .channels(24)
+ .TestSetupF16();
+}
+
+TEST(MAX_POOLING_NHWC_F16, setup_decreasing_batch) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ MaxPoolingOperatorTester()
+ .batch_size(5)
+ .next_batch_size(3)
+ .input_height(8)
+ .input_width(8)
+ .pooling_height(5)
+ .pooling_width(3)
+ .channels(24)
+ .TestSetupF16();
+}
+
+TEST(MAX_POOLING_NHWC_F16, setup_changing_height) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ MaxPoolingOperatorTester()
+ .batch_size(3)
+ .input_height(8)
+ .input_width(8)
+ .next_input_height(9)
+ .pooling_height(5)
+ .pooling_width(3)
+ .channels(24)
+ .TestSetupF16();
+ MaxPoolingOperatorTester()
+ .batch_size(3)
+ .input_height(8)
+ .input_width(8)
+ .next_input_height(7)
+ .pooling_height(5)
+ .pooling_width(3)
+ .channels(24)
+ .TestSetupF16();
+}
+
+TEST(MAX_POOLING_NHWC_F16, setup_changing_width) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ MaxPoolingOperatorTester()
+ .batch_size(3)
+ .input_height(8)
+ .input_width(8)
+ .next_input_width(9)
+ .pooling_height(5)
+ .pooling_width(3)
+ .channels(24)
+ .TestSetupF16();
+ MaxPoolingOperatorTester()
+ .batch_size(3)
+ .input_height(8)
+ .input_width(8)
+ .next_input_width(7)
+ .pooling_height(5)
+ .pooling_width(3)
+ .channels(24)
+ .TestSetupF16();
+}
+
+TEST(MAX_POOLING_NHWC_F16, setup_swap_height_and_width) {
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ MaxPoolingOperatorTester()
+ .batch_size(3)
+ .input_height(9)
+ .input_width(8)
+ .next_input_height(8)
+ .next_input_width(9)
+ .pooling_height(5)
+ .pooling_width(3)
+ .channels(24)
+ .TestSetupF16();
+}
+
+
TEST(MAX_POOLING_NHWC_F32, unit_batch_small_1xM_pool) {
ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
for (size_t channels = 1; channels <= 100; channels += 15) {
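The F16 cases above mirror the existing U8/F32 tests one-for-one. Their pool-size loops read xnn_params.f32.maxpool.mr and .qr, which match the F16 configuration registered in init.c (9 and 8), so the small_pool tests exercise the single-pass ukernel path and the large_pool tests the multipass path. The qmin()/qmax() knobs are mapped onto float clamping bounds over the observed reference-output range; a condensed restatement of that tester logic (the helper name is illustrative; the F16 tester additionally rounds both bounds through half precision):

#include <math.h>

// Maps the tester's integer qmin/qmax knobs (0..255) onto float clamping
// bounds over the observed reference-output range [acc_min, acc_max].
static void compute_clamp_bounds(float acc_min, float acc_max,
                                 unsigned qmin, unsigned qmax,
                                 float* out_min, float* out_max) {
  const float range = acc_max - acc_min;
  *out_min = acc_min + range / 255.0f * (float) qmin;
  *out_max = acc_max - range / 255.0f * (float) (255 - qmax);
  if (range == 0.0f) {  // constant output: disable clamping entirely
    *out_min = -INFINITY;
    *out_max = +INFINITY;
  }
  if (qmin == 0)   { *out_min = -INFINITY; }  // default knob: no lower clamp
  if (qmax == 255) { *out_max = +INFINITY; }  // default knob: no upper clamp
}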
diff --git a/test/max-pooling-operator-tester.h b/test/max-pooling-operator-tester.h
index 349c16f30..106150603 100644
--- a/test/max-pooling-operator-tester.h
+++ b/test/max-pooling-operator-tester.h
@@ -10,6 +10,8 @@
#include <gtest/gtest.h>
+#include <fp16.h>
+
#include <algorithm>
#include <cassert>
#include <cstddef>
@@ -615,6 +617,119 @@ class MaxPoolingOperatorTester {
}
}
+ void TestF16() const {
+ std::random_device random_device;
+ auto rng = std::mt19937(random_device());
+ // Note: we need to avoid FP16 denormals in the generated tensor because they might be processed differently in
+ // native vs emulated arithmetic, and we use exact comparison to verify the results against the reference.
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(0.001f, 1.0f), rng);
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+ std::vector<uint16_t> input((batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + channels() + XNN_EXTRA_BYTES / sizeof(uint16_t));
+ std::vector<uint16_t> output((batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + channels() + XNN_EXTRA_BYTES / sizeof(uint16_t));
+ std::vector<float> output_ref(batch_size() * output_height() * output_width() * channels());
+ for (size_t iteration = 0; iteration < iterations(); iteration++) {
+ std::generate(input.begin(), input.end(), std::ref(f16rng));
+ std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
+
+ // Compute reference results, without clamping.
+ for (size_t i = 0; i < batch_size(); i++) {
+ for (size_t oy = 0; oy < output_height(); oy++) {
+ for (size_t ox = 0; ox < output_width(); ox++) {
+ for (size_t c = 0; c < channels(); c++) {
+ float max_value = -std::numeric_limits<float>::infinity();
+ for (size_t py = 0; py < pooling_height(); py++) {
+ const size_t iy = oy * stride_height() + py * dilation_height() - padding_top();
+ for (size_t px = 0; px < pooling_width(); px++) {
+ const size_t ix = ox * stride_width() + px * dilation_width() - padding_left();
+ if (ix < input_width() && iy < input_height()) {
+ max_value = std::max(max_value,
+ fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + c]));
+ }
+ }
+ }
+ output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] = max_value;
+ }
+ }
+ }
+ }
+
+ // Compute clamping parameters.
+ const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
+ const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
+ const float accumulated_range = accumulated_max - accumulated_min;
+ float output_min = accumulated_min + accumulated_range / 255.0f * float(qmin());
+ float output_max = accumulated_max - accumulated_range / 255.0f * float(255 - qmax());
+ output_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_min));
+ output_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_max));
+ if (accumulated_range == 0.0f) {
+ output_min = -std::numeric_limits<float>::infinity();
+ output_max = +std::numeric_limits<float>::infinity();
+ }
+ if (qmin() == std::numeric_limits<uint8_t>::min()) {
+ output_min = -std::numeric_limits<float>::infinity();
+ }
+ if (qmax() == std::numeric_limits<uint8_t>::max()) {
+ output_max = +std::numeric_limits<float>::infinity();
+ }
+
+ // Clamp reference results.
+ for (float& value : output_ref) {
+ value = std::max(std::min(value, output_max), output_min);
+ }
+
+ // Create, setup, run, and destroy Max Pooling operator.
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ xnn_operator_t max_pooling_op = nullptr;
+
+ const xnn_status status = xnn_create_max_pooling2d_nhwc_f16(
+ padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
+ padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
+ pooling_height(), pooling_width(),
+ stride_height(), stride_width(),
+ dilation_height(), dilation_width(),
+ channels(), input_pixel_stride(), output_pixel_stride(),
+ output_min, output_max,
+ padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0,
+ &max_pooling_op);
+ if (status == xnn_status_unsupported_hardware) {
+ GTEST_SKIP();
+ }
+ ASSERT_EQ(xnn_status_success, status);
+ ASSERT_NE(nullptr, max_pooling_op);
+
+ // Smart pointer to automatically delete max_pooling_op.
+ std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_max_pooling_op(max_pooling_op, xnn_delete_operator);
+
+ ASSERT_EQ(xnn_status_success,
+ xnn_setup_max_pooling2d_nhwc_f16(
+ max_pooling_op,
+ batch_size(), input_height(), input_width(),
+ input.data(), output.data(),
+ nullptr /* thread pool */));
+
+ ASSERT_EQ(xnn_status_success,
+ xnn_run_operator(max_pooling_op, nullptr /* thread pool */));
+
+ // Verify results.
+ for (size_t i = 0; i < batch_size(); i++) {
+ for (size_t y = 0; y < output_height(); y++) {
+ for (size_t x = 0; x < output_width(); x++) {
+ for (size_t c = 0; c < channels(); c++) {
+ ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), output_max);
+ ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), output_min);
+ ASSERT_EQ(
+ fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]),
+ output_ref[((i * output_height() + y) * output_width() + x) * channels() + c]) <<
+ "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c
+ << ", min = " << output_min << ", max = " << output_max;
+ }
+ }
+ }
+ }
+ }
+ }
+
void TestF32() const {
std::random_device random_device;
auto rng = std::mt19937(random_device());
@@ -995,6 +1110,178 @@ class MaxPoolingOperatorTester {
}
}
+ void TestSetupF16() const {
+ std::random_device random_device;
+ auto rng = std::mt19937(random_device());
+ // Note: we need to avoid FP16 denormals in the generated tensor because they might be processed differently in
+ // native vs emulated arithmetic, and we use exact comparison to verify the results against the reference.
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(0.001f, 1.0f), rng);
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+ std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) + std::max(
+ (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + channels(),
+ (next_batch_size() * next_input_height() * next_input_width() - 1) * input_pixel_stride() + channels()));
+ std::vector<uint16_t> output(XNN_EXTRA_BYTES / sizeof(uint16_t) + std::max(
+ (batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + channels(),
+ (next_batch_size() * next_output_height() * next_output_width() - 1) * output_pixel_stride() + channels()));
+ std::vector<float> output_ref(batch_size() * output_height() * output_width() * channels());
+ std::vector<float> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * channels());
+ for (size_t iteration = 0; iteration < iterations(); iteration++) {
+ std::generate(input.begin(), input.end(), std::ref(f16rng));
+ std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
+
+ // Compute reference results, without clamping.
+ for (size_t i = 0; i < batch_size(); i++) {
+ for (size_t oy = 0; oy < output_height(); oy++) {
+ for (size_t ox = 0; ox < output_width(); ox++) {
+ for (size_t c = 0; c < channels(); c++) {
+ float max_value = -std::numeric_limits<float>::infinity();
+ for (size_t py = 0; py < pooling_height(); py++) {
+ const size_t iy = oy * stride_height() + py * dilation_height() - padding_top();
+ for (size_t px = 0; px < pooling_width(); px++) {
+ const size_t ix = ox * stride_width() + px * dilation_width() - padding_left();
+ if (ix < input_width() && iy < input_height()) {
+ max_value = std::max(max_value,
+ fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + c]));
+ }
+ }
+ }
+ output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] = max_value;
+ }
+ }
+ }
+ }
+
+ // Compute clamping parameters.
+ const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
+ const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
+ const float accumulated_range = accumulated_max - accumulated_min;
+ float output_min = accumulated_min + accumulated_range / 255.0f * float(qmin());
+ float output_max = accumulated_max - accumulated_range / 255.0f * float(255 - qmax());
+ output_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_min));
+ output_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_max));
+ if (accumulated_range == 0.0f) {
+ output_min = -std::numeric_limits<float>::infinity();
+ output_max = +std::numeric_limits<float>::infinity();
+ }
+ if (qmin() == std::numeric_limits<uint8_t>::min()) {
+ output_min = -std::numeric_limits<float>::infinity();
+ }
+ if (qmax() == std::numeric_limits<uint8_t>::max()) {
+ output_max = +std::numeric_limits<float>::infinity();
+ }
+
+ // Clamp reference results.
+ for (float& value : output_ref) {
+ value = std::max(std::min(value, output_max), output_min);
+ }
+
+ // Create, setup, and run Max Pooling operator once.
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+ xnn_operator_t max_pooling_op = nullptr;
+
+ const xnn_status status = xnn_create_max_pooling2d_nhwc_f16(
+ padding_top(), padding_right(), padding_bottom(), padding_left(),
+ pooling_height(), pooling_width(),
+ stride_height(), stride_width(),
+ dilation_height(), dilation_width(),
+ channels(), input_pixel_stride(), output_pixel_stride(),
+ output_min, output_max,
+ 0, &max_pooling_op);
+ if (status == xnn_status_unsupported_hardware) {
+ GTEST_SKIP();
+ }
+ ASSERT_EQ(xnn_status_success, status);
+ ASSERT_NE(nullptr, max_pooling_op);
+
+ // Smart pointer to automatically delete max_pooling_op.
+ std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_max_pooling_op(max_pooling_op, xnn_delete_operator);
+
+ ASSERT_EQ(xnn_status_success,
+ xnn_setup_max_pooling2d_nhwc_f16(
+ max_pooling_op,
+ batch_size(), input_height(), input_width(),
+ input.data(), output.data(),
+ nullptr /* thread pool */));
+
+ ASSERT_EQ(xnn_status_success,
+ xnn_run_operator(max_pooling_op, nullptr /* thread pool */));
+
+ // Verify results of the first run.
+ for (size_t i = 0; i < batch_size(); i++) {
+ for (size_t y = 0; y < output_height(); y++) {
+ for (size_t x = 0; x < output_width(); x++) {
+ for (size_t c = 0; c < channels(); c++) {
+ ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), output_max);
+ ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), output_min);
+ ASSERT_EQ(
+ fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]),
+ output_ref[((i * output_height() + y) * output_width() + x) * channels() + c]) <<
+ "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c
+ << ", min = " << output_min << ", max = " << output_max;
+ }
+ }
+ }
+ }
+
+ // Re-generate data for the second run.
+ std::generate(input.begin(), input.end(), std::ref(f16rng));
+ std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
+
+ // Compute reference results for the second run, including clamping.
+ for (size_t i = 0; i < next_batch_size(); i++) {
+ for (size_t oy = 0; oy < next_output_height(); oy++) {
+ for (size_t ox = 0; ox < next_output_width(); ox++) {
+ for (size_t c = 0; c < channels(); c++) {
+ float max_value = -std::numeric_limits<float>::infinity();
+ for (size_t py = 0; py < pooling_height(); py++) {
+ const size_t iy = oy * stride_height() + py * dilation_height() - padding_top();
+ for (size_t px = 0; px < pooling_width(); px++) {
+ const size_t ix = ox * stride_width() + px * dilation_width() - padding_left();
+ if (ix < next_input_width() && iy < next_input_height()) {
+ max_value = std::max(max_value,
+ fp16_ieee_to_fp32_value(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_pixel_stride() + c]));
+ }
+ }
+ }
+ max_value = std::min(max_value, output_max);
+ max_value = std::max(max_value, output_min);
+ next_output_ref[((i * next_output_height() + oy) * next_output_width() + ox) * channels() + c] = max_value;
+ }
+ }
+ }
+ }
+
+ // Setup and run Max Pooling operator the second time, and destroy the operator.
+ ASSERT_EQ(xnn_status_success,
+ xnn_setup_max_pooling2d_nhwc_f16(
+ max_pooling_op,
+ next_batch_size(), next_input_height(), next_input_width(),
+ input.data(), output.data(),
+ nullptr /* thread pool */));
+
+ ASSERT_EQ(xnn_status_success,
+ xnn_run_operator(max_pooling_op, nullptr /* thread pool */));
+
+ // Verify results of the second run.
+ for (size_t i = 0; i < next_batch_size(); i++) {
+ for (size_t y = 0; y < next_output_height(); y++) {
+ for (size_t x = 0; x < next_output_width(); x++) {
+ for (size_t c = 0; c < channels(); c++) {
+ ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c]), output_max);
+ ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c]), output_min);
+ ASSERT_EQ(
+ fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c]),
+ next_output_ref[((i * next_output_height() + y) * next_output_width() + x) * channels() + c]) <<
+ "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c
+ << ", min = " << output_min << ", max = " << output_max;
+ }
+ }
+ }
+ }
+ }
+ }
+
void TestSetupF32() const {
std::random_device random_device;
auto rng = std::mt19937(random_device());