author | Marat Dukhan <maratek@google.com> | 2022-02-04 01:55:53 -0800
---|---|---
committer | XNNPACK Team <xnnpack-github-robot@google.com> | 2022-02-04 01:57:03 -0800
commit | 5756a927fc5044bdcfebe57d4bd84408ca0a0975 (patch) |
tree | 18e40a963d466dff13bb2a35340f3b032f047c0c |
parent | af1671ab277a0742f8c7251b6b4ad1a16aba79bd (diff) |
download | XNNPACK-5756a927fc5044bdcfebe57d4bd84408ca0a0975.tar.gz |
F16 Max Pooling NHWC operator
PiperOrigin-RevId: 426345207
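
The new public entry points (declared in include/xnnpack.h in the diff below) mirror the existing F32 API. A minimal usage sketch — not part of the commit itself — assuming the caller supplies the buffer sizes and NHWC FP16 data stored as `uint16_t`, with return-value checks elided:

```c
#include <math.h>    // INFINITY
#include <stddef.h>  // size_t
#include <stdint.h>  // uint16_t

#include <xnnpack.h>

// Sketch: 2x2 max pooling with stride 2 over an NHWC FP16 tensor.
// Status checks are elided here; see the tests below for full handling.
void max_pool_2x2_f16(const uint16_t* input, uint16_t* output,
                      size_t batch_size, size_t input_height,
                      size_t input_width, size_t channels) {
  xnn_initialize(NULL /* allocator */);

  xnn_operator_t max_pooling_op = NULL;
  xnn_create_max_pooling2d_nhwc_f16(
      /*input_padding_top=*/0, /*input_padding_right=*/0,
      /*input_padding_bottom=*/0, /*input_padding_left=*/0,
      /*pooling_height=*/2, /*pooling_width=*/2,
      /*stride_height=*/2, /*stride_width=*/2,
      /*dilation_height=*/1, /*dilation_width=*/1,
      channels, /*input_pixel_stride=*/channels,
      /*output_pixel_stride=*/channels,
      /*output_min=*/-INFINITY, /*output_max=*/+INFINITY,
      /*flags=*/0, &max_pooling_op);

  xnn_setup_max_pooling2d_nhwc_f16(
      max_pooling_op, batch_size, input_height, input_width,
      input, output, /*threadpool=*/NULL);
  xnn_run_operator(max_pooling_op, /*threadpool=*/NULL);
  xnn_delete_operator(max_pooling_op);
}
```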
-rw-r--r-- | BUILD.bazel | 2
-rwxr-xr-x | CMakeLists.txt | 2
-rw-r--r-- | include/xnnpack.h | 28
-rw-r--r-- | src/amalgam/f16c.c | 262
-rw-r--r-- | src/init.c | 14
-rw-r--r-- | src/operator-strings.c | 2
-rw-r--r-- | src/operators/max-pooling-nhwc.c | 89
-rw-r--r-- | src/xnnpack/operator.h | 1
-rw-r--r-- | src/xnnpack/params.h | 2
-rw-r--r-- | test/max-pooling-nhwc.cc | 819
-rw-r--r-- | test/max-pooling-operator-tester.h | 287
11 files changed, 1508 insertions, 0 deletions
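
The kernels added here use XNNPACK's 9p8x schedule (registered with `mr = 9`, `qr = 8` in src/init.c below): a first pass reduces up to 9 pooling rows directly into the output, then each remainder pass folds up to 8 more rows into the partial result, aliasing out-of-range row pointers back to row 0, which is harmless under `max`. The following scalar analogue is an illustration only — plain `float` instead of 8-wide FP16 lanes, and clamping folded to the end rather than applied at every store as the real NEONFP16ARITH and F16C kernels do (equivalent for max pooling):

```c
#include <math.h>     // fmaxf, fminf
#include <stddef.h>   // size_t, ptrdiff_t

// Scalar analogue of the 9p8x max-pooling schedule (illustrative only).
static void maxpool_9p8x_sketch(
    size_t kernel_elements, size_t channels,
    const float** input, float* output,
    float output_min, float output_max) {
  // First pass: reduce up to 9 rows directly into the output row.
  // Rows beyond kernel_elements alias row 0; max() makes this a no-op.
  for (size_t c = 0; c < channels; c++) {
    float vmax = input[0][c];
    for (size_t k = 1; k < 9; k++) {
      vmax = fmaxf(vmax, input[k < kernel_elements ? k : 0][c]);
    }
    output[c] = vmax;
  }
  input += 9;
  // Remainder passes: fold up to 8 more rows into the partial result.
  for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
    for (size_t c = 0; c < channels; c++) {
      float vmax = output[c];
      for (ptrdiff_t j = 0; j < 8; j++) {
        vmax = fmaxf(vmax, input[j < k ? j : 0][c]);
      }
      output[c] = vmax;
    }
    input += 8;
  }
  // Final clamp to [output_min, output_max].
  for (size_t c = 0; c < channels; c++) {
    output[c] = fmaxf(fminf(output[c], output_max), output_min);
  }
}
```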
diff --git a/BUILD.bazel b/BUILD.bazel index b9156846d..72fe1a218 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -4425,6 +4425,7 @@ PROD_AARCH64_NEONFP16ARITH_MICROKERNEL_SRCS = [ "src/f16-gemm/gen/6x16-minmax-neonfp16arith-ld64.c", "src/f16-igemm/gen/1x16-minmax-neonfp16arith-ld64.c", "src/f16-igemm/gen/6x16-minmax-neonfp16arith-ld64.c", + "src/f16-maxpool/9p8x-minmax-neonfp16arith-c8.c", "src/f16-prelu/gen/neonfp16arith-2x16.c", "src/f16-vbinary/gen/vadd-minmax-neonfp16arith-x16.c", "src/f16-vbinary/gen/vaddc-minmax-neonfp16arith-x16.c", @@ -6017,6 +6018,7 @@ PROD_F16C_MICROKERNEL_SRCS = [ "src/f16-f32-vcvt/gen/vcvt-f16c-x16.c", "src/f16-gavgpool/gen/7p7x-minmax-f16c-c8.c", "src/f16-gavgpool/gen/7x-minmax-f16c-c8.c", + "src/f16-maxpool/9p8x-minmax-f16c-c8.c", "src/f16-prelu/gen/f16c-2x16.c", "src/f16-vbinary/gen/vadd-minmax-f16c-x16.c", "src/f16-vbinary/gen/vaddc-minmax-f16c-x16.c", diff --git a/CMakeLists.txt b/CMakeLists.txt index 8ef91efc0..936d7b9b5 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3181,6 +3181,7 @@ SET(PROD_AARCH64_NEONFP16ARITH_MICROKERNEL_SRCS src/f16-gemm/gen/6x16-minmax-neonfp16arith-ld64.c src/f16-igemm/gen/1x16-minmax-neonfp16arith-ld64.c src/f16-igemm/gen/6x16-minmax-neonfp16arith-ld64.c + src/f16-maxpool/9p8x-minmax-neonfp16arith-c8.c src/f16-prelu/gen/neonfp16arith-2x16.c src/f16-vbinary/gen/vadd-minmax-neonfp16arith-x16.c src/f16-vbinary/gen/vaddc-minmax-neonfp16arith-x16.c @@ -4757,6 +4758,7 @@ SET(PROD_F16C_MICROKERNEL_SRCS src/f16-f32-vcvt/gen/vcvt-f16c-x16.c src/f16-gavgpool/gen/7p7x-minmax-f16c-c8.c src/f16-gavgpool/gen/7x-minmax-f16c-c8.c + src/f16-maxpool/9p8x-minmax-f16c-c8.c src/f16-prelu/gen/neonfp16arith-2x16.c src/f16-vbinary/gen/vadd-minmax-f16c-x16.c src/f16-vbinary/gen/vaddc-minmax-f16c-x16.c diff --git a/include/xnnpack.h b/include/xnnpack.h index c29899a9d..f8a581c09 100644 --- a/include/xnnpack.h +++ b/include/xnnpack.h @@ -1996,6 +1996,34 @@ enum xnn_status xnn_setup_hardswish_nc_f16( void* output, pthreadpool_t threadpool); +enum xnn_status xnn_create_max_pooling2d_nhwc_f16( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t pooling_height, + uint32_t pooling_width, + uint32_t stride_height, + uint32_t stride_width, + uint32_t dilation_height, + uint32_t dilation_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* max_pooling_op_out); + +enum xnn_status xnn_setup_max_pooling2d_nhwc_f16( + xnn_operator_t max_pooling_op, + size_t batch_size, + size_t input_height, + size_t input_width, + const void* input, + void* output, + pthreadpool_t threadpool); + enum xnn_status xnn_create_multiply_nd_f16( float output_min, float output_max, diff --git a/src/amalgam/f16c.c b/src/amalgam/f16c.c index 1e41f3aab..d28728b8b 100644 --- a/src/amalgam/f16c.c +++ b/src/amalgam/f16c.c @@ -11,6 +11,7 @@ #include <xnnpack/gavgpool.h> #include <xnnpack/intrinsics-polyfill.h> #include <xnnpack/math.h> +#include <xnnpack/maxpool.h> #include <xnnpack/prelu.h> #include <xnnpack/vbinary.h> #include <xnnpack/vcvt.h> @@ -358,6 +359,267 @@ void xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8( } } +void xnn_f16_maxpool_minmax_ukernel_9p8x__f16c_c8( + size_t output_pixels, + size_t kernel_elements, + size_t channels, + const void** input, + size_t input_offset, + void* output, + size_t input_increment, + size_t output_increment, + const union xnn_f16_minmax_params params[restrict 
XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(output_pixels != 0); + assert(kernel_elements != 0); + assert(channels != 0); + + const __m256 voutput_min = _mm256_load_ps(params->avx.min); + const __m256 voutput_max = _mm256_load_ps(params->avx.max); + do { + uint16_t* o = output; + { + const uint16_t* i0 = *input++; + const uint16_t* i1 = *input++; + const uint16_t* i2 = *input++; + const uint16_t* i3 = *input++; + const uint16_t* i4 = *input++; + const uint16_t* i5 = *input++; + const uint16_t* i6 = *input++; + const uint16_t* i7 = *input++; + const uint16_t* i8 = *input++; + i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset); + i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset); + i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset); + i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset); + i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset); + i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset); + i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset); + i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset); + i8 = (const uint16_t*) ((uintptr_t) i8 + input_offset); + if (kernel_elements < 2) { + i1 = i0; + } + if (kernel_elements <= 2) { + i2 = i0; + } + if (kernel_elements < 4) { + i3 = i0; + } + if (kernel_elements <= 4) { + i4 = i0; + } + if (kernel_elements < 6) { + i5 = i0; + } + if (kernel_elements <= 6) { + i6 = i0; + } + if (kernel_elements < 8) { + i7 = i0; + } + if (kernel_elements <= 8) { + i8 = i0; + } + + size_t c = channels; + for (; c >= 8; c -= 8) { + const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); + i0 += 8; + const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); + i1 += 8; + const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); + i2 += 8; + const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); + i3 += 8; + const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); + i4 += 8; + const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); + i5 += 8; + const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); + i6 += 8; + const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7)); + i7 += 8; + const __m256 vi8 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8)); + i8 += 8; + + const __m256 vmax018 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vi8); + const __m256 vmax23 = _mm256_max_ps(vi2, vi3); + const __m256 vmax45 = _mm256_max_ps(vi4, vi5); + const __m256 vmax67 = _mm256_max_ps(vi6, vi7); + + const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45); + const __m256 vmax01678 = _mm256_max_ps(vmax018, vmax67); + const __m256 vmax = _mm256_max_ps(vmax2345, vmax01678); + const __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min); + + _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC)); + o += 8; + } + if (c != 0) { + const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); + i0 += 8; + const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); + i1 += 8; + const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); + i2 += 8; + const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); + i3 += 8; + const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); + i4 += 8; + const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); + i5 += 8; + const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); + i6 += 8; + const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const 
__m128i*) i7)); + i7 += 8; + const __m256 vi8 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8)); + i8 += 8; + + const __m256 vmax018 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vi8); + const __m256 vmax23 = _mm256_max_ps(vi2, vi3); + const __m256 vmax45 = _mm256_max_ps(vi4, vi5); + const __m256 vmax67 = _mm256_max_ps(vi6, vi7); + + const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45); + const __m256 vmax01678 = _mm256_max_ps(vmax018, vmax67); + const __m256 vmax = _mm256_max_ps(vmax2345, vmax01678); + __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min); + + __m128i vh = _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC); + if (c & 4) { + _mm_storel_epi64((__m128i*) o, vh); + vh = _mm_unpackhi_epi64(vh, vh); + o += 4; + } + if (c & 2) { + *((uint32_t*) o) = (uint32_t) _mm_cvtsi128_si32(vh); + vh = _mm_srli_epi64(vh, 32); + o += 2; + } + if (c & 1) { + *o = _mm_extract_epi16(vh, 0); + o += 1; + } + } + } + + for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) { + const uint16_t* i0 = *input++; + const uint16_t* i1 = *input++; + const uint16_t* i2 = *input++; + const uint16_t* i3 = *input++; + const uint16_t* i4 = *input++; + const uint16_t* i5 = *input++; + const uint16_t* i6 = *input++; + const uint16_t* i7 = *input++; + i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset); + i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset); + i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset); + i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset); + i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset); + i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset); + i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset); + i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset); + if (k < 2) { + i1 = i0; + } + if (k <= 2) { + i2 = i0; + } + if (k < 4) { + i3 = i0; + } + if (k <= 4) { + i4 = i0; + } + if (k < 6) { + i5 = i0; + } + if (k <= 6) { + i6 = i0; + } + if (k < 8) { + i7 = i0; + } + + o = output; + size_t c = channels; + for (; c >= 8; c -= 8) { + const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); + i0 += 8; + const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); + i1 += 8; + const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); + i2 += 8; + const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); + i3 += 8; + const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); + i4 += 8; + const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); + i5 += 8; + const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); + i6 += 8; + const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7)); + i7 += 8; + const __m256 vo = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) o)); + + const __m256 vmax01 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vo); + const __m256 vmax23 = _mm256_max_ps(vi2, vi3); + const __m256 vmax45 = _mm256_max_ps(vi4, vi5); + const __m256 vmax67 = _mm256_max_ps(vi6, vi7); + + const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45); + const __m256 vmax0167 = _mm256_max_ps(vmax01, vmax67); + const __m256 vmax = _mm256_max_ps(vmax2345, vmax0167); + const __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min); + + _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC)); + o += 8; + } + if (c != 0) { + const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); + const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); + const __m256 vi2 = 
_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); + const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); + const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); + const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); + const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); + const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7)); + const __m256 vo = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) o)); + + const __m256 vmax01 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vo); + const __m256 vmax23 = _mm256_max_ps(vi2, vi3); + const __m256 vmax45 = _mm256_max_ps(vi4, vi5); + const __m256 vmax67 = _mm256_max_ps(vi6, vi7); + + const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45); + const __m256 vmax0167 = _mm256_max_ps(vmax01, vmax67); + const __m256 vmax = _mm256_max_ps(vmax2345, vmax0167); + __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min); + + __m128i vh = _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC); + if (c & 4) { + _mm_storel_epi64((__m128i*) o, vh); + vh = _mm_unpackhi_epi64(vh, vh); + o += 4; + } + if (c & 2) { + *((uint32_t*) o) = (uint32_t) _mm_cvtsi128_si32(vh); + vh = _mm_srli_epi64(vh, 32); + o += 2; + } + if (c & 1) { + *o = _mm_extract_epi16(vh, 0); + o += 1; + } + } + } + input = (const void**) ((uintptr_t) input + input_increment); + output = (uint16_t*) ((uintptr_t) o + output_increment); + } while (--output_pixels != 0); +} + void xnn_f16_prelu_ukernel__f16c_2x16( size_t rows, size_t channels, diff --git a/src/init.c b/src/init.c index fc167e891..8a1f66601 100644 --- a/src/init.c +++ b/src/init.c @@ -2440,6 +2440,13 @@ static void init(void) { .channel_tile = 8, }; + xnn_params.f16.maxpool = (struct maxpool_parameters) { + .ukernel = (xnn_maxpool_ukernel_function) xnn_f16_maxpool_minmax_ukernel_9p8x__neonfp16arith_c8, + .init.f16 = xnn_init_f16_minmax_neon_params, + .mr = 9, + .qr = 8, + }; + xnn_params.f16.prelu = (struct prelu_parameters) { .ukernel = (xnn_prelu_ukernel_function) xnn_f16_prelu_ukernel__neonfp16arith_2x16, .row_tile = 2, @@ -3665,6 +3672,13 @@ static void init(void) { .channel_tile = 8, }; + xnn_params.f16.maxpool = (struct maxpool_parameters) { + .ukernel = (xnn_maxpool_ukernel_function) xnn_f16_maxpool_minmax_ukernel_9p8x__f16c_c8, + .init.f16 = xnn_init_f16_minmax_avx_params, + .mr = 9, + .qr = 8, + }; + xnn_params.f16.prelu = (struct prelu_parameters) { .ukernel = (xnn_prelu_ukernel_function) xnn_f16_prelu_ukernel__f16c_2x16, .row_tile = 2, diff --git a/src/operator-strings.c b/src/operator-strings.c index 9bb524bed..dce9c4671 100644 --- a/src/operator-strings.c +++ b/src/operator-strings.c @@ -124,6 +124,8 @@ const char* xnn_operator_type_to_string(enum xnn_operator_type type) { return "Leaky ReLU (NC, F32)"; case xnn_operator_type_leaky_relu_nc_qu8: return "Leaky ReLU (NC, QU8)"; + case xnn_operator_type_max_pooling_nhwc_f16: + return "Max Pooling (NHWC, F16)"; case xnn_operator_type_max_pooling_nhwc_f32: return "Max Pooling (NHWC, F32)"; case xnn_operator_type_max_pooling_nhwc_s8: diff --git a/src/operators/max-pooling-nhwc.c b/src/operators/max-pooling-nhwc.c index 1ec409fdb..8e13b2c47 100644 --- a/src/operators/max-pooling-nhwc.c +++ b/src/operators/max-pooling-nhwc.c @@ -14,6 +14,8 @@ #include <stdlib.h> #include <string.h> +#include <fp16.h> + #include <xnnpack.h> #include <xnnpack/allocator.h> #include <xnnpack/common.h> @@ -455,6 +457,66 @@ enum xnn_status xnn_create_max_pooling2d_nhwc_f32( max_pooling_op_out); } +enum 
xnn_status xnn_create_max_pooling2d_nhwc_f16( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t pooling_height, + uint32_t pooling_width, + uint32_t stride_height, + uint32_t stride_width, + uint32_t dilation_height, + uint32_t dilation_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* max_pooling_op_out) +{ + if (isnan(output_min)) { + xnn_log_error( + "failed to create %s with NaN output lower bound: lower bound must be non-NaN", + xnn_operator_type_to_string(xnn_operator_type_max_pooling_nhwc_f16)); + return xnn_status_invalid_parameter; + } + + if (isnan(output_max)) { + xnn_log_error( + "failed to create %s with NaN output upper bound: upper bound must be non-NaN", + xnn_operator_type_to_string(xnn_operator_type_max_pooling_nhwc_f16)); + return xnn_status_invalid_parameter; + } + + const uint16_t output_min_as_half = fp16_ieee_from_fp32_value(output_min); + const uint16_t output_max_as_half = fp16_ieee_from_fp32_value(output_max); + output_min = fp16_ieee_to_fp32_value(output_min_as_half); + output_max = fp16_ieee_to_fp32_value(output_max_as_half); + if (output_min >= output_max) { + xnn_log_error( + "failed to create %s operator with [%.7g, %.7g] output range: lower bound must be below upper bound", + xnn_operator_type_to_string(xnn_operator_type_max_pooling_nhwc_f16), output_min, output_max); + return xnn_status_invalid_parameter; + } + + union xnn_f16_minmax_params params; + if (xnn_params.f16.maxpool.init.f16 != NULL) { + xnn_params.f16.maxpool.init.f16(¶ms, output_min_as_half, output_max_as_half); + } + return create_max_pooling2d_nhwc( + input_padding_top, input_padding_right, input_padding_bottom, input_padding_left, + pooling_height, pooling_width, + stride_height, stride_width, + dilation_height, dilation_width, + channels, input_pixel_stride, output_pixel_stride, + flags, + ¶ms, sizeof(params), XNN_INIT_FLAG_F16, + xnn_operator_type_max_pooling_nhwc_f16, + max_pooling_op_out); +} + enum xnn_status xnn_setup_max_pooling2d_nhwc_s8( xnn_operator_t max_pooling_op, size_t batch_size, @@ -509,6 +571,33 @@ enum xnn_status xnn_setup_max_pooling2d_nhwc_u8( pthreadpool_get_threads_count(threadpool)); } +enum xnn_status xnn_setup_max_pooling2d_nhwc_f16( + xnn_operator_t max_pooling_op, + size_t batch_size, + size_t input_height, + size_t input_width, + const void* input, + void* output, + pthreadpool_t threadpool) +{ + if (max_pooling_op->type != xnn_operator_type_max_pooling_nhwc_f16) { + xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)", + xnn_operator_type_to_string(xnn_operator_type_max_pooling_nhwc_f16), + xnn_operator_type_to_string(max_pooling_op->type)); + return xnn_status_invalid_parameter; + } + + return setup_max_pooling2d_nhwc( + max_pooling_op, + batch_size, input_height, input_width, + input, output, + 1 /* log2(sizeof(input element)) = log2(sizeof(uint16_t)) */, + 1 /* log2(sizeof(output element)) = log2(sizeof(uint16_t)) */, + &xnn_params.f16.maxpool, + &max_pooling_op->params.f16_minmax, sizeof(max_pooling_op->params.f16_minmax), + pthreadpool_get_threads_count(threadpool)); +} + enum xnn_status xnn_setup_max_pooling2d_nhwc_f32( xnn_operator_t max_pooling_op, size_t batch_size, diff --git a/src/xnnpack/operator.h b/src/xnnpack/operator.h index b41a4ebee..7f4429f3f 100644 --- a/src/xnnpack/operator.h +++ b/src/xnnpack/operator.h @@ -86,6 
+86,7 @@ enum xnn_operator_type { xnn_operator_type_hardswish_nc_f32, xnn_operator_type_leaky_relu_nc_f32, xnn_operator_type_leaky_relu_nc_qu8, + xnn_operator_type_max_pooling_nhwc_f16, xnn_operator_type_max_pooling_nhwc_f32, xnn_operator_type_max_pooling_nhwc_s8, xnn_operator_type_max_pooling_nhwc_u8, diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h index 53383c44f..90a145260 100644 --- a/src/xnnpack/params.h +++ b/src/xnnpack/params.h @@ -3943,6 +3943,7 @@ struct maxpool_parameters { xnn_init_s8_minmax_params_fn s8; xnn_init_u8_minmax_params_fn u8; xnn_init_f32_minmax_params_fn f32; + xnn_init_f16_minmax_params_fn f16; } init; uint8_t mr; uint8_t qr; @@ -4096,6 +4097,7 @@ struct xnn_parameters { struct gemm_parameters gemm; struct gemm_parameters gemm2; struct dwconv_parameters dwconv[XNN_MAX_F16_DWCONV_UKERNELS]; + struct maxpool_parameters maxpool; struct vunary_parameters hswish; struct prelu_parameters prelu; struct vbinary_parameters vadd; diff --git a/test/max-pooling-nhwc.cc b/test/max-pooling-nhwc.cc index 327dc970f..b25774471 100644 --- a/test/max-pooling-nhwc.cc +++ b/test/max-pooling-nhwc.cc @@ -829,6 +829,7 @@ TEST(MAX_POOLING_NHWC_S8, setup_swap_height_and_width) { .TestSetupS8(); } + TEST(MAX_POOLING_NHWC_U8, unit_batch_small_1xM_pool) { ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); for (size_t channels = 1; channels <= 100; channels += 15) { @@ -1645,6 +1646,824 @@ TEST(MAX_POOLING_NHWC_U8, setup_swap_height_and_width) { .TestSetupU8(); } + +TEST(MAX_POOLING_NHWC_F16, unit_batch_small_1xM_pool) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(pool_size + 2) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_small_1xM_pool_with_padding) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 3; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + for (size_t padding_left = 0; padding_left <= 1; padding_left++) { + for (size_t padding_right = 0; padding_right <= 1; padding_right++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(pool_size + 2) + .padding_left(padding_left) + .padding_right(padding_right) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .TestF16(); + } + } + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_small_1xM_pool_with_tf_same_padding) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 3; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + for (size_t input_width = pool_size; input_width <= pool_size * 2; input_width++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(input_width) + .padding_tf_same(true) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .TestF16(); + } + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_small_1xM_pool_with_stride) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 2; pool_size <= 
xnn_params.f32.maxpool.mr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(pool_size + 4) + .pooling_height(1) + .pooling_width(pool_size) + .stride_width(2) + .channels(channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_small_1xM_pool_with_dilation) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(2 * pool_size + 1) + .padding_left(1) + .padding_right(1) + .pooling_height(1) + .pooling_width(pool_size) + .dilation_width(2) + .channels(channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_small_Mx1_pool) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_small_Mx1_pool_with_padding) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + for (size_t padding_top = 0; padding_top <= 1; padding_top++) { + for (size_t padding_bottom = 0; padding_bottom <= 1; padding_bottom++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(pool_size + 1) + .input_width(3) + .padding_top(padding_top) + .padding_bottom(padding_bottom) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .TestF16(); + } + } + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_small_Mx1_pool_with_tf_same_padding) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + for (size_t input_height = pool_size; input_height <= pool_size * 2; input_height++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(input_height) + .input_width(3) + .padding_tf_same(true) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .TestF16(); + } + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_small_Mx1_pool_with_stride) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(pool_size + 3) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .stride_height(2) + .channels(channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_small_Mx1_pool_with_dilation) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2 * pool_size) + .input_width(3) + .padding_top(1) + .padding_bottom(1) + .pooling_height(pool_size) + 
.pooling_width(1) + .dilation_height(2) + .channels(channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_small_pool_with_input_stride) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .input_pixel_stride(5 * channels) + .TestF16(); + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(pool_size + 2) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .input_pixel_stride(5 * channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_small_pool_with_output_stride) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .output_pixel_stride(5 * channels) + .TestF16(); + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(pool_size + 2) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .output_pixel_stride(5 * channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_small_pool_with_qmin) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .qmin(192) + .TestF16(); + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(pool_size + 2) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .qmin(192) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_small_pool_with_qmax) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .qmax(192) + .TestF16(); + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(pool_size + 2) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .qmax(192) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_large_1xM_pool) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(pool_size + 2) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_large_1xM_pool_with_padding) { + 
ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + for (size_t padding_left = 0; padding_left <= 1; padding_left++) { + for (size_t padding_right = 0; padding_right <= 1; padding_right++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(pool_size + 2) + .padding_left(padding_left) + .padding_right(padding_right) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .TestF16(); + } + } + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_large_1xM_pool_with_tf_same_padding) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + for (size_t input_width = pool_size; input_width <= pool_size * 2; input_width++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(input_width) + .padding_tf_same(true) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .TestF16(); + } + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_large_1xM_pool_with_stride) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(pool_size + 4) + .pooling_height(1) + .pooling_width(pool_size) + .stride_width(2) + .channels(channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_large_1xM_pool_with_dilation) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(2 * pool_size + 1) + .padding_left(1) + .padding_right(1) + .pooling_height(1) + .pooling_width(pool_size) + .dilation_width(2) + .channels(channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_large_Mx1_pool) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_large_Mx1_pool_with_padding) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + for (size_t padding_top = 0; padding_top <= 1; padding_top++) { + for (size_t padding_bottom = 0; padding_bottom <= 1; padding_bottom++) { + MaxPoolingOperatorTester() + 
.batch_size(1) + .input_height(pool_size + 1) + .input_width(3) + .padding_top(padding_top) + .padding_bottom(padding_bottom) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .TestF16(); + } + } + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_large_Mx1_pool_with_tf_same_padding) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + for (size_t input_height = pool_size; input_height <= pool_size * 2; input_height++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(input_height) + .input_width(3) + .padding_tf_same(true) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .TestF16(); + } + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_large_Mx1_pool_with_stride) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(pool_size + 3) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .stride_height(2) + .channels(channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_large_Mx1_pool_with_dilation) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2 * pool_size) + .input_width(3) + .padding_top(1) + .padding_bottom(1) + .pooling_height(pool_size) + .pooling_width(1) + .dilation_height(2) + .channels(channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_large_pool_with_input_stride) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .input_pixel_stride(5 * channels) + .TestF16(); + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(pool_size + 2) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .input_pixel_stride(5 * channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_large_pool_with_output_stride) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .output_pixel_stride(5 * channels) + .TestF16(); + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(pool_size + 2) + .pooling_height(1) + 
.pooling_width(pool_size) + .channels(channels) + .output_pixel_stride(5 * channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_large_pool_with_qmin) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .qmin(192) + .TestF16(); + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(pool_size + 2) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .qmin(192) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, unit_batch_large_pool_with_qmax) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .qmax(192) + .TestF16(); + MaxPoolingOperatorTester() + .batch_size(1) + .input_height(2) + .input_width(pool_size + 2) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .qmax(192) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, small_batch_small_pool) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .TestF16(); + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(2) + .input_width(pool_size + 2) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, small_batch_small_pool_with_input_stride) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .input_pixel_stride(5 * channels) + .TestF16(); + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(2) + .input_width(pool_size + 2) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .input_pixel_stride(5 * channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, small_batch_small_pool_with_output_stride) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = 2; pool_size <= xnn_params.f32.maxpool.mr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .output_pixel_stride(5 * channels) + .TestF16(); + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(2) + 
.input_width(pool_size + 2) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .output_pixel_stride(5 * channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, small_batch_large_pool) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .TestF16(); + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(2) + .input_width(pool_size + 2) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, small_batch_large_pool_with_input_stride) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .input_pixel_stride(5 * channels) + .TestF16(); + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(2) + .input_width(pool_size + 2) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .input_pixel_stride(5 * channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, small_batch_large_pool_with_output_stride) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + for (size_t channels = 1; channels <= 100; channels += 15) { + for (size_t pool_size = xnn_params.f32.maxpool.mr + 1; pool_size <= xnn_params.f32.maxpool.mr + xnn_params.f32.maxpool.qr; pool_size++) { + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(pool_size + 1) + .input_width(3) + .pooling_height(pool_size) + .pooling_width(1) + .channels(channels) + .output_pixel_stride(5 * channels) + .TestF16(); + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(2) + .input_width(pool_size + 2) + .pooling_height(1) + .pooling_width(pool_size) + .channels(channels) + .output_pixel_stride(5 * channels) + .TestF16(); + } + } +} + +TEST(MAX_POOLING_NHWC_F16, setup_increasing_batch) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + MaxPoolingOperatorTester() + .batch_size(3) + .next_batch_size(5) + .input_height(8) + .input_width(8) + .pooling_height(5) + .pooling_width(3) + .channels(24) + .TestSetupF16(); +} + +TEST(MAX_POOLING_NHWC_F16, setup_decreasing_batch) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + MaxPoolingOperatorTester() + .batch_size(5) + .next_batch_size(3) + .input_height(8) + .input_width(8) + .pooling_height(5) + .pooling_width(3) + .channels(24) + .TestSetupF16(); +} + +TEST(MAX_POOLING_NHWC_F16, setup_changing_height) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(8) + .input_width(8) + .next_input_height(9) + .pooling_height(5) + .pooling_width(3) + .channels(24) + .TestSetupF16(); + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(8) + .input_width(8) + .next_input_height(7) + .pooling_height(5) + .pooling_width(3) + 
.channels(24) + .TestSetupF16(); +} + +TEST(MAX_POOLING_NHWC_F16, setup_changing_width) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(8) + .input_width(8) + .next_input_width(9) + .pooling_height(5) + .pooling_width(3) + .channels(24) + .TestSetupF16(); + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(8) + .input_width(8) + .next_input_width(7) + .pooling_height(5) + .pooling_width(3) + .channels(24) + .TestSetupF16(); +} + +TEST(MAX_POOLING_NHWC_F16, setup_swap_height_and_width) { + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + MaxPoolingOperatorTester() + .batch_size(3) + .input_height(9) + .input_width(8) + .next_input_height(8) + .next_input_width(9) + .pooling_height(5) + .pooling_width(3) + .channels(24) + .TestSetupF16(); +} + + TEST(MAX_POOLING_NHWC_F32, unit_batch_small_1xM_pool) { ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); for (size_t channels = 1; channels <= 100; channels += 15) { diff --git a/test/max-pooling-operator-tester.h b/test/max-pooling-operator-tester.h index 349c16f30..106150603 100644 --- a/test/max-pooling-operator-tester.h +++ b/test/max-pooling-operator-tester.h @@ -10,6 +10,8 @@ #include <gtest/gtest.h> +#include <fp16.h> + #include <algorithm> #include <cassert> #include <cstddef> @@ -615,6 +617,119 @@ class MaxPoolingOperatorTester { } } + void TestF16() const { + std::random_device random_device; + auto rng = std::mt19937(random_device()); + // Note: we need to avoid FP16 denormals in the generated tensor because they might be processed differently in + // native vs emulated arithmetics, and we use exact comparison to verify the results against reference. + auto f32rng = std::bind(std::uniform_real_distribution<float>(0.001f, 1.0f), rng); + auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng); + + std::vector<uint16_t> input((batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + channels() + XNN_EXTRA_BYTES / sizeof(uint16_t)); + std::vector<uint16_t> output((batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + channels() + XNN_EXTRA_BYTES / sizeof(uint16_t)); + std::vector<float> output_ref(batch_size() * output_height() * output_width() * channels()); + for (size_t iteration = 0; iteration < iterations(); iteration++) { + std::generate(input.begin(), input.end(), std::ref(f16rng)); + std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */); + + // Compute reference results, without clamping. + for (size_t i = 0; i < batch_size(); i++) { + for (size_t oy = 0; oy < output_height(); oy++) { + for (size_t ox = 0; ox < output_width(); ox++) { + for (size_t c = 0; c < channels(); c++) { + float max_value = -std::numeric_limits<float>::infinity(); + for (size_t py = 0; py < pooling_height(); py++) { + const size_t iy = oy * stride_height() + py * dilation_height() - padding_top(); + for (size_t px = 0; px < pooling_width(); px++) { + const size_t ix = ox * stride_width() + px * dilation_width() - padding_left(); + if (ix < input_width() && iy < input_height()) { + max_value = std::max(max_value, + fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + c])); + } + } + } + output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] = max_value; + } + } + } + } + + // Compute clamping parameters. 
+ const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend()); + const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend()); + const float accumulated_range = accumulated_max - accumulated_min; + float output_min = accumulated_min + accumulated_range / 255.0f * float(qmin()); + float output_max = accumulated_max - accumulated_range / 255.0f * float(255 - qmax()); + output_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_min)); + output_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_max)); + if (accumulated_range == 0.0f) { + output_min = -std::numeric_limits<float>::infinity(); + output_max = +std::numeric_limits<float>::infinity(); + } + if (qmin() == std::numeric_limits<uint8_t>::min()) { + output_min = -std::numeric_limits<float>::infinity(); + } + if (qmax() == std::numeric_limits<uint8_t>::max()) { + output_max = +std::numeric_limits<float>::infinity(); + } + + // Clamp reference results. + for (float& value : output_ref) { + value = std::max(std::min(value, output_max), output_min); + } + + // Create, setup, run, and destroy Max Pooling operator. + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + xnn_operator_t max_pooling_op = nullptr; + + const xnn_status status = xnn_create_max_pooling2d_nhwc_f16( + padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(), + padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(), + pooling_height(), pooling_width(), + stride_height(), stride_width(), + dilation_height(), dilation_width(), + channels(), input_pixel_stride(), output_pixel_stride(), + output_min, output_max, + padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0, + &max_pooling_op); + if (status == xnn_status_unsupported_hardware) { + GTEST_SKIP(); + } + ASSERT_EQ(xnn_status_success, status); + ASSERT_NE(nullptr, max_pooling_op); + + // Smart pointer to automatically delete max_pooling_op. + std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_max_pooling_op(max_pooling_op, xnn_delete_operator); + + ASSERT_EQ(xnn_status_success, + xnn_setup_max_pooling2d_nhwc_f16( + max_pooling_op, + batch_size(), input_height(), input_width(), + input.data(), output.data(), + nullptr /* thread pool */)); + + ASSERT_EQ(xnn_status_success, + xnn_run_operator(max_pooling_op, nullptr /* thread pool */)); + + // Verify results. 
+ for (size_t i = 0; i < batch_size(); i++) { + for (size_t y = 0; y < output_height(); y++) { + for (size_t x = 0; x < output_width(); x++) { + for (size_t c = 0; c < channels(); c++) { + ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), output_max); + ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), output_min); + ASSERT_EQ( + fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), + output_ref[((i * output_height() + y) * output_width() + x) * channels() + c]) << + "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c + << ", min = " << output_min << ", max = " << output_max; + } + } + } + } + } + } + void TestF32() const { std::random_device random_device; auto rng = std::mt19937(random_device()); @@ -995,6 +1110,178 @@ class MaxPoolingOperatorTester { } } + void TestSetupF16() const { + std::random_device random_device; + auto rng = std::mt19937(random_device()); + // Note: we need to avoid FP16 denormals in the generated tensor because they might be processed differently in + // native vs emulated arithmetics, and we use exact comparison to verify the results against reference. + auto f32rng = std::bind(std::uniform_real_distribution<float>(0.001f, 1.0f), rng); + auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng); + + std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) + std::max( + (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + channels(), + (next_batch_size() * next_input_height() * next_input_width() - 1) * input_pixel_stride() + channels())); + std::vector<uint16_t> output(XNN_EXTRA_BYTES / sizeof(uint16_t) + std::max( + (batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + channels(), + (next_batch_size() * next_output_height() * next_output_width() - 1) * output_pixel_stride() + channels())); + std::vector<float> output_ref(batch_size() * output_height() * output_width() * channels()); + std::vector<float> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * channels()); + for (size_t iteration = 0; iteration < iterations(); iteration++) { + std::generate(input.begin(), input.end(), std::ref(f16rng)); + std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */); + + // Compute reference results, without clamping. + for (size_t i = 0; i < batch_size(); i++) { + for (size_t oy = 0; oy < output_height(); oy++) { + for (size_t ox = 0; ox < output_width(); ox++) { + for (size_t c = 0; c < channels(); c++) { + float max_value = -std::numeric_limits<float>::infinity(); + for (size_t py = 0; py < pooling_height(); py++) { + const size_t iy = oy * stride_height() + py * dilation_height() - padding_top(); + for (size_t px = 0; px < pooling_width(); px++) { + const size_t ix = ox * stride_width() + px * dilation_width() - padding_left(); + if (ix < input_width() && iy < input_height()) { + max_value = std::max(max_value, + fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + c])); + } + } + } + output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] = max_value; + } + } + } + } + + // Compute clamping parameters. 
+ const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend()); + const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend()); + const float accumulated_range = accumulated_max - accumulated_min; + float output_min = accumulated_min + accumulated_range / 255.0f * float(qmin()); + float output_max = accumulated_max - accumulated_range / 255.0f * float(255 - qmax()); + output_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_min)); + output_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_max)); + if (accumulated_range == 0.0f) { + output_min = -std::numeric_limits<float>::infinity(); + output_max = +std::numeric_limits<float>::infinity(); + } + if (qmin() == std::numeric_limits<uint8_t>::min()) { + output_min = -std::numeric_limits<float>::infinity(); + } + if (qmax() == std::numeric_limits<uint8_t>::max()) { + output_max = +std::numeric_limits<float>::infinity(); + } + + // Clamp reference results. + for (float& value : output_ref) { + value = std::max(std::min(value, output_max), output_min); + } + + // Create, setup, and run Max Pooling operator once. + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + xnn_operator_t max_pooling_op = nullptr; + + const xnn_status status = xnn_create_max_pooling2d_nhwc_f16( + padding_top(), padding_right(), padding_bottom(), padding_left(), + pooling_height(), pooling_width(), + stride_height(), stride_width(), + dilation_height(), dilation_width(), + channels(), input_pixel_stride(), output_pixel_stride(), + output_min, output_max, + 0, &max_pooling_op); + if (status == xnn_status_unsupported_hardware) { + GTEST_SKIP(); + } + ASSERT_EQ(xnn_status_success, status); + ASSERT_NE(nullptr, max_pooling_op); + + // Smart pointer to automatically delete max_pooling_op. + std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_max_pooling_op(max_pooling_op, xnn_delete_operator); + + ASSERT_EQ(xnn_status_success, + xnn_setup_max_pooling2d_nhwc_f16( + max_pooling_op, + batch_size(), input_height(), input_width(), + input.data(), output.data(), + nullptr /* thread pool */)); + + ASSERT_EQ(xnn_status_success, + xnn_run_operator(max_pooling_op, nullptr /* thread pool */)); + + // Verify results of the first run. + for (size_t i = 0; i < batch_size(); i++) { + for (size_t y = 0; y < output_height(); y++) { + for (size_t x = 0; x < output_width(); x++) { + for (size_t c = 0; c < channels(); c++) { + ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), output_max); + ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), output_min); + ASSERT_EQ( + fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), + output_ref[((i * output_height() + y) * output_width() + x) * channels() + c]) << + "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c + << ", min = " << output_min << ", max = " << output_max; + } + } + } + } + + // Re-generate data for the second run. + std::generate(input.begin(), input.end(), std::ref(f16rng)); + std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */); + + // Compute reference results for the second run, including clamping. 
+ for (size_t i = 0; i < next_batch_size(); i++) { + for (size_t oy = 0; oy < next_output_height(); oy++) { + for (size_t ox = 0; ox < next_output_width(); ox++) { + for (size_t c = 0; c < channels(); c++) { + float max_value = -std::numeric_limits<float>::infinity(); + for (size_t py = 0; py < pooling_height(); py++) { + const size_t iy = oy * stride_height() + py * dilation_height() - padding_top(); + for (size_t px = 0; px < pooling_width(); px++) { + const size_t ix = ox * stride_width() + px * dilation_width() - padding_left(); + if (ix < next_input_width() && iy < next_input_height()) { + max_value = std::max(max_value, + fp16_ieee_to_fp32_value(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_pixel_stride() + c])); + } + } + } + max_value = std::min(max_value, output_max); + max_value = std::max(max_value, output_min); + next_output_ref[((i * next_output_height() + oy) * next_output_width() + ox) * channels() + c] = max_value; + } + } + } + } + + // Setup and run Max Pooling operator the second time, and destroy the operator. + ASSERT_EQ(xnn_status_success, + xnn_setup_max_pooling2d_nhwc_f16( + max_pooling_op, + next_batch_size(), next_input_height(), next_input_width(), + input.data(), output.data(), + nullptr /* thread pool */)); + + ASSERT_EQ(xnn_status_success, + xnn_run_operator(max_pooling_op, nullptr /* thread pool */)); + + // Verify results of the second run. + for (size_t i = 0; i < next_batch_size(); i++) { + for (size_t y = 0; y < next_output_height(); y++) { + for (size_t x = 0; x < next_output_width(); x++) { + for (size_t c = 0; c < channels(); c++) { + ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c]), output_max); + ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c]), output_min); + ASSERT_EQ( + fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c]), + next_output_ref[((i * next_output_height() + y) * next_output_width() + x) * channels() + c]) << + "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c + << ", min = " << output_min << ", max = " << output_max; + } + } + } + } + } + } + void TestSetupF32() const { std::random_device random_device; auto rng = std::mt19937(random_device()); |
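
A detail shared by `xnn_create_max_pooling2d_nhwc_f16` and the tester above: caller-supplied `float` clamp bounds are round-tripped through FP16 (`fp16_ieee_from_fp32_value` / `fp16_ieee_to_fp32_value`) before validation and before computing reference results, so the operator and the exact-comparison tests agree on the representable bound values. A standalone illustration, assuming the same `<fp16.h>` header the commit includes:

```c
#include <stdint.h>
#include <stdio.h>

#include <fp16.h>  // fp16_ieee_from_fp32_value, fp16_ieee_to_fp32_value

int main(void) {
  // 0.1f is not exactly representable in FP16: the operator validates and
  // applies the rounded bound, not the float the caller passed.
  const float requested_min = 0.1f;
  const uint16_t min_as_half = fp16_ieee_from_fp32_value(requested_min);
  const float effective_min = fp16_ieee_to_fp32_value(min_as_half);
  printf("requested %.9g -> effective %.9g\n",
         (double) requested_min, (double) effective_min);
  return 0;
}
```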