author    Jenkins <bsgcomp@arm.com>  2022-02-26 12:23:41 +0000
committer Jenkins <bsgcomp@arm.com>  2022-02-26 12:23:41 +0000
commit    8f587de9214dbc3aee4ff4eeb2ede66747769b19 (patch)
tree      51dddcedfbc88d417908b9d129a4f59ac7a0efa8
parent    91ee4d0a9ef128b16936921470a0e3ffef347536 (diff)
download  ARMComputeLibrary-upstream-master.tar.gz

Compute Library v22.02  (upstream-master)
-rw-r--r--Android.bp102
-rw-r--r--LICENSE2
-rw-r--r--README.md27
-rw-r--r--SConscript171
-rw-r--r--SConstruct118
-rw-r--r--arm_compute/core/CL/OpenCL.h2
-rw-r--r--arm_compute/core/CPP/CPPTypes.h13
-rw-r--r--arm_compute/core/CPP/ICPPKernel.h3
-rw-r--r--arm_compute/core/Error.h8
-rw-r--r--arm_compute/core/GPUTarget.h4
-rw-r--r--arm_compute/core/KernelDescriptors.h14
-rw-r--r--arm_compute/core/Size3D.h2
-rw-r--r--arm_compute/core/Types.h2
-rw-r--r--arm_compute/core/Utils.h60
-rw-r--r--arm_compute/graph/DataLayerVisitor.h1
-rw-r--r--arm_compute/graph/INodeVisitor.h6
-rw-r--r--arm_compute/graph/TypePrinter.h3
-rw-r--r--arm_compute/graph/Types.h1
-rw-r--r--arm_compute/graph/backends/FunctionHelpers.h87
-rw-r--r--arm_compute/graph/backends/FusedConvolutionBatchNormalizationWithPostOpsFunction.h136
-rw-r--r--arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h4
-rw-r--r--arm_compute/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.h127
-rw-r--r--arm_compute/graph/nodes/Nodes.h1
-rw-r--r--arm_compute/graph/nodes/NodesFwd.h1
-rw-r--r--arm_compute/graph/printers/DotGraphPrinter.h1
-rw-r--r--arm_compute/runtime/CL/CLFunctions.h3
-rw-r--r--arm_compute/runtime/CL/functions/CLPoolingLayer.h3
-rw-r--r--arm_compute/runtime/CL/functions/CLRemap.h122
-rw-r--r--arm_compute/runtime/IScheduler.h15
-rw-r--r--arm_compute/runtime/NEON/NEFunctions.h3
-rw-r--r--arm_compute/runtime/NEON/functions/NEPoolingLayer.h2
-rw-r--r--arm_compute/runtime/NEON/functions/NERemap.h83
-rw-r--r--docs/Doxyfile20
-rw-r--r--docs/contributor_guide/contribution_guidelines.dox11
-rw-r--r--docs/contributor_guide/non_inclusive_language_examples.dox4
-rw-r--r--docs/user_guide/errata.dox20
-rw-r--r--docs/user_guide/introduction.dox6
-rw-r--r--docs/user_guide/library.dox4
-rw-r--r--docs/user_guide/release_version_and_change_log.dox46
-rw-r--r--examples/graph_resnext50.cpp2
-rw-r--r--examples/neon_copy_objects.cpp2
-rw-r--r--filedefs.json40
-rw-r--r--filelist.json574
-rw-r--r--include/CL/cl.h46
-rw-r--r--include/CL/cl_d3d10.h12
-rw-r--r--include/CL/cl_d3d11.h12
-rw-r--r--include/CL/cl_dx9_media_sharing.h119
-rw-r--r--include/CL/cl_dx9_media_sharing_intel.h156
-rw-r--r--include/CL/cl_egl.h8
-rw-r--r--include/CL/cl_ext.h902
-rw-r--r--include/CL/cl_ext_intel.h716
-rw-r--r--include/CL/cl_gl.h22
-rw-r--r--include/CL/cl_gl_ext.h26
-rw-r--r--include/CL/cl_icd.h316
-rw-r--r--include/CL/cl_platform.h126
-rw-r--r--include/CL/cl_va_api_media_sharing_intel.h48
-rw-r--r--include/CL/opencl.h3
-rw-r--r--include/CL/opencl.hpp (renamed from include/CL/cl2.hpp)1561
-rw-r--r--src/common/cpuinfo/CpuInfo.cpp50
-rw-r--r--src/common/cpuinfo/CpuModel.h4
-rw-r--r--src/core/CL/CLKernels.h3
-rw-r--r--src/core/CL/CLUtils.cpp2
-rw-r--r--src/core/CL/ICLKernel.cpp52
-rw-r--r--src/core/CL/ICLKernel.h35
-rw-r--r--src/core/CL/OpenCL.cpp2
-rw-r--r--src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl13
-rw-r--r--src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl48
-rw-r--r--src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl142
-rw-r--r--src/core/CL/cl_kernels/common/gemm.cl1089
-rw-r--r--src/core/CL/cl_kernels/common/gemm_utils.cl458
-rw-r--r--src/core/CL/cl_kernels/helpers.h8
-rw-r--r--src/core/CL/cl_kernels/nchw/direct_convolution.cl147
-rw-r--r--src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl316
-rw-r--r--src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl291
-rw-r--r--src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl313
-rw-r--r--src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl308
-rw-r--r--src/core/CL/cl_kernels/nchw/remap.cl133
-rw-r--r--src/core/CL/cl_kernels/nhwc/direct_convolution.cl22
-rw-r--r--src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl55
-rw-r--r--src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl51
-rw-r--r--src/core/CL/cl_kernels/nhwc/im2col.cl38
-rw-r--r--src/core/CL/cl_kernels/nhwc/remap.cl180
-rw-r--r--src/core/CL/cl_kernels/nhwc/scale.cl196
-rw-r--r--src/core/CL/cl_kernels/tile_helpers.h86
-rw-r--r--src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp10
-rw-r--r--src/core/CL/kernels/CLRemapKernel.cpp183
-rw-r--r--src/core/CL/kernels/CLRemapKernel.h88
-rw-r--r--src/core/CPP/CPPTypes.cpp8
-rw-r--r--src/core/GPUTarget.cpp22
-rw-r--r--src/core/NEON/NEKernels.h3
-rw-r--r--src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp200
-rw-r--r--src/core/NEON/kernels/NEBoundingBoxTransformKernel.h5
-rw-r--r--src/core/NEON/kernels/NECropKernel.cpp215
-rw-r--r--src/core/NEON/kernels/NECropKernel.h4
-rw-r--r--src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp154
-rw-r--r--src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h5
-rw-r--r--src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp190
-rw-r--r--src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h15
-rw-r--r--src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp109
-rw-r--r--src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h9
-rw-r--r--src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp150
-rw-r--r--src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h4
-rw-r--r--src/core/NEON/kernels/NEPadLayerKernel.cpp9
-rw-r--r--src/core/NEON/kernels/NEROIAlignLayerKernel.cpp382
-rw-r--r--src/core/NEON/kernels/NEROIAlignLayerKernel.h5
-rw-r--r--src/core/NEON/kernels/NERangeKernel.cpp152
-rw-r--r--src/core/NEON/kernels/NERangeKernel.h11
-rw-r--r--src/core/NEON/kernels/NERemapKernel.cpp326
-rw-r--r--src/core/NEON/kernels/NERemapKernel.h86
-rw-r--r--src/core/NEON/kernels/NESelectKernel.cpp260
-rw-r--r--src/core/NEON/kernels/NESelectKernel.h22
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp9
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp24
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp12
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp275
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp13
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp275
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp12
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp275
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/transform.cpp2
-rw-r--r--src/core/NEON/kernels/assembly/depthwise.hpp14
-rw-r--r--src/core/Utils.cpp19
-rw-r--r--src/core/common/Registrars.h42
-rw-r--r--src/cpu/CpuContext.cpp9
-rw-r--r--src/cpu/ICpuKernel.h38
-rw-r--r--src/cpu/kernels/CpuActivationKernel.cpp100
-rw-r--r--src/cpu/kernels/CpuActivationKernel.h17
-rw-r--r--src/cpu/kernels/CpuAddKernel.cpp144
-rw-r--r--src/cpu/kernels/CpuAddKernel.h17
-rw-r--r--src/cpu/kernels/CpuCastKernel.h4
-rw-r--r--src/cpu/kernels/CpuCol2ImKernel.h4
-rw-r--r--src/cpu/kernels/CpuConcatenateBatchKernel.h4
-rw-r--r--src/cpu/kernels/CpuConcatenateDepthKernel.h4
-rw-r--r--src/cpu/kernels/CpuConcatenateHeightKernel.h4
-rw-r--r--src/cpu/kernels/CpuConcatenateWidthKernel.h4
-rw-r--r--src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h4
-rw-r--r--src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h4
-rw-r--r--src/cpu/kernels/CpuCopyKernel.h4
-rw-r--r--src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h4
-rw-r--r--src/cpu/kernels/CpuDequantizeKernel.h4
-rw-r--r--src/cpu/kernels/CpuDirectConv2dKernel.cpp1084
-rw-r--r--src/cpu/kernels/CpuDirectConv2dKernel.h13
-rw-r--r--src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h4
-rw-r--r--src/cpu/kernels/CpuDirectConv3dKernel.cpp56
-rw-r--r--src/cpu/kernels/CpuDirectConv3dKernel.h22
-rw-r--r--src/cpu/kernels/CpuElementwiseKernel.cpp96
-rw-r--r--src/cpu/kernels/CpuElementwiseKernel.h4
-rw-r--r--src/cpu/kernels/CpuElementwiseUnaryKernel.cpp73
-rw-r--r--src/cpu/kernels/CpuElementwiseUnaryKernel.h20
-rw-r--r--src/cpu/kernels/CpuFillKernel.h4
-rw-r--r--src/cpu/kernels/CpuFloorKernel.cpp53
-rw-r--r--src/cpu/kernels/CpuFloorKernel.h21
-rw-r--r--src/cpu/kernels/CpuGemmInterleave4x4Kernel.h4
-rw-r--r--src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h4
-rw-r--r--src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h6
-rw-r--r--src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h4
-rw-r--r--src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h4
-rw-r--r--src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h4
-rw-r--r--src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h4
-rw-r--r--src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h4
-rw-r--r--src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h4
-rw-r--r--src/cpu/kernels/CpuGemmMatrixAdditionKernel.h4
-rw-r--r--src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h4
-rw-r--r--src/cpu/kernels/CpuGemmTranspose1xWKernel.h4
-rw-r--r--src/cpu/kernels/CpuIm2ColKernel.cpp7
-rw-r--r--src/cpu/kernels/CpuIm2ColKernel.h4
-rw-r--r--src/cpu/kernels/CpuKernelSelectionTypes.h60
-rw-r--r--src/cpu/kernels/CpuMulKernel.h6
-rw-r--r--src/cpu/kernels/CpuPermuteKernel.h4
-rw-r--r--src/cpu/kernels/CpuPool2dKernel.cpp87
-rw-r--r--src/cpu/kernels/CpuPool2dKernel.h17
-rw-r--r--src/cpu/kernels/CpuQuantizeKernel.h4
-rw-r--r--src/cpu/kernels/CpuReshapeKernel.cpp7
-rw-r--r--src/cpu/kernels/CpuReshapeKernel.h4
-rw-r--r--src/cpu/kernels/CpuScaleKernel.cpp72
-rw-r--r--src/cpu/kernels/CpuScaleKernel.h24
-rw-r--r--src/cpu/kernels/CpuSoftmaxKernel.cpp275
-rw-r--r--src/cpu/kernels/CpuSoftmaxKernel.h38
-rw-r--r--src/cpu/kernels/CpuSubKernel.cpp64
-rw-r--r--src/cpu/kernels/CpuSubKernel.h17
-rw-r--r--src/cpu/kernels/CpuTransposeKernel.h4
-rw-r--r--src/cpu/kernels/CpuWeightsReshapeKernel.h4
-rw-r--r--src/cpu/kernels/CpuWinogradConv2dKernel.h8
-rw-r--r--src/cpu/kernels/activation/generic/neon/fp16.cpp43
-rw-r--r--src/cpu/kernels/activation/generic/neon/fp32.cpp39
-rw-r--r--src/cpu/kernels/activation/generic/neon/impl.h (renamed from src/cpu/kernels/activation/neon/fp32.cpp)86
-rw-r--r--src/cpu/kernels/activation/generic/neon/qasymm8.cpp (renamed from src/cpu/kernels/activation/neon/qasymm8.cpp)2
-rw-r--r--src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp (renamed from src/cpu/kernels/activation/neon/qasymm8_signed.cpp)2
-rw-r--r--src/cpu/kernels/activation/generic/neon/qsymm16.cpp (renamed from src/cpu/kernels/activation/neon/qsymm16.cpp)2
-rw-r--r--src/cpu/kernels/activation/generic/sve/fp16.cpp (renamed from src/cpu/kernels/activation/sve/fp16.cpp)2
-rw-r--r--src/cpu/kernels/activation/generic/sve/fp32.cpp (renamed from src/cpu/kernels/activation/sve/fp32.cpp)2
-rw-r--r--src/cpu/kernels/activation/generic/sve2/qasymm8.cpp (renamed from src/cpu/kernels/activation/sve/qasymm8.cpp)2
-rw-r--r--src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp (renamed from src/cpu/kernels/activation/sve/qasymm8_signed.cpp)2
-rw-r--r--src/cpu/kernels/activation/generic/sve2/qsymm16.cpp (renamed from src/cpu/kernels/activation/sve/qsymm16.cpp)2
-rw-r--r--src/cpu/kernels/activation/list.h20
-rw-r--r--src/cpu/kernels/activation/neon/fp16.cpp217
-rw-r--r--src/cpu/kernels/add/generic/neon/fp16.cpp38
-rw-r--r--src/cpu/kernels/add/generic/neon/fp32.cpp36
-rw-r--r--src/cpu/kernels/add/generic/neon/impl.cpp (renamed from src/cpu/kernels/add/neon/list.h)26
-rw-r--r--src/cpu/kernels/add/generic/neon/impl.h37
-rw-r--r--src/cpu/kernels/add/generic/neon/integer.cpp46
-rw-r--r--src/cpu/kernels/add/generic/neon/qasymm8.cpp (renamed from src/cpu/kernels/add/neon/qasymm8.cpp)0
-rw-r--r--src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp (renamed from src/cpu/kernels/add/neon/qasymm8_signed.cpp)0
-rw-r--r--src/cpu/kernels/add/generic/neon/qsymm16.cpp (renamed from src/cpu/kernels/add/neon/qsymm16.cpp)0
-rw-r--r--src/cpu/kernels/add/generic/sve/fp16.cpp40
-rw-r--r--src/cpu/kernels/add/generic/sve/fp32.cpp39
-rw-r--r--src/cpu/kernels/add/generic/sve/impl.cpp (renamed from src/cpu/kernels/add/sve/impl.cpp)19
-rw-r--r--src/cpu/kernels/add/generic/sve/impl.h (renamed from src/cpu/kernels/add/sve/impl.h)12
-rw-r--r--src/cpu/kernels/add/generic/sve/integer.cpp49
-rw-r--r--src/cpu/kernels/add/generic/sve2/qasymm8.cpp (renamed from src/cpu/kernels/add/sve/qasymm8.cpp)6
-rw-r--r--src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp (renamed from src/cpu/kernels/add/sve/qasymm8_signed.cpp)6
-rw-r--r--src/cpu/kernels/add/generic/sve2/qsymm16.cpp (renamed from src/cpu/kernels/add/sve/qsymm16.cpp)6
-rw-r--r--src/cpu/kernels/add/list.h (renamed from src/cpu/kernels/add/sve/list.h)35
-rw-r--r--src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h9
-rw-r--r--src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp36
-rw-r--r--src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp34
-rw-r--r--src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp146
-rw-r--r--src/cpu/kernels/boundingboxtransform/generic/neon/impl.h38
-rw-r--r--src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp34
-rw-r--r--src/cpu/kernels/boundingboxtransform/list.h38
-rw-r--r--src/cpu/kernels/conv3d/neon/list.h2
-rw-r--r--src/cpu/kernels/conv3d/neon/quantized.h2
-rw-r--r--src/cpu/kernels/crop/generic/neon/crop_helper.h86
-rw-r--r--src/cpu/kernels/crop/generic/neon/fp16.cpp40
-rw-r--r--src/cpu/kernels/crop/generic/neon/fp32.cpp38
-rw-r--r--src/cpu/kernels/crop/generic/neon/impl.cpp127
-rw-r--r--src/cpu/kernels/crop/generic/neon/impl.h41
-rw-r--r--src/cpu/kernels/crop/generic/neon/integer.cpp74
-rw-r--r--src/cpu/kernels/crop/generic/neon/list.h54
-rw-r--r--src/cpu/kernels/elementwise/neon/elementwise_list.h486
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp61
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp58
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/neon/impl.h (renamed from src/cpu/kernels/elementwise/neon/elementwise_quantized_list.h)517
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp95
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp59
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp60
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp61
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp60
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp (renamed from src/cpu/kernels/elementwise/sve/elementwise.cpp)146
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/sve/impl.h (renamed from src/cpu/kernels/elementwise/sve/elementwise_list.h)10
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp97
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/sve2/impl.h (renamed from src/cpu/kernels/elementwise/sve/elementwise_quantized_list.h)15
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp61
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp61
-rw-r--r--src/cpu/kernels/elementwise_binary/list.h72
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp38
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp36
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/neon/impl.h (renamed from src/cpu/kernels/elementwise/neon/elementwise_unary_list.h)2
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp36
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp38
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp38
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp (renamed from src/cpu/kernels/elementwise/sve/elementwise_unary.cpp)12
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/sve/impl.h (renamed from src/cpu/kernels/elementwise/sve/elementwise_unary_list.h)5
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp38
-rw-r--r--src/cpu/kernels/elementwise_unary/list.h48
-rw-r--r--src/cpu/kernels/genproposals/generic/neon/fp16.cpp36
-rw-r--r--src/cpu/kernels/genproposals/generic/neon/fp32.cpp34
-rw-r--r--src/cpu/kernels/genproposals/generic/neon/impl.cpp100
-rw-r--r--src/cpu/kernels/genproposals/generic/neon/impl.h41
-rw-r--r--src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp34
-rw-r--r--src/cpu/kernels/genproposals/list.h40
-rw-r--r--src/cpu/kernels/instancenorm/generic/neon/fp16.cpp43
-rw-r--r--src/cpu/kernels/instancenorm/generic/neon/fp32.cpp35
-rw-r--r--src/cpu/kernels/instancenorm/generic/neon/impl.cpp172
-rw-r--r--src/cpu/kernels/instancenorm/generic/neon/impl.h50
-rw-r--r--src/cpu/kernels/instancenorm/list.h37
-rw-r--r--src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp37
-rw-r--r--src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h6
-rw-r--r--src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp21
-rw-r--r--src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h5
-rw-r--r--src/cpu/kernels/maxunpool/generic/neon/fp16.cpp36
-rw-r--r--src/cpu/kernels/maxunpool/generic/neon/fp32.cpp34
-rw-r--r--src/cpu/kernels/maxunpool/generic/neon/impl.cpp54
-rw-r--r--src/cpu/kernels/maxunpool/generic/neon/impl.h39
-rw-r--r--src/cpu/kernels/maxunpool/generic/neon/qasymm8.cpp34
-rw-r--r--src/cpu/kernels/maxunpool/generic/neon/qasymm8_signed.cpp34
-rw-r--r--src/cpu/kernels/maxunpool/list.h39
-rw-r--r--src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp37
-rw-r--r--src/cpu/kernels/meanstddevnorm/generic/neon/fp32.cpp35
-rw-r--r--src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp110
-rw-r--r--src/cpu/kernels/meanstddevnorm/generic/neon/impl.h38
-rw-r--r--src/cpu/kernels/meanstddevnorm/list.h39
-rw-r--r--src/cpu/kernels/pool2d/neon/fp16.cpp4
-rw-r--r--src/cpu/kernels/pool2d/neon/fp32.cpp4
-rw-r--r--src/cpu/kernels/pool2d/neon/nchw/all.cpp48
-rw-r--r--src/cpu/kernels/range/generic/neon/fp16.cpp41
-rw-r--r--src/cpu/kernels/range/generic/neon/fp32.cpp39
-rw-r--r--src/cpu/kernels/range/generic/neon/impl.cpp93
-rw-r--r--src/cpu/kernels/range/generic/neon/impl.h (renamed from tests/validation/reference/Remap.h)26
-rw-r--r--src/cpu/kernels/range/generic/neon/integer.cpp64
-rw-r--r--src/cpu/kernels/range/list.h47
-rw-r--r--src/cpu/kernels/roialign/generic/neon/fp16.cpp36
-rw-r--r--src/cpu/kernels/roialign/generic/neon/fp32.cpp34
-rw-r--r--src/cpu/kernels/roialign/generic/neon/impl.cpp316
-rw-r--r--src/cpu/kernels/roialign/generic/neon/impl.h40
-rw-r--r--src/cpu/kernels/roialign/generic/neon/qasymm8.cpp34
-rw-r--r--src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp34
-rw-r--r--src/cpu/kernels/roialign/list.h40
-rw-r--r--src/cpu/kernels/select/generic/neon/fp16.cpp48
-rw-r--r--src/cpu/kernels/select/generic/neon/fp32.cpp45
-rw-r--r--src/cpu/kernels/select/generic/neon/impl.cpp191
-rw-r--r--src/cpu/kernels/select/generic/neon/impl.h54
-rw-r--r--src/cpu/kernels/select/generic/neon/integer.cpp87
-rw-r--r--src/cpu/kernels/select/list.h58
-rw-r--r--src/cpu/kernels/softmax/generic/neon/fp16.cpp44
-rw-r--r--src/cpu/kernels/softmax/generic/neon/fp32.cpp42
-rw-r--r--src/cpu/kernels/softmax/generic/neon/impl.h (renamed from src/cpu/kernels/softmax/impl/neon/list.h)9
-rw-r--r--src/cpu/kernels/softmax/generic/neon/qasymm8.cpp42
-rw-r--r--src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp42
-rw-r--r--src/cpu/kernels/softmax/generic/sve/fp16.cpp43
-rw-r--r--src/cpu/kernels/softmax/generic/sve/fp32.cpp44
-rw-r--r--src/cpu/kernels/softmax/generic/sve/impl.cpp (renamed from src/cpu/kernels/softmax/impl/sve/impl.cpp)10
-rw-r--r--src/cpu/kernels/softmax/generic/sve/impl.h43
-rw-r--r--src/cpu/kernels/softmax/generic/sve/qasymm8.cpp38
-rw-r--r--src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp38
-rw-r--r--src/cpu/kernels/softmax/generic/sve2/impl.h (renamed from src/cpu/kernels/softmax/impl/sve/list.h)29
-rw-r--r--src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp39
-rw-r--r--src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp39
-rw-r--r--src/cpu/kernels/softmax/list.h62
-rw-r--r--src/cpu/operators/CpuConcatenate.h4
-rw-r--r--src/cpu/operators/CpuSoftmax.h6
-rw-r--r--src/gpu/cl/ClKernelLibrary.cpp50
-rw-r--r--src/gpu/cl/kernels/ClCastKernel.cpp4
-rw-r--r--src/gpu/cl/kernels/ClDirectConv2dKernel.cpp356
-rw-r--r--src/gpu/cl/kernels/ClDirectConv2dKernel.h4
-rw-r--r--src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp23
-rw-r--r--src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h3
-rw-r--r--src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp22
-rw-r--r--src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h4
-rw-r--r--src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp17
-rw-r--r--src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h5
-rw-r--r--src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp64
-rw-r--r--src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h3
-rw-r--r--src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp12
-rw-r--r--src/gpu/cl/kernels/ClIm2ColKernel.cpp6
-rw-r--r--src/gpu/cl/kernels/ClPool2dKernel.cpp3
-rw-r--r--src/gpu/cl/kernels/ClScaleKernel.cpp22
-rw-r--r--src/gpu/cl/operators/ClConv2d.cpp34
-rw-r--r--src/gpu/cl/operators/ClDirectConv2d.cpp2
-rw-r--r--src/gpu/cl/operators/ClGemm.cpp10
-rw-r--r--src/gpu/cl/operators/ClGemm.h1
-rw-r--r--src/graph/DataLayerVisitor.cpp8
-rw-r--r--src/graph/INodeVisitor.cpp6
-rw-r--r--src/graph/backends/CL/CLFunctionsFactory.cpp2
-rw-r--r--src/graph/mutators/NodeFusionMutator.cpp279
-rw-r--r--src/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.cpp138
-rw-r--r--src/graph/printers/DotGraphPrinter.cpp8
-rw-r--r--src/runtime/CL/functions/CLRemap.cpp68
-rw-r--r--src/runtime/CPP/CPPScheduler.cpp4
-rw-r--r--src/runtime/IScheduler.cpp37
-rw-r--r--src/runtime/NEON/functions/NEPadLayer.cpp5
-rw-r--r--src/runtime/NEON/functions/NERemap.cpp55
-rw-r--r--support/StringSupport.h14
-rw-r--r--support/ToolchainSupport.h31
-rw-r--r--tests/framework/Framework.cpp2
-rw-r--r--tests/framework/Framework.h6
-rw-r--r--tests/framework/instruments/Instruments.h4
-rw-r--r--tests/main.cpp2
-rw-r--r--tests/validation/CL/Cast.cpp8
-rw-r--r--tests/validation/CL/DepthConvertLayer.cpp4
-rw-r--r--tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp2
-rw-r--r--tests/validation/CL/GEMMReshapeLHSMatrix.cpp39
-rw-r--r--tests/validation/CL/Remap.cpp138
-rw-r--r--tests/validation/CL/UNIT/Multithreaded.cpp113
-rw-r--r--tests/validation/NEON/ActivationLayer.cpp41
-rw-r--r--tests/validation/NEON/ArithmeticAddition.cpp47
-rw-r--r--tests/validation/NEON/DirectConvolutionLayer.cpp38
-rw-r--r--tests/validation/NEON/Floor.cpp28
-rw-r--r--tests/validation/NEON/PoolingLayer.cpp28
-rw-r--r--tests/validation/NEON/Remap.cpp77
-rw-r--r--tests/validation/UNIT/GPUTarget.cpp4
-rw-r--r--tests/validation/Validation.h25
-rw-r--r--tests/validation/fixtures/GEMMFixture.h2
-rw-r--r--tests/validation/fixtures/PoolingLayerFixture.h2
-rw-r--r--tests/validation/fixtures/RemapFixture.h192
-rw-r--r--tests/validation/reference/FullyConnectedLayer.cpp4
-rw-r--r--tests/validation/reference/PoolingLayer.cpp4
-rw-r--r--tests/validation/reference/Remap.cpp115
-rw-r--r--utils/TypePrinter.h6
400 files changed, 14346 insertions, 11074 deletions
diff --git a/Android.bp b/Android.bp
index 32d5805e5..b54c8a574 100644
--- a/Android.bp
+++ b/Android.bp
@@ -40,6 +40,7 @@ opencl_srcs = [
"src/core/CL/cl_kernels/common/floor.cl",
"src/core/CL/cl_kernels/common/gather.cl",
"src/core/CL/cl_kernels/common/gemm.cl",
+ "src/core/CL/cl_kernels/common/gemm_utils.cl",
"src/core/CL/cl_kernels/common/gemmlowp.cl",
"src/core/CL/cl_kernels/common/gemv.cl",
"src/core/CL/cl_kernels/common/generate_proposals.cl",
@@ -80,17 +81,13 @@ opencl_srcs = [
"src/core/CL/cl_kernels/nchw/channel_shuffle.cl",
"src/core/CL/cl_kernels/nchw/depth_to_space.cl",
"src/core/CL/cl_kernels/nchw/dequantization_layer.cl",
- "src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl",
- "src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl",
- "src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl",
- "src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl",
+ "src/core/CL/cl_kernels/nchw/direct_convolution.cl",
"src/core/CL/cl_kernels/nchw/im2col.cl",
"src/core/CL/cl_kernels/nchw/normalization_layer.cl",
"src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl",
"src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl",
"src/core/CL/cl_kernels/nchw/pooling_layer.cl",
"src/core/CL/cl_kernels/nchw/prior_box_layer.cl",
- "src/core/CL/cl_kernels/nchw/remap.cl",
"src/core/CL/cl_kernels/nchw/reorg_layer.cl",
"src/core/CL/cl_kernels/nchw/scale.cl",
"src/core/CL/cl_kernels/nchw/space_to_batch.cl",
@@ -114,7 +111,6 @@ opencl_srcs = [
"src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl",
"src/core/CL/cl_kernels/nhwc/pooling_layer.cl",
"src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl",
- "src/core/CL/cl_kernels/nhwc/remap.cl",
"src/core/CL/cl_kernels/nhwc/reorg_layer.cl",
"src/core/CL/cl_kernels/nhwc/scale.cl",
"src/core/CL/cl_kernels/nhwc/space_to_batch.cl",
@@ -241,7 +237,6 @@ cc_library_static {
"src/core/CL/kernels/CLROIPoolingLayerKernel.cpp",
"src/core/CL/kernels/CLRangeKernel.cpp",
"src/core/CL/kernels/CLReductionOperationKernel.cpp",
- "src/core/CL/kernels/CLRemapKernel.cpp",
"src/core/CL/kernels/CLReorgLayerKernel.cpp",
"src/core/CL/kernels/CLReverseKernel.cpp",
"src/core/CL/kernels/CLSelectKernel.cpp",
@@ -293,7 +288,6 @@ cc_library_static {
"src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp",
"src/core/NEON/kernels/NERangeKernel.cpp",
"src/core/NEON/kernels/NEReductionOperationKernel.cpp",
- "src/core/NEON/kernels/NERemapKernel.cpp",
"src/core/NEON/kernels/NEReorgLayerKernel.cpp",
"src/core/NEON/kernels/NEReverseKernel.cpp",
"src/core/NEON/kernels/NESelectKernel.cpp",
@@ -336,8 +330,6 @@ cc_library_static {
"src/core/NEON/kernels/arm_gemm/transform.cpp",
"src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp",
"src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp",
- "src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp",
- "src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp",
"src/core/NEON/kernels/convolution/common/padding.cpp",
"src/core/NEON/kernels/convolution/common/qasymm8.cpp",
"src/core/NEON/kernels/convolution/common/qsymm8.cpp",
@@ -431,44 +423,79 @@ cc_library_static {
"src/cpu/kernels/CpuTransposeKernel.cpp",
"src/cpu/kernels/CpuWeightsReshapeKernel.cpp",
"src/cpu/kernels/CpuWinogradConv2dKernel.cpp",
- "src/cpu/kernels/activation/neon/fp16.cpp",
- "src/cpu/kernels/activation/neon/fp32.cpp",
- "src/cpu/kernels/activation/neon/qasymm8.cpp",
- "src/cpu/kernels/activation/neon/qasymm8_signed.cpp",
- "src/cpu/kernels/activation/neon/qsymm16.cpp",
- "src/cpu/kernels/activation/sve/fp16.cpp",
- "src/cpu/kernels/activation/sve/fp32.cpp",
- "src/cpu/kernels/activation/sve/qasymm8.cpp",
- "src/cpu/kernels/activation/sve/qasymm8_signed.cpp",
- "src/cpu/kernels/activation/sve/qsymm16.cpp",
- "src/cpu/kernels/add/neon/qasymm8.cpp",
- "src/cpu/kernels/add/neon/qasymm8_signed.cpp",
- "src/cpu/kernels/add/neon/qsymm16.cpp",
- "src/cpu/kernels/add/sve/impl.cpp",
- "src/cpu/kernels/add/sve/qasymm8.cpp",
- "src/cpu/kernels/add/sve/qasymm8_signed.cpp",
- "src/cpu/kernels/add/sve/qsymm16.cpp",
- "src/cpu/kernels/elementwise/sve/elementwise.cpp",
- "src/cpu/kernels/elementwise/sve/elementwise_unary.cpp",
+ "src/cpu/kernels/activation/generic/neon/fp16.cpp",
+ "src/cpu/kernels/activation/generic/neon/fp32.cpp",
+ "src/cpu/kernels/activation/generic/neon/qasymm8.cpp",
+ "src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp",
+ "src/cpu/kernels/activation/generic/neon/qsymm16.cpp",
+ "src/cpu/kernels/add/generic/neon/fp16.cpp",
+ "src/cpu/kernels/add/generic/neon/fp32.cpp",
+ "src/cpu/kernels/add/generic/neon/impl.cpp",
+ "src/cpu/kernels/add/generic/neon/integer.cpp",
+ "src/cpu/kernels/add/generic/neon/qasymm8.cpp",
+ "src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp",
+ "src/cpu/kernels/add/generic/neon/qsymm16.cpp",
+ "src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp",
+ "src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp",
+ "src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp",
+ "src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp",
+ "src/cpu/kernels/crop/generic/neon/fp16.cpp",
+ "src/cpu/kernels/crop/generic/neon/fp32.cpp",
+ "src/cpu/kernels/crop/generic/neon/impl.cpp",
+ "src/cpu/kernels/crop/generic/neon/integer.cpp",
+ "src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp",
+ "src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp",
+ "src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp",
+ "src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp",
+ "src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp",
+ "src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp",
+ "src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp",
+ "src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp",
"src/cpu/kernels/floor/neon/fp16.cpp",
"src/cpu/kernels/floor/neon/fp32.cpp",
+ "src/cpu/kernels/genproposals/generic/neon/fp16.cpp",
+ "src/cpu/kernels/genproposals/generic/neon/fp32.cpp",
+ "src/cpu/kernels/genproposals/generic/neon/impl.cpp",
+ "src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp",
+ "src/cpu/kernels/instancenorm/generic/neon/fp16.cpp",
+ "src/cpu/kernels/instancenorm/generic/neon/fp32.cpp",
+ "src/cpu/kernels/instancenorm/generic/neon/impl.cpp",
"src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp",
"src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp",
+ "src/cpu/kernels/maxunpool/generic/neon/fp16.cpp",
+ "src/cpu/kernels/maxunpool/generic/neon/fp32.cpp",
+ "src/cpu/kernels/maxunpool/generic/neon/impl.cpp",
+ "src/cpu/kernels/maxunpool/generic/neon/qasymm8.cpp",
+ "src/cpu/kernels/maxunpool/generic/neon/qasymm8_signed.cpp",
+ "src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp",
+ "src/cpu/kernels/meanstddevnorm/generic/neon/fp32.cpp",
+ "src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp",
"src/cpu/kernels/pool2d/neon/fp16.cpp",
"src/cpu/kernels/pool2d/neon/fp32.cpp",
"src/cpu/kernels/pool2d/neon/nchw/all.cpp",
"src/cpu/kernels/pool2d/neon/qasymm8.cpp",
"src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp",
+ "src/cpu/kernels/range/generic/neon/fp16.cpp",
+ "src/cpu/kernels/range/generic/neon/fp32.cpp",
+ "src/cpu/kernels/range/generic/neon/impl.cpp",
+ "src/cpu/kernels/range/generic/neon/integer.cpp",
+ "src/cpu/kernels/roialign/generic/neon/fp16.cpp",
+ "src/cpu/kernels/roialign/generic/neon/fp32.cpp",
+ "src/cpu/kernels/roialign/generic/neon/impl.cpp",
+ "src/cpu/kernels/roialign/generic/neon/qasymm8.cpp",
+ "src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp",
"src/cpu/kernels/scale/neon/fp16.cpp",
"src/cpu/kernels/scale/neon/integer.cpp",
"src/cpu/kernels/scale/neon/qasymm8.cpp",
"src/cpu/kernels/scale/neon/qasymm8_signed.cpp",
- "src/cpu/kernels/scale/sve/fp16.cpp",
- "src/cpu/kernels/scale/sve/fp32.cpp",
- "src/cpu/kernels/scale/sve/integer.cpp",
- "src/cpu/kernels/scale/sve/qasymm8.cpp",
- "src/cpu/kernels/scale/sve/qasymm8_signed.cpp",
- "src/cpu/kernels/softmax/impl/sve/impl.cpp",
+ "src/cpu/kernels/select/generic/neon/fp16.cpp",
+ "src/cpu/kernels/select/generic/neon/fp32.cpp",
+ "src/cpu/kernels/select/generic/neon/impl.cpp",
+ "src/cpu/kernels/select/generic/neon/integer.cpp",
+ "src/cpu/kernels/softmax/generic/neon/fp16.cpp",
+ "src/cpu/kernels/softmax/generic/neon/fp32.cpp",
+ "src/cpu/kernels/softmax/generic/neon/qasymm8.cpp",
+ "src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp",
"src/cpu/kernels/sub/neon/qasymm8.cpp",
"src/cpu/kernels/sub/neon/qasymm8_signed.cpp",
"src/cpu/kernels/sub/neon/qsymm16.cpp",
@@ -683,7 +710,6 @@ cc_library_static {
"src/runtime/CL/functions/CLRange.cpp",
"src/runtime/CL/functions/CLReduceMean.cpp",
"src/runtime/CL/functions/CLReductionOperation.cpp",
- "src/runtime/CL/functions/CLRemap.cpp",
"src/runtime/CL/functions/CLReorgLayer.cpp",
"src/runtime/CL/functions/CLReshapeLayer.cpp",
"src/runtime/CL/functions/CLReverse.cpp",
@@ -794,7 +820,6 @@ cc_library_static {
"src/runtime/NEON/functions/NERange.cpp",
"src/runtime/NEON/functions/NEReduceMean.cpp",
"src/runtime/NEON/functions/NEReductionOperation.cpp",
- "src/runtime/NEON/functions/NERemap.cpp",
"src/runtime/NEON/functions/NEReorgLayer.cpp",
"src/runtime/NEON/functions/NEReshapeLayer.cpp",
"src/runtime/NEON/functions/NEReverse.cpp",
@@ -1013,8 +1038,11 @@ cc_library_static {
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/x1.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp",
diff --git a/LICENSE b/LICENSE
index 1bb90563e..953432d89 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
MIT License
-Copyright (c) 2017-2021 Arm Limited
+Copyright (c) 2017-2022 Arm Limited
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
diff --git a/README.md b/README.md
index 28418c9bd..a0d8a11a2 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,13 @@
+> **⚠ Important**
+> Starting from the next release (22.05): 'master' branch will be changed to 'main' following our inclusive language update, more information [here](https://arm-software.github.io/ComputeLibrary/latest/contribution_guidelines.xhtml#S5_0_inc_lang).
+
<br>
<div align="center">
<img src="https://raw.githubusercontent.com/ARM-software/ComputeLibrary/gh-pages/ACL_logo.png"/><br><br>
</div>
-# Compute Library ![](https://img.shields.io/badge/latest_release-21.11-green)
+# Compute Library ![](https://img.shields.io/badge/latest_release-22.02-green)
The Compute Library is a collection of low-level machine learning functions optimized for Arm® Cortex®-A and Arm® Mali™ GPUs architectures.<br>
@@ -32,7 +35,7 @@ Key Features:
<br>
## Documentation
-[![Documentation](https://img.shields.io/badge/documentation-21.11-green)](https://arm-software.github.io/ComputeLibrary/latest)
+[![Documentation](https://img.shields.io/badge/documentation-22.02-green)](https://arm-software.github.io/ComputeLibrary/latest)
> Note: The documentation includes the reference API, changelogs, build guide, contribution guide, errata, etc.
@@ -45,21 +48,21 @@ All the binaries can be downloaded from [here](https://github.com/ARM-software/C
| Platform | Operating System | Release archive (Download) |
| ----------- | ----------- | ----------- |
-| Raspberry Pi 4 | Linux 32bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v21.11/arm_compute-v21.11-bin-linux-armv7a-neon.tar.gz) |
-| Raspberry Pi 4 | Linux 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v21.11/arm_compute-v21.11-bin-linux-arm64-v8a-neon.tar.gz) |
-| Odroid N2 | Linux 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v21.11/arm_compute-v21.11-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v21.11/arm_compute-v21.11-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v21.11/arm_compute-v21.11-bin-linux-arm64-v8a-neon-cl.tar.gz) |
-| HiKey960 | Linux 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v21.11/arm_compute-v21.11-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v21.11/arm_compute-v21.11-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v21.11/arm_compute-v21.11-bin-linux-arm64-v8a-neon-cl.tar.gz) |
+| Raspberry Pi 4 | Linux 32bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v22.02/arm_compute-v22.02-bin-linux-armv7a-neon.tar.gz) |
+| Raspberry Pi 4 | Linux 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v22.02/arm_compute-v22.02-bin-linux-arm64-v8a-neon.tar.gz) |
+| Odroid N2 | Linux 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v22.02/arm_compute-v22.02-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v22.02/arm_compute-v22.02-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v22.02/arm_compute-v22.02-bin-linux-arm64-v8a-neon-cl.tar.gz) |
+| HiKey960 | Linux 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v22.02/arm_compute-v22.02-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v22.02/arm_compute-v22.02-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v22.02/arm_compute-v22.02-bin-linux-arm64-v8a-neon-cl.tar.gz) |
<br>
| Architecture | Operating System | Release archive (Download) |
| ----------- | ----------- | ----------- |
-| armv7 | Android | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v21.11/arm_compute-v21.11-bin-android-armv7a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v21.11/arm_compute-v21.11-bin-android-armv7a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v21.11/arm_compute-v21.11-bin-android-armv7a-neon-cl.tar.gz) |
-| armv7 | Linux | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v21.11/arm_compute-v21.11-bin-linux-armv7a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v21.11/arm_compute-v21.11-bin-linux-armv7a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v21.11/arm_compute-v21.11-bin-linux-armv7a-neon-cl.tar.gz) |
-| arm64-v8a | Android | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v21.11/arm_compute-v21.11-bin-android-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v21.11/arm_compute-v21.11-bin-android-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v21.11/arm_compute-v21.11-bin-android-arm64-v8a-neon-cl.tar.gz) |
-| arm64-v8a | Linux | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v21.11/arm_compute-v21.11-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v21.11/arm_compute-v21.11-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v21.11/arm_compute-v21.11-bin-linux-arm64-v8a-neon-cl.tar.gz) |
-| arm64-v8.2-a | Android | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v21.11/arm_compute-v21.11-bin-android-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v21.11/arm_compute-v21.11-bin-android-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v21.11/arm_compute-v21.11-bin-android-arm64-v8.2-a-neon-cl.tar.gz) |
-| arm64-v8.2-a | Linux | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v21.11/arm_compute-v21.11-bin-linux-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v21.11/arm_compute-v21.11-bin-linux-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v21.11/arm_compute-v21.11-bin-linux-arm64-v8.2-a-neon-cl.tar.gz) |
+| armv7 | Android | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v22.02/arm_compute-v22.02-bin-android-armv7a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v22.02/arm_compute-v22.02-bin-android-armv7a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v22.02/arm_compute-v22.02-bin-android-armv7a-neon-cl.tar.gz) |
+| armv7 | Linux | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v22.02/arm_compute-v22.02-bin-linux-armv7a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v22.02/arm_compute-v22.02-bin-linux-armv7a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v22.02/arm_compute-v22.02-bin-linux-armv7a-neon-cl.tar.gz) |
+| arm64-v8a | Android | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v22.02/arm_compute-v22.02-bin-android-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v22.02/arm_compute-v22.02-bin-android-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v22.02/arm_compute-v22.02-bin-android-arm64-v8a-neon-cl.tar.gz) |
+| arm64-v8a | Linux | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v22.02/arm_compute-v22.02-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v22.02/arm_compute-v22.02-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v22.02/arm_compute-v22.02-bin-linux-arm64-v8a-neon-cl.tar.gz) |
+| arm64-v8.2-a | Android | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v22.02/arm_compute-v22.02-bin-android-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v22.02/arm_compute-v22.02-bin-android-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v22.02/arm_compute-v22.02-bin-android-arm64-v8.2-a-neon-cl.tar.gz) |
+| arm64-v8.2-a | Linux | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v22.02/arm_compute-v22.02-bin-linux-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v22.02/arm_compute-v22.02-bin-linux-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v22.02/arm_compute-v22.02-bin-linux-arm64-v8.2-a-neon-cl.tar.gz) |
<br>
diff --git a/SConscript b/SConscript
index 3341c96bc..626db33fc 100644
--- a/SConscript
+++ b/SConscript
@@ -1,3 +1,6 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
# Copyright (c) 2016-2021 Arm Limited.
#
# SPDX-License-Identifier: MIT
@@ -19,6 +22,7 @@
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
+
import collections
import os.path
import re
@@ -27,8 +31,8 @@ import zlib
import json
import codecs
-VERSION = "v21.11"
-LIBRARY_VERSION_MAJOR = 25
+VERSION = "v22.02"
+LIBRARY_VERSION_MAJOR = 26
LIBRARY_VERSION_MINOR = 0
LIBRARY_VERSION_PATCH = 0
SONAME_VERSION = str(LIBRARY_VERSION_MAJOR) + "." + str(LIBRARY_VERSION_MINOR) + "." + str(LIBRARY_VERSION_PATCH)
@@ -45,31 +49,83 @@ def build_bootcode_objs(sources):
return obj
-def build_sve_objs(sources):
+
+
+# @brief Create a list of object from a given file list.
+#
+# @param arch_info A dictionary represents the architecture info such as the
+# compiler flags and defines (filedefs.json).
+#
+# @param sources A list of files to build
+#
+# @return A list of objects for the corresponding architecture.
+
+def build_obj_list(arch_info, sources, static=False):
+
+ # Clone environment
tmp_env = arm_compute_env.Clone()
- tmp_env.Append(CXXFLAGS = "-march=armv8.2-a+sve+fp16")
- obj = tmp_env.SharedObject(sources)
- Default(obj)
- return obj
+ # Append architecture spec
+ if 'cxxflags' in arch_info and len(arch_info['cxxflags']) > 0:
+ tmp_env.Append(CXXFLAGS = arch_info['cxxflags'])
+
+ # Build and return objects
+ if static:
+ objs = tmp_env.StaticObject(sources)
+ else:
+ objs = tmp_env.SharedObject(sources)
+
+ tmp_env.Default(objs)
+ return objs
+
+# @brief Build multi-ISA files with the respective architecture.
+#
+# @return Two distinct lists:
+# A list of static objects
+# A list of shared objects
+
+def build_lib_objects():
+ lib_static_objs = [] # static objects
+ lib_shared_objs = [] # shared objects
+
+ arm_compute_env.Append(CPPDEFINES = ['ENABLE_NEON', 'ARM_COMPUTE_ENABLE_NEON',
+ 'ENABLE_SVE', 'ARM_COMPUTE_ENABLE_SVE',
+ 'ARM_COMPUTE_ENABLE_FP16', 'ARM_COMPUTE_ENABLE_BF16',
+ 'ARM_COMPUTE_ENABLE_I8MM', 'ARM_COMPUTE_ENABLE_SVEF32MM'])
+
+ # Build all the common files for the base architecture
+ lib_static_objs += build_obj_list(filedefs["armv8.2-a"], lib_files, static=True)
+ lib_shared_objs += build_obj_list(filedefs["armv8.2-a"], lib_files, static=False)
+
+ # Build the SVE specific files
+ lib_static_objs += build_obj_list(filedefs["armv8.2-a-sve"], lib_files_sve, static=True)
+ lib_shared_objs += build_obj_list(filedefs["armv8.2-a-sve"], lib_files_sve, static=False)
+
+ # Build the SVE2 specific files
+ arm_compute_env.Append(CPPDEFINES = ['ARM_COMPUTE_ENABLE_SVE2'])
+ lib_static_objs += build_obj_list(filedefs["armv8.6-a-sve2"], lib_files_sve2, static=True)
+ lib_shared_objs += build_obj_list(filedefs["armv8.6-a-sve2"], lib_files_sve2, static=False)
+
+ return lib_static_objs, lib_shared_objs
-def build_objs(sources):
- obj = arm_compute_env.SharedObject(sources)
- Default(obj)
- return obj
def build_library(name, build_env, sources, static=False, libs=[]):
+ cloned_build_env = build_env.Clone()
+ if env['os'] == 'android' and static == False:
+ cloned_build_env["LINKFLAGS"].remove('-pie')
+ cloned_build_env["LINKFLAGS"].remove('-static-libstdc++')
+
if static:
- obj = build_env.StaticLibrary(name, source=sources, LIBS = arm_compute_env["LIBS"] + libs)
+ obj = cloned_build_env.StaticLibrary(name, source=sources, LIBS = arm_compute_env["LIBS"] + libs)
else:
if env['set_soname']:
- obj = build_env.SharedLibrary(name, source=sources, SHLIBVERSION = SONAME_VERSION, LIBS = arm_compute_env["LIBS"] + libs)
+ obj = cloned_build_env.SharedLibrary(name, source=sources, SHLIBVERSION = SONAME_VERSION, LIBS = arm_compute_env["LIBS"] + libs)
else:
- obj = build_env.SharedLibrary(name, source=sources, LIBS = arm_compute_env["LIBS"] + libs)
+ obj = cloned_build_env.SharedLibrary(name, source=sources, LIBS = arm_compute_env["LIBS"] + libs)
obj = install_lib(obj)
- Default(obj)
+ build_env.Default(obj)
return obj
@@ -179,6 +235,7 @@ def get_attrs_list(env, data_types, data_layouts):
# Manage execution state
attrs += ['estate32' if (env['estate'] == 'auto' and 'v7a' in env['arch']) or '32' in env['estate'] else 'estate64']
+
return attrs
@@ -260,7 +317,7 @@ arm_compute_env = env.Clone()
version_file = arm_compute_env.Command("src/core/arm_compute_version.embed", "", action=create_version_file)
arm_compute_env.AlwaysBuild(version_file)
-default_cpp_compiler = 'g++' if env['os'] not in ['android', 'macos'] else 'clang++'
+default_cpp_compiler = 'g++' if env['os'] not in ['android', 'macos', 'openbsd'] else 'clang++'
cpp_compiler = os.environ.get('CXX', default_cpp_compiler)
# Generate embed files
@@ -310,6 +367,7 @@ if env['opencl'] and env['embed_kernels']:
'src/core/CL/cl_kernels/common/floor.cl',
'src/core/CL/cl_kernels/common/gather.cl',
'src/core/CL/cl_kernels/common/gemm.cl',
+ 'src/core/CL/cl_kernels/common/gemm_utils.cl',
'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl',
'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl',
'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl',
@@ -352,10 +410,7 @@ if env['opencl'] and env['embed_kernels']:
'src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl',
'src/core/CL/cl_kernels/nchw/channel_shuffle.cl',
'src/core/CL/cl_kernels/nchw/depth_to_space.cl',
- 'src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl',
- 'src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl',
- 'src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl',
- 'src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl',
+ 'src/core/CL/cl_kernels/nchw/direct_convolution.cl',
'src/core/CL/cl_kernels/nchw/dequantization_layer.cl',
'src/core/CL/cl_kernels/nchw/im2col.cl',
'src/core/CL/cl_kernels/nchw/normalization_layer.cl',
@@ -363,7 +418,6 @@ if env['opencl'] and env['embed_kernels']:
'src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl',
'src/core/CL/cl_kernels/nchw/pooling_layer.cl',
'src/core/CL/cl_kernels/nchw/prior_box_layer.cl',
- 'src/core/CL/cl_kernels/nchw/remap.cl',
'src/core/CL/cl_kernels/nchw/reorg_layer.cl',
'src/core/CL/cl_kernels/nchw/scale.cl',
'src/core/CL/cl_kernels/nchw/space_to_batch.cl',
@@ -390,7 +444,6 @@ if env['opencl'] and env['embed_kernels']:
'src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl',
'src/core/CL/cl_kernels/nhwc/pooling_layer.cl',
'src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl',
- 'src/core/CL/cl_kernels/nhwc/remap.cl',
'src/core/CL/cl_kernels/nhwc/reorg_layer.cl',
'src/core/CL/cl_kernels/nhwc/scale.cl',
'src/core/CL/cl_kernels/nhwc/space_to_batch.cl',
@@ -422,7 +475,14 @@ undefined_flag = '-Wl,-undefined,error' if 'macos' in arm_compute_env["os"] else
arm_compute_env.Append(LINKFLAGS=[undefined_flag])
arm_compute_env.Append(CPPPATH =[Dir("./src/core/").path] )
-arm_compute_env.Append(LIBS = ['dl'])
+if env['os'] != 'openbsd':
+ arm_compute_env.Append(LIBS = ['dl'])
+
+# Load build definitions file
+with (open(Dir('#').path + '/filedefs.json')) as fd:
+ filedefs = json.load(fd)
+ filedefs = filedefs['cpu']['arch']
+
with (open(Dir('#').path + '/filelist.json')) as fp:
filelist = json.load(fp)
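
Note: the hunk above, together with the new build_obj_list()/build_lib_objects() helpers earlier in this SConscript diff, only relies on filedefs['cpu']['arch'][<arch>]['cxxflags'] and on the architecture keys "armv8.2-a", "armv8.2-a-sve" and "armv8.6-a-sve2". A minimal sketch of the value json.load would have to return for that code to work (illustrative only; the flag values are assumptions based on the removed build_sve_objs flags, not the contents of the shipped filedefs.json):

    filedefs = {
        "cpu": {
            "arch": {
                # baseline objects, built for every multi_isa target (lib_files)
                "armv8.2-a":      {"cxxflags": []},
                # SVE-only sources (lib_files_sve); flag value assumed
                "armv8.2-a-sve":  {"cxxflags": ["-march=armv8.2-a+sve+fp16"]},
                # SVE2-only sources (lib_files_sve2); flag value assumed
                "armv8.6-a-sve2": {"cxxflags": ["-march=armv8.6-a+sve2+fp16"]},
            }
        }
    }
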
@@ -474,8 +534,10 @@ if env['opencl']:
graph_files += Glob('src/graph/backends/CL/*.cpp')
-sve_o = []
+
lib_files_sve = []
+lib_files_sve2 = []
+
if env['neon']:
# build winograd/depthwise sources for either v7a / v8a
arm_compute_env.Append(CPPPATH = ["src/core/NEON/kernels/convolution/common/",
@@ -483,14 +545,17 @@ if env['neon']:
"src/core/NEON/kernels/convolution/depthwise/",
"src/core/NEON/kernels/assembly/",
"arm_compute/core/NEON/kernels/assembly/",
- "src/cpu/kernels/assembly/",])
+ "src/cpu/kernels/assembly/"])
lib_files += filelist['cpu']['common']
# Setup SIMD file list to include
- simd = []
- if 'sve' in env['arch'] or env['fat_binary']: simd += ['sve']
- if 'sve' not in env['arch'] or env['fat_binary']: simd += ['neon']
+ simd = ['neon']
+ if env['multi_isa']:
+ simd += ['sve', 'sve2']
+ else:
+ if 'sve' in env['arch']: simd += ['sve']
+ if 'sve2' in env['arch']: simd += ['sve2']
# Get attributes
if(use_custom_ops):
@@ -503,9 +568,18 @@ if env['neon']:
cpu_ops_to_build = resolve_operator_dependencies(filelist, cpu_operators, 'cpu')
cpu_files = get_operator_backend_files(filelist, cpu_ops_to_build, 'cpu', simd, attrs)
+
+ # Shared among ALL CPU files
lib_files += cpu_files.get('common', [])
+
+ # Arm® Neon™ specific files
lib_files += cpu_files.get('neon', [])
- lib_files_sve += cpu_files.get('sve', [])
+
+ # SVE files only
+ lib_files_sve = cpu_files.get('sve', [])
+
+ # SVE2 files only
+ lib_files_sve2 = cpu_files.get('sve2', [])
graph_files += Glob('src/graph/backends/NEON/*.cpp')
@@ -521,25 +595,42 @@ if env['os'] == 'bare_metal':
bootcode_o = build_bootcode_objs(bootcode_files)
Export('bootcode_o')
-# Build static libraries
-if (env['fat_binary']):
- sve_o = build_sve_objs(lib_files_sve)
- arm_compute_a = build_library('arm_compute-static', arm_compute_env, lib_files + sve_o, static=True)
+
+if (env['multi_isa']):
+ lib_static_objs, lib_shared_objs = build_lib_objects()
+
+
+# STATIC library build.
+if (env['multi_isa']):
+ arm_compute_a = build_library('arm_compute-static', arm_compute_env, lib_static_objs, static=True)
else:
- arm_compute_a = build_library('arm_compute-static', arm_compute_env, lib_files + lib_files_sve, static=True)
+ if 'sve2' in env['arch']:
+ lib_files += lib_files_sve
+ lib_files += lib_files_sve2
+ elif 'sve' in env['arch']:
+ lib_files += lib_files_sve
+
+ arm_compute_a = build_library('arm_compute-static', arm_compute_env, lib_files, static=True)
+
Export('arm_compute_a')
-# Build shared libraries
+# SHARED library build.
if env['os'] != 'bare_metal' and not env['standalone']:
- if (env['fat_binary']):
- arm_compute_so = build_library('arm_compute', arm_compute_env, lib_files + sve_o, static=False)
+ if (env['multi_isa']):
+
+ arm_compute_so = build_library('arm_compute', arm_compute_env, lib_shared_objs, static=False)
else:
- arm_compute_so = build_library('arm_compute', arm_compute_env, lib_files + lib_files_sve, static=False)
+ arm_compute_so = build_library('arm_compute', arm_compute_env, lib_files, static=False)
Export('arm_compute_so')
# Generate dummy core lib for backwards compatibility
-arm_compute_core_a = build_library('arm_compute_core-static', arm_compute_env, [], static=True)
+if env['os'] == 'macos':
+ # macos static library archiver fails if given an empty list of files
+ arm_compute_core_a = build_library('arm_compute_core-static', arm_compute_env, lib_files, static=True)
+else:
+ arm_compute_core_a = build_library('arm_compute_core-static', arm_compute_env, [], static=True)
+
Export('arm_compute_core_a')
if env['os'] != 'bare_metal' and not env['standalone']:
@@ -551,7 +642,7 @@ arm_compute_graph_env = arm_compute_env.Clone()
# Build graph libraries
arm_compute_graph_env.Append(CXXFLAGS = ['-Wno-redundant-move', '-Wno-pessimizing-move'])
-arm_compute_graph_a = build_library('arm_compute_graph-static', arm_compute_graph_env, graph_files, static=True, libs = [ arm_compute_a])
+arm_compute_graph_a = build_library('arm_compute_graph-static', arm_compute_graph_env, graph_files, static=True, libs = [ arm_compute_a ])
Export('arm_compute_graph_a')
if env['os'] != 'bare_metal' and not env['standalone']:
diff --git a/SConstruct b/SConstruct
index 400228c71..cc5a382c1 100644
--- a/SConstruct
+++ b/SConstruct
@@ -94,12 +94,12 @@ vars.AddVariables(
allowed_values=("armv7a", "armv7a-hf", "arm64-v8a", "arm64-v8.2-a", "arm64-v8.2-a-sve", "arm64-v8.2-a-sve2", "x86_32", "x86_64",
"armv8a", "armv8.2-a", "armv8.2-a-sve", "armv8.6-a", "armv8.6-a-sve", "armv8.6-a-sve2", "armv8r64", "x86")),
EnumVariable("estate", "Execution State", "auto", allowed_values=("auto", "32", "64")),
- EnumVariable("os", "Target OS", "linux", allowed_values=("linux", "android", "tizen", "macos", "bare_metal")),
+ EnumVariable("os", "Target OS", "linux", allowed_values=("linux", "android", "tizen", "macos", "bare_metal", "openbsd")),
EnumVariable("build", "Build type", "cross_compile", allowed_values=("native", "cross_compile", "embed_only")),
BoolVariable("examples", "Build example programs", True),
BoolVariable("gemm_tuner", "Build gemm_tuner programs", True),
BoolVariable("Werror", "Enable/disable the -Werror compilation flag", True),
- BoolVariable("fat_binary", "Build fat binary version of library. Note works only for armv8.2-a", False),
+ BoolVariable("multi_isa", "Build Multi ISA binary version of library. Note works only for armv8.2-a", False),
BoolVariable("standalone", "Builds the tests as standalone executables, links statically with libgcc, libstdc++ and libarm_compute", False),
BoolVariable("opencl", "Enable OpenCL support", True),
BoolVariable("neon", "Enable Arm® Neon™ support", False),
@@ -126,6 +126,7 @@ vars.AddVariables(
("build_config", "Operator/Data-type/Data-layout configuration to use for tailored ComputeLibrary builds. Can be a JSON file or a JSON formatted string", "")
)
+
env = Environment(platform="posix", variables=vars, ENV = os.environ)
build_path = env['build_dir']
# If build_dir is a relative path then add a #build/ prefix:
@@ -206,8 +207,8 @@ env.Append(CXXFLAGS = ['-Wall','-DARCH_ARM',
env.Append(CPPDEFINES = ['_GLIBCXX_USE_NANOSLEEP'])
-default_cpp_compiler = 'g++' if env['os'] not in ['android', 'macos'] else 'clang++'
-default_c_compiler = 'gcc' if env['os'] not in ['android', 'macos'] else 'clang'
+default_cpp_compiler = 'g++' if env['os'] not in ['android', 'macos', 'openbsd'] else 'clang++'
+default_c_compiler = 'gcc' if env['os'] not in ['android', 'macos', 'openbsd'] else 'clang'
cpp_compiler = os.environ.get('CXX', default_cpp_compiler)
c_compiler = os.environ.get('CC', default_c_compiler)
@@ -219,7 +220,7 @@ if 'clang++' in cpp_compiler:
elif 'armclang' in cpp_compiler:
pass
else:
- env.Append(CXXFLAGS = ['-Wlogical-op','-Wnoexcept','-Wstrict-null-sentinel'])
+ env.Append(CXXFLAGS = ['-Wlogical-op','-Wnoexcept','-Wstrict-null-sentinel', '-Wno-misleading-indentation'])
if cpp_compiler == 'g++':
# Don't strip comments that could include markers
@@ -248,42 +249,63 @@ if 'v7a' in env['estate'] and env['estate'] == '64':
print("ERROR: armv7a architecture has only 32-bit execution state")
Exit(1)
-# Add architecture specific flags
-prefix = ""
-if 'v7a' in env['arch']:
- env.Append(CXXFLAGS = ['-march=armv7-a', '-mthumb', '-mfpu=neon'])
- if (env['os'] == 'android' or env['os'] == 'tizen') and not 'hf' in env['arch']:
- env.Append(CXXFLAGS = ['-mfloat-abi=softfp'])
- else:
- env.Append(CXXFLAGS = ['-mfloat-abi=hard'])
-elif 'v8' in env['arch']:
+env.Append(CPPDEFINES = ['ENABLE_NEON', 'ARM_COMPUTE_ENABLE_NEON'])
+
+if 'sve' in env['arch']:
+ env.Append(CPPDEFINES = ['ENABLE_SVE', 'ARM_COMPUTE_ENABLE_SVE'])
if 'sve2' in env['arch']:
- env.Append(CXXFLAGS = ['-march=armv8.2-a+sve2+fp16+dotprod'])
env.Append(CPPDEFINES = ['ARM_COMPUTE_ENABLE_SVE2'])
- elif 'sve' in env['arch']:
- env.Append(CXXFLAGS = ['-march=armv8.2-a+sve+fp16+dotprod'])
- elif 'armv8r64' in env['arch']:
- env.Append(CXXFLAGS = ['-march=armv8.4-a'])
- elif 'v8.' in env['arch']:
- env.Append(CXXFLAGS = ['-march=armv8.2-a+fp16']) # explicitly enable fp16 extension otherwise __ARM_FEATURE_FP16_VECTOR_ARITHMETIC is undefined
- else:
- env.Append(CXXFLAGS = ['-march=armv8-a'])
+
+# Add architecture specific flags
+prefix = ""
+if env['multi_isa']:
+ # assert arch version is v8
+ if 'v8' not in env['arch']:
+ print("Currently Multi ISA binary is only supported for arm v8 family")
+ Exit(1)
if 'v8.6-a' in env['arch']:
- env.Append(CPPDEFINES = ['ARM_COMPUTE_ENABLE_I8MM', 'ARM_COMPUTE_ENABLE_BF16'])
if "disable_mmla_fp" not in env['custom_options']:
env.Append(CPPDEFINES = ['ARM_COMPUTE_ENABLE_SVEF32MM'])
- if 'v8.' in env['arch']:
- env.Append(CPPDEFINES = ['ARM_COMPUTE_ENABLE_FP16'])
-elif 'x86' in env['arch']:
- if env['estate'] == '32':
- env.Append(CCFLAGS = ['-m32'])
- env.Append(LINKFLAGS = ['-m32'])
- else:
- env.Append(CXXFLAGS = ['-fPIC'])
- env.Append(CCFLAGS = ['-m64'])
- env.Append(LINKFLAGS = ['-m64'])
+ env.Append(CXXFLAGS = ['-march=armv8.2-a+fp16']) # explicitly enable fp16 extension otherwise __ARM_FEATURE_FP16_VECTOR_ARITHMETIC is undefined
+
+else: # non-"multi_isa" builds
+
+ if 'v7a' in env['arch']:
+ env.Append(CXXFLAGS = ['-march=armv7-a', '-mthumb', '-mfpu=neon'])
+ if (env['os'] == 'android' or env['os'] == 'tizen') and not 'hf' in env['arch']:
+ env.Append(CXXFLAGS = ['-mfloat-abi=softfp'])
+ else:
+ env.Append(CXXFLAGS = ['-mfloat-abi=hard'])
+ elif 'v8' in env['arch']:
+ # Preserve the V8 archs for non-multi-ISA variants
+ if 'sve2' in env['arch']:
+ env.Append(CXXFLAGS = ['-march=armv8.2-a+sve2+fp16+dotprod'])
+ elif 'sve' in env['arch']:
+ env.Append(CXXFLAGS = ['-march=armv8.2-a+sve+fp16+dotprod'])
+ elif 'armv8r64' in env['arch']:
+ env.Append(CXXFLAGS = ['-march=armv8.4-a'])
+ elif 'v8.' in env['arch']:
+ env.Append(CXXFLAGS = ['-march=armv8.2-a+fp16']) # explicitly enable fp16 extension otherwise __ARM_FEATURE_FP16_VECTOR_ARITHMETIC is undefined
+ else:
+ env.Append(CXXFLAGS = ['-march=armv8-a'])
+
+ if 'v8.6-a' in env['arch']:
+ env.Append(CPPDEFINES = ['ARM_COMPUTE_ENABLE_I8MM', 'ARM_COMPUTE_ENABLE_BF16'])
+ if "disable_mmla_fp" not in env['custom_options']:
+ env.Append(CPPDEFINES = ['ARM_COMPUTE_ENABLE_SVEF32MM'])
+ if 'v8.' in env['arch']:
+ env.Append(CPPDEFINES = ['ARM_COMPUTE_ENABLE_FP16'])
+
+ elif 'x86' in env['arch']:
+ if env['estate'] == '32':
+ env.Append(CCFLAGS = ['-m32'])
+ env.Append(LINKFLAGS = ['-m32'])
+ else:
+ env.Append(CXXFLAGS = ['-fPIC'])
+ env.Append(CCFLAGS = ['-m64'])
+ env.Append(LINKFLAGS = ['-m64'])
# Define toolchain
prefix = ""
@@ -307,11 +329,6 @@ if 'x86' not in env['arch']:
elif env['os'] == 'tizen':
prefix = "aarch64-tizen-linux-gnu-"
-if 'sve' in env['arch']:
- env.Append(CXXFLAGS = ['-DENABLE_SVE', '-DARM_COMPUTE_ENABLE_SVE'])
-else:
- env.Append(CXXFLAGS = ['-DENABLE_NEON', '-DARM_COMPUTE_ENABLE_NEON'])
-
if env['build'] == 'native':
prefix = ""
@@ -355,17 +372,8 @@ if not GetOption("help"):
if not version_at_least(compiler_ver, '7.0.0') and env['os'] == 'bare_metal':
env.Append(LINKFLAGS = ['-fstack-protector-strong'])
-if env['fat_binary']:
- if env['arch'] != 'armv8.2-a':
- print("Currently fat binary is only supported with armv8.2-a")
- Exit(1)
- env.Append(CXXFLAGS = ['-DENABLE_NEON', '-DARM_COMPUTE_ENABLE_NEON',
- '-DENABLE_SVE', '-DARM_COMPUTE_ENABLE_SVE',
- '-DARM_COMPUTE_ENABLE_FP16', '-DARM_COMPUTE_ENABLE_BF16',
- '-DARM_COMPUTE_ENABLE_I8MM', '-DARM_COMPUTE_ENABLE_SVEF32MM'])
-
if env['high_priority'] and env['build_config']:
- print("The high priority library cannot be built in conjuction with a user-specified build configuration")
+ print("The high priority library cannot be built in conjunction with a user-specified build configuration")
Exit(1)
if not env['high_priority'] and not env['build_config']:
@@ -421,6 +429,10 @@ if env['opencl']:
if env["os"] not in ["android", "bare_metal"] and (env['opencl'] or env['cppthreads']):
env.Append(LIBS = ['pthread'])
+if env['os'] == 'openbsd':
+ env.Append(LIBS = ['c'])
+ env.Append(CXXFLAGS = ['-fPIC'])
+
if env['opencl']:
if env['embed_kernels']:
env.Append(CPPDEFINES = ['EMBEDDED_KERNELS'])
@@ -474,3 +486,11 @@ if env['exceptions']:
print("WARNING: Building tests for bare metal and armv7a is not supported")
Return()
SConscript('./tests/SConscript', variant_dir='%s/tests' % build_path, duplicate=0)
+
+# Unknown variables are not allowed
+# Note: we must delay the call of UnknownVariables until after
+# we have applied the Variables object to the construction environment
+unknown = vars.UnknownVariables()
+if unknown:
+ print("Unknown variables: %s" % " ".join(unknown.keys()))
+ Exit(1)
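
With this restructuring ENABLE_NEON/ARM_COMPUTE_ENABLE_NEON are always defined, while the SVE and SVE2 macros are only added when the selected arch includes those extensions (multi_isa=1 builds compile the ISA-specific file lists separately in the SConscript). Below is a minimal, hypothetical C++ sketch of how source code typically keys off these build definitions; the function is illustrative only and not a library symbol.

    // Illustrative only: compile-time dispatch on the macros set by the build above.
    inline int dispatch_isa_example()   // hypothetical helper, not part of Compute Library
    {
    #if defined(ARM_COMPUTE_ENABLE_SVE2)
        return 2; // SVE2 path compiled in (e.g. an *-sve2 arch value)
    #elif defined(ARM_COMPUTE_ENABLE_SVE)
        return 1; // SVE path compiled in (e.g. an *-sve arch value)
    #else
        return 0; // Arm(R) Neon(TM) path; ARM_COMPUTE_ENABLE_NEON is always defined
    #endif
    }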
diff --git a/arm_compute/core/CL/OpenCL.h b/arm_compute/core/CL/OpenCL.h
index bbe469f1a..4ff42c6b8 100644
--- a/arm_compute/core/CL/OpenCL.h
+++ b/arm_compute/core/CL/OpenCL.h
@@ -41,7 +41,7 @@
#if defined(__GNUG__) && __GNUG__ >= 8
#pragma GCC diagnostic ignored "-Wcatch-value"
#endif // defined(__GNUG__) && __GNUG__ >= 8
-#include <CL/cl2.hpp>
+#include <CL/opencl.hpp> // include new hpp header instead of cl2.hpp
#pragma GCC diagnostic pop
namespace cl
diff --git a/arm_compute/core/CPP/CPPTypes.h b/arm_compute/core/CPP/CPPTypes.h
index 76378d27e..a021bdf5e 100644
--- a/arm_compute/core/CPP/CPPTypes.h
+++ b/arm_compute/core/CPP/CPPTypes.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,6 +30,11 @@
namespace arm_compute
{
+namespace cpuinfo
+{
+struct CpuIsaInfo;
+} // namespace cpuinfo
+
#define ARM_COMPUTE_CPU_MODEL_LIST \
X(GENERIC) \
X(GENERIC_FP16) \
@@ -39,6 +44,7 @@ namespace arm_compute
X(A55r1) \
X(A35) \
X(A73) \
+ X(A76) \
X(A510) \
X(X1) \
X(V1) \
@@ -133,6 +139,11 @@ public:
* @return Current thread's @ref CPUModel
*/
CPUModel get_cpu_model() const;
+ /** Gets the current cpu's ISA information
+ *
+ * @return Current cpu's ISA information
+ */
+ cpuinfo::CpuIsaInfo get_isa() const;
/** Gets the L1 cache size
*
* @return the size of the L1 cache
diff --git a/arm_compute/core/CPP/ICPPKernel.h b/arm_compute/core/CPP/ICPPKernel.h
index af4a896a6..469731637 100644
--- a/arm_compute/core/CPP/ICPPKernel.h
+++ b/arm_compute/core/CPP/ICPPKernel.h
@@ -38,8 +38,7 @@ class ITensor;
class ICPPKernel : public IKernel
{
public:
- static constexpr size_t default_mws = 128; /* Default minimum workload size value */
- static constexpr size_t small_network_mws = 256; /* Default Minimum workload size value for small networks */
+ static constexpr size_t default_mws = 1; /* Default minimum workload size value - no impact */
/** Default destructor */
virtual ~ICPPKernel() = default;
diff --git a/arm_compute/core/Error.h b/arm_compute/core/Error.h
index 992d6bc71..c9a0d85f0 100644
--- a/arm_compute/core/Error.h
+++ b/arm_compute/core/Error.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2019, 2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -119,7 +119,7 @@ private:
/** Creates an error containing the error message
*
* @param[in] error_code Error code
- * @param[in] msg Message to display before aborting.
+ * @param[in] msg Message to display before abandoning.
*
* @return status containing the error
*/
@@ -131,7 +131,7 @@ Status create_error(ErrorCode error_code, std::string msg);
* @param[in] func Function in which the error occurred.
* @param[in] file File in which the error occurred.
* @param[in] line Line in which the error occurred.
- * @param[in] msg Message to display before aborting.
+ * @param[in] msg Message to display before abandoning.
*
* @return status containing the error
*/
@@ -164,7 +164,7 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
* @param[in] func Function in which the error occurred.
* @param[in] file File in which the error occurred.
* @param[in] line Line in which the error occurred.
- * @param[in] msg Message to display before aborting.
+ * @param[in] msg Message to display before abandoning.
*/
#define ARM_COMPUTE_CREATE_ERROR_LOC(error_code, func, file, line, msg) arm_compute::create_error_msg(error_code, func, file, line, msg)
diff --git a/arm_compute/core/GPUTarget.h b/arm_compute/core/GPUTarget.h
index 04d2250d9..c4f5b8ca4 100644
--- a/arm_compute/core/GPUTarget.h
+++ b/arm_compute/core/GPUTarget.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -52,7 +52,7 @@ enum class GPUTarget
G76 = 0x250,
G77 = 0x310,
G78 = 0x320,
- TODX = 0x330,
+ G710 = 0x330,
};
/** Enable bitwise operations on GPUTarget enumerations */
diff --git a/arm_compute/core/KernelDescriptors.h b/arm_compute/core/KernelDescriptors.h
index bc6ec1f6c..b1086494e 100644
--- a/arm_compute/core/KernelDescriptors.h
+++ b/arm_compute/core/KernelDescriptors.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -211,17 +211,5 @@ struct ScaleKernelInfo
bool align_corners; /**< Align corners of input and output */
DataLayout data_layout; /**< Data layout to use */
};
-
-struct RemapInfo
-{
- RemapInfo() = default;
- RemapInfo(InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value)
- : policy(policy), border_mode(border_mode), constant_border_value(constant_border_value)
- {
- }
- InterpolationPolicy policy;
- BorderMode border_mode;
- PixelValue constant_border_value;
-};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CORE_KERNEL_DESCRIPTORS_H */
diff --git a/arm_compute/core/Size3D.h b/arm_compute/core/Size3D.h
index 148bd1791..4241ed4f7 100644
--- a/arm_compute/core/Size3D.h
+++ b/arm_compute/core/Size3D.h
@@ -40,7 +40,7 @@ public:
* @param[in] h Height of the 3D shape or object
* @param[in] d Depth of the 3D shape or object
*/
- Size3D(size_t w, size_t h, size_t d)
+ Size3D(size_t w, size_t h, size_t d) noexcept
: width(w), height(h), depth(d)
{
}
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 47df44cb6..961581134 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -772,7 +772,7 @@ private:
/** Padding information for 3D operations like Conv3d */
struct Padding3D
{
- Padding3D()
+ Padding3D() noexcept
{
}
diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h
index af9a777a0..b24955d77 100644
--- a/arm_compute/core/Utils.h
+++ b/arm_compute/core/Utils.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -886,6 +886,13 @@ const std::string &string_from_norm_type(NormType type);
* @return The string describing the pooling type.
*/
const std::string &string_from_pooling_type(PoolingType type);
+/** Check if the pool region is entirely outside the input tensor
+ *
+ * @param[in] info @ref PoolingLayerInfo to be checked.
+ *
+ * @return True if the pool region is entirely outside the input tensor, False otherwise.
+ */
+bool is_pool_region_entirely_outside_input(const PoolingLayerInfo &info);
/** Translates a given GEMMLowp output stage to a string.
*
* @param[in] output_stage @ref GEMMLowpOutputStageInfo to be translated to string.
@@ -952,6 +959,14 @@ inline ::std::istream &operator>>(::std::istream &stream, DataType &data_type)
*/
std::string lower_string(const std::string &val);
+/** Raise a given string to upper case
+ *
+ * @param[in] val Given string to convert to upper case.
+ *
+ * @return The upper case string
+ */
+std::string upper_string(const std::string &val);
+
/** Check if a given data type is of floating point type
*
* @param[in] dt Input data type.
@@ -1185,6 +1200,49 @@ inline unsigned int adjust_vec_size(unsigned int vec_size, size_t dim0)
return vec_size;
}
+/** Returns the suffix string of CPU kernel implementation names based on the given data type
+ *
+ * @param[in] data_type The data type the CPU kernel implementation uses
+ *
+ * @return the suffix string of CPU kernel implementations
+ */
+inline std::string cpu_impl_dt(const DataType &data_type)
+{
+ std::string ret = "";
+
+ switch(data_type)
+ {
+ case DataType::F32:
+ ret = "fp32";
+ break;
+ case DataType::F16:
+ ret = "fp16";
+ break;
+ case DataType::U8:
+ ret = "u8";
+ break;
+ case DataType::S16:
+ ret = "s16";
+ break;
+ case DataType::S32:
+ ret = "s32";
+ break;
+ case DataType::QASYMM8:
+ ret = "qu8";
+ break;
+ case DataType::QASYMM8_SIGNED:
+ ret = "qs8";
+ break;
+ case DataType::QSYMM16:
+ ret = "qs16";
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported.");
+ }
+
+ return ret;
+}
+
#ifdef ARM_COMPUTE_ASSERTS_ENABLED
/** Print consecutive elements to an output stream.
*
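
As a usage sketch for the new cpu_impl_dt() helper above: it maps a DataType to the short suffix used in CPU kernel implementation names. The "neon_"/"sve_" prefixes below are assumptions for illustration only.

    #include "arm_compute/core/Utils.h"
    #include <iostream>

    int main()
    {
        using namespace arm_compute;
        // cpu_impl_dt() returns the data-type suffix ("fp32", "fp16", "qu8", ...).
        std::cout << "neon_" << cpu_impl_dt(DataType::F32)     << std::endl; // prints "neon_fp32"
        std::cout << "sve_"  << cpu_impl_dt(DataType::QASYMM8) << std::endl; // prints "sve_qu8"
        return 0;
    }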
diff --git a/arm_compute/graph/DataLayerVisitor.h b/arm_compute/graph/DataLayerVisitor.h
index 670b9f02b..ac7f1c84e 100644
--- a/arm_compute/graph/DataLayerVisitor.h
+++ b/arm_compute/graph/DataLayerVisitor.h
@@ -48,6 +48,7 @@ public:
void visit(ConvolutionLayerNode &n) override;
void visit(DepthwiseConvolutionLayerNode &n) override;
void visit(FusedConvolutionBatchNormalizationNode &n) override;
+ void visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n) override;
void visit(FusedDepthwiseConvolutionBatchNormalizationNode &n) override;
void visit(OutputNode &n) override;
diff --git a/arm_compute/graph/INodeVisitor.h b/arm_compute/graph/INodeVisitor.h
index 4cb601fb0..97e95336e 100644
--- a/arm_compute/graph/INodeVisitor.h
+++ b/arm_compute/graph/INodeVisitor.h
@@ -106,6 +106,11 @@ public:
* @param[in] n Node to visit.
*/
virtual void visit(FusedConvolutionBatchNormalizationNode &n) = 0;
+ /** Visit FusedConvolutionBatchNormalizationWithPostOpsNode.
+ *
+ * @param[in] n Node to visit.
+ */
+ virtual void visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n) = 0;
/** Visit FusedConvolutionWithPostOpNode.
*
* @param[in] n Node to visit.
@@ -210,6 +215,7 @@ public:
virtual void visit(FlattenLayerNode &n) override;
virtual void visit(FullyConnectedLayerNode &n) override;
virtual void visit(FusedConvolutionBatchNormalizationNode &n) override;
+ virtual void visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n) override;
virtual void visit(FusedConvolutionWithPostOpNode &n) override;
virtual void visit(FusedDepthwiseConvolutionBatchNormalizationNode &n) override;
virtual void visit(InputNode &n) override;
diff --git a/arm_compute/graph/TypePrinter.h b/arm_compute/graph/TypePrinter.h
index a8a20c9de..8f97bbf84 100644
--- a/arm_compute/graph/TypePrinter.h
+++ b/arm_compute/graph/TypePrinter.h
@@ -116,6 +116,9 @@ inline ::std::ostream &operator<<(::std::ostream &os, const NodeType &node_type)
case NodeType::FusedConvolutionBatchNormalizationLayer:
os << "FusedConvolutionBatchNormalizationLayer";
break;
+ case NodeType::FusedConvolutionBatchNormalizationLayerWithPostOpsLayer:
+ os << "FusedConvolutionBatchNormalizationLayerWithPostOpsLayer";
+ break;
case NodeType::FusedConvolutionWithPostOp:
os << "FusedConvolutionWithPostOp";
break;
diff --git a/arm_compute/graph/Types.h b/arm_compute/graph/Types.h
index e802e9dc7..ff33d5037 100644
--- a/arm_compute/graph/Types.h
+++ b/arm_compute/graph/Types.h
@@ -216,6 +216,7 @@ enum class NodeType
FullyConnectedLayer,
FusedConvolutionBatchNormalizationLayer,
FusedConvolutionWithPostOp,
+ FusedConvolutionBatchNormalizationLayerWithPostOpsLayer,
FusedDepthwiseConvolutionBatchNormalizationLayer,
GenerateProposalsLayer,
L2NormalizeLayer,
diff --git a/arm_compute/graph/backends/FunctionHelpers.h b/arm_compute/graph/backends/FunctionHelpers.h
index 1e420a803..803283e20 100644
--- a/arm_compute/graph/backends/FunctionHelpers.h
+++ b/arm_compute/graph/backends/FunctionHelpers.h
@@ -32,6 +32,7 @@
#include "arm_compute/graph/Types.h"
#include "arm_compute/graph/Utils.h"
#include "arm_compute/graph/backends/FusedConvolutionBatchNormalizationFunction.h"
+#include "arm_compute/graph/backends/FusedConvolutionBatchNormalizationWithPostOpsFunction.h"
#include "arm_compute/graph/backends/FusedDepthwiseConvolutionBatchNormalizationFunction.h"
#include "arm_compute/graph/backends/Utils.h"
#include "arm_compute/graph/nodes/Nodes.h"
@@ -540,7 +541,7 @@ std::unique_ptr<IFunction> create_convolution_layer(ConvolutionLayerNode &node,
return std::move(func);
}
-/** Create a backend convolution layer function with post opreator
+/** Create a backend convolution layer function with post operator
*
* @tparam ConvolutionLayerFunctions Backend convolution functions
* @tparam TargetInfo Target-specific information
@@ -629,6 +630,90 @@ std::unique_ptr<IFunction> create_fused_convolution_with_post_op(FusedConvolutio
<< " Output shape: " << output->info()->tensor_shape()
<< qss.str()
<< (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "")
+ << " Post ops" << post_ops
+ << std::endl);
+ return std::move(func);
+}
+
+/** Create a backend convolution batch normalization layer function with post operator
+ *
+ * @tparam FusedLayerTypes Backend convolution functions
+ * @tparam TargetInfo Target-specific information
+ *
+ * @param[in] node Node to create the backend function for
+ * @param[in] ctx Graph context
+ *
+ * @return Backend fused convolution with batch normalization layer function
+ */
+template <typename FusedLayerTypes, typename TargetInfo>
+std::unique_ptr<IFunction> create_fused_convolution_batch_normalization_with_post_op(FusedConvolutionBatchNormalizationWithPostOpsNode &node, GraphContext &ctx)
+{
+ validate_node<TargetInfo>(node, 8 /* expected inputs */, 1 /* expected outputs */);
+
+ // Extract IO and info
+ typename TargetInfo::TensorType *input = get_backing_tensor<TargetInfo>(node.input(0));
+ typename TargetInfo::TensorType *weights = get_backing_tensor<TargetInfo>(node.input(1));
+ typename TargetInfo::TensorType *biases = get_backing_tensor<TargetInfo>(node.input(2));
+ typename TargetInfo::TensorType *mean = get_backing_tensor<TargetInfo>(node.input(3));
+ typename TargetInfo::TensorType *var = get_backing_tensor<TargetInfo>(node.input(4));
+ typename TargetInfo::TensorType *beta = get_backing_tensor<TargetInfo>(node.input(5));
+ typename TargetInfo::TensorType *gamma = get_backing_tensor<TargetInfo>(node.input(6));
+
+ typename TargetInfo::TensorType *output = get_backing_tensor<TargetInfo>(node.output(0));
+
+ const PadStrideInfo conv_info = node.convolution_info();
+ const unsigned int num_groups = node.num_groups();
+ const bool fast_math = node.fast_math_hint() == FastMathHint::Enabled;
+ const float epsilon = node.epsilon();
+
+ experimental::PostOpList<typename TargetInfo::TensorType *> post_ops;
+
+ auto &post_op_info_list = node.post_op_info_list();
+ for(const auto &post_op_info : post_op_info_list)
+ {
+ switch(post_op_info->type())
+ {
+ case PostOpType::Activation:
+ {
+ const auto act_info = utils::cast::polymorphic_downcast<const ConvPostOpInfoActivation *>(post_op_info.get());
+ post_ops.template push_back_op<experimental::PostOpAct<typename TargetInfo::TensorType *>>(act_info->_act);
+ break;
+ }
+ case PostOpType::Eltwise_Add:
+ {
+ typename TargetInfo::TensorType *add_input = get_backing_tensor<TargetInfo>(node.input(3));
+ const auto eltwise_info = utils::cast::polymorphic_downcast<const ConvPostOpInfoEltwiseAdd *>(post_op_info.get());
+ post_ops.template push_back_op<experimental::PostOpEltwiseAdd<typename TargetInfo::TensorType *>>(add_input, eltwise_info->_prev_op_dst_pos, eltwise_info->_policy);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported PostOpType");
+ }
+ }
+ }
+
+ // Create and configure function (we assume that functions have been validated before creation)
+ std::shared_ptr<IMemoryManager> mm = get_memory_manager(ctx, TargetInfo::TargetType);
+ std::unique_ptr<IFunction> func;
+ std::string func_name;
+
+ using FType = FusedConvolutionBatchNormalizationWithPostOpsFunction<TargetInfo, FusedLayerTypes>;
+
+ // Create and configure function
+ std::tie(func, func_name) = create_named_memory_managed_function<FType>(
+ std::string("FusedConvolutionBatchNormalizationLayerWithPostOpsLayer"), mm, input, weights, biases, output, mean, var, beta, gamma, epsilon, conv_info, num_groups, fast_math, post_ops);
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
+ << node.name()
+ << " Type: " << node.type()
+ << " Target: " << TargetInfo::TargetType
+ << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Weights shape: " << weights->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << " Post Ops:" << post_ops
<< std::endl);
return std::move(func);
}
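
For reference, a hedged sketch of assembling the same kind of post-op list outside the graph backends, templated on ITensorInfo* instead of a backend tensor type; the helper name, the header for the concrete post-op types and the argument values are assumptions for illustration.

    #include "arm_compute/core/ITensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/experimental/IPostOp.h"
    #include "arm_compute/core/experimental/PostOps.h" // assumed header for PostOpAct / PostOpEltwiseAdd

    void build_post_ops_example(arm_compute::ITensorInfo *addend) // hypothetical helper
    {
        using namespace arm_compute;
        experimental::PostOpList<ITensorInfo *> post_ops{};
        // Activation post op, mirroring the PostOpType::Activation case handled above.
        post_ops.push_back_op<experimental::PostOpAct<ITensorInfo *>>(
            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
        // Element-wise addition post op, mirroring the PostOpType::Eltwise_Add case handled above.
        post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo *>>(
            addend, 1 /* prev_op_dst_pos */, ConvertPolicy::SATURATE);
        (void)post_ops; // would typically be forwarded to a convolution configure() call
    }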
diff --git a/arm_compute/graph/backends/FusedConvolutionBatchNormalizationWithPostOpsFunction.h b/arm_compute/graph/backends/FusedConvolutionBatchNormalizationWithPostOpsFunction.h
new file mode 100644
index 000000000..10f2e5c25
--- /dev/null
+++ b/arm_compute/graph/backends/FusedConvolutionBatchNormalizationWithPostOpsFunction.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_GRAPH_BACKENDS_FUSED_CONVOLUTION_BATCH_NORMAZLIZATION_WITH_POST_OPS_FUNCTION_H
+#define ARM_COMPUTE_GRAPH_BACKENDS_FUSED_CONVOLUTION_BATCH_NORMAZLIZATION_WITH_POST_OPS_FUNCTION_H
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/experimental/IPostOp.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace backends
+{
+/** Wrapper function to first apply {NE, CL}BatchNormalizationLayer on the weights and then run {NE, CL}ConvolutionLayer with the modified weights */
+template <typename TargetInfo, typename FusedLayerTypes>
+class FusedConvolutionBatchNormalizationWithPostOpsFunction : public IFunction
+{
+public:
+ using TensorType = typename TargetInfo::TensorType;
+ using TensorConcreteType = typename TargetInfo::TensorConcreteType;
+
+ FusedConvolutionBatchNormalizationWithPostOpsFunction(std::shared_ptr<IMemoryManager> memory_manager = nullptr)
+ : _conv_layer(memory_manager), _fused_batch_norm_layer(), _fused_bias(), _is_prepared(false)
+ {
+ }
+
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: QASYMM8/F16/F32.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
+ * @param[in] bias Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p input data type.
+ * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] beta Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for beta is 0. Data types supported: Same as @p input
+ * @param[in] gamma Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for gamma is 1. Data types supported: Same as @p input
+ * @param[in] epsilon Small value to avoid division with zero. Default value is 0.001f.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] num_groups Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
+ * @param[in] fast_math Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
+ * available which may introduce a drop of accuracy as well. Default is false
+ * @param[in] post_ops A sequence of post operations that are performed after the main operation.
+ *
+ */
+ void configure(TensorType *input,
+ TensorType *weights,
+ TensorType *bias,
+ TensorType *output,
+ const TensorType *mean,
+ const TensorType *var,
+ const TensorType *beta,
+ const TensorType *gamma,
+ float epsilon, const PadStrideInfo &conv_info, unsigned int num_groups, bool fast_math,
+ const arm_compute::experimental::PostOpList<TensorType *> &post_ops = experimental::PostOpList<TensorType *> {})
+ {
+ // We don't run any validate, as we assume that the layers have been already validated
+ const bool has_bias = (bias != nullptr);
+ const TensorType *bias_to_use;
+
+ // We check if the layer has a bias. If yes, use it in-place. If not, we need to create one
+ // as batch normalization might end up with a bias != 0
+ if(has_bias)
+ {
+ _fused_batch_norm_layer.configure(weights, mean, var, nullptr, nullptr, bias, beta, gamma, epsilon);
+ bias_to_use = bias;
+ }
+ else
+ {
+ _fused_batch_norm_layer.configure(weights, mean, var, nullptr, &_fused_bias, nullptr, beta, gamma, epsilon);
+ bias_to_use = &_fused_bias;
+ }
+
+ ActivationLayerInfo fused_act = ActivationLayerInfo(); // Passing an empty ActivationLayerInfo.
+ _conv_layer.configure(input, weights, bias_to_use, output, conv_info, WeightsInfo(), Size2D(1U, 1U), fused_act, fast_math, num_groups, post_ops);
+
+ if(!has_bias)
+ {
+ _fused_bias.allocator()->allocate();
+ }
+ }
+
+ // Inherited methods overridden:
+ void run()
+ {
+ prepare();
+ _conv_layer.run();
+ }
+
+ void prepare()
+ {
+ if(!_is_prepared)
+ {
+ _fused_batch_norm_layer.run();
+ _is_prepared = true;
+ }
+ }
+
+private:
+ typename FusedLayerTypes::ConvolutionLayer _conv_layer;
+ typename FusedLayerTypes::FuseBatchNormalization _fused_batch_norm_layer;
+ TensorConcreteType _fused_bias;
+ bool _is_prepared;
+};
+} // namespace backends
+} // namespace graph
+} // namespace arm_compute
+
+#endif /* ARM_COMPUTE_GRAPH_BACKENDS_FUSED_CONVOLUTION_BATCH_NORMAZLIZATION_WITH_POST_OPS_FUNCTION_H */
diff --git a/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h b/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h
index b3661c300..b0051b138 100644
--- a/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h
+++ b/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019, 2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -100,7 +100,7 @@ public:
*/
ConvolutionMethod convolution_method() const;
- /** Sets the fast math fast hint
+ /** Sets the fast math hint
*
* @param[in] hint Hint to use for convolution
*/
diff --git a/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.h b/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.h
new file mode 100644
index 000000000..a42e06d88
--- /dev/null
+++ b/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_GRAPH_FUSED_CONVOLUTION_BATCH_NORMALIZATION_WITH_POST_OPS_NODE_H
+#define ARM_COMPUTE_GRAPH_FUSED_CONVOLUTION_BATCH_NORMALIZATION_WITH_POST_OPS_NODE_H
+
+#include "arm_compute/graph/INode.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+/** Fused Convolution Batch Normalization with Post Ops node */
+class FusedConvolutionBatchNormalizationWithPostOpsNode final : public INode
+{
+public:
+ /** Constructor
+ *
+ * @param[in] epsilon Epsilon parameter.
+ * @param[in] info Convolution layer attributes.
+ * @param[in] num_groups (Optional) Number of groups (Defaults to 1)
+ * @param[in] method (Optional) Convolution method to use
+ * @param[in] fast_math_hint (Optional) Fast math hint
+ */
+ FusedConvolutionBatchNormalizationWithPostOpsNode(float epsilon, PadStrideInfo info,
+ unsigned int num_groups = 1,
+ ConvolutionMethod method = ConvolutionMethod::Default,
+ FastMathHint fast_math_hint = FastMathHint::Disabled);
+
+ /** Epsilon parameter accessor
+ *
+ * @return Epsilon parameter
+ */
+ float epsilon() const;
+
+ /** Computes convolution output descriptor
+ *
+ * @param[in] input_descriptor Input descriptor
+ * @param[in] weights_descriptor Weights descriptor
+ * @param[in] info Convolution operation attributes
+ *
+ * @return Output descriptor
+ */
+ static TensorDescriptor compute_output_descriptor(const TensorDescriptor &input_descriptor,
+ const TensorDescriptor &weights_descriptor,
+ const PadStrideInfo &info);
+
+ /** Sets the convolution layer method to use
+ *
+ * @param[in] method Method to use for convolution
+ */
+ void set_convolution_method(ConvolutionMethod method);
+
+ /** Number of groups in convolution accessor
+ *
+ * @return Number of groups in convolution
+ */
+ unsigned int num_groups() const;
+
+ /** Convolution layer method accessor
+ *
+ * @note This is an indication on which convolution layer implementation to use,
+ * if it fails to be created the library's heuristic approach will be used
+ *
+ * @return Convolution layer method to be used by the node
+ */
+ ConvolutionMethod convolution_method() const;
+
+ /** Sets the fast math hint
+ *
+ * @param[in] hint Hint to use for convolution
+ */
+ void set_fast_math_hint(FastMathHint hint);
+
+ /** Fast math hint accessor
+ *
+ * @return Fast math hint to be used by the node
+ */
+ FastMathHint fast_math_hint() const;
+
+ /** Convolution metadata accessor
+ *
+ * @return Convolution information
+ */
+ PadStrideInfo convolution_info() const;
+
+ // Inherited overridden methods:
+ NodeType type() const override;
+ bool forward_descriptors() override;
+ TensorDescriptor configure_output(size_t idx) const override;
+ void accept(INodeVisitor &v) override;
+
+public:
+ static constexpr NodeType node_type = NodeType::FusedConvolutionBatchNormalizationLayerWithPostOpsLayer;
+
+private:
+ float _epsilon;
+
+ PadStrideInfo _info;
+ unsigned int _num_groups;
+ ConvolutionMethod _method;
+ FastMathHint _fast_math_hint;
+};
+
+} // namespace graph
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_GRAPH_FUSED_CONVOLUTION_BATCH_NORMALIZATION_WITH_POST_OPS_NODE_H */
diff --git a/arm_compute/graph/nodes/Nodes.h b/arm_compute/graph/nodes/Nodes.h
index fb0eb155f..3887eaeac 100644
--- a/arm_compute/graph/nodes/Nodes.h
+++ b/arm_compute/graph/nodes/Nodes.h
@@ -43,6 +43,7 @@
#include "arm_compute/graph/nodes/FlattenLayerNode.h"
#include "arm_compute/graph/nodes/FullyConnectedLayerNode.h"
#include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h"
+#include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.h"
#include "arm_compute/graph/nodes/FusedConvolutionWithPostOpNode.h"
#include "arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h"
#include "arm_compute/graph/nodes/GenerateProposalsLayerNode.h"
diff --git a/arm_compute/graph/nodes/NodesFwd.h b/arm_compute/graph/nodes/NodesFwd.h
index 6393b1d5d..f1576d633 100644
--- a/arm_compute/graph/nodes/NodesFwd.h
+++ b/arm_compute/graph/nodes/NodesFwd.h
@@ -51,6 +51,7 @@ class FullyConnectedLayerNode;
class FusedConvolutionBatchNormalizationNode;
class FusedConvolutionWithPostOpNode;
class FusedDepthwiseConvolutionBatchNormalizationNode;
+class FusedConvolutionBatchNormalizationWithPostOpsNode;
class GenerateProposalsLayerNode;
class InputNode;
class L2NormalizeLayerNode;
diff --git a/arm_compute/graph/printers/DotGraphPrinter.h b/arm_compute/graph/printers/DotGraphPrinter.h
index 42a20678f..63b89272f 100644
--- a/arm_compute/graph/printers/DotGraphPrinter.h
+++ b/arm_compute/graph/printers/DotGraphPrinter.h
@@ -57,6 +57,7 @@ public:
void visit(DepthwiseConvolutionLayerNode &n) override;
void visit(EltwiseLayerNode &n) override;
void visit(FusedConvolutionBatchNormalizationNode &n) override;
+ void visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n) override;
void visit(FusedConvolutionWithPostOpNode &n) override;
void visit(FusedDepthwiseConvolutionBatchNormalizationNode &n) override;
void visit(NormalizationLayerNode &n) override;
diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h
index 3dd635b14..9f67f2e05 100644
--- a/arm_compute/runtime/CL/CLFunctions.h
+++ b/arm_compute/runtime/CL/CLFunctions.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -94,7 +94,6 @@
#include "arm_compute/runtime/CL/functions/CLRange.h"
#include "arm_compute/runtime/CL/functions/CLReduceMean.h"
#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
-#include "arm_compute/runtime/CL/functions/CLRemap.h"
#include "arm_compute/runtime/CL/functions/CLReorgLayer.h"
#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
#include "arm_compute/runtime/CL/functions/CLReverse.h"
diff --git a/arm_compute/runtime/CL/functions/CLPoolingLayer.h b/arm_compute/runtime/CL/functions/CLPoolingLayer.h
index 1975e1547..2163c1680 100644
--- a/arm_compute/runtime/CL/functions/CLPoolingLayer.h
+++ b/arm_compute/runtime/CL/functions/CLPoolingLayer.h
@@ -66,6 +66,9 @@ public:
* |F16 |F16 |
* |F32 |F32 |
*
+ * @note Source tensor is padded with -inf for MAX pooling and 0 otherwise
+ * Cases where pooling region is completely outside input tensor are not supported
+ *
* @param[in,out] input Source tensor. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[out] output Destination tensor. Data types supported: Same as @p input.
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
diff --git a/arm_compute/runtime/CL/functions/CLRemap.h b/arm_compute/runtime/CL/functions/CLRemap.h
deleted file mode 100644
index ae75817d0..000000000
--- a/arm_compute/runtime/CL/functions/CLRemap.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLREMAP_H
-#define ARM_COMPUTE_CLREMAP_H
-
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class CLCompileContext;
-class ICLTensor;
-class ITensorInfo;
-
-/** Basic function to execute remap. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLRemapKernel
- *
- */
-class CLRemap : public ICLSimpleFunction
-{
-public:
- /** Initialise the function's sources, destination, interpolation policy and border mode.
- *
- * Valid data layouts:
- * - All
- *
- * Valid data type configurations:
- * |src0 |src1 |src2 |dst |
- * |:------|:------|:------|:------|
- * |U8 |F32 |F32 |U8 |
- * |F16 |F32 |F32 |F16 |
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[in] map_x Map for X coords. Data types supported: F32.
- * @param[in] map_y Map for Y coords. Data types supported: F32.
- * @param[out] output Output tensor. Data types supported: U8.
- * @param[in] policy Interpolation policy to use. Only NEAREST and BILINEAR are supported.
- * @param[in] border_mode Border mode to use on the input tensor. Only CONSTANT and UNDEFINED are supported.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- *
- * @deprecated This function is deprecated and is intended to be removed in 22.02 release
- *
- */
- ARM_COMPUTE_DEPRECATED_REL(21.08)
- void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output,
- InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
-
- /** Initialise the function's sources, destination, interpolation policy and border mode.
- *
- * Similar to @ref CLRemap::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output,
- InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
- *
- * @deprecated This function is deprecated and is intended to be removed in 22.02 release
- *
- */
- ARM_COMPUTE_DEPRECATED_REL(21.08)
- void configure(ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output,
- InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
-
- /** Initialise the function's sources, destination, interpolation policy and border mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] input Source tensor. Data types supported: U8,(or F16 when layout is NHWC). (Written to only for @p border_mode != UNDEFINED)
- * @param[in] map_x Map for X coords. Data types supported: F32.
- * @param[in] map_y Map for Y coords. Data types supported: F32.
- * @param[out] output Output tensor. Data types supported: Same as @p input.
- * @param[in] policy Interpolation policy to use. Only NEAREST and BILINEAR are supported.
- * @param[in] border_mode Border mode to use on the input tensor. Only CONSTANT and UNDEFINED are supported.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- *
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value = PixelValue{});
-
- /** Initialise the function's sources, destination, interpolation policy and border mode.
- *
- * Similar to @ref CLRemap::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value)
- *
- */
- void configure(ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value = PixelValue{});
-
- /** Checks if the kernel's input, output and border mode will lead to a valid configuration of @ref CLRemap
- *
- * Similar to @ref CLRemap::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value)
- *
- * @return a Status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *map_x, const ITensorInfo *map_y, const ITensorInfo *output,
- const InterpolationPolicy policy, const BorderMode border_mode, PixelValue constant_border_value = PixelValue{});
-};
-}
-#endif /*ARM_COMPUTE_CLREMAP_H */
diff --git a/arm_compute/runtime/IScheduler.h b/arm_compute/runtime/IScheduler.h
index 3759fee8a..129009c58 100644
--- a/arm_compute/runtime/IScheduler.h
+++ b/arm_compute/runtime/IScheduler.h
@@ -152,7 +152,7 @@ public:
*/
virtual void set_num_threads_with_affinity(unsigned int num_threads, BindFunc func);
- /** Returns the number of threads that the SingleThreadScheduler has in his pool.
+ /** Returns the number of threads that the SingleThreadScheduler has in its pool.
*
* @return Number of threads available in SingleThreadScheduler.
*/
@@ -215,6 +215,19 @@ protected:
*/
void schedule_common(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors);
+ /** Adjust the number of windows to optimize performance
+ * (used for small workloads where a smaller number of threads might improve performance)
+ *
+ * @param[in] window Window to use for kernel execution
+ * @param[in] split_dimension Axis of dimension to split
+ * @param[in] init_num_windows Initial number of sub-windows to split
+ * @param[in] kernel Kernel to execute
+ * @param[in] cpu_info The CPU platform used to create the context.
+ *
+ * @return Adjusted number of windows
+ */
+ std::size_t adjust_num_of_windows(const Window &window, std::size_t split_dimension, std::size_t init_num_windows, const ICPPKernel &kernel, const CPUInfo &cpu_info);
+
private:
unsigned int _num_threads_hint = {};
};
diff --git a/arm_compute/runtime/NEON/NEFunctions.h b/arm_compute/runtime/NEON/NEFunctions.h
index 71009fb45..562ade1d5 100644
--- a/arm_compute/runtime/NEON/NEFunctions.h
+++ b/arm_compute/runtime/NEON/NEFunctions.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -90,7 +90,6 @@
#include "arm_compute/runtime/NEON/functions/NERange.h"
#include "arm_compute/runtime/NEON/functions/NEReduceMean.h"
#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
-#include "arm_compute/runtime/NEON/functions/NERemap.h"
#include "arm_compute/runtime/NEON/functions/NEReorgLayer.h"
#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
#include "arm_compute/runtime/NEON/functions/NEReverse.h"
diff --git a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
index 9398e1fce..9147ad968 100644
--- a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
@@ -71,6 +71,8 @@ public:
* |F32 |F32 |
*
* @note F16 is supported for pool sizes 2 and 3 only
+ * @note Source tensor is padded with -inf for MAX pooling and 0 otherwise
+ * Cases where pooling region is completely outside input tensor are only supported for floating point data type
*
* @param[in, out] input Source tensor. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[out] output Destination tensor. Data types supported: Same as @p input.
diff --git a/arm_compute/runtime/NEON/functions/NERemap.h b/arm_compute/runtime/NEON/functions/NERemap.h
deleted file mode 100644
index 5d50517b0..000000000
--- a/arm_compute/runtime/NEON/functions/NERemap.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEREMAP_H
-#define ARM_COMPUTE_NEREMAP_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to execute remap. This function calls the following kernels:
- *
- * -# @ref NERemapKernel
- */
-class NERemap : public INESimpleFunctionNoBorder
-{
-public:
- /** Initialise the function's sources, destination, interpolation policy and border mode.
- *
- * Valid data layouts:
- * - All
- *
- * Valid data type configurations:
- * |src0 |src1 |src2 |dst |
- * |:------|:------|:------|:------|
- * |U8 |F32 |F32 |U 8 |
- *
- * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[in] map_x Map for X coordinates. Data type supported: F32.
- * @param[in] map_y Map for Y coordinates. Data type supported: F32.
- * @param[out] output Output tensor. Data type supported: U8.
- * @param[in] policy Interpolation policy to use. Only NEAREST and BILINEAR are supported.
- * @param[in] border_mode Border mode to use on the input tensor.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. Defaults to 0.
- *
- * @deprecated This function is deprecated and is intended to be removed in 22.02 release
- *
- */
- ARM_COMPUTE_DEPRECATED_REL(21.08)
- void configure(ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output,
- InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
- /** Initialise the function's sources, destination, interpolation policy and border mode.
- *
- * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[in] map_x Map for X coordinates. Data type supported: F32.
- * @param[in] map_y Map for Y coordinates. Data type supported: F32.
- * @param[out] output Output tensor. Data type supported: U8.
- * @param[in] policy Interpolation policy to use. Only NEAREST and BILINEAR are supported.
- * @param[in] border_mode Border mode to use on the input tensor.
- * @param[in] constant_border_value Constant value to use for borders if border_mode is set to CONSTANT.
- *
- */
- void configure(ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value);
-};
-}
-#endif /*ARM_COMPUTE_NEREMAP_H */
diff --git a/docs/Doxyfile b/docs/Doxyfile
index 20780c712..08be22891 100644
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -38,7 +38,7 @@ PROJECT_NAME = "Compute Library"
# could be handy for archiving the generated documentation or if some version
# control system is used.
-PROJECT_NUMBER = 21.11
+PROJECT_NUMBER = 22.02
# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a
@@ -364,7 +364,7 @@ SUBGROUPING = YES
# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
# are shown inside the group in which they are included (e.g. using \ingroup)
-# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
+# instead of on a separate page (for HTML and Manual pages) or section (for LaTeX
# and RTF).
#
# Note that this feature does not work in combination with
@@ -378,7 +378,7 @@ INLINE_GROUPED_CLASSES = NO
# the documentation of the scope in which they are defined (i.e. file,
# namespace, or group documentation), provided this scope is documented. If set
# to NO, structs, classes, and unions are shown on a separate page (for HTML and
-# Man pages) or section (for LaTeX and RTF).
+# Manual pages) or section (for LaTeX and RTF).
# The default value is: NO.
INLINE_SIMPLE_STRUCTS = NO
@@ -1882,16 +1882,16 @@ RTF_EXTENSIONS_FILE =
#RTF_SOURCE_CODE = NO
#---------------------------------------------------------------------------
-# Configuration options related to the man page output
+# Configuration options related to the manual page output
#---------------------------------------------------------------------------
-# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for
+# If the GENERATE_MAN tag is set to YES, doxygen will generate manual pages for
# classes and files.
# The default value is: NO.
GENERATE_MAN = NO
-# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
+# The MAN_OUTPUT tag is used to specify where the manual pages will be put. If a
# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
# it. A directory man3 will be created inside the directory specified by
# MAN_OUTPUT.
@@ -1901,7 +1901,7 @@ GENERATE_MAN = NO
MAN_OUTPUT = man
# The MAN_EXTENSION tag determines the extension that is added to the generated
-# man pages. In case the manual section does not start with a number, the number
+# manual pages. In case the manual section does not start with a number, the number
# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
# optional.
# The default value is: .3.
@@ -1910,15 +1910,15 @@ MAN_OUTPUT = man
MAN_EXTENSION = .3
# The MAN_SUBDIR tag determines the name of the directory created within
-# MAN_OUTPUT in which the man pages are placed. If defaults to man followed by
+# MAN_OUTPUT in which the manual pages are placed. If defaults to man followed by
# MAN_EXTENSION with the initial . removed.
# This tag requires that the tag GENERATE_MAN is set to YES.
#MAN_SUBDIR =
# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
-# will generate one additional man file for each entity documented in the real
-# man page(s). These additional files only source the real man page, but without
+# will generate one additional manual file for each entity documented in the real
+# manual page(s). These additional files only source the real manual page, but without
# them the man command would be unable to find the correct page.
# The default value is: NO.
# This tag requires that the tag GENERATE_MAN is set to YES.
diff --git a/docs/contributor_guide/contribution_guidelines.dox b/docs/contributor_guide/contribution_guidelines.dox
index f3a6def58..ab02adfc3 100644
--- a/docs/contributor_guide/contribution_guidelines.dox
+++ b/docs/contributor_guide/contribution_guidelines.dox
@@ -1,5 +1,5 @@
///
-/// Copyright (c) 2019-2020 Arm Limited.
+/// Copyright (c) 2019-2022 Arm Limited.
///
/// SPDX-License-Identifier: MIT
///
@@ -35,6 +35,13 @@ The development is structured in the following way:
- Development repository: https://review.mlplatform.org/#/admin/projects/ml/ComputeLibrary
- Please report issues here: https://github.com/ARM-software/ComputeLibrary/issues
+@section S5_0_inc_lang Inclusive language guideline
+As part of the initiative to use inclusive language, certain phrases and words have been removed or replaced with more inclusive ones. Examples include, but are not limited to:
+\includedoc non_inclusive_language_examples.dox
+
+Please also follow this guideline when committing changes to Compute Library.
+
+Furthermore, starting from the next release (22.05), the 'master' branch will no longer be used; it will be replaced by 'main'. Please update your clone jobs accordingly.
@section S5_1_coding_standards Coding standards and guidelines
Best practices (as suggested by clang-tidy):
@@ -444,7 +451,7 @@ Once a patch is uploaded for review, there is a pre-commit test that runs on a J
- get a "+1 Comments-Addressed", in case of comments from reviewers the committer has to address them all. A comment is considered addressed when the first line of the reply contains the word "Done"
- get a "+2" from a reviewer, that means the patch has the final approval
-At the moment, the Jenkins server is not publicly accessible and for security reasons patches submitted by non-whitelisted committers do not trigger the pre-commit tests. For this reason, one of the maintainers has to manually trigger the job.
+At the moment, the Jenkins server is not publicly accessible and for security reasons patches submitted by non-allowlisted committers do not trigger the pre-commit tests. For this reason, one of the maintainers has to manually trigger the job.
If the pre-commit test fails, the Jenkins job will post a comment on Gerrit with the details about the failure so that the committer will be able to reproduce the error and fix the issue, if any (sometimes there can be infrastructure issues, a test platform disconnecting for example, where the job needs to be retriggered).
diff --git a/docs/contributor_guide/non_inclusive_language_examples.dox b/docs/contributor_guide/non_inclusive_language_examples.dox
new file mode 100644
index 000000000..addfdd34d
--- /dev/null
+++ b/docs/contributor_guide/non_inclusive_language_examples.dox
@@ -0,0 +1,4 @@
+ - master/slave
+ - black/white
+ - he/she, him/her, his/hers
+ - When referring to a person where gender is irrelevant or unknown, kindly use they, them, theirs, or a person’s preferred pronoun. \ No newline at end of file
diff --git a/docs/user_guide/errata.dox b/docs/user_guide/errata.dox
index 47b4ef23b..56a0e789e 100644
--- a/docs/user_guide/errata.dox
+++ b/docs/user_guide/errata.dox
@@ -1,5 +1,5 @@
///
-/// Copyright (c) 2019-2021 Arm Limited.
+/// Copyright (c) 2019-2022 Arm Limited.
///
/// SPDX-License-Identifier: MIT
///
@@ -30,20 +30,20 @@ namespace arm_compute
@section S7_1_errata Errata
-- An issue has been identified when running the graph_deepspeech_v0_4_1 graph example.
+- (COMPMID-5134) An issue has been identified when running the graph_deepspeech_v0_4_1 graph example.
- Versions Affected: >= v21.08
- Conditions:
- Data type input: F32
- Backend: OpenCL
- Result: The execution of the graph_deepspeech_v0_4_1 could fail on OpenCL backend for systems with a small RAM. The issue is due to the extra temporary memory required to reshape the network weights
-- Experimented performance regressions for some networks on OpenCL when using Arm® Mali™ DDK r8p0
+- (COMPMID-4013) Observed performance regressions for some networks on OpenCL when using Arm® Mali™ DDK r8p0
- Versions Affected: v21.05
- OSs Affected: All
- Conditions:
- Arm® Mali™ DDK r8p0
-- Under certain conditions, CLFullyConnectedLayer quantized tests may fail due to an issue in the test framework.
+- (COMPMID-5146) Under certain conditions, CLFullyConnectedLayer quantized tests may fail due to an issue in the test framework.
- Versions Affected: v21.02
- OSs Affected: Linux
- Conditions:
@@ -51,7 +51,7 @@ namespace arm_compute
- release mode
- asserts enabled
-- Performance regression in Convolution Layer OpenCL backend on Mali™ G77 when QSYMM8_PER_CHANNEL is used as weights' data type.
+- (COMPMID-4367) Performance regression in Convolution Layer OpenCL backend on Mali™ G77 when QSYMM8_PER_CHANNEL is used as weights' data type.
- Versions Affected: >= v20.11 && < v21.08
- OSs Affected: All
- Conditions:
@@ -60,25 +60,25 @@ namespace arm_compute
- OpenCL backend
- Convolution Layer uses QSYMM8_PER_CHANNEL as the data type of its weight
-- A wrong test configuration has been found in CLGEMMMatrixMultiplyReshapedOnlyRHS set of tests.
+- (COMPMID-4306) A wrong test configuration has been found in CLGEMMMatrixMultiplyReshapedOnlyRHS set of tests.
- Versions Affected: >= v20.11 && < v21.05
- Conditions:
- Data type input: F32/F16
- Fused bounded relu activation with coefficient 'a' being negative
-- Under certain conditions, the validation test case 'CL/DirectConvolutionLayer/Float/FP32/RunSmall9x9\@InputShape=32x37x3x4:StrideX=1:StrideY=1:PadX=0:PadY=0:KernelSize=9:NumKernels=1:DataType=F32:ActivationInfo=LU_BOUNDED_RELU:DataLayout=NHWC' may fail.
+- (COMPMID-5135) Under certain conditions, the validation test case 'CL/DirectConvolutionLayer/Float/FP32/RunSmall9x9\@InputShape=32x37x3x4:StrideX=1:StrideY=1:PadX=0:PadY=0:KernelSize=9:NumKernels=1:DataType=F32:ActivationInfo=LU_BOUNDED_RELU:DataLayout=NHWC' may fail.
- Versions Affected: >= v20.08
- Conditions:
- The validation suite has to run in nightly mode and execute 40k+ test cases before the test mentioned above
-- Under certain conditions, benchmark examples can hang when OpenCL profiling queues are enabled.
+- (COMPMID-5136) Under certain conditions, benchmark examples can hang when OpenCL profiling queues are enabled.
- Versions Affected: >= v19.11
- OSs Affected: Linux
- Conditions:
- Arm® Mali™ DDK r1p0 - r8p0, and
- Linux kernel >= 4.4
-- On Android with arm64-v8a/arm64-v8.2-a architecture, Arm® Neon™ validation tests can fail when compiled using Android Ndk
+- (COMPMID-5137) On Android with arm64-v8a/arm64-v8.2-a architecture, Arm® Neon™ validation tests can fail when compiled using Android NDK
>= r18b in debug mode (https://github.com/android/ndk/issues/1135).
- Versions Affected: >= v19.11
- OSs Affected: Android
@@ -86,7 +86,7 @@ namespace arm_compute
- arm64-v8a/arm64-v8.2-a architecture, and
- Compiled using Android NDK >= r18b in debug mode.
-- An issue has been identified with CLCast.
+- (COMPMID-4288) An issue has been identified with CLCast.
- Versions Affected: >= v18.11 && < v21.05
- Conditions:
- Data type input: F32
diff --git a/docs/user_guide/introduction.dox b/docs/user_guide/introduction.dox
index a8c9926b8..3c0483eea 100644
--- a/docs/user_guide/introduction.dox
+++ b/docs/user_guide/introduction.dox
@@ -66,6 +66,10 @@ Several builds of the library are available using various configurations:
<td>arm64-v8a
<tr>
<td>armv8.2-a
+<tr>
+ <td rowspan="1">macOS
+ <td>armv8.2-a
+ <td>Monterey (OS version): clang 13 (native)
</table>
@section S0_1_contact Contact / Support
@@ -85,7 +89,7 @@ These binaries have been built using the following toolchains:
- Linux armv7a: gcc-linaro-7.2.1-2017.11-x86_64_arm-linux-gnueabihf
- Linux arm64-v8a: gcc-linaro-7.2.1-2017.11-x86_64_aarch64-linux-gnu
- Linux arm64-v8.2-a: gcc-linaro-7.2.1-2017.11-x86_64_aarch64-linux-gnu
- - Linux arm64-v8.2-a (fat binary): gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu
+ - Linux arm64-v8.2-a (multi-ISA binary): gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu
- Linux armv8.2a-sve: gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu
- Android armv7a: clang++ / libc++ NDK r20b
- Android am64-v8a: clang++ / libc++ NDK r20b
diff --git a/docs/user_guide/library.dox b/docs/user_guide/library.dox
index fc08dbc43..7a45fe9d9 100644
--- a/docs/user_guide/library.dox
+++ b/docs/user_guide/library.dox
@@ -555,9 +555,9 @@ The responsibilities of the operators can be summarized as follows:
- Providing information to the caller required by the computation (e.g., memory requirements)
- Allocation of any required auxiliary memory if it isn't given by its caller explicitly
-@subsection architecture_experimental_build_fat_binary Build fat binary
+@subsection architecture_experimental_build_multi_isa Build multi-ISA binary
-Selecting fat_binary when building Compute Library, will create a library that contains all the supported ISA features.
+Selecting multi_isa when building Compute Library will create a library that contains all the supported ISA features.
Based on the CPU support, the appropriate kernel will be selected at runtime for execution. Currently this option is
only supported with armv8.2-a as the base architecture.
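To make the runtime selection described above concrete, here is a minimal, self-contained conceptual sketch; it is not Compute Library code, and cpu_has_sve() stands in for whatever CPU-feature probe a real multi-ISA build performs.

```cpp
// Conceptual sketch only (not Compute Library code): how a multi-ISA binary
// can select one of several compiled-in kernel variants at runtime.
#include <cstdio>

// Placeholder CPU-feature probe; a real multi-ISA build would query the CPU.
static bool cpu_has_sve() { return false; }

// Two stand-in kernel variants; in a real build these would be compiled with
// different -march flags and contain ISA-specific code.
static void add_neon(const float *a, const float *b, float *dst, int n)
{
    for (int i = 0; i < n; ++i) dst[i] = a[i] + b[i];
}
static void add_sve(const float *a, const float *b, float *dst, int n)
{
    for (int i = 0; i < n; ++i) dst[i] = a[i] + b[i];
}

using AddFn = void (*)(const float *, const float *, float *, int);

static AddFn select_add_kernel()
{
    // The decision is made once, at runtime, so a single library binary
    // can serve CPUs both with and without the optional ISA extensions.
    return cpu_has_sve() ? add_sve : add_neon;
}

int main()
{
    float a[4] = {1, 2, 3, 4}, b[4] = {4, 3, 2, 1}, c[4] = {};
    select_add_kernel()(a, b, c, 4);
    std::printf("%f %f %f %f\n", c[0], c[1], c[2], c[3]);
}
```

The design choice illustrated here is that every variant is compiled into the one binary, and the cost of choosing is paid once at runtime rather than at build time.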
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index ffbb813e2..97a531c47 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -1,5 +1,5 @@
///
-/// Copyright (c) 2017-2021 Arm Limited.
+/// Copyright (c) 2017-2022 Arm Limited.
///
/// SPDX-License-Identifier: MIT
///
@@ -37,14 +37,48 @@ If there is more than one release in a month then an extra sequential number is
v17.04 (First release of April 2017)
@note We're aiming at releasing one major public release with new features per quarter. All releases in between will only contain bug fixes.
+@note Starting from the next release (22.05), the 'master' branch will no longer be used; it will be replaced by 'main'. Please update your clone jobs accordingly.
@section S2_2_changelog Changelog
+v22.02 Public major release
+ - Various bug fixes.
+ - Various optimizations.
+ - Update A510 arm_gemm CPU kernels.
+ - Inclusive language adjustment. Please refer to @ref S5_0_inc_lang for details.
+ - Improve the start-up time for the following OpenCL kernels:
+ - @ref CLScale
+ - @ref CLGEMM
+ - @ref CLDepthwiseConvolutionLayer
+ - \link opencl::kernels::ClIm2ColKernel ClIm2ColKernel \endlink
+ - \link opencl::kernels::ClDirectConv2dKernel ClDirectConv2dKernel \endlink
+ - Remove functions:
+ - CLRemap
+ - NERemap
+ - Remove padding from OpenCL kernels:
+ - \link opencl::kernels::ClDirectConv2dKernel ClDirectConv2dKernel \endlink
+ - Remove padding from Cpu kernels:
+ - \link cpu::kernels::CpuDirectConv2dKernel CpuDirectConv2dKernel \endlink
+ - Decouple the implementation of the following Cpu kernels into various data types (fp32, fp16, int):
+ - \link cpu::kernels::CpuActivationKernel CpuActivationKernel \endlink
+ - \link cpu::kernels::CpuAddKernel CpuAddKernel \endlink
+ - \link cpu::kernels::CpuElementwiseKernel CpuElementwiseKernel \endlink
+ - \link cpu::CpuSoftmaxGeneric CpuSoftmaxKernel \endlink
+ - @ref NEBoundingBoxTransformKernel
+ - @ref NECropKernel
+ - @ref NEComputeAllAnchorsKernel
+ - @ref NEInstanceNormalizationLayerKernel
+ - @ref NEMaxUnpoolingLayerKernel
+ - @ref NEMeanStdDevNormalizationKernel
+ - @ref NERangeKernel
+ - @ref NEROIAlignLayerKernel
+ - @ref NESelectKernel
+
v21.11 Public major release
- Various bug fixes.
- Various optimizations:
- Improve performance of bilinear and nearest neighbor Scale on both CPU and GPU for FP32, FP16, Int8, Uint8 data types
- - Improve performance of Softmax on GPU for Uint8/Int8
+ - Improve performance of Softmax on GPU for Uint8/Int8
- New OpenCL kernels / functions:
- @ref CLConv3D
- New Arm® Neon™ kernels / functions:
@@ -75,7 +109,7 @@ v21.08 Public major release
- Support fat binary build for arm8.2-a via fat_binary build flag
- Add CPU discovery capabilities
- Add data type f16 support for:
- - @ref CLRemapKernel
+ - CLRemapKernel
- Port the following functions to stateless API:
- @ref CLConvolutionLayer
- @ref CLFlattenLayer
@@ -296,7 +330,7 @@ v20.11 Public major release
- Removed padding from Arm® Neon™ kernels:
- NEComplexPixelWiseMultiplicationKernel
- NENonMaximaSuppression3x3Kernel
- - @ref NERemapKernel
+ - NERemapKernel
- NEGEMMInterleave4x4Kernel
- NEDirectConvolutionLayerKernel
- NEScaleKernel
@@ -1365,7 +1399,7 @@ v17.06 Public major release
- Added infrastructure to provide GPU specific optimisation for some OpenCL kernels.
- Added @ref OMPScheduler (OpenMP) scheduler for Neon
- Added @ref SingleThreadScheduler scheduler for Arm® Neon™ (For bare metal)
- - User can specify his own scheduler by implementing the @ref IScheduler interface.
+ - User can specify their own scheduler by implementing the @ref IScheduler interface.
- New OpenCL kernels / functions:
- @ref CLBatchNormalizationLayerKernel / @ref CLBatchNormalizationLayer
- CLDepthConcatenateLayerKernel / CLDepthConcatenateLayer
@@ -1443,7 +1477,7 @@ v17.02.1 Sources preview
- CLLogits1DMaxKernel, CLLogits1DShiftExpSumKernel, CLLogits1DNormKernel / @ref CLSoftmaxLayer
- CLPoolingLayerKernel / @ref CLPoolingLayer
- CLIm2ColKernel, CLCol2ImKernel, CLConvolutionLayerWeightsReshapeKernel / CLConvolutionLayer
- - @ref CLRemapKernel / @ref CLRemap
+ - CLRemapKernel / CLRemap
- CLGaussianPyramidHorKernel, CLGaussianPyramidVertKernel / CLGaussianPyramid, CLGaussianPyramidHalf, CLGaussianPyramidOrb
- CLMinMaxKernel, CLMinMaxLocationKernel / CLMinMaxLocation
- CLNonLinearFilterKernel / CLNonLinearFilter
diff --git a/examples/graph_resnext50.cpp b/examples/graph_resnext50.cpp
index 97eb85ccd..6378f6c74 100644
--- a/examples/graph_resnext50.cpp
+++ b/examples/graph_resnext50.cpp
@@ -192,7 +192,7 @@ private:
* Model is based on:
* https://arxiv.org/abs/1611.05431
* "Aggregated Residual Transformations for Deep Neural Networks"
- * Saining Xie, Ross Girshick, Piotr Dollar, Zhuowen Tu, Kaiming He
+ * Saining Xie, Ross Girshick, Piotr Dollar, Zhuowen Tu, Kaiming He.
*
* @note To list all the possible arguments execute the binary appended with the --help option
*
diff --git a/examples/neon_copy_objects.cpp b/examples/neon_copy_objects.cpp
index 3043709fe..b060b0975 100644
--- a/examples/neon_copy_objects.cpp
+++ b/examples/neon_copy_objects.cpp
@@ -49,7 +49,7 @@ public:
src_data = new float[width * height * batch];
dst_data = new float[width * height * batch];
- // Fill src_data with dummy values:
+ // Fill src_data with placeholder (meaningless) values:
for(unsigned int b = 0; b < batch; b++)
{
for(unsigned int h = 0; h < height; h++)
diff --git a/filedefs.json b/filedefs.json
new file mode 100644
index 000000000..76dccfffe
--- /dev/null
+++ b/filedefs.json
@@ -0,0 +1,40 @@
+{
+ "cpu": {
+ "arch" : {
+ "armv8-a": {
+ "cxxflags": ["-march=armv8.2-a"]
+ },
+ "armv8.2-a": {
+ "cxxflags": ["-march=armv8.2-a+fp16"],
+ "cppdefines": ["ARM_COMPUTE_ENABLE_FP16"]
+ },
+ "armv8.2-a-sve": {
+ "cxxflags": ["-march=armv8.2-a+sve+fp16+dotprod"],
+ "cppdefines": ["ARM_COMPUTE_ENABLE_FP16", "ARM_COMPUTE_ENABLE_BF16",
+ "ARM_COMPUTE_ENABLE_I8MM", "ARM_COMPUTE_ENABLE_SVEF32MM"]
+ },
+ "armv8.2-a-sve2": {
+ "cxxflags": ["-march=armv8.2-a+sve2+fp16+dotprod"],
+ "cppdefines": ["ARM_COMPUTE_ENABLE_FP16", "ARM_COMPUTE_ENABLE_BF16",
+ "ARM_COMPUTE_ENABLE_I8MM", "ARM_COMPUTE_ENABLE_SVEF32MM"]
+ },
+ "armv8r64": {
+ "cxxflags": ["-march=armv8.4-a"]
+ },
+ "armv8.6-a": {
+ "cxxflags": ["-march=armv8.6-a+fp16"],
+ "cppdefines": ["ARM_COMPUTE_ENABLE_FP16"]
+ },
+ "armv8.6-a-sve": {
+ "cxxflags": ["-march=armv8.6-a+sve+fp16+dotprod"],
+ "cppdefines": ["ARM_COMPUTE_ENABLE_FP16", "ARM_COMPUTE_ENABLE_BF16",
+ "ARM_COMPUTE_ENABLE_I8MM"]
+ },
+ "armv8.6-a-sve2": {
+ "cxxflags": ["-march=armv8.6-a+sve2+fp16+dotprod"],
+ "cppdefines": ["ARM_COMPUTE_ENABLE_FP16", "ARM_COMPUTE_ENABLE_BF16",
+ "ARM_COMPUTE_ENABLE_I8MM"]
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/filelist.json b/filelist.json
index 549dc66ce..ba19321a5 100644
--- a/filelist.json
+++ b/filelist.json
@@ -662,13 +662,6 @@
]
}
},
- "Remap": {
- "files": {
- "common": [
- "src/core/CL/kernels/CLRemapKernel.cpp",
- "src/runtime/CL/functions/CLRemap.cpp"]
- }
- },
"Reorg": {
"files": {
"common": [
@@ -845,20 +838,20 @@
"src/cpu/operators/CpuActivation.cpp",
"src/cpu/kernels/CpuActivationKernel.cpp",
"src/runtime/NEON/functions/NEActivationLayer.cpp",
- "src/cpu/kernels/activation/neon/qasymm8.cpp",
- "src/cpu/kernels/activation/neon/qasymm8_signed.cpp",
- "src/cpu/kernels/activation/neon/qsymm16.cpp"
+ "src/cpu/kernels/activation/generic/neon/qasymm8.cpp",
+ "src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp",
+ "src/cpu/kernels/activation/generic/neon/qsymm16.cpp"
],
"neon": {
- "fp16": [ "src/cpu/kernels/activation/neon/fp16.cpp" ],
- "fp32": [ "src/cpu/kernels/activation/neon/fp32.cpp" ]
+ "fp16": [ "src/cpu/kernels/activation/generic/neon/fp16.cpp" ],
+ "fp32": [ "src/cpu/kernels/activation/generic/neon/fp32.cpp" ]
},
"sve": {
- "fp16": [ "src/cpu/kernels/activation/sve/fp16.cpp" ],
- "fp32": [ "src/cpu/kernels/activation/sve/fp32.cpp" ],
- "qasymm8": [ "src/cpu/kernels/activation/sve/qasymm8.cpp" ],
- "qasymm8_signed": [ "src/cpu/kernels/activation/sve/qasymm8_signed.cpp" ],
- "qsymm16": [ "src/cpu/kernels/activation/sve/qsymm16.cpp" ]
+ "fp16": [ "src/cpu/kernels/activation/generic/sve/fp16.cpp" ],
+ "fp32": [ "src/cpu/kernels/activation/generic/sve/fp32.cpp" ],
+ "qasymm8": [ "src/cpu/kernels/activation/generic/sve2/qasymm8.cpp" ],
+ "qasymm8_signed": [ "src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp" ],
+ "qsymm16": [ "src/cpu/kernels/activation/generic/sve2/qsymm16.cpp" ]
}
}
},
@@ -873,16 +866,27 @@
"common": [
"src/cpu/operators/CpuAdd.cpp",
"src/cpu/kernels/CpuAddKernel.cpp",
- "src/runtime/NEON/functions/NEArithmeticAddition.cpp",
- "src/cpu/kernels/add/neon/qasymm8.cpp",
- "src/cpu/kernels/add/neon/qasymm8_signed.cpp",
- "src/cpu/kernels/add/neon/qsymm16.cpp"
+ "src/runtime/NEON/functions/NEArithmeticAddition.cpp"
],
+ "neon": {
+ "common": ["src/cpu/kernels/add/generic/neon/impl.cpp"],
+ "fp32":["src/cpu/kernels/add/generic/neon/fp32.cpp"],
+ "fp16":["src/cpu/kernels/add/generic/neon/fp16.cpp"],
+ "integer":["src/cpu/kernels/add/generic/neon/integer.cpp"],
+ "qasymm8": ["src/cpu/kernels/add/generic/neon/qasymm8.cpp"],
+ "qasymm8_signed": ["src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp"],
+ "qsymm16": ["src/cpu/kernels/add/generic/neon/qsymm16.cpp"]
+ },
"sve": {
- "common": [ "src/cpu/kernels/add/sve/impl.cpp" ],
- "qasymm8": [ "src/cpu/kernels/add/sve/qasymm8.cpp" ],
- "qasymm8_signed": [ "src/cpu/kernels/add/sve/qasymm8_signed.cpp" ],
- "qsymm16": [ "src/cpu/kernels/add/sve/qsymm16.cpp" ]
+ "common": [ "src/cpu/kernels/add/generic/sve/impl.cpp" ],
+ "integer":["src/cpu/kernels/add/generic/sve/integer.cpp"],
+ "fp32":["src/cpu/kernels/add/generic/sve/fp32.cpp"],
+ "fp16":["src/cpu/kernels/add/generic/sve/fp16.cpp"]
+ },
+ "sve2": {
+ "qasymm8": [ "src/cpu/kernels/add/generic/sve2/qasymm8.cpp" ],
+ "qasymm8_signed": [ "src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp" ],
+ "qsymm16": [ "src/cpu/kernels/add/generic/sve2/qsymm16.cpp" ]
}
}
},
@@ -949,7 +953,14 @@
"common": [
"src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp",
"src/runtime/NEON/functions/NEBoundingBoxTransform.cpp"
- ]
+ ],
+ "neon":{
+ "common":["src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp"],
+ "fp32":["src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp"],
+ "fp16":["src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp"],
+ "qsymm16":["src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp"]
+
+ }
}
},
"Cast": {
@@ -1008,38 +1019,42 @@
"src/cpu/kernels/CpuCol2ImKernel.cpp",
"src/cpu/kernels/CpuIm2ColKernel.cpp",
"src/cpu/kernels/CpuWeightsReshapeKernel.cpp",
- "src/core/NEON/kernels/convolution/common/padding.cpp",
- "src/core/NEON/kernels/convolution/common/qasymm8.cpp",
- "src/core/NEON/kernels/convolution/common/qsymm8.cpp",
- "src/core/NEON/kernels/convolution/common/utils.cpp",
- "src/core/NEON/kernels/convolution/winograd/padding.cpp",
- "src/core/NEON/kernels/convolution/winograd/winograd.cpp",
- "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_1x8_fp32_fp32_integers.cpp",
- "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp16_fp16_integers.cpp",
- "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp32_fp32_integers.cpp",
- "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp16_fp16_integers.cpp",
- "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp",
- "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2_7_fp32_fp32_integers.cpp",
- "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_3x3_fp32_fp32_integers.cpp",
- "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_5x5_fp32_fp32_integers.cpp",
- "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4_5_fp32_fp32_integers.cpp",
- "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp16_fp16_integers.cpp",
- "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp32_fp32_integers.cpp",
- "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_6_3_fp32_fp32_integers.cpp",
- "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2_7_fp32_fp32_integers.cpp",
- "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_3x3_fp32_fp32_integers.cpp",
- "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_5x5_fp32_fp32_integers.cpp",
- "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4_5_fp32_fp32_integers.cpp",
- "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp16_fp16_integers.cpp",
- "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp32_fp32_integers.cpp",
- "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_6_3_fp32_fp32_integers.cpp",
"src/runtime/NEON/functions/NEConvolutionLayer.cpp",
"src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp",
"src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp",
"src/runtime/NEON/functions/NEGEMMConv2d.cpp",
"src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp",
"src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp"
- ]
+ ],
+ "neon": {
+ "common": [
+ "src/core/NEON/kernels/convolution/common/padding.cpp",
+ "src/core/NEON/kernels/convolution/common/qasymm8.cpp",
+ "src/core/NEON/kernels/convolution/common/qsymm8.cpp",
+ "src/core/NEON/kernels/convolution/common/utils.cpp",
+ "src/core/NEON/kernels/convolution/winograd/padding.cpp",
+ "src/core/NEON/kernels/convolution/winograd/winograd.cpp",
+ "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_1x8_fp32_fp32_integers.cpp",
+ "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp16_fp16_integers.cpp",
+ "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp32_fp32_integers.cpp",
+ "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp16_fp16_integers.cpp",
+ "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp",
+ "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2_7_fp32_fp32_integers.cpp",
+ "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_3x3_fp32_fp32_integers.cpp",
+ "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_5x5_fp32_fp32_integers.cpp",
+ "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4_5_fp32_fp32_integers.cpp",
+ "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp16_fp16_integers.cpp",
+ "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp32_fp32_integers.cpp",
+ "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_6_3_fp32_fp32_integers.cpp",
+ "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2_7_fp32_fp32_integers.cpp",
+ "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_3x3_fp32_fp32_integers.cpp",
+ "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_5x5_fp32_fp32_integers.cpp",
+ "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4_5_fp32_fp32_integers.cpp",
+ "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp16_fp16_integers.cpp",
+ "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp32_fp32_integers.cpp",
+ "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_6_3_fp32_fp32_integers.cpp"
+ ]
+ }
}
},
"Copy": {
@@ -1057,7 +1072,13 @@
"common": [
"src/core/NEON/kernels/NECropKernel.cpp",
"src/runtime/NEON/functions/NECropResize.cpp"
- ]
+ ],
+ "neon": {
+ "common": [ "src/cpu/kernels/crop/generic/neon/impl.cpp" ],
+ "fp32": [ "src/cpu/kernels/crop/generic/neon/fp32.cpp" ],
+ "fp16": [ "src/cpu/kernels/crop/generic/neon/fp16.cpp" ],
+ "integer": [ "src/cpu/kernels/crop/generic/neon/integer.cpp" ]
+ }
}
},
"Deconv2d": {
@@ -1092,68 +1113,72 @@
"src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp",
"src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp",
"src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp",
- "src/core/NEON/kernels/convolution/common/padding.cpp",
- "src/core/NEON/kernels/convolution/common/qasymm8.cpp",
- "src/core/NEON/kernels/convolution/common/qsymm8.cpp",
- "src/core/NEON/kernels/convolution/common/utils.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp",
- "src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp"
+ "src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp"
],
+ "neon": {
+ "common": [
+ "src/core/NEON/kernels/convolution/common/padding.cpp",
+ "src/core/NEON/kernels/convolution/common/qasymm8.cpp",
+ "src/core/NEON/kernels/convolution/common/qsymm8.cpp",
+ "src/core/NEON/kernels/convolution/common/utils.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp"
+ ]
+ },
"sve": {
"common": [
"src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_8b_mla.cpp",
@@ -1237,10 +1262,22 @@
"common": [
"src/cpu/operators/CpuElementwise.cpp",
"src/cpu/kernels/CpuElementwiseKernel.cpp",
- "src/runtime/NEON/functions/NEElementwiseOperations.cpp"
+ "src/runtime/NEON/functions/NEElementwiseOperations.cpp",
+ "src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp",
+ "src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp"
],
+ "neon":{
+ "fp32": ["src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp"],
+ "fp16": ["src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp"],
+ "integer": ["src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp"]
+ },
"sve": {
- "common": [ "src/cpu/kernels/elementwise/sve/elementwise.cpp" ]
+ "common": ["src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp" ],
+ "integer": ["src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp"],
+ "fp32": ["src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp"],
+ "fp16": ["src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp"],
+ "qasymm8": ["src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp"],
+ "qasymm8_signed": ["src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp"]
}
}
},
@@ -1251,8 +1288,16 @@
"src/cpu/kernels/CpuElementwiseUnaryKernel.cpp",
"src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp"
],
+ "neon": {
+ "integer": ["src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp"],
+ "fp32": ["src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp"],
+ "fp16": ["src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp"]
+ },
"sve": {
- "common": [ "src/cpu/kernels/elementwise/sve/elementwise_unary.cpp" ]
+ "common": ["src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp" ],
+ "integer": ["src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp"],
+ "fp32": ["src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp"],
+ "fp16": ["src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp"]
}
}
},
@@ -1346,89 +1391,94 @@
"src/cpu/operators/CpuGemm.cpp",
"src/cpu/operators/CpuGemmLowpOutputStage.cpp",
"src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp",
- "src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp",
- "src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp",
- "src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp",
- "src/core/NEON/kernels/arm_gemm/gemm_int16.cpp",
- "src/core/NEON/kernels/arm_gemm/gemm_int8.cpp",
- "src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp",
- "src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp",
- "src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp",
- "src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp",
- "src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp",
- "src/core/NEON/kernels/arm_gemm/mergeresults-fp16.cpp",
- "src/core/NEON/kernels/arm_gemm/mergeresults.cpp",
- "src/core/NEON/kernels/arm_gemm/misc.cpp",
- "src/core/NEON/kernels/arm_gemm/quantized.cpp",
- "src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp",
- "src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp",
- "src/core/NEON/kernels/arm_gemm/transform.cpp",
"src/runtime/NEON/functions/NEGEMM.cpp",
"src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp",
- "src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/a55r1.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/x1.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/x1.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55r1.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/x1.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x6/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/generic.cpp"
+ "src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp"
],
"neon": {
+ "common": [
+ "src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp",
+ "src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp",
+ "src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp",
+ "src/core/NEON/kernels/arm_gemm/gemm_int16.cpp",
+ "src/core/NEON/kernels/arm_gemm/gemm_int8.cpp",
+ "src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp",
+ "src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp",
+ "src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp",
+ "src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp",
+ "src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp",
+ "src/core/NEON/kernels/arm_gemm/mergeresults-fp16.cpp",
+ "src/core/NEON/kernels/arm_gemm/mergeresults.cpp",
+ "src/core/NEON/kernels/arm_gemm/misc.cpp",
+ "src/core/NEON/kernels/arm_gemm/quantized.cpp",
+ "src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp",
+ "src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp",
+ "src/core/NEON/kernels/arm_gemm/transform.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/a55r1.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/x1.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/x1.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55r1.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/x1.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x6/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/a55.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/a55.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/a55.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/a55.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/generic.cpp"
+ ],
"estate32": [
"src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp",
@@ -1490,7 +1540,13 @@
"common": [
"src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp",
"src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp"
- ]
+ ],
+ "neon":{
+ "common":["src/cpu/kernels/genproposals/generic/neon/impl.cpp"],
+ "fp16":["src/cpu/kernels/genproposals/generic/neon/fp16.cpp"],
+ "fp32":["src/cpu/kernels/genproposals/generic/neon/fp32.cpp"],
+ "qsymm16":["src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp"]
+ }
}
},
"InstanceNormalize": {
@@ -1499,7 +1555,12 @@
"common": [
"src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp",
"src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp"
- ]
+ ],
+ "neon":{
+ "common":["src/cpu/kernels/instancenorm/generic/neon/impl.cpp"],
+ "fp16":["src/cpu/kernels/instancenorm/generic/neon/fp16.cpp"],
+ "fp32":["src/cpu/kernels/instancenorm/generic/neon/fp32.cpp"]
+ }
}
},
"L2Normalize": {
@@ -1550,7 +1611,14 @@
"common": [
"src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp",
"src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp"
- ]
+ ],
+ "neon":{
+ "common":["src/cpu/kernels/maxunpool/generic/neon/impl.cpp"],
+ "fp32":["src/cpu/kernels/maxunpool/generic/neon/fp32.cpp"],
+ "fp16":["src/cpu/kernels/maxunpool/generic/neon/fp16.cpp"],
+ "qasymm8":["src/cpu/kernels/maxunpool/generic/neon/qasymm8.cpp"],
+ "qasymm8_signed":["src/cpu/kernels/maxunpool/generic/neon/qasymm8_signed.cpp"]
+ }
}
},
"Mean": {
@@ -1564,7 +1632,12 @@
"common": [
"src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp",
"src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp"
- ]
+ ],
+ "neon":{
+ "common":["src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp"],
+ "fp32":["src/cpu/kernels/meanstddevnorm/generic/neon/fp32.cpp"],
+ "fp16":["src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp"]
+ }
}
},
"Mul": {
@@ -1609,36 +1682,38 @@
"src/cpu/operators/CpuPool2d.cpp",
"src/cpu/kernels/CpuPool2dKernel.cpp",
"src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp",
- "src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/pooling/pooling_fp16.cpp",
- "src/core/NEON/kernels/arm_conv/pooling/pooling_fp32.cpp",
- "src/core/NEON/kernels/arm_conv/pooling/pooling_s8.cpp",
- "src/core/NEON/kernels/arm_conv/pooling/pooling_s8q.cpp",
- "src/core/NEON/kernels/arm_conv/pooling/pooling_u8.cpp",
- "src/core/NEON/kernels/arm_conv/pooling/pooling_u8q.cpp",
- "src/runtime/NEON/functions/NEPoolingLayer.cpp",
- "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp",
"src/cpu/kernels/pool2d/neon/qasymm8.cpp",
- "src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp"
+ "src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp",
+ "src/runtime/NEON/functions/NEPoolingLayer.cpp"
],
"neon": {
+ "common": [
+ "src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/pooling/pooling_fp16.cpp",
+ "src/core/NEON/kernels/arm_conv/pooling/pooling_fp32.cpp",
+ "src/core/NEON/kernels/arm_conv/pooling/pooling_s8.cpp",
+ "src/core/NEON/kernels/arm_conv/pooling/pooling_s8q.cpp",
+ "src/core/NEON/kernels/arm_conv/pooling/pooling_u8.cpp",
+ "src/core/NEON/kernels/arm_conv/pooling/pooling_u8q.cpp",
+ "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp"
+ ],
"nchw": [ "src/cpu/kernels/pool2d/neon/nchw/all.cpp" ],
"fp16": [ "src/cpu/kernels/pool2d/neon/fp16.cpp" ],
"fp32": [ "src/cpu/kernels/pool2d/neon/fp32.cpp" ]
@@ -1697,7 +1772,13 @@
"common": [
"src/core/NEON/kernels/NERangeKernel.cpp",
"src/runtime/NEON/functions/NERange.cpp"
- ]
+ ],
+ "neon": {
+ "common": [ "src/cpu/kernels/range/generic/neon/impl.cpp" ],
+ "fp32": [ "src/cpu/kernels/range/generic/neon/fp32.cpp" ],
+ "fp16": [ "src/cpu/kernels/range/generic/neon/fp16.cpp" ],
+ "integer": [ "src/cpu/kernels/range/generic/neon/integer.cpp" ]
+ }
}
},
"Reduction":{
@@ -1709,14 +1790,6 @@
]
}
},
- "Remap": {
- "files": {
- "common": [
- "src/core/NEON/kernels/NERemapKernel.cpp",
- "src/runtime/NEON/functions/NERemap.cpp"
- ]
- }
- },
"Reorg": {
"files": {
"common": [
@@ -1753,7 +1826,14 @@
"common": [
"src/core/NEON/kernels/NEROIAlignLayerKernel.cpp",
"src/runtime/NEON/functions/NEROIAlignLayer.cpp"
- ]
+ ],
+ "neon":{
+ "common":["src/cpu/kernels/roialign/generic/neon/impl.cpp"],
+ "fp32":["src/cpu/kernels/roialign/generic/neon/fp32.cpp"],
+ "fp16":["src/cpu/kernels/roialign/generic/neon/fp16.cpp"],
+ "qasymm8":["src/cpu/kernels/roialign/generic/neon/qasymm8.cpp"],
+ "qasymm8_signed":["src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp"]
+ }
}
},
"ROIPool2d": {
@@ -1792,7 +1872,13 @@
"common": [
"src/core/NEON/kernels/NESelectKernel.cpp",
"src/runtime/NEON/functions/NESelect.cpp"
- ]
+ ],
+ "neon": {
+ "common": [ "src/cpu/kernels/select/generic/neon/impl.cpp" ],
+ "fp32": [ "src/cpu/kernels/select/generic/neon/fp32.cpp" ],
+ "fp16": [ "src/cpu/kernels/select/generic/neon/fp16.cpp" ],
+ "integer": [ "src/cpu/kernels/select/generic/neon/integer.cpp" ]
+ }
}
},
"Slice": {
@@ -1811,8 +1897,18 @@
"src/cpu/kernels/CpuSoftmaxKernel.cpp",
"src/runtime/NEON/functions/NESoftmaxLayer.cpp"
],
+ "neon":{
+ "fp32": ["src/cpu/kernels/softmax/generic/neon/fp32.cpp"],
+ "fp16": ["src/cpu/kernels/softmax/generic/neon/fp16.cpp"],
+ "qasymm8": ["src/cpu/kernels/softmax/generic/neon/qasymm8.cpp"],
+ "qasymm8_signed": ["src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp"]
+ },
"sve": {
- "common": [ "src/cpu/kernels/softmax/impl/sve/impl.cpp" ]
+ "common": [ "src/cpu/kernels/softmax/generic/sve/impl.cpp" ],
+ "fp32": ["src/cpu/kernels/softmax/generic/sve/fp32.cpp"],
+ "fp16": ["src/cpu/kernels/softmax/generic/sve/fp16.cpp"],
+ "qasymm8": ["src/cpu/kernels/softmax/generic/sve/qasymm8.cpp" ,"src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp" ],
+ "qasymm8_signed": ["src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp", "src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp"]
}
}
},
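The filelist.json hunks above restructure each CPU operator entry into a technology-agnostic "common" list of sources plus per-data-type lists under a "neon" (and, where present, "sve") key, which makes it possible for a build to gather only the kernel specialisations it actually needs. As a rough illustration of how a layout of this shape can be consumed, here is a minimal C++ sketch; it uses the nlohmann/json library for parsing and assumes the operator entries form a flat map shaped like the hunks above, so the helper name, option names and top-level lookup are illustrative assumptions rather than the repository's actual SConscript logic.

// Minimal sketch (assumptions noted above): collect the .cpp files needed for
// a set of operators and data types from a filelist.json whose operator
// entries look like the hunks shown in this diff.
#include <fstream>
#include <iostream>
#include <set>
#include <string>
#include <vector>
#include <nlohmann/json.hpp>

std::set<std::string> collect_cpu_sources(const nlohmann::json &ops,
                                          const std::vector<std::string> &operators,
                                          const std::vector<std::string> &data_types,
                                          bool enable_sve)
{
    std::set<std::string> sources;
    std::vector<std::string> techs{ "neon" };
    if(enable_sve)
    {
        techs.push_back("sve");
    }
    for(const auto &name : operators)
    {
        const auto &files = ops.at(name).at("files");
        for(const auto &f : files.value("common", std::vector<std::string>{}))
        {
            sources.insert(f); // built regardless of data type
        }
        for(const auto &tech : techs)
        {
            if(!files.contains(tech))
            {
                continue; // no specialised kernels for this technology
            }
            std::vector<std::string> wanted{ "common" };
            wanted.insert(wanted.end(), data_types.begin(), data_types.end());
            for(const auto &dt : wanted)
            {
                for(const auto &f : files.at(tech).value(dt, std::vector<std::string>{}))
                {
                    sources.insert(f); // e.g. fp32/fp16/qasymm8 specialisations
                }
            }
        }
    }
    return sources;
}

int main()
{
    std::ifstream in("filelist.json");
    const nlohmann::json ops = nlohmann::json::parse(in); // adjust if the operator map is nested deeper
    for(const auto &f : collect_cpu_sources(ops, { "Softmax" }, { "fp32", "qasymm8" }, false))
    {
        std::cout << f << "\n";
    }
    return 0;
}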
diff --git a/include/CL/cl.h b/include/CL/cl.h
index 3a5aae486..0018a0f42 100644
--- a/include/CL/cl.h
+++ b/include/CL/cl.h
@@ -1311,11 +1311,11 @@ clLinkProgram(cl_context context,
#ifdef CL_VERSION_2_2
-extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_2_2_DEPRECATED cl_int CL_API_CALL
+extern CL_API_ENTRY CL_API_PREFIX__VERSION_2_2_DEPRECATED cl_int CL_API_CALL
clSetProgramReleaseCallback(cl_program program,
void (CL_CALLBACK * pfn_notify)(cl_program program,
void * user_data),
- void * user_data) CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED;
+ void * user_data) CL_API_SUFFIX__VERSION_2_2_DEPRECATED;
extern CL_API_ENTRY cl_int CL_API_CALL
clSetProgramSpecializationConstant(cl_program program,
@@ -1857,11 +1857,11 @@ clGetExtensionFunctionAddressForPlatform(cl_platform_id platform,
clSetCommandQueueProperty(cl_command_queue command_queue,
cl_command_queue_properties properties,
cl_bool enable,
- cl_command_queue_properties * old_properties) CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED;
+ cl_command_queue_properties * old_properties) CL_API_SUFFIX__VERSION_1_0_DEPRECATED;
#endif /* CL_USE_DEPRECATED_OPENCL_1_0_APIS */
/* Deprecated OpenCL 1.1 APIs */
-extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+extern CL_API_ENTRY CL_API_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
clCreateImage2D(cl_context context,
cl_mem_flags flags,
const cl_image_format * image_format,
@@ -1869,9 +1869,9 @@ clCreateImage2D(cl_context context,
size_t image_height,
size_t image_row_pitch,
void * host_ptr,
- cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_1_DEPRECATED;
-extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+extern CL_API_ENTRY CL_API_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
clCreateImage3D(cl_context context,
cl_mem_flags flags,
const cl_image_format * image_format,
@@ -1881,46 +1881,46 @@ clCreateImage3D(cl_context context,
size_t image_row_pitch,
size_t image_slice_pitch,
void * host_ptr,
- cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_1_DEPRECATED;
-extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+extern CL_API_ENTRY CL_API_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
clEnqueueMarker(cl_command_queue command_queue,
- cl_event * event) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+ cl_event * event) CL_API_SUFFIX__VERSION_1_1_DEPRECATED;
-extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+extern CL_API_ENTRY CL_API_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
clEnqueueWaitForEvents(cl_command_queue command_queue,
cl_uint num_events,
- const cl_event * event_list) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+ const cl_event * event_list) CL_API_SUFFIX__VERSION_1_1_DEPRECATED;
-extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
-clEnqueueBarrier(cl_command_queue command_queue) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+extern CL_API_ENTRY CL_API_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clEnqueueBarrier(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_1_DEPRECATED;
-extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
-clUnloadCompiler(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+extern CL_API_ENTRY CL_API_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clUnloadCompiler(void) CL_API_SUFFIX__VERSION_1_1_DEPRECATED;
-extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL
-clGetExtensionFunctionAddress(const char * func_name) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+extern CL_API_ENTRY CL_API_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL
+clGetExtensionFunctionAddress(const char * func_name) CL_API_SUFFIX__VERSION_1_1_DEPRECATED;
/* Deprecated OpenCL 2.0 APIs */
-extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_command_queue CL_API_CALL
+extern CL_API_ENTRY CL_API_PREFIX__VERSION_1_2_DEPRECATED cl_command_queue CL_API_CALL
clCreateCommandQueue(cl_context context,
cl_device_id device,
cl_command_queue_properties properties,
- cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED;
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2_DEPRECATED;
-extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_sampler CL_API_CALL
+extern CL_API_ENTRY CL_API_PREFIX__VERSION_1_2_DEPRECATED cl_sampler CL_API_CALL
clCreateSampler(cl_context context,
cl_bool normalized_coords,
cl_addressing_mode addressing_mode,
cl_filter_mode filter_mode,
- cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED;
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2_DEPRECATED;
-extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_int CL_API_CALL
+extern CL_API_ENTRY CL_API_PREFIX__VERSION_1_2_DEPRECATED cl_int CL_API_CALL
clEnqueueTask(cl_command_queue command_queue,
cl_kernel kernel,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
- cl_event * event) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED;
+ cl_event * event) CL_API_SUFFIX__VERSION_1_2_DEPRECATED;
#ifdef __cplusplus
}
diff --git a/include/CL/cl_d3d10.h b/include/CL/cl_d3d10.h
index 2b80d90cb..6adedb061 100644
--- a/include/CL/cl_d3d10.h
+++ b/include/CL/cl_d3d10.h
@@ -75,7 +75,7 @@ typedef cl_uint cl_d3d10_device_set_khr;
/******************************************************************************/
-typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)(
+typedef cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)(
cl_platform_id platform,
cl_d3d10_device_source_khr d3d_device_source,
void * d3d_object,
@@ -84,27 +84,27 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)(
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)(
+typedef cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Buffer * resource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)(
+typedef cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Texture2D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)(
+typedef cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Texture3D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)(
+typedef cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
@@ -112,7 +112,7 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)(
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)(
+typedef cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
diff --git a/include/CL/cl_d3d11.h b/include/CL/cl_d3d11.h
index 10023dde0..50ed906b3 100644
--- a/include/CL/cl_d3d11.h
+++ b/include/CL/cl_d3d11.h
@@ -75,7 +75,7 @@ typedef cl_uint cl_d3d11_device_set_khr;
/******************************************************************************/
-typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)(
+typedef cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)(
cl_platform_id platform,
cl_d3d11_device_source_khr d3d_device_source,
void * d3d_object,
@@ -84,27 +84,27 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)(
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2;
-typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)(
+typedef cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Buffer * resource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
-typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)(
+typedef cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Texture2D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
-typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)(
+typedef cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Texture3D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
-typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)(
+typedef cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
@@ -112,7 +112,7 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)(
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
-typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)(
+typedef cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
diff --git a/include/CL/cl_dx9_media_sharing.h b/include/CL/cl_dx9_media_sharing.h
index 048937005..b0d2b2340 100644
--- a/include/CL/cl_dx9_media_sharing.h
+++ b/include/CL/cl_dx9_media_sharing.h
@@ -76,7 +76,7 @@ typedef struct _cl_dx9_surface_info_khr
/******************************************************************************/
-typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)(
+typedef cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)(
cl_platform_id platform,
cl_uint num_media_adapters,
cl_dx9_media_adapter_type_khr * media_adapter_type,
@@ -86,7 +86,7 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_f
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2;
-typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)(
+typedef cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)(
cl_context context,
cl_mem_flags flags,
cl_dx9_media_adapter_type_khr adapter_type,
@@ -94,7 +94,7 @@ typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)(
cl_uint plane,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
-typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)(
+typedef cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
@@ -102,7 +102,7 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
-typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)(
+typedef cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
@@ -110,6 +110,117 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
+/***************************************
+* cl_intel_dx9_media_sharing extension *
+****************************************/
+
+#define cl_intel_dx9_media_sharing 1
+
+typedef cl_uint cl_dx9_device_source_intel;
+typedef cl_uint cl_dx9_device_set_intel;
+
+/* error codes */
+#define CL_INVALID_DX9_DEVICE_INTEL -1010
+#define CL_INVALID_DX9_RESOURCE_INTEL -1011
+#define CL_DX9_RESOURCE_ALREADY_ACQUIRED_INTEL -1012
+#define CL_DX9_RESOURCE_NOT_ACQUIRED_INTEL -1013
+
+/* cl_dx9_device_source_intel */
+#define CL_D3D9_DEVICE_INTEL 0x4022
+#define CL_D3D9EX_DEVICE_INTEL 0x4070
+#define CL_DXVA_DEVICE_INTEL 0x4071
+
+/* cl_dx9_device_set_intel */
+#define CL_PREFERRED_DEVICES_FOR_DX9_INTEL 0x4024
+#define CL_ALL_DEVICES_FOR_DX9_INTEL 0x4025
+
+/* cl_context_info */
+#define CL_CONTEXT_D3D9_DEVICE_INTEL 0x4026
+#define CL_CONTEXT_D3D9EX_DEVICE_INTEL 0x4072
+#define CL_CONTEXT_DXVA_DEVICE_INTEL 0x4073
+
+/* cl_mem_info */
+#define CL_MEM_DX9_RESOURCE_INTEL 0x4027
+#define CL_MEM_DX9_SHARED_HANDLE_INTEL 0x4074
+
+/* cl_image_info */
+#define CL_IMAGE_DX9_PLANE_INTEL 0x4075
+
+/* cl_command_type */
+#define CL_COMMAND_ACQUIRE_DX9_OBJECTS_INTEL 0x402A
+#define CL_COMMAND_RELEASE_DX9_OBJECTS_INTEL 0x402B
+/******************************************************************************/
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceIDsFromDX9INTEL(
+ cl_platform_id platform,
+ cl_dx9_device_source_intel dx9_device_source,
+ void* dx9_object,
+ cl_dx9_device_set_intel dx9_device_set,
+ cl_uint num_entries,
+ cl_device_id* devices,
+ cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_1;
+
+typedef cl_int (CL_API_CALL* clGetDeviceIDsFromDX9INTEL_fn)(
+ cl_platform_id platform,
+ cl_dx9_device_source_intel dx9_device_source,
+ void* dx9_object,
+ cl_dx9_device_set_intel dx9_device_set,
+ cl_uint num_entries,
+ cl_device_id* devices,
+ cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromDX9MediaSurfaceINTEL(
+ cl_context context,
+ cl_mem_flags flags,
+ IDirect3DSurface9* resource,
+ HANDLE sharedHandle,
+ UINT plane,
+ cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1;
+
+typedef cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceINTEL_fn)(
+ cl_context context,
+ cl_mem_flags flags,
+ IDirect3DSurface9* resource,
+ HANDLE sharedHandle,
+ UINT plane,
+ cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireDX9ObjectsINTEL(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem* mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event* event_wait_list,
+ cl_event* event) CL_API_SUFFIX__VERSION_1_1;
+
+typedef cl_int (CL_API_CALL *clEnqueueAcquireDX9ObjectsINTEL_fn)(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem* mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event* event_wait_list,
+ cl_event* event) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseDX9ObjectsINTEL(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ cl_mem* mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event* event_wait_list,
+ cl_event* event) CL_API_SUFFIX__VERSION_1_1;
+
+typedef cl_int (CL_API_CALL *clEnqueueReleaseDX9ObjectsINTEL_fn)(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ cl_mem* mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event* event_wait_list,
+ cl_event* event) CL_API_SUFFIX__VERSION_1_1;
+
#ifdef __cplusplus
}
#endif
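Besides the extern declarations, the block added above also provides matching *_fn pointer typedefs, because extension entry points such as these are normally resolved at run time through clGetExtensionFunctionAddressForPlatform() rather than linked directly. A minimal, Windows-only sketch of that pattern follows; it assumes a platform exposing cl_intel_dx9_media_sharing and a D3D9(Ex) device created elsewhere, and omits error handling.

// Hedged illustration: resolve clGetDeviceIDsFromDX9INTEL via the ICD loader
// and ask how many OpenCL devices can share with an existing D3D9 device.
#include <CL/cl.h>
#include <CL/cl_dx9_media_sharing.h>

cl_uint count_dx9_interop_devices(cl_platform_id platform, void *d3d9ex_device)
{
    clGetDeviceIDsFromDX9INTEL_fn pfn = (clGetDeviceIDsFromDX9INTEL_fn)
        clGetExtensionFunctionAddressForPlatform(platform, "clGetDeviceIDsFromDX9INTEL");
    cl_uint num_devices = 0;
    if(pfn != NULL)
    {
        // num_entries = 0 with a NULL devices pointer is the usual "just count them" query
        pfn(platform, CL_D3D9EX_DEVICE_INTEL, d3d9ex_device,
            CL_ALL_DEVICES_FOR_DX9_INTEL, 0, NULL, &num_devices);
    }
    return num_devices;
}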
diff --git a/include/CL/cl_dx9_media_sharing_intel.h b/include/CL/cl_dx9_media_sharing_intel.h
index 4525a175e..f6518d7f6 100644
--- a/include/CL/cl_dx9_media_sharing_intel.h
+++ b/include/CL/cl_dx9_media_sharing_intel.h
@@ -13,158 +13,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
-/*****************************************************************************\
-
-Copyright (c) 2013-2019 Intel Corporation All Rights Reserved.
-
-THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
-MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-File Name: cl_dx9_media_sharing_intel.h
-
-Abstract:
-
-Notes:
-
-\*****************************************************************************/
-
-#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H
-#define __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H
-
-#include <CL/cl.h>
-#include <CL/cl_platform.h>
-#include <d3d9.h>
-#include <dxvahd.h>
-#include <wtypes.h>
-#include <d3d9types.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/***************************************
-* cl_intel_dx9_media_sharing extension *
-****************************************/
-
-#define cl_intel_dx9_media_sharing 1
-
-typedef cl_uint cl_dx9_device_source_intel;
-typedef cl_uint cl_dx9_device_set_intel;
-
-/* error codes */
-#define CL_INVALID_DX9_DEVICE_INTEL -1010
-#define CL_INVALID_DX9_RESOURCE_INTEL -1011
-#define CL_DX9_RESOURCE_ALREADY_ACQUIRED_INTEL -1012
-#define CL_DX9_RESOURCE_NOT_ACQUIRED_INTEL -1013
-
-/* cl_dx9_device_source_intel */
-#define CL_D3D9_DEVICE_INTEL 0x4022
-#define CL_D3D9EX_DEVICE_INTEL 0x4070
-#define CL_DXVA_DEVICE_INTEL 0x4071
-
-/* cl_dx9_device_set_intel */
-#define CL_PREFERRED_DEVICES_FOR_DX9_INTEL 0x4024
-#define CL_ALL_DEVICES_FOR_DX9_INTEL 0x4025
-
-/* cl_context_info */
-#define CL_CONTEXT_D3D9_DEVICE_INTEL 0x4026
-#define CL_CONTEXT_D3D9EX_DEVICE_INTEL 0x4072
-#define CL_CONTEXT_DXVA_DEVICE_INTEL 0x4073
-
-/* cl_mem_info */
-#define CL_MEM_DX9_RESOURCE_INTEL 0x4027
-#define CL_MEM_DX9_SHARED_HANDLE_INTEL 0x4074
-
-/* cl_image_info */
-#define CL_IMAGE_DX9_PLANE_INTEL 0x4075
-
-/* cl_command_type */
-#define CL_COMMAND_ACQUIRE_DX9_OBJECTS_INTEL 0x402A
-#define CL_COMMAND_RELEASE_DX9_OBJECTS_INTEL 0x402B
-/******************************************************************************/
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetDeviceIDsFromDX9INTEL(
- cl_platform_id platform,
- cl_dx9_device_source_intel dx9_device_source,
- void* dx9_object,
- cl_dx9_device_set_intel dx9_device_set,
- cl_uint num_entries,
- cl_device_id* devices,
- cl_uint* num_devices) CL_EXT_SUFFIX__VERSION_1_1;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL* clGetDeviceIDsFromDX9INTEL_fn)(
- cl_platform_id platform,
- cl_dx9_device_source_intel dx9_device_source,
- void* dx9_object,
- cl_dx9_device_set_intel dx9_device_set,
- cl_uint num_entries,
- cl_device_id* devices,
- cl_uint* num_devices) CL_EXT_SUFFIX__VERSION_1_1;
-
-extern CL_API_ENTRY cl_mem CL_API_CALL
-clCreateFromDX9MediaSurfaceINTEL(
- cl_context context,
- cl_mem_flags flags,
- IDirect3DSurface9* resource,
- HANDLE sharedHandle,
- UINT plane,
- cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_1;
-
-typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceINTEL_fn)(
- cl_context context,
- cl_mem_flags flags,
- IDirect3DSurface9* resource,
- HANDLE sharedHandle,
- UINT plane,
- cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_1;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueAcquireDX9ObjectsINTEL(
- cl_command_queue command_queue,
- cl_uint num_objects,
- const cl_mem* mem_objects,
- cl_uint num_events_in_wait_list,
- const cl_event* event_wait_list,
- cl_event* event) CL_EXT_SUFFIX__VERSION_1_1;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9ObjectsINTEL_fn)(
- cl_command_queue command_queue,
- cl_uint num_objects,
- const cl_mem* mem_objects,
- cl_uint num_events_in_wait_list,
- const cl_event* event_wait_list,
- cl_event* event) CL_EXT_SUFFIX__VERSION_1_1;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueReleaseDX9ObjectsINTEL(
- cl_command_queue command_queue,
- cl_uint num_objects,
- cl_mem* mem_objects,
- cl_uint num_events_in_wait_list,
- const cl_event* event_wait_list,
- cl_event* event) CL_EXT_SUFFIX__VERSION_1_1;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9ObjectsINTEL_fn)(
- cl_command_queue command_queue,
- cl_uint num_objects,
- cl_mem* mem_objects,
- cl_uint num_events_in_wait_list,
- const cl_event* event_wait_list,
- cl_event* event) CL_EXT_SUFFIX__VERSION_1_1;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H */
+#include <CL/cl_dx9_media_sharing.h>
+#pragma message("The Intel DX9 media sharing extensions have been moved into cl_dx9_media_sharing.h. Please include cl_dx9_media_sharing.h directly.")
diff --git a/include/CL/cl_egl.h b/include/CL/cl_egl.h
index c8bde80e1..357a37c02 100644
--- a/include/CL/cl_egl.h
+++ b/include/CL/cl_egl.h
@@ -56,7 +56,7 @@ clCreateFromEGLImageKHR(cl_context context,
const cl_egl_image_properties_khr * properties,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)(
+typedef cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)(
cl_context context,
CLeglDisplayKHR egldisplay,
CLeglImageKHR eglimage,
@@ -73,7 +73,7 @@ clEnqueueAcquireEGLObjectsKHR(cl_command_queue command_queue,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)(
+typedef cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
@@ -90,7 +90,7 @@ clEnqueueReleaseEGLObjectsKHR(cl_command_queue command_queue,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)(
+typedef cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
@@ -107,7 +107,7 @@ clCreateEventFromEGLSyncKHR(cl_context context,
CLeglDisplayKHR display,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)(
+typedef cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)(
cl_context context,
CLeglSyncKHR sync,
CLeglDisplayKHR display,
diff --git a/include/CL/cl_ext.h b/include/CL/cl_ext.h
index 0239b6413..36d025684 100644
--- a/include/CL/cl_ext.h
+++ b/include/CL/cl_ext.h
@@ -54,9 +54,9 @@ extern "C" {
* before using.
*/
#define cl_APPLE_SetMemObjectDestructor 1
-cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE( cl_mem memobj,
+extern CL_API_ENTRY cl_int CL_API_CALL clSetMemObjectDestructorAPPLE( cl_mem memobj,
void (* pfn_notify)(cl_mem memobj, void * user_data),
- void * user_data) CL_EXT_SUFFIX__VERSION_1_0;
+ void * user_data) CL_API_SUFFIX__VERSION_1_0;
/* Context Logging Functions
@@ -68,22 +68,22 @@ cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE( cl_mem memobj,
* clLogMessagesToSystemLog forwards on all log messages to the Apple System Logger
*/
#define cl_APPLE_ContextLoggingFunctions 1
-extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE( const char * errstr,
+extern CL_API_ENTRY void CL_API_CALL clLogMessagesToSystemLogAPPLE( const char * errstr,
const void * private_info,
size_t cb,
- void * user_data) CL_EXT_SUFFIX__VERSION_1_0;
+ void * user_data) CL_API_SUFFIX__VERSION_1_0;
/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */
-extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE( const char * errstr,
+extern CL_API_ENTRY void CL_API_CALL clLogMessagesToStdoutAPPLE( const char * errstr,
const void * private_info,
size_t cb,
- void * user_data) CL_EXT_SUFFIX__VERSION_1_0;
+ void * user_data) CL_API_SUFFIX__VERSION_1_0;
/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */
-extern void CL_API_ENTRY clLogMessagesToStderrAPPLE( const char * errstr,
+extern CL_API_ENTRY void CL_API_CALL clLogMessagesToStderrAPPLE( const char * errstr,
const void * private_info,
size_t cb,
- void * user_data) CL_EXT_SUFFIX__VERSION_1_0;
+ void * user_data) CL_API_SUFFIX__VERSION_1_0;
/************************
@@ -102,7 +102,7 @@ clIcdGetPlatformIDsKHR(cl_uint num_entries,
cl_platform_id * platforms,
cl_uint * num_platforms);
-typedef CL_API_ENTRY cl_int
+typedef cl_int
(CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(cl_uint num_entries,
cl_platform_id * platforms,
cl_uint * num_platforms);
@@ -129,11 +129,11 @@ clCreateProgramWithILKHR(cl_context context,
size_t length,
cl_int * errcode_ret);
-typedef CL_API_ENTRY cl_program
+typedef cl_program
(CL_API_CALL *clCreateProgramWithILKHR_fn)(cl_context context,
const void * il,
size_t length,
- cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
/* Extension: cl_khr_image2d_from_buffer
*
@@ -176,10 +176,10 @@ typedef CL_API_ENTRY cl_program
#define cl_khr_terminate_context 1
extern CL_API_ENTRY cl_int CL_API_CALL
-clTerminateContextKHR(cl_context context) CL_EXT_SUFFIX__VERSION_1_2;
+clTerminateContextKHR(cl_context context) CL_API_SUFFIX__VERSION_1_2;
-typedef CL_API_ENTRY cl_int
-(CL_API_CALL *clTerminateContextKHR_fn)(cl_context context) CL_EXT_SUFFIX__VERSION_1_2;
+typedef cl_int
+(CL_API_CALL *clTerminateContextKHR_fn)(cl_context context) CL_API_SUFFIX__VERSION_1_2;
/*
@@ -204,13 +204,13 @@ extern CL_API_ENTRY cl_command_queue CL_API_CALL
clCreateCommandQueueWithPropertiesKHR(cl_context context,
cl_device_id device,
const cl_queue_properties_khr* properties,
- cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
+ cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2;
-typedef CL_API_ENTRY cl_command_queue
+typedef cl_command_queue
(CL_API_CALL *clCreateCommandQueueWithPropertiesKHR_fn)(cl_context context,
cl_device_id device,
const cl_queue_properties_khr* properties,
- cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
+ cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2;
/******************************************
@@ -268,16 +268,16 @@ typedef CL_API_ENTRY cl_command_queue
#define cl_ext_device_fission 1
extern CL_API_ENTRY cl_int CL_API_CALL
-clReleaseDeviceEXT(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;
+clReleaseDeviceEXT(cl_device_id device) CL_API_SUFFIX__VERSION_1_1;
-typedef CL_API_ENTRY cl_int
-(CL_API_CALL *clReleaseDeviceEXT_fn)(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;
+typedef cl_int
+(CL_API_CALL *clReleaseDeviceEXT_fn)(cl_device_id device) CL_API_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_int CL_API_CALL
-clRetainDeviceEXT(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;
+clRetainDeviceEXT(cl_device_id device) CL_API_SUFFIX__VERSION_1_1;
-typedef CL_API_ENTRY cl_int
-(CL_API_CALL *clRetainDeviceEXT_fn)(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;
+typedef cl_int
+(CL_API_CALL *clRetainDeviceEXT_fn)(cl_device_id device) CL_API_SUFFIX__VERSION_1_1;
typedef cl_ulong cl_device_partition_property_ext;
extern CL_API_ENTRY cl_int CL_API_CALL
@@ -285,14 +285,14 @@ clCreateSubDevicesEXT(cl_device_id in_device,
const cl_device_partition_property_ext * properties,
cl_uint num_entries,
cl_device_id * out_devices,
- cl_uint * num_devices) CL_EXT_SUFFIX__VERSION_1_1;
+ cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_1;
-typedef CL_API_ENTRY cl_int
+typedef cl_int
(CL_API_CALL * clCreateSubDevicesEXT_fn)(cl_device_id in_device,
const cl_device_partition_property_ext * properties,
cl_uint num_entries,
cl_device_id * out_devices,
- cl_uint * num_devices) CL_EXT_SUFFIX__VERSION_1_1;
+ cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_1;
/* cl_device_partition_property_ext */
#define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050
@@ -346,7 +346,7 @@ clEnqueueMigrateMemObjectEXT(cl_command_queue command_queue,
const cl_event * event_wait_list,
cl_event * event);
-typedef CL_API_ENTRY cl_int
+typedef cl_int
(CL_API_CALL *clEnqueueMigrateMemObjectEXT_fn)(cl_command_queue command_queue,
cl_uint num_mem_objects,
const cl_mem * mem_objects,
@@ -490,7 +490,7 @@ clEnqueueAcquireGrallocObjectsIMG(cl_command_queue command_queue,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
- cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
+ cl_event * event) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseGrallocObjectsIMG(cl_command_queue command_queue,
@@ -498,7 +498,7 @@ clEnqueueReleaseGrallocObjectsIMG(cl_command_queue command_queue,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
- cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
+ cl_event * event) CL_API_SUFFIX__VERSION_1_2;
/******************************************
* cl_img_generate_mipmap extension *
@@ -523,7 +523,7 @@ clEnqueueGenerateMipmapIMG(cl_command_queue command_queue,
const size_t *mip_region,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
- cl_event *event) CL_EXT_SUFFIX__VERSION_1_2;
+ cl_event *event) CL_API_SUFFIX__VERSION_1_2;
/******************************************
* cl_img_mem_properties extension *
@@ -564,9 +564,9 @@ clGetKernelSubGroupInfoKHR(cl_kernel in_kernel,
const void * input_value,
size_t param_value_size,
void * param_value,
- size_t * param_value_size_ret) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED;
+ size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_2_0_DEPRECATED;
-typedef CL_API_ENTRY cl_int
+typedef cl_int
(CL_API_CALL * clGetKernelSubGroupInfoKHR_fn)(cl_kernel in_kernel,
cl_device_id in_device,
cl_kernel_sub_group_info param_name,
@@ -574,7 +574,7 @@ typedef CL_API_ENTRY cl_int
const void * input_value,
size_t param_value_size,
void * param_value,
- size_t * param_value_size_ret) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED;
+ size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_2_0_DEPRECATED;
/*********************************
@@ -694,6 +694,61 @@ typedef struct _cl_name_version_khr
#define CL_DEVICE_NODE_MASK_KHR 0x106E
+/***************************************************************
+* cl_khr_pci_bus_info
+***************************************************************/
+#define cl_khr_pci_bus_info 1
+
+typedef struct _cl_device_pci_bus_info_khr {
+ cl_uint pci_domain;
+ cl_uint pci_bus;
+ cl_uint pci_device;
+ cl_uint pci_function;
+} cl_device_pci_bus_info_khr;
+
+/* cl_device_info */
+#define CL_DEVICE_PCI_BUS_INFO_KHR 0x410F
+
+
+/***************************************************************
+* cl_khr_suggested_local_work_size
+***************************************************************/
+#define cl_khr_suggested_local_work_size 1
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelSuggestedLocalWorkSizeKHR(
+ cl_command_queue command_queue,
+ cl_kernel kernel,
+ cl_uint work_dim,
+ const size_t* global_work_offset,
+ const size_t* global_work_size,
+ size_t* suggested_local_work_size) CL_API_SUFFIX__VERSION_3_0;
+
+typedef cl_int (CL_API_CALL *
+clGetKernelSuggestedLocalWorkSizeKHR_fn)(
+ cl_command_queue command_queue,
+ cl_kernel kernel,
+ cl_uint work_dim,
+ const size_t* global_work_offset,
+ const size_t* global_work_size,
+ size_t* suggested_local_work_size) CL_API_SUFFIX__VERSION_3_0;
+
+
+/***************************************************************
+* cl_khr_integer_dot_product
+***************************************************************/
+#define cl_khr_integer_dot_product 1
+
+typedef cl_bitfield cl_device_integer_dot_product_capabilities_khr;
+
+/* cl_device_integer_dot_product_capabilities_khr */
+#define CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_PACKED_KHR (1 << 0)
+#define CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_KHR (1 << 1)
+
+/* cl_device_info */
+#define CL_DEVICE_INTEGER_DOT_PRODUCT_CAPABILITIES_KHR 0x1073
+
+
/**********************************
* cl_arm_import_memory extension *
**********************************/
@@ -719,6 +774,12 @@ typedef intptr_t cl_import_properties_arm;
/* Data consistency with host property */
#define CL_IMPORT_DMA_BUF_DATA_CONSISTENCY_WITH_HOST_ARM 0x41E3
+/* Index of plane in a multiplanar hardware buffer */
+#define CL_IMPORT_ANDROID_HARDWARE_BUFFER_PLANE_INDEX_ARM 0x41EF
+
+/* Index of layer in a multilayer hardware buffer */
+#define CL_IMPORT_ANDROID_HARDWARE_BUFFER_LAYER_INDEX_ARM 0x41F0
+
/* Import memory size value to indicate a size for the whole buffer */
#define CL_IMPORT_MEMORY_WHOLE_ALLOCATION_ARM SIZE_MAX
@@ -744,7 +805,7 @@ clImportMemoryARM( cl_context context,
const cl_import_properties_arm *properties,
void *memory,
size_t size,
- cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_0;
+ cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
/******************************************
@@ -787,11 +848,11 @@ extern CL_API_ENTRY void * CL_API_CALL
clSVMAllocARM(cl_context context,
cl_svm_mem_flags_arm flags,
size_t size,
- cl_uint alignment) CL_EXT_SUFFIX__VERSION_1_2;
+ cl_uint alignment) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY void CL_API_CALL
clSVMFreeARM(cl_context context,
- void * svm_pointer) CL_EXT_SUFFIX__VERSION_1_2;
+ void * svm_pointer) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMFreeARM(cl_command_queue command_queue,
@@ -804,7 +865,7 @@ clEnqueueSVMFreeARM(cl_command_queue command_queue,
void * user_data,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
- cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
+ cl_event * event) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMMemcpyARM(cl_command_queue command_queue,
@@ -814,7 +875,7 @@ clEnqueueSVMMemcpyARM(cl_command_queue command_queue,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
- cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
+ cl_event * event) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMMemFillARM(cl_command_queue command_queue,
@@ -824,7 +885,7 @@ clEnqueueSVMMemFillARM(cl_command_queue command_queue,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
- cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
+ cl_event * event) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMMapARM(cl_command_queue command_queue,
@@ -834,25 +895,25 @@ clEnqueueSVMMapARM(cl_command_queue command_queue,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
- cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
+ cl_event * event) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMUnmapARM(cl_command_queue command_queue,
void * svm_ptr,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
- cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
+ cl_event * event) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clSetKernelArgSVMPointerARM(cl_kernel kernel,
cl_uint arg_index,
- const void * arg_value) CL_EXT_SUFFIX__VERSION_1_2;
+ const void * arg_value) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clSetKernelExecInfoARM(cl_kernel kernel,
cl_kernel_exec_info_arm param_name,
size_t param_value_size,
- const void * param_value) CL_EXT_SUFFIX__VERSION_1_2;
+ const void * param_value) CL_API_SUFFIX__VERSION_1_2;
/********************************
* cl_arm_get_core_id extension *
@@ -885,12 +946,18 @@ clSetKernelExecInfoARM(cl_kernel kernel,
#define cl_arm_scheduling_controls 1
+typedef cl_bitfield cl_device_scheduling_controls_capabilities_arm;
+
/* cl_device_info */
#define CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM 0x41E4
#define CL_DEVICE_SCHEDULING_KERNEL_BATCHING_ARM (1 << 0)
#define CL_DEVICE_SCHEDULING_WORKGROUP_BATCH_SIZE_ARM (1 << 1)
#define CL_DEVICE_SCHEDULING_WORKGROUP_BATCH_SIZE_MODIFIER_ARM (1 << 2)
+#define CL_DEVICE_SCHEDULING_DEFERRED_FLUSH_ARM (1 << 3)
+#define CL_DEVICE_SCHEDULING_REGISTER_ALLOCATION_ARM (1 << 4)
+
+#define CL_DEVICE_SUPPORTED_REGISTER_ALLOCATIONS_ARM 0x41EB
/* cl_kernel_info */
#define CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_ARM 0x41E5
@@ -898,6 +965,755 @@ clSetKernelExecInfoARM(cl_kernel kernel,
/* cl_queue_properties */
#define CL_QUEUE_KERNEL_BATCHING_ARM 0x41E7
+#define CL_QUEUE_DEFERRED_FLUSH_ARM 0x41EC
+
+/**************************************
+* cl_arm_controlled_kernel_termination
+***************************************/
+
+#define cl_arm_controlled_kernel_termination 1
+
+/* Error code to indicate kernel terminated with failure */
+#define CL_COMMAND_TERMINATED_ITSELF_WITH_FAILURE_ARM -1108
+
+/* cl_device_info */
+#define CL_DEVICE_CONTROLLED_TERMINATION_CAPABILITIES_ARM 0x41EE
+
+/* Bit fields for controlled termination feature query */
+typedef cl_bitfield cl_device_controlled_termination_capabilities_arm;
+
+#define CL_DEVICE_CONTROLLED_TERMINATION_SUCCESS_ARM (1 << 0)
+#define CL_DEVICE_CONTROLLED_TERMINATION_FAILURE_ARM (1 << 1)
+#define CL_DEVICE_CONTROLLED_TERMINATION_QUERY_ARM (1 << 2)
+
+/* cl_event_info */
+#define CL_EVENT_COMMAND_TERMINATION_REASON_ARM 0x41ED
+
+/* Values returned for event termination reason query */
+typedef cl_uint cl_command_termination_reason_arm;
+
+#define CL_COMMAND_TERMINATION_COMPLETION_ARM 0
+#define CL_COMMAND_TERMINATION_CONTROLLED_SUCCESS_ARM 1
+#define CL_COMMAND_TERMINATION_CONTROLLED_FAILURE_ARM 2
+#define CL_COMMAND_TERMINATION_ERROR_ARM 3
+
+/***************************************
+* cl_intel_thread_local_exec extension *
+****************************************/
+
+#define cl_intel_thread_local_exec 1
+
+#define CL_QUEUE_THREAD_LOCAL_EXEC_ENABLE_INTEL (((cl_bitfield)1) << 31)
+
+/***********************************************
+* cl_intel_device_partition_by_names extension *
+************************************************/
+
+#define cl_intel_device_partition_by_names 1
+
+#define CL_DEVICE_PARTITION_BY_NAMES_INTEL 0x4052
+#define CL_PARTITION_BY_NAMES_LIST_END_INTEL -1
+
+/************************************************
+* cl_intel_accelerator extension *
+* cl_intel_motion_estimation extension *
+* cl_intel_advanced_motion_estimation extension *
+*************************************************/
+
+#define cl_intel_accelerator 1
+#define cl_intel_motion_estimation 1
+#define cl_intel_advanced_motion_estimation 1
+
+typedef struct _cl_accelerator_intel* cl_accelerator_intel;
+typedef cl_uint cl_accelerator_type_intel;
+typedef cl_uint cl_accelerator_info_intel;
+
+typedef struct _cl_motion_estimation_desc_intel {
+ cl_uint mb_block_type;
+ cl_uint subpixel_mode;
+ cl_uint sad_adjust_mode;
+ cl_uint search_path_type;
+} cl_motion_estimation_desc_intel;
+
+/* error codes */
+#define CL_INVALID_ACCELERATOR_INTEL -1094
+#define CL_INVALID_ACCELERATOR_TYPE_INTEL -1095
+#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL -1096
+#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL -1097
+
+/* cl_accelerator_type_intel */
+#define CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL 0x0
+
+/* cl_accelerator_info_intel */
+#define CL_ACCELERATOR_DESCRIPTOR_INTEL 0x4090
+#define CL_ACCELERATOR_REFERENCE_COUNT_INTEL 0x4091
+#define CL_ACCELERATOR_CONTEXT_INTEL 0x4092
+#define CL_ACCELERATOR_TYPE_INTEL 0x4093
+
+/* cl_motion_detect_desc_intel flags */
+#define CL_ME_MB_TYPE_16x16_INTEL 0x0
+#define CL_ME_MB_TYPE_8x8_INTEL 0x1
+#define CL_ME_MB_TYPE_4x4_INTEL 0x2
+
+#define CL_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0
+#define CL_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1
+#define CL_ME_SUBPIXEL_MODE_QPEL_INTEL 0x2
+
+#define CL_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0
+#define CL_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x1
+
+#define CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL 0x0
+#define CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL 0x1
+#define CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL 0x5
+
+#define CL_ME_SKIP_BLOCK_TYPE_16x16_INTEL 0x0
+#define CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL 0x1
+#define CL_ME_LUMA_INTRA_PREDICT_ENABLED_INTEL 0x2
+#define CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL 0x4
+
+#define CL_ME_FORWARD_INPUT_MODE_INTEL 0x1
+#define CL_ME_BACKWARD_INPUT_MODE_INTEL 0x2
+#define CL_ME_BIDIRECTION_INPUT_MODE_INTEL 0x3
+
+#define CL_ME_BIDIR_WEIGHT_QUARTER_INTEL 16
+#define CL_ME_BIDIR_WEIGHT_THIRD_INTEL 21
+#define CL_ME_BIDIR_WEIGHT_HALF_INTEL 32
+#define CL_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 43
+#define CL_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 48
+
+#define CL_ME_COST_PENALTY_NONE_INTEL 0x0
+#define CL_ME_COST_PENALTY_LOW_INTEL 0x1
+#define CL_ME_COST_PENALTY_NORMAL_INTEL 0x2
+#define CL_ME_COST_PENALTY_HIGH_INTEL 0x3
+
+#define CL_ME_COST_PRECISION_QPEL_INTEL 0x0
+#define CL_ME_COST_PRECISION_HPEL_INTEL 0x1
+#define CL_ME_COST_PRECISION_PEL_INTEL 0x2
+#define CL_ME_COST_PRECISION_DPEL_INTEL 0x3
+
+#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0
+#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
+#define CL_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2
+#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3
+
+#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4
+#define CL_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4
+#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5
+#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6
+#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7
+#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8
+
+#define CL_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0
+#define CL_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
+#define CL_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2
+#define CL_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3
+
+/* cl_device_info */
+#define CL_DEVICE_ME_VERSION_INTEL 0x407E
+
+#define CL_ME_VERSION_LEGACY_INTEL 0x0
+#define CL_ME_VERSION_ADVANCED_VER_1_INTEL 0x1
+#define CL_ME_VERSION_ADVANCED_VER_2_INTEL 0x2
+
+extern CL_API_ENTRY cl_accelerator_intel CL_API_CALL
+clCreateAcceleratorINTEL(
+ cl_context context,
+ cl_accelerator_type_intel accelerator_type,
+ size_t descriptor_size,
+ const void* descriptor,
+ cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+typedef cl_accelerator_intel (CL_API_CALL *clCreateAcceleratorINTEL_fn)(
+ cl_context context,
+ cl_accelerator_type_intel accelerator_type,
+ size_t descriptor_size,
+ const void* descriptor,
+ cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetAcceleratorInfoINTEL(
+ cl_accelerator_intel accelerator,
+ cl_accelerator_info_intel param_name,
+ size_t param_value_size,
+ void* param_value,
+ size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_2;
+
+typedef cl_int (CL_API_CALL *clGetAcceleratorInfoINTEL_fn)(
+ cl_accelerator_intel accelerator,
+ cl_accelerator_info_intel param_name,
+ size_t param_value_size,
+ void* param_value,
+ size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainAcceleratorINTEL(
+ cl_accelerator_intel accelerator) CL_API_SUFFIX__VERSION_1_2;
+
+typedef cl_int (CL_API_CALL *clRetainAcceleratorINTEL_fn)(
+ cl_accelerator_intel accelerator) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseAcceleratorINTEL(
+ cl_accelerator_intel accelerator) CL_API_SUFFIX__VERSION_1_2;
+
+typedef cl_int (CL_API_CALL *clReleaseAcceleratorINTEL_fn)(
+ cl_accelerator_intel accelerator) CL_API_SUFFIX__VERSION_1_2;
+
+/******************************************
+* cl_intel_simultaneous_sharing extension *
+*******************************************/
+
+#define cl_intel_simultaneous_sharing 1
+
+#define CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL 0x4104
+#define CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL 0x4105
+
+/***********************************
+* cl_intel_egl_image_yuv extension *
+************************************/
+
+#define cl_intel_egl_image_yuv 1
+
+#define CL_EGL_YUV_PLANE_INTEL 0x4107
+
+/********************************
+* cl_intel_packed_yuv extension *
+*********************************/
+
+#define cl_intel_packed_yuv 1
+
+#define CL_YUYV_INTEL 0x4076
+#define CL_UYVY_INTEL 0x4077
+#define CL_YVYU_INTEL 0x4078
+#define CL_VYUY_INTEL 0x4079
+
+/********************************************
+* cl_intel_required_subgroup_size extension *
+*********************************************/
+
+#define cl_intel_required_subgroup_size 1
+
+#define CL_DEVICE_SUB_GROUP_SIZES_INTEL 0x4108
+#define CL_KERNEL_SPILL_MEM_SIZE_INTEL 0x4109
+#define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL 0x410A
+
+/****************************************
+* cl_intel_driver_diagnostics extension *
+*****************************************/
+
+#define cl_intel_driver_diagnostics 1
+
+typedef cl_uint cl_diagnostics_verbose_level;
+
+#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106
+
+#define CL_CONTEXT_DIAGNOSTICS_LEVEL_ALL_INTEL ( 0xff )
+#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL ( 1 )
+#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL ( 1 << 1 )
+#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL ( 1 << 2 )
+
+/********************************
+* cl_intel_planar_yuv extension *
+*********************************/
+
+#define CL_NV12_INTEL 0x410E
+
+#define CL_MEM_NO_ACCESS_INTEL ( 1 << 24 )
+#define CL_MEM_ACCESS_FLAGS_UNRESTRICTED_INTEL ( 1 << 25 )
+
+#define CL_DEVICE_PLANAR_YUV_MAX_WIDTH_INTEL 0x417E
+#define CL_DEVICE_PLANAR_YUV_MAX_HEIGHT_INTEL 0x417F
+
+/*******************************************************
+* cl_intel_device_side_avc_motion_estimation extension *
+********************************************************/
+
+#define CL_DEVICE_AVC_ME_VERSION_INTEL 0x410B
+#define CL_DEVICE_AVC_ME_SUPPORTS_TEXTURE_SAMPLER_USE_INTEL 0x410C
+#define CL_DEVICE_AVC_ME_SUPPORTS_PREEMPTION_INTEL 0x410D
+
+#define CL_AVC_ME_VERSION_0_INTEL 0x0 /* No support. */
+#define CL_AVC_ME_VERSION_1_INTEL 0x1 /* First supported version. */
+
+#define CL_AVC_ME_MAJOR_16x16_INTEL 0x0
+#define CL_AVC_ME_MAJOR_16x8_INTEL 0x1
+#define CL_AVC_ME_MAJOR_8x16_INTEL 0x2
+#define CL_AVC_ME_MAJOR_8x8_INTEL 0x3
+
+#define CL_AVC_ME_MINOR_8x8_INTEL 0x0
+#define CL_AVC_ME_MINOR_8x4_INTEL 0x1
+#define CL_AVC_ME_MINOR_4x8_INTEL 0x2
+#define CL_AVC_ME_MINOR_4x4_INTEL 0x3
+
+#define CL_AVC_ME_MAJOR_FORWARD_INTEL 0x0
+#define CL_AVC_ME_MAJOR_BACKWARD_INTEL 0x1
+#define CL_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL 0x2
+
+#define CL_AVC_ME_PARTITION_MASK_ALL_INTEL 0x0
+#define CL_AVC_ME_PARTITION_MASK_16x16_INTEL 0x7E
+#define CL_AVC_ME_PARTITION_MASK_16x8_INTEL 0x7D
+#define CL_AVC_ME_PARTITION_MASK_8x16_INTEL 0x7B
+#define CL_AVC_ME_PARTITION_MASK_8x8_INTEL 0x77
+#define CL_AVC_ME_PARTITION_MASK_8x4_INTEL 0x6F
+#define CL_AVC_ME_PARTITION_MASK_4x8_INTEL 0x5F
+#define CL_AVC_ME_PARTITION_MASK_4x4_INTEL 0x3F
+
+#define CL_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL 0x0
+#define CL_AVC_ME_SEARCH_WINDOW_SMALL_INTEL 0x1
+#define CL_AVC_ME_SEARCH_WINDOW_TINY_INTEL 0x2
+#define CL_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL 0x3
+#define CL_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL 0x4
+#define CL_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL 0x5
+#define CL_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL 0x6
+#define CL_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL 0x7
+#define CL_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL 0x8
+#define CL_AVC_ME_SEARCH_WINDOW_16x12_RADIUS_INTEL 0x9
+#define CL_AVC_ME_SEARCH_WINDOW_4x4_RADIUS_INTEL 0x2
+#define CL_AVC_ME_SEARCH_WINDOW_2x2_RADIUS_INTEL 0xa
+
+#define CL_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0
+#define CL_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x2
+
+#define CL_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0
+#define CL_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1
+#define CL_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL 0x3
+
+#define CL_AVC_ME_COST_PRECISION_QPEL_INTEL 0x0
+#define CL_AVC_ME_COST_PRECISION_HPEL_INTEL 0x1
+#define CL_AVC_ME_COST_PRECISION_PEL_INTEL 0x2
+#define CL_AVC_ME_COST_PRECISION_DPEL_INTEL 0x3
+
+#define CL_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL 0x10
+#define CL_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL 0x15
+#define CL_AVC_ME_BIDIR_WEIGHT_HALF_INTEL 0x20
+#define CL_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 0x2B
+#define CL_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 0x30
+
+#define CL_AVC_ME_BORDER_REACHED_LEFT_INTEL 0x0
+#define CL_AVC_ME_BORDER_REACHED_RIGHT_INTEL 0x2
+#define CL_AVC_ME_BORDER_REACHED_TOP_INTEL 0x4
+#define CL_AVC_ME_BORDER_REACHED_BOTTOM_INTEL 0x8
+
+#define CL_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL 0x0
+#define CL_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL 0x4000
+
+#define CL_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL ( 0x1 << 24 )
+#define CL_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL ( 0x2 << 24 )
+#define CL_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL ( 0x3 << 24 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL ( 0x55 << 24 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL ( 0xAA << 24 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL ( 0xFF << 24 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL ( 0x1 << 24 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL ( 0x2 << 24 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL ( 0x1 << 26 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL ( 0x2 << 26 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL ( 0x1 << 28 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL ( 0x2 << 28 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL ( 0x1 << 30 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL ( 0x2 << 30 )
+
+#define CL_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL 0x00
+#define CL_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL 0x80
+
+#define CL_AVC_ME_INTRA_16x16_INTEL 0x0
+#define CL_AVC_ME_INTRA_8x8_INTEL 0x1
+#define CL_AVC_ME_INTRA_4x4_INTEL 0x2
+
+#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL 0x6
+#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL 0x5
+#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL 0x3
+
+#define CL_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL 0x60
+#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL 0x10
+#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8
+#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x4
+
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8
+#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0
+#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
+#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2
+#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3
+
+#define CL_AVC_ME_FRAME_FORWARD_INTEL 0x1
+#define CL_AVC_ME_FRAME_BACKWARD_INTEL 0x2
+#define CL_AVC_ME_FRAME_DUAL_INTEL 0x3
+
+#define CL_AVC_ME_SLICE_TYPE_PRED_INTEL 0x0
+#define CL_AVC_ME_SLICE_TYPE_BPRED_INTEL 0x1
+#define CL_AVC_ME_SLICE_TYPE_INTRA_INTEL 0x2
+
+#define CL_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL 0x0
+#define CL_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL 0x1
+
+/*******************************************
+* cl_intel_unified_shared_memory extension *
+********************************************/
+
+/* These APIs are in sync with Revision Q of the cl_intel_unified_shared_memory spec! */
+
+#define cl_intel_unified_shared_memory 1
+
+/* cl_device_info */
+#define CL_DEVICE_HOST_MEM_CAPABILITIES_INTEL 0x4190
+#define CL_DEVICE_DEVICE_MEM_CAPABILITIES_INTEL 0x4191
+#define CL_DEVICE_SINGLE_DEVICE_SHARED_MEM_CAPABILITIES_INTEL 0x4192
+#define CL_DEVICE_CROSS_DEVICE_SHARED_MEM_CAPABILITIES_INTEL 0x4193
+#define CL_DEVICE_SHARED_SYSTEM_MEM_CAPABILITIES_INTEL 0x4194
+
+typedef cl_bitfield cl_device_unified_shared_memory_capabilities_intel;
+
+/* cl_device_unified_shared_memory_capabilities_intel - bitfield */
+#define CL_UNIFIED_SHARED_MEMORY_ACCESS_INTEL (1 << 0)
+#define CL_UNIFIED_SHARED_MEMORY_ATOMIC_ACCESS_INTEL (1 << 1)
+#define CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ACCESS_INTEL (1 << 2)
+#define CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ATOMIC_ACCESS_INTEL (1 << 3)
+
+typedef cl_properties cl_mem_properties_intel;
+
+/* cl_mem_properties_intel */
+#define CL_MEM_ALLOC_FLAGS_INTEL 0x4195
+
+typedef cl_bitfield cl_mem_alloc_flags_intel;
+
+/* cl_mem_alloc_flags_intel - bitfield */
+#define CL_MEM_ALLOC_WRITE_COMBINED_INTEL (1 << 0)
+
+typedef cl_uint cl_mem_info_intel;
+
+/* cl_mem_alloc_info_intel */
+#define CL_MEM_ALLOC_TYPE_INTEL 0x419A
+#define CL_MEM_ALLOC_BASE_PTR_INTEL 0x419B
+#define CL_MEM_ALLOC_SIZE_INTEL 0x419C
+#define CL_MEM_ALLOC_DEVICE_INTEL 0x419D
+/* Enum values 0x419E-0x419F are reserved for future queries. */
+
+typedef cl_uint cl_unified_shared_memory_type_intel;
+
+/* cl_unified_shared_memory_type_intel */
+#define CL_MEM_TYPE_UNKNOWN_INTEL 0x4196
+#define CL_MEM_TYPE_HOST_INTEL 0x4197
+#define CL_MEM_TYPE_DEVICE_INTEL 0x4198
+#define CL_MEM_TYPE_SHARED_INTEL 0x4199
+
+typedef cl_uint cl_mem_advice_intel;
+
+/* cl_mem_advice_intel */
+/* Enum values 0x4208-0x420F are reserved for future memory advices. */
+
+/* cl_kernel_exec_info */
+#define CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL 0x4200
+#define CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL 0x4201
+#define CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL 0x4202
+#define CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL 0x4203
+
+/* cl_command_type */
+#define CL_COMMAND_MEMFILL_INTEL 0x4204
+#define CL_COMMAND_MEMCPY_INTEL 0x4205
+#define CL_COMMAND_MIGRATEMEM_INTEL 0x4206
+#define CL_COMMAND_MEMADVISE_INTEL 0x4207
+
+extern CL_API_ENTRY void* CL_API_CALL
+clHostMemAllocINTEL(
+ cl_context context,
+ const cl_mem_properties_intel* properties,
+ size_t size,
+ cl_uint alignment,
+ cl_int* errcode_ret);
+
+typedef void* (CL_API_CALL *
+clHostMemAllocINTEL_fn)(
+ cl_context context,
+ const cl_mem_properties_intel* properties,
+ size_t size,
+ cl_uint alignment,
+ cl_int* errcode_ret);
+
+extern CL_API_ENTRY void* CL_API_CALL
+clDeviceMemAllocINTEL(
+ cl_context context,
+ cl_device_id device,
+ const cl_mem_properties_intel* properties,
+ size_t size,
+ cl_uint alignment,
+ cl_int* errcode_ret);
+
+typedef void* (CL_API_CALL *
+clDeviceMemAllocINTEL_fn)(
+ cl_context context,
+ cl_device_id device,
+ const cl_mem_properties_intel* properties,
+ size_t size,
+ cl_uint alignment,
+ cl_int* errcode_ret);
+
+extern CL_API_ENTRY void* CL_API_CALL
+clSharedMemAllocINTEL(
+ cl_context context,
+ cl_device_id device,
+ const cl_mem_properties_intel* properties,
+ size_t size,
+ cl_uint alignment,
+ cl_int* errcode_ret);
+
+typedef void* (CL_API_CALL *
+clSharedMemAllocINTEL_fn)(
+ cl_context context,
+ cl_device_id device,
+ const cl_mem_properties_intel* properties,
+ size_t size,
+ cl_uint alignment,
+ cl_int* errcode_ret);
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clMemFreeINTEL(
+ cl_context context,
+ void* ptr);
+
+typedef cl_int (CL_API_CALL *
+clMemFreeINTEL_fn)(
+ cl_context context,
+ void* ptr);
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clMemBlockingFreeINTEL(
+ cl_context context,
+ void* ptr);
+
+typedef cl_int (CL_API_CALL *
+clMemBlockingFreeINTEL_fn)(
+ cl_context context,
+ void* ptr);
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetMemAllocInfoINTEL(
+ cl_context context,
+ const void* ptr,
+ cl_mem_info_intel param_name,
+ size_t param_value_size,
+ void* param_value,
+ size_t* param_value_size_ret);
+
+typedef cl_int (CL_API_CALL *
+clGetMemAllocInfoINTEL_fn)(
+ cl_context context,
+ const void* ptr,
+ cl_mem_info_intel param_name,
+ size_t param_value_size,
+ void* param_value,
+ size_t* param_value_size_ret);
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelArgMemPointerINTEL(
+ cl_kernel kernel,
+ cl_uint arg_index,
+ const void* arg_value);
+
+typedef cl_int (CL_API_CALL *
+clSetKernelArgMemPointerINTEL_fn)(
+ cl_kernel kernel,
+ cl_uint arg_index,
+ const void* arg_value);
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueMemsetINTEL( /* Deprecated */
+ cl_command_queue command_queue,
+ void* dst_ptr,
+ cl_int value,
+ size_t size,
+ cl_uint num_events_in_wait_list,
+ const cl_event* event_wait_list,
+ cl_event* event);
+
+typedef cl_int (CL_API_CALL *
+clEnqueueMemsetINTEL_fn)( /* Deprecated */
+ cl_command_queue command_queue,
+ void* dst_ptr,
+ cl_int value,
+ size_t size,
+ cl_uint num_events_in_wait_list,
+ const cl_event* event_wait_list,
+ cl_event* event);
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueMemFillINTEL(
+ cl_command_queue command_queue,
+ void* dst_ptr,
+ const void* pattern,
+ size_t pattern_size,
+ size_t size,
+ cl_uint num_events_in_wait_list,
+ const cl_event* event_wait_list,
+ cl_event* event);
+
+typedef cl_int (CL_API_CALL *
+clEnqueueMemFillINTEL_fn)(
+ cl_command_queue command_queue,
+ void* dst_ptr,
+ const void* pattern,
+ size_t pattern_size,
+ size_t size,
+ cl_uint num_events_in_wait_list,
+ const cl_event* event_wait_list,
+ cl_event* event);
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueMemcpyINTEL(
+ cl_command_queue command_queue,
+ cl_bool blocking,
+ void* dst_ptr,
+ const void* src_ptr,
+ size_t size,
+ cl_uint num_events_in_wait_list,
+ const cl_event* event_wait_list,
+ cl_event* event);
+
+typedef cl_int (CL_API_CALL *
+clEnqueueMemcpyINTEL_fn)(
+ cl_command_queue command_queue,
+ cl_bool blocking,
+ void* dst_ptr,
+ const void* src_ptr,
+ size_t size,
+ cl_uint num_events_in_wait_list,
+ const cl_event* event_wait_list,
+ cl_event* event);
+
+#ifdef CL_VERSION_1_2
+
+/* Because these APIs use cl_mem_migration_flags, they require
+ OpenCL 1.2: */
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueMigrateMemINTEL(
+ cl_command_queue command_queue,
+ const void* ptr,
+ size_t size,
+ cl_mem_migration_flags flags,
+ cl_uint num_events_in_wait_list,
+ const cl_event* event_wait_list,
+ cl_event* event);
+
+typedef cl_int (CL_API_CALL *
+clEnqueueMigrateMemINTEL_fn)(
+ cl_command_queue command_queue,
+ const void* ptr,
+ size_t size,
+ cl_mem_migration_flags flags,
+ cl_uint num_events_in_wait_list,
+ const cl_event* event_wait_list,
+ cl_event* event);
+
+#endif
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueMemAdviseINTEL(
+ cl_command_queue command_queue,
+ const void* ptr,
+ size_t size,
+ cl_mem_advice_intel advice,
+ cl_uint num_events_in_wait_list,
+ const cl_event* event_wait_list,
+ cl_event* event);
+
+typedef cl_int (CL_API_CALL *
+clEnqueueMemAdviseINTEL_fn)(
+ cl_command_queue command_queue,
+ const void* ptr,
+ size_t size,
+ cl_mem_advice_intel advice,
+ cl_uint num_events_in_wait_list,
+ const cl_event* event_wait_list,
+ cl_event* event);
+
+/***************************************************
+* cl_intel_create_buffer_with_properties extension *
+****************************************************/
+
+#define cl_intel_create_buffer_with_properties 1
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateBufferWithPropertiesINTEL(
+ cl_context context,
+ const cl_mem_properties_intel* properties,
+ cl_mem_flags flags,
+ size_t size,
+ void * host_ptr,
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef cl_mem (CL_API_CALL *
+clCreateBufferWithPropertiesINTEL_fn)(
+ cl_context context,
+ const cl_mem_properties_intel* properties,
+ cl_mem_flags flags,
+ size_t size,
+ void * host_ptr,
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+/******************************************
+* cl_intel_mem_channel_property extension *
+*******************************************/
+
+#define CL_MEM_CHANNEL_INTEL 0x4213
+
+/*********************************
+* cl_intel_mem_force_host_memory *
+**********************************/
+
+#define cl_intel_mem_force_host_memory 1
+
+/* cl_mem_flags */
+#define CL_MEM_FORCE_HOST_MEMORY_INTEL (1 << 20)
+
+/***************************************************************
+* cl_intel_command_queue_families
+***************************************************************/
+#define cl_intel_command_queue_families 1
+
+typedef cl_bitfield cl_command_queue_capabilities_intel;
+
+#define CL_QUEUE_FAMILY_MAX_NAME_SIZE_INTEL 64
+
+typedef struct _cl_queue_family_properties_intel {
+ cl_command_queue_properties properties;
+ cl_command_queue_capabilities_intel capabilities;
+ cl_uint count;
+ char name[CL_QUEUE_FAMILY_MAX_NAME_SIZE_INTEL];
+} cl_queue_family_properties_intel;
+
+/* cl_device_info */
+#define CL_DEVICE_QUEUE_FAMILY_PROPERTIES_INTEL 0x418B
+
+/* cl_queue_properties */
+#define CL_QUEUE_FAMILY_INTEL 0x418C
+#define CL_QUEUE_INDEX_INTEL 0x418D
+
+/* cl_command_queue_capabilities_intel */
+#define CL_QUEUE_DEFAULT_CAPABILITIES_INTEL 0
+#define CL_QUEUE_CAPABILITY_CREATE_SINGLE_QUEUE_EVENTS_INTEL (1 << 0)
+#define CL_QUEUE_CAPABILITY_CREATE_CROSS_QUEUE_EVENTS_INTEL (1 << 1)
+#define CL_QUEUE_CAPABILITY_SINGLE_QUEUE_EVENT_WAIT_LIST_INTEL (1 << 2)
+#define CL_QUEUE_CAPABILITY_CROSS_QUEUE_EVENT_WAIT_LIST_INTEL (1 << 3)
+#define CL_QUEUE_CAPABILITY_TRANSFER_BUFFER_INTEL (1 << 8)
+#define CL_QUEUE_CAPABILITY_TRANSFER_BUFFER_RECT_INTEL (1 << 9)
+#define CL_QUEUE_CAPABILITY_MAP_BUFFER_INTEL (1 << 10)
+#define CL_QUEUE_CAPABILITY_FILL_BUFFER_INTEL (1 << 11)
+#define CL_QUEUE_CAPABILITY_TRANSFER_IMAGE_INTEL (1 << 12)
+#define CL_QUEUE_CAPABILITY_MAP_IMAGE_INTEL (1 << 13)
+#define CL_QUEUE_CAPABILITY_FILL_IMAGE_INTEL (1 << 14)
+#define CL_QUEUE_CAPABILITY_TRANSFER_BUFFER_IMAGE_INTEL (1 << 15)
+#define CL_QUEUE_CAPABILITY_TRANSFER_IMAGE_BUFFER_INTEL (1 << 16)
+#define CL_QUEUE_CAPABILITY_MARKER_INTEL (1 << 24)
+#define CL_QUEUE_CAPABILITY_BARRIER_INTEL (1 << 25)
+#define CL_QUEUE_CAPABILITY_KERNEL_INTEL (1 << 26)
#ifdef __cplusplus
}
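
The hunk above adds the cl_intel_unified_shared_memory entry points (clHostMemAllocINTEL, clSetKernelArgMemPointerINTEL, clMemBlockingFreeINTEL and friends) to cl_ext.h. A minimal usage sketch follows; it assumes the platform, context, queue and kernel objects already exist and that the platform actually exposes the extension, and, as with any OpenCL extension, it resolves the functions at runtime with clGetExtensionFunctionAddressForPlatform rather than linking them directly.

    #include <CL/cl.h>
    #include <CL/cl_ext.h>

    /* Sketch only: error handling is reduced to a single check per call. */
    static void usm_example(cl_platform_id platform, cl_context ctx,
                            cl_command_queue queue, cl_kernel kernel)
    {
        clHostMemAllocINTEL_fn           pHostMemAlloc  = (clHostMemAllocINTEL_fn)
            clGetExtensionFunctionAddressForPlatform(platform, "clHostMemAllocINTEL");
        clSetKernelArgMemPointerINTEL_fn pSetArgPointer = (clSetKernelArgMemPointerINTEL_fn)
            clGetExtensionFunctionAddressForPlatform(platform, "clSetKernelArgMemPointerINTEL");
        clMemBlockingFreeINTEL_fn        pBlockingFree  = (clMemBlockingFreeINTEL_fn)
            clGetExtensionFunctionAddressForPlatform(platform, "clMemBlockingFreeINTEL");
        if (!pHostMemAlloc || !pSetArgPointer || !pBlockingFree)
            return; /* Extension not exposed by this platform. */

        cl_int err  = CL_SUCCESS;
        void  *host = pHostMemAlloc(ctx, NULL /* default properties */,
                                    1024 /* bytes */, 0 /* default alignment */, &err);
        if (err != CL_SUCCESS)
            return;

        /* USM allocations are passed as raw pointers, not wrapped in a cl_mem. */
        pSetArgPointer(kernel, 0, host);
        size_t gws = 256;
        clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &gws, NULL, 0, NULL, NULL);

        /* The blocking free waits for commands that may still touch the allocation. */
        pBlockingFree(ctx, host);
    }

The notable difference from buffer-based code is the last point: the allocation is bound with clSetKernelArgMemPointerINTEL by pointer value, so no cl_mem object is created or released for it.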
diff --git a/include/CL/cl_ext_intel.h b/include/CL/cl_ext_intel.h
index aab82284c..a7ae87a34 100644
--- a/include/CL/cl_ext_intel.h
+++ b/include/CL/cl_ext_intel.h
@@ -14,718 +14,6 @@
* limitations under the License.
*
******************************************************************************/
-/*****************************************************************************\
-Copyright (c) 2013-2020 Intel Corporation All Rights Reserved.
-
-THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
-MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-File Name: cl_ext_intel.h
-
-Abstract:
-
-Notes:
-
-\*****************************************************************************/
-
-#ifndef __CL_EXT_INTEL_H
-#define __CL_EXT_INTEL_H
-
-#include <CL/cl.h>
-#include <CL/cl_platform.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/***************************************
-* cl_intel_thread_local_exec extension *
-****************************************/
-
-#define cl_intel_thread_local_exec 1
-
-#define CL_QUEUE_THREAD_LOCAL_EXEC_ENABLE_INTEL (((cl_bitfield)1) << 31)
-
-/***********************************************
-* cl_intel_device_partition_by_names extension *
-************************************************/
-
-#define cl_intel_device_partition_by_names 1
-
-#define CL_DEVICE_PARTITION_BY_NAMES_INTEL 0x4052
-#define CL_PARTITION_BY_NAMES_LIST_END_INTEL -1
-
-/************************************************
-* cl_intel_accelerator extension *
-* cl_intel_motion_estimation extension *
-* cl_intel_advanced_motion_estimation extension *
-*************************************************/
-
-#define cl_intel_accelerator 1
-#define cl_intel_motion_estimation 1
-#define cl_intel_advanced_motion_estimation 1
-
-typedef struct _cl_accelerator_intel* cl_accelerator_intel;
-typedef cl_uint cl_accelerator_type_intel;
-typedef cl_uint cl_accelerator_info_intel;
-
-typedef struct _cl_motion_estimation_desc_intel {
- cl_uint mb_block_type;
- cl_uint subpixel_mode;
- cl_uint sad_adjust_mode;
- cl_uint search_path_type;
-} cl_motion_estimation_desc_intel;
-
-/* error codes */
-#define CL_INVALID_ACCELERATOR_INTEL -1094
-#define CL_INVALID_ACCELERATOR_TYPE_INTEL -1095
-#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL -1096
-#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL -1097
-
-/* cl_accelerator_type_intel */
-#define CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL 0x0
-
-/* cl_accelerator_info_intel */
-#define CL_ACCELERATOR_DESCRIPTOR_INTEL 0x4090
-#define CL_ACCELERATOR_REFERENCE_COUNT_INTEL 0x4091
-#define CL_ACCELERATOR_CONTEXT_INTEL 0x4092
-#define CL_ACCELERATOR_TYPE_INTEL 0x4093
-
-/* cl_motion_detect_desc_intel flags */
-#define CL_ME_MB_TYPE_16x16_INTEL 0x0
-#define CL_ME_MB_TYPE_8x8_INTEL 0x1
-#define CL_ME_MB_TYPE_4x4_INTEL 0x2
-
-#define CL_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0
-#define CL_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1
-#define CL_ME_SUBPIXEL_MODE_QPEL_INTEL 0x2
-
-#define CL_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0
-#define CL_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x1
-
-#define CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL 0x0
-#define CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL 0x1
-#define CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL 0x5
-
-#define CL_ME_SKIP_BLOCK_TYPE_16x16_INTEL 0x0
-#define CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL 0x1
-#define CL_ME_LUMA_INTRA_PREDICT_ENABLED_INTEL 0x2
-#define CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL 0x4
-
-#define CL_ME_FORWARD_INPUT_MODE_INTEL 0x1
-#define CL_ME_BACKWARD_INPUT_MODE_INTEL 0x2
-#define CL_ME_BIDIRECTION_INPUT_MODE_INTEL 0x3
-
-#define CL_ME_BIDIR_WEIGHT_QUARTER_INTEL 16
-#define CL_ME_BIDIR_WEIGHT_THIRD_INTEL 21
-#define CL_ME_BIDIR_WEIGHT_HALF_INTEL 32
-#define CL_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 43
-#define CL_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 48
-
-#define CL_ME_COST_PENALTY_NONE_INTEL 0x0
-#define CL_ME_COST_PENALTY_LOW_INTEL 0x1
-#define CL_ME_COST_PENALTY_NORMAL_INTEL 0x2
-#define CL_ME_COST_PENALTY_HIGH_INTEL 0x3
-
-#define CL_ME_COST_PRECISION_QPEL_INTEL 0x0
-#define CL_ME_COST_PRECISION_HPEL_INTEL 0x1
-#define CL_ME_COST_PRECISION_PEL_INTEL 0x2
-#define CL_ME_COST_PRECISION_DPEL_INTEL 0x3
-
-#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0
-#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
-#define CL_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2
-#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3
-
-#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4
-#define CL_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4
-#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5
-#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6
-#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7
-#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8
-
-#define CL_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0
-#define CL_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
-#define CL_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2
-#define CL_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3
-
-/* cl_device_info */
-#define CL_DEVICE_ME_VERSION_INTEL 0x407E
-
-#define CL_ME_VERSION_LEGACY_INTEL 0x0
-#define CL_ME_VERSION_ADVANCED_VER_1_INTEL 0x1
-#define CL_ME_VERSION_ADVANCED_VER_2_INTEL 0x2
-
-extern CL_API_ENTRY cl_accelerator_intel CL_API_CALL
-clCreateAcceleratorINTEL(
- cl_context context,
- cl_accelerator_type_intel accelerator_type,
- size_t descriptor_size,
- const void* descriptor,
- cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
-
-typedef CL_API_ENTRY cl_accelerator_intel (CL_API_CALL *clCreateAcceleratorINTEL_fn)(
- cl_context context,
- cl_accelerator_type_intel accelerator_type,
- size_t descriptor_size,
- const void* descriptor,
- cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetAcceleratorInfoINTEL(
- cl_accelerator_intel accelerator,
- cl_accelerator_info_intel param_name,
- size_t param_value_size,
- void* param_value,
- size_t* param_value_size_ret) CL_EXT_SUFFIX__VERSION_1_2;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetAcceleratorInfoINTEL_fn)(
- cl_accelerator_intel accelerator,
- cl_accelerator_info_intel param_name,
- size_t param_value_size,
- void* param_value,
- size_t* param_value_size_ret) CL_EXT_SUFFIX__VERSION_1_2;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clRetainAcceleratorINTEL(
- cl_accelerator_intel accelerator) CL_EXT_SUFFIX__VERSION_1_2;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *clRetainAcceleratorINTEL_fn)(
- cl_accelerator_intel accelerator) CL_EXT_SUFFIX__VERSION_1_2;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clReleaseAcceleratorINTEL(
- cl_accelerator_intel accelerator) CL_EXT_SUFFIX__VERSION_1_2;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *clReleaseAcceleratorINTEL_fn)(
- cl_accelerator_intel accelerator) CL_EXT_SUFFIX__VERSION_1_2;
-
-/******************************************
-* cl_intel_simultaneous_sharing extension *
-*******************************************/
-
-#define cl_intel_simultaneous_sharing 1
-
-#define CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL 0x4104
-#define CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL 0x4105
-
-/***********************************
-* cl_intel_egl_image_yuv extension *
-************************************/
-
-#define cl_intel_egl_image_yuv 1
-
-#define CL_EGL_YUV_PLANE_INTEL 0x4107
-
-/********************************
-* cl_intel_packed_yuv extension *
-*********************************/
-
-#define cl_intel_packed_yuv 1
-
-#define CL_YUYV_INTEL 0x4076
-#define CL_UYVY_INTEL 0x4077
-#define CL_YVYU_INTEL 0x4078
-#define CL_VYUY_INTEL 0x4079
-
-/********************************************
-* cl_intel_required_subgroup_size extension *
-*********************************************/
-
-#define cl_intel_required_subgroup_size 1
-
-#define CL_DEVICE_SUB_GROUP_SIZES_INTEL 0x4108
-#define CL_KERNEL_SPILL_MEM_SIZE_INTEL 0x4109
-#define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL 0x410A
-
-/****************************************
-* cl_intel_driver_diagnostics extension *
-*****************************************/
-
-#define cl_intel_driver_diagnostics 1
-
-typedef cl_uint cl_diagnostics_verbose_level;
-
-#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106
-
-#define CL_CONTEXT_DIAGNOSTICS_LEVEL_ALL_INTEL ( 0xff )
-#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL ( 1 )
-#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL ( 1 << 1 )
-#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL ( 1 << 2 )
-
-/********************************
-* cl_intel_planar_yuv extension *
-*********************************/
-
-#define CL_NV12_INTEL 0x410E
-
-#define CL_MEM_NO_ACCESS_INTEL ( 1 << 24 )
-#define CL_MEM_ACCESS_FLAGS_UNRESTRICTED_INTEL ( 1 << 25 )
-
-#define CL_DEVICE_PLANAR_YUV_MAX_WIDTH_INTEL 0x417E
-#define CL_DEVICE_PLANAR_YUV_MAX_HEIGHT_INTEL 0x417F
-
-/*******************************************************
-* cl_intel_device_side_avc_motion_estimation extension *
-********************************************************/
-
-#define CL_DEVICE_AVC_ME_VERSION_INTEL 0x410B
-#define CL_DEVICE_AVC_ME_SUPPORTS_TEXTURE_SAMPLER_USE_INTEL 0x410C
-#define CL_DEVICE_AVC_ME_SUPPORTS_PREEMPTION_INTEL 0x410D
-
-#define CL_AVC_ME_VERSION_0_INTEL 0x0 /* No support. */
-#define CL_AVC_ME_VERSION_1_INTEL 0x1 /* First supported version. */
-
-#define CL_AVC_ME_MAJOR_16x16_INTEL 0x0
-#define CL_AVC_ME_MAJOR_16x8_INTEL 0x1
-#define CL_AVC_ME_MAJOR_8x16_INTEL 0x2
-#define CL_AVC_ME_MAJOR_8x8_INTEL 0x3
-
-#define CL_AVC_ME_MINOR_8x8_INTEL 0x0
-#define CL_AVC_ME_MINOR_8x4_INTEL 0x1
-#define CL_AVC_ME_MINOR_4x8_INTEL 0x2
-#define CL_AVC_ME_MINOR_4x4_INTEL 0x3
-
-#define CL_AVC_ME_MAJOR_FORWARD_INTEL 0x0
-#define CL_AVC_ME_MAJOR_BACKWARD_INTEL 0x1
-#define CL_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL 0x2
-
-#define CL_AVC_ME_PARTITION_MASK_ALL_INTEL 0x0
-#define CL_AVC_ME_PARTITION_MASK_16x16_INTEL 0x7E
-#define CL_AVC_ME_PARTITION_MASK_16x8_INTEL 0x7D
-#define CL_AVC_ME_PARTITION_MASK_8x16_INTEL 0x7B
-#define CL_AVC_ME_PARTITION_MASK_8x8_INTEL 0x77
-#define CL_AVC_ME_PARTITION_MASK_8x4_INTEL 0x6F
-#define CL_AVC_ME_PARTITION_MASK_4x8_INTEL 0x5F
-#define CL_AVC_ME_PARTITION_MASK_4x4_INTEL 0x3F
-
-#define CL_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL 0x0
-#define CL_AVC_ME_SEARCH_WINDOW_SMALL_INTEL 0x1
-#define CL_AVC_ME_SEARCH_WINDOW_TINY_INTEL 0x2
-#define CL_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL 0x3
-#define CL_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL 0x4
-#define CL_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL 0x5
-#define CL_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL 0x6
-#define CL_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL 0x7
-#define CL_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL 0x8
-#define CL_AVC_ME_SEARCH_WINDOW_16x12_RADIUS_INTEL 0x9
-#define CL_AVC_ME_SEARCH_WINDOW_4x4_RADIUS_INTEL 0x2
-#define CL_AVC_ME_SEARCH_WINDOW_2x2_RADIUS_INTEL 0xa
-
-#define CL_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0
-#define CL_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x2
-
-#define CL_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0
-#define CL_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1
-#define CL_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL 0x3
-
-#define CL_AVC_ME_COST_PRECISION_QPEL_INTEL 0x0
-#define CL_AVC_ME_COST_PRECISION_HPEL_INTEL 0x1
-#define CL_AVC_ME_COST_PRECISION_PEL_INTEL 0x2
-#define CL_AVC_ME_COST_PRECISION_DPEL_INTEL 0x3
-
-#define CL_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL 0x10
-#define CL_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL 0x15
-#define CL_AVC_ME_BIDIR_WEIGHT_HALF_INTEL 0x20
-#define CL_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 0x2B
-#define CL_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 0x30
-
-#define CL_AVC_ME_BORDER_REACHED_LEFT_INTEL 0x0
-#define CL_AVC_ME_BORDER_REACHED_RIGHT_INTEL 0x2
-#define CL_AVC_ME_BORDER_REACHED_TOP_INTEL 0x4
-#define CL_AVC_ME_BORDER_REACHED_BOTTOM_INTEL 0x8
-
-#define CL_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL 0x0
-#define CL_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL 0x4000
-
-#define CL_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL ( 0x1 << 24 )
-#define CL_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL ( 0x2 << 24 )
-#define CL_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL ( 0x3 << 24 )
-#define CL_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL ( 0x55 << 24 )
-#define CL_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL ( 0xAA << 24 )
-#define CL_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL ( 0xFF << 24 )
-#define CL_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL ( 0x1 << 24 )
-#define CL_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL ( 0x2 << 24 )
-#define CL_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL ( 0x1 << 26 )
-#define CL_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL ( 0x2 << 26 )
-#define CL_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL ( 0x1 << 28 )
-#define CL_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL ( 0x2 << 28 )
-#define CL_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL ( 0x1 << 30 )
-#define CL_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL ( 0x2 << 30 )
-
-#define CL_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL 0x00
-#define CL_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL 0x80
-
-#define CL_AVC_ME_INTRA_16x16_INTEL 0x0
-#define CL_AVC_ME_INTRA_8x8_INTEL 0x1
-#define CL_AVC_ME_INTRA_4x4_INTEL 0x2
-
-#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL 0x6
-#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL 0x5
-#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL 0x3
-
-#define CL_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL 0x60
-#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL 0x10
-#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8
-#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x4
-
-#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0
-#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
-#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2
-#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3
-#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4
-#define CL_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4
-#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5
-#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6
-#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7
-#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8
-#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0
-#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
-#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2
-#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3
-
-#define CL_AVC_ME_FRAME_FORWARD_INTEL 0x1
-#define CL_AVC_ME_FRAME_BACKWARD_INTEL 0x2
-#define CL_AVC_ME_FRAME_DUAL_INTEL 0x3
-
-#define CL_AVC_ME_SLICE_TYPE_PRED_INTEL 0x0
-#define CL_AVC_ME_SLICE_TYPE_BPRED_INTEL 0x1
-#define CL_AVC_ME_SLICE_TYPE_INTRA_INTEL 0x2
-
-#define CL_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL 0x0
-#define CL_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL 0x1
-
-/*******************************************
-* cl_intel_unified_shared_memory extension *
-********************************************/
-
-/* These APIs are in sync with Revision Q of the cl_intel_unified_shared_memory spec! */
-
-#define cl_intel_unified_shared_memory 1
-
-/* cl_device_info */
-#define CL_DEVICE_HOST_MEM_CAPABILITIES_INTEL 0x4190
-#define CL_DEVICE_DEVICE_MEM_CAPABILITIES_INTEL 0x4191
-#define CL_DEVICE_SINGLE_DEVICE_SHARED_MEM_CAPABILITIES_INTEL 0x4192
-#define CL_DEVICE_CROSS_DEVICE_SHARED_MEM_CAPABILITIES_INTEL 0x4193
-#define CL_DEVICE_SHARED_SYSTEM_MEM_CAPABILITIES_INTEL 0x4194
-
-typedef cl_bitfield cl_device_unified_shared_memory_capabilities_intel;
-
-/* cl_device_unified_shared_memory_capabilities_intel - bitfield */
-#define CL_UNIFIED_SHARED_MEMORY_ACCESS_INTEL (1 << 0)
-#define CL_UNIFIED_SHARED_MEMORY_ATOMIC_ACCESS_INTEL (1 << 1)
-#define CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ACCESS_INTEL (1 << 2)
-#define CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ATOMIC_ACCESS_INTEL (1 << 3)
-
-typedef cl_properties cl_mem_properties_intel;
-
-/* cl_mem_properties_intel */
-#define CL_MEM_ALLOC_FLAGS_INTEL 0x4195
-
-typedef cl_bitfield cl_mem_alloc_flags_intel;
-
-/* cl_mem_alloc_flags_intel - bitfield */
-#define CL_MEM_ALLOC_WRITE_COMBINED_INTEL (1 << 0)
-
-typedef cl_uint cl_mem_info_intel;
-
-/* cl_mem_alloc_info_intel */
-#define CL_MEM_ALLOC_TYPE_INTEL 0x419A
-#define CL_MEM_ALLOC_BASE_PTR_INTEL 0x419B
-#define CL_MEM_ALLOC_SIZE_INTEL 0x419C
-#define CL_MEM_ALLOC_DEVICE_INTEL 0x419D
-/* Enum values 0x419E-0x419F are reserved for future queries. */
-
-typedef cl_uint cl_unified_shared_memory_type_intel;
-
-/* cl_unified_shared_memory_type_intel */
-#define CL_MEM_TYPE_UNKNOWN_INTEL 0x4196
-#define CL_MEM_TYPE_HOST_INTEL 0x4197
-#define CL_MEM_TYPE_DEVICE_INTEL 0x4198
-#define CL_MEM_TYPE_SHARED_INTEL 0x4199
-
-typedef cl_uint cl_mem_advice_intel;
-
-/* cl_mem_advice_intel */
-/* Enum values 0x4208-0x420F are reserved for future memory advices. */
-
-/* cl_kernel_exec_info */
-#define CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL 0x4200
-#define CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL 0x4201
-#define CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL 0x4202
-#define CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL 0x4203
-
-/* cl_command_type */
-#define CL_COMMAND_MEMFILL_INTEL 0x4204
-#define CL_COMMAND_MEMCPY_INTEL 0x4205
-#define CL_COMMAND_MIGRATEMEM_INTEL 0x4206
-#define CL_COMMAND_MEMADVISE_INTEL 0x4207
-
-extern CL_API_ENTRY void* CL_API_CALL
-clHostMemAllocINTEL(
- cl_context context,
- const cl_mem_properties_intel* properties,
- size_t size,
- cl_uint alignment,
- cl_int* errcode_ret);
-
-typedef CL_API_ENTRY void* (CL_API_CALL *
-clHostMemAllocINTEL_fn)(
- cl_context context,
- const cl_mem_properties_intel* properties,
- size_t size,
- cl_uint alignment,
- cl_int* errcode_ret);
-
-extern CL_API_ENTRY void* CL_API_CALL
-clDeviceMemAllocINTEL(
- cl_context context,
- cl_device_id device,
- const cl_mem_properties_intel* properties,
- size_t size,
- cl_uint alignment,
- cl_int* errcode_ret);
-
-typedef CL_API_ENTRY void* (CL_API_CALL *
-clDeviceMemAllocINTEL_fn)(
- cl_context context,
- cl_device_id device,
- const cl_mem_properties_intel* properties,
- size_t size,
- cl_uint alignment,
- cl_int* errcode_ret);
-
-extern CL_API_ENTRY void* CL_API_CALL
-clSharedMemAllocINTEL(
- cl_context context,
- cl_device_id device,
- const cl_mem_properties_intel* properties,
- size_t size,
- cl_uint alignment,
- cl_int* errcode_ret);
-
-typedef CL_API_ENTRY void* (CL_API_CALL *
-clSharedMemAllocINTEL_fn)(
- cl_context context,
- cl_device_id device,
- const cl_mem_properties_intel* properties,
- size_t size,
- cl_uint alignment,
- cl_int* errcode_ret);
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clMemFreeINTEL(
- cl_context context,
- void* ptr);
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-clMemFreeINTEL_fn)(
- cl_context context,
- void* ptr);
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clMemBlockingFreeINTEL(
- cl_context context,
- void* ptr);
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-clMemBlockingFreeINTEL_fn)(
- cl_context context,
- void* ptr);
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetMemAllocInfoINTEL(
- cl_context context,
- const void* ptr,
- cl_mem_info_intel param_name,
- size_t param_value_size,
- void* param_value,
- size_t* param_value_size_ret);
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-clGetMemAllocInfoINTEL_fn)(
- cl_context context,
- const void* ptr,
- cl_mem_info_intel param_name,
- size_t param_value_size,
- void* param_value,
- size_t* param_value_size_ret);
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clSetKernelArgMemPointerINTEL(
- cl_kernel kernel,
- cl_uint arg_index,
- const void* arg_value);
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-clSetKernelArgMemPointerINTEL_fn)(
- cl_kernel kernel,
- cl_uint arg_index,
- const void* arg_value);
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueMemsetINTEL( /* Deprecated */
- cl_command_queue command_queue,
- void* dst_ptr,
- cl_int value,
- size_t size,
- cl_uint num_events_in_wait_list,
- const cl_event* event_wait_list,
- cl_event* event);
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-clEnqueueMemsetINTEL_fn)( /* Deprecated */
- cl_command_queue command_queue,
- void* dst_ptr,
- cl_int value,
- size_t size,
- cl_uint num_events_in_wait_list,
- const cl_event* event_wait_list,
- cl_event* event);
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueMemFillINTEL(
- cl_command_queue command_queue,
- void* dst_ptr,
- const void* pattern,
- size_t pattern_size,
- size_t size,
- cl_uint num_events_in_wait_list,
- const cl_event* event_wait_list,
- cl_event* event);
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-clEnqueueMemFillINTEL_fn)(
- cl_command_queue command_queue,
- void* dst_ptr,
- const void* pattern,
- size_t pattern_size,
- size_t size,
- cl_uint num_events_in_wait_list,
- const cl_event* event_wait_list,
- cl_event* event);
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueMemcpyINTEL(
- cl_command_queue command_queue,
- cl_bool blocking,
- void* dst_ptr,
- const void* src_ptr,
- size_t size,
- cl_uint num_events_in_wait_list,
- const cl_event* event_wait_list,
- cl_event* event);
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-clEnqueueMemcpyINTEL_fn)(
- cl_command_queue command_queue,
- cl_bool blocking,
- void* dst_ptr,
- const void* src_ptr,
- size_t size,
- cl_uint num_events_in_wait_list,
- const cl_event* event_wait_list,
- cl_event* event);
-
-#ifdef CL_VERSION_1_2
-
-/* Because these APIs use cl_mem_migration_flags, they require
- OpenCL 1.2: */
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueMigrateMemINTEL(
- cl_command_queue command_queue,
- const void* ptr,
- size_t size,
- cl_mem_migration_flags flags,
- cl_uint num_events_in_wait_list,
- const cl_event* event_wait_list,
- cl_event* event);
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-clEnqueueMigrateMemINTEL_fn)(
- cl_command_queue command_queue,
- const void* ptr,
- size_t size,
- cl_mem_migration_flags flags,
- cl_uint num_events_in_wait_list,
- const cl_event* event_wait_list,
- cl_event* event);
-
-#endif
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueMemAdviseINTEL(
- cl_command_queue command_queue,
- const void* ptr,
- size_t size,
- cl_mem_advice_intel advice,
- cl_uint num_events_in_wait_list,
- const cl_event* event_wait_list,
- cl_event* event);
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-clEnqueueMemAdviseINTEL_fn)(
- cl_command_queue command_queue,
- const void* ptr,
- size_t size,
- cl_mem_advice_intel advice,
- cl_uint num_events_in_wait_list,
- const cl_event* event_wait_list,
- cl_event* event);
-
-/***************************************************
-* cl_intel_create_buffer_with_properties extension *
-****************************************************/
-
-#define cl_intel_create_buffer_with_properties 1
-
-extern CL_API_ENTRY cl_mem CL_API_CALL
-clCreateBufferWithPropertiesINTEL(
- cl_context context,
- const cl_mem_properties_intel* properties,
- cl_mem_flags flags,
- size_t size,
- void * host_ptr,
- cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_mem (CL_API_CALL *
-clCreateBufferWithPropertiesINTEL_fn)(
- cl_context context,
- const cl_mem_properties_intel* properties,
- cl_mem_flags flags,
- size_t size,
- void * host_ptr,
- cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_0;
-
-/******************************************
-* cl_intel_mem_channel_property extension *
-*******************************************/
-
-#define CL_MEM_CHANNEL_INTEL 0x4213
-
-/*********************************
-* cl_intel_mem_force_host_memory *
-**********************************/
-
-#define cl_intel_mem_force_host_memory 1
-
-/* cl_mem_flags */
-#define CL_MEM_FORCE_HOST_MEMORY_INTEL (1 << 20)
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* __CL_EXT_INTEL_H */
+#include <CL/cl_ext.h>
+#pragma message("The Intel extensions have been moved into cl_ext.h. Please include cl_ext.h directly.")
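
With this change cl_ext_intel.h becomes a compatibility shim: it only includes cl_ext.h and emits the #pragma message above, so existing code keeps compiling but should migrate its include. A minimal before/after, assuming nothing beyond the shim itself:

    /* Before: still compiles, but now triggers the deprecation message. */
    #include <CL/cl_ext_intel.h>

    /* After: the Intel extension definitions live in cl_ext.h directly. */
    #include <CL/cl_ext.h>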
diff --git a/include/CL/cl_gl.h b/include/CL/cl_gl.h
index b587f02a9..5ea0fd8b7 100644
--- a/include/CL/cl_gl.h
+++ b/include/CL/cl_gl.h
@@ -1,5 +1,5 @@
/*******************************************************************************
- * Copyright (c) 2008-2020 The Khronos Group Inc.
+ * Copyright (c) 2008-2021 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -102,21 +102,21 @@ clEnqueueReleaseGLObjects(cl_command_queue command_queue,
/* Deprecated OpenCL 1.1 APIs */
-extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+extern CL_API_ENTRY CL_API_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
clCreateFromGLTexture2D(cl_context context,
cl_mem_flags flags,
cl_GLenum target,
cl_GLint miplevel,
cl_GLuint texture,
- cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_1_DEPRECATED;
-extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+extern CL_API_ENTRY CL_API_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
clCreateFromGLTexture3D(cl_context context,
cl_mem_flags flags,
cl_GLenum target,
cl_GLint miplevel,
cl_GLuint texture,
- cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_1_DEPRECATED;
/* cl_khr_gl_sharing extension */
@@ -145,13 +145,23 @@ clGetGLContextInfoKHR(const cl_context_properties * properties,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
+typedef cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
const cl_context_properties * properties,
cl_gl_context_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret);
+/*
+ * cl_khr_gl_event extension
+ */
+#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D
+
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateEventFromGLsyncKHR(cl_context context,
+ cl_GLsync sync,
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_1;
+
#ifdef __cplusplus
}
#endif
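
The cl_khr_gl_event pieces (CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR and clCreateEventFromGLsyncKHR) now live in cl_gl.h itself rather than cl_gl_ext.h. A hedged sketch of how that entry point is typically used, assuming ctx, queue, mem and gl_sync already exist and that the platform reports cl_khr_gl_event:

    #include <CL/cl.h>
    #include <CL/cl_gl.h>   /* clCreateEventFromGLsyncKHR is now declared here */

    /* Sketch: gl_sync would typically come from
       glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0) on the GL side. */
    static void acquire_after_gl_fence(cl_context ctx, cl_command_queue queue,
                                       cl_mem mem, cl_GLsync gl_sync)
    {
        cl_int   err     = CL_SUCCESS;
        cl_event gl_done = clCreateEventFromGLsyncKHR(ctx, gl_sync, &err);
        if (err != CL_SUCCESS)
            return;

        /* Let the acquire wait on the GL fence instead of a full glFinish(). */
        clEnqueueAcquireGLObjects(queue, 1, &mem, 1, &gl_done, NULL);
        clReleaseEvent(gl_done);
    }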
diff --git a/include/CL/cl_gl_ext.h b/include/CL/cl_gl_ext.h
index 52107b111..8ec818167 100644
--- a/include/CL/cl_gl_ext.h
+++ b/include/CL/cl_gl_ext.h
@@ -1,5 +1,5 @@
/*******************************************************************************
- * Copyright (c) 2008-2020 The Khronos Group Inc.
+ * Copyright (c) 2008-2021 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,27 +14,5 @@
* limitations under the License.
******************************************************************************/
-#ifndef __OPENCL_CL_GL_EXT_H
-#define __OPENCL_CL_GL_EXT_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
#include <CL/cl_gl.h>
-
-/*
- * cl_khr_gl_event extension
- */
-#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D
-
-extern CL_API_ENTRY cl_event CL_API_CALL
-clCreateEventFromGLsyncKHR(cl_context context,
- cl_GLsync sync,
- cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* __OPENCL_CL_GL_EXT_H */
+#pragma message("All OpenGL-related extensions have been moved into cl_gl.h. Please include cl_gl.h directly.")
diff --git a/include/CL/cl_icd.h b/include/CL/cl_icd.h
index 8ff8b94f9..d5ac1de09 100644
--- a/include/CL/cl_icd.h
+++ b/include/CL/cl_icd.h
@@ -41,35 +41,35 @@ extern "C" {
/* API function pointer definitions */
// Platform APIs
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetPlatformIDs)(
+typedef cl_int(CL_API_CALL *cl_api_clGetPlatformIDs)(
cl_uint num_entries, cl_platform_id *platforms,
cl_uint *num_platforms) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetPlatformInfo)(
+typedef cl_int(CL_API_CALL *cl_api_clGetPlatformInfo)(
cl_platform_id platform, cl_platform_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
// Device APIs
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceIDs)(
+typedef cl_int(CL_API_CALL *cl_api_clGetDeviceIDs)(
cl_platform_id platform, cl_device_type device_type, cl_uint num_entries,
cl_device_id *devices, cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceInfo)(
+typedef cl_int(CL_API_CALL *cl_api_clGetDeviceInfo)(
cl_device_id device, cl_device_info param_name, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCreateSubDevices)(
+typedef cl_int(CL_API_CALL *cl_api_clCreateSubDevices)(
cl_device_id in_device,
const cl_device_partition_property *partition_properties,
cl_uint num_entries, cl_device_id *out_devices, cl_uint *num_devices);
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainDevice)(
+typedef cl_int(CL_API_CALL *cl_api_clRetainDevice)(
cl_device_id device) CL_API_SUFFIX__VERSION_1_2;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseDevice)(
+typedef cl_int(CL_API_CALL *cl_api_clReleaseDevice)(
cl_device_id device) CL_API_SUFFIX__VERSION_1_2;
#else
@@ -81,36 +81,36 @@ typedef void *cl_api_clReleaseDevice;
#endif
// Context APIs
-typedef CL_API_ENTRY cl_context(CL_API_CALL *cl_api_clCreateContext)(
+typedef cl_context(CL_API_CALL *cl_api_clCreateContext)(
const cl_context_properties *properties, cl_uint num_devices,
const cl_device_id *devices,
void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),
void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_context(CL_API_CALL *cl_api_clCreateContextFromType)(
+typedef cl_context(CL_API_CALL *cl_api_clCreateContextFromType)(
const cl_context_properties *properties, cl_device_type device_type,
void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),
void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainContext)(
+typedef cl_int(CL_API_CALL *cl_api_clRetainContext)(
cl_context context) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseContext)(
+typedef cl_int(CL_API_CALL *cl_api_clReleaseContext)(
cl_context context) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetContextInfo)(
+typedef cl_int(CL_API_CALL *cl_api_clGetContextInfo)(
cl_context context, cl_context_info param_name, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
// Command Queue APIs
-typedef CL_API_ENTRY cl_command_queue(CL_API_CALL *cl_api_clCreateCommandQueue)(
+typedef cl_command_queue(CL_API_CALL *cl_api_clCreateCommandQueue)(
cl_context context, cl_device_id device,
cl_command_queue_properties properties,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_2_0
-typedef CL_API_ENTRY
+typedef
cl_command_queue(CL_API_CALL *cl_api_clCreateCommandQueueWithProperties)(
cl_context /* context */, cl_device_id /* device */,
const cl_queue_properties * /* properties */,
@@ -122,25 +122,25 @@ typedef void *cl_api_clCreateCommandQueueWithProperties;
#endif
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainCommandQueue)(
+typedef cl_int(CL_API_CALL *cl_api_clRetainCommandQueue)(
cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseCommandQueue)(
+typedef cl_int(CL_API_CALL *cl_api_clReleaseCommandQueue)(
cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetCommandQueueInfo)(
+typedef cl_int(CL_API_CALL *cl_api_clGetCommandQueueInfo)(
cl_command_queue command_queue, cl_command_queue_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
// Memory Object APIs
-typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateBuffer)(
+typedef cl_mem(CL_API_CALL *cl_api_clCreateBuffer)(
cl_context context, cl_mem_flags flags, size_t size, void *host_ptr,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
-typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateImage)(
+typedef cl_mem(CL_API_CALL *cl_api_clCreateImage)(
cl_context context, cl_mem_flags flags, const cl_image_format *image_format,
const cl_image_desc *image_desc, void *host_ptr,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;
@@ -153,17 +153,17 @@ typedef void *cl_api_clCreateImage;
#ifdef CL_VERSION_3_0
-typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateBufferWithProperties)(
+typedef cl_mem(CL_API_CALL *cl_api_clCreateBufferWithProperties)(
cl_context context, const cl_mem_properties *properties, cl_mem_flags flags,
size_t size, void *host_ptr,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_3_0;
-typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateImageWithProperties)(
+typedef cl_mem(CL_API_CALL *cl_api_clCreateImageWithProperties)(
cl_context context, const cl_mem_properties *properties, cl_mem_flags flags,
const cl_image_format *image_format, const cl_image_desc *image_desc,
void *host_ptr, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_3_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL* cl_api_clSetContextDestructorCallback)(
+typedef cl_int(CL_API_CALL* cl_api_clSetContextDestructorCallback)(
cl_context context,
void(CL_CALLBACK* pfn_notify)(cl_context context, void* user_data),
void* user_data) CL_API_SUFFIX__VERSION_3_0;
@@ -176,43 +176,43 @@ typedef void *cl_api_clSetContextDestructorCallback;
#endif
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainMemObject)(
+typedef cl_int(CL_API_CALL *cl_api_clRetainMemObject)(
cl_mem memobj) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseMemObject)(
+typedef cl_int(CL_API_CALL *cl_api_clReleaseMemObject)(
cl_mem memobj) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetSupportedImageFormats)(
+typedef cl_int(CL_API_CALL *cl_api_clGetSupportedImageFormats)(
cl_context context, cl_mem_flags flags, cl_mem_object_type image_type,
cl_uint num_entries, cl_image_format *image_formats,
cl_uint *num_image_formats) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetMemObjectInfo)(
+typedef cl_int(CL_API_CALL *cl_api_clGetMemObjectInfo)(
cl_mem memobj, cl_mem_info param_name, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetImageInfo)(
+typedef cl_int(CL_API_CALL *cl_api_clGetImageInfo)(
cl_mem image, cl_image_info param_name, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_2_0
-typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreatePipe)(
+typedef cl_mem(CL_API_CALL *cl_api_clCreatePipe)(
cl_context /* context */, cl_mem_flags /* flags */,
cl_uint /* pipe_packet_size */, cl_uint /* pipe_max_packets */,
const cl_pipe_properties * /* properties */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetPipeInfo)(
+typedef cl_int(CL_API_CALL *cl_api_clGetPipeInfo)(
cl_mem /* pipe */, cl_pipe_info /* param_name */,
size_t /* param_value_size */, void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_2_0;
-typedef CL_API_ENTRY void *(CL_API_CALL *cl_api_clSVMAlloc)(
+typedef void *(CL_API_CALL *cl_api_clSVMAlloc)(
cl_context /* context */, cl_svm_mem_flags /* flags */, size_t /* size */,
unsigned int /* alignment */)CL_API_SUFFIX__VERSION_2_0;
-typedef CL_API_ENTRY void(CL_API_CALL *cl_api_clSVMFree)(
+typedef void(CL_API_CALL *cl_api_clSVMFree)(
cl_context /* context */,
void * /* svm_pointer */) CL_API_SUFFIX__VERSION_2_0;
@@ -226,24 +226,24 @@ typedef void *cl_api_clSVMFree;
#endif
// Sampler APIs
-typedef CL_API_ENTRY cl_sampler(CL_API_CALL *cl_api_clCreateSampler)(
+typedef cl_sampler(CL_API_CALL *cl_api_clCreateSampler)(
cl_context context, cl_bool normalized_coords,
cl_addressing_mode addressing_mode, cl_filter_mode filter_mode,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainSampler)(
+typedef cl_int(CL_API_CALL *cl_api_clRetainSampler)(
cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseSampler)(
+typedef cl_int(CL_API_CALL *cl_api_clReleaseSampler)(
cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetSamplerInfo)(
+typedef cl_int(CL_API_CALL *cl_api_clGetSamplerInfo)(
cl_sampler sampler, cl_sampler_info param_name, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_2_0
-typedef CL_API_ENTRY
+typedef
cl_sampler(CL_API_CALL *cl_api_clCreateSamplerWithProperties)(
cl_context /* context */,
const cl_sampler_properties * /* sampler_properties */,
@@ -256,18 +256,18 @@ typedef void *cl_api_clCreateSamplerWithProperties;
#endif
// Program Object APIs
-typedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clCreateProgramWithSource)(
+typedef cl_program(CL_API_CALL *cl_api_clCreateProgramWithSource)(
cl_context context, cl_uint count, const char **strings,
const size_t *lengths, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clCreateProgramWithBinary)(
+typedef cl_program(CL_API_CALL *cl_api_clCreateProgramWithBinary)(
cl_context context, cl_uint num_devices, const cl_device_id *device_list,
const size_t *lengths, const unsigned char **binaries,
cl_int *binary_status, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
-typedef CL_API_ENTRY
+typedef
cl_program(CL_API_CALL *cl_api_clCreateProgramWithBuiltInKernels)(
cl_context context, cl_uint num_devices, const cl_device_id *device_list,
const char *kernel_names, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;
@@ -278,13 +278,13 @@ typedef void *cl_api_clCreateProgramWithBuiltInKernels;
#endif
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainProgram)(
+typedef cl_int(CL_API_CALL *cl_api_clRetainProgram)(
cl_program program) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseProgram)(
+typedef cl_int(CL_API_CALL *cl_api_clReleaseProgram)(
cl_program program) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clBuildProgram)(
+typedef cl_int(CL_API_CALL *cl_api_clBuildProgram)(
cl_program program, cl_uint num_devices, const cl_device_id *device_list,
const char *options,
void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
@@ -292,14 +292,14 @@ typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clBuildProgram)(
#ifdef CL_VERSION_1_2
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCompileProgram)(
+typedef cl_int(CL_API_CALL *cl_api_clCompileProgram)(
cl_program program, cl_uint num_devices, const cl_device_id *device_list,
const char *options, cl_uint num_input_headers,
const cl_program *input_headers, const char **header_include_names,
void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
void *user_data) CL_API_SUFFIX__VERSION_1_2;
-typedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clLinkProgram)(
+typedef cl_program(CL_API_CALL *cl_api_clLinkProgram)(
cl_context context, cl_uint num_devices, const cl_device_id *device_list,
const char *options, cl_uint num_input_programs,
const cl_program *input_programs,
@@ -315,12 +315,12 @@ typedef void *cl_api_clLinkProgram;
#ifdef CL_VERSION_2_2
-typedef CL_API_ENTRY
+typedef
cl_int(CL_API_CALL *cl_api_clSetProgramSpecializationConstant)(
cl_program program, cl_uint spec_id, size_t spec_size,
const void *spec_value) CL_API_SUFFIX__VERSION_2_2;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetProgramReleaseCallback)(
+typedef cl_int(CL_API_CALL *cl_api_clSetProgramReleaseCallback)(
cl_program program,
void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
void *user_data) CL_API_SUFFIX__VERSION_2_2;
@@ -334,7 +334,7 @@ typedef void *cl_api_clSetProgramReleaseCallback;
#ifdef CL_VERSION_1_2
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clUnloadPlatformCompiler)(
+typedef cl_int(CL_API_CALL *cl_api_clUnloadPlatformCompiler)(
cl_platform_id platform) CL_API_SUFFIX__VERSION_1_2;
#else
@@ -343,41 +343,41 @@ typedef void *cl_api_clUnloadPlatformCompiler;
#endif
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetProgramInfo)(
+typedef cl_int(CL_API_CALL *cl_api_clGetProgramInfo)(
cl_program program, cl_program_info param_name, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetProgramBuildInfo)(
+typedef cl_int(CL_API_CALL *cl_api_clGetProgramBuildInfo)(
cl_program program, cl_device_id device, cl_program_build_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
// Kernel Object APIs
-typedef CL_API_ENTRY cl_kernel(CL_API_CALL *cl_api_clCreateKernel)(
+typedef cl_kernel(CL_API_CALL *cl_api_clCreateKernel)(
cl_program program, const char *kernel_name,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCreateKernelsInProgram)(
+typedef cl_int(CL_API_CALL *cl_api_clCreateKernelsInProgram)(
cl_program program, cl_uint num_kernels, cl_kernel *kernels,
cl_uint *num_kernels_ret) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainKernel)(
+typedef cl_int(CL_API_CALL *cl_api_clRetainKernel)(
cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseKernel)(
+typedef cl_int(CL_API_CALL *cl_api_clReleaseKernel)(
cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetKernelArg)(
+typedef cl_int(CL_API_CALL *cl_api_clSetKernelArg)(
cl_kernel kernel, cl_uint arg_index, size_t arg_size,
const void *arg_value) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelInfo)(
+typedef cl_int(CL_API_CALL *cl_api_clGetKernelInfo)(
cl_kernel kernel, cl_kernel_info param_name, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelArgInfo)(
+typedef cl_int(CL_API_CALL *cl_api_clGetKernelArgInfo)(
cl_kernel kernel, cl_uint arg_indx, cl_kernel_arg_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_2;
@@ -388,28 +388,28 @@ typedef void *cl_api_clGetKernelArgInfo;
#endif
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelWorkGroupInfo)(
+typedef cl_int(CL_API_CALL *cl_api_clGetKernelWorkGroupInfo)(
cl_kernel kernel, cl_device_id device, cl_kernel_work_group_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_2_0
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetKernelArgSVMPointer)(
+typedef cl_int(CL_API_CALL *cl_api_clSetKernelArgSVMPointer)(
cl_kernel /* kernel */, cl_uint /* arg_index */,
const void * /* arg_value */) CL_API_SUFFIX__VERSION_2_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetKernelExecInfo)(
+typedef cl_int(CL_API_CALL *cl_api_clSetKernelExecInfo)(
cl_kernel /* kernel */, cl_kernel_exec_info /* param_name */,
size_t /* param_value_size */,
const void * /* param_value */) CL_API_SUFFIX__VERSION_2_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelSubGroupInfoKHR)(
+typedef cl_int(CL_API_CALL *cl_api_clGetKernelSubGroupInfoKHR)(
cl_kernel /* in_kernel */, cl_device_id /*in_device*/,
cl_kernel_sub_group_info /* param_name */, size_t /*input_value_size*/,
const void * /*input_value*/, size_t /*param_value_size*/,
void * /*param_value*/,
- size_t * /*param_value_size_ret*/) CL_EXT_SUFFIX__VERSION_2_0;
+ size_t * /*param_value_size_ret*/) CL_API_SUFFIX__VERSION_2_0;
#else
@@ -420,33 +420,33 @@ typedef void *cl_api_clGetKernelSubGroupInfoKHR;
#endif
// Event Object APIs
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clWaitForEvents)(
+typedef cl_int(CL_API_CALL *cl_api_clWaitForEvents)(
cl_uint num_events, const cl_event *event_list) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetEventInfo)(
+typedef cl_int(CL_API_CALL *cl_api_clGetEventInfo)(
cl_event event, cl_event_info param_name, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainEvent)(cl_event event)
+typedef cl_int(CL_API_CALL *cl_api_clRetainEvent)(cl_event event)
CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseEvent)(cl_event event)
+typedef cl_int(CL_API_CALL *cl_api_clReleaseEvent)(cl_event event)
CL_API_SUFFIX__VERSION_1_0;
// Profiling APIs
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetEventProfilingInfo)(
+typedef cl_int(CL_API_CALL *cl_api_clGetEventProfilingInfo)(
cl_event event, cl_profiling_info param_name, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
// Flush and Finish APIs
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clFlush)(
+typedef cl_int(CL_API_CALL *cl_api_clFlush)(
cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clFinish)(
+typedef cl_int(CL_API_CALL *cl_api_clFinish)(
cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;
// Enqueued Commands APIs
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReadBuffer)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueReadBuffer)(
cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read,
size_t offset, size_t cb, void *ptr, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
@@ -454,7 +454,7 @@ typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReadBuffer)(
#ifdef CL_VERSION_1_1
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReadBufferRect)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueReadBufferRect)(
cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read,
const size_t *buffer_origin, const size_t *host_origin,
const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch,
@@ -468,7 +468,7 @@ typedef void *cl_api_clEnqueueReadBufferRect;
#endif
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWriteBuffer)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueWriteBuffer)(
cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write,
size_t offset, size_t cb, const void *ptr, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
@@ -476,7 +476,7 @@ typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWriteBuffer)(
#ifdef CL_VERSION_1_1
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWriteBufferRect)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueWriteBufferRect)(
cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read,
const size_t *buffer_origin, const size_t *host_origin,
const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch,
@@ -492,7 +492,7 @@ typedef void *cl_api_clEnqueueWriteBufferRect;
#ifdef CL_VERSION_1_2
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueFillBuffer)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueFillBuffer)(
cl_command_queue command_queue, cl_mem buffer, const void *pattern,
size_t pattern_size, size_t offset, size_t cb,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
@@ -504,7 +504,7 @@ typedef void *cl_api_clEnqueueFillBuffer;
#endif
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyBuffer)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueCopyBuffer)(
cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer,
size_t src_offset, size_t dst_offset, size_t cb,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
@@ -512,7 +512,7 @@ typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyBuffer)(
#ifdef CL_VERSION_1_1
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyBufferRect)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueCopyBufferRect)(
cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer,
const size_t *src_origin, const size_t *dst_origin, const size_t *region,
size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch,
@@ -526,14 +526,14 @@ typedef void *cl_api_clEnqueueCopyBufferRect;
#endif
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReadImage)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueReadImage)(
cl_command_queue command_queue, cl_mem image, cl_bool blocking_read,
const size_t *origin, const size_t *region, size_t row_pitch,
size_t slice_pitch, void *ptr, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWriteImage)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueWriteImage)(
cl_command_queue command_queue, cl_mem image, cl_bool blocking_write,
const size_t *origin, const size_t *region, size_t input_row_pitch,
size_t input_slice_pitch, const void *ptr, cl_uint num_events_in_wait_list,
@@ -542,7 +542,7 @@ typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWriteImage)(
#ifdef CL_VERSION_1_2
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueFillImage)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueFillImage)(
cl_command_queue command_queue, cl_mem image, const void *fill_color,
const size_t origin[3], const size_t region[3],
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
@@ -554,45 +554,45 @@ typedef void *cl_api_clEnqueueFillImage;
#endif
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyImage)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueCopyImage)(
cl_command_queue command_queue, cl_mem src_image, cl_mem dst_image,
const size_t *src_origin, const size_t *dst_origin, const size_t *region,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyImageToBuffer)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueCopyImageToBuffer)(
cl_command_queue command_queue, cl_mem src_image, cl_mem dst_buffer,
const size_t *src_origin, const size_t *region, size_t dst_offset,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyBufferToImage)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueCopyBufferToImage)(
cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_image,
size_t src_offset, const size_t *dst_origin, const size_t *region,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY void *(CL_API_CALL *cl_api_clEnqueueMapBuffer)(
+typedef void *(CL_API_CALL *cl_api_clEnqueueMapBuffer)(
cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_map,
cl_map_flags map_flags, size_t offset, size_t cb,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event, cl_int *errcode_ret)CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY void *(CL_API_CALL *cl_api_clEnqueueMapImage)(
+typedef void *(CL_API_CALL *cl_api_clEnqueueMapImage)(
cl_command_queue command_queue, cl_mem image, cl_bool blocking_map,
cl_map_flags map_flags, const size_t *origin, const size_t *region,
size_t *image_row_pitch, size_t *image_slice_pitch,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event, cl_int *errcode_ret)CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueUnmapMemObject)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueUnmapMemObject)(
cl_command_queue command_queue, cl_mem memobj, void *mapped_ptr,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueMigrateMemObjects)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueMigrateMemObjects)(
cl_command_queue command_queue, cl_uint num_mem_objects,
const cl_mem *mem_objects, cl_mem_migration_flags flags,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
@@ -604,19 +604,19 @@ typedef void *cl_api_clEnqueueMigrateMemObjects;
#endif
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueNDRangeKernel)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueNDRangeKernel)(
cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim,
const size_t *global_work_offset, const size_t *global_work_size,
const size_t *local_work_size, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueTask)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueTask)(
cl_command_queue command_queue, cl_kernel kernel,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueNativeKernel)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueNativeKernel)(
cl_command_queue command_queue, void(CL_CALLBACK *user_func)(void *),
void *args, size_t cb_args, cl_uint num_mem_objects, const cl_mem *mem_list,
const void **args_mem_loc, cl_uint num_events_in_wait_list,
@@ -625,17 +625,17 @@ typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueNativeKernel)(
#ifdef CL_VERSION_1_2
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueMarkerWithWaitList)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueMarkerWithWaitList)(
cl_command_queue command_queue, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_2;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueBarrierWithWaitList)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueBarrierWithWaitList)(
cl_command_queue command_queue, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_2;
-typedef CL_API_ENTRY void *(
+typedef void *(
CL_API_CALL *cl_api_clGetExtensionFunctionAddressForPlatform)(
cl_platform_id platform,
const char *function_name)CL_API_SUFFIX__VERSION_1_2;
@@ -652,7 +652,7 @@ typedef void *cl_api_clGetExtensionFunctionAddressForPlatform;
#ifdef CL_VERSION_2_0
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMFree)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueSVMFree)(
cl_command_queue /* command_queue */, cl_uint /* num_svm_pointers */,
void ** /* svm_pointers */,
void(CL_CALLBACK *pfn_free_func)(cl_command_queue /* queue */,
@@ -663,28 +663,28 @@ typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMFree)(
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMMemcpy)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueSVMMemcpy)(
cl_command_queue /* command_queue */, cl_bool /* blocking_copy */,
void * /* dst_ptr */, const void * /* src_ptr */, size_t /* size */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMMemFill)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueSVMMemFill)(
cl_command_queue /* command_queue */, void * /* svm_ptr */,
const void * /* pattern */, size_t /* pattern_size */, size_t /* size */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMMap)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueSVMMap)(
cl_command_queue /* command_queue */, cl_bool /* blocking_map */,
cl_map_flags /* map_flags */, void * /* svm_ptr */, size_t /* size */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMUnmap)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueSVMUnmap)(
cl_command_queue /* command_queue */, void * /* svm_ptr */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
@@ -701,119 +701,119 @@ typedef void *cl_api_clEnqueueSVMUnmap;
#endif
// Deprecated APIs
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetCommandQueueProperty)(
+typedef cl_int(CL_API_CALL *cl_api_clSetCommandQueueProperty)(
cl_command_queue command_queue, cl_command_queue_properties properties,
cl_bool enable, cl_command_queue_properties *old_properties)
- CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED;
+ CL_API_SUFFIX__VERSION_1_0_DEPRECATED;
-typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateImage2D)(
+typedef cl_mem(CL_API_CALL *cl_api_clCreateImage2D)(
cl_context context, cl_mem_flags flags, const cl_image_format *image_format,
size_t image_width, size_t image_height, size_t image_row_pitch,
- void *host_ptr, cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+ void *host_ptr, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_1_DEPRECATED;
-typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateImage3D)(
+typedef cl_mem(CL_API_CALL *cl_api_clCreateImage3D)(
cl_context context, cl_mem_flags flags, const cl_image_format *image_format,
size_t image_width, size_t image_height, size_t image_depth,
size_t image_row_pitch, size_t image_slice_pitch, void *host_ptr,
- cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+ cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_1_DEPRECATED;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clUnloadCompiler)(void)
- CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+typedef cl_int(CL_API_CALL *cl_api_clUnloadCompiler)(void)
+ CL_API_SUFFIX__VERSION_1_1_DEPRECATED;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueMarker)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueMarker)(
cl_command_queue command_queue,
- cl_event *event) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+ cl_event *event) CL_API_SUFFIX__VERSION_1_1_DEPRECATED;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWaitForEvents)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueWaitForEvents)(
cl_command_queue command_queue, cl_uint num_events,
- const cl_event *event_list) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+ const cl_event *event_list) CL_API_SUFFIX__VERSION_1_1_DEPRECATED;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueBarrier)(
- cl_command_queue command_queue) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueBarrier)(
+ cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_1_DEPRECATED;
-typedef CL_API_ENTRY void *(CL_API_CALL *cl_api_clGetExtensionFunctionAddress)(
- const char *function_name)CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+typedef void *(CL_API_CALL *cl_api_clGetExtensionFunctionAddress)(
+ const char *function_name)CL_API_SUFFIX__VERSION_1_1_DEPRECATED;
// GL and other APIs
-typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLBuffer)(
+typedef cl_mem(CL_API_CALL *cl_api_clCreateFromGLBuffer)(
cl_context context, cl_mem_flags flags, cl_GLuint bufobj,
int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLTexture)(
+typedef cl_mem(CL_API_CALL *cl_api_clCreateFromGLTexture)(
cl_context context, cl_mem_flags flags, cl_GLenum target, cl_GLint miplevel,
cl_GLuint texture, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;
-typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLTexture2D)(
+typedef cl_mem(CL_API_CALL *cl_api_clCreateFromGLTexture2D)(
cl_context context, cl_mem_flags flags, cl_GLenum target, cl_GLint miplevel,
cl_GLuint texture, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLTexture3D)(
+typedef cl_mem(CL_API_CALL *cl_api_clCreateFromGLTexture3D)(
cl_context context, cl_mem_flags flags, cl_GLenum target, cl_GLint miplevel,
cl_GLuint texture, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLRenderbuffer)(
+typedef cl_mem(CL_API_CALL *cl_api_clCreateFromGLRenderbuffer)(
cl_context context, cl_mem_flags flags, cl_GLuint renderbuffer,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetGLObjectInfo)(
+typedef cl_int(CL_API_CALL *cl_api_clGetGLObjectInfo)(
cl_mem memobj, cl_gl_object_type *gl_object_type,
cl_GLuint *gl_object_name) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetGLTextureInfo)(
+typedef cl_int(CL_API_CALL *cl_api_clGetGLTextureInfo)(
cl_mem memobj, cl_gl_texture_info param_name, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueAcquireGLObjects)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueAcquireGLObjects)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReleaseGLObjects)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueReleaseGLObjects)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
/* cl_khr_gl_sharing */
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetGLContextInfoKHR)(
+typedef cl_int(CL_API_CALL *cl_api_clGetGLContextInfoKHR)(
const cl_context_properties *properties, cl_gl_context_info param_name,
size_t param_value_size, void *param_value, size_t *param_value_size_ret);
/* cl_khr_gl_event */
-typedef CL_API_ENTRY cl_event(CL_API_CALL *cl_api_clCreateEventFromGLsyncKHR)(
+typedef cl_event(CL_API_CALL *cl_api_clCreateEventFromGLsyncKHR)(
cl_context context, cl_GLsync sync, cl_int *errcode_ret);
#if defined(_WIN32)
/* cl_khr_d3d10_sharing */
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceIDsFromD3D10KHR)(
+typedef cl_int(CL_API_CALL *cl_api_clGetDeviceIDsFromD3D10KHR)(
cl_platform_id platform, cl_d3d10_device_source_khr d3d_device_source,
void *d3d_object, cl_d3d10_device_set_khr d3d_device_set,
cl_uint num_entries, cl_device_id *devices,
cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D10BufferKHR)(
+typedef cl_mem(CL_API_CALL *cl_api_clCreateFromD3D10BufferKHR)(
cl_context context, cl_mem_flags flags, ID3D10Buffer *resource,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D10Texture2DKHR)(
+typedef cl_mem(CL_API_CALL *cl_api_clCreateFromD3D10Texture2DKHR)(
cl_context context, cl_mem_flags flags, ID3D10Texture2D *resource,
UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D10Texture3DKHR)(
+typedef cl_mem(CL_API_CALL *cl_api_clCreateFromD3D10Texture3DKHR)(
cl_context context, cl_mem_flags flags, ID3D10Texture3D *resource,
UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY
+typedef
cl_int(CL_API_CALL *cl_api_clEnqueueAcquireD3D10ObjectsKHR)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY
+typedef
cl_int(CL_API_CALL *cl_api_clEnqueueReleaseD3D10ObjectsKHR)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
@@ -848,32 +848,32 @@ extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseD3D10ObjectsKHR(
const cl_event *event_wait_list, cl_event *event);
/* cl_khr_d3d11_sharing */
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceIDsFromD3D11KHR)(
+typedef cl_int(CL_API_CALL *cl_api_clGetDeviceIDsFromD3D11KHR)(
cl_platform_id platform, cl_d3d11_device_source_khr d3d_device_source,
void *d3d_object, cl_d3d11_device_set_khr d3d_device_set,
cl_uint num_entries, cl_device_id *devices,
cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_2;
-typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D11BufferKHR)(
+typedef cl_mem(CL_API_CALL *cl_api_clCreateFromD3D11BufferKHR)(
cl_context context, cl_mem_flags flags, ID3D11Buffer *resource,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;
-typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D11Texture2DKHR)(
+typedef cl_mem(CL_API_CALL *cl_api_clCreateFromD3D11Texture2DKHR)(
cl_context context, cl_mem_flags flags, ID3D11Texture2D *resource,
UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;
-typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D11Texture3DKHR)(
+typedef cl_mem(CL_API_CALL *cl_api_clCreateFromD3D11Texture3DKHR)(
cl_context context, cl_mem_flags flags, ID3D11Texture3D *resource,
UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;
-typedef CL_API_ENTRY
+typedef
cl_int(CL_API_CALL *cl_api_clEnqueueAcquireD3D11ObjectsKHR)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_2;
-typedef CL_API_ENTRY
+typedef
cl_int(CL_API_CALL *cl_api_clEnqueueReleaseD3D11ObjectsKHR)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
@@ -881,26 +881,26 @@ cl_int(CL_API_CALL *cl_api_clEnqueueReleaseD3D11ObjectsKHR)(
cl_event *event) CL_API_SUFFIX__VERSION_1_2;
/* cl_khr_dx9_media_sharing */
-typedef CL_API_ENTRY
+typedef
cl_int(CL_API_CALL *cl_api_clGetDeviceIDsFromDX9MediaAdapterKHR)(
cl_platform_id platform, cl_uint num_media_adapters,
cl_dx9_media_adapter_type_khr *media_adapters_type, void *media_adapters,
cl_dx9_media_adapter_set_khr media_adapter_set, cl_uint num_entries,
cl_device_id *devices, cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_2;
-typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromDX9MediaSurfaceKHR)(
+typedef cl_mem(CL_API_CALL *cl_api_clCreateFromDX9MediaSurfaceKHR)(
cl_context context, cl_mem_flags flags,
cl_dx9_media_adapter_type_khr adapter_type, void *surface_info,
cl_uint plane, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;
-typedef CL_API_ENTRY
+typedef
cl_int(CL_API_CALL *cl_api_clEnqueueAcquireDX9MediaSurfacesKHR)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_2;
-typedef CL_API_ENTRY
+typedef
cl_int(CL_API_CALL *cl_api_clEnqueueReleaseDX9MediaSurfacesKHR)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
@@ -987,29 +987,29 @@ typedef void *cl_api_clGetDeviceIDsFromDX9MediaAdapterKHR;
#ifdef CL_VERSION_1_1
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetEventCallback)(
+typedef cl_int(CL_API_CALL *cl_api_clSetEventCallback)(
cl_event /* event */, cl_int /* command_exec_callback_type */,
void(CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *),
void * /* user_data */) CL_API_SUFFIX__VERSION_1_1;
-typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateSubBuffer)(
+typedef cl_mem(CL_API_CALL *cl_api_clCreateSubBuffer)(
cl_mem /* buffer */, cl_mem_flags /* flags */,
cl_buffer_create_type /* buffer_create_type */,
const void * /* buffer_create_info */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
-typedef CL_API_ENTRY
+typedef
cl_int(CL_API_CALL *cl_api_clSetMemObjectDestructorCallback)(
cl_mem /* memobj */,
void(CL_CALLBACK * /*pfn_notify*/)(cl_mem /* memobj */,
void * /*user_data*/),
void * /*user_data */) CL_API_SUFFIX__VERSION_1_1;
-typedef CL_API_ENTRY cl_event(CL_API_CALL *cl_api_clCreateUserEvent)(
+typedef cl_event(CL_API_CALL *cl_api_clCreateUserEvent)(
cl_context /* context */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetUserEventStatus)(
+typedef cl_int(CL_API_CALL *cl_api_clSetUserEventStatus)(
cl_event /* event */,
cl_int /* execution_status */) CL_API_SUFFIX__VERSION_1_1;
@@ -1023,68 +1023,68 @@ typedef void *cl_api_clSetUserEventStatus;
#endif
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCreateSubDevicesEXT)(
+typedef cl_int(CL_API_CALL *cl_api_clCreateSubDevicesEXT)(
cl_device_id in_device,
const cl_device_partition_property_ext *partition_properties,
cl_uint num_entries, cl_device_id *out_devices, cl_uint *num_devices);
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainDeviceEXT)(
+typedef cl_int(CL_API_CALL *cl_api_clRetainDeviceEXT)(
cl_device_id device) CL_API_SUFFIX__VERSION_1_0;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseDeviceEXT)(
+typedef cl_int(CL_API_CALL *cl_api_clReleaseDeviceEXT)(
cl_device_id device) CL_API_SUFFIX__VERSION_1_0;
/* cl_khr_egl_image */
-typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromEGLImageKHR)(
+typedef cl_mem(CL_API_CALL *cl_api_clCreateFromEGLImageKHR)(
cl_context context, CLeglDisplayKHR display, CLeglImageKHR image,
cl_mem_flags flags, const cl_egl_image_properties_khr *properties,
cl_int *errcode_ret);
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueAcquireEGLObjectsKHR)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueAcquireEGLObjectsKHR)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *event);
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReleaseEGLObjectsKHR)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueReleaseEGLObjectsKHR)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *event);
/* cl_khr_egl_event */
-typedef CL_API_ENTRY cl_event(CL_API_CALL *cl_api_clCreateEventFromEGLSyncKHR)(
+typedef cl_event(CL_API_CALL *cl_api_clCreateEventFromEGLSyncKHR)(
cl_context context, CLeglSyncKHR sync, CLeglDisplayKHR display,
cl_int *errcode_ret);
#ifdef CL_VERSION_2_1
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetDefaultDeviceCommandQueue)(
+typedef cl_int(CL_API_CALL *cl_api_clSetDefaultDeviceCommandQueue)(
cl_context context, cl_device_id device,
cl_command_queue command_queue) CL_API_SUFFIX__VERSION_2_1;
-typedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clCreateProgramWithIL)(
+typedef cl_program(CL_API_CALL *cl_api_clCreateProgramWithIL)(
cl_context context, const void *il, size_t length,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_2_1;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelSubGroupInfo)(
+typedef cl_int(CL_API_CALL *cl_api_clGetKernelSubGroupInfo)(
cl_kernel kernel, cl_device_id device, cl_kernel_sub_group_info param_name,
size_t input_value_size, const void *input_value, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_2_1;
-typedef CL_API_ENTRY cl_kernel(CL_API_CALL *cl_api_clCloneKernel)(
+typedef cl_kernel(CL_API_CALL *cl_api_clCloneKernel)(
cl_kernel source_kernel, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_2_1;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMMigrateMem)(
+typedef cl_int(CL_API_CALL *cl_api_clEnqueueSVMMigrateMem)(
cl_command_queue command_queue, cl_uint num_svm_pointers,
const void **svm_pointers, const size_t *sizes,
cl_mem_migration_flags flags, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_2_1;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceAndHostTimer)(
+typedef cl_int(CL_API_CALL *cl_api_clGetDeviceAndHostTimer)(
cl_device_id device, cl_ulong *device_timestamp,
cl_ulong *host_timestamp) CL_API_SUFFIX__VERSION_2_1;
-typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetHostTimer)(
+typedef cl_int(CL_API_CALL *cl_api_clGetHostTimer)(
cl_device_id device, cl_ulong *host_timestamp) CL_API_SUFFIX__VERSION_2_1;
#else
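The cl_icd.h hunks above only drop the CL_API_ENTRY decoration from the dispatch-table typedefs and move the deprecated suffixes to the CL_API_* spelling; the function-pointer signatures themselves are unchanged. As a rough sketch of how such a typedef is used, an ICD loader keeps one entry per API and forwards calls through it (the struct and function below are hypothetical illustrations, not part of this header):

    #include <CL/cl_icd.h>

    /* Hypothetical two-entry dispatch table; the real khr_icd table layout
     * is much larger and is defined by cl_icd.h itself. */
    struct icd_dispatch_sketch {
        cl_api_clFlush  pfn_clFlush;
        cl_api_clFinish pfn_clFinish;
    };

    /* Forward a clFinish call through the vendor's entry point. */
    static cl_int icd_finish(cl_command_queue queue,
                             const struct icd_dispatch_sketch *table)
    {
        return table->pfn_clFinish(queue);
    }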
diff --git a/include/CL/cl_platform.h b/include/CL/cl_platform.h
index 2d69cc4e5..8ae655d14 100644
--- a/include/CL/cl_platform.h
+++ b/include/CL/cl_platform.h
@@ -24,13 +24,25 @@ extern "C" {
#endif
#if defined(_WIN32)
- #define CL_API_ENTRY
- #define CL_API_CALL __stdcall
- #define CL_CALLBACK __stdcall
+ #if !defined(CL_API_ENTRY)
+ #define CL_API_ENTRY
+ #endif
+ #if !defined(CL_API_CALL)
+ #define CL_API_CALL __stdcall
+ #endif
+ #if !defined(CL_CALLBACK)
+ #define CL_CALLBACK __stdcall
+ #endif
#else
- #define CL_API_ENTRY
- #define CL_API_CALL
- #define CL_CALLBACK
+ #if !defined(CL_API_ENTRY)
+ #define CL_API_ENTRY
+ #endif
+ #if !defined(CL_API_CALL)
+ #define CL_API_CALL
+ #endif
+ #if !defined(CL_CALLBACK)
+ #define CL_CALLBACK
+ #endif
#endif
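With the hunk above, cl_platform.h no longer overwrites CL_API_ENTRY, CL_API_CALL and CL_CALLBACK unconditionally: a definition supplied before the header is included is now respected. A minimal sketch of what that enables, assuming a Windows build that wants to export the entry points from its own DLL (illustrative only):

    /* Pre-define the decoration before any OpenCL header is pulled in;
     * the new !defined() guards keep this definition intact. */
    #if defined(_WIN32)
    #define CL_API_ENTRY __declspec(dllexport)
    #endif
    #include <CL/cl.h>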
/*
@@ -41,86 +53,94 @@ extern "C" {
* deprecation but is deprecated in versions later than 1.1.
*/
-#define CL_EXTENSION_WEAK_LINK
-#define CL_API_SUFFIX__VERSION_1_0
-#define CL_EXT_SUFFIX__VERSION_1_0
-#define CL_API_SUFFIX__VERSION_1_1
-#define CL_EXT_SUFFIX__VERSION_1_1
-#define CL_API_SUFFIX__VERSION_1_2
-#define CL_EXT_SUFFIX__VERSION_1_2
-#define CL_API_SUFFIX__VERSION_2_0
-#define CL_EXT_SUFFIX__VERSION_2_0
-#define CL_API_SUFFIX__VERSION_2_1
-#define CL_EXT_SUFFIX__VERSION_2_1
-#define CL_API_SUFFIX__VERSION_2_2
-#define CL_EXT_SUFFIX__VERSION_2_2
-#define CL_API_SUFFIX__VERSION_3_0
-#define CL_EXT_SUFFIX__VERSION_3_0
-#define CL_API_SUFFIX__EXPERIMENTAL
-#define CL_EXT_SUFFIX__EXPERIMENTAL
+#ifndef CL_API_SUFFIX_USER
+#define CL_API_SUFFIX_USER
+#endif
+
+#ifndef CL_API_PREFIX_USER
+#define CL_API_PREFIX_USER
+#endif
+
+#define CL_API_SUFFIX_COMMON CL_API_SUFFIX_USER
+#define CL_API_PREFIX_COMMON CL_API_PREFIX_USER
+
+#define CL_API_SUFFIX__VERSION_1_0 CL_API_SUFFIX_COMMON
+#define CL_API_SUFFIX__VERSION_1_1 CL_API_SUFFIX_COMMON
+#define CL_API_SUFFIX__VERSION_1_2 CL_API_SUFFIX_COMMON
+#define CL_API_SUFFIX__VERSION_2_0 CL_API_SUFFIX_COMMON
+#define CL_API_SUFFIX__VERSION_2_1 CL_API_SUFFIX_COMMON
+#define CL_API_SUFFIX__VERSION_2_2 CL_API_SUFFIX_COMMON
+#define CL_API_SUFFIX__VERSION_3_0 CL_API_SUFFIX_COMMON
+#define CL_API_SUFFIX__EXPERIMENTAL CL_API_SUFFIX_COMMON
#ifdef __GNUC__
- #define CL_EXT_SUFFIX_DEPRECATED __attribute__((deprecated))
- #define CL_EXT_PREFIX_DEPRECATED
+ #define CL_API_SUFFIX_DEPRECATED __attribute__((deprecated))
+ #define CL_API_PREFIX_DEPRECATED
#elif defined(_WIN32)
- #define CL_EXT_SUFFIX_DEPRECATED
- #define CL_EXT_PREFIX_DEPRECATED __declspec(deprecated)
+ #define CL_API_SUFFIX_DEPRECATED
+ #define CL_API_PREFIX_DEPRECATED __declspec(deprecated)
#else
- #define CL_EXT_SUFFIX_DEPRECATED
- #define CL_EXT_PREFIX_DEPRECATED
+ #define CL_API_SUFFIX_DEPRECATED
+ #define CL_API_PREFIX_DEPRECATED
#endif
#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
- #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
- #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
+ #define CL_API_SUFFIX__VERSION_1_0_DEPRECATED CL_API_SUFFIX_COMMON
+ #define CL_API_PREFIX__VERSION_1_0_DEPRECATED CL_API_PREFIX_COMMON
#else
- #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
- #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED CL_EXT_PREFIX_DEPRECATED
+ #define CL_API_SUFFIX__VERSION_1_0_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED
+ #define CL_API_PREFIX__VERSION_1_0_DEPRECATED CL_API_PREFIX_COMMON CL_API_PREFIX_DEPRECATED
#endif
#ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS
- #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
- #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+ #define CL_API_SUFFIX__VERSION_1_1_DEPRECATED CL_API_SUFFIX_COMMON
+ #define CL_API_PREFIX__VERSION_1_1_DEPRECATED CL_API_PREFIX_COMMON
#else
- #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
- #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED CL_EXT_PREFIX_DEPRECATED
+ #define CL_API_SUFFIX__VERSION_1_1_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED
+ #define CL_API_PREFIX__VERSION_1_1_DEPRECATED CL_API_PREFIX_COMMON CL_API_PREFIX_DEPRECATED
#endif
#ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS
- #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
- #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED
+ #define CL_API_SUFFIX__VERSION_1_2_DEPRECATED CL_API_SUFFIX_COMMON
+ #define CL_API_PREFIX__VERSION_1_2_DEPRECATED CL_API_PREFIX_COMMON
#else
- #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
- #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED CL_EXT_PREFIX_DEPRECATED
+ #define CL_API_SUFFIX__VERSION_1_2_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED
+ #define CL_API_PREFIX__VERSION_1_2_DEPRECATED CL_API_PREFIX_COMMON CL_API_PREFIX_DEPRECATED
#endif
#ifdef CL_USE_DEPRECATED_OPENCL_2_0_APIS
- #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED
- #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED
+ #define CL_API_SUFFIX__VERSION_2_0_DEPRECATED CL_API_SUFFIX_COMMON
+ #define CL_API_PREFIX__VERSION_2_0_DEPRECATED CL_API_PREFIX_COMMON
#else
- #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
- #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED CL_EXT_PREFIX_DEPRECATED
+ #define CL_API_SUFFIX__VERSION_2_0_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED
+ #define CL_API_PREFIX__VERSION_2_0_DEPRECATED CL_API_PREFIX_COMMON CL_API_PREFIX_DEPRECATED
#endif
#ifdef CL_USE_DEPRECATED_OPENCL_2_1_APIS
- #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED
- #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED
+ #define CL_API_SUFFIX__VERSION_2_1_DEPRECATED CL_API_SUFFIX_COMMON
+ #define CL_API_PREFIX__VERSION_2_1_DEPRECATED CL_API_PREFIX_COMMON
#else
- #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
- #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED CL_EXT_PREFIX_DEPRECATED
+ #define CL_API_SUFFIX__VERSION_2_1_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED
+ #define CL_API_PREFIX__VERSION_2_1_DEPRECATED CL_API_PREFIX_COMMON CL_API_PREFIX_DEPRECATED
#endif
#ifdef CL_USE_DEPRECATED_OPENCL_2_2_APIS
- #define CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED
- #define CL_EXT_PREFIX__VERSION_2_2_DEPRECATED
+ #define CL_API_SUFFIX__VERSION_2_2_DEPRECATED CL_API_SUFFIX_COMMON
+ #define CL_API_PREFIX__VERSION_2_2_DEPRECATED CL_API_PREFIX_COMMON
#else
- #define CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
- #define CL_EXT_PREFIX__VERSION_2_2_DEPRECATED CL_EXT_PREFIX_DEPRECATED
+ #define CL_API_SUFFIX__VERSION_2_2_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED
+ #define CL_API_PREFIX__VERSION_2_2_DEPRECATED CL_API_PREFIX_COMMON CL_API_PREFIX_DEPRECATED
#endif
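The block above renames the CL_EXT_SUFFIX__* / CL_EXT_PREFIX__* decoration macros to CL_API_SUFFIX__* / CL_API_PREFIX__* and routes them all through the user-overridable CL_API_SUFFIX_USER / CL_API_PREFIX_USER hooks. Behaviour is otherwise unchanged: unless the matching CL_USE_DEPRECATED_OPENCL_x_y_APIS macro is defined, a deprecated declaration still picks up the compiler's deprecation attribute, so a declaration ending in CL_API_SUFFIX__VERSION_1_1_DEPRECATED expands on GCC/Clang to __attribute__((deprecated)). Code that still calls the 1.1 APIs can opt out as before (sketch):

    /* Keep building warning-free against legacy 1.1 calls. */
    #define CL_USE_DEPRECATED_OPENCL_1_1_APIS
    #include <CL/cl.h>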
#if (defined (_WIN32) && defined(_MSC_VER))
+/* intptr_t is used in cl.h and provided by stddef.h in Visual C++, but not in clang */
+/* stdint.h was missing before Visual Studio 2010, include it for later versions and for clang */
+#if defined(__clang__) || _MSC_VER >= 1600
+ #include <stdint.h>
+#endif
+
/* scalar types */
typedef signed __int8 cl_char;
typedef unsigned __int8 cl_uchar;
diff --git a/include/CL/cl_va_api_media_sharing_intel.h b/include/CL/cl_va_api_media_sharing_intel.h
index 0e7cd4d6f..7ba2ec83d 100644
--- a/include/CL/cl_va_api_media_sharing_intel.h
+++ b/include/CL/cl_va_api_media_sharing_intel.h
@@ -13,30 +13,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
-/*****************************************************************************\
-
-Copyright (c) 2013-2019 Intel Corporation All Rights Reserved.
-
-THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
-MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-File Name: cl_va_api_media_sharing_intel.h
-
-Abstract:
-
-Notes:
-
-\*****************************************************************************/
-
#ifndef __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H
#define __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H
@@ -92,16 +68,16 @@ clGetDeviceIDsFromVA_APIMediaAdapterINTEL(
cl_va_api_device_set_intel media_adapter_set,
cl_uint num_entries,
cl_device_id* devices,
- cl_uint* num_devices) CL_EXT_SUFFIX__VERSION_1_2;
+ cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_2;
-typedef CL_API_ENTRY cl_int (CL_API_CALL * clGetDeviceIDsFromVA_APIMediaAdapterINTEL_fn)(
+typedef cl_int (CL_API_CALL * clGetDeviceIDsFromVA_APIMediaAdapterINTEL_fn)(
cl_platform_id platform,
cl_va_api_device_source_intel media_adapter_type,
void* media_adapter,
cl_va_api_device_set_intel media_adapter_set,
cl_uint num_entries,
cl_device_id* devices,
- cl_uint* num_devices) CL_EXT_SUFFIX__VERSION_1_2;
+ cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromVA_APIMediaSurfaceINTEL(
@@ -109,14 +85,14 @@ clCreateFromVA_APIMediaSurfaceINTEL(
cl_mem_flags flags,
VASurfaceID* surface,
cl_uint plane,
- cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
+ cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2;
-typedef CL_API_ENTRY cl_mem (CL_API_CALL * clCreateFromVA_APIMediaSurfaceINTEL_fn)(
+typedef cl_mem (CL_API_CALL * clCreateFromVA_APIMediaSurfaceINTEL_fn)(
cl_context context,
cl_mem_flags flags,
VASurfaceID* surface,
cl_uint plane,
- cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
+ cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireVA_APIMediaSurfacesINTEL(
@@ -125,15 +101,15 @@ clEnqueueAcquireVA_APIMediaSurfacesINTEL(
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
- cl_event* event) CL_EXT_SUFFIX__VERSION_1_2;
+ cl_event* event) CL_API_SUFFIX__VERSION_1_2;
-typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireVA_APIMediaSurfacesINTEL_fn)(
+typedef cl_int (CL_API_CALL *clEnqueueAcquireVA_APIMediaSurfacesINTEL_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
- cl_event* event) CL_EXT_SUFFIX__VERSION_1_2;
+ cl_event* event) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseVA_APIMediaSurfacesINTEL(
@@ -142,15 +118,15 @@ clEnqueueReleaseVA_APIMediaSurfacesINTEL(
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
- cl_event* event) CL_EXT_SUFFIX__VERSION_1_2;
+ cl_event* event) CL_API_SUFFIX__VERSION_1_2;
-typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseVA_APIMediaSurfacesINTEL_fn)(
+typedef cl_int (CL_API_CALL *clEnqueueReleaseVA_APIMediaSurfacesINTEL_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
- cl_event* event) CL_EXT_SUFFIX__VERSION_1_2;
+ cl_event* event) CL_API_SUFFIX__VERSION_1_2;
#ifdef __cplusplus
}
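In cl_va_api_media_sharing_intel.h the same two changes apply: CL_API_ENTRY disappears from the *_fn pointer typedefs and CL_EXT_SUFFIX__VERSION_1_2 becomes CL_API_SUFFIX__VERSION_1_2. These typedefs are the ones an application stores after fetching the extension entry point at run time; a hedged sketch, assuming 'platform' is a valid cl_platform_id that exposes the Intel VA-API sharing extension:

    #include <CL/cl_va_api_media_sharing_intel.h>

    /* Fetch the extension entry point and return it through the matching
     * _fn typedef declared in this header (sketch only). */
    static clGetDeviceIDsFromVA_APIMediaAdapterINTEL_fn
    load_va_sharing_entry(cl_platform_id platform)
    {
        return (clGetDeviceIDsFromVA_APIMediaAdapterINTEL_fn)
            clGetExtensionFunctionAddressForPlatform(
                platform, "clGetDeviceIDsFromVA_APIMediaAdapterINTEL");
    }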
diff --git a/include/CL/opencl.h b/include/CL/opencl.h
index 1c4e10c88..ef8dd1e03 100644
--- a/include/CL/opencl.h
+++ b/include/CL/opencl.h
@@ -1,5 +1,5 @@
/*******************************************************************************
- * Copyright (c) 2008-2020 The Khronos Group Inc.
+ * Copyright (c) 2008-2021 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -23,7 +23,6 @@ extern "C" {
#include <CL/cl.h>
#include <CL/cl_gl.h>
-#include <CL/cl_gl_ext.h>
#include <CL/cl_ext.h>
#ifdef __cplusplus
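opencl.h no longer includes cl_gl_ext.h. Code that relied on the umbrella header to provide those GL extension declarations may need to include them explicitly, for example:

    #include <CL/opencl.h>
    #include <CL/cl_gl_ext.h>   /* no longer pulled in by opencl.h */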
diff --git a/include/CL/cl2.hpp b/include/CL/opencl.hpp
index 0d6e805a0..75b9c92c2 100644
--- a/include/CL/cl2.hpp
+++ b/include/CL/opencl.hpp
@@ -1,35 +1,23 @@
-/*******************************************************************************
- * Copyright (c) 2008-2016 The Khronos Group Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and/or associated documentation files (the
- * "Materials"), to deal in the Materials without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Materials, and to
- * permit persons to whom the Materials are furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Materials.
- *
- * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
- * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
- * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
- * https://www.khronos.org/registry/
- *
- * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
- ******************************************************************************/
+//
+// Copyright (c) 2008-2020 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
/*! \file
*
- * \brief C++ bindings for OpenCL 1.0 (rev 48), OpenCL 1.1 (rev 33),
- * OpenCL 1.2 (rev 15) and OpenCL 2.0 (rev 29)
+ * \brief C++ bindings for OpenCL 1.0, OpenCL 1.1, OpenCL 1.2,
+ * OpenCL 2.0, OpenCL 2.1, OpenCL 2.2, and OpenCL 3.0.
* \author Lee Howes and Bruce Merry
*
* Derived from the OpenCL 1.x C++ bindings written by
@@ -40,9 +28,8 @@
* Bruce Merry, February 2013.
* Tom Deakin and Simon McIntosh-Smith, July 2013
* James Price, 2015-
- *
- * \version 2.0.10
- * \date 2016-07-20
+ * \version 2.2.0
+ * \date 2019-09-18
*
* Optional extension support
*
@@ -52,6 +39,8 @@
* #define CL_HPP_USE_DX_INTEROP
* cl_khr_sub_groups
* #define CL_HPP_USE_CL_SUB_GROUPS_KHR
+ * cl_khr_image2d_from_buffer
+ * #define CL_HPP_USE_CL_IMAGE2D_FROM_BUFFER_KHR
*
* Doxygen documentation for this header is available here:
*
@@ -71,10 +60,10 @@
* For many large applications C++ is the language of choice and so it seems
* reasonable to define C++ bindings for OpenCL.
*
- * The interface is contained with a single C++ header file \em cl2.hpp and all
+ * The interface is contained with a single C++ header file \em opencl.hpp and all
* definitions are contained within the namespace \em cl. There is no additional
* requirement to include \em cl.h and to use either the C++ or original C
- * bindings; it is enough to simply include \em cl2.hpp.
+ * bindings; it is enough to simply include \em opencl.hpp.
*
* The bindings themselves are lightweight and correspond closely to the
* underlying C API. Using the C++ bindings introduces no additional execution
@@ -83,7 +72,7 @@
* There are numerous compatibility, portability and memory management
* fixes in the new header as well as additional OpenCL 2.0 features.
* As a result the header is not directly backward compatible and for this
- * reason we release it as cl2.hpp rather than a new version of cl.hpp.
+ * reason we release it as opencl.hpp rather than a new version of cl.hpp.
*
*
* \section compatibility Compatibility
@@ -155,26 +144,26 @@
* - CL_HPP_NO_STD_STRING
*
* Do not use the standard library string class. cl::string is not
- * defined and may be defined by the user before cl2.hpp is
+ * defined and may be defined by the user before opencl.hpp is
* included.
*
* - CL_HPP_NO_STD_VECTOR
*
* Do not use the standard library vector class. cl::vector is not
- * defined and may be defined by the user before cl2.hpp is
+ * defined and may be defined by the user before opencl.hpp is
* included.
*
* - CL_HPP_NO_STD_ARRAY
*
* Do not use the standard library array class. cl::array is not
- * defined and may be defined by the user before cl2.hpp is
+ * defined and may be defined by the user before opencl.hpp is
* included.
*
* - CL_HPP_NO_STD_UNIQUE_PTR
*
* Do not use the standard library unique_ptr class. cl::pointer and
* the cl::allocate_pointer functions are not defined and may be
- * defined by the user before cl2.hpp is included.
+ * defined by the user before opencl.hpp is included.
*
* - CL_HPP_ENABLE_DEVICE_FISSION
*
@@ -205,6 +194,14 @@
* applies to use of cl::Program construction and other program
* build variants.
*
+ * - CL_HPP_USE_CL_SUB_GROUPS_KHR
+ *
+ * Enable the cl_khr_subgroups extension.
+ *
+ * - CL_HPP_USE_IL_KHR
+ *
+ * Enable the cl_khr_il_program extension.
+ *
*
* \section example Example
*
@@ -217,7 +214,7 @@
#define CL_HPP_ENABLE_EXCEPTIONS
#define CL_HPP_TARGET_OPENCL_VERSION 200
- #include <CL/cl2.hpp>
+ #include <CL/opencl.hpp>
#include <iostream>
#include <vector>
#include <memory>
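Beyond the include path, the rename from cl2.hpp to opencl.hpp is source-compatible for code that already uses the CL_HPP_* configuration macros. A minimal usage sketch against the renamed header (illustrative only; any OpenCL platform will do):

    #define CL_HPP_TARGET_OPENCL_VERSION 300
    #include <CL/opencl.hpp>
    #include <vector>

    // Count the available OpenCL platforms through the C++ wrapper.
    int num_platforms()
    {
        std::vector<cl::Platform> platforms;
        cl::Platform::get(&platforms);
        return static_cast<int>(platforms.size());
    }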
@@ -396,61 +393,84 @@
* both and hence work with either version of the bindings.
*/
#if !defined(CL_HPP_USE_DX_INTEROP) && defined(USE_DX_INTEROP)
-# pragma message("cl2.hpp: USE_DX_INTEROP is deprecated. Define CL_HPP_USE_DX_INTEROP instead")
+# pragma message("opencl.hpp: USE_DX_INTEROP is deprecated. Define CL_HPP_USE_DX_INTEROP instead")
# define CL_HPP_USE_DX_INTEROP
#endif
#if !defined(CL_HPP_USE_CL_DEVICE_FISSION) && defined(USE_CL_DEVICE_FISSION)
-# pragma message("cl2.hpp: USE_CL_DEVICE_FISSION is deprecated. Define CL_HPP_USE_CL_DEVICE_FISSION instead")
+# pragma message("opencl.hpp: USE_CL_DEVICE_FISSION is deprecated. Define CL_HPP_USE_CL_DEVICE_FISSION instead")
# define CL_HPP_USE_CL_DEVICE_FISSION
#endif
#if !defined(CL_HPP_ENABLE_EXCEPTIONS) && defined(__CL_ENABLE_EXCEPTIONS)
-# pragma message("cl2.hpp: __CL_ENABLE_EXCEPTIONS is deprecated. Define CL_HPP_ENABLE_EXCEPTIONS instead")
+# pragma message("opencl.hpp: __CL_ENABLE_EXCEPTIONS is deprecated. Define CL_HPP_ENABLE_EXCEPTIONS instead")
# define CL_HPP_ENABLE_EXCEPTIONS
#endif
#if !defined(CL_HPP_NO_STD_VECTOR) && defined(__NO_STD_VECTOR)
-# pragma message("cl2.hpp: __NO_STD_VECTOR is deprecated. Define CL_HPP_NO_STD_VECTOR instead")
+# pragma message("opencl.hpp: __NO_STD_VECTOR is deprecated. Define CL_HPP_NO_STD_VECTOR instead")
# define CL_HPP_NO_STD_VECTOR
#endif
#if !defined(CL_HPP_NO_STD_STRING) && defined(__NO_STD_STRING)
-# pragma message("cl2.hpp: __NO_STD_STRING is deprecated. Define CL_HPP_NO_STD_STRING instead")
+# pragma message("opencl.hpp: __NO_STD_STRING is deprecated. Define CL_HPP_NO_STD_STRING instead")
# define CL_HPP_NO_STD_STRING
#endif
#if defined(VECTOR_CLASS)
-# pragma message("cl2.hpp: VECTOR_CLASS is deprecated. Alias cl::vector instead")
+# pragma message("opencl.hpp: VECTOR_CLASS is deprecated. Alias cl::vector instead")
#endif
#if defined(STRING_CLASS)
-# pragma message("cl2.hpp: STRING_CLASS is deprecated. Alias cl::string instead.")
+# pragma message("opencl.hpp: STRING_CLASS is deprecated. Alias cl::string instead.")
#endif
#if !defined(CL_HPP_USER_OVERRIDE_ERROR_STRINGS) && defined(__CL_USER_OVERRIDE_ERROR_STRINGS)
-# pragma message("cl2.hpp: __CL_USER_OVERRIDE_ERROR_STRINGS is deprecated. Define CL_HPP_USER_OVERRIDE_ERROR_STRINGS instead")
+# pragma message("opencl.hpp: __CL_USER_OVERRIDE_ERROR_STRINGS is deprecated. Define CL_HPP_USER_OVERRIDE_ERROR_STRINGS instead")
# define CL_HPP_USER_OVERRIDE_ERROR_STRINGS
#endif
/* Warn about features that are no longer supported
*/
#if defined(__USE_DEV_VECTOR)
-# pragma message("cl2.hpp: __USE_DEV_VECTOR is no longer supported. Expect compilation errors")
+# pragma message("opencl.hpp: __USE_DEV_VECTOR is no longer supported. Expect compilation errors")
#endif
#if defined(__USE_DEV_STRING)
-# pragma message("cl2.hpp: __USE_DEV_STRING is no longer supported. Expect compilation errors")
+# pragma message("opencl.hpp: __USE_DEV_STRING is no longer supported. Expect compilation errors")
#endif
/* Detect which version to target */
#if !defined(CL_HPP_TARGET_OPENCL_VERSION)
-# pragma message("cl2.hpp: CL_HPP_TARGET_OPENCL_VERSION is not defined. It will default to 200 (OpenCL 2.0)")
-# define CL_HPP_TARGET_OPENCL_VERSION 200
+# pragma message("opencl.hpp: CL_HPP_TARGET_OPENCL_VERSION is not defined. It will default to 300 (OpenCL 3.0)")
+# define CL_HPP_TARGET_OPENCL_VERSION 300
#endif
-#if CL_HPP_TARGET_OPENCL_VERSION != 100 && CL_HPP_TARGET_OPENCL_VERSION != 110 && CL_HPP_TARGET_OPENCL_VERSION != 120 && CL_HPP_TARGET_OPENCL_VERSION != 200
-# pragma message("cl2.hpp: CL_HPP_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120 or 200). It will be set to 200")
+#if CL_HPP_TARGET_OPENCL_VERSION != 100 && \
+ CL_HPP_TARGET_OPENCL_VERSION != 110 && \
+ CL_HPP_TARGET_OPENCL_VERSION != 120 && \
+ CL_HPP_TARGET_OPENCL_VERSION != 200 && \
+ CL_HPP_TARGET_OPENCL_VERSION != 210 && \
+ CL_HPP_TARGET_OPENCL_VERSION != 220 && \
+ CL_HPP_TARGET_OPENCL_VERSION != 300
+# pragma message("opencl.hpp: CL_HPP_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220 or 300). It will be set to 300 (OpenCL 3.0).")
# undef CL_HPP_TARGET_OPENCL_VERSION
-# define CL_HPP_TARGET_OPENCL_VERSION 200
+# define CL_HPP_TARGET_OPENCL_VERSION 300
+#endif
+
+/* Forward target OpenCL version to C headers if necessary */
+#if defined(CL_TARGET_OPENCL_VERSION)
+/* Warn if prior definition of CL_TARGET_OPENCL_VERSION is lower than
+ * requested C++ bindings version */
+#if CL_TARGET_OPENCL_VERSION < CL_HPP_TARGET_OPENCL_VERSION
+# pragma message("CL_TARGET_OPENCL_VERSION is already defined as is lower than CL_HPP_TARGET_OPENCL_VERSION")
+#endif
+#else
+# define CL_TARGET_OPENCL_VERSION CL_HPP_TARGET_OPENCL_VERSION
#endif
#if !defined(CL_HPP_MINIMUM_OPENCL_VERSION)
# define CL_HPP_MINIMUM_OPENCL_VERSION 200
#endif
-#if CL_HPP_MINIMUM_OPENCL_VERSION != 100 && CL_HPP_MINIMUM_OPENCL_VERSION != 110 && CL_HPP_MINIMUM_OPENCL_VERSION != 120 && CL_HPP_MINIMUM_OPENCL_VERSION != 200
-# pragma message("cl2.hpp: CL_HPP_MINIMUM_OPENCL_VERSION is not a valid value (100, 110, 120 or 200). It will be set to 100")
+#if CL_HPP_MINIMUM_OPENCL_VERSION != 100 && \
+ CL_HPP_MINIMUM_OPENCL_VERSION != 110 && \
+ CL_HPP_MINIMUM_OPENCL_VERSION != 120 && \
+ CL_HPP_MINIMUM_OPENCL_VERSION != 200 && \
+ CL_HPP_MINIMUM_OPENCL_VERSION != 210 && \
+ CL_HPP_MINIMUM_OPENCL_VERSION != 220 && \
+ CL_HPP_MINIMUM_OPENCL_VERSION != 300
+# pragma message("opencl.hpp: CL_HPP_MINIMUM_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220 or 300). It will be set to 100")
# undef CL_HPP_MINIMUM_OPENCL_VERSION
# define CL_HPP_MINIMUM_OPENCL_VERSION 100
#endif
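The version-selection logic above changes in two ways: the default CL_HPP_TARGET_OPENCL_VERSION moves from 200 to 300 (with 210, 220 and 300 now accepted), and the chosen target is forwarded to the C headers by defining CL_TARGET_OPENCL_VERSION when the user has not already done so. Projects that want the previous behaviour simply pin the versions before the include, e.g. (illustrative values):

    // Target OpenCL 2.0 as before; CL_TARGET_OPENCL_VERSION will be
    // forwarded as 200 because it is not defined elsewhere in this TU.
    #define CL_HPP_TARGET_OPENCL_VERSION 200
    #define CL_HPP_MINIMUM_OPENCL_VERSION 120
    #include <CL/opencl.hpp>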
@@ -470,6 +490,12 @@
#if CL_HPP_MINIMUM_OPENCL_VERSION <= 200 && !defined(CL_USE_DEPRECATED_OPENCL_2_0_APIS)
# define CL_USE_DEPRECATED_OPENCL_2_0_APIS
#endif
+#if CL_HPP_MINIMUM_OPENCL_VERSION <= 210 && !defined(CL_USE_DEPRECATED_OPENCL_2_1_APIS)
+# define CL_USE_DEPRECATED_OPENCL_2_1_APIS
+#endif
+#if CL_HPP_MINIMUM_OPENCL_VERSION <= 220 && !defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS)
+# define CL_USE_DEPRECATED_OPENCL_2_2_APIS
+#endif
#ifdef _WIN32
@@ -504,33 +530,37 @@
#include <CL/opencl.h>
#endif // !__APPLE__
-#if (__cplusplus >= 201103L)
+#if (__cplusplus >= 201103L || _MSVC_LANG >= 201103L )
#define CL_HPP_NOEXCEPT_ noexcept
#else
#define CL_HPP_NOEXCEPT_
#endif
-#if defined(_MSC_VER)
+#if __cplusplus >= 201703L
+# define CL_HPP_DEFINE_STATIC_MEMBER_ inline
+#elif defined(_MSC_VER)
# define CL_HPP_DEFINE_STATIC_MEMBER_ __declspec(selectany)
+#elif defined(__MINGW32__)
+# define CL_HPP_DEFINE_STATIC_MEMBER_ __attribute__((selectany))
#else
# define CL_HPP_DEFINE_STATIC_MEMBER_ __attribute__((weak))
#endif // !_MSC_VER
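CL_HPP_DEFINE_STATIC_MEMBER_ is the decoration the bindings place in front of static data members defined in the header itself; it has to guarantee a single definition when the header is included from several translation units. The hunk above prefers a C++17 inline definition when available and adds a MinGW branch, falling back to __declspec(selectany) / __attribute__((weak)) otherwise. The pattern it enables looks like this (hypothetical class, not taken from the header):

    #include <CL/opencl.hpp>

    struct TracingConfig {
        static int verbosity;   // declared in a header-only component
    };
    // A single definition survives across all translation units that
    // include this header, whichever branch of the macro is in effect.
    CL_HPP_DEFINE_STATIC_MEMBER_ int TracingConfig::verbosity = 0;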
// Define deprecated prefixes and suffixes to ensure compilation
// in case they are not pre-defined
-#if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
-#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
-#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
-#if !defined(CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED)
-#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
-#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
-
-#if !defined(CL_EXT_PREFIX__VERSION_1_2_DEPRECATED)
-#define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED
-#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_2_DEPRECATED)
-#if !defined(CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED)
-#define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
-#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_2_DEPRECATED)
+#if !defined(CL_API_PREFIX__VERSION_1_1_DEPRECATED)
+#define CL_API_PREFIX__VERSION_1_1_DEPRECATED
+#endif // #if !defined(CL_API_PREFIX__VERSION_1_1_DEPRECATED)
+#if !defined(CL_API_SUFFIX__VERSION_1_1_DEPRECATED)
+#define CL_API_SUFFIX__VERSION_1_1_DEPRECATED
+#endif // #if !defined(CL_API_PREFIX__VERSION_1_1_DEPRECATED)
+
+#if !defined(CL_API_PREFIX__VERSION_1_2_DEPRECATED)
+#define CL_API_PREFIX__VERSION_1_2_DEPRECATED
+#endif // #if !defined(CL_API_PREFIX__VERSION_1_2_DEPRECATED)
+#if !defined(CL_API_SUFFIX__VERSION_1_2_DEPRECATED)
+#define CL_API_SUFFIX__VERSION_1_2_DEPRECATED
+#endif // #if !defined(CL_API_PREFIX__VERSION_1_2_DEPRECATED)
#if !defined(CL_CALLBACK)
#define CL_CALLBACK
@@ -790,6 +820,9 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
#if CL_HPP_TARGET_OPENCL_VERSION >= 120
#define __GET_KERNEL_ARG_INFO_ERR CL_HPP_ERR_STR_(clGetKernelArgInfo)
#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+#define __GET_KERNEL_SUB_GROUP_INFO_ERR CL_HPP_ERR_STR_(clGetKernelSubGroupInfo)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200
#define __GET_KERNEL_WORK_GROUP_INFO_ERR CL_HPP_ERR_STR_(clGetKernelWorkGroupInfo)
#define __GET_PROGRAM_INFO_ERR CL_HPP_ERR_STR_(clGetProgramInfo)
#define __GET_PROGRAM_BUILD_INFO_ERR CL_HPP_ERR_STR_(clGetProgramBuildInfo)
@@ -820,7 +853,13 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
#define __CREATE_KERNEL_ERR CL_HPP_ERR_STR_(clCreateKernel)
#define __SET_KERNEL_ARGS_ERR CL_HPP_ERR_STR_(clSetKernelArg)
#define __CREATE_PROGRAM_WITH_SOURCE_ERR CL_HPP_ERR_STR_(clCreateProgramWithSource)
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+#define __CREATE_PROGRAM_WITH_IL_ERR CL_HPP_ERR_STR_(clCreateProgramWithIL)
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200
#define __CREATE_PROGRAM_WITH_BINARY_ERR CL_HPP_ERR_STR_(clCreateProgramWithBinary)
+#if CL_HPP_TARGET_OPENCL_VERSION >= 210
+#define __CREATE_PROGRAM_WITH_IL_ERR CL_HPP_ERR_STR_(clCreateProgramWithIL)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 210
#if CL_HPP_TARGET_OPENCL_VERSION >= 120
#define __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR CL_HPP_ERR_STR_(clCreateProgramWithBuiltInKernels)
#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
@@ -857,6 +896,11 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
#if CL_HPP_TARGET_OPENCL_VERSION >= 120
#define __ENQUEUE_MIGRATE_MEM_OBJECTS_ERR CL_HPP_ERR_STR_(clEnqueueMigrateMemObjects)
#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+#if CL_HPP_TARGET_OPENCL_VERSION >= 210
+#define __ENQUEUE_MIGRATE_SVM_ERR CL_HPP_ERR_STR_(clEnqueueSVMMigrateMem)
+#define __SET_DEFAULT_DEVICE_COMMAND_QUEUE_ERR CL_HPP_ERR_STR_(clSetDefaultDeviceCommandQueue)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 210
+
#define __ENQUEUE_ACQUIRE_GL_ERR CL_HPP_ERR_STR_(clEnqueueAcquireGLObjects)
#define __ENQUEUE_RELEASE_GL_ERR CL_HPP_ERR_STR_(clEnqueueReleaseGLObjects)
@@ -871,6 +915,16 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
#define __FINISH_ERR CL_HPP_ERR_STR_(clFinish)
#define __VECTOR_CAPACITY_ERR CL_HPP_ERR_STR_(Vector capacity error)
+#if CL_HPP_TARGET_OPENCL_VERSION >= 210
+#define __GET_HOST_TIMER_ERR CL_HPP_ERR_STR_(clGetHostTimer)
+#define __GET_DEVICE_AND_HOST_TIMER_ERR CL_HPP_ERR_STR_(clGetDeviceAndHostTimer)
+#endif
+#if CL_HPP_TARGET_OPENCL_VERSION >= 220
+#define __SET_PROGRAM_RELEASE_CALLBACK_ERR CL_HPP_ERR_STR_(clSetProgramReleaseCallback)
+#define __SET_PROGRAM_SPECIALIZATION_CONSTANT_ERR CL_HPP_ERR_STR_(clSetProgramSpecializationConstant)
+#endif
+
+
/**
* CL 1.2 version that uses device fission.
*/
@@ -911,6 +965,10 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
#define __ENQUEUE_BARRIER_WAIT_LIST_ERR CL_HPP_ERR_STR_(clEnqueueBarrierWithWaitList)
#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+#if CL_HPP_TARGET_OPENCL_VERSION >= 210
+#define __CLONE_KERNEL_ERR CL_HPP_ERR_STR_(clCloneKernel)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 210
+
#endif // CL_HPP_USER_OVERRIDE_ERROR_STRINGS
//! \endcond
@@ -1137,6 +1195,8 @@ inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_
F(cl_device_info, CL_DEVICE_MEM_BASE_ADDR_ALIGN, cl_uint) \
F(cl_device_info, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, cl_uint) \
F(cl_device_info, CL_DEVICE_SINGLE_FP_CONFIG, cl_device_fp_config) \
+ F(cl_device_info, CL_DEVICE_DOUBLE_FP_CONFIG, cl_device_fp_config) \
+ F(cl_device_info, CL_DEVICE_HALF_FP_CONFIG, cl_device_fp_config) \
F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, cl_device_mem_cache_type) \
F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, cl_uint)\
F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong) \
@@ -1233,8 +1293,6 @@ inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_
F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, cl_uint) \
F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, cl_uint) \
F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, cl_uint) \
- F(cl_device_info, CL_DEVICE_DOUBLE_FP_CONFIG, cl_device_fp_config) \
- F(cl_device_info, CL_DEVICE_HALF_FP_CONFIG, cl_device_fp_config) \
F(cl_device_info, CL_DEVICE_OPENCL_C_VERSION, string) \
\
F(cl_mem_info, CL_MEM_ASSOCIATED_MEMOBJECT, cl::Memory) \
@@ -1259,13 +1317,20 @@ inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_
F(cl_kernel_arg_info, CL_KERNEL_ARG_NAME, string) \
F(cl_kernel_arg_info, CL_KERNEL_ARG_TYPE_QUALIFIER, cl_kernel_arg_type_qualifier) \
\
+ F(cl_kernel_work_group_info, CL_KERNEL_GLOBAL_WORK_SIZE, cl::detail::size_t_array) \
+ \
+ F(cl_device_info, CL_DEVICE_LINKER_AVAILABLE, cl_bool) \
+ F(cl_device_info, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, size_type) \
+ F(cl_device_info, CL_DEVICE_IMAGE_MAX_ARRAY_SIZE, size_type) \
F(cl_device_info, CL_DEVICE_PARENT_DEVICE, cl::Device) \
+ F(cl_device_info, CL_DEVICE_PARTITION_MAX_SUB_DEVICES, cl_uint) \
F(cl_device_info, CL_DEVICE_PARTITION_PROPERTIES, cl::vector<cl_device_partition_property>) \
F(cl_device_info, CL_DEVICE_PARTITION_TYPE, cl::vector<cl_device_partition_property>) \
F(cl_device_info, CL_DEVICE_REFERENCE_COUNT, cl_uint) \
- F(cl_device_info, CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, size_type) \
+ F(cl_device_info, CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, cl_bool) \
F(cl_device_info, CL_DEVICE_PARTITION_AFFINITY_DOMAIN, cl_device_affinity_domain) \
F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS, string) \
+ F(cl_device_info, CL_DEVICE_PRINTF_BUFFER_SIZE, size_type) \
\
F(cl_image_info, CL_IMAGE_ARRAY_SIZE, size_type) \
F(cl_image_info, CL_IMAGE_NUM_MIP_LEVELS, cl_uint) \
@@ -1285,12 +1350,45 @@ inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_
F(cl_device_info, CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT, cl_uint) \
F(cl_device_info, CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT, cl_uint) \
F(cl_device_info, CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT, cl_uint) \
+ F(cl_device_info, CL_DEVICE_IMAGE_PITCH_ALIGNMENT, cl_uint) \
+ F(cl_device_info, CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS, cl_uint ) \
+ F(cl_device_info, CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE, size_type ) \
+ F(cl_device_info, CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE, size_type ) \
+ F(cl_profiling_info, CL_PROFILING_COMMAND_COMPLETE, cl_ulong) \
+ F(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM, cl_bool) \
+ F(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_SVM_PTRS, void**) \
F(cl_command_queue_info, CL_QUEUE_SIZE, cl_uint) \
F(cl_mem_info, CL_MEM_USES_SVM_POINTER, cl_bool) \
F(cl_program_build_info, CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE, size_type) \
F(cl_pipe_info, CL_PIPE_PACKET_SIZE, cl_uint) \
F(cl_pipe_info, CL_PIPE_MAX_PACKETS, cl_uint)
+#define CL_HPP_PARAM_NAME_INFO_SUBGROUP_KHR_(F) \
+ F(cl_kernel_sub_group_info, CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR, size_type) \
+ F(cl_kernel_sub_group_info, CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR, size_type)
+
+#define CL_HPP_PARAM_NAME_INFO_IL_KHR_(F) \
+ F(cl_device_info, CL_DEVICE_IL_VERSION_KHR, string) \
+ F(cl_program_info, CL_PROGRAM_IL_KHR, cl::vector<unsigned char>)
+
+#define CL_HPP_PARAM_NAME_INFO_2_1_(F) \
+ F(cl_platform_info, CL_PLATFORM_HOST_TIMER_RESOLUTION, cl_ulong) \
+ F(cl_program_info, CL_PROGRAM_IL, cl::vector<unsigned char>) \
+ F(cl_device_info, CL_DEVICE_MAX_NUM_SUB_GROUPS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_IL_VERSION, string) \
+ F(cl_device_info, CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS, cl_bool) \
+ F(cl_command_queue_info, CL_QUEUE_DEVICE_DEFAULT, cl::DeviceCommandQueue) \
+ F(cl_kernel_sub_group_info, CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE, size_type) \
+ F(cl_kernel_sub_group_info, CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE, size_type) \
+ F(cl_kernel_sub_group_info, CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT, cl::detail::size_t_array) \
+ F(cl_kernel_sub_group_info, CL_KERNEL_MAX_NUM_SUB_GROUPS, size_type) \
+ F(cl_kernel_sub_group_info, CL_KERNEL_COMPILE_NUM_SUB_GROUPS, size_type)
+
+#define CL_HPP_PARAM_NAME_INFO_2_2_(F) \
+ F(cl_program_info, CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT, cl_bool) \
+ F(cl_program_info, CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT, cl_bool)
+
#define CL_HPP_PARAM_NAME_DEVICE_FISSION_(F) \
F(cl_device_info, CL_DEVICE_PARENT_DEVICE_EXT, cl_device_id) \
F(cl_device_info, CL_DEVICE_PARTITION_TYPES_EXT, cl::vector<cl_device_partition_property_ext>) \
@@ -1298,6 +1396,43 @@ inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_
F(cl_device_info, CL_DEVICE_REFERENCE_COUNT_EXT , cl_uint) \
F(cl_device_info, CL_DEVICE_PARTITION_STYLE_EXT, cl::vector<cl_device_partition_property_ext>)
+#define CL_HPP_PARAM_NAME_CL_KHR_EXTENDED_VERSIONING_CL3_SHARED_(F) \
+ F(cl_platform_info, CL_PLATFORM_NUMERIC_VERSION_KHR, cl_version_khr) \
+ F(cl_platform_info, CL_PLATFORM_EXTENSIONS_WITH_VERSION_KHR, cl::vector<cl_name_version_khr>) \
+ \
+ F(cl_device_info, CL_DEVICE_NUMERIC_VERSION_KHR, cl_version_khr) \
+ F(cl_device_info, CL_DEVICE_EXTENSIONS_WITH_VERSION_KHR, cl::vector<cl_name_version_khr>) \
+ F(cl_device_info, CL_DEVICE_ILS_WITH_VERSION_KHR, cl::vector<cl_name_version_khr>) \
+ F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION_KHR, cl::vector<cl_name_version_khr>)
+
+#define CL_HPP_PARAM_NAME_CL_KHR_EXTENDED_VERSIONING_KHRONLY_(F) \
+ F(cl_device_info, CL_DEVICE_OPENCL_C_NUMERIC_VERSION_KHR, cl_version_khr)
+
+#define CL_HPP_PARAM_NAME_INFO_3_0_(F) \
+ F(cl_platform_info, CL_PLATFORM_NUMERIC_VERSION, cl_version) \
+ F(cl_platform_info, CL_PLATFORM_EXTENSIONS_WITH_VERSION, cl::vector<cl_name_version>) \
+ \
+ F(cl_device_info, CL_DEVICE_NUMERIC_VERSION, cl_version) \
+ F(cl_device_info, CL_DEVICE_EXTENSIONS_WITH_VERSION, cl::vector<cl_name_version>) \
+ F(cl_device_info, CL_DEVICE_ILS_WITH_VERSION, cl::vector<cl_name_version>) \
+ F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION, cl::vector<cl_name_version>) \
+ F(cl_device_info, CL_DEVICE_ATOMIC_MEMORY_CAPABILITIES, cl_device_atomic_capabilities) \
+ F(cl_device_info, CL_DEVICE_ATOMIC_FENCE_CAPABILITIES, cl_device_atomic_capabilities) \
+ F(cl_device_info, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, cl_bool) \
+ F(cl_device_info, CL_DEVICE_OPENCL_C_ALL_VERSIONS, cl::vector<cl_name_version>) \
+ F(cl_device_info, CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, size_type) \
+ F(cl_device_info, CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT, cl_bool) \
+ F(cl_device_info, CL_DEVICE_GENERIC_ADDRESS_SPACE_SUPPORT, cl_bool) \
+ F(cl_device_info, CL_DEVICE_OPENCL_C_FEATURES, cl::vector<cl_name_version>) \
+ F(cl_device_info, CL_DEVICE_DEVICE_ENQUEUE_CAPABILITIES, cl_device_device_enqueue_capabilities) \
+ F(cl_device_info, CL_DEVICE_PIPE_SUPPORT, cl_bool) \
+ F(cl_device_info, CL_DEVICE_LATEST_CONFORMANCE_VERSION_PASSED, string) \
+ \
+ F(cl_command_queue_info, CL_QUEUE_PROPERTIES_ARRAY, cl::vector<cl_queue_properties>) \
+ F(cl_mem_info, CL_MEM_PROPERTIES, cl::vector<cl_mem_properties>) \
+ F(cl_pipe_info, CL_PIPE_PROPERTIES, cl::vector<cl_pipe_properties>) \
+ F(cl_sampler_info, CL_SAMPLER_PROPERTIES, cl::vector<cl_sampler_properties>)
+
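
The CL_HPP_PARAM_NAME_INFO_2_1_ and CL_HPP_PARAM_NAME_INFO_3_0_ tables above register param_traits entries so the typed getInfo<> wrappers can return the new OpenCL 2.1/3.0 query tokens directly. A minimal usage sketch, assuming the header is included with CL_HPP_TARGET_OPENCL_VERSION set to 300 and a platform/device that actually implement these queries:

#define CL_HPP_TARGET_OPENCL_VERSION 300
#include <CL/opencl.hpp>
#include <iostream>

int main() {
    cl::Platform platform = cl::Platform::getDefault();
    cl::Device device = cl::Device::getDefault();

    // CL_PLATFORM_NUMERIC_VERSION maps to cl_version (a packed major/minor/patch value).
    cl_version packedVersion = platform.getInfo<CL_PLATFORM_NUMERIC_VERSION>();
    std::cout << "packed platform version: " << packedVersion << "\n";

    // CL_DEVICE_MAX_NUM_SUB_GROUPS maps to cl_uint via the 2.1 table.
    cl_uint maxSubGroups = device.getInfo<CL_DEVICE_MAX_NUM_SUB_GROUPS>();
    std::cout << "max sub-groups per work-group: " << maxSubGroups << "\n";

    // CL_DEVICE_OPENCL_C_FEATURES maps to cl::vector<cl_name_version> via the 3.0 table.
    for (const cl_name_version &feature : device.getInfo<CL_DEVICE_OPENCL_C_FEATURES>()) {
        std::cout << feature.name << "\n";
    }
    return 0;
}
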
template <typename enum_type, cl_int Name>
struct param_traits {};
@@ -1316,10 +1451,27 @@ CL_HPP_PARAM_NAME_INFO_1_1_(CL_HPP_DECLARE_PARAM_TRAITS_)
#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110
#if CL_HPP_TARGET_OPENCL_VERSION >= 120
CL_HPP_PARAM_NAME_INFO_1_2_(CL_HPP_DECLARE_PARAM_TRAITS_)
-#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
#if CL_HPP_TARGET_OPENCL_VERSION >= 200
CL_HPP_PARAM_NAME_INFO_2_0_(CL_HPP_DECLARE_PARAM_TRAITS_)
-#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200
+#if CL_HPP_TARGET_OPENCL_VERSION >= 210
+CL_HPP_PARAM_NAME_INFO_2_1_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 210
+#if CL_HPP_TARGET_OPENCL_VERSION >= 220
+CL_HPP_PARAM_NAME_INFO_2_2_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 220
+#if CL_HPP_TARGET_OPENCL_VERSION >= 300
+CL_HPP_PARAM_NAME_INFO_3_0_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 300
+
+#if defined(CL_HPP_USE_CL_SUB_GROUPS_KHR) && CL_HPP_TARGET_OPENCL_VERSION < 210
+CL_HPP_PARAM_NAME_INFO_SUBGROUP_KHR_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // #if defined(CL_HPP_USE_CL_SUB_GROUPS_KHR) && CL_HPP_TARGET_OPENCL_VERSION < 210
+
+#if defined(CL_HPP_USE_IL_KHR)
+CL_HPP_PARAM_NAME_INFO_IL_KHR_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // #if defined(CL_HPP_USE_IL_KHR)
// Flags deprecated in OpenCL 2.0
@@ -1348,6 +1500,31 @@ CL_HPP_PARAM_NAME_INFO_1_2_DEPRECATED_IN_2_0_(CL_HPP_DECLARE_PARAM_TRAITS_)
CL_HPP_PARAM_NAME_DEVICE_FISSION_(CL_HPP_DECLARE_PARAM_TRAITS_);
#endif // CL_HPP_USE_CL_DEVICE_FISSION
+#if defined(cl_khr_extended_versioning)
+#if CL_HPP_TARGET_OPENCL_VERSION < 300
+CL_HPP_PARAM_NAME_CL_KHR_EXTENDED_VERSIONING_CL3_SHARED_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // CL_HPP_TARGET_OPENCL_VERSION < 300
+CL_HPP_PARAM_NAME_CL_KHR_EXTENDED_VERSIONING_KHRONLY_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // cl_khr_extended_versioning
+
+#if defined(cl_khr_device_uuid)
+using uuid_array = array<cl_uchar, CL_UUID_SIZE_KHR>;
+using luid_array = array<cl_uchar, CL_LUID_SIZE_KHR>;
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_UUID_KHR, uuid_array)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DRIVER_UUID_KHR, uuid_array)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LUID_VALID_KHR, cl_bool)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LUID_KHR, luid_array)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_NODE_MASK_KHR, cl_uint)
+#endif
+
+#if defined(cl_khr_pci_bus_info)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_PCI_BUS_INFO_KHR, cl_device_pci_bus_info_khr)
+#endif
+
+#if defined(cl_khr_integer_dot_product)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_INTEGER_DOT_PRODUCT_CAPABILITIES_KHR, cl_device_integer_dot_product_capabilities_khr)
+#endif
+
#ifdef CL_PLATFORM_ICD_SUFFIX_KHR
CL_HPP_DECLARE_PARAM_TRAITS_(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, string)
#endif
@@ -1355,7 +1532,6 @@ CL_HPP_DECLARE_PARAM_TRAITS_(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, strin
#ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD
CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_PROFILING_TIMER_OFFSET_AMD, cl_ulong)
#endif
-
#ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD
CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, vector<size_type>)
#endif
@@ -1387,6 +1563,22 @@ CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUT
CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LOCAL_MEM_BANKS_AMD, cl_uint)
#endif
+#ifdef CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM, cl_ulong)
+#endif
+#ifdef CL_DEVICE_JOB_SLOTS_ARM
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_JOB_SLOTS_ARM, cl_uint)
+#endif
+#ifdef CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM, cl_bitfield)
+#endif
+#ifdef CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_ARM
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_ARM, cl_uint)
+#endif
+#ifdef CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM, cl_int)
+#endif
+
#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint)
#endif
@@ -1697,10 +1889,7 @@ public:
cl_type& operator ()() { return object_; }
- const cl_type get() const { return object_; }
-
- cl_type get() { return object_; }
-
+ cl_type get() const { return object_; }
protected:
template<typename Func, typename U>
@@ -1826,9 +2015,7 @@ public:
cl_type& operator ()() { return object_; }
- const cl_type get() const { return object_; }
-
- cl_type get() { return object_; }
+ cl_type get() const { return object_; }
protected:
template<typename Func, typename U>
@@ -1938,6 +2125,9 @@ struct ImageFormat : public cl_image_format
image_channel_data_type = type;
}
+ //! \brief Copy constructor.
+ ImageFormat(const ImageFormat &other) { *this = other; }
+
//! \brief Assignment operator.
ImageFormat& operator = (const ImageFormat& rhs)
{
@@ -2079,7 +2269,7 @@ public:
}
//! \brief Wrapper for clGetDeviceInfo() that returns by value.
- template <cl_int name> typename
+ template <cl_device_info name> typename
detail::param_traits<detail::cl_device_info, name>::param_type
getInfo(cl_int* err = NULL) const
{
@@ -2092,6 +2282,53 @@ public:
return param;
}
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 210
+ /**
+ * Return the current value of the host clock as seen by the device.
+     * The resolution of the host timer may be queried with the
+     * CL_PLATFORM_HOST_TIMER_RESOLUTION query.
+ * @return The host timer value.
+ */
+ cl_ulong getHostTimer(cl_int *error = nullptr)
+ {
+ cl_ulong retVal = 0;
+ cl_int err =
+ clGetHostTimer(this->get(), &retVal);
+ detail::errHandler(
+ err,
+ __GET_HOST_TIMER_ERR);
+ if (error) {
+ *error = err;
+ }
+ return retVal;
+ }
+
+ /**
+     * Return a synchronized pair of host and device timestamps as seen by the device.
+     * Use this to correlate the two clocks; between calls, getHostTimer alone provides
+     * a lower-cost way to sample the host timer.
+ * The resolution of the host timer may be queried with the
+ * CL_PLATFORM_HOST_TIMER_RESOLUTION query.
+ * The resolution of the device timer may be queried with the
+ * CL_DEVICE_PROFILING_TIMER_RESOLUTION query.
+ * @return A pair of (device timer, host timer) timer values.
+ */
+ std::pair<cl_ulong, cl_ulong> getDeviceAndHostTimer(cl_int *error = nullptr)
+ {
+ std::pair<cl_ulong, cl_ulong> retVal;
+ cl_int err =
+ clGetDeviceAndHostTimer(this->get(), &(retVal.first), &(retVal.second));
+ detail::errHandler(
+ err,
+ __GET_DEVICE_AND_HOST_TIMER_ERR);
+ if (error) {
+ *error = err;
+ }
+ return retVal;
+ }
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210
+
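
getHostTimer() and getDeviceAndHostTimer() above are thin wrappers over clGetHostTimer and clGetDeviceAndHostTimer. A rough usage sketch, assuming CL_HPP_TARGET_OPENCL_VERSION >= 210 and a platform that implements the OpenCL 2.1 timer queries; both timer values are in nanoseconds:

#define CL_HPP_TARGET_OPENCL_VERSION 210
#include <CL/opencl.hpp>
#include <iostream>
#include <utility>

int main() {
    cl::Device device = cl::Device::getDefault();
    cl_int err = CL_SUCCESS;

    // One synchronized sample of (device timer, host timer) to correlate the two clocks.
    std::pair<cl_ulong, cl_ulong> base = device.getDeviceAndHostTimer(&err);

    // Afterwards the cheaper host-only query is enough to track elapsed time
    // on the same timebase as device-side timestamps.
    cl_ulong hostNow = device.getHostTimer(&err);
    std::cout << "elapsed: " << (hostNow - base.second) << " ns\n";

    return err == CL_SUCCESS ? 0 : 1;
}
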
/**
* CL 1.2 version
*/
@@ -2144,7 +2381,7 @@ public:
const cl_device_partition_property_ext * /* properties */,
cl_uint /*num_entries*/,
cl_device_id * /*out_devices*/,
- cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+ cl_uint * /*num_devices*/ ) CL_API_SUFFIX__VERSION_1_1;
static PFN_clCreateSubDevicesEXT pfn_clCreateSubDevicesEXT = NULL;
CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateSubDevicesEXT);
@@ -2310,7 +2547,8 @@ public:
}
//! \brief Wrapper for clGetPlatformInfo().
- cl_int getInfo(cl_platform_info name, string* param) const
+ template <typename T>
+ cl_int getInfo(cl_platform_info name, T* param) const
{
return detail::errHandler(
detail::getInfo(&::clGetPlatformInfo, object_, name, param),
@@ -2318,7 +2556,7 @@ public:
}
//! \brief Wrapper for clGetPlatformInfo() that returns by value.
- template <cl_int name> typename
+ template <cl_platform_info name> typename
detail::param_traits<detail::cl_platform_info, name>::param_type
getInfo(cl_int* err = NULL) const
{
@@ -2344,14 +2582,16 @@ public:
return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR);
}
cl_int err = ::clGetDeviceIDs(object_, type, 0, NULL, &n);
- if (err != CL_SUCCESS) {
+ if (err != CL_SUCCESS && err != CL_DEVICE_NOT_FOUND) {
return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
}
vector<cl_device_id> ids(n);
- err = ::clGetDeviceIDs(object_, type, n, ids.data(), NULL);
- if (err != CL_SUCCESS) {
- return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+ if (n>0) {
+ err = ::clGetDeviceIDs(object_, type, n, ids.data(), NULL);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+ }
}
// Cannot trivially assign because we need to capture intermediates
@@ -2551,8 +2791,8 @@ CL_HPP_DEFINE_STATIC_MEMBER_ cl_int Platform::default_error_ = CL_SUCCESS;
* Unload the OpenCL compiler.
* \note Deprecated for OpenCL 1.2. Use Platform::unloadCompiler instead.
*/
-inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int
-UnloadCompiler() CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+inline CL_API_PREFIX__VERSION_1_1_DEPRECATED cl_int
+UnloadCompiler() CL_API_SUFFIX__VERSION_1_1_DEPRECATED;
inline cl_int
UnloadCompiler()
{
@@ -2642,7 +2882,7 @@ public:
*/
Context(
const vector<Device>& devices,
- cl_context_properties* properties = NULL,
+ const cl_context_properties* properties = NULL,
void (CL_CALLBACK * notifyFptr)(
const char *,
const void *,
@@ -2671,9 +2911,13 @@ public:
}
}
+ /*! \brief Constructs a context including a specific device.
+ *
+ * Wraps clCreateContext().
+ */
Context(
const Device& device,
- cl_context_properties* properties = NULL,
+ const cl_context_properties* properties = NULL,
void (CL_CALLBACK * notifyFptr)(
const char *,
const void *,
@@ -2703,7 +2947,7 @@ public:
*/
Context(
cl_device_type type,
- cl_context_properties* properties = NULL,
+ const cl_context_properties* properties = NULL,
void (CL_CALLBACK * notifyFptr)(
const char *,
const void *,
@@ -2742,7 +2986,9 @@ public:
error = platforms[i].getDevices(type, &devices);
#if defined(CL_HPP_ENABLE_EXCEPTIONS)
- } catch (Error) {}
+ } catch (cl::Error& e) {
+ error = e.err();
+ }
// Catch if exceptions are enabled as we don't want to exit if first platform has no devices of type
// We do error checking next anyway, and can throw there if needed
#endif
@@ -2871,7 +3117,7 @@ public:
}
//! \brief Wrapper for clGetContextInfo() that returns by value.
- template <cl_int name> typename
+ template <cl_context_info name> typename
detail::param_traits<detail::cl_context_info, name>::param_type
getInfo(cl_int* err = NULL) const
{
@@ -3013,7 +3259,7 @@ public:
}
//! \brief Wrapper for clGetEventInfo() that returns by value.
- template <cl_int name> typename
+ template <cl_event_info name> typename
detail::param_traits<detail::cl_event_info, name>::param_type
getInfo(cl_int* err = NULL) const
{
@@ -3036,7 +3282,7 @@ public:
}
//! \brief Wrapper for clGetEventProfilingInfo() that returns by value.
- template <cl_int name> typename
+ template <cl_profiling_info name> typename
detail::param_traits<detail::cl_profiling_info, name>::param_type
getProfilingInfo(cl_int* err = NULL) const
{
@@ -3067,7 +3313,7 @@ public:
*/
cl_int setCallback(
cl_int type,
- void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *),
+ void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *),
void * user_data = NULL)
{
return detail::errHandler(
@@ -3228,7 +3474,7 @@ public:
}
//! \brief Wrapper for clGetMemObjectInfo() that returns by value.
- template <cl_int name> typename
+ template <cl_mem_info name> typename
detail::param_traits<detail::cl_mem_info, name>::param_type
getInfo(cl_int* err = NULL) const
{
@@ -3256,7 +3502,7 @@ public:
* value - not the Memory class instance.
*/
cl_int setDestructorCallback(
- void (CL_CALLBACK * pfn_notify)(cl_mem, void *),
+ void (CL_CALLBACK * pfn_notify)(cl_mem, void *),
void * user_data = NULL)
{
return detail::errHandler(
@@ -3494,7 +3740,7 @@ public:
{
size_type maxSize = std::numeric_limits<size_type>::max() / sizeof(T);
- for (Device &d : context_.getInfo<CL_CONTEXT_DEVICES>()) {
+ for (const Device &d : context_.getInfo<CL_CONTEXT_DEVICES>()) {
maxSize = std::min(
maxSize,
static_cast<size_type>(d.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>()));
@@ -3599,7 +3845,7 @@ cl::pointer<T, detail::Deleter<Alloc>> allocate_pointer(const Alloc &alloc_, Arg
return cl::pointer<T, detail::Deleter<Alloc>>(tmp, detail::Deleter<Alloc>{alloc, copies});
}
- catch (std::bad_alloc b)
+ catch (std::bad_alloc& b)
{
std::allocator_traits<Alloc>::deallocate(alloc, tmp, copies);
throw;
@@ -3847,7 +4093,7 @@ public:
}
return result;
- }
+ }
#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110
};
@@ -4226,7 +4472,7 @@ public:
}
//! \brief Wrapper for clGetImageInfo() that returns by value.
- template <cl_int name> typename
+ template <cl_image_info name> typename
detail::param_traits<detail::cl_image_info, name>::param_type
getImageInfo(cl_int* err = NULL) const
{
@@ -4263,12 +4509,11 @@ public:
cl_int* err = NULL)
{
cl_int error;
- cl_image_desc desc =
- {
- CL_MEM_OBJECT_IMAGE1D,
- width,
- 0, 0, 0, 0, 0, 0, 0, 0
- };
+
+ cl_image_desc desc = {0};
+ desc.image_type = CL_MEM_OBJECT_IMAGE1D;
+ desc.image_width = width;
+
object_ = ::clCreateImage(
context(),
flags,
@@ -4351,13 +4596,12 @@ public:
cl_int* err = NULL)
{
cl_int error;
- cl_image_desc desc =
- {
- CL_MEM_OBJECT_IMAGE1D_BUFFER,
- width,
- 0, 0, 0, 0, 0, 0, 0,
- buffer()
- };
+
+ cl_image_desc desc = {0};
+ desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+ desc.image_width = width;
+ desc.buffer = buffer();
+
object_ = ::clCreateImage(
context(),
flags,
@@ -4437,15 +4681,13 @@ public:
cl_int* err = NULL)
{
cl_int error;
- cl_image_desc desc =
- {
- CL_MEM_OBJECT_IMAGE1D_ARRAY,
- width,
- 0, 0, // height, depth (unused)
- arraySize,
- rowPitch,
- 0, 0, 0, 0
- };
+
+ cl_image_desc desc = {0};
+ desc.image_type = CL_MEM_OBJECT_IMAGE1D_ARRAY;
+ desc.image_width = width;
+ desc.image_array_size = arraySize;
+ desc.image_row_pitch = rowPitch;
+
object_ = ::clCreateImage(
context(),
flags,
@@ -4552,15 +4794,12 @@ public:
#if CL_HPP_TARGET_OPENCL_VERSION >= 120
if (useCreateImage)
{
- cl_image_desc desc =
- {
- CL_MEM_OBJECT_IMAGE2D,
- width,
- height,
- 0, 0, // depth, array size (unused)
- row_pitch,
- 0, 0, 0, 0
- };
+ cl_image_desc desc = {0};
+ desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+ desc.image_width = width;
+ desc.image_height = height;
+ desc.image_row_pitch = row_pitch;
+
object_ = ::clCreateImage(
context(),
flags,
@@ -4589,7 +4828,7 @@ public:
#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 120
}
-#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200 || defined(CL_HPP_USE_CL_IMAGE2D_FROM_BUFFER_KHR)
/*! \brief Constructs a 2D Image from a buffer.
* \note This will share storage with the underlying buffer.
*
@@ -4606,17 +4845,13 @@ public:
{
cl_int error;
- cl_image_desc desc =
- {
- CL_MEM_OBJECT_IMAGE2D,
- width,
- height,
- 0, 0, // depth, array size (unused)
- row_pitch,
- 0, 0, 0,
- // Use buffer as input to image
- sourceBuffer()
- };
+ cl_image_desc desc = {0};
+ desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+ desc.image_width = width;
+ desc.image_height = height;
+ desc.image_row_pitch = row_pitch;
+ desc.buffer = sourceBuffer();
+
object_ = ::clCreateImage(
context(),
0, // flags inherited from buffer
@@ -4630,7 +4865,7 @@ public:
*err = error;
}
}
-#endif //#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+#endif //#if CL_HPP_TARGET_OPENCL_VERSION >= 200 || defined(CL_HPP_USE_CL_IMAGE2D_FROM_BUFFER_KHR)
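
The widened guard above also exposes the Image2D-from-Buffer constructor on pre-2.0 targets when CL_HPP_USE_CL_IMAGE2D_FROM_BUFFER_KHR is defined, i.e. when the platform provides cl_khr_image2d_from_buffer. A hedged sketch; the image format, dimensions and row pitch of 0 are illustrative only:

#define CL_HPP_USE_CL_IMAGE2D_FROM_BUFFER_KHR
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#define CL_HPP_TARGET_OPENCL_VERSION 120
#include <CL/opencl.hpp>

int main() {
    cl::Context context = cl::Context::getDefault();
    const size_t width = 256, height = 256;

    // Linear buffer that will back the image; the image aliases this storage.
    cl::Buffer pixels(context, CL_MEM_READ_WRITE, width * height * 4 * sizeof(float));

    cl_int err = CL_SUCCESS;
    cl::Image2D view(context,
                     cl::ImageFormat(CL_RGBA, CL_FLOAT),
                     pixels,
                     width, height,
                     0 /* row_pitch: let the runtime choose */, &err);

    return err == CL_SUCCESS ? 0 : 1;
}
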
#if CL_HPP_TARGET_OPENCL_VERSION >= 200
/*! \brief Constructs a 2D Image from an image.
@@ -4670,19 +4905,16 @@ public:
// Update only the channel order.
// Channel format inherited from source.
sourceFormat.image_channel_order = order;
- cl_image_desc desc =
- {
- CL_MEM_OBJECT_IMAGE2D,
- sourceWidth,
- sourceHeight,
- 0, 0, // depth (unused), array size (unused)
- sourceRowPitch,
- 0, // slice pitch (unused)
- sourceNumMIPLevels,
- sourceNumSamples,
- // Use buffer as input to image
- sourceImage()
- };
+
+ cl_image_desc desc = {0};
+ desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+ desc.image_width = sourceWidth;
+ desc.image_height = sourceHeight;
+ desc.image_row_pitch = sourceRowPitch;
+ desc.num_mip_levels = sourceNumMIPLevels;
+ desc.num_samples = sourceNumSamples;
+ desc.buffer = sourceImage();
+
object_ = ::clCreateImage(
context(),
0, // flags should be inherited from mem_object
@@ -4762,7 +4994,7 @@ public:
* \see Memory
* \note Deprecated for OpenCL 1.2. Please use ImageGL instead.
*/
-class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED Image2DGL : public Image2D
+class CL_API_PREFIX__VERSION_1_1_DEPRECATED Image2DGL : public Image2D
{
public:
/*! \brief Constructs an Image2DGL in a specified context, from a given
@@ -4845,7 +5077,7 @@ public:
return *this;
}
-} CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+} CL_API_SUFFIX__VERSION_1_1_DEPRECATED;
#endif // CL_USE_DEPRECATED_OPENCL_1_1_APIS
#if CL_HPP_TARGET_OPENCL_VERSION >= 120
@@ -4868,17 +5100,15 @@ public:
cl_int* err = NULL)
{
cl_int error;
- cl_image_desc desc =
- {
- CL_MEM_OBJECT_IMAGE2D_ARRAY,
- width,
- height,
- 0, // depth (unused)
- arraySize,
- rowPitch,
- slicePitch,
- 0, 0, 0
- };
+
+ cl_image_desc desc = {0};
+ desc.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY;
+ desc.image_width = width;
+ desc.image_height = height;
+ desc.image_array_size = arraySize;
+ desc.image_row_pitch = rowPitch;
+ desc.image_slice_pitch = slicePitch;
+
object_ = ::clCreateImage(
context(),
flags,
@@ -4983,17 +5213,14 @@ public:
#if CL_HPP_TARGET_OPENCL_VERSION >= 120
if (useCreateImage)
{
- cl_image_desc desc =
- {
- CL_MEM_OBJECT_IMAGE3D,
- width,
- height,
- depth,
- 0, // array size (unused)
- row_pitch,
- slice_pitch,
- 0, 0, 0
- };
+ cl_image_desc desc = {0};
+ desc.image_type = CL_MEM_OBJECT_IMAGE3D;
+ desc.image_width = width;
+ desc.image_height = height;
+ desc.image_depth = depth;
+ desc.image_row_pitch = row_pitch;
+ desc.image_slice_pitch = slice_pitch;
+
object_ = ::clCreateImage(
context(),
flags,
@@ -5375,7 +5602,7 @@ public:
}
//! \brief Wrapper for clGetMemObjectInfo() that returns by value.
- template <cl_int name> typename
+ template <cl_pipe_info name> typename
detail::param_traits<detail::cl_pipe_info, name>::param_type
getInfo(cl_int* err = NULL) const
{
@@ -5508,7 +5735,7 @@ public:
}
//! \brief Wrapper for clGetSamplerInfo() that returns by value.
- template <cl_int name> typename
+ template <cl_sampler_info name> typename
detail::param_traits<detail::cl_sampler_info, name>::param_type
getInfo(cl_int* err = NULL) const
{
@@ -5731,7 +5958,7 @@ public:
__GET_KERNEL_INFO_ERR);
}
- template <cl_int name> typename
+ template <cl_kernel_info name> typename
detail::param_traits<detail::cl_kernel_info, name>::param_type
getInfo(cl_int* err = NULL) const
{
@@ -5753,7 +5980,7 @@ public:
__GET_KERNEL_ARG_INFO_ERR);
}
- template <cl_int name> typename
+ template <cl_kernel_arg_info name> typename
detail::param_traits<detail::cl_kernel_arg_info, name>::param_type
getArgInfo(cl_uint argIndex, cl_int* err = NULL) const
{
@@ -5777,7 +6004,7 @@ public:
__GET_KERNEL_WORK_GROUP_INFO_ERR);
}
- template <cl_int name> typename
+ template <cl_kernel_work_group_info name> typename
detail::param_traits<detail::cl_kernel_work_group_info, name>::param_type
getWorkGroupInfo(const Device& device, cl_int* err = NULL) const
{
@@ -5790,20 +6017,29 @@ public:
return param;
}
-#if CL_HPP_TARGET_OPENCL_VERSION >= 200
-#if defined(CL_HPP_USE_CL_SUB_GROUPS_KHR)
+#if (CL_HPP_TARGET_OPENCL_VERSION >= 200 && defined(CL_HPP_USE_CL_SUB_GROUPS_KHR)) || CL_HPP_TARGET_OPENCL_VERSION >= 210
cl_int getSubGroupInfo(const cl::Device &dev, cl_kernel_sub_group_info name, const cl::NDRange &range, size_type* param) const
{
+#if CL_HPP_TARGET_OPENCL_VERSION >= 210
+
+ return detail::errHandler(
+ clGetKernelSubGroupInfo(object_, dev(), name, range.size(), range.get(), sizeof(size_type), param, nullptr),
+ __GET_KERNEL_SUB_GROUP_INFO_ERR);
+
+#else // #if CL_HPP_TARGET_OPENCL_VERSION >= 210
+
typedef clGetKernelSubGroupInfoKHR_fn PFN_clGetKernelSubGroupInfoKHR;
static PFN_clGetKernelSubGroupInfoKHR pfn_clGetKernelSubGroupInfoKHR = NULL;
CL_HPP_INIT_CL_EXT_FCN_PTR_(clGetKernelSubGroupInfoKHR);
return detail::errHandler(
pfn_clGetKernelSubGroupInfoKHR(object_, dev(), name, range.size(), range.get(), sizeof(size_type), param, nullptr),
- __GET_KERNEL_ARG_INFO_ERR);
+ __GET_KERNEL_SUB_GROUP_INFO_ERR);
+
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210
}
- template <cl_int name>
+ template <cl_kernel_sub_group_info name>
size_type getSubGroupInfo(const cl::Device &dev, const cl::NDRange &range, cl_int* err = NULL) const
{
size_type param;
@@ -5813,7 +6049,6 @@ public:
}
return param;
}
-#endif // #if defined(CL_HPP_USE_CL_SUB_GROUPS_KHR)
#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200
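
With the guard relaxed as above, getSubGroupInfo() calls clGetKernelSubGroupInfo directly on a 2.1+ target and only falls back to the KHR entry point when CL_HPP_USE_CL_SUB_GROUPS_KHR is used on a 2.0 target. A brief sketch under assumptions: CL_HPP_TARGET_OPENCL_VERSION >= 210, plus existing cl::Kernel and cl::Device objects named kernel and device; the local size is illustrative:

cl::NDRange local(64, 1, 1);
cl_int err = CL_SUCCESS;

cl::size_type maxSubGroupSize =
    kernel.getSubGroupInfo<CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE>(device, local, &err);
cl::size_type subGroupCount =
    kernel.getSubGroupInfo<CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE>(device, local, &err);
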
#if CL_HPP_TARGET_OPENCL_VERSION >= 200
@@ -5925,27 +6160,28 @@ public:
);
}
- template<int index, int ArrayLength, class D, typename T0, typename... Ts>
- void setSVMPointersHelper(std::array<void*, ArrayLength> &pointerList, const pointer<T0, D> &t0, Ts... ts)
+ template<int index, int ArrayLength, class D, typename T0, typename T1, typename... Ts>
+ void setSVMPointersHelper(std::array<void*, ArrayLength> &pointerList, const pointer<T0, D> &t0, const pointer<T1, D> &t1, Ts & ... ts)
{
pointerList[index] = static_cast<void*>(t0.get());
- setSVMPointersHelper<index + 1, Ts...>(ts...);
+ setSVMPointersHelper<index + 1, ArrayLength>(pointerList, t1, ts...);
}
- template<int index, int ArrayLength, typename T0, typename... Ts>
+ template<int index, int ArrayLength, typename T0, typename T1, typename... Ts>
typename std::enable_if<std::is_pointer<T0>::value, void>::type
- setSVMPointersHelper(std::array<void*, ArrayLength> &pointerList, T0 t0, Ts... ts)
+ setSVMPointersHelper(std::array<void*, ArrayLength> &pointerList, T0 t0, T1 t1, Ts... ts)
{
pointerList[index] = static_cast<void*>(t0);
- setSVMPointersHelper<index + 1, Ts...>(ts...);
+ setSVMPointersHelper<index + 1, ArrayLength>(pointerList, t1, ts...);
}
-
+
template<int index, int ArrayLength, typename T0, class D>
void setSVMPointersHelper(std::array<void*, ArrayLength> &pointerList, const pointer<T0, D> &t0)
{
pointerList[index] = static_cast<void*>(t0.get());
}
+
template<int index, int ArrayLength, typename T0>
typename std::enable_if<std::is_pointer<T0>::value, void>::type
setSVMPointersHelper(std::array<void*, ArrayLength> &pointerList, T0 t0)
@@ -5954,7 +6190,7 @@ public:
}
template<typename T0, typename... Ts>
- cl_int setSVMPointers(const T0 &t0, Ts... ts)
+ cl_int setSVMPointers(const T0 &t0, Ts & ... ts)
{
std::array<void*, 1 + sizeof...(Ts)> pointerList;
@@ -5966,7 +6202,40 @@ public:
sizeof(void*)*(1 + sizeof...(Ts)),
pointerList.data()));
}
+
+ template<typename T>
+ cl_int setExecInfo(cl_kernel_exec_info param_name, const T& val)
+ {
+ return detail::errHandler(
+ ::clSetKernelExecInfo(
+ object_,
+ param_name,
+ sizeof(T),
+ &val));
+ }
+
+ template<cl_kernel_exec_info name>
+ cl_int setExecInfo(typename detail::param_traits<detail::cl_kernel_exec_info, name>::param_type& val)
+ {
+ return setExecInfo(name, val);
+ }
#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200
+
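
The reworked setSVMPointersHelper() overloads and the new setExecInfo() wrappers above describe SVM allocations that a kernel dereferences without them being explicit kernel arguments. A hedged sketch, assuming CL_HPP_TARGET_OPENCL_VERSION >= 200 and an existing cl::Kernel named kernel; the allocation size is illustrative:

// Coarse-grain SVM allocation the kernel reaches through pointer chasing rather
// than through a directly set argument.
cl::SVMAllocator<int, cl::SVMTraitCoarse<>> svmAlloc;
int *extraData = svmAlloc.allocate(1024);

// Register the indirectly used SVM pointer with the kernel.
kernel.setSVMPointers(extraData);

// Typed wrapper over clSetKernelExecInfo; only meaningful on devices that report
// fine-grain system SVM support.
cl_bool fineGrainSystem = CL_TRUE;
kernel.setExecInfo<CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM>(fineGrainSystem);

svmAlloc.deallocate(extraData, 1024);
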
+#if CL_HPP_TARGET_OPENCL_VERSION >= 210
+ /**
+ * Make a deep copy of the kernel object including its arguments.
+ * @return A new kernel object with internal state entirely separate from that
+ * of the original but with any arguments set on the original intact.
+ */
+ Kernel clone()
+ {
+ cl_int error;
+ Kernel retValue(clCloneKernel(this->get(), &error));
+
+ detail::errHandler(error, __CLONE_KERNEL_ERR);
+ return retValue;
+ }
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210
};
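
Kernel::clone() above wraps clCloneKernel: the copy carries over any arguments already set on the source kernel but has independent state afterwards, which makes per-thread or per-enqueue argument setting safe. A small sketch; the kernel name "axpy" and the buffers are assumptions:

// Assumes CL_HPP_TARGET_OPENCL_VERSION >= 210, a built cl::Program 'program',
// and cl::Buffer objects 'x' and 'y'.
cl::Kernel prototype(program, "axpy");
prototype.setArg(0, 2.0f);            // carried over into every clone

cl::Kernel copy = prototype.clone();  // deep copy via clCloneKernel
copy.setArg(1, x);                    // only affects this copy
copy.setArg(2, y);
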
/*! \class Program
@@ -6129,6 +6398,116 @@ public:
}
}
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 210 || (CL_HPP_TARGET_OPENCL_VERSION==200 && defined(CL_HPP_USE_IL_KHR))
+ /**
+ * Program constructor to allow construction of program from SPIR-V or another IL.
+ * Valid for either OpenCL >= 2.1 or when CL_HPP_USE_IL_KHR is defined.
+ */
+ Program(
+ const vector<char>& IL,
+ bool build = false,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ Context context = Context::getDefault(err);
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 210
+
+ object_ = ::clCreateProgramWithIL(
+ context(), static_cast<const void*>(IL.data()), IL.size(), &error);
+
+#else // #if CL_HPP_TARGET_OPENCL_VERSION >= 210
+
+ typedef clCreateProgramWithILKHR_fn PFN_clCreateProgramWithILKHR;
+ static PFN_clCreateProgramWithILKHR pfn_clCreateProgramWithILKHR = NULL;
+ CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateProgramWithILKHR);
+
+        object_ = pfn_clCreateProgramWithILKHR(
+            context(), static_cast<const void*>(IL.data()), IL.size(), &error);
+
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210
+
+ detail::errHandler(error, __CREATE_PROGRAM_WITH_IL_ERR);
+
+ if (error == CL_SUCCESS && build) {
+
+ error = ::clBuildProgram(
+ object_,
+ 0,
+ NULL,
+#if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD)
+ "-cl-std=CL2.0",
+#else
+ "",
+#endif // #if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD)
+ NULL,
+ NULL);
+
+ detail::buildErrHandler(error, __BUILD_PROGRAM_ERR, getBuildInfo<CL_PROGRAM_BUILD_LOG>());
+ }
+
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ /**
+ * Program constructor to allow construction of program from SPIR-V or another IL
+ * for a specific context.
+ * Valid for either OpenCL >= 2.1 or when CL_HPP_USE_IL_KHR is defined.
+ */
+ Program(
+ const Context& context,
+ const vector<char>& IL,
+ bool build = false,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 210
+
+ object_ = ::clCreateProgramWithIL(
+ context(), static_cast<const void*>(IL.data()), IL.size(), &error);
+
+#else // #if CL_HPP_TARGET_OPENCL_VERSION >= 210
+
+ typedef clCreateProgramWithILKHR_fn PFN_clCreateProgramWithILKHR;
+ static PFN_clCreateProgramWithILKHR pfn_clCreateProgramWithILKHR = NULL;
+ CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateProgramWithILKHR);
+
+        object_ = pfn_clCreateProgramWithILKHR(
+            context(), static_cast<const void*>(IL.data()), IL.size(), &error);
+
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210
+
+ detail::errHandler(error, __CREATE_PROGRAM_WITH_IL_ERR);
+
+ if (error == CL_SUCCESS && build) {
+ error = ::clBuildProgram(
+ object_,
+ 0,
+ NULL,
+#if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD)
+ "-cl-std=CL2.0",
+#else
+ "",
+#endif // #if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD)
+ NULL,
+ NULL);
+
+ detail::buildErrHandler(error, __BUILD_PROGRAM_ERR, getBuildInfo<CL_PROGRAM_BUILD_LOG>());
+ }
+
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210
+
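
The two constructors above create a cl::Program from an intermediate-language binary, normally SPIR-V, either in the default context or in an explicit one; on a 2.0 target with CL_HPP_USE_IL_KHR they route through clCreateProgramWithILKHR instead of clCreateProgramWithIL. A sketch under assumptions: the file name is illustrative and the module is taken to target OpenCL 2.x:

#define CL_HPP_TARGET_OPENCL_VERSION 210
#include <CL/opencl.hpp>
#include <fstream>
#include <iterator>

int main() {
    // Load a SPIR-V module from disk (file name is illustrative).
    std::ifstream spv("kernels.spv", std::ios::binary);
    cl::vector<char> il((std::istreambuf_iterator<char>(spv)),
                        std::istreambuf_iterator<char>());

    cl_int err = CL_SUCCESS;
    cl::Context context = cl::Context::getDefault();

    // build = true compiles immediately; failures are reported through 'err'
    // (or as a BuildError when CL_HPP_ENABLE_EXCEPTIONS is defined).
    cl::Program program(context, il, /* build */ true, &err);

    return err == CL_SUCCESS ? 0 : 1;
}
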
/**
* Construct a program object from a list of devices and a per-device list of binaries.
* \param context A valid OpenCL context in which to construct the program.
@@ -6244,7 +6623,7 @@ public:
Program() { }
- /*! \brief Constructor from cl_mem - takes ownership.
+ /*! \brief Constructor from cl_program - takes ownership.
*
* \param retainObject will cause the constructor to retain its cl object.
* Defaults to false to maintain compatibility with
@@ -6313,6 +6692,27 @@ public:
}
cl_int build(
+ const Device& device,
+ const char* options = NULL,
+ void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+ void* data = NULL) const
+ {
+ cl_device_id deviceID = device();
+
+ cl_int buildError = ::clBuildProgram(
+ object_,
+ 1,
+ &deviceID,
+ options,
+ notifyFptr,
+ data);
+
+ BuildLogType buildLog(1);
+ buildLog.push_back(std::make_pair(device, getBuildInfo<CL_PROGRAM_BUILD_LOG>(device)));
+ return detail::buildErrHandler(buildError, __BUILD_PROGRAM_ERR, buildLog);
+ }
+
+ cl_int build(
const char* options = NULL,
void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
void* data = NULL) const
@@ -6325,7 +6725,6 @@ public:
notifyFptr,
data);
-
return detail::buildErrHandler(buildError, __BUILD_PROGRAM_ERR, getBuildInfo<CL_PROGRAM_BUILD_LOG>());
}
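
The new single-device build() overload above compiles for exactly one device and feeds that device's build log to the error handler, instead of collecting logs for every device in the program. A brief sketch, assuming an existing cl::Program named program created from source and a chosen cl::Device named device; the options string is illustrative:

cl_int buildErr = program.build(device, "-cl-std=CL1.2");
if (buildErr != CL_SUCCESS) {
    // The same per-device log can also be fetched directly.
    auto log = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device);
}
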
@@ -6357,7 +6756,7 @@ public:
__GET_PROGRAM_INFO_ERR);
}
- template <cl_int name> typename
+ template <cl_program_info name> typename
detail::param_traits<detail::cl_program_info, name>::param_type
getInfo(cl_int* err = NULL) const
{
@@ -6380,7 +6779,7 @@ public:
__GET_PROGRAM_BUILD_INFO_ERR);
}
- template <cl_int name> typename
+ template <cl_program_build_info name> typename
detail::param_traits<detail::cl_program_build_info, name>::param_type
getBuildInfo(const Device& device, cl_int* err = NULL) const
{
@@ -6398,7 +6797,7 @@ public:
* info type and for all devices in the program.
* On an error reading the info for any device, an empty vector of info will be returned.
*/
- template <cl_int name>
+ template <cl_program_build_info name>
vector<std::pair<cl::Device, typename detail::param_traits<detail::cl_program_build_info, name>::param_type>>
getBuildInfo(cl_int *err = NULL) const
{
@@ -6416,7 +6815,7 @@ public:
return devInfo;
}
- for (cl::Device d : devs) {
+ for (const cl::Device &d : devs) {
typename detail::param_traits<
detail::cl_program_build_info, name>::param_type param;
result = getBuildInfo(d, name, &param);
@@ -6466,6 +6865,64 @@ public:
}
return CL_SUCCESS;
}
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 220
+#if defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS)
+ /*! \brief Registers a callback function to be called when destructors for
+ * program scope global variables are complete and before the
+ * program is released.
+ *
+ * Wraps clSetProgramReleaseCallback().
+ *
+ * Each call to this function registers the specified user callback function
+ * on a callback stack associated with program. The registered user callback
+ * functions are called in the reverse order in which they were registered.
+ */
+ CL_API_PREFIX__VERSION_2_2_DEPRECATED cl_int setReleaseCallback(
+ void (CL_CALLBACK * pfn_notify)(cl_program program, void * user_data),
+ void * user_data = NULL) CL_API_SUFFIX__VERSION_2_2_DEPRECATED
+ {
+ return detail::errHandler(
+ ::clSetProgramReleaseCallback(
+ object_,
+ pfn_notify,
+ user_data),
+ __SET_PROGRAM_RELEASE_CALLBACK_ERR);
+ }
+#endif // #if defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS)
+
+ /*! \brief Sets a SPIR-V specialization constant.
+ *
+ * Wraps clSetProgramSpecializationConstant().
+ */
+ template <typename T>
+ typename std::enable_if<!std::is_pointer<T>::value, cl_int>::type
+ setSpecializationConstant(cl_uint index, const T &value)
+ {
+ return detail::errHandler(
+ ::clSetProgramSpecializationConstant(
+ object_,
+ index,
+ sizeof(value),
+ &value),
+ __SET_PROGRAM_SPECIALIZATION_CONSTANT_ERR);
+ }
+
+ /*! \brief Sets a SPIR-V specialization constant.
+ *
+ * Wraps clSetProgramSpecializationConstant().
+ */
+ cl_int setSpecializationConstant(cl_uint index, size_type size, const void* value)
+ {
+ return detail::errHandler(
+ ::clSetProgramSpecializationConstant(
+ object_,
+ index,
+ size,
+ value),
+ __SET_PROGRAM_SPECIALIZATION_CONSTANT_ERR);
+ }
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 220
};
#if CL_HPP_TARGET_OPENCL_VERSION >= 120
@@ -6587,6 +7044,22 @@ inline vector<vector<unsigned char>> cl::Program::getInfo<CL_PROGRAM_BINARIES>(c
return binariesVectors;
}
+#if CL_HPP_TARGET_OPENCL_VERSION >= 220
+// Template specialization for clSetProgramSpecializationConstant
+template <>
+inline cl_int cl::Program::setSpecializationConstant(cl_uint index, const bool &value)
+{
+ cl_uchar ucValue = value ? CL_UCHAR_MAX : 0;
+ return detail::errHandler(
+ ::clSetProgramSpecializationConstant(
+ object_,
+ index,
+ sizeof(ucValue),
+ &ucValue),
+ __SET_PROGRAM_SPECIALIZATION_CONSTANT_ERR);
+}
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 220
+
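
setSpecializationConstant() above wraps clSetProgramSpecializationConstant for programs created from SPIR-V; the bool specialization just added converts the value to a cl_uchar before forwarding it. A sketch, assuming CL_HPP_TARGET_OPENCL_VERSION >= 220 and a cl::Program named program constructed from IL but not yet built; the specialization ids and values are illustrative:

cl_uint tileSize = 128;
program.setSpecializationConstant(0, tileSize);      // SpecId 0, plain scalar

bool useFastPath = true;
program.setSpecializationConstant(1, useFastPath);   // SpecId 1, bool specialization

cl_int err = program.build();                        // constants take effect at build time
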
inline Kernel::Kernel(const Program& program, const char* name, cl_int* err)
{
cl_int error;
@@ -6697,31 +7170,49 @@ public:
}
else {
Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
+ bool useWithProperties;
-#if CL_HPP_TARGET_OPENCL_VERSION >= 200
- cl_queue_properties queue_properties[] = {
- CL_QUEUE_PROPERTIES, properties, 0 };
- if ((properties & CL_QUEUE_ON_DEVICE) == 0) {
- object_ = ::clCreateCommandQueueWithProperties(
- context(), device(), queue_properties, &error);
- }
- else {
- error = CL_INVALID_QUEUE_PROPERTIES;
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200 && CL_HPP_MINIMUM_OPENCL_VERSION < 200
+ // Run-time decision based on the actual platform
+ {
+ cl_uint version = detail::getContextPlatformVersion(context());
+ useWithProperties = (version >= 0x20000); // OpenCL 2.0 or above
}
+#elif CL_HPP_TARGET_OPENCL_VERSION >= 200
+ useWithProperties = true;
+#else
+ useWithProperties = false;
+#endif
- detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
- if (err != NULL) {
- *err = error;
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+ if (useWithProperties) {
+ cl_queue_properties queue_properties[] = {
+ CL_QUEUE_PROPERTIES, properties, 0 };
+ if ((properties & CL_QUEUE_ON_DEVICE) == 0) {
+ object_ = ::clCreateCommandQueueWithProperties(
+ context(), device(), queue_properties, &error);
+ }
+ else {
+ error = CL_INVALID_QUEUE_PROPERTIES;
+ }
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
}
-#else
- object_ = ::clCreateCommandQueue(
- context(), device(), properties, &error);
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200
+#if CL_HPP_MINIMUM_OPENCL_VERSION < 200
+ if (!useWithProperties) {
+ object_ = ::clCreateCommandQueue(
+ context(), device(), properties, &error);
- detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
- if (err != NULL) {
- *err = error;
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
}
-#endif
+#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 200
}
}
@@ -6745,28 +7236,46 @@ public:
}
else {
Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
+ bool useWithProperties;
-#if CL_HPP_TARGET_OPENCL_VERSION >= 200
- cl_queue_properties queue_properties[] = {
- CL_QUEUE_PROPERTIES, static_cast<cl_queue_properties>(properties), 0 };
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200 && CL_HPP_MINIMUM_OPENCL_VERSION < 200
+ // Run-time decision based on the actual platform
+ {
+ cl_uint version = detail::getContextPlatformVersion(context());
+ useWithProperties = (version >= 0x20000); // OpenCL 2.0 or above
+ }
+#elif CL_HPP_TARGET_OPENCL_VERSION >= 200
+ useWithProperties = true;
+#else
+ useWithProperties = false;
+#endif
- object_ = ::clCreateCommandQueueWithProperties(
- context(), device(), queue_properties, &error);
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+ if (useWithProperties) {
+ cl_queue_properties queue_properties[] = {
+ CL_QUEUE_PROPERTIES, static_cast<cl_queue_properties>(properties), 0 };
+ object_ = ::clCreateCommandQueueWithProperties(
+ context(), device(), queue_properties, &error);
- detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
- if (err != NULL) {
- *err = error;
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
}
-#else
- object_ = ::clCreateCommandQueue(
- context(), device(), static_cast<cl_command_queue_properties>(properties), &error);
-
- detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
- if (err != NULL) {
- *err = error;
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200
+#if CL_HPP_MINIMUM_OPENCL_VERSION < 200
+ if (!useWithProperties) {
+ object_ = ::clCreateCommandQueue(
+ context(), device(), static_cast<cl_command_queue_properties>(properties), &error);
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
}
-#endif
+#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 200
+
}
}
@@ -6780,6 +7289,7 @@ public:
cl_int* err = NULL)
{
cl_int error;
+ bool useWithProperties;
vector<cl::Device> devices;
error = context.getInfo(CL_CONTEXT_DEVICES, &devices);
@@ -6793,31 +7303,47 @@ public:
return;
}
-#if CL_HPP_TARGET_OPENCL_VERSION >= 200
- cl_queue_properties queue_properties[] = {
- CL_QUEUE_PROPERTIES, properties, 0 };
- if ((properties & CL_QUEUE_ON_DEVICE) == 0) {
- object_ = ::clCreateCommandQueueWithProperties(
- context(), devices[0](), queue_properties, &error);
- }
- else {
- error = CL_INVALID_QUEUE_PROPERTIES;
- }
-
- detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
- if (err != NULL) {
- *err = error;
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200 && CL_HPP_MINIMUM_OPENCL_VERSION < 200
+ // Run-time decision based on the actual platform
+ {
+ cl_uint version = detail::getContextPlatformVersion(context());
+ useWithProperties = (version >= 0x20000); // OpenCL 2.0 or above
}
+#elif CL_HPP_TARGET_OPENCL_VERSION >= 200
+ useWithProperties = true;
#else
- object_ = ::clCreateCommandQueue(
- context(), devices[0](), properties, &error);
+ useWithProperties = false;
+#endif
- detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
- if (err != NULL) {
- *err = error;
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+ if (useWithProperties) {
+ cl_queue_properties queue_properties[] = {
+ CL_QUEUE_PROPERTIES, properties, 0 };
+ if ((properties & CL_QUEUE_ON_DEVICE) == 0) {
+ object_ = ::clCreateCommandQueueWithProperties(
+ context(), devices[0](), queue_properties, &error);
+ }
+ else {
+ error = CL_INVALID_QUEUE_PROPERTIES;
+ }
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
}
-#endif
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200
+#if CL_HPP_MINIMUM_OPENCL_VERSION < 200
+ if (!useWithProperties) {
+ object_ = ::clCreateCommandQueue(
+ context(), devices[0](), properties, &error);
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 200
}
/*!
@@ -6830,6 +7356,7 @@ public:
cl_int* err = NULL)
{
cl_int error;
+ bool useWithProperties;
vector<cl::Device> devices;
error = context.getInfo(CL_CONTEXT_DEVICES, &devices);
@@ -6843,26 +7370,42 @@ public:
return;
}
-#if CL_HPP_TARGET_OPENCL_VERSION >= 200
- cl_queue_properties queue_properties[] = {
- CL_QUEUE_PROPERTIES, static_cast<cl_queue_properties>(properties), 0 };
- object_ = ::clCreateCommandQueueWithProperties(
- context(), devices[0](), queue_properties, &error);
-
- detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
- if (err != NULL) {
- *err = error;
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200 && CL_HPP_MINIMUM_OPENCL_VERSION < 200
+ // Run-time decision based on the actual platform
+ {
+ cl_uint version = detail::getContextPlatformVersion(context());
+ useWithProperties = (version >= 0x20000); // OpenCL 2.0 or above
}
+#elif CL_HPP_TARGET_OPENCL_VERSION >= 200
+ useWithProperties = true;
#else
- object_ = ::clCreateCommandQueue(
- context(), devices[0](), static_cast<cl_command_queue_properties>(properties), &error);
+ useWithProperties = false;
+#endif
- detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
- if (err != NULL) {
- *err = error;
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+ if (useWithProperties) {
+ cl_queue_properties queue_properties[] = {
+ CL_QUEUE_PROPERTIES, static_cast<cl_queue_properties>(properties), 0 };
+ object_ = ::clCreateCommandQueueWithProperties(
+ context(), devices[0](), queue_properties, &error);
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
}
-#endif
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200
+#if CL_HPP_MINIMUM_OPENCL_VERSION < 200
+ if (!useWithProperties) {
+ object_ = ::clCreateCommandQueue(
+ context(), devices[0](), static_cast<cl_command_queue_properties>(properties), &error);
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 200
}
/*!
@@ -6876,26 +7419,44 @@ public:
cl_int* err = NULL)
{
cl_int error;
+ bool useWithProperties;
-#if CL_HPP_TARGET_OPENCL_VERSION >= 200
- cl_queue_properties queue_properties[] = {
- CL_QUEUE_PROPERTIES, properties, 0 };
- object_ = ::clCreateCommandQueueWithProperties(
- context(), device(), queue_properties, &error);
-
- detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
- if (err != NULL) {
- *err = error;
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200 && CL_HPP_MINIMUM_OPENCL_VERSION < 200
+ // Run-time decision based on the actual platform
+ {
+ cl_uint version = detail::getContextPlatformVersion(context());
+ useWithProperties = (version >= 0x20000); // OpenCL 2.0 or above
}
+#elif CL_HPP_TARGET_OPENCL_VERSION >= 200
+ useWithProperties = true;
#else
- object_ = ::clCreateCommandQueue(
- context(), device(), properties, &error);
+ useWithProperties = false;
+#endif
- detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
- if (err != NULL) {
- *err = error;
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+ if (useWithProperties) {
+ cl_queue_properties queue_properties[] = {
+ CL_QUEUE_PROPERTIES, properties, 0 };
+ object_ = ::clCreateCommandQueueWithProperties(
+ context(), device(), queue_properties, &error);
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
}
-#endif
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200
+#if CL_HPP_MINIMUM_OPENCL_VERSION < 200
+ if (!useWithProperties) {
+ object_ = ::clCreateCommandQueue(
+ context(), device(), properties, &error);
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 200
}
/*!
@@ -6908,19 +7469,36 @@ public:
QueueProperties properties,
cl_int* err = NULL)
{
- cl_int error;
+ cl_int error;
+ bool useWithProperties;
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200 && CL_HPP_MINIMUM_OPENCL_VERSION < 200
+ // Run-time decision based on the actual platform
+ {
+ cl_uint version = detail::getContextPlatformVersion(context());
+ useWithProperties = (version >= 0x20000); // OpenCL 2.0 or above
+ }
+#elif CL_HPP_TARGET_OPENCL_VERSION >= 200
+ useWithProperties = true;
+#else
+ useWithProperties = false;
+#endif
#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+ if (useWithProperties) {
cl_queue_properties queue_properties[] = {
CL_QUEUE_PROPERTIES, static_cast<cl_queue_properties>(properties), 0 };
object_ = ::clCreateCommandQueueWithProperties(
context(), device(), queue_properties, &error);
-
+
detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
if (err != NULL) {
*err = error;
}
-#else
+ }
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200
+#if CL_HPP_MINIMUM_OPENCL_VERSION < 200
+ if (!useWithProperties) {
object_ = ::clCreateCommandQueue(
context(), device(), static_cast<cl_command_queue_properties>(properties), &error);
@@ -6928,8 +7506,9 @@ public:
if (err != NULL) {
*err = error;
}
-#endif
}
+#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 200
+ }
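
The CommandQueue constructors above now decide at run time between clCreateCommandQueueWithProperties and the legacy clCreateCommandQueue whenever the header targets 2.0+ but keeps pre-2.0 support compiled in (CL_HPP_MINIMUM_OPENCL_VERSION < 200). Callers are unaffected; a minimal sketch, assuming the default platform objects:

#define CL_HPP_MINIMUM_OPENCL_VERSION 120   // keep the clCreateCommandQueue fallback
#define CL_HPP_TARGET_OPENCL_VERSION 200
#include <CL/opencl.hpp>

int main() {
    cl_int err = CL_SUCCESS;
    cl::Context context = cl::Context::getDefault();
    cl::Device device = cl::Device::getDefault();

    // On an OpenCL 2.0+ platform this uses clCreateCommandQueueWithProperties;
    // on a 1.x platform the same call falls back to clCreateCommandQueue.
    cl::CommandQueue queue(context, device, cl::QueueProperties::Profiling, &err);

    return err == CL_SUCCESS ? 0 : 1;
}
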
static CommandQueue getDefault(cl_int * err = NULL)
{
@@ -6962,7 +7541,7 @@ public:
CommandQueue() { }
- /*! \brief Constructor from cl_mem - takes ownership.
+ /*! \brief Constructor from cl_command_queue - takes ownership.
*
* \param retainObject will cause the constructor to retain its cl object.
* Defaults to false to maintain compatibility with
@@ -7014,7 +7593,7 @@ public:
__GET_COMMAND_QUEUE_INFO_ERR);
}
- template <cl_int name> typename
+ template <cl_command_queue_info name> typename
detail::param_traits<detail::cl_command_queue_info, name>::param_type
getInfo(cl_int* err = NULL) const
{
@@ -7100,7 +7679,7 @@ public:
return err;
}
-
+#if CL_HPP_TARGET_OPENCL_VERSION >= 110
cl_int enqueueReadBufferRect(
const Buffer& buffer,
cl_bool blocking,
@@ -7215,7 +7794,7 @@ public:
return err;
}
-
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110
#if CL_HPP_TARGET_OPENCL_VERSION >= 120
/**
* Enqueue a command to fill a buffer object with a pattern
@@ -7647,7 +8226,7 @@ public:
{
cl_event tmp;
cl_int err = detail::errHandler(::clEnqueueSVMMap(
- object_, blocking, flags, static_cast<void*>(container.data()), container.size(),
+ object_, blocking, flags, static_cast<void*>(container.data()), container.size()*sizeof(T),
(events != NULL) ? (cl_uint)events->size() : 0,
(events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
(event != NULL) ? &tmp : NULL),
@@ -7773,7 +8352,7 @@ public:
*/
cl_int enqueueMarkerWithWaitList(
const vector<Event> *events = 0,
- Event *event = 0)
+ Event *event = 0) const
{
cl_event tmp;
cl_int err = detail::errHandler(
@@ -7803,7 +8382,7 @@ public:
*/
cl_int enqueueBarrierWithWaitList(
const vector<Event> *events = 0,
- Event *event = 0)
+ Event *event = 0) const
{
cl_event tmp;
cl_int err = detail::errHandler(
@@ -7829,7 +8408,7 @@ public:
cl_mem_migration_flags flags,
const vector<Event>* events = NULL,
Event* event = NULL
- )
+ ) const
{
cl_event tmp;
@@ -7838,8 +8417,7 @@ public:
for( int i = 0; i < (int)memObjects.size(); ++i ) {
localMemObjects[i] = memObjects[i]();
}
-
-
+
cl_int err = detail::errHandler(
::clEnqueueMigrateMemObjects(
object_,
@@ -7858,6 +8436,128 @@ public:
}
#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 210
+ /**
+     * Enqueues a command that will allow the host to associate ranges within a set of
+     * SVM allocations with a device.
+     * @param sizes - The number of bytes to migrate from each pointer.
+ */
+ template<typename T>
+ cl_int enqueueMigrateSVM(
+ const cl::vector<T*> &svmRawPointers,
+ const cl::vector<size_type> &sizes,
+ cl_mem_migration_flags flags = 0,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(::clEnqueueSVMMigrateMem(
+ object_,
+            svmRawPointers.size(), reinterpret_cast<const void**>(const_cast<T**>(svmRawPointers.data())),
+            sizes.data(), // per-pointer sizes; a zero entry migrates that allocation in full
+ flags,
+ (events != NULL) ? (cl_uint)events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_MIGRATE_SVM_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ /**
+     * Enqueues a command that will allow the host to associate a set of SVM allocations
+     * with a device.
+ */
+ template<typename T>
+ cl_int enqueueMigrateSVM(
+ const cl::vector<T*> &svmRawPointers,
+ cl_mem_migration_flags flags = 0,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ return enqueueMigrateSVM(svmRawPointers, cl::vector<size_type>(svmRawPointers.size()), flags, events, event);
+ }
+
+
+ /**
+     * Enqueues a command that will allow the host to associate ranges within a set of
+     * SVM allocations with a device.
+     * @param sizes - The number of bytes to migrate from each pointer.
+ */
+ template<typename T, class D>
+ cl_int enqueueMigrateSVM(
+ const cl::vector<cl::pointer<T, D>> &svmPointers,
+ const cl::vector<size_type> &sizes,
+ cl_mem_migration_flags flags = 0,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl::vector<void*> svmRawPointers;
+ svmRawPointers.reserve(svmPointers.size());
+        for (const auto &p : svmPointers) {
+ svmRawPointers.push_back(static_cast<void*>(p.get()));
+ }
+
+ return enqueueMigrateSVM(svmRawPointers, sizes, flags, events, event);
+ }
+
+
+ /**
+     * Enqueues a command that will allow the host to associate a set of SVM allocations
+     * with a device.
+ */
+ template<typename T, class D>
+ cl_int enqueueMigrateSVM(
+ const cl::vector<cl::pointer<T, D>> &svmPointers,
+ cl_mem_migration_flags flags = 0,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ return enqueueMigrateSVM(svmPointers, cl::vector<size_type>(svmPointers.size()), flags, events, event);
+ }
+
+ /**
+     * Enqueues a command that will allow the host to associate ranges within a set of
+     * SVM allocations with a device.
+     * @param sizes - The number of bytes to migrate from the beginning of each container.
+ */
+ template<typename T, class Alloc>
+ cl_int enqueueMigrateSVM(
+ const cl::vector<cl::vector<T, Alloc>> &svmContainers,
+ const cl::vector<size_type> &sizes,
+ cl_mem_migration_flags flags = 0,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl::vector<void*> svmRawPointers;
+ svmRawPointers.reserve(svmContainers.size());
+        for (const auto &p : svmContainers) {
+            svmRawPointers.push_back(const_cast<void*>(static_cast<const void*>(p.data())));
+ }
+
+ return enqueueMigrateSVM(svmRawPointers, sizes, flags, events, event);
+ }
+
+ /**
+     * Enqueues a command that will allow the host to associate a set of SVM allocations
+     * with a device.
+ */
+ template<typename T, class Alloc>
+ cl_int enqueueMigrateSVM(
+ const cl::vector<cl::vector<T, Alloc>> &svmContainers,
+ cl_mem_migration_flags flags = 0,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ return enqueueMigrateSVM(svmContainers, cl::vector<size_type>(svmContainers.size()), flags, events, event);
+ }
+
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210
+
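
The enqueueMigrateSVM() overloads above wrap clEnqueueSVMMigrateMem; the size-less variants pass zero-length entries, which migrates each allocation in its entirety. A hedged sketch using the raw-pointer overload, assuming CL_HPP_TARGET_OPENCL_VERSION >= 210, an existing cl::CommandQueue named queue and two coarse-grain SVM allocations a and b of bytesA and bytesB bytes:

cl::vector<float*> pointers{a, b};
cl::vector<cl::size_type> sizes{bytesA, bytesB};

cl::Event migrated;
queue.enqueueMigrateSVM(pointers, sizes,
                        CL_MIGRATE_MEM_OBJECT_HOST,   // bring the ranges back to the host
                        nullptr, &migrated);
migrated.wait();

// Without an explicit size list each allocation is migrated in full.
queue.enqueueMigrateSVM(pointers, CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED);
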
cl_int enqueueNDRangeKernel(
const Kernel& kernel,
const NDRange& offset,
@@ -7885,10 +8585,10 @@ public:
}
#if defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)
- CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_int enqueueTask(
+ CL_API_PREFIX__VERSION_1_2_DEPRECATED cl_int enqueueTask(
const Kernel& kernel,
const vector<Event>* events = NULL,
- Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
+ Event* event = NULL) const CL_API_SUFFIX__VERSION_1_2_DEPRECATED
{
cl_event tmp;
cl_int err = detail::errHandler(
@@ -7945,8 +8645,8 @@ public:
* Deprecated APIs for 1.2
*/
#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
- CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
- cl_int enqueueMarker(Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+ CL_API_PREFIX__VERSION_1_1_DEPRECATED
+ cl_int enqueueMarker(Event* event = NULL) const CL_API_SUFFIX__VERSION_1_1_DEPRECATED
{
cl_event tmp;
cl_int err = detail::errHandler(
@@ -7961,8 +8661,8 @@ public:
return err;
}
- CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
- cl_int enqueueWaitForEvents(const vector<Event>& events) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+ CL_API_PREFIX__VERSION_1_1_DEPRECATED
+ cl_int enqueueWaitForEvents(const vector<Event>& events) const CL_API_SUFFIX__VERSION_1_1_DEPRECATED
{
return detail::errHandler(
::clEnqueueWaitForEvents(
@@ -8098,8 +8798,8 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)(
* Deprecated APIs for 1.2
*/
#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
- CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
- cl_int enqueueBarrier() const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+ CL_API_PREFIX__VERSION_1_1_DEPRECATED
+ cl_int enqueueBarrier() const CL_API_SUFFIX__VERSION_1_1_DEPRECATED
{
return detail::errHandler(
::clEnqueueBarrier(object_),
@@ -8273,7 +8973,7 @@ public:
__GET_COMMAND_QUEUE_INFO_ERR);
}
- template <cl_int name> typename
+ template <cl_command_queue_info name> typename
detail::param_traits<detail::cl_command_queue_info, name>::param_type
getInfo(cl_int* err = NULL) const
{
@@ -8287,11 +8987,11 @@ public:
}
/*!
- * Create a new default device command queue for the default device,
- * in the default context and of the default size.
- * If there is already a default queue for the specified device this
- * function will return the pre-existing queue.
- */
+ * Create a new default device command queue for the default device,
+ * in the default context and of the default size.
+ * If there is already a default queue for the specified device this
+ * function will return the pre-existing queue.
+ */
static DeviceCommandQueue makeDefault(
cl_int *err = nullptr)
{
@@ -8317,11 +9017,11 @@ public:
}
/*!
- * Create a new default device command queue for the specified device
- * and of the default size.
- * If there is already a default queue for the specified device this
- * function will return the pre-existing queue.
- */
+ * Create a new default device command queue for the specified device
+ * and of the default size.
+ * If there is already a default queue for the specified device this
+ * function will return the pre-existing queue.
+ */
static DeviceCommandQueue makeDefault(
const Context &context, const Device &device, cl_int *err = nullptr)
{
@@ -8372,6 +9072,37 @@ public:
return deviceQueue;
}
+
+
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 210
+ /*!
+ * Modify the default device command queue used by subsequently enqueued kernels.
+ * This may be called repeatedly to replace a device's default command queue, so
+ * that kernels which rely on the default pick up the new queue.
+ * @return updated default device command queue.
+ */
+ static DeviceCommandQueue updateDefault(const Context &context, const Device &device, const DeviceCommandQueue &default_queue, cl_int *err = nullptr)
+ {
+ cl_int error;
+ error = clSetDefaultDeviceCommandQueue(context.get(), device.get(), default_queue.get());
+
+ detail::errHandler(error, __SET_DEFAULT_DEVICE_COMMAND_QUEUE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ return default_queue;
+ }
+
+ /*!
+ * Return the current default device command queue associated with the specified host command queue.
+ */
+ static DeviceCommandQueue getDefault(const CommandQueue &queue, cl_int * err = NULL)
+ {
+ return queue.getInfo<CL_QUEUE_DEVICE_DEFAULT>(err);
+ }
+
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210
}; // DeviceCommandQueue
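A brief sketch of the new OpenCL 2.1 helpers above (editor's illustration, not part of the patch), assuming valid `context` and `device` objects:

    // Create a larger on-device queue and install it as the device's new default.
    cl::DeviceCommandQueue bigQueue(context, device, 512 * 1024 /* queue size in bytes */);
    cl::DeviceCommandQueue::updateDefault(context, device, bigQueue);

    // Ask a host queue which device-side queue it currently reports as the default.
    cl::CommandQueue hostQueue(context, device);
    cl::DeviceCommandQueue current = cl::DeviceCommandQueue::getDefault(hostQueue);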
namespace detail
@@ -9425,7 +10156,7 @@ public:
}
template<typename T0, typename... T1s>
- cl_int setSVMPointers(const T0 &t0, T1s... ts)
+ cl_int setSVMPointers(const T0 &t0, T1s &... ts)
{
return kernel_.setSVMPointers(t0, ts...);
}
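For context, an editor's sketch (not part of the patch) of how the functor-level setSVMPointers() is typically used: it forwards to cl::Kernel::setSVMPointers(), which registers SVM allocations the kernel dereferences indirectly rather than receives as arguments. `program`, `queue`, `nodePoolA`, `nodePoolB` and `headPtr` are assumed to exist:

    auto walkList = cl::KernelFunctor<void *>(program, "walk_list");
    walkList.setSVMPointers(nodePoolA, nodePoolB);             // indirectly accessed SVM blocks
    walkList(cl::EnqueueArgs(queue, cl::NDRange(1)), headPtr); // headPtr is passed as a kernel argument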
@@ -9439,7 +10170,7 @@ public:
namespace compatibility {
/**
- * Backward compatibility class to ensure that cl.hpp code works with cl2.hpp.
+ * Backward compatibility class to ensure that cl.hpp code works with opencl.hpp.
* Please use KernelFunctor directly.
*/
template<typename... Ts>
@@ -9484,73 +10215,101 @@ namespace compatibility {
#undef CL_HPP_ERR_STR_
#if !defined(CL_HPP_USER_OVERRIDE_ERROR_STRINGS)
-#undef __GET_DEVICE_INFO_ERR
-#undef __GET_PLATFORM_INFO_ERR
-#undef __GET_DEVICE_IDS_ERR
-#undef __GET_CONTEXT_INFO_ERR
-#undef __GET_EVENT_INFO_ERR
-#undef __GET_EVENT_PROFILE_INFO_ERR
-#undef __GET_MEM_OBJECT_INFO_ERR
-#undef __GET_IMAGE_INFO_ERR
-#undef __GET_SAMPLER_INFO_ERR
-#undef __GET_KERNEL_INFO_ERR
-#undef __GET_KERNEL_ARG_INFO_ERR
-#undef __GET_KERNEL_WORK_GROUP_INFO_ERR
-#undef __GET_PROGRAM_INFO_ERR
-#undef __GET_PROGRAM_BUILD_INFO_ERR
-#undef __GET_COMMAND_QUEUE_INFO_ERR
-
-#undef __CREATE_CONTEXT_ERR
-#undef __CREATE_CONTEXT_FROM_TYPE_ERR
-#undef __GET_SUPPORTED_IMAGE_FORMATS_ERR
-
-#undef __CREATE_BUFFER_ERR
-#undef __CREATE_SUBBUFFER_ERR
-#undef __CREATE_IMAGE2D_ERR
-#undef __CREATE_IMAGE3D_ERR
-#undef __CREATE_SAMPLER_ERR
-#undef __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR
-
-#undef __CREATE_USER_EVENT_ERR
-#undef __SET_USER_EVENT_STATUS_ERR
-#undef __SET_EVENT_CALLBACK_ERR
-#undef __SET_PRINTF_CALLBACK_ERR
-
-#undef __WAIT_FOR_EVENTS_ERR
-
-#undef __CREATE_KERNEL_ERR
-#undef __SET_KERNEL_ARGS_ERR
-#undef __CREATE_PROGRAM_WITH_SOURCE_ERR
-#undef __CREATE_PROGRAM_WITH_BINARY_ERR
-#undef __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR
-#undef __BUILD_PROGRAM_ERR
-#undef __CREATE_KERNELS_IN_PROGRAM_ERR
-
-#undef __CREATE_COMMAND_QUEUE_ERR
-#undef __SET_COMMAND_QUEUE_PROPERTY_ERR
-#undef __ENQUEUE_READ_BUFFER_ERR
-#undef __ENQUEUE_WRITE_BUFFER_ERR
-#undef __ENQUEUE_READ_BUFFER_RECT_ERR
-#undef __ENQUEUE_WRITE_BUFFER_RECT_ERR
-#undef __ENQEUE_COPY_BUFFER_ERR
-#undef __ENQEUE_COPY_BUFFER_RECT_ERR
-#undef __ENQUEUE_READ_IMAGE_ERR
-#undef __ENQUEUE_WRITE_IMAGE_ERR
-#undef __ENQUEUE_COPY_IMAGE_ERR
-#undef __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR
-#undef __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR
-#undef __ENQUEUE_MAP_BUFFER_ERR
-#undef __ENQUEUE_MAP_IMAGE_ERR
-#undef __ENQUEUE_UNMAP_MEM_OBJECT_ERR
-#undef __ENQUEUE_NDRANGE_KERNEL_ERR
-#undef __ENQUEUE_TASK_ERR
-#undef __ENQUEUE_NATIVE_KERNEL
-
-#undef __UNLOAD_COMPILER_ERR
-#undef __CREATE_SUB_DEVICES_ERR
-
-#undef __CREATE_PIPE_ERR
-#undef __GET_PIPE_INFO_ERR
+#undef __GET_DEVICE_INFO_ERR
+#undef __GET_PLATFORM_INFO_ERR
+#undef __GET_DEVICE_IDS_ERR
+#undef __GET_PLATFORM_IDS_ERR
+#undef __GET_CONTEXT_INFO_ERR
+#undef __GET_EVENT_INFO_ERR
+#undef __GET_EVENT_PROFILE_INFO_ERR
+#undef __GET_MEM_OBJECT_INFO_ERR
+#undef __GET_IMAGE_INFO_ERR
+#undef __GET_SAMPLER_INFO_ERR
+#undef __GET_KERNEL_INFO_ERR
+#undef __GET_KERNEL_ARG_INFO_ERR
+#undef __GET_KERNEL_SUB_GROUP_INFO_ERR
+#undef __GET_KERNEL_WORK_GROUP_INFO_ERR
+#undef __GET_PROGRAM_INFO_ERR
+#undef __GET_PROGRAM_BUILD_INFO_ERR
+#undef __GET_COMMAND_QUEUE_INFO_ERR
+#undef __CREATE_CONTEXT_ERR
+#undef __CREATE_CONTEXT_FROM_TYPE_ERR
+#undef __GET_SUPPORTED_IMAGE_FORMATS_ERR
+#undef __CREATE_BUFFER_ERR
+#undef __COPY_ERR
+#undef __CREATE_SUBBUFFER_ERR
+#undef __CREATE_GL_BUFFER_ERR
+#undef __CREATE_GL_RENDER_BUFFER_ERR
+#undef __GET_GL_OBJECT_INFO_ERR
+#undef __CREATE_IMAGE_ERR
+#undef __CREATE_GL_TEXTURE_ERR
+#undef __IMAGE_DIMENSION_ERR
+#undef __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR
+#undef __CREATE_USER_EVENT_ERR
+#undef __SET_USER_EVENT_STATUS_ERR
+#undef __SET_EVENT_CALLBACK_ERR
+#undef __WAIT_FOR_EVENTS_ERR
+#undef __CREATE_KERNEL_ERR
+#undef __SET_KERNEL_ARGS_ERR
+#undef __CREATE_PROGRAM_WITH_SOURCE_ERR
+#undef __CREATE_PROGRAM_WITH_IL_ERR
+#undef __CREATE_PROGRAM_WITH_BINARY_ERR
+#undef __CREATE_PROGRAM_WITH_IL_ERR
+#undef __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR
+#undef __BUILD_PROGRAM_ERR
+#undef __COMPILE_PROGRAM_ERR
+#undef __LINK_PROGRAM_ERR
+#undef __CREATE_KERNELS_IN_PROGRAM_ERR
+#undef __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR
+#undef __CREATE_SAMPLER_WITH_PROPERTIES_ERR
+#undef __SET_COMMAND_QUEUE_PROPERTY_ERR
+#undef __ENQUEUE_READ_BUFFER_ERR
+#undef __ENQUEUE_READ_BUFFER_RECT_ERR
+#undef __ENQUEUE_WRITE_BUFFER_ERR
+#undef __ENQUEUE_WRITE_BUFFER_RECT_ERR
+#undef __ENQEUE_COPY_BUFFER_ERR
+#undef __ENQEUE_COPY_BUFFER_RECT_ERR
+#undef __ENQUEUE_FILL_BUFFER_ERR
+#undef __ENQUEUE_READ_IMAGE_ERR
+#undef __ENQUEUE_WRITE_IMAGE_ERR
+#undef __ENQUEUE_COPY_IMAGE_ERR
+#undef __ENQUEUE_FILL_IMAGE_ERR
+#undef __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR
+#undef __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR
+#undef __ENQUEUE_MAP_BUFFER_ERR
+#undef __ENQUEUE_MAP_IMAGE_ERR
+#undef __ENQUEUE_UNMAP_MEM_OBJECT_ERR
+#undef __ENQUEUE_NDRANGE_KERNEL_ERR
+#undef __ENQUEUE_NATIVE_KERNEL
+#undef __ENQUEUE_MIGRATE_MEM_OBJECTS_ERR
+#undef __ENQUEUE_MIGRATE_SVM_ERR
+#undef __ENQUEUE_ACQUIRE_GL_ERR
+#undef __ENQUEUE_RELEASE_GL_ERR
+#undef __CREATE_PIPE_ERR
+#undef __GET_PIPE_INFO_ERR
+#undef __RETAIN_ERR
+#undef __RELEASE_ERR
+#undef __FLUSH_ERR
+#undef __FINISH_ERR
+#undef __VECTOR_CAPACITY_ERR
+#undef __CREATE_SUB_DEVICES_ERR
+#undef __CREATE_SUB_DEVICES_ERR
+#undef __ENQUEUE_MARKER_ERR
+#undef __ENQUEUE_WAIT_FOR_EVENTS_ERR
+#undef __ENQUEUE_BARRIER_ERR
+#undef __UNLOAD_COMPILER_ERR
+#undef __CREATE_GL_TEXTURE_2D_ERR
+#undef __CREATE_GL_TEXTURE_3D_ERR
+#undef __CREATE_IMAGE2D_ERR
+#undef __CREATE_IMAGE3D_ERR
+#undef __CREATE_COMMAND_QUEUE_ERR
+#undef __ENQUEUE_TASK_ERR
+#undef __CREATE_SAMPLER_ERR
+#undef __ENQUEUE_MARKER_WAIT_LIST_ERR
+#undef __ENQUEUE_BARRIER_WAIT_LIST_ERR
+#undef __CLONE_KERNEL_ERR
+#undef __GET_HOST_TIMER_ERR
+#undef __GET_DEVICE_AND_HOST_TIMER_ERR
#endif //CL_HPP_USER_OVERRIDE_ERROR_STRINGS
diff --git a/src/common/cpuinfo/CpuInfo.cpp b/src/common/cpuinfo/CpuInfo.cpp
index 32504acc4..2b8537565 100644
--- a/src/common/cpuinfo/CpuInfo.cpp
+++ b/src/common/cpuinfo/CpuInfo.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,10 +40,13 @@
#include <unordered_map>
#endif /* !defined(BARE_METAL) */
-#if !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__))
+#if !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__))
#include <asm/hwcap.h> /* Get HWCAP bits from asm/hwcap.h */
#include <sys/auxv.h>
-#endif /* !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__)) */
+#elif defined(__APPLE__) && defined(__aarch64__)
+#include <sys/sysctl.h>
+#include <sys/types.h>
+#endif /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */
#define ARM_COMPUTE_CPU_FEATURE_HWCAP_CPUID (1 << 11)
#define ARM_COMPUTE_GET_FEATURE_REG(var, freg) __asm __volatile("MRS %0, " #freg \
@@ -54,7 +57,7 @@ namespace cpuinfo
{
namespace
{
-#if !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__))
+#if !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__))
/** Extract MIDR using CPUID information that is exposed to user-space
*
* @param[in] max_num_cpus Maximum number of possible CPUs
@@ -258,7 +261,19 @@ int get_max_cpus()
}
return max_cpus;
}
-#endif /* !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__)) */
+#elif defined(__aarch64__) && defined(__APPLE__) /* !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__)) */
+/** Query features through sysctlbyname
+ *
+ * @return int value queried
+ */
+int get_hw_capability(const std::string& cap)
+{
+ int64_t result(0);
+ size_t size = sizeof(result);
+ sysctlbyname(cap.c_str(), &result, &size, NULL, 0);
+ return result;
+}
+#endif /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */
#if defined(BARE_METAL) && defined(__aarch64__)
uint64_t get_sve_feature_reg()
@@ -282,7 +297,7 @@ CpuInfo::CpuInfo(CpuIsaInfo isa, std::vector<CpuModel> cpus)
CpuInfo CpuInfo::build()
{
-#if !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__))
+#if !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__))
const uint32_t hwcaps = getauxval(AT_HWCAP);
const uint32_t hwcaps2 = getauxval(AT_HWCAP2);
const uint32_t max_cpus = get_max_cpus();
@@ -313,7 +328,7 @@ CpuInfo CpuInfo::build()
CpuInfo info(isa, cpus_model);
return info;
-#elif(BARE_METAL) && defined(__aarch64__) /* !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__)) */
+#elif(BARE_METAL) && defined(__aarch64__) /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */
// Assume single CPU in bare metal mode. Just read the ID register and feature bits directly.
uint64_t isar0 = 0, isar1 = 0, pfr0 = 0, svefr0 = 0, midr = 0;
@@ -330,10 +345,19 @@ CpuInfo CpuInfo::build()
std::vector<CpuModel> cpus_model(1, midr_to_model(midr));
CpuInfo info(isa, cpus_model);
return info;
-#else /* #elif(BARE_METAL) && defined(__aarch64__) */
+#elif defined(__aarch64__) && defined(__APPLE__) /* #elif(BARE_METAL) && defined(__aarch64__) */
+ int ncpus = get_hw_capability("hw.logicalcpu");
+ CpuIsaInfo isainfo;
+ std::vector<CpuModel> cpus_model(ncpus);
+ isainfo.neon = get_hw_capability("hw.optional.neon");
+ isainfo.fp16 = get_hw_capability("hw.optional.neon_fp16");
+ isainfo.dot = get_hw_capability("hw.optional.arm.FEAT_DotProd");
+ CpuInfo info(isainfo,cpus_model);
+ return info;
+#else /* #elif defined(__aarch64__) && defined(__APPLE__) */
CpuInfo info(CpuIsaInfo(), { CpuModel::GENERIC });
return info;
-#endif /* !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__)) */
+#endif /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */
}
CpuModel CpuInfo::cpu_model(uint32_t cpuid) const
@@ -347,11 +371,11 @@ CpuModel CpuInfo::cpu_model(uint32_t cpuid) const
CpuModel CpuInfo::cpu_model() const
{
-#if defined(BARE_METAL) || defined(__APPLE__) || (!defined(__arm__) && !defined(__aarch64__))
+#if defined(BARE_METAL) || defined(__APPLE__) || defined(__OpenBSD__) || (!defined(__arm__) && !defined(__aarch64__))
return cpu_model(0);
-#else /* defined(BARE_METAL) || defined(__APPLE__) || (!defined(__arm__) && !defined(__aarch64__)) */
+#else /* defined(BARE_METAL) || defined(__APPLE__) || defined(__OpenBSD__) || (!defined(__arm__) && !defined(__aarch64__)) */
return cpu_model(sched_getcpu());
-#endif /* defined(BARE_METAL) || defined(__APPLE__) || (!defined(__arm__) && !defined(__aarch64__)) */
+#endif /* defined(BARE_METAL) || defined(__APPLE__) || defined(__OpenBSD__) || (!defined(__arm__) && !defined(__aarch64__)) */
}
uint32_t CpuInfo::num_cpus() const
@@ -415,4 +439,4 @@ uint32_t num_threads_hint()
return num_threads_hint;
}
} // namespace cpuinfo
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
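The new Apple-silicon path above queries feature bits through sysctlbyname(); as a standalone illustration of that pattern (editor's sketch, not library code):

    #include <sys/sysctl.h>

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        int64_t value  = 0;
        size_t  length = sizeof(value);
        // Reports 1 when the Armv8 dot-product instructions are available on Apple silicon.
        if(sysctlbyname("hw.optional.arm.FEAT_DotProd", &value, &length, nullptr, 0) == 0)
        {
            std::printf("FEAT_DotProd: %lld\n", static_cast<long long>(value));
        }
        return 0;
    }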
diff --git a/src/common/cpuinfo/CpuModel.h b/src/common/cpuinfo/CpuModel.h
index 4bd294e32..4fe6c29e5 100644
--- a/src/common/cpuinfo/CpuModel.h
+++ b/src/common/cpuinfo/CpuModel.h
@@ -55,7 +55,7 @@ CpuModel midr_to_model(uint32_t midr);
*
* @note This is used in case of old kernel configurations where some capabilities are not exposed.
*
- * @param[in] model Model to check for whitelisted capabilities
+ * @param[in] model Model to check for allowlisted capabilities
*/
bool model_supports_fp16(CpuModel model);
@@ -63,7 +63,7 @@ bool model_supports_fp16(CpuModel model);
*
* @note This is used in case of old kernel configurations where some capabilities are not exposed.
*
- * @param[in] model Model to check for whitelisted capabilities
+ * @param[in] model Model to check for allowlisted capabilities
*/
bool model_supports_dot(CpuModel model);
} // namespace cpuinfo
diff --git a/src/core/CL/CLKernels.h b/src/core/CL/CLKernels.h
index 0c295aae6..63be7b1ea 100644
--- a/src/core/CL/CLKernels.h
+++ b/src/core/CL/CLKernels.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -56,7 +56,6 @@
#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h"
#include "src/core/CL/kernels/CLRangeKernel.h"
#include "src/core/CL/kernels/CLReductionOperationKernel.h"
-#include "src/core/CL/kernels/CLRemapKernel.h"
#include "src/core/CL/kernels/CLReorgLayerKernel.h"
#include "src/core/CL/kernels/CLReverseKernel.h"
#include "src/core/CL/kernels/CLSelectKernel.h"
diff --git a/src/core/CL/CLUtils.cpp b/src/core/CL/CLUtils.cpp
index 5de9a5568..8f39c2d70 100644
--- a/src/core/CL/CLUtils.cpp
+++ b/src/core/CL/CLUtils.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index 9ba17d0e0..9bbc710c8 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -116,6 +116,58 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons
ARM_COMPUTE_UNUSED(idx_start);
}
+void ICLKernel::add_3d_tensor_nhw_argument(unsigned int &idx, const ICLTensor *tensor)
+{
+ ARM_COMPUTE_ERROR_ON(tensor == nullptr);
+
+ const ITensorInfo *info = tensor->info();
+ ARM_COMPUTE_ERROR_ON(info == nullptr);
+ const Strides &strides = info->strides_in_bytes();
+
+ // Tensor pointer
+ _kernel.setArg(idx++, tensor->cl_buffer());
+
+ // Add stride_y, stride_z
+ _kernel.setArg<cl_uint>(idx++, strides[1]);
+ _kernel.setArg<cl_uint>(idx++, strides[2]);
+
+ // Tensor dimensions
+ _kernel.setArg<cl_uint>(idx++, info->dimension(0));
+ _kernel.setArg<cl_uint>(idx++, info->dimension(1));
+ _kernel.setArg<cl_uint>(idx++, info->dimension(2));
+
+ // Offset of first element
+ unsigned int offset_first_element = info->offset_first_element_in_bytes();
+ _kernel.setArg<cl_uint>(idx++, offset_first_element);
+}
+
+void ICLKernel::add_4d_tensor_nhwc_argument(unsigned int &idx, const ICLTensor *tensor)
+{
+ ARM_COMPUTE_ERROR_ON(tensor == nullptr);
+
+ const ITensorInfo *info = tensor->info();
+ ARM_COMPUTE_ERROR_ON(info == nullptr);
+ const Strides &strides = info->strides_in_bytes();
+
+ // Tensor pointer
+ _kernel.setArg(idx++, tensor->cl_buffer());
+
+ // Add stride_y, stride_z and stride_w
+ _kernel.setArg<cl_uint>(idx++, strides[1]);
+ _kernel.setArg<cl_uint>(idx++, strides[2]);
+ _kernel.setArg<cl_uint>(idx++, strides[3]);
+
+ // Tensor dimensions
+ _kernel.setArg<cl_uint>(idx++, info->dimension(0));
+ _kernel.setArg<cl_uint>(idx++, info->dimension(1));
+ _kernel.setArg<cl_uint>(idx++, info->dimension(2));
+ _kernel.setArg<cl_uint>(idx++, info->dimension(3));
+
+ // Offset of first element
+ unsigned int offset_first_element = info->offset_first_element_in_bytes();
+ _kernel.setArg<cl_uint>(idx++, offset_first_element);
+}
+
#ifndef DOXYGEN_SKIP_THIS
template void ICLKernel::add_tensor_argument<1>(unsigned &idx, const ICLTensor *tensor, const Window &window);
template void ICLKernel::add_tensor_argument<2>(unsigned &idx, const ICLTensor *tensor, const Window &window);
diff --git a/src/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h
index 3b3217d1d..bc138e7e3 100644
--- a/src/core/CL/ICLKernel.h
+++ b/src/core/CL/ICLKernel.h
@@ -225,6 +225,41 @@ public:
{
add_tensor_argument<4>(idx, tensor, window);
}
+
+ /** Add the passed NHW 3D tensor's parameters to the object's kernel's arguments by passing strides, dimensions and the offset to the first valid element in bytes.
+ *
+ * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+ * @param[in] tensor Tensor to set as an argument of the object's kernel.
+ */
+ void add_3d_tensor_nhw_argument(unsigned int &idx, const ICLTensor *tensor);
+
+ /** Returns the number of arguments enqueued per NHW 3D Tensor object.
+ *
+ * @return The number of arguments enqueued per NHW 3D Tensor object.
+ */
+ constexpr static unsigned int num_arguments_per_3d_tensor_nhw()
+ {
+ constexpr unsigned int no_args_per_3d_tensor_nhw = 7u;
+ return no_args_per_3d_tensor_nhw;
+ }
+
+ /** Add the passed NHWC 4D tensor's parameters to the object's kernel's arguments by passing strides, dimensions and the offset to the first valid element in bytes.
+ *
+ * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+ * @param[in] tensor Tensor to set as an argument of the object's kernel.
+ */
+ void add_4d_tensor_nhwc_argument(unsigned int &idx, const ICLTensor *tensor);
+
+ /** Returns the number of arguments enqueued per NHWC 4D Tensor object.
+ *
+ * @return The number of arguments enqueued per NHWC 4D Tensor object.
+ */
+ constexpr static unsigned int num_arguments_per_4d_tensor_nhwc()
+ {
+ constexpr unsigned int no_args_per_4d_tensor_nhwc = 9u;
+ return no_args_per_4d_tensor_nhwc;
+ }
+
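A minimal sketch of how a kernel's configure step could use the helpers above (editor's illustration with hypothetical ICLTensor pointers `src` and `dst`, not code from this patch):

    unsigned int idx = 0;
    add_3d_tensor_nhw_argument(idx, src); // sets 7 arguments: buffer, stride_y, stride_z, the 3 dimensions, byte offset
    add_3d_tensor_nhw_argument(idx, dst);
    // idx now equals 2 * num_arguments_per_3d_tensor_nhw()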
/** Returns the number of arguments enqueued per 1D array object.
*
* @return The number of arguments enqueued per 1D array object.
diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp
index d8c2736ef..d5034ba8f 100644
--- a/src/core/CL/OpenCL.cpp
+++ b/src/core/CL/OpenCL.cpp
@@ -163,7 +163,7 @@ bool opencl_is_available()
// hold their state, we call a harmless OpenCL function (clGetPlatformIDs
// with invalid parameters must result in CL_INVALID_VALUE) to ensure the
// runtimes have a chance to initialize their static objects first. Thanks
- // to C++11 rules about normal program termination (cf [basic.start]), this
+ // to C++11 rules about normal program completion (cf [basic.start]), this
// ensures their static objects are destroyed last, i.e. after the
// singleton CLScheduler is destroyed.
//
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl
index 4665d612f..d8453ed80 100644
--- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl
+++ b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl
@@ -27,7 +27,7 @@
#include "repeat.h"
/** (EXPERIMENTAL_POST_OPS) gemm_mm_native kernel */
-#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
+#if defined(M0) && defined(N0) && defined(K0) && defined(DATA_TYPE) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
#if defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
#define VFMA(a, b, c) \
@@ -107,6 +107,7 @@
#error "M0 not supported"
#endif // M0 not supported
+#if defined(GEMM_MM_NATIVE_POST_ACT_ELTWISE_OP_ACT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
* Post op 1: activation (optional)
* Post op 2: elementwise op
@@ -140,8 +141,11 @@ __kernel void gemm_mm_native_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
#if defined(BETA)
uint bias_stride_z,
#endif //defined(BETA)
- uint dst_stride_z,
- uint eltwise_operand_stride_z
+ uint dst_stride_z,
+ uint eltwise_operand_stride_z,
+ const int M,
+ const int N,
+ const int K
#if defined(REINTERPRET_INPUT_AS_3D)
,
uint lhs_cross_plane_pad
@@ -360,5 +364,6 @@ __kernel void gemm_mm_native_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
// Store output block
STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
}
+#endif // defined(GEMM_MM_NATIVE_POST_ACT_ELTWISE_OP_ACT)
#endif // defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(DATA_TYPE) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
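Since M, N and K are now trailing kernel arguments instead of -DM/-DN/-DK build options, the host sets them when configuring the kernel, and the same compiled binary can be reused across GEMM shapes that share the remaining compile-time options. A hedged sketch with illustrative names (not code from this patch):

    // idx points just past the tensor and stride arguments that were already set.
    _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(m));
    _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(n));
    _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(k));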
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
index 32186c359..89577e9eb 100644
--- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
+++ b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
@@ -27,7 +27,7 @@
/** (EXPERIMENTAL_POST_OPS) gemm_mm_reshaped kernel */
-#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N)
+#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR)
#if defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
#if defined(MIXED_PRECISION)
@@ -207,6 +207,7 @@
#error "N0 value not supported"
#endif // N0 conditions
+#if defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_POST_ACT_ELTWISE_OP_ACT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
* Post op 1: activation (optional)
* Post op 2: elementwise op
@@ -235,7 +236,6 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_post_act_eltwise_op_act(IMAGE_DECLAR
IMAGE_DECLARATION(dst),
// Post Op arguments
IMAGE_DECLARATION(eltwise_operand),
- uint k,
uint lhs_stride_z,
uint rhs_stride_z,
#if defined(BETA)
@@ -247,7 +247,10 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_post_act_eltwise_op_act(IMAGE_DECLAR
,
uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
- )
+ ,
+ const int M,
+ const int N,
+ const int K)
{
// Block size
#define LHS_BLOCK_SIZE ((K0) * (M0))
@@ -303,7 +306,7 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_post_act_eltwise_op_act(IMAGE_DECLAR
REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
- for(int i = 0; i < k; i += K0)
+ for(int i = 0; i < K; i += K0)
{
// Supported cases (M0, K0):
// 1,2 - 1,3 - 1,4 - 1,8 - 1,16
@@ -425,8 +428,9 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_post_act_eltwise_op_act(IMAGE_DECLAR
#undef LHS_STEP_LOOP
#undef RHS_STEP_LOOP
}
+#endif // defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_POST_ACT_ELTWISE_OP_ACT)
-#if defined(OPENCL_IMAGE_SUPPORT)
+#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object.
* Post op 1: activation (optional)
* Post op 2: elementwise op
@@ -455,7 +459,6 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture_post_act_eltwise_op_act(IMAG
IMAGE_DECLARATION(dst),
// Post Op arguments
IMAGE_DECLARATION(eltwise_operand),
- uint k,
uint lhs_stride_z,
uint rhs_stride_z,
#if defined(BETA)
@@ -467,7 +470,10 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture_post_act_eltwise_op_act(IMAG
,
uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
- )
+ ,
+ const int M,
+ const int N,
+ const int K)
{
// Pixel unit
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)
@@ -643,7 +649,7 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture_post_act_eltwise_op_act(IMAG
#undef LHS_STEP_LOOP
#undef RHS_STEP_LOOP
}
-#endif // defined(OPENCL_IMAGE_SUPPORT)
+#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
#if defined(LHS_TRANSPOSE)
@@ -755,6 +761,7 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture_post_act_eltwise_op_act(IMAG
CONCAT(ARM_MM_T_NT_M0xN0x, K0) \
(M0, N0, TYPE, A, B, C)
+#if defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_POST_ACT_ELTWISE_OP_ACT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
* Post op 1: activation (optional)
* Post op 2: elementwise op
@@ -774,6 +781,9 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture_post_act_eltwise_op_act(IMAG
* @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
* @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
*/
__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
IMAGE_DECLARATION(rhs),
@@ -783,7 +793,6 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_post_act_eltwise_op_act(IMAGE_DECLAR
IMAGE_DECLARATION(dst),
// Post Op arguments
IMAGE_DECLARATION(eltwise_operand),
- uint k,
uint lhs_stride_z,
uint rhs_stride_z,
#if defined(BETA)
@@ -795,7 +804,10 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_post_act_eltwise_op_act(IMAGE_DECLAR
,
uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
- )
+ ,
+ const int M,
+ const int N,
+ const int K)
{
// Block size
#define LHS_BLOCK_SIZE ((K0) * (M0))
@@ -858,7 +870,7 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_post_act_eltwise_op_act(IMAGE_DECLAR
__global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
__global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr);
- for(int i = 0; i < k; i += K0)
+ for(int i = 0; i < K; i += K0)
{
VEC_DATA_TYPE(DATA_TYPE, M0)
a0;
@@ -1083,7 +1095,9 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_post_act_eltwise_op_act(IMAGE_DECLAR
#undef RHS_OFFSET_X
#undef RHS_STEP_X
}
-#if defined(OPENCL_IMAGE_SUPPORT)
+#endif // defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_POST_ACT_ELTWISE_OP_ACT)
+
+#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object.
* Post op 1: activation (optional)
* Post op 2: elementwise op
@@ -1112,7 +1126,6 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture_post_act_eltwise_op_act(IMAG
IMAGE_DECLARATION(dst),
// Post Op arguments
IMAGE_DECLARATION(eltwise_operand),
- uint k,
uint lhs_stride_z,
uint rhs_stride_z,
#if defined(BETA)
@@ -1124,7 +1137,10 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture_post_act_eltwise_op_act(IMAG
,
uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
- )
+ ,
+ const int M,
+ const int N,
+ const int K)
{
// Pixel unit
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
@@ -1401,8 +1417,8 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture_post_act_eltwise_op_act(IMAG
#undef LHS_STEP_LOOP
#undef RHS_STEP_LOOP
}
-#endif // defined(OPENCL_IMAGE_SUPPORT)
+#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
#endif // defined(LHS_TRANSPOSE)
#endif // defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N) \ No newline at end of file
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR)
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
index e96aba613..09ddcde04 100644
--- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
+++ b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,7 @@
#include "repeat.h"
/** (EXPERIMENTAL_POST_OPS) gemm_mm_reshaped_only_rhs kernel */
-#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)
+#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE)
#if defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
#define CONCAT(a, b) a##b
@@ -151,6 +151,7 @@
#error "N0 value not supported"
#endif // N0 conditions
+#if defined(GEMM_MM_RESHAPED_ONLY_RHS_T_POST_ACT_ELTWISE_OP_ACT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
* Post op 1: activation (optional)
* Post op 2: elementwise op
@@ -194,7 +195,10 @@ __kernel void gemm_mm_reshaped_only_rhs_t_post_act_eltwise_op_act(IMAGE_DECLARAT
,
uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
- )
+ ,
+ const int M,
+ const int N,
+ const int K)
{
// Block size
#define RHS_BLOCK_SIZE ((K0) * (N0))
@@ -409,8 +413,9 @@ __kernel void gemm_mm_reshaped_only_rhs_t_post_act_eltwise_op_act(IMAGE_DECLARAT
#undef RHS_OFFSET_X
#undef RHS_STEP_X
}
+#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_T_POST_ACT_ELTWISE_OP_ACT)
-#if defined(OPENCL_IMAGE_SUPPORT)
+#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object.
* Post op 1: activation (optional)
* Post op 2: elementwise op
@@ -430,6 +435,9 @@ __kernel void gemm_mm_reshaped_only_rhs_t_post_act_eltwise_op_act(IMAGE_DECLARAT
* @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
* @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
*/
__kernel void gemm_mm_reshaped_only_rhs_t_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
__read_only image2d_t rhs_img,
@@ -454,12 +462,15 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture_post_act_eltwise_op_act(IMAGE_
,
uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
- )
+ ,
+ const int M,
+ const int N,
+ const int K)
{
// Pixel unit
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)
-#define LEFTOVER_K (K % K0)
+ const uint LEFTOVER_K = K % K0;
// Block size
#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))
@@ -562,99 +573,99 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture_post_act_eltwise_op_act(IMAGE_
x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
}
-#if LEFTOVER_K != 0
- // Note: We cannot read out-of-bound elements from the RHS matrix because
- // the RHS width is always multiple of K0. This is not be true for the LHS matrix
-
- union UNION_VEC_TYPE
+ if(LEFTOVER_K != 0)
{
- DATA_TYPE s[K0];
- VEC_DATA_TYPE(DATA_TYPE, K0)
- v;
- };
+ // Note: We cannot read out-of-bound elements from the RHS matrix because
+ // the RHS width is always a multiple of K0. This is not true for the LHS matrix
+
+ union UNION_VEC_TYPE
+ {
+ DATA_TYPE s[K0];
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ v;
+ };
- union UNION_VEC_TYPE a0 = {.v = 0 };
+ union UNION_VEC_TYPE a0 = {.v = 0 };
#if M0 > 1
- union UNION_VEC_TYPE a1 = {.v = 0 };
+ union UNION_VEC_TYPE a1 = {.v = 0 };
#endif // M0 > 1
#if M0 > 2
- union UNION_VEC_TYPE a2 = {.v = 0 };
+ union UNION_VEC_TYPE a2 = {.v = 0 };
#endif // M0 > 2
#if M0 > 3
- union UNION_VEC_TYPE a3 = {.v = 0 };
+ union UNION_VEC_TYPE a3 = {.v = 0 };
#endif // M0 > 3
#if M0 > 4
- union UNION_VEC_TYPE a4 = {.v = 0 };
+ union UNION_VEC_TYPE a4 = {.v = 0 };
#endif // M0 > 4
#if M0 > 5
- union UNION_VEC_TYPE a5 = {.v = 0 };
+ union UNION_VEC_TYPE a5 = {.v = 0 };
#endif // M0 > 5
#if M0 > 6
- union UNION_VEC_TYPE a6 = {.v = 0 };
+ union UNION_VEC_TYPE a6 = {.v = 0 };
#endif // M0 > 6
#if M0 > 7
- union UNION_VEC_TYPE a7 = {.v = 0 };
+ union UNION_VEC_TYPE a7 = {.v = 0 };
#endif // M0 > 7
- REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
+ REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
- // Load from RHS matrix
- LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
+ // Load from RHS matrix
+ LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
- // Load from LHS matrix
- for(int k = 0; k < LEFTOVER_K; ++k)
- {
- a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);
+ // Load from LHS matrix
+ for(int k = 0; k < LEFTOVER_K; ++k)
+ {
+ a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);
#if M0 > 1
- a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);
+ a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);
#endif // M0 > 1
#if M0 > 2
- a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);
+ a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);
#endif // M0 > 2
#if M0 > 3
- a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);
+ a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);
#endif // M0 > 3
#if M0 > 4
- a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);
+ a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);
#endif // M0 > 4
#if M0 > 5
- a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);
+ a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);
#endif // M0 > 5
#if M0 > 6
- a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);
+ a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);
#endif // M0 > 6
#if M0 > 7
- a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);
+ a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);
#endif // M0 > 7
- lhs_offset += sizeof(DATA_TYPE);
- }
+ lhs_offset += sizeof(DATA_TYPE);
+ }
- // Accumulate
- ARM_DOT_K0XN0(K0, a0.v, b, c0);
+ // Accumulate
+ ARM_DOT_K0XN0(K0, a0.v, b, c0);
#if M0 > 1
- ARM_DOT_K0XN0(K0, a1.v, b, c1);
+ ARM_DOT_K0XN0(K0, a1.v, b, c1);
#endif // M0 > 1
#if M0 > 2
- ARM_DOT_K0XN0(K0, a2.v, b, c2);
+ ARM_DOT_K0XN0(K0, a2.v, b, c2);
#endif // M0 > 2
#if M0 > 3
- ARM_DOT_K0XN0(K0, a3.v, b, c3);
+ ARM_DOT_K0XN0(K0, a3.v, b, c3);
#endif // M0 > 3
#if M0 > 4
- ARM_DOT_K0XN0(K0, a4.v, b, c4);
+ ARM_DOT_K0XN0(K0, a4.v, b, c4);
#endif // M0 > 4
#if M0 > 5
- ARM_DOT_K0XN0(K0, a5.v, b, c5);
+ ARM_DOT_K0XN0(K0, a5.v, b, c5);
#endif // M0 > 5
#if M0 > 6
- ARM_DOT_K0XN0(K0, a6.v, b, c6);
+ ARM_DOT_K0XN0(K0, a6.v, b, c6);
#endif // M0 > 6
#if M0 > 7
- ARM_DOT_K0XN0(K0, a7.v, b, c7);
+ ARM_DOT_K0XN0(K0, a7.v, b, c7);
#endif // M0 > 7
-
-#endif // LEFTOVER_K != 0
+ }
__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
@@ -723,10 +734,9 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture_post_act_eltwise_op_act(IMAGE_
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
-#undef LEFTOVER_K
#undef PIXEL_UNIT
}
-#endif // defined(OPENCL_IMAGE_SUPPORT)
+#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
#define VFMA(a, b, c) \
({ \
@@ -805,6 +815,7 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture_post_act_eltwise_op_act(IMAGE_
#error "M0 not supported"
#endif // M0 not supported
+#if defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_POST_ACT_ELTWISE_OP_ACT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
* Post op 1: activation (optional)
* Post op 2: elementwise op
@@ -824,6 +835,9 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture_post_act_eltwise_op_act(IMAGE_
* @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
* @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
*/
__kernel void gemm_mm_reshaped_only_rhs_nt_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
IMAGE_DECLARATION(rhs),
@@ -848,7 +862,10 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_post_act_eltwise_op_act(IMAGE_DECLARA
,
uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
- )
+ ,
+ const int M,
+ const int N,
+ const int K)
{
// Block size
#define RHS_BLOCK_SIZE ((K0) * (N0))
@@ -1087,9 +1104,11 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_post_act_eltwise_op_act(IMAGE_DECLARA
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
+#undef RHS_STEP_LOOP
}
+#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_POST_ACT_ELTWISE_OP_ACT)
-#if defined(OPENCL_IMAGE_SUPPORT)
+#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object.
* Post op 1: activation (optional)
* Post op 2: elementwise op
@@ -1109,6 +1128,9 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_post_act_eltwise_op_act(IMAGE_DECLARA
* @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
* @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
*/
__kernel void gemm_mm_reshaped_only_rhs_nt_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
__read_only image2d_t rhs_img,
@@ -1133,7 +1155,10 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture_post_act_eltwise_op_act(IMAGE
,
uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
- )
+ ,
+ const int M,
+ const int N,
+ const int K)
{
// Pixel unit
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
@@ -1145,9 +1170,11 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture_post_act_eltwise_op_act(IMAGE
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
+#define RHS_STEP_LOOP (1)
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (PIXEL_UNIT)
+#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)
uint x = get_global_id(0);
@@ -1365,7 +1392,8 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture_post_act_eltwise_op_act(IMAGE
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
+#undef RHS_STEP_LOOP
}
-#endif // defined(OPENCL_IMAGE_SUPPORT)
+#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
#endif // defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K) \ No newline at end of file
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE)
diff --git a/src/core/CL/cl_kernels/common/gemm.cl b/src/core/CL/cl_kernels/common/gemm.cl
index a76ad458a..33ab25cad 100644
--- a/src/core/CL/cl_kernels/common/gemm.cl
+++ b/src/core/CL/cl_kernels/common/gemm.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,856 +24,7 @@
#include "gemm_helpers.h"
#include "repeat.h"
-#if defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(PARTIAL_LOAD_M0) && defined(PARTIAL_LOAD_K0)
-#define INC2 (VEC_DATA_TYPE(uint, 2))(0, 1)
-#define INC3 (VEC_DATA_TYPE(uint, 3))(0, 1, 2)
-#define INC4 (VEC_DATA_TYPE(uint, 4))(0, 1, 2, 3)
-#define INC8 (VEC_DATA_TYPE(uint, 8))(0, 1, 2, 3, 4, 5, 6, 7)
-#define INC16 (VEC_DATA_TYPE(uint, 16))(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
-#define CONCAT_INC(K0) INC##K0
-#define INC(K0) CONCAT_INC(K0)
-
-#if(SRC_WIDTH % K0)
-#define BOUNDARY_CONDITION_X(x, a) \
- ({ \
- a = select(0, a, CONVERT(((x * (VEC_DATA_TYPE(uint, K0))K0 + INC(K0)) < (VEC_DATA_TYPE(uint, K0))SRC_WIDTH), VEC_DATA_TYPE(DATA_TYPE, K0))); \
- })
-#else // (SRC_WIDTH % K0)
-#define BOUNDARY_CONDITION_X(x, a) \
- ({})
-#endif // (SRC_WIDTH % K0)
-
-#define LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
- ({ \
- if(y * M0 + M0 >= SRC_HEIGHT && PARTIAL_LOAD_M0 != 0) \
- { \
- if(x * K0 + K0 >= SRC_WIDTH && (PARTIAL_LOAD_K0 != 0)) \
- { \
- LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
- } \
- else \
- { \
- LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
- } \
- } \
- else \
- { \
- if(x * K0 + K0 >= SRC_WIDTH && (PARTIAL_LOAD_K0 != 0)) \
- { \
- LOAD_TENSOR_M0XN0(M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
- } \
- else \
- { \
- LOAD_TENSOR_M0XN0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
- } \
- } \
- })
-
-/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (not transposed) in
- * the output matrix unrolling the values.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
- * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
- * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
- * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
- * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)
- * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1)
- * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1)
- * @note Only the following values for M0, K0 and V0 are supported:
- * M0: 2,3,4,5,6,7,8
- * K0: 2,3,4,8,16
- * V0: greater than 0
- * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:
- * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
- * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.
- *
- * @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: All
- * @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
- */
-__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst)
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
- )
-{
- // Block size
-#define BLOCK_SIZE ((M0) * (K0))
-
- // Output offset X
-#if defined(INTERLEAVE)
-#define OUTPUT_OFFSET_X (K0)
-#else // defined(INTERLEAVE)
-#define OUTPUT_OFFSET_X (BLOCK_SIZE)
-#endif // defined(INTERLEAVE)
-
- // Output step X
-#if defined(INTERLEAVE)
-#define OUTPUT_STEP_X (K0) * (V0)
-#else // Do not interleave
-#define OUTPUT_STEP_X (K0)
-#endif // defined(INTERLEAVE)
-
- // Compute source and destination addresses
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
- // ------------------ Compute input/output addresses ---------------------------
-
- // Compute the input address
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;
-
- // Compute the output address
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) *
- (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));
-
- // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply src_stride_z by DEPTH_GEMM3D
-
- input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;
-
- // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- input_ptr += z * (uint)src_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- output_ptr += z * (uint)dst_stride_z;
-
- // ---------------------------Load input values --------------------------------
- // Load values from the LHS matrix
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0);
-
- LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);
-
- // ---------------------------Store output values ------------------------------
- REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
- STORE_BLOCK(M0, K0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
-
-#undef BLOCK_SIZE
-#undef OUTPUT_OFFSET_X
-#undef OUTPUT_STEP_X
-}
-
-#if M0 == 2
-#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, M0) \
- res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i); \
- VSTORE(M0) \
- (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
- })
-#elif M0 == 3 // M0 == 3
-#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, M0) \
- res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i); \
- VSTORE(M0) \
- (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
- })
-#elif M0 == 4 // M0 == 4
-#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, M0) \
- res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
- VSTORE(M0) \
- (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
- })
-#elif M0 == 5 // M0 == 5
-#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, 4) \
- res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
- DATA_TYPE res1 = a4.s##i; \
- VSTORE(4) \
- (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
- *((__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4) = res1; \
- })
-#elif M0 == 6 // M0 == 6
-#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, 4) \
- res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
- VEC_DATA_TYPE(DATA_TYPE, 2) \
- res1 = (VEC_DATA_TYPE(DATA_TYPE, 2))(a4.s##i, a5.s##i); \
- VSTORE(4) \
- (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
- VSTORE(2) \
- (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \
- })
-#elif M0 == 7 // M0 == 7
-#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, 4) \
- res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
- VEC_DATA_TYPE(DATA_TYPE, 3) \
- res1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(a4.s##i, a5.s##i, a6.s##i); \
- VSTORE(4) \
- (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
- VSTORE(3) \
- (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \
- })
-#elif M0 == 8 // M0 == 8
-#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, M0) \
- res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, a6.s##i, a7.s##i); \
- VSTORE(M0) \
- (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
- })
-#else // M0 not supported
-#error "M0 value not supported"
-#endif // N0 conditions
-
-/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (transposed) in
- * the output matrix unrolling the values.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
- * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
- * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
- * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
- * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)
- * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1)
- * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1)
- * @note Only the following values for M0, K0 and V0 are supported:
- * M0: 2,3,4,5,6,7,8
- * K0: 2,3,4,8,16
- * V0: greater than 0
- * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:
- * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
- * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.
- *
- * @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: All
- * @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
- */
-__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst)
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
- )
-{
- // Block size
-#define BLOCK_SIZE ((M0) * (K0))
-
- // Output offset X
-#if defined(INTERLEAVE)
-#define OUTPUT_OFFSET_X (M0)
-#else // defined(INTERLEAVE)
-#define OUTPUT_OFFSET_X (BLOCK_SIZE)
-#endif // defined(INTERLEAVE)
-
- // Output step X
-#if defined(INTERLEAVE)
-#define OUTPUT_STEP_X (M0) * (V0)
-#else // Do not interleave
-#define OUTPUT_STEP_X (M0)
-#endif // defined(INTERLEAVE)
-
- // Compute source and destination addresses
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
- // ------------------ Compute input/output addresses ---------------------------
-
- // Compute the input address
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;
-
- // Compute the output address
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) *
- (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));
-
- // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply src_stride_z by DEPTH_GEMM3D
-
- input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;
-
-    // The plane (zin) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- input_ptr += z * (uint)src_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- output_ptr += z * (uint)dst_stride_z;
-
- // ---------------------------Load input values --------------------------------
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0);
-
- LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);
-
- // ---------------------------Transpose and store block -----------------------
-
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 1);
-#if K0 > 2
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 2);
-#endif // K0 > 2
-#if K0 > 3
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 3);
-#endif // K0 > 3
-#if K0 > 4
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 4);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 5);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 6);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 7);
-#endif // K0 > 4
-#if K0 > 8
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 8);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 9);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, A);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, B);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, C);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, D);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, E);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, F);
-#endif // K0 > 8
-
-#undef BLOCK_SIZE
-#undef OUTPUT_OFFSET_X
-#undef OUTPUT_STEP_X
-}
-#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(PARTIAL_LOAD_M0) && defined(PARTIAL_LOAD_K0)
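As an illustration of the compile-time contract described above (not part of the patch; the data type, tile sizes and tensor dimensions are assumed purely for the example), a host could assemble the build options for this reshape kernel with the standard OpenCL C API along these lines:

    #include <CL/cl.h>
    #include <string>

    // Illustrative only: build the LHS-reshape program with example tile sizes.
    // All block-size parameters are baked in at compile time via -D options.
    cl_int build_reshape_lhs_example(cl_program program, cl_device_id device)
    {
        const std::string options =
            "-DDATA_TYPE=float -DSRC_WIDTH=64 -DSRC_HEIGHT=64 "
            "-DM0=4 -DK0=4 -DV0=2 -DPARTIAL_LOAD_M0=0 -DPARTIAL_LOAD_K0=0";
        return clBuildProgram(program, 1, &device, options.c_str(), nullptr, nullptr);
    }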
-
-#if defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
-/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (not transposed) in
- * the output matrix unrolling the values.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
- * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
- * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
- * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)
- * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
- * @note Only the following values for K0, N0 and H0 are supported:
- * N0: 2,3,4,8,16
- * K0: 1,2,3,4,8,16
- * H0: greater than 0
- *
- * @param[in] src_ptr Pointer to the source RHS tensor. Supported data types: All
- * @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- // Block size
-#define BLOCK_SIZE ((K0) * (N0))
-
- // Output offset X
-#if defined(INTERLEAVE)
-#define OUTPUT_OFFSET_X (N0)
-#else // defined(INTERLEAVE)
-#define OUTPUT_OFFSET_X (BLOCK_SIZE)
-#endif // defined(INTERLEAVE)
-
- // Output step X
-#if defined(INTERLEAVE)
-#define OUTPUT_STEP_X (N0) * (H0)
-#else // Do not interleave
-#define OUTPUT_STEP_X (N0)
-#endif // defined(INTERLEAVE)
-
- // Compute source and destination addresses
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
- // ------------------ Compute input/output addresses ---------------------------
-
- // Compute the input address
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;
-
- // Compute the output address
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % (uint)H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((
- x / (uint)H0)
- * (uint)dst_stride_y)
- + z * (uint)dst_stride_z;
-
- // ---------------------------Load input values --------------------------------
-
-    REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... a(K0-1)=0;
-
- // Load values from the RHS matrix
- a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
-#if K0 > 1
- if(y * (uint)K0 + 1 < SRC_HEIGHT)
- {
- a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
- }
-#endif // K0 > 1
-#if K0 > 2
- if(y * (uint)K0 + 2 < SRC_HEIGHT)
- {
- a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
- }
-#endif // K0 > 2
-#if K0 > 3
- if(y * (uint)K0 + 3 < SRC_HEIGHT)
- {
- a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
- }
-#endif // K0 > 3
-#if K0 > 4
- if(y * (uint)K0 + 4 < SRC_HEIGHT)
- {
- a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));
- }
- if(y * (uint)K0 + 5 < SRC_HEIGHT)
- {
- a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));
- }
- if(y * (uint)K0 + 6 < SRC_HEIGHT)
- {
- a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));
- }
- if(y * (uint)K0 + 7 < SRC_HEIGHT)
- {
- a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));
- }
-#endif // K0 > 4
-#if K0 > 8
- if(y * (uint)K0 + 8 < SRC_HEIGHT)
- {
- a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));
- }
- if(y * (uint)K0 + 9 < SRC_HEIGHT)
- {
- a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));
- }
- if(y * (uint)K0 + 10 < SRC_HEIGHT)
- {
- aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));
- }
- if(y * (uint)K0 + 11 < SRC_HEIGHT)
- {
- aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));
- }
- if(y * (uint)K0 + 12 < SRC_HEIGHT)
- {
- aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));
- }
- if(y * (uint)K0 + 13 < SRC_HEIGHT)
- {
- aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));
- }
- if(y * (uint)K0 + 14 < SRC_HEIGHT)
- {
- aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));
- }
- if(y * (uint)K0 + 15 < SRC_HEIGHT)
- {
- aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));
- }
-#endif // K0 > 8
-
- // ---------------------------Store output values ------------------------------
- REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
- STORE_BLOCK(K0, N0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
-
-#undef BLOCK_SIZE
-#undef OUTPUT_OFFSET_X
-#undef OUTPUT_STEP_X
-}
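To make the destination layout easier to follow, here is a simplified scalar model of the output offset computed by the kernel above (illustrative only; it works in elements rather than bytes and assumes the row and slice strides are also given in elements):

    #include <cstddef>

    // Offset of the K0xN0 block written by work-item (x, y, z), in elements.
    std::size_t rhs_nt_block_offset(std::size_t x, std::size_t y, std::size_t z,
                                    std::size_t K0, std::size_t N0, std::size_t H0,
                                    std::size_t dst_row_stride, std::size_t dst_slice_stride,
                                    bool interleave)
    {
        const std::size_t block_size      = K0 * N0;
        const std::size_t output_offset_x = interleave ? N0 : block_size;
        return y * block_size * H0          // H0 blocks are packed on the same output row
             + (x % H0) * output_offset_x   // position of this block within that row
             + (x / H0) * dst_row_stride    // which output row the block lands on
             + z * dst_slice_stride;        // batch offset
    }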
-
-#if defined(TRANSPOSE)
-/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (transposed) in
- * the output matrix unrolling the values.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
- * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
- * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
- * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)
- * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
- * @note The option -DTRANSPOSE must be passed at compile time.
- * @note Only the following values for K0, N0 and H0 are supported:
- * N0: 2,3,4,8,16
- * K0: 2,3,4,8,16
- * H0: greater than 0
- *
- * @param[in] src_ptr Pointer to the source RHS tensor. Supported data types: All
- * @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- // Block size
-#define BLOCK_SIZE ((K0) * (N0))
-
- // Output offset X
-#if defined(INTERLEAVE)
-#define OUTPUT_OFFSET_X (K0)
-#else // defined(INTERLEAVE)
-#define OUTPUT_OFFSET_X (BLOCK_SIZE)
-#endif // defined(INTERLEAVE)
-
- // Output step X
-#if defined(INTERLEAVE)
-#define OUTPUT_STEP_X (K0) * (H0)
-#else // Do not interleave
-#define OUTPUT_STEP_X (K0)
-#endif // defined(INTERLEAVE)
-
- // Compute source and destination addresses
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
- // ------------------ Compute input/output addresses ---------------------------
-
- // Compute the input address
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;
-
- // Compute the output address
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((x /
- (uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z;
-
- // ---------------------------Load input values --------------------------------
- REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... a(K0-1)=0;
-
- // Load values from the RHS matrix
- a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
- if(y * (uint)K0 + 1 < SRC_HEIGHT)
- {
- a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
- }
-#if K0 > 2
- if(y * (uint)K0 + 2 < SRC_HEIGHT)
- {
- a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
- }
-#endif // K0 > 2
-#if K0 > 3
- if(y * (uint)K0 + 3 < SRC_HEIGHT)
- {
- a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
- }
-#endif // K0 > 3
-#if K0 > 4
- if(y * (uint)K0 + 4 < SRC_HEIGHT)
- {
- a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));
- }
- if(y * (uint)K0 + 5 < SRC_HEIGHT)
- {
- a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));
- }
- if(y * (uint)K0 + 6 < SRC_HEIGHT)
- {
- a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));
- }
- if(y * (uint)K0 + 7 < SRC_HEIGHT)
- {
- a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));
- }
-#endif // K0 > 4
-#if K0 > 8
- if(y * (uint)K0 + 8 < SRC_HEIGHT)
- {
- a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));
- }
- if(y * (uint)K0 + 9 < SRC_HEIGHT)
- {
- a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));
- }
- if(y * (uint)K0 + 10 < SRC_HEIGHT)
- {
- aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));
- }
- if(y * (uint)K0 + 11 < SRC_HEIGHT)
- {
- aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));
- }
- if(y * (uint)K0 + 12 < SRC_HEIGHT)
- {
- aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));
- }
- if(y * (uint)K0 + 13 < SRC_HEIGHT)
- {
- aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));
- }
- if(y * (uint)K0 + 14 < SRC_HEIGHT)
- {
- aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));
- }
- if(y * (uint)K0 + 15 < SRC_HEIGHT)
- {
- aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));
- }
-#endif // K0 > 8
-
- // ---------------------------Transpose the block ------------------------------
- REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), res, 0); //VEC_DATA_TYPE(DATA_TYPE, K0) res0=0, res1=0, res2=0,... res(N0-1)=0;
-
-#if K0 == 2
- // This part computes the following transpositions:
- // 2x2 -> 2x2
- // 2x4 -> 4x2
- // 2x8 -> 8x2
- // 2x16 -> 16x2
- res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0);
- res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1);
-#if N0 > 2
- res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2);
-#endif // N0 > 2
-#if N0 > 3
- res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3);
-#endif // N0 > 3
-#if N0 > 4
- res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4);
- res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5);
- res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6);
- res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7);
-#endif // N0 > 4
-#if N0 > 8
- res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8);
- res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9);
- resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA);
- resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB);
- resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC);
- resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD);
- resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE);
- resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF);
-#endif // N0 > 8
-
-#elif K0 == 3 // K0 == 3
- // This part computes the following transpositions:
- // 3x2 -> 2x3
- // 3x4 -> 4x3
- // 3x8 -> 8x3
- // 3x16 -> 16x3
- res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0);
- res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1);
-#if N0 > 2
- res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2);
-#endif // N0 > 2
-#if N0 > 3
- res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3);
-#endif // N0 > 3
-#if N0 > 4
- res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4);
- res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5);
- res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6);
- res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7);
-#endif // N0 > 4
-#if N0 > 8
- res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8);
- res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9);
- resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA);
- resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB);
- resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC);
- resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD);
- resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE);
- resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF);
-#endif // N0 > 8
-
-#elif K0 == 4 // K0 == 4
- // This part computes the following transpositions:
- // 4x2 -> 2x4
- // 4x4 -> 4x4
- // 4x8 -> 8x4
- // 4x16 -> 16x4
- res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0);
- res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1);
-#if N0 > 2
- res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2);
-#endif // N0 > 2
-#if N0 > 3
- res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3);
-#endif // N0 > 3
-#if N0 > 4
- res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4);
- res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5);
- res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6);
- res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7);
-#endif // N0 > 4
-#if N0 > 8
- res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8);
- res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9);
- resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA);
- resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB);
- resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC);
- resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD);
- resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE);
- resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF);
-#endif // N0 > 8
-
-#elif K0 == 8 // K0 == 8
- // This part computes the following transpositions:
- // 8x2 -> 2x8
- // 8x4 -> 4x8
- // 8x8 -> 8x8
- // 8x16 -> 16x8
- res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0);
- res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1);
-#if N0 > 2
- res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2);
-#endif // N0 > 2
-#if N0 > 3
- res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3);
-#endif // N0 > 3
-#if N0 > 4
- res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4);
- res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5);
- res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6);
- res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7);
-#endif // N0 > 4
-#if N0 > 8
- res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8);
- res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9);
- resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA);
- resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB);
- resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC);
- resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD);
- resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE);
- resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF);
-#endif // N0 > 8
-
-#elif K0 == 16 // K0 == 16
-
- // This part computes the following transpositions:
- // 16x2 -> 2x16
- // 16x4 -> 4x16
- // 16x8 -> 8x16
- // 16x16 -> 16x16
- res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0,
- a8.s0, a9.s0, aA.s0, aB.s0, aC.s0, aD.s0, aE.s0, aF.s0);
- res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1,
- a8.s1, a9.s1, aA.s1, aB.s1, aC.s1, aD.s1, aE.s1, aF.s1);
-#if N0 > 2
- res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2,
- a8.s2, a9.s2, aA.s2, aB.s2, aC.s2, aD.s2, aE.s2, aF.s2);
-#endif // N0 > 2
-#if N0 > 3
- res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3,
- a8.s3, a9.s3, aA.s3, aB.s3, aC.s3, aD.s3, aE.s3, aF.s3);
-#endif // N0 > 3
-#if N0 > 4
- res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4,
- a8.s4, a9.s4, aA.s4, aB.s4, aC.s4, aD.s4, aE.s4, aF.s4);
- res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5,
- a8.s5, a9.s5, aA.s5, aB.s5, aC.s5, aD.s5, aE.s5, aF.s5);
- res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6,
- a8.s6, a9.s6, aA.s6, aB.s6, aC.s6, aD.s6, aE.s6, aF.s6);
- res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7,
- a8.s7, a9.s7, aA.s7, aB.s7, aC.s7, aD.s7, aE.s7, aF.s7);
-#endif // N0 > 4
-#if N0 > 8
- res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8,
- a8.s8, a9.s8, aA.s8, aB.s8, aC.s8, aD.s8, aE.s8, aF.s8);
- res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9,
- a8.s9, a9.s9, aA.s9, aB.s9, aC.s9, aD.s9, aE.s9, aF.s9);
- resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA,
- a8.sA, a9.sA, aA.sA, aB.sA, aC.sA, aD.sA, aE.sA, aF.sA);
- resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB,
- a8.sB, a9.sB, aA.sB, aB.sB, aC.sB, aD.sB, aE.sB, aF.sB);
- resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC,
- a8.sC, a9.sC, aA.sC, aB.sC, aC.sC, aD.sC, aE.sC, aF.sC);
- resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD,
- a8.sD, a9.sD, aA.sD, aB.sD, aC.sD, aD.sD, aE.sD, aF.sD);
- resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE,
- a8.sE, a9.sE, aA.sE, aB.sE, aC.sE, aD.sE, aE.sE, aF.sE);
- resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF,
- a8.sF, a9.sF, aA.sF, aB.sF, aC.sF, aD.sF, aE.sF, aF.sF);
-#endif // N0 > 8
-
-#else // K0 not supported
-#error "K0 value not supported"
-#endif // K0 conditions
-
- // ---------------------------Store the output values ------------------------------
- REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
- STORE_BLOCK(N0, K0, DATA_TYPE, res, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
-
-#undef BLOCK_SIZE
-#undef OUTPUT_OFFSET_X
-#undef OUTPUT_STEP_X
-}
-#endif // defined(TRANSPOSE)
-#endif // defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
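The K0-specific branches above unroll the same basic operation for each supported block height; an equivalent scalar sketch of the K0xN0 -> N0xK0 block transpose (illustrative only, row-major indexing assumed) is:

    // a points to a K0 x N0 block stored row-major; res receives its N0 x K0 transpose.
    template <typename T>
    void transpose_block(const T *a, T *res, int K0, int N0)
    {
        for(int n = 0; n < N0; ++n)
        {
            for(int k = 0; k < K0; ++k)
            {
                res[n * K0 + k] = a[k * N0 + n];
            }
        }
    }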
-
-#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)
+#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE)
#define CONCAT(a, b) a##b
@@ -997,14 +148,14 @@ __kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src),
#error "N0 value not supported"
#endif // N0 conditions
+#if defined(GEMM_MM_RESHAPED_ONLY_RHS_T)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
* The LHS matrix is NOT reshaped
* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
* @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
*
* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
- * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)
- * @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)
+ * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters.
* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
@@ -1056,6 +207,9 @@ __kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src),
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
*/
__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
IMAGE_DECLARATION(rhs),
@@ -1077,7 +231,10 @@ __kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
,
uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
- )
+ ,
+ const int M,
+ const int N,
+ const int K)
{
// Block size
#define RHS_BLOCK_SIZE ((K0) * (N0))
@@ -1288,9 +445,11 @@ __kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
+#undef RHS_STEP_LOOP
}
+#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_T)
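Since M, N and K are now trailing kernel arguments rather than -D build options, they have to be set at dispatch time. A host-side sketch using the standard OpenCL C API (illustrative only; the starting argument index depends on which optional arguments, such as the bias or the cross-plane pads, were compiled in):

    #include <CL/cl.h>

    // Set the runtime GEMM dimensions starting at argument index first_idx.
    void set_gemm_dims_example(cl_kernel kernel, cl_uint first_idx, cl_int M, cl_int N, cl_int K)
    {
        clSetKernelArg(kernel, first_idx + 0, sizeof(cl_int), &M);
        clSetKernelArg(kernel, first_idx + 1, sizeof(cl_int), &N);
        clSetKernelArg(kernel, first_idx + 2, sizeof(cl_int), &K);
    }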
-#if defined(OPENCL_IMAGE_SUPPORT)
+#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image
* The LHS matrix is NOT reshaped
* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
@@ -1298,7 +457,7 @@ __kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
*
* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
- * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and and -DK (e.g. -DM=52, -DN=30 and -DK=90)
+ * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters.
* @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
* Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
* could be different from the value returned by get_image_height(rhs_img).
@@ -1348,6 +507,9 @@ __kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
*/
__kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
__read_only image2d_t rhs_img,
@@ -1369,12 +531,15 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
,
uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
- )
+ ,
+ const int M,
+ const int N,
+ const int K)
{
// Pixel unit
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)
-#define LEFTOVER_K (K % K0)
+ const uint LEFTOVER_K = K % K0;
// Block size
#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))
@@ -1477,99 +642,100 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
}
-#if LEFTOVER_K != 0
- // Note: We cannot read out-of-bound elements from the RHS matrix because
-    // the RHS width is always a multiple of K0. This is not true for the LHS matrix
-
- union UNION_VEC_TYPE
+ if(LEFTOVER_K != 0)
{
- DATA_TYPE s[K0];
- VEC_DATA_TYPE(DATA_TYPE, K0)
- v;
- };
-
- union UNION_VEC_TYPE a0 = {.v = 0 };
+ // Note: We cannot read out-of-bound elements from the RHS matrix because
+        // the RHS width is always a multiple of K0. This is not true for the LHS matrix
+ // Left-over accumulations for LHS matrix
+
+ union UNION_VEC_TYPE
+ {
+ DATA_TYPE s[K0];
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ v;
+ };
+
+ union UNION_VEC_TYPE a0 = {.v = 0 };
#if M0 > 1
- union UNION_VEC_TYPE a1 = {.v = 0 };
+ union UNION_VEC_TYPE a1 = {.v = 0 };
#endif // M0 > 1
#if M0 > 2
- union UNION_VEC_TYPE a2 = {.v = 0 };
+ union UNION_VEC_TYPE a2 = {.v = 0 };
#endif // M0 > 2
#if M0 > 3
- union UNION_VEC_TYPE a3 = {.v = 0 };
+ union UNION_VEC_TYPE a3 = {.v = 0 };
#endif // M0 > 3
#if M0 > 4
- union UNION_VEC_TYPE a4 = {.v = 0 };
+ union UNION_VEC_TYPE a4 = {.v = 0 };
#endif // M0 > 4
#if M0 > 5
- union UNION_VEC_TYPE a5 = {.v = 0 };
+ union UNION_VEC_TYPE a5 = {.v = 0 };
#endif // M0 > 5
#if M0 > 6
- union UNION_VEC_TYPE a6 = {.v = 0 };
+ union UNION_VEC_TYPE a6 = {.v = 0 };
#endif // M0 > 6
#if M0 > 7
- union UNION_VEC_TYPE a7 = {.v = 0 };
+ union UNION_VEC_TYPE a7 = {.v = 0 };
#endif // M0 > 7
- REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
+ REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
- // Load from RHS matrix
- LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
+ // Load from RHS matrix
+ LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
- // Load from LHS matrix
- for(int k = 0; k < LEFTOVER_K; ++k)
- {
- a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);
+ // Load from LHS matrix
+ for(int k = 0; k < LEFTOVER_K; ++k)
+ {
+ a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);
#if M0 > 1
- a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);
+ a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);
#endif // M0 > 1
#if M0 > 2
- a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);
+ a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);
#endif // M0 > 2
#if M0 > 3
- a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);
+ a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);
#endif // M0 > 3
#if M0 > 4
- a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);
+ a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);
#endif // M0 > 4
#if M0 > 5
- a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);
+ a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);
#endif // M0 > 5
#if M0 > 6
- a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);
+ a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);
#endif // M0 > 6
#if M0 > 7
- a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);
+ a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);
#endif // M0 > 7
- lhs_offset += sizeof(DATA_TYPE);
- }
+ lhs_offset += sizeof(DATA_TYPE);
+ }
- // Accumulate
- ARM_DOT_K0XN0(K0, a0.v, b, c0);
+ // Accumulate
+ ARM_DOT_K0XN0(K0, a0.v, b, c0);
#if M0 > 1
- ARM_DOT_K0XN0(K0, a1.v, b, c1);
+ ARM_DOT_K0XN0(K0, a1.v, b, c1);
#endif // M0 > 1
#if M0 > 2
- ARM_DOT_K0XN0(K0, a2.v, b, c2);
+ ARM_DOT_K0XN0(K0, a2.v, b, c2);
#endif // M0 > 2
#if M0 > 3
- ARM_DOT_K0XN0(K0, a3.v, b, c3);
+ ARM_DOT_K0XN0(K0, a3.v, b, c3);
#endif // M0 > 3
#if M0 > 4
- ARM_DOT_K0XN0(K0, a4.v, b, c4);
+ ARM_DOT_K0XN0(K0, a4.v, b, c4);
#endif // M0 > 4
#if M0 > 5
- ARM_DOT_K0XN0(K0, a5.v, b, c5);
+ ARM_DOT_K0XN0(K0, a5.v, b, c5);
#endif // M0 > 5
#if M0 > 6
- ARM_DOT_K0XN0(K0, a6.v, b, c6);
+ ARM_DOT_K0XN0(K0, a6.v, b, c6);
#endif // M0 > 6
#if M0 > 7
- ARM_DOT_K0XN0(K0, a7.v, b, c7);
+ ARM_DOT_K0XN0(K0, a7.v, b, c7);
#endif // M0 > 7
-
-#endif // LEFTOVER_K != 0
+ }
__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
@@ -1635,10 +801,10 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
-#undef LEFTOVER_K
+#undef RHS_STEP_LOOP
#undef PIXEL_UNIT
}
-#endif // defined(OPENCL_IMAGE_SUPPORT)
+#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE)
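With K now a runtime kernel argument, LEFTOVER_K becomes a runtime value, so the tail of the accumulation is handled by an ordinary branch (if(LEFTOVER_K != 0)) instead of a compile-time #if. A plain scalar outline of the resulting loop structure, purely for illustration:

    // Dot product of length K processed in K0-wide steps plus a K % K0 tail.
    float dot_with_tail(const float *a, const float *b, int K, int K0)
    {
        float acc = 0.f;
        int   k   = 0;
        for(; k <= K - K0; k += K0)   // full K0-element accumulations
        {
            for(int i = 0; i < K0; ++i)
            {
                acc += a[k + i] * b[k + i];
            }
        }
        for(; k < K; ++k)             // leftover K % K0 elements
        {
            acc += a[k] * b[k];
        }
        return acc;
    }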
#define VFMA(a, b, c) \
({ \
@@ -1717,13 +883,14 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
#error "M0 not supported"
#endif // M0 not supported
+#if defined(GEMM_MM_RESHAPED_ONLY_RHS_NT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
* The LHS matrix is NOT reshaped
* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
* @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
*
* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
- * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).
+ * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters.
* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
@@ -1775,6 +942,9 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
*/
__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
IMAGE_DECLARATION(rhs),
@@ -1796,7 +966,10 @@ __kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
,
uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
- )
+ ,
+ const int M,
+ const int N,
+ const int K)
{
// Block size
#define RHS_BLOCK_SIZE ((K0) * (N0))
@@ -2032,9 +1205,11 @@ __kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
+#undef RHS_STEP_LOOP
}
+#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_NT)
-#if defined(OPENCL_IMAGE_SUPPORT)
+#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
* The LHS matrix is NOT reshaped
* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
@@ -2042,7 +1217,7 @@ __kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
*
* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
- * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).
+ * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters.
* @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
* Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
* could be different from the value returned by get_image_height(rhs_img).
@@ -2092,6 +1267,9 @@ __kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
*/
__kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
__read_only image2d_t rhs_img,
@@ -2113,7 +1291,10 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
,
uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
- )
+ ,
+ const int M,
+ const int N,
+ const int K)
{
// Pixel unit
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
@@ -2125,9 +1306,11 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
+#define RHS_STEP_LOOP 1
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (PIXEL_UNIT)
+#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)
uint x = get_global_id(0);
@@ -2342,11 +1525,12 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
+#undef RHS_STEP_LOOP
}
-#endif // defined(OPENCL_IMAGE_SUPPORT)
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)
+#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE)
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE)
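Each matrix-multiplication kernel body is now additionally gated by a kernel-specific define (GEMM_MM_RESHAPED_ONLY_RHS_T, GEMM_MM_RESHAPED_ONLY_RHS_NT, their _TEXTURE variants, and so on), so one compilation selects exactly one variant. A hypothetical build-option string, with values chosen only for illustration:

    #include <string>

    // Selects the transposed reshaped-RHS kernel; tile sizes are example values.
    const std::string gemm_rhs_only_t_options =
        "-DGEMM_MM_RESHAPED_ONLY_RHS_T "
        "-DDATA_TYPE=float -DM0=4 -DN0=4 -DK0=4 -DH0=4 "
        "-DPARTIAL_STORE_M0=0 -DPARTIAL_STORE_N0=0";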
-#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N)
+#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR)
#if defined(MIXED_PRECISION)
#if K0 == 2
@@ -2525,6 +1709,7 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
#error "N0 value not supported"
#endif // N0 conditions
+#if defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
@@ -2581,12 +1766,14 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
*/
__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
IMAGE_DECLARATION(rhs),
@@ -2594,7 +1781,6 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
IMAGE_DECLARATION(bias),
#endif // defined(BETA)
IMAGE_DECLARATION(dst),
- uint k,
uint lhs_stride_z,
uint rhs_stride_z,
#if defined(BETA)
@@ -2605,7 +1791,10 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
,
uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
- )
+ ,
+ const int M,
+ const int N,
+ const int K)
{
// Block size
#define LHS_BLOCK_SIZE ((K0) * (M0))
@@ -2661,7 +1850,7 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
- for(int i = 0; i < k; i += K0)
+ for(int i = 0; i < K; i += K0)
{
// Supported cases (M0, K0):
// 1,2 - 1,3 - 1,4 - 1,8 - 1,16
@@ -2798,8 +1987,9 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
#undef LHS_STEP_LOOP
#undef RHS_STEP_LOOP
}
+#endif // defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T)
-#if defined(OPENCL_IMAGE_SUPPORT)
+#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_TEXTURE)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.
* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
@@ -2855,12 +2045,14 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
*/
__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
__read_only image2d_t rhs_img,
@@ -2868,7 +2060,6 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
IMAGE_DECLARATION(bias),
#endif // defined(BETA)
IMAGE_DECLARATION(dst),
- uint k,
uint lhs_stride_z,
uint rhs_stride_z,
#if defined(BETA)
@@ -2879,7 +2070,10 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
,
uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
- )
+ ,
+ const int M,
+ const int N,
+ const int K)
{
// Pixel unit
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)
@@ -3070,7 +2264,7 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
#undef LHS_STEP_LOOP
#undef RHS_STEP_LOOP
}
-#endif // defined(OPENCL_IMAGE_SUPPORT)
+#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_TEXTURE)
#if defined(LHS_TRANSPOSE)
@@ -3182,6 +2376,7 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
CONCAT(ARM_MM_T_NT_M0xN0x, K0) \
(M0, N0, TYPE, A, B, C)
+#if defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed
* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed
@@ -3236,12 +2431,14 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
*/
__kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
IMAGE_DECLARATION(rhs),
@@ -3249,7 +2446,6 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
IMAGE_DECLARATION(bias),
#endif // defined(BETA)
IMAGE_DECLARATION(dst),
- uint k,
uint lhs_stride_z,
uint rhs_stride_z,
#if defined(BETA)
@@ -3260,7 +2456,10 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
,
uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
- )
+ ,
+ const int M,
+ const int N,
+ const int K)
{
// Block size
#define LHS_BLOCK_SIZE ((K0) * (M0))
@@ -3322,7 +2521,7 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
__global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
__global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr);
- for(int i = 0; i < k; i += K0)
+ for(int i = 0; i < K; i += K0)
{
VEC_DATA_TYPE(DATA_TYPE, M0)
a0;
@@ -3562,8 +2761,9 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
#undef RHS_OFFSET_X
#undef RHS_STEP_X
}
+#endif // defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT)
-#if defined(OPENCL_IMAGE_SUPPORT)
+#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_TEXTURE)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.
* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed
* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed
@@ -3572,7 +2772,7 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
* @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
- * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
+ * @note The GEMM's dimensions M, N and K must be passed at runtime.
* @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
* Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
* could be different from the value returned by get_image_height(rhs_img).
@@ -3617,12 +2817,14 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
*/
__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
__read_only image2d_t rhs_img,
@@ -3630,7 +2832,6 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
IMAGE_DECLARATION(bias),
#endif // defined(BETA)
IMAGE_DECLARATION(dst),
- uint k,
uint lhs_stride_z,
uint rhs_stride_z,
#if defined(BETA)
@@ -3641,7 +2842,10 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
,
uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
- )
+ ,
+ const int M,
+ const int N,
+ const int K)
{
// Pixel unit
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
@@ -3933,13 +3137,13 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
#undef LHS_STEP_LOOP
#undef RHS_STEP_LOOP
}
-#endif // defined(OPENCL_IMAGE_SUPPORT)
+#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_TEXTURE)
#endif // defined(LHS_TRANSPOSE)
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N)
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR)
-#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)
+#if defined(M0) && defined(N0) && defined(K0) && defined(DATA_TYPE)
#define VFMA(a, b, c) \
({ \
@@ -4018,14 +3222,14 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
#error "M0 not supported"
#endif // M0 not supported
+#if defined(GEMM_MM_NATIVE)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
* The LHS matrix is NOT reshaped
* The RHS matrix is NOT reshaped
* @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl
*
* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
- * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)
- * @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)
+ * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters.
* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
* @note The number of K0 partial accumulations must be passed at compile time using -DK0 (e.g., -DK0=2)
* @note The number of N0 columns to process must be passed at compile time using -DN0 (e.g. -DN0=2)
@@ -4073,6 +3277,9 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
* @param[in] rhs_stride_z Stride of the RHS matrix in Z dimension (in bytes)
* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
*/
@@ -4087,7 +3294,10 @@ __kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),
#if defined(BETA)
uint bias_stride_z,
#endif //defined(BETA)
- uint dst_stride_z
+ uint dst_stride_z,
+ const int M,
+ const int N,
+ const int K
#if defined(REINTERPRET_INPUT_AS_3D)
,
uint lhs_cross_plane_pad
@@ -4303,7 +3513,8 @@ __kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),
// Store output block
STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
}
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)
+#endif // defined(GEMM_MM_NATIVE)
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(DATA_TYPE)
#if defined(BETA)
/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
@@ -4389,4 +3600,4 @@ __kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src),
vstore8(out, 0, (__global half *)dst.ptr);
}
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
-#endif // defined(BETA) \ No newline at end of file
+#endif // defined(BETA)
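Note on the gemm.cl hunks above: M, N and K are no longer baked in as -DM/-DN/-DK build options but are passed as trailing kernel arguments. A minimal host-side sketch of what a caller now does; the function name, 'kernel' handle and 'first_shape_arg' index are illustrative assumptions, not the library's actual dispatch code:

#include <CL/cl.h>

/* Illustrative only: pass the GEMM shape at run time instead of rebuilding
 * the program with -DM/-DN/-DK. The kernel handle and the index of the first
 * shape argument are assumed to be known by the caller. */
static cl_int set_gemm_shape_args(cl_kernel kernel, cl_uint first_shape_arg,
                                  cl_int m, cl_int n, cl_int k)
{
    cl_int err = clSetKernelArg(kernel, first_shape_arg + 0, sizeof(cl_int), &m);
    err |= clSetKernelArg(kernel, first_shape_arg + 1, sizeof(cl_int), &n);
    err |= clSetKernelArg(kernel, first_shape_arg + 2, sizeof(cl_int), &k);
    return err;
}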
diff --git a/src/core/CL/cl_kernels/common/gemm_utils.cl b/src/core/CL/cl_kernels/common/gemm_utils.cl
new file mode 100644
index 000000000..be57d94ce
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/gemm_utils.cl
@@ -0,0 +1,458 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "gemm_helpers.h"
+#include "helpers.h"
+#include "repeat.h"
+#include "tile_helpers.h"
+
+#if defined(RESHAPE_LHS_NT)
+/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix into blocks of size M0xK0 and stores each one (not transposed) in
+ * the output matrix, unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
+ * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
+ * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_M0 (e.g. -DPARTIAL_M0=1)
+ * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_K0 (e.g. -DPARTIAL_K0=1)
+ * @note Only the following values for M0, K0 and V0 are supported:
+ * M0: 2,3,4,5,6,7,8
+ * K0: 2,3,4,8,16
+ * V0: greater than 0
+ * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the depth dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: All
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the depth dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] M The size of the height dimension of the source tensor, affected by reinterpret_input_as_3d
+ * @param[in] V0 The number of blocks to place on the same row. It must be greater than 0.
+ */
+__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_T(src, BUFFER),
+ TENSOR3D_T(dst, BUFFER),
+ const int M,
+ const int V0)
+{
+ // Block size
+#define BLOCK_SIZE ((M0) * (K0))
+
+ // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (K0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+ // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (K0) * (V0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (K0)
+#endif // defined(INTERLEAVE)
+
+ const int x = GET_SPATIAL_IDX(0, 1, 0); // K
+ const int y = GET_SPATIAL_IDX(1, 1, 0); // M
+ const int z = GET_SPATIAL_IDX(2, 1, 0); // Batch size
+
+ const int xi = x * K0;
+ const int yi = y * M0;
+
+ const int xo = x * BLOCK_SIZE * V0 + (y % V0) * OUTPUT_OFFSET_X;
+ const int yo = (y / V0);
+
+ // src_stride_z is expressed as M * src_stride_y, to handle case where reinterpret_input_as_3d=true
+ src_offset_first_element_in_bytes += yi * src_stride_y + z * M * src_stride_y;
+ dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
+
+ TILE(DATA_TYPE, M0, K0, in);
+
+ // Initialize the input tile to zero
+ LOOP_UNROLLING(int, _i, 0, 1, M0,
+ {
+ in[_i].v = 0;
+ });
+
+ bool x_cond = (xi + K0 >= src_w) && (PARTIAL_K0 != 0);
+ bool y_cond = (yi + M0 >= M) && (PARTIAL_M0 != 0);
+ // Load input tile
+ TILE(uint, M0, 1, in_indirect_y);
+ LOOP_UNROLLING(int, _i, 0, 1, M0,
+ {
+ in_indirect_y[_i].v = _i;
+
+ });
+#if PARTIAL_M0 != 0
+ if(y_cond)
+ {
+ T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, PARTIAL_M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
+ }
+ else
+#endif // PARTIAL_M0 != 0
+ {
+ T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
+ }
+
+ // Store output tile
+ TILE(uint, M0, 1, dst_indirect_y);
+ LOOP_UNROLLING(int, _i, 0, 1, M0,
+ {
+ dst_indirect_y[_i].v = _i;
+ });
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, K0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in, dst_indirect_y);
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+#endif // defined(RESHAPE_LHS_NT)
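For reference, a plausible build-option string for gemm_reshape_lhs_matrix_nt, assembled from the compile-time options listed in its doc comment. The values are examples only; the library derives the real ones from the tensor shapes, and M and V0 are passed as kernel arguments rather than defines:

/* Hypothetical example; not taken from the library's kernel configuration code. */
const char *reshape_lhs_nt_build_opts =
    "-DRESHAPE_LHS_NT "
    "-DDATA_TYPE=float -DSRC_WIDTH=16 -DSRC_HEIGHT=16 "
    "-DM0=4 -DK0=4 -DPARTIAL_M0=0 -DPARTIAL_K0=0 "
    "-DINTERLEAVE"; /* only when the M0xK0 blocks are interleaved */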
+
+#if defined(RESHAPE_LHS_T)
+/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix into blocks of size M0xK0 and stores each one (transposed) in
+ * the output matrix, unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
+ * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
+ * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_M0 (e.g. -DPARTIAL_M0=1)
+ * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_K0 (e.g. -DPARTIAL_K0=1)
+ * @note Only the following values for M0, K0 and V0 are supported:
+ * M0: 2,3,4,8,16
+ * K0: 2,3,4,8,16
+ * V0: greater than 0
+ * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the depth dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: All
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the depth dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] M The size of the height dimension of the source tensor, affected by reinterpret_input_as_3d
+ * @param[in] V0 The number of blocks to place on the same row. It must be greater than 0
+ */
+__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_T(src, BUFFER),
+ TENSOR3D_T(dst, BUFFER),
+ const int M,
+ const int V0)
+{
+ // Block size
+#define BLOCK_SIZE ((M0) * (K0))
+
+ // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (M0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+ // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (M0) * (V0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (M0)
+#endif // defined(INTERLEAVE)
+
+ const int x = GET_SPATIAL_IDX(0, 1, 0); // K
+ const int y = GET_SPATIAL_IDX(1, 1, 0); // M
+ const int z = GET_SPATIAL_IDX(2, 1, 0); // Batch size
+
+ const int xi = x * K0;
+ const int yi = y * M0;
+
+ const int xo = x * BLOCK_SIZE * V0 + ((y % V0) * OUTPUT_OFFSET_X);
+ const int yo = (y / V0);
+
+ // src_stride_z is expressed as M * src_stride_y, to handle case where reinterpret_input_as_3d=true
+ src_offset_first_element_in_bytes += yi * src_stride_y + z * M * src_stride_y;
+ dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
+
+ TILE(DATA_TYPE, M0, K0, in);
+ TILE(DATA_TYPE, K0, M0, in_tr);
+
+ // Initialize the tile to zero
+ LOOP_UNROLLING(int, _i, 0, 1, M0,
+ {
+ in[_i].v = 0;
+ });
+
+ // Load input tile
+ bool x_cond = (xi + K0 >= src_w) && (PARTIAL_K0 != 0);
+ bool y_cond = (yi + M0 >= M) && (PARTIAL_M0 != 0);
+
+ TILE(uint, M0, 1, in_indirect_y);
+ LOOP_UNROLLING(int, _i, 0, 1, M0,
+ {
+ in_indirect_y[_i].v = _i;
+
+ });
+#if PARTIAL_M0 != 0
+ if(y_cond)
+ {
+ T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, PARTIAL_M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
+ }
+ else
+#endif // PARTIAL_M0 != 0
+ {
+ T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
+ }
+ // Transpose input tile
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, k0, 0, 1, K0,
+ {
+ in_tr[k0].s[m0] = in[m0].s[k0];
+ })
+ });
+
+ TILE(uint, K0, 1, dst_indirect_y);
+ LOOP_UNROLLING(int, _i, 0, 1, K0,
+ {
+ dst_indirect_y[_i].v = _i;
+ });
+
+ // Store output tile
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, K0, M0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in_tr, dst_indirect_y);
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+#endif // defined(RESHAPE_LHS_T)
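A minimal sketch of the tile transpose performed in gemm_reshape_lhs_matrix_t above, using plain C arrays instead of the TILE/LOOP_UNROLLING macros. M0=2, K0=4 and the literal values are assumptions chosen purely for illustration:

void transpose_tile_example(void)
{
    float in[2][4] = { { 0.f, 1.f, 2.f, 3.f }, { 4.f, 5.f, 6.f, 7.f } };
    float in_tr[4][2];
    for (int m0 = 0; m0 < 2; ++m0)
    {
        for (int k0 = 0; k0 < 4; ++k0)
        {
            in_tr[k0][m0] = in[m0][k0]; /* row k0 of in_tr gathers element k0 of every input row */
        }
    }
    /* in_tr is now { {0,4}, {1,5}, {2,6}, {3,7} }: each stored row holds one K-column. */
    (void)in_tr;
}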
+
+#if defined(RESHAPE_RHS_NT)
+/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix into blocks of size K0xN0 and stores each one (not transposed) in
+ * the output matrix, unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
+ * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
+ * @note Only the following values for K0, N0 and H0 are supported:
+ * N0: 2,3,4,8,16
+ * K0: 1,2,3,4,8,16
+ * H0: greater than 0
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the depth dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: All
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the depth dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] H0 The number of blocks to place on the same row. It must be greater than 0
+ */
+__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_T(src, BUFFER),
+ TENSOR3D_T(dst, BUFFER),
+ const int H0)
+{
+ // Block size
+#define BLOCK_SIZE ((K0) * (N0))
+
+ // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (N0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+ // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (N0) * (H0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (N0)
+#endif // defined(INTERLEAVE)
+
+ const int x = GET_SPATIAL_IDX(0, 1, 0);
+ const int y = GET_SPATIAL_IDX(1, 1, 0);
+ const int z = GET_SPATIAL_IDX(2, 1, 0);
+
+ const int xi = x * N0;
+ const int yi = y * K0;
+
+ const int xo = y * BLOCK_SIZE * H0 + (x % H0) * OUTPUT_OFFSET_X;
+ const int yo = (x / H0);
+
+ src_offset_first_element_in_bytes += yi * src_stride_y + z * src_stride_z;
+ dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
+
+ TILE(DATA_TYPE, K0, N0, in);
+
+ // Initialize the tile to zero
+ for(int i = 0; i < K0; ++i)
+ {
+ in[i].v = 0;
+ }
+
+ // Load input tile
+ for(int i = 0; i < K0; ++i)
+ {
+ if(yi + i < src_h)
+ {
+ in[i].v = V_LOAD(DATA_TYPE, N0, BUFFER, src, xi, i, src_stride_y);
+ }
+ }
+
+ TILE(uint, K0, 1, dst_indirect_y);
+ for(int i = 0; i < K0; ++i)
+ {
+ dst_indirect_y[i].v = i;
+ }
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, K0, N0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in, dst_indirect_y);
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+#endif // defined(RESHAPE_RHS_NT)
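A worked example of the destination offsets computed in gemm_reshape_rhs_matrix_nt above, assuming K0=4 and N0=4 as compile-time defines, INTERLEAVE enabled, and H0=2 passed as the kernel argument. All values are illustrative:

enum { K0_EX = 4, N0_EX = 4, H0_EX = 2 };
enum { BLOCK_SIZE_EX = K0_EX * N0_EX, OUTPUT_OFFSET_X_EX = N0_EX }; /* INTERLEAVE case */
/* For the work-item at x = 3, y = 1: */
int xo_ex = 1 * BLOCK_SIZE_EX * H0_EX + (3 % H0_EX) * OUTPUT_OFFSET_X_EX; /* = 36 elements */
int yo_ex = 3 / H0_EX;                                                    /* = 1 row      */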
+
+#if defined(RESHAPE_RHS_T)
+/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix into blocks of size K0xN0 and stores each one (transposed) in
+ * the output matrix, unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
+ * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
+ * @note The option -DTRANSPOSE must be passed at compile time.
+ * @note Only the following values for K0, N0 and H0 are supported:
+ * N0: 2,3,4,8,16
+ * K0: 2,3,4,8,16
+ * H0: greater than 0
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the depth dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: All
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the depth dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] H0 The number of blocks to place on the same row. It must be greater than 0.
+ */
+__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_T(src, BUFFER),
+ TENSOR3D_T(dst, BUFFER),
+ const int H0)
+{
+ // Block size
+#define BLOCK_SIZE ((K0) * (N0))
+
+ // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (K0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+ // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (K0) * (H0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (K0)
+#endif // defined(INTERLEAVE)
+
+ const int x = GET_SPATIAL_IDX(0, 1, 0);
+ const int y = GET_SPATIAL_IDX(1, 1, 0);
+ const int z = GET_SPATIAL_IDX(2, 1, 0);
+
+ const int xi = x * N0;
+ const int yi = y * K0;
+
+ const int xo = y * BLOCK_SIZE * H0 + (x % H0) * OUTPUT_OFFSET_X;
+ const int yo = (x / H0);
+
+ src_offset_first_element_in_bytes += yi * src_stride_y + z * src_stride_z;
+ dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
+
+ TILE(DATA_TYPE, K0, N0, in);
+ TILE(DATA_TYPE, N0, K0, in_tr);
+
+ // Initialize the tile to zero
+ for(int i = 0; i < K0; ++i)
+ {
+ in[i].v = 0;
+ }
+
+ // Load input tile
+ for(int i = 0; i < K0; ++i)
+ {
+ if(yi + i < src_h)
+ {
+ in[i].v = V_LOAD(DATA_TYPE, N0, BUFFER, src, xi, i, src_stride_y);
+ }
+ }
+
+ // Transpose input tile
+ for(int k0 = 0; k0 < K0; ++k0)
+ {
+ for(int n0 = 0; n0 < N0; ++n0)
+ {
+ in_tr[n0].s[k0] = in[k0].s[n0];
+ }
+ }
+
+ TILE(uint, N0, 1, dst_indirect_y);
+ for(int i = 0; i < N0; ++i)
+ {
+ dst_indirect_y[i].v = i;
+ }
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, N0, K0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in_tr, dst_indirect_y);
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+
+#endif // defined(RESHAPE_RHS_T) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index 88a7665ee..bfb693e37 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -392,18 +392,18 @@
#define vload_partial_12(DATA, OFFSET, PTR) \
vload_partial_8(DATA.s01234567, OFFSET, PTR); \
vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
-
+// For vload_partial_{13,14,15}, an 8-element swizzle is passed, because vector sizes of 5, 6 and 7 are not supported
#define vload_partial_13(DATA, OFFSET, PTR) \
vload_partial_8(DATA.s01234567, OFFSET, PTR); \
- vload_partial_5(DATA.s89ABC, OFFSET, PTR + 8);
+ vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
#define vload_partial_14(DATA, OFFSET, PTR) \
vload_partial_8(DATA.s01234567, OFFSET, PTR); \
- vload_partial_6(DATA.s89ABCD, OFFSET, PTR + 8);
+ vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
#define vload_partial_15(DATA, OFFSET, PTR) \
vload_partial_8(DATA.s01234567, OFFSET, PTR); \
- vload_partial_7(DATA.s89ABCDE, OFFSET, PTR + 8);
+ vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
#define vload_partial_16(DATA, OFFSET, PTR) \
DATA = vload16(OFFSET, PTR);
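To make the helpers.h fix above concrete: the old macros built 5/6/7-component swizzles such as DATA.s89ABC, which are not valid OpenCL C vector types, so a full 8-component swizzle is now handed to the partial loaders. An illustrative expansion, assuming a float16 DATA (not part of the header itself):

/* vload_partial_13(v, 0, ptr) now expands, step by step, to:                 */
/*   vload_partial_8(v.s01234567, 0, ptr);       -- first 8 elements          */
/*   vload_partial_5(v.s89ABCDEF, 0, ptr + 8);   -- intent: only the leading  */
/*                                                  five lanes (s8..sC) of    */
/*                                                  the swizzle are written   */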
diff --git a/src/core/CL/cl_kernels/nchw/direct_convolution.cl b/src/core/CL/cl_kernels/nchw/direct_convolution.cl
new file mode 100644
index 000000000..866f62da9
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/direct_convolution.cl
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "helpers_asymm.h"
+
+/** This kernel performs a direct convolution to convolve the low three dimensions.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
+ * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
+ * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are added to the convolution, -DHAS_BIAS has to be passed at compile time.
+ * @note The output quantization multiplier must be passed at compile time using -DOUTPUT_MULTIPLIER e.g. -DOUTPUT_MULTIPLIER=1234
+ * @note The output quantization shift must be passed at compile time using -DOUTPUT_SHIFT e.g. -DOUTPUT_SHIFT=4
+ * @note The input offset quantization parameter must be passed at compile time using -DINPUT_OFFSET e.g. -DINPUT_OFFSET=3
+ * @note The weights offset quantization parameter must be passed at compile time using -DWEIGHTS_OFFSET e.g. -DWEIGHTS_OFFSET=3
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+ VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+ unsigned int weights_stride_w)
+{
+ const int id0 = get_global_id(0);
+ const int id1 = get_global_id(1);
+ const int id2 = get_global_id(2);
+
+ const int x_coords = (id0 * STRIDE_X) - PAD_LEFT;
+ const int y_coords = (id1 * STRIDE_Y) - PAD_TOP;
+
+ const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
+
+ __global uchar *src_addr = (__global uchar *)(src_ptr + src_offset_first_element_in_bytes);
+ __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + id2 * weights_stride_w);
+ __global uchar *dst_addr = (__global uchar *)dst_ptr + dst_offset_first_element_in_bytes + x_offs + id1 * dst_stride_y + id2 * dst_stride_z;
+
+#ifdef IS_QUANTIZED
+ int acc_value = 0;
+#else /* IS_QUANTIZED */
+ DATA_TYPE acc_value = 0;
+#endif /* IS_QUANTIZED */
+ for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
+ {
+ for(int y = 0; y < WEI_HEIGHT; ++y)
+ {
+ for(int x = 0; x < WEI_WIDTH; ++x)
+ {
+ const int idx_x = (x_coords + x);
+ const int idx_y = (y_coords + y);
+ if((idx_x >= 0 && idx_x < SRC_WIDTH) && (idx_y >= 0 && idx_y < SRC_HEIGHT))
+ {
+ const int weight_offset = x + (WEI_HEIGHT * y);
+ const int input_offset = idx_x + SRC_WIDTH * idx_y;
+#ifdef IS_QUANTIZED
+ int weight = convert_int(*((__global DATA_TYPE *)weights_addr + weight_offset));
+ int input = convert_int(*((__global DATA_TYPE *)src_addr + input_offset));
+ acc_value += (input + INPUT_OFFSET) * (weight + WEIGHTS_OFFSET);
+#else /* IS_QUANTIZED */
+ DATA_TYPE weight = *((__global DATA_TYPE *)weights_addr + weight_offset);
+ DATA_TYPE input = *((__global DATA_TYPE *)src_addr + input_offset);
+ acc_value += input * weight;
+#endif /* IS_QUANTIZED */
+ }
+ }
+ }
+ src_addr += src_stride_z;
+ weights_addr += weights_stride_z;
+ }
+
+#ifdef HAS_BIAS
+
+ Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#ifdef IS_QUANTIZED
+ int bias = *((__global int *)(vector_offset(&biases, id2)));
+#else /* IS_QUANTIZED */
+ DATA_TYPE bias = *((__global DATA_TYPE *)(vector_offset(&biases, id2)));
+#endif /* IS_QUANTIZED */
+ acc_value += bias;
+
+#endif /* defined(HAS_BIAS) */
+
+#ifdef IS_QUANTIZED
+
+#if OUTPUT_SHIFT < 0
+ acc_value = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc_value, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 1);
+#else // OUTPUT_SHIFT < 0
+ acc_value = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(acc_value, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 1);
+#endif // OUTPUT_SHIFT < 0
+ acc_value = acc_value + OUTPUT_OFFSET;
+#endif /* IS_QUANTIZED */
+
+ *(__global DATA_TYPE *)dst_addr = CONVERT_SAT(acc_value, DATA_TYPE);
+} \ No newline at end of file
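A plausible float32 build-option string for the unified direct_convolution_nchw kernel above, collecting the -D options its body references. The values are illustrative assumptions, not the ones the library would compute; the quantized path additionally needs -DIS_QUANTIZED together with the INPUT_OFFSET, WEIGHTS_OFFSET, OUTPUT_OFFSET, OUTPUT_MULTIPLIER and OUTPUT_SHIFT options:

/* Hypothetical example only. */
const char *direct_conv_nchw_f32_opts =
    "-DDATA_TYPE=float -DDATA_SIZE=32 "
    "-DSRC_WIDTH=32 -DSRC_HEIGHT=32 "
    "-DWEI_WIDTH=3 -DWEI_HEIGHT=3 -DWEIGHTS_DEPTH=16 "
    "-DSTRIDE_X=1 -DSTRIDE_Y=1 -DPAD_LEFT=1 -DPAD_TOP=1 "
    "-DVEC_SIZE=1 -DVEC_SIZE_LEFTOVER=1 "
    "-DHAS_BIAS";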
diff --git a/src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl b/src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl
deleted file mode 100644
index 8ab2d1d4e..000000000
--- a/src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl
+++ /dev/null
@@ -1,316 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#undef CONVERT_SAT
-
-#define ADD_OP(a, b) ((a) + (b))
-#define MUL_OP(a, b) ((a) * (b))
-#define CONVERT_SAT(a, b) ((a))
-
-#if defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
-
-#if STRIDE_X == 3
-#define INPUT_PIXEL_STR(data_size) extract_input_stride3_##data_size
-#define INPUT_PIXEL(data_size) INPUT_PIXEL_STR(data_size)
-#elif STRIDE_X == 2
-#define INPUT_PIXEL(data_size) extract_input_stride2
-#elif STRIDE_X == 1
-#define INPUT_PIXEL(data_size) extract_input_stride1
-#else /* STRIDE_X not equals 1, 2 or 3 */
-#error "Only support strides 1, 2 and 3"
-#endif /* STRIDE_X == 3 */
-
-/** Extracts a 1D horizontal vector from the input tensor with stride as 1.
- *
- * @param[in] input_pixel Pointer to the first pixel.
- *
- * @return extracted input values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_pixel)
-{
- return vload8(0, input_pixel);
-}
-
-/** Extracts a 1D horizontal vector from the input tensor with stride as 2.
- *
- * @param[in] input_pixel Pointer to the first pixel.
- *
- * @return extracted input values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global const DATA_TYPE *input_pixel)
-{
- VEC_DATA_TYPE(DATA_TYPE, 16)
- temp = vload16(0, input_pixel);
- return temp.s02468ace;
-}
-
-/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 32-bit data size.
- *
- * @param[in] input_pixel Pointer to the first pixel.
- *
- * @return extracted input values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_32(__global const DATA_TYPE *input_pixel)
-{
- VEC_DATA_TYPE(DATA_TYPE, 4)
- temp1 = vload4(0, input_pixel);
- VEC_DATA_TYPE(DATA_TYPE, 4)
- temp2 = vload4(0, input_pixel + 6);
- VEC_DATA_TYPE(DATA_TYPE, 4)
- temp3 = vload4(0, input_pixel + 12);
- VEC_DATA_TYPE(DATA_TYPE, 4)
- temp4 = vload4(0, input_pixel + 18);
- return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s03, temp2.s03, temp3.s03, temp4.s03);
-}
-
-/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 16-bit data size.
- *
- * @param[in] input_pixel Pointer to the first pixel.
- *
- * @return extracted input values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_16(__global const DATA_TYPE *input_pixel)
-{
- VEC_DATA_TYPE(DATA_TYPE, 8)
- temp1 = vload8(0, input_pixel);
- VEC_DATA_TYPE(DATA_TYPE, 8)
- temp2 = vload8(0, input_pixel + 8);
- VEC_DATA_TYPE(DATA_TYPE, 8)
- temp3 = vload8(0, input_pixel + 16);
- return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s036, temp2.s147, temp3.s25);
-}
-
-/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 8-bit data size.
- *
- * @param[in] input_pixel Pointer to the first pixel.
- *
- * @return extracted input values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_8(__global const DATA_TYPE *input_pixel)
-{
- VEC_DATA_TYPE(DATA_TYPE, 16)
- temp1 = vload16(0, input_pixel);
- VEC_DATA_TYPE(DATA_TYPE, 16)
- temp2 = vload16(0, input_pixel + 12);
- return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s0369, temp2.s0369);
-}
-
-/** This kernel performs a direct convolution to convolve the low three dimensions.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
- * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
- * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
- * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- */
-__kernel void direct_convolution1x1(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
-#ifdef HAS_BIAS
- VECTOR_DECLARATION(biases),
-#endif /* defined(HAS_BIAS) */
- unsigned int weights_stride_w)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-#endif /* defined(HAS_BIAS) */
-
- VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)
- values = 0;
-
- const uint z_index = get_global_id(2);
-
- weights.ptr += z_index * weights_stride_w;
- for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
- {
- DATA_TYPE weight = *(__global DATA_TYPE *)weights.ptr;
- VEC_DATA_TYPE(DATA_TYPE, 8)
- input_pixel = INPUT_PIXEL(DATA_SIZE)((__global DATA_TYPE *)src.ptr);
- values = ADD_OP(values, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))weight, input_pixel));
- src.ptr += src_stride_z;
- weights.ptr += weights_stride_z;
- }
-
-#ifdef HAS_BIAS
- values = ADD_OP(values, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, z_index))));
-#endif /* defined(HAS_BIAS) */
-
- vstore8(CONVERT_SAT(values, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
-
-#if defined(WEIGHTS_DEPTH)
-
-#define CONVOLUTION1x1_BIFROST(acc, src, weight_value) \
- ({ \
- acc.s0 = mad(src.s0, weight_value, acc.s0); \
- acc.s1 = mad(src.s1, weight_value, acc.s1); \
- acc.s2 = mad(src.s2, weight_value, acc.s2); \
- acc.s3 = mad(src.s3, weight_value, acc.s3); \
- })
-
-/** An optimized direct convolution 1x1 OpenCL kernel for Bifrost architectures when the data type is F32
- *
- * @note This OpenCL kernel works only with stride_x and stride_y equal to 1
- * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
- * @note In case biases, -DHAS_BIAS must to be passed at compile
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- */
-__kernel void direct_convolution1x1_f32_bifrost(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
-#ifdef HAS_BIAS
- VECTOR_DECLARATION(biases),
-#endif /* defined(HAS_BIAS) */
- unsigned int weights_stride_w)
-{
- // Get the kernel index
- const int kernel_index = get_global_id(2);
-
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- float4 acc0 = 0.0f;
- float4 acc1 = 0.0f;
- float4 acc2 = 0.0f;
- float4 acc3 = 0.0f;
-
- __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);
- __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
-
- for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)
- {
- // Load the weights
- float weight = *((__global float *)weights_addr);
-
- // Load values from row0 of input tensor
- float4 src0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
- float4 src1 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
- float4 src2 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
- float4 src3 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
-
- CONVOLUTION1x1_BIFROST(acc0, src0, weight);
- CONVOLUTION1x1_BIFROST(acc1, src1, weight);
- CONVOLUTION1x1_BIFROST(acc2, src2, weight);
- CONVOLUTION1x1_BIFROST(acc3, src3, weight);
-
- src_addr += src_stride_z;
- weights_addr += weights_stride_z;
- }
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-
- float bias = (float) * ((__global float *)(vector_offset(&biases, kernel_index)));
-
- acc0.s0 += bias;
- acc0.s1 += bias;
- acc0.s2 += bias;
- acc0.s3 += bias;
- acc1.s0 += bias;
- acc1.s1 += bias;
- acc1.s2 += bias;
- acc1.s3 += bias;
- acc2.s0 += bias;
- acc2.s1 += bias;
- acc2.s2 += bias;
- acc2.s3 += bias;
- acc3.s0 += bias;
- acc3.s1 += bias;
- acc3.s2 += bias;
- acc3.s3 += bias;
-#endif /* defined(HAS_BIAS) */
-
- vstore4(acc0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
- vstore4(acc1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
- vstore4(acc2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
- vstore4(acc3, 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
-}
-#endif // defined(WEIGHTS_DEPTH)
diff --git a/src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl b/src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl
deleted file mode 100644
index 811df053c..000000000
--- a/src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl
+++ /dev/null
@@ -1,291 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#undef CONVERT_SAT
-
-#define ADD_OP(a, b) ((a) + (b))
-#define MUL_OP(a, b) ((a) * (b))
-#define CONVERT_SAT(a, b) ((a))
-
-#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
-
-#if STRIDE_X == 1
-#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr)
-#elif STRIDE_X == 2 /* STRIDE_X == 1 */
-#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr)
-#else /* STRIDE_X not equals 1 or 2 */
-#error "STRIDE_X larger than 2 is not supported"
-#endif /* STRIDE_X == 2 */
-
-#define CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, 3) \
- weights_values0 = vload3(0, weights_row_ptr); \
- VEC_DATA_TYPE(DATA_TYPE, 8) \
- src0 = vload8(0, src_row_ptr); \
- VEC_DATA_TYPE(DATA_TYPE, 2) \
- src1 = vload2(0, src_row_ptr + 8); \
- \
- acc = ADD_OP(acc, MUL_OP(src0, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0)); \
- acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \
- acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \
- })
-
-#define CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, 3) \
- weights_values0 = vload3(0, weights_row_ptr); \
- VEC_DATA_TYPE(DATA_TYPE, 16) \
- src0 = vload16(0, src_row_ptr); \
- DATA_TYPE src1 = *(src_row_ptr + 16); \
- \
- acc = ADD_OP(acc, MUL_OP(src0.even, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0)); \
- acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \
- acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \
- })
-
-/** This kernel performs a direct convolution to convolve the low three dimensions.
- *
- * @note This OpenCL kernel works with stride_x = 1 and 2
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
- * @note If biases are used then -DHAS_BIAS has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- */
-__kernel void direct_convolution3x3(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
-#ifdef HAS_BIAS
- VECTOR_DECLARATION(biases),
-#endif /* defined(HAS_BIAS) */
- unsigned int weights_stride_w)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)
- values0 = 0;
-
- __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);
- __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
-
- const int kernel_index = get_global_id(2);
- weights_addr += kernel_index * weights_stride_w;
-
- for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
- {
- CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y));
- CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
- CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
-
- src_addr += src_stride_z;
- weights_addr += weights_stride_z;
- }
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-
- values0 = ADD_OP(values0, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, kernel_index))));
-#endif /* defined(HAS_BIAS) */
-
- vstore8(CONVERT_SAT(values0, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr);
-}
-#endif //defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
-
-#if defined(WEIGHTS_DEPTH)
-
-#define CONVOLUTION1x3_BIFROST(acc, src0, src1, weights_row0) \
- ({ \
- acc.s0 = mad(src0.s0, weights_row0.s0, acc.s0); \
- acc.s1 = mad(src0.s1, weights_row0.s0, acc.s1); \
- acc.s2 = mad(src0.s2, weights_row0.s0, acc.s2); \
- acc.s3 = mad(src0.s3, weights_row0.s0, acc.s3); \
- acc.s0 = mad(src0.s1, weights_row0.s1, acc.s0); \
- acc.s1 = mad(src0.s2, weights_row0.s1, acc.s1); \
- acc.s2 = mad(src0.s3, weights_row0.s1, acc.s2); \
- acc.s3 = mad(src1.s0, weights_row0.s1, acc.s3); \
- acc.s0 = mad(src0.s2, weights_row0.s2, acc.s0); \
- acc.s1 = mad(src0.s3, weights_row0.s2, acc.s1); \
- acc.s2 = mad(src1.s0, weights_row0.s2, acc.s2); \
- acc.s3 = mad(src1.s1, weights_row0.s2, acc.s3); \
- })
-
-/** An optimized direct convolution 3x3 OpenCL kernel for Bifrost architectures when the data type is F32
- *
- * @note This OpenCL kernel works only with stride_x and stride_y equal to 1
- * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
- * @note In case biases, -DHAS_BIAS must to be passed at compile
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- */
-__kernel void direct_convolution3x3_f32_bifrost(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
-#ifdef HAS_BIAS
- VECTOR_DECLARATION(biases),
-#endif /* defined(HAS_BIAS) */
- unsigned int weights_stride_w)
-{
- // Get the kernel index
- const int kernel_index = get_global_id(2);
-
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- float4 values0 = 0;
- float4 values1 = 0;
- float4 values2 = 0;
-
- __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);
- __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
-
- // Note: Since each work-item computes 4x3 elements, we need to load 5 rows from the input tensor
-
- for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)
- {
- // Load the weights
- float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
- float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
- float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
- float4 src0;
- float2 src1;
-
- // Load values from row0 of input tensor
- src0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
- src1 = vload2(0, (__global float *)(src_addr + 0 * src_stride_y) + 4);
-
- CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row0);
-
- // Load values from row1 of input tensor
- src0 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
- src1 = vload2(0, (__global float *)(src_addr + 1 * src_stride_y) + 4);
-
- // Accumulate
- CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row1);
- CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row0);
-
- // Load values from row2 of input tensor
- src0 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
- src1 = vload2(0, (__global float *)(src_addr + 2 * src_stride_y) + 4);
-
- // Accumulate
- CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row2);
- CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row1);
- CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row0);
-
- // Load values from row3 of input tensor
- src0 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
- src1 = vload2(0, (__global float *)(src_addr + 3 * src_stride_y) + 4);
-
- // Accumulate
- CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row2);
- CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row1);
-
- // Row4
- src0 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y));
- src1 = vload2(0, (__global float *)(src_addr + 4 * src_stride_y) + 4);
-
- // Accumulate
- CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row2);
-
- src_addr += src_stride_z;
- weights_addr += weights_stride_z;
- }
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-
- float bias = (float) * ((__global float *)(vector_offset(&biases, kernel_index)));
-
- values0 += (float4)bias;
- values1 += (float4)bias;
- values2 += (float4)bias;
-#endif /* defined(HAS_BIAS) */
-
- vstore4(values0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
- vstore4(values1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
- vstore4(values2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
-}
-#endif // defined(WEIGHTS_DEPTH)
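For reference, the Bifrost 3x3 kernel deleted above computes a 4x3 output tile per work-item, so each of the five input rows it loads per channel plane is reused by up to three output rows. A minimal scalar sketch of the per-row accumulation performed by the CONVOLUTION1x3_BIFROST macro (illustrative only; the function name below is not a library symbol):

    /* Four horizontally adjacent outputs share a 6-element input window
     * (vload4 + vload2 in the kernel) and one 3-tap weight row; the kernel
     * issues the same products as mad() instructions. */
    void conv1x3_row(float acc[4], const float src[6], const float w[3])
    {
        for (int i = 0; i < 4; ++i)
            for (int k = 0; k < 3; ++k)
                acc[i] += src[i + k] * w[k];
    }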
diff --git a/src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl b/src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl
deleted file mode 100644
index 59d668f0b..000000000
--- a/src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl
+++ /dev/null
@@ -1,313 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#undef CONVERT_SAT
-
-#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
-
-#if STRIDE_X == 1
-#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr)
-#elif STRIDE_X == 2 /* STRIDE_X == 1 */
-#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr)
-#else /* STRIDE_X not equals 1 or 2 */
-#error "STRIDE_X larger than 2 is not supported"
-#endif /* STRIDE_X == 2 */
-
-#define CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, 4) \
- weights_values0 = vload4(0, weights_row_ptr); \
- DATA_TYPE weights_value1 = *(weights_row_ptr + 4); \
- VEC_DATA_TYPE(DATA_TYPE, 8) \
- src0 = vload8(0, src_row_ptr); \
- VEC_DATA_TYPE(DATA_TYPE, 4) \
- src1 = vload4(0, src_row_ptr + 8); \
- \
- acc += src0 * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s345, src0.s67, src1.s012) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s45, src0.s67, src1.s0123) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \
- })
-
-#define CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, 4) \
- weights_values0 = vload4(0, weights_row_ptr); \
- DATA_TYPE weights_value1 = *(weights_row_ptr + 4); \
- VEC_DATA_TYPE(DATA_TYPE, 16) \
- src0 = vload16(0, src_row_ptr); \
- VEC_DATA_TYPE(DATA_TYPE, 4) \
- src1 = vload4(0, src_row_ptr + 16); \
- acc += src0.even * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \
- \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s3579, src0.sBDF, src1.s1) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s468a, src0.sCE, src1.s02) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \
- })
-
-/** This kernel performs a direct convolution to convolve the lower three dimensions.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
- * @note If biases are used then -DHAS_BIAS has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- */
-__kernel void direct_convolution5x5(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
-#ifdef HAS_BIAS
- VECTOR_DECLARATION(biases),
-#endif /* defined(HAS_BIAS) */
- unsigned int weights_stride_w)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- VEC_DATA_TYPE(DATA_TYPE, 8)
- values0 = 0;
-
- __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);
- __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
-
- const int kernel_index = get_global_id(2);
- weights_addr += kernel_index * weights_stride_w;
-
- for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
- {
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)src_addr, (__global DATA_TYPE *)weights_addr);
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y));
-
- src_addr += src_stride_z;
- weights_addr += weights_stride_z;
- }
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-
- values0 += (VEC_DATA_TYPE(DATA_TYPE, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, kernel_index)));
-#endif /* defined(HAS_BIAS) */
-
- vstore8(values0, 0, (__global DATA_TYPE *)dst.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
-
-#if defined(WEIGHTS_DEPTH)
-
-#define CONVOLUTION1x5_BIFROST(acc, src0, weights_row00, weights_row01) \
- ({ \
- acc.s0 = mad(src0.s0, weights_row00.s0, acc.s0); \
- acc.s1 = mad(src0.s1, weights_row00.s0, acc.s1); \
- acc.s2 = mad(src0.s2, weights_row00.s0, acc.s2); \
- acc.s3 = mad(src0.s3, weights_row00.s0, acc.s3); \
- acc.s0 = mad(src0.s1, weights_row00.s1, acc.s0); \
- acc.s1 = mad(src0.s2, weights_row00.s1, acc.s1); \
- acc.s2 = mad(src0.s3, weights_row00.s1, acc.s2); \
- acc.s3 = mad(src0.s4, weights_row00.s1, acc.s3); \
- acc.s0 = mad(src0.s2, weights_row00.s2, acc.s0); \
- acc.s1 = mad(src0.s3, weights_row00.s2, acc.s1); \
- acc.s2 = mad(src0.s4, weights_row00.s2, acc.s2); \
- acc.s3 = mad(src0.s5, weights_row00.s2, acc.s3); \
- acc.s0 = mad(src0.s3, weights_row00.s3, acc.s0); \
- acc.s1 = mad(src0.s4, weights_row00.s3, acc.s1); \
- acc.s2 = mad(src0.s5, weights_row00.s3, acc.s2); \
- acc.s3 = mad(src0.s6, weights_row00.s3, acc.s3); \
- acc.s0 = mad(src0.s4, weights_row01, acc.s0); \
- acc.s1 = mad(src0.s5, weights_row01, acc.s1); \
- acc.s2 = mad(src0.s6, weights_row01, acc.s2); \
- acc.s3 = mad(src0.s7, weights_row01, acc.s3); \
- })
-
-/** An optimized direct convolution 5x5 OpenCL kernel for Bifrost architectures when the data type is F32
- *
- * @note This OpenCL kernel works only with stride_x and stride_y equal to 1
- * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
- * @note If biases are used then -DHAS_BIAS has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- */
-__kernel void direct_convolution5x5_f32_bifrost(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
-#ifdef HAS_BIAS
- VECTOR_DECLARATION(biases),
-#endif /* defined(HAS_BIAS) */
- unsigned int weights_stride_w)
-{
- // Get the kernel index
- const int kernel_index = get_global_id(2);
-
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- float4 values0 = 0.0f;
- float4 values1 = 0.0f;
-
- __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);
- __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
-
- // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor
-
- for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)
- {
- // Load the weights from row0 and row1
- float4 weights_row00 = vload4(0, (__global float *)(weights_addr + 0 * weights_stride_y));
- float weights_row01 = *((__global float *)(weights_addr + 0 * weights_stride_y) + 4);
- float4 weights_row10 = vload4(0, (__global float *)(weights_addr + 1 * weights_stride_y));
- float weights_row11 = *((__global float *)(weights_addr + 1 * weights_stride_y) + 4);
- float8 src0;
-
- // Load values from row0 of input tensor
- src0 = vload8(0, (__global float *)(src_addr + 0 * src_stride_y));
-
- // Accumulate
- CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01);
-
- // Load values from row1 of input tensor
- src0 = vload8(0, (__global float *)(src_addr + 1 * src_stride_y));
-
- // Accumulate
- CONVOLUTION1x5_BIFROST(values0, src0, weights_row10, weights_row11);
- CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01);
-
- // Load values from row2 of input tensor
- src0 = vload8(0, (__global float *)(src_addr + 2 * src_stride_y));
-
- // Load weights from row2
- weights_row00 = vload4(0, (__global float *)(weights_addr + 2 * weights_stride_y));
- weights_row01 = *((__global float *)(weights_addr + 2 * weights_stride_y) + 4);
-
- // Accumulate
- CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01);
- CONVOLUTION1x5_BIFROST(values1, src0, weights_row10, weights_row11);
-
- // Load values from row3 of input tensor
- src0 = vload8(0, (__global float *)(src_addr + 3 * src_stride_y));
-
- // Load weights from row3
- weights_row10 = vload4(0, (__global float *)(weights_addr + 3 * weights_stride_y));
- weights_row11 = *((__global float *)(weights_addr + 3 * weights_stride_y) + 4);
-
- // Accumulate
- CONVOLUTION1x5_BIFROST(values0, src0, weights_row10, weights_row11);
- CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01);
-
- // Load values from row4 of input tensor
- src0 = vload8(0, (__global float *)(src_addr + 4 * src_stride_y));
-
- // Load weights from row4
- weights_row00 = vload4(0, (__global float *)(weights_addr + 4 * weights_stride_y));
- weights_row01 = *((__global float *)(weights_addr + 4 * weights_stride_y) + 4);
-
- CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01);
- CONVOLUTION1x5_BIFROST(values1, src0, weights_row10, weights_row11);
-
- // Load values from row5 of input tensor
- src0 = vload8(0, (__global float *)(src_addr + 5 * src_stride_y));
-
- // Accumulate
- CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01);
-
- src_addr += src_stride_z;
- weights_addr += weights_stride_z;
- }
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-
- float4 bias = (float4) * ((__global float *)(vector_offset(&biases, kernel_index)));
-
- values0 += bias;
- values1 += bias;
-#endif /* defined(HAS_BIAS) */
-
- vstore4(values0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
- vstore4(values1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
-}
-#endif // defined(WEIGHTS_DEPTH)
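Similarly, the Bifrost 5x5 kernel removed above computes a 4x2 output tile per work-item, which is why it reads six input rows (2 + 5 - 1) per channel plane and keeps two weight rows in flight at a time. A scalar sketch of what one CONVOLUTION1x5_BIFROST invocation accumulates (the helper name is illustrative, not a library symbol):

    /* Four adjacent outputs, one 8-element input row (vload8 in the kernel)
     * and a 5-tap weight row (float4 plus a scalar in the kernel). */
    void conv1x5_row(float acc[4], const float src[8], const float w[5])
    {
        for (int i = 0; i < 4; ++i)
            for (int k = 0; k < 5; ++k)
                acc[i] += src[i + k] * w[k];
    }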
diff --git a/src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl b/src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl
deleted file mode 100644
index b80d4f587..000000000
--- a/src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl
+++ /dev/null
@@ -1,308 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers_asymm.h"
-
-#undef CONVERT_SAT_STR
-#undef CONVERT_SAT
-
-#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
-
-#define CONVERT_SAT_STR(x, type) (convert_##type##8_sat((x)))
-#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
-
-#if KERNEL_SIZE == 9
-
-#if STRIDE_X == 1
-#define CONVOLUTION1x9(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x9_STRIDE1(acc, src_row_ptr, weights_row_ptr)
-#elif STRIDE_X == 2
-#define CONVOLUTION1x9(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x9_STRIDE2(acc, src_row_ptr, weights_row_ptr)
-#else /* STRIDE_X not equals 1 or 2 */
-#error "STRIDE_X larger than 2 is not supported"
-#endif /* STRIDE_X */
-
-#define CONVOLUTION1x9_STRIDE1(acc, src_row_ptr, weights_row_ptr) \
- ({ \
- int8 weights_values0 = convert_int8(vload8(0, weights_row_ptr)); \
- int weights_value1 = convert_int(*(weights_row_ptr + 8)); \
- int16 src0 = convert_int16(vload16(0, src_row_ptr)); \
- acc += (src0.lo + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s1234, src0.s5678) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s2345, src0.s6789) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s3456, src0.s789A) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s4567, src0.s89AB) + INPUT_OFFSET) * ((int8)weights_values0.s4 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s5678, src0.s9ABC) + INPUT_OFFSET) * ((int8)weights_values0.s5 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s6789, src0.sABCD) + INPUT_OFFSET) * ((int8)weights_values0.s6 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s789A, src0.sBCDE) + INPUT_OFFSET) * ((int8)weights_values0.s7 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s89AB, src0.sCDEF) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \
- })
-
-#define CONVOLUTION1x9_STRIDE2(acc, src_row_ptr, weights_row_ptr) \
- ({ \
- int8 weights_values0 = convert_int8(vload8(0, weights_row_ptr)); \
- int weights_value1 = convert_int(*(weights_row_ptr + 8)); \
- int16 src0 = convert_int16(vload16(0, src_row_ptr)); \
- int8 src1 = convert_int8(vload8(0, src_row_ptr + 16)); \
- acc += (src0.even + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s1357, src0.s9BDF) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s2468, src0.sACE, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s3579, src0.sBDF, src1.s1) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s468A, src0.sCE, src1.s02) + INPUT_OFFSET) * ((int8)weights_values0.s4 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s579B, src0.sDF, src1.s13) + INPUT_OFFSET) * ((int8)weights_values0.s5 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s68AC, src0.sE, src1.s024) + INPUT_OFFSET) * ((int8)weights_values0.s6 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s79BD, src0.sF, src1.s135) + INPUT_OFFSET) * ((int8)weights_values0.s7 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s8ACE, src1.s0246) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \
- })
-
-#elif KERNEL_SIZE == 5
-
-#if STRIDE_X == 1
-#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr)
-#elif STRIDE_X == 2
-#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr)
-#else /* STRIDE_X not equals 1 or 2 */
-#error "STRIDE_X larger than 2 is not supported"
-#endif /* STRIDE_X */
-
-#define CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) \
- ({ \
- int4 weights_values0 = convert_int4(vload4(0, weights_row_ptr)); \
- int weights_value1 = convert_int(*(weights_row_ptr + 4)); \
- int8 src0 = convert_int8(vload8(0, src_row_ptr)); \
- int4 src1 = convert_int4(vload4(0, src_row_ptr + 8)); \
- acc += (src0 + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s1234, src0.s567, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s234, src0.s567, src1.s01) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s345, src0.s67, src1.s012) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s45, src0.s67, src1.s0123) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \
- })
-
-#define CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) \
- ({ \
- int4 weights_values0 = convert_int4(vload4(0, weights_row_ptr)); \
- int weights_value1 = convert_int(*(weights_row_ptr + 4)); \
- int16 src0 = convert_int16(vload16(0, src_row_ptr)); \
- int4 src1 = convert_int4(vload4(0, src_row_ptr + 16)); \
- acc += (src0.even + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s1357, src0.s9BDF) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s2468, src0.sACE, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s3579, src0.sBDF, src1.s1) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s468a, src0.sCE, src1.s02) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \
- })
-
-#elif KERNEL_SIZE == 3
-
-#if STRIDE_X == 1
-#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr)
-#elif STRIDE_X == 2
-#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr)
-#else /* STRIDE_X not equals 1 or 2 */
-#error "STRIDE_X larger than 2 is not supported"
-#endif /* STRIDE_X */
-
-#define CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) \
- ({ \
- int3 weights_values0 = convert_int3(vload3(0, weights_row_ptr)); \
- int8 src0 = convert_int8(vload8(0, src_row_ptr)); \
- int2 src1 = convert_int2(vload2(0, src_row_ptr + 8)); \
- acc += (src0 + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s1234, src0.s567, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s234, src0.s567, src1.s01) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \
- })
-
-#define CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) \
- ({ \
- int3 weights_values0 = convert_int3(vload3(0, weights_row_ptr)); \
- int16 src0 = convert_int16(vload16(0, src_row_ptr)); \
- int src1 = convert_int(*(src_row_ptr + 16)); \
- acc += (src0.even + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s1357, src0.s9BDF) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s2468, src0.sACE, src1) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \
- })
-
-#elif KERNEL_SIZE == 1
-
-#if STRIDE_X == 3
-#define INPUT_VALUE extract_input_stride3
-#elif STRIDE_X == 2
-#define INPUT_VALUE extract_input_stride2
-#elif STRIDE_X == 1
-#define INPUT_VALUE extract_input_stride1
-
-#else /* STRIDE_X not equals 1, 2 or 3 */
-#error "Only support strides 1, 2 and 3"
-#endif /* STRIDE_X */
-
-/** Extracts a 1D horizontal vector from the input tensor with a stride of 1.
- *
- * @param[in] input_value Pointer to the first value.
- *
- * @return extracted input values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_value)
-{
- return vload8(0, input_value);
-}
-
-/** Extracts a 1D horizontal vector from the input tensor with a stride of 2.
- *
- * @param[in] input_value Pointer to the first value.
- *
- * @return extracted input values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global const DATA_TYPE *input_value)
-{
- VEC_DATA_TYPE(DATA_TYPE, 16)
- temp = vload16(0, input_value);
- return temp.s02468ace;
-}
-
-/** Extracts a 1D horizontal vector from the input tensor with a stride of 3 and 8-bit data size.
- *
- * @param[in] input_value Pointer to the first value.
- *
- * @return extracted input values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3(__global const DATA_TYPE *input_value)
-{
- VEC_DATA_TYPE(DATA_TYPE, 16)
- temp1 = vload16(0, input_value);
- VEC_DATA_TYPE(DATA_TYPE, 16)
- temp2 = vload16(0, input_value + 12);
- return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s0369, temp2.s0369);
-}
-
-#else /* KERNEL_SIZE not equals 1, 3 , 5, 9 */
-#error "Only kernel sizes 1, 3, 5 and 9 are supported"
-#endif /* KERNEL_SIZE */
-
-/** This kernel performs a direct convolution to convolve the lower three dimensions.
- *
- * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
- * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
- * @note If biases are used then -DHAS_BIAS has to be passed at compile time
- * @note The output quantization multiplier must be passed at compile time using -DOUTPUT_MULTIPLIER e.g. -DOUTPUT_MULTIPLIER=1234
- * @note The output quantization shift must be passed at compile time using -DOUTPUT_SHIFT e.g. -DOUTPUT_SHIFT=4
- * @note The input offset quantization parameter must be passed at compile time using -DINPUT_OFFSET e.g. -DINPUT_OFFSET=3
- * @note The weights offset quantization parameter must be passed at compile time using -DWEIGHTS_OFFSET e.g. -DWEIGHTS_OFFSET=3
- * @note The destination offset quantization parameter must be passed at compile time using -DOUTPUT_OFFSET e.g. -DOUTPUT_OFFSET=3
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: S32
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- */
-__kernel void direct_convolution_quantized(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
-#ifdef HAS_BIAS
- VECTOR_DECLARATION(biases),
-#endif /* defined(HAS_BIAS) */
- unsigned int weights_stride_w)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- int8 values0 = 0;
-
- __global DATA_TYPE *weights_addr = (__global DATA_TYPE *)tensor3D_offset(&weights, 0, 0, 0);
- __global DATA_TYPE *src_addr = (__global DATA_TYPE *)offset(&src, 0, 0);
-
- const int kernel_index = get_global_id(2);
- weights_addr += kernel_index * weights_stride_w;
-
- for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
- {
-#if KERNEL_SIZE == 9
- CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y));
- CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
- CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
- CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y));
- CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y));
- CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 5 * weights_stride_y));
- CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 6 * weights_stride_y));
- CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 7 * weights_stride_y));
- CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 8 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 8 * weights_stride_y));
-#elif KERNEL_SIZE == 5
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)src_addr, (__global DATA_TYPE *)weights_addr);
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y));
-#elif KERNEL_SIZE == 3
- CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y));
- CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
- CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
-#elif KERNEL_SIZE == 1
- int weight = convert_int(*(__global DATA_TYPE *)weights_addr);
- int8 input_value = convert_int8(INPUT_VALUE((__global DATA_TYPE *)src_addr));
- values0 += (input_value + INPUT_OFFSET) * ((int8)weight + WEIGHTS_OFFSET);
-#endif /* (KERNEL_SIZE == 1) || (KERNEL_SIZE == 3) || (KERNEL_SIZE == 5) */
-
- src_addr += src_stride_z;
- weights_addr += weights_stride_z;
- }
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
- __global int *bias_addr = ((__global int *)(vector_offset(&biases, kernel_index)));
- values0 += (int8)(*bias_addr);
-#endif /* defined(HAS_BIAS) */
-
-#if OUTPUT_SHIFT < 0
- values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
-#else // OUTPUT_SHIFT < 0
- values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
-#endif // OUTPUT_SHIFT < 0
- values0 = values0 + OUTPUT_OFFSET;
-
- vstore8(CONVERT_SAT(values0, DATA_TYPE), 0, (__global DATA_TYPE *)dst.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
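The quantized kernel removed above accumulates (src + INPUT_OFFSET) * (weight + WEIGHTS_OFFSET) products in 32-bit integers, adds the S32 bias, rescales with OUTPUT_MULTIPLIER/OUTPUT_SHIFT, adds OUTPUT_OFFSET and saturates to 8 bits. A simplified scalar sketch of that epilogue, assuming OUTPUT_SHIFT >= 0 and ignoring the exact rounding of the ASYMM_MULT_BY_QUANT_MULTIPLIER_* helpers (the function name is illustrative):

    #include <stdint.h>

    /* multiplier is assumed to be a Q0.31 fixed-point value; the real macros
     * use rounding-doubling high multiplies rather than a plain shift. */
    uint8_t requantize(int32_t acc, int32_t multiplier, int shift, int32_t output_offset)
    {
        int64_t v = ((int64_t)acc * (int64_t)multiplier) >> (31 + shift);
        v += output_offset;
        if (v < 0)   v = 0;
        if (v > 255) v = 255;
        return (uint8_t)v;
    }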
diff --git a/src/core/CL/cl_kernels/nchw/remap.cl b/src/core/CL/cl_kernels/nchw/remap.cl
deleted file mode 100644
index fab88a168..000000000
--- a/src/core/CL/cl_kernels/nchw/remap.cl
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "warp_helpers.h"
-
-#ifndef DEPTH_OUT
-/** Performs a remapping of an input image to an output image, given two remapping images, using nearest-neighbour interpolation.
- *
- * This kernel performs remapping with this method of pixel coordinate translation:
- * out(x,y) = in(mapx(x,y), mapy(x,y));
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
- * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32.
- * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in] mapx_step_y mapx_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in] mapy_ptr Pointer to the y remapping image. Supported data types: F32.
- * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in] width Width of the input image
- * @param[in] height Height of the input image
- */
-__kernel void remap_nearest_neighbour_nchw(
- IMAGE_DECLARATION(in),
- IMAGE_DECLARATION(out),
- IMAGE_DECLARATION(mapx),
- IMAGE_DECLARATION(mapy),
- const float width,
- const float height)
-{
- Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
- Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx);
- Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy);
-
- float4 mapx_coords = vload4(0, (__global float *)mapx.ptr);
- float4 mapy_coords = vload4(0, (__global float *)mapy.ptr);
- float8 map_coords = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1,
- mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3);
-
- vstore4(read_texels4(&in, convert_int8(clamp_to_border(map_coords, width, height))), 0, out.ptr);
-}
-
-/** Performs a remapping of an input image to an output image, given two remapping images, using bilinear interpolation.
- *
- * This kernel performs remapping with this method of pixel coordinate translation:
- * out(x,y) = in(mapx(x,y), mapy(x,y));
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
- * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32.
- * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in] mapx_step_y mapx_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in] mapy_ptr Pointer to the y remapping image. Supported data types: F32.
- * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in] width Width of the input image
- * @param[in] height Height of the input image
- */
-__kernel void remap_bilinear_nchw(
- IMAGE_DECLARATION(in),
- IMAGE_DECLARATION(out),
- IMAGE_DECLARATION(mapx),
- IMAGE_DECLARATION(mapy),
- const float width,
- const float height)
-{
- Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
- Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx);
- Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy);
-
- float4 mapx_coords = vload4(0, (__global float *)mapx.ptr);
- float4 mapy_coords = vload4(0, (__global float *)mapy.ptr);
- float8 map_coords = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1,
- mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3);
-
- vstore4(bilinear_interpolate(&in, clamp_to_border(map_coords, width, height), width, height), 0, out.ptr);
-}
-#endif // DEPTH_OUT \ No newline at end of file
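The remap kernels deleted above implement out(x, y) = in(mapx(x, y), mapy(x, y)) with either nearest-neighbour or bilinear sampling, clamping the looked-up coordinates against the input width and height. A rough scalar sketch of the nearest-neighbour path, with the clamping and rounding details simplified (helper name illustrative):

    /* U8 single-channel image stored row-major; mx/my come from the F32 map images. */
    unsigned char remap_nearest(const unsigned char *in, int width, int height,
                                float mx, float my)
    {
        int x = (int)(mx + 0.5f);
        int y = (int)(my + 0.5f);
        if (x < 0) x = 0; else if (x > width - 1)  x = width - 1;
        if (y < 0) y = 0; else if (y > height - 1) y = height - 1;
        return in[y * width + x];
    }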
diff --git a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
index 75a7a0f00..f1b422a68 100644
--- a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
+++ b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
@@ -103,9 +103,9 @@
*/
//! @endcond
__kernel void direct_convolution_nhwc(
- TENSOR4D(src, SRC_TENSOR_TYPE),
- TENSOR4D(dst, DST_TENSOR_TYPE),
- TENSOR4D(wei, WEI_TENSOR_TYPE)
+ TENSOR4D_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_T(dst, DST_TENSOR_TYPE),
+ TENSOR4D_T(wei, WEI_TENSOR_TYPE)
#if defined(HAS_BIAS)
,
VECTOR_DECLARATION(bia)
@@ -116,12 +116,12 @@ __kernel void direct_convolution_nhwc(
// In case of dynamic tensor support, the following dimensions should be passed as function argument.
#define _IWEI_WIDTH WEI_WIDTH
#define _IWEI_HEIGHT WEI_HEIGHT
-#define _ISRC_WIDTH SRC_WIDTH
-#define _ISRC_HEIGHT SRC_HEIGHT
-#define _ISRC_CHANNELS SRC_CHANNELS
-#define _IDST_WIDTH DST_WIDTH
-#define _IDST_HEIGHT DST_HEIGHT
-#define _IDST_CHANNELS DST_CHANNELS
+#define _ISRC_WIDTH src_w
+#define _ISRC_HEIGHT src_h
+#define _ISRC_CHANNELS src_c
+#define _IDST_WIDTH dst_w
+#define _IDST_HEIGHT dst_h
+#define _IDST_CHANNELS dst_c
#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
// If quantized, the output tile has to be quantized first before being stored to global memory
@@ -192,7 +192,7 @@ __kernel void direct_convolution_nhwc(
// We voluntarily use SRC_CHANNELS rather than _ISRC_CHANNELS
// This #if directive should be removed in case of dynamic tensor support
-#if((SRC_CHANNELS % K0) != 0)
+#if defined(LEFTOVER_LOOP)
// Left-over accumulations
for(; k < _ISRC_CHANNELS; ++k)
{
@@ -220,7 +220,7 @@ __kernel void direct_convolution_nhwc(
++ck;
}
-#endif // ((SRC_CHANNELS % K0) != 0)
+#endif // defined(LEFTOVER_LOOP)
}
// Offset correction required for the quantized asymmetric computation
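The direct_convolution_nhwc change above switches the kernel interface from compile-time tensor shapes (TENSOR4D plus -DSRC_WIDTH and friends) to runtime arguments (TENSOR4D_T with src_w, src_h, src_c, dst_w, dst_h, dst_c), and replaces the (SRC_CHANNELS % K0) != 0 test with a LEFTOVER_LOOP define that the host is expected to set when the channel count is not a multiple of K0. A sketch of the resulting loop structure, with the vector body elided (names illustrative):

    /* The main loop consumes k0 channels per step; the tail loop corresponds to
     * the code that is only compiled in when LEFTOVER_LOOP is defined. */
    void accumulate_channels(int src_c, int k0, int leftover_loop)
    {
        int k = 0;
        for (; k <= src_c - k0; k += k0)
        {
            /* vectorised accumulation over k0 channels */
        }
        if (leftover_loop)
        {
            for (; k < src_c; ++k)
            {
                /* scalar accumulation over the remaining channels */
            }
        }
    }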
diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
index 58f01fa3e..4f57a81e7 100644
--- a/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
+++ b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
@@ -26,7 +26,7 @@
#include "helpers.h"
#include "tile_helpers.h"
-#if defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_WIDTH) && defined(DST_HEIGHT) && defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
+#if defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
//! @cond Doxygen_Suppress
/** OpenCL kernel to compute the depthwise convolution for floating-point data types (F32/F16)
*
@@ -37,10 +37,6 @@
* @note The convolution strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2)
* @note The convolution dilations must be passed at compile time using -DDILATION_X and -DDILATION_Y (e.g. -DDILATION_X=2, -DDILATION_Y=2)
* @note The spatial dimensions of the weights must be passed at compile time using -DWEI_WIDTH and -DWEI_HEIGHT (e.g. -DWEI_WIDTH=9, -DWEI_HEIGHT=9)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64)
- * @note The channels of the source tensor must be passed at compile time using -DSRC_CHANNELS (e.g. -DSRC_CHANNELS=64)
- * @note The channels of the destination tensor must be passed at compile time using -DDST_CHANNELS (e.g. -DDDST_CHANNELS=64)
* @note The tensor type ("BUFFER" or "IMAGE") of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
* @note The tensor type ("BUFFER" or "IMAGE") of the weights tensor must be passed at compile time using -DWEI_TENSOR_TYPE (e.g. -DWEI_TENSOR_TYPE=BUFFER)
* @note The tensor type ("BUFFER" or "IMAGE") of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
@@ -57,24 +53,22 @@
* @note The number of rows to read from the src tensor must be passed at compile time using -DM0_A (e.g., -DM0_A=3). M0_A must be equal to WEI_WIDTH + (M0 - 1)
*
* @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_c The size of the channels dimension of the source tensor
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the batches dimension of the source tensor
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_c The size of the channels dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
* @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr
* @param[in] wei_stride_x Stride of the weights tensor in X dimension (in bytes)
@@ -93,8 +87,8 @@
*/
//! @endcond
__kernel void dwc_native_fp_nhwc(
- TENSOR4D(src, SRC_TENSOR_TYPE),
- TENSOR4D(dst, DST_TENSOR_TYPE),
+ TENSOR4D_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_T(dst, DST_TENSOR_TYPE),
TENSOR4D(wei, WEI_TENSOR_TYPE)
#if defined(HAS_BIAS)
,
@@ -102,15 +96,10 @@ __kernel void dwc_native_fp_nhwc(
#endif // defined(HAS_BIAS)
)
{
- // All the tensor dimensions are passed at compile time.
+ // Only the weight tensor dimensions are passed at compile time.
// In case of dynamic tensor support, the following dimensions should be passed as function argument.
#define _IWEI_WIDTH WEI_WIDTH
#define _IWEI_HEIGHT WEI_HEIGHT
-#define _ISRC_WIDTH SRC_WIDTH
-#define _ISRC_HEIGHT SRC_HEIGHT
-#define _IDST_WIDTH DST_WIDTH
-#define _IDST_HEIGHT DST_HEIGHT
-#define _IDST_CHANNELS DST_CHANNELS
#define _IM0_A M0_A // _IWEI_WIDTH + (M0 - 1) Rows tile A (If M0 != 1, the tiles overlap of 1 element on the X dimension)
#define _IN0_A N0 // Cols tile A
#define _IM0_B _IWEI_WIDTH // Rows tile B
@@ -120,12 +109,12 @@ __kernel void dwc_native_fp_nhwc(
const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
const int xo = GET_SPATIAL_IDX(1, M0, 0); // WIDTH
#if defined(BATCHED_EXECUTION)
- const int yo = GET_SPATIAL_IDX(2, 1, 0) % _IDST_HEIGHT; // HEIGHT
- const int bout = GET_SPATIAL_IDX(2, 1, 0) / _IDST_HEIGHT; // BATCH SIZE IDX
-#else // defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0) % dst_h; // HEIGHT
+ const int bout = GET_SPATIAL_IDX(2, 1, 0) / dst_h; // BATCH SIZE IDX
+#else // defined(BATCHED_EXECUTION)
const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT
const int bout = 0; // BATCH SIZE IDX
-#endif // defined(BATCHED_EXECUTION)
+#endif // defined(BATCHED_EXECUTION)
int xi = xo * STRIDE_X;
int yi = yo * STRIDE_Y;
@@ -145,7 +134,7 @@ __kernel void dwc_native_fp_nhwc(
c[i].v = 0;
})
-#if _IWEI_HEIGHT <= 5
+#if _IWEI_HEIGHT < 5
LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
#else // _IWEI_HEIGHT <= 5
for(int yk = 0; yk < _IWEI_HEIGHT; yk++)
@@ -159,7 +148,7 @@ __kernel void dwc_native_fp_nhwc(
})
// Load tile from the src tensor (TILE A)
- T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, DILATION_X, 1, _IBOUNDARY_CHECK, a);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, cout, src_w, src_h, DILATION_X, 1, _IBOUNDARY_CHECK, a);
TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b);
@@ -176,7 +165,7 @@ __kernel void dwc_native_fp_nhwc(
})
})
}
-#if _IWEI_HEIGHT <= 5
+#if _IWEI_HEIGHT < 5
)
#endif // _IWEI_HEIGHT <= 5
@@ -199,7 +188,7 @@ __kernel void dwc_native_fp_nhwc(
{
LOOP_UNROLLING(int, m0, 0, 1, M0,
{
- int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1);
+ int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
VSTORE_PARTIAL(N0, PARTIAL_N0)
(c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
})
@@ -208,11 +197,11 @@ __kernel void dwc_native_fp_nhwc(
{
LOOP_UNROLLING(int, m0, 0, 1, M0,
{
- int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1);
+ int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
VSTORE(N0)
(c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
})
}
}
}
-#endif // defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_WIDTH) && defined(DST_HEIGHT) && defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) \ No newline at end of file
+#endif // defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) \ No newline at end of file
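dwc_native_fp_nhwc undergoes the same migration: the SRC_WIDTH/SRC_HEIGHT/DST_WIDTH/DST_HEIGHT/DST_CHANNELS defines disappear and the source and destination tensors are passed as TENSOR4D_T, so their sizes (src_w, src_h, dst_w, dst_h, ...) arrive as kernel arguments. One visible consequence is the batched-execution index split, which now divides by the runtime dst_h instead of a compile-time height; a trivial sketch of that split (function name illustrative):

    /* The third global-id dimension packs the batch index and the output row together. */
    void split_gid2(int gid2, int dst_h, int *yo, int *bout)
    {
        *yo   = gid2 % dst_h;  /* output row within the image */
        *bout = gid2 / dst_h;  /* batch index */
    }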
diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
index 1bc58b6e2..ec2593af7 100644
--- a/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
+++ b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
@@ -44,7 +44,7 @@
#define T_LOAD_MULTIPLIERS_SHIFT(QUANTIZATION_TYPE) T_LOAD_MULTIPLIERS_SHIFT_STR(QUANTIZATION_TYPE)
#define T_LOAD_MULTIPLIERS_SHIFT_STR(QUANTIZATION_TYPE) T_LOAD_MULTIPLIERS_SHIFT_##QUANTIZATION_TYPE()
-#if defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_WIDTH) && defined(DST_HEIGHT) && defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
+#if defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
//! @cond Doxygen_Suppress
/** OpenCL kernel to compute the depthwise convolution for quantized data types
*
@@ -54,10 +54,6 @@
* @note The convolution strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2)
* @note The convolution dilations must be passed at compile time using -DDILATION_X and -DDILATION_Y (e.g. -DDILATION_X=2, -DDILATION_Y=2)
* @note The spatial dimensions of the weights must be passed at compile time using -DWEI_WIDTH and -DWEI_HEIGHT (e.g. -DWEI_WIDTH=9, -DWEI_HEIGHT=9)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64)
- * @note The channels of the source tensor must be passed at compile time using -DSRC_CHANNELS (e.g. -DSRC_CHANNELS=64)
- * @note The channels of the destination tensor must be passed at compile time using -DDST_CHANNELS (e.g. -DDDST_CHANNELS=64)
* @note The tensor type ("BUFFER" or "IMAGE") of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
* @note The tensor type ("BUFFER" or "IMAGE") of the weights tensor must be passed at compile time using -DWEI_TENSOR_TYPE (e.g. -DWEI_TENSOR_TYPE=BUFFER)
* @note The tensor type ("BUFFER" or "IMAGE") of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
@@ -79,24 +75,22 @@
* @note The number of rows to read from the src tensor must be passed at compile time using -DM0_A (e.g., -DM0_A=3). M0_A must be equal to WEI_WIDTH + (M0 - 1)
*
* @param[in] src_ptr Pointer to the source tensor. Supported data type: QSYMM8/QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_c The size of the channels dimension of the source tensor
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the batches dimension of the source tensor
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_c The size of the channels dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
* @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr
* @param[in] wei_stride_x Stride of the weights tensor in X dimension (in bytes)
@@ -123,8 +117,8 @@
*/
//! @endcond
__kernel void dwc_native_quantized_nhwc(
- TENSOR4D(src, SRC_TENSOR_TYPE),
- TENSOR4D(dst, DST_TENSOR_TYPE),
+ TENSOR4D_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_T(dst, DST_TENSOR_TYPE),
TENSOR4D(wei, WEI_TENSOR_TYPE),
VECTOR_DECLARATION(dst_multipliers),
VECTOR_DECLARATION(dst_shifts)
@@ -134,15 +128,10 @@ __kernel void dwc_native_quantized_nhwc(
#endif // defined(HAS_BIAS)
)
{
- // All the tensor dimensions are passed at compile time.
+ // Only the weight tensor dimensions are passed at compile time.
// In case of dynamic tensor support, the following dimensions should be passed as function argument.
#define _IWEI_WIDTH WEI_WIDTH
#define _IWEI_HEIGHT WEI_HEIGHT
-#define _ISRC_WIDTH SRC_WIDTH
-#define _ISRC_HEIGHT SRC_HEIGHT
-#define _IDST_WIDTH DST_WIDTH
-#define _IDST_HEIGHT DST_HEIGHT
-#define _IDST_CHANNELS DST_CHANNELS
#define _IM0_A M0_A // _IWEI_WIDTH + (M0 - 1) Rows tile A (If M0 != 1, the tiles overlap of 1 element on the X dimension)
#define _IN0_A N0 // Cols tile A
#define _IM0_B _IWEI_WIDTH // Rows tile B
@@ -152,12 +141,12 @@ __kernel void dwc_native_quantized_nhwc(
const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
const int xo = GET_SPATIAL_IDX(1, M0, 0); // WIDTH
#if defined(BATCHED_EXECUTION)
- const int yo = GET_SPATIAL_IDX(2, 1, 0) % _IDST_HEIGHT; // HEIGHT
- const int bout = GET_SPATIAL_IDX(2, 1, 0) / _IDST_HEIGHT; // BATCH SIZE IDX
-#else // defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0) % dst_h; // HEIGHT
+ const int bout = GET_SPATIAL_IDX(2, 1, 0) / dst_h; // BATCH SIZE IDX
+#else // defined(BATCHED_EXECUTION)
const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT
const int bout = 0; // BATCH SIZE IDX
-#endif // defined(BATCHED_EXECUTION)
+#endif // defined(BATCHED_EXECUTION)
int xi = xo * STRIDE_X;
int yi = yo * STRIDE_Y;
@@ -191,7 +180,7 @@ __kernel void dwc_native_quantized_nhwc(
})
// Load tile from the src tensor (TILE A)
- T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, DILATION_X, 1, _IBOUNDARY_CHECK, a);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, cout, src_w, src_h, DILATION_X, 1, _IBOUNDARY_CHECK, a);
TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b);
@@ -265,7 +254,7 @@ __kernel void dwc_native_quantized_nhwc(
{
LOOP_UNROLLING(int, m0, 0, 1, M0,
{
- int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1);
+ int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
VSTORE_PARTIAL(N0, PARTIAL_N0)
(cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
})
@@ -274,11 +263,11 @@ __kernel void dwc_native_quantized_nhwc(
{
LOOP_UNROLLING(int, m0, 0, 1, M0,
{
- int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1);
+ int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
VSTORE(N0)
(cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
})
}
}
}
-#endif // defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_WIDTH) && defined(DST_HEIGHT) && defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
\ No newline at end of file
+#endif // defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
\ No newline at end of file
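In both the float and quantized stores the destination width is now the runtime dst_w argument, and the partial store still walks the M0 output columns in reverse while clamping against it, so out-of-range columns are overwritten by the in-range ones. A small sketch of that clamp, assuming plain C and example values only:

    #include <stdio.h>

    /* Sketch, assuming plain C and example values: the store loop visits the
     * M0 output columns in reverse and clamps each index to dst_w - 1, so the
     * columns falling outside the destination are written first to the last
     * valid column and then overwritten by the in-range iterations. */
    int main(void)
    {
        const int M0 = 4, xo = 6, dst_w = 8;                  /* tile start and runtime width */
        for (int m0 = 0; m0 < M0; ++m0)
        {
            const int x      = xo + M0 - 1 - m0;              /* 9, 8, 7, 6        */
            const int xi_out = x < dst_w - 1 ? x : dst_w - 1; /* min(x, dst_w - 1) */
            printf("m0=%d -> xi_out=%d\n", m0, xi_out);       /* 7, 7, 7, 6        */
        }
        return 0;
    }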
diff --git a/src/core/CL/cl_kernels/nhwc/im2col.cl b/src/core/CL/cl_kernels/nhwc/im2col.cl
index ac00c1128..a23e943fa 100644
--- a/src/core/CL/cl_kernels/nhwc/im2col.cl
+++ b/src/core/CL/cl_kernels/nhwc/im2col.cl
@@ -22,23 +22,11 @@
* SOFTWARE.
*/
#include "helpers.h"
-#if defined(DATA_TYPE) && defined(ELEMENT_SIZE)
-
-#if ELEMENT_SIZE == 1
-#define COND_DATA_TYPE char
-#elif ELEMENT_SIZE == 2
-#define COND_DATA_TYPE short
-#elif ELEMENT_SIZE == 4
-#define COND_DATA_TYPE int
-#else // ELEMENT_SIZE
-#error "Element size not support"
-#endif // ELEMENT_SIZE
-
-#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE)
#define VECTOR_N VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-#define COND_N VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE)
+#define COND_N SIGNED_INT_VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+#if defined(IM2COL_3X3) || defined(IM2COL_9X9)
/** Store a 1x9 row or a 3x3 block in a boundary-aware manner to avoid paddings in the channel dimension
* @name IM2COL1X9_NHWC_STORE
*
@@ -109,7 +97,9 @@
VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
(DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH);
/** @}*/
+#endif // defined(IM2COL_3X3) || defined(IM2COL_9X9)
+#if defined(IM2COL_3X3)
/** This kernel performs im2col when the kernel size is 3x3 and the data layout is NHWC
*
* @note This kernel computes VECTOR_SIZE elements
@@ -269,7 +259,9 @@ __kernel void im2col3x3_nhwc(
}
#endif // HAS_BIAS
}
+#endif // defined(IM2COL_3X3)
+#if defined(IM2COL_9X9)
#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
#define IM2COL1x9(i) \
({ \
@@ -416,7 +408,9 @@ __kernel void im2col9x9_nhwc(
}
#endif // HAS_BIAS
}
+#endif // defined(IM2COL_9X9)
+#if defined(IM2COL_GENERIC)
/** This opencl kernel performs a generic im2col implementation when the data layout is NHWC
*
* @note This kernel computes VECTOR_SIZE elements
@@ -463,19 +457,20 @@ __kernel void im2col_generic_nhwc(
const int batch = get_global_id(2); // batch size
// Calculate input indices
- const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
- const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
+ const int xi = (yo % CONVOLVED_WIDTH) * STRIDE_X;
+ const int yi = (yo / (int)CONVOLVED_WIDTH) * STRIDE_Y;
// Get input and output address
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
+ const int stride_x = ch * sizeof(DATA_TYPE);
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + stride_x + batch * (int)src_stride_w;
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + stride_x + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
int i = 0;
for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
{
// Clamp yi_coord
int yi_coord = yi + yk * DILATION_Y - (int)PAD_TOP;
- yi_coord = CLAMP(yi_coord, (int)0, (int)(SRC_HEIGHT - 1));
+ yi_coord = clamp(yi_coord, (int)0, (int)(SRC_HEIGHT - 1));
// Out-of-bound condition for Y
int y_border_condition = ((yi + yk * DILATION_Y - (int)PAD_TOP) < (int)0) || ((yi + yk * DILATION_Y - (int)PAD_TOP) >= (int)SRC_HEIGHT);
@@ -484,7 +479,7 @@ __kernel void im2col_generic_nhwc(
{
// Clamp xi_coord
int xi_coord = (xi + xk * DILATION_X - (int)PAD_LEFT);
- xi_coord = CLAMP(xi_coord, (int)0, (int)(SRC_WIDTH - 1));
+ xi_coord = clamp(xi_coord, (int)0, (int)(SRC_WIDTH - 1));
// Out-of-bound condition for X
int x_border_condition = ((xi + xk * DILATION_X - (int)PAD_LEFT) < (int)0) || ((xi + xk * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);
@@ -528,5 +523,4 @@ __kernel void im2col_generic_nhwc(
}
#endif // HAS_BIAS
}
-#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE)
-#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE)
\ No newline at end of file
+#endif // defined(IM2COL_GENERIC)
\ No newline at end of file
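The NHWC im2col kernels are now guarded per variant, so the caller requests exactly one of -DIM2COL_3X3, -DIM2COL_9X9 or -DIM2COL_GENERIC, and the boundary-condition vector type comes from SIGNED_INT_VEC_DATA_TYPE instead of the removed ELEMENT_SIZE dispatch. A rough stand-in for the logic the removed preprocessor chain implemented, assuming plain C purely for illustration:

    #include <stdio.h>

    /* Rough sketch, assuming a plain C stand-in for the removed #if chain
     * that mapped ELEMENT_SIZE to a signed comparison type; the new code
     * derives an equivalent type from DATA_TYPE via SIGNED_INT_VEC_DATA_TYPE. */
    static const char *cond_type_from_element_size(int element_size)
    {
        switch (element_size)
        {
            case 1:  return "char";  /* 1-byte types, e.g. QASYMM8 */
            case 2:  return "short"; /* 2-byte types, e.g. F16     */
            case 4:  return "int";   /* 4-byte types, e.g. F32     */
            default: return NULL;    /* was: #error "Element size not support" */
        }
    }

    int main(void)
    {
        printf("%s\n", cond_type_from_element_size(4)); /* prints: int */
        return 0;
    }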
diff --git a/src/core/CL/cl_kernels/nhwc/remap.cl b/src/core/CL/cl_kernels/nhwc/remap.cl
deleted file mode 100644
index 0b629fe6c..000000000
--- a/src/core/CL/cl_kernels/nhwc/remap.cl
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "warp_helpers.h"
-
-#ifdef DEPTH_OUT
-/** Performs a remapping of an input image to an output given two remapping image using nearest neighbor as interpolation.
- * Also applies constant border value, "border_val", if "CONSTANT_BORDER" is set.
- *
- * This kernel performs remapping with this method of pixel coordinate translation:
- * out(x,y) = in(mapx(x,y), mapy(x,y));
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8,F16.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8,F16.
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
- * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32.
- * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in] mapx_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in] mapy_ptr Pointer to the x remapping image. Supported data types: F32.
- * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in] width Width of the input image
- * @param[in] height Height of the input image
- * @param[in] border_val Value to use for border around input tensor when in CONSTANT border is selected
- */
-__kernel void remap_nearest_neighbour_nhwc(
- TENSOR4D_DECLARATION(in),
- TENSOR4D_DECLARATION(out),
- TENSOR4D_DECLARATION(mapx),
- TENSOR4D_DECLARATION(mapy),
- const float width,
- const float height
-#ifdef CONSTANT_BORDER
- ,
- const DATA_TYPE border_val
-#endif // CONSTANT_BORDER
-)
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
- Tensor4D mapx = CONVERT_TO_TENSOR4D_STRUCT(mapx, DEPTH_OUT);
- Tensor4D mapy = CONVERT_TO_TENSOR4D_STRUCT(mapy, DEPTH_OUT);
-
- float mapx_coord = (float) * (__global float *)mapx.ptr;
- float mapy_coord = (float) * (__global float *)mapy.ptr;
-
-#ifdef CONSTANT_BORDER
- if(mapx_coord < 0 || mapx_coord > width - 1 || mapy_coord < 0 || mapy_coord > height - 1)
- {
- *((__global DATA_TYPE *)out.ptr) = border_val;
- return;
- }
-#else // CONSTANT_BORDER
- mapx_coord = clamp(mapx_coord, 0.0f, width - 1);
- mapy_coord = clamp(mapy_coord, 0.0f, height - 1);
-#endif // CONSTANT_BORDER
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(mapx_coord), convert_int(mapy_coord), (get_global_id(2) / DEPTH_OUT)));
-}
-
-/** Performs a remapping of an input image to an output given two remapping image using bilinear as interpolation.
- * Also applies constant border value, "border_val", if "CONSTANT_BORDER" is set.
- *
- * This kernel performs remapping with this method of pixel coordinate translation:
- * out(x,y) = in(mapx(x,y), mapy(x,y));
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8,F16.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8,F16.
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
- * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32.
- * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in] mapx_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in] mapy_ptr Pointer to the x remapping image. Supported data types: F32.
- * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in] width Width of the input image
- * @param[in] height Height of the input image
- * @param[in] border_val Value to use for border around input tensor when in CONSTANT border is selected
- */
-__kernel void remap_bilinear_nhwc(
- TENSOR4D_DECLARATION(in),
- TENSOR4D_DECLARATION(out),
- TENSOR4D_DECLARATION(mapx),
- TENSOR4D_DECLARATION(mapy),
- const float width,
- const float height
-#ifdef CONSTANT_BORDER
- ,
- const DATA_TYPE border_val
-#endif // CONSTANT_BORDER
-)
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
- Tensor4D mapx = CONVERT_TO_TENSOR4D_STRUCT(mapx, DEPTH_OUT);
- Tensor4D mapy = CONVERT_TO_TENSOR4D_STRUCT(mapy, DEPTH_OUT);
-
- float mapx_coord = (float) * (__global float *)mapx.ptr;
- float mapy_coord = (float) * (__global float *)mapy.ptr;
-
-#ifdef CONSTANT_BORDER
- if(mapx_coord < 0 || mapx_coord > width - 1 || mapy_coord < 0 || mapy_coord > height - 1)
- {
- *((__global DATA_TYPE *)out.ptr) = border_val;
- return;
- }
-#endif // CONSTANT_BORDER
-
- const float new_xf = floor(mapx_coord);
- const float new_yf = floor(mapy_coord);
- const float clamped_x = clamp(new_xf, 0.0f, width - 1);
- const float clamped_x1 = clamp(new_xf + 1, 0.0f, width - 1);
- const float clamped_y = clamp(new_yf, 0.0f, height - 1);
- const float clamped_y1 = clamp(new_yf + 1, 0.0f, height - 1);
-
- float4 ins = (float4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
- *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
- *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))),
- *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))));
-
- const float a = mapx_coord - new_xf;
- const float b = 1.f - a;
- const float a1 = mapy_coord - new_yf;
- const float b1 = 1.f - a1;
- const float fr = ((ins.s0 * b * b1) + (ins.s1 * a * b1) + (ins.s2 * b * a1) + (ins.s3 * a * a1));
-
- *((__global DATA_TYPE *)out.ptr) = CONVERT(fr, DATA_TYPE);
-}
-
-#endif // DEPTH_OUT
\ No newline at end of file
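For reference (restating only the arithmetic of the deleted kernel), the bilinear path combined the four clamped neighbours of the mapped coordinate with the usual bilinear weights:

    \[
    \begin{aligned}
    a   &= \text{map}_x - \lfloor \text{map}_x \rfloor, & b   &= 1 - a,\\
    a_1 &= \text{map}_y - \lfloor \text{map}_y \rfloor, & b_1 &= 1 - a_1,\\
    \text{out}(x, y) &= b\,b_1\,\text{in}_{00} + a\,b_1\,\text{in}_{01}
                        + b\,a_1\,\text{in}_{10} + a\,a_1\,\text{in}_{11},
    \end{aligned}
    \]

where in_00, in_01, in_10 and in_11 are the source samples at (x0, y0), (x0+1, y0), (x0, y0+1) and (x0+1, y0+1), each clamped to the image bounds; the nearest-neighbour variant simply rounds and clamps the mapped coordinate instead.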
diff --git a/src/core/CL/cl_kernels/nhwc/scale.cl b/src/core/CL/cl_kernels/nhwc/scale.cl
index 21579aed9..bccfd6543 100644
--- a/src/core/CL/cl_kernels/nhwc/scale.cl
+++ b/src/core/CL/cl_kernels/nhwc/scale.cl
@@ -24,12 +24,11 @@
#include "helpers.h"
#include "tile_helpers.h"
+#if defined(SCALE_NEAREST_NEIGHBOUR)
//! @cond Doxygen_Suppress
/** Performs scale on a tensor by interpolating with the NEAREST NEIGHBOUR method. (NHWC)
*
* @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64)
* @note The tensor type ("BUFFER" only is supported) of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
* @note The tensor type ("BUFFER" only is supported) of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
* @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
@@ -37,61 +36,52 @@
* @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
 * @note The border value must be passed at compile time using -DCONSTANT_VALUE (e.g. -DCONSTANT_VALUE=0)
* @note In case of F32/F16, -DIS_FLOATING_POINT must be passed at compile time
- * @note The scale value to apply on the source width must be passed at compile using -DSCALE_X (e.g., -DSCALE_X=0.5)
- * @note The scale value to apply on the source height must be passed at compile using -DSCALE_Y (e.g., -DSCALE_Y=0.5)
* @note If the source tensor has more than 3 dimensions, -DBATCHED_EXECUTION must be passed at compile time
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32.
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_c The size of the channels dimension of the source tensor
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the batches dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_c The size of the channels dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] scale_x The scale value to apply on the source width
+ * @param[in] scale_y The scale value to apply on the source height
*/
- //! @endcond
+//! @endcond
__kernel void scale_nearest_neighbour_nhwc(
- TENSOR4D(src, SRC_TENSOR_TYPE),
- TENSOR4D(dst, DST_TENSOR_TYPE))
+ TENSOR4D_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_T(dst, DST_TENSOR_TYPE),
+ const float scale_x,
+ const float scale_y)
{
- // All the tensor dimensions are passed at compile time.
- // In case of dynamic tensor support, the following dimensions should be passed as function argument.
-#define _ISRC_WIDTH SRC_WIDTH
-#define _ISRC_HEIGHT SRC_HEIGHT
-#define _IDST_WIDTH DST_WIDTH
-#define _IDST_HEIGHT DST_HEIGHT
-#define _ISCALE_X SCALE_X
-#define _ISCALE_Y SCALE_Y
-
const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
const int xo = GET_SPATIAL_IDX(1, 1, 0); // WIDTH
#if defined(BATCHED_EXECUTION)
- const int yo = GET_SPATIAL_IDX(2, 1, 0) % _IDST_HEIGHT; // HEIGHT
- const int bout = GET_SPATIAL_IDX(2, 1, 0) / _IDST_HEIGHT; // BATCH SIZE IDX
-#else // defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0) % dst_h; // HEIGHT
+ const int bout = GET_SPATIAL_IDX(2, 1, 0) / dst_h; // BATCH SIZE IDX
+#else // defined(BATCHED_EXECUTION)
const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT
- const int bout = 0; // BATCH SIZE IDX
-#endif // defined(BATCHED_EXECUTION)
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(BATCHED_EXECUTION)
#ifdef SAMPLING_POLICY_TOP_LEFT
- float xi_f = (xo * (float)SCALE_X);
- float yi_f = (yo * (float)SCALE_Y);
+ float xi_f = (xo * scale_x);
+ float yi_f = (yo * scale_y);
#elif SAMPLING_POLICY_CENTER
- float xi_f = ((xo + 0.5f) * (float)SCALE_X);
- float yi_f = ((yo + 0.5f) * (float)SCALE_Y);
+ float xi_f = ((xo + 0.5f) * scale_x);
+ float yi_f = ((yo + 0.5f) * scale_y);
#else // SAMPLING_POLICY
#error("Unsupported sampling policy");
#endif // SAMPLING_POLICY
@@ -101,30 +91,30 @@ __kernel void scale_nearest_neighbour_nhwc(
yi_f = round(yi_f);
#endif // ALIGN_CORNERS
- const int xi0 = clamp((int)xi_f, 0, _ISRC_WIDTH - 1);
- const int yi0 = clamp((int)yi_f, 0, _ISRC_HEIGHT - 1);
+ const int xi0 = clamp((int)xi_f, 0, (int)src_w - 1);
+ const int yi0 = clamp((int)yi_f, 0, (int)src_h - 1);
TILE(SRC_DATA_TYPE, 1, N0, in00);
- T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in00);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, src_w, src_h, 1, 1, false, in00);
TILE(uint, 1, 1, dst_indirect_y);
// Calculate the destination indirect Y
- dst_indirect_y[0].v = xo + (yo * (int)(_IDST_WIDTH)) + bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+ dst_indirect_y[0].v = xo + (yo * (int)(dst_w)) + bout * (int)(dst_w * dst_h);
bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, 1, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, in00, dst_indirect_y);
}
+#endif /* SCALE_NEAREST_NEIGHBOUR */
+#if defined(SCALE_BILINEAR)
//! @cond Doxygen_Suppress
/** Performs scale on a tensor by interpolating with the BILINEAR method. (NHWC)
*
* @note If border mode replicate is used, is should be passed as -DBORDER_MODE_REPLICATE
* @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64)
* @note The tensor type ("BUFFER" only is supported) of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
* @note The tensor type ("BUFFER" only is supported) of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
* @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
@@ -132,65 +122,56 @@ __kernel void scale_nearest_neighbour_nhwc(
* @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
 * @note The border value must be passed at compile time using -DCONSTANT_VALUE (e.g. -DCONSTANT_VALUE=0)
* @note In case of F32/F16, -DIS_FLOATING_POINT must be passed at compile time
- * @note The scale value to apply on the source width must be passed at compile using -DSCALE_X (e.g., -DSCALE_X=0.5)
- * @note The scale value to apply on the source height must be passed at compile using -DSCALE_Y (e.g., -DSCALE_Y=0.5)
* @note If the source tensor has more than 3 dimensions, -DBATCHED_EXECUTION must be passed at compile time
*
* @note In case of QASYMM8, the following extra information must be passed at compile time:
* - The source offset e.g. -DOFFSET=4
* - The source scale e.g. -DSCALE=4
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32.
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_c The size of the channels dimension of the source tensor
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the batches dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_c The size of the channels dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] scale_x The scale value to apply on the source width
+ * @param[in] scale_y The scale value to apply on the source height
*/
- //! @endcond
+//! @endcond
__kernel void scale_bilinear_nhwc(
- TENSOR4D(src, SRC_TENSOR_TYPE),
- TENSOR4D(dst, DST_TENSOR_TYPE))
+ TENSOR4D_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_T(dst, DST_TENSOR_TYPE),
+ const float scale_x,
+ const float scale_y)
{
- // All the tensor dimensions are passed at compile time.
- // In case of dynamic tensor support, the following dimensions should be passed as function argument.
-#define _ISRC_WIDTH SRC_WIDTH
-#define _ISRC_HEIGHT SRC_HEIGHT
-#define _IDST_WIDTH DST_WIDTH
-#define _IDST_HEIGHT DST_HEIGHT
-#define _ISCALE_X SCALE_X
-#define _ISCALE_Y SCALE_Y
-
const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
const int xo = GET_SPATIAL_IDX(1, 1, 0); // WIDTH
#if defined(BATCHED_EXECUTION)
- const int yo = GET_SPATIAL_IDX(2, 1, 0) % _IDST_HEIGHT; // HEIGHT
- const int bout = GET_SPATIAL_IDX(2, 1, 0) / _IDST_HEIGHT; // BATCH SIZE IDX
-#else // defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0) % dst_h; // HEIGHT
+ const int bout = GET_SPATIAL_IDX(2, 1, 0) / dst_h; // BATCH SIZE IDX
+#else // defined(BATCHED_EXECUTION)
const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT
const int bout = 0; // BATCH SIZE IDX
-#endif // defined(BATCHED_EXECUTION)
+#endif // defined(BATCHED_EXECUTION)
#ifdef SAMPLING_POLICY_TOP_LEFT
- float xi_f = (xo * (float)SCALE_X);
- float yi_f = (yo * (float)SCALE_Y);
+ float xi_f = (xo * scale_x);
+ float yi_f = (yo * scale_y);
#elif SAMPLING_POLICY_CENTER
- float xi_f = ((xo + 0.5f) * (float)SCALE_X - 0.5f);
- float yi_f = ((yo + 0.5f) * (float)SCALE_Y - 0.5f);
+ float xi_f = ((xo + 0.5f) * scale_x - 0.5f);
+ float yi_f = ((yo + 0.5f) * scale_y - 0.5f);
#else // SAMPLING_POLICY
#error("Unsupported sampling policy");
#endif // SAMPLING_POLICY
@@ -210,20 +191,20 @@ __kernel void scale_bilinear_nhwc(
in11[0].v = CONSTANT_VALUE;
#ifndef BORDER_MODE_REPLICATE
- T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, true, in00);
- T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi + 1, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, true, in01);
- T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, true, in10);
- T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi + 1, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, true, in11);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi, cout, src_w, src_h, 1, 1, true, in00);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi + 1, cout, src_w, src_h, 1, 1, true, in01);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi, cout, src_w, src_h, 1, 1, true, in10);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi + 1, cout, src_w, src_h, 1, 1, true, in11);
#else // BORDER_MODE_REPLICATE
- const int xi0 = clamp(xi, 0, _ISRC_WIDTH - 1);
- const int yi0 = clamp(yi, 0, _ISRC_HEIGHT - 1);
- const int xi1 = clamp(xi + 1, 0, _ISRC_WIDTH - 1);
- const int yi1 = clamp(yi + 1, 0, _ISRC_HEIGHT - 1);
+ const int xi0 = clamp(xi, 0, (int)src_w - 1);
+ const int yi0 = clamp(yi, 0, (int)src_h - 1);
+ const int xi1 = clamp(xi + 1, 0, (int)src_w - 1);
+ const int yi1 = clamp(yi + 1, 0, (int)src_h - 1);
- T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in00);
- T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi1, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in01);
- T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi0, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in10);
- T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi1, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in11);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, src_w, src_h, 1, 1, false, in00);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi1, cout, src_w, src_h, 1, 1, false, in01);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi0, cout, src_w, src_h, 1, 1, false, in10);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi1, cout, src_w, src_h, 1, 1, false, in11);
#endif // BORDER_MODE_REPLICATE
TILE(DST_DATA_TYPE, 1, N0, out);
@@ -270,9 +251,10 @@ __kernel void scale_bilinear_nhwc(
TILE(uint, 1, 1, dst_indirect_y);
// Calculate the destination indirect Y
- dst_indirect_y[0].v = xo + (yo * (int)(_IDST_WIDTH)) + bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+ dst_indirect_y[0].v = xo + (yo * (int)(dst_w)) + bout * (int)(dst_w * dst_h);
bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, 1, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, out, dst_indirect_y);
-}
\ No newline at end of file
+}
+#endif /* SCALE_BILINEAR */
\ No newline at end of file
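With scale_x and scale_y now runtime kernel arguments, the destination-to-source coordinate mapping itself is unchanged; only where the factor comes from differs. A small sketch of the two sampling policies, assuming plain C and example values:

    #include <stdio.h>

    /* Sketch, assuming plain C and example values: the destination index is
     * mapped to a source coordinate exactly as before, but scale_x is now a
     * runtime kernel argument instead of the -DSCALE_X build option. */
    int main(void)
    {
        const float scale_x = 0.5f; /* e.g. src_w / dst_w, passed at run time */
        const int   xo      = 7;    /* destination column                     */
        const float top_left = xo * scale_x;                 /* SAMPLING_POLICY_TOP_LEFT         */
        const float center_n = (xo + 0.5f) * scale_x;        /* SAMPLING_POLICY_CENTER, nearest  */
        const float center_b = (xo + 0.5f) * scale_x - 0.5f; /* SAMPLING_POLICY_CENTER, bilinear */
        printf("%.2f %.2f %.2f\n", top_left, center_n, center_b); /* prints: 3.50 3.75 3.25 */
        return 0;
    }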
diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h
index f36f273e1..eba231624 100644
--- a/src/core/CL/cl_kernels/tile_helpers.h
+++ b/src/core/CL/cl_kernels/tile_helpers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -104,6 +104,54 @@
#define TENSOR4D_STR(name, type) TENSOR4D_##type(name)
#define TENSOR4D(name, type) TENSOR4D_STR(name, type)
+#define TENSOR4D_T_IMAGE(name) \
+ __read_only image2d_t name##_img, \
+ __global uchar *name##_ptr, \
+ uint name##_stride_y, \
+ uint name##_stride_z, \
+ uint name##_stride_w, \
+ uint name##_c, \
+ uint name##_w, \
+ uint name##_h, \
+ uint name##_n, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR4D_T_BUFFER(name) \
+ __global uchar *name##_ptr, \
+ uint name##_stride_y, \
+ uint name##_stride_z, \
+ uint name##_stride_w, \
+ uint name##_c, \
+ uint name##_w, \
+ uint name##_h, \
+ uint name##_n, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR4D_T_STR(name, type) TENSOR4D_T_##type(name)
+#define TENSOR4D_T(name, type) TENSOR4D_T_STR(name, type)
+
+#define TENSOR3D_T_IMAGE(name) \
+ __read_only image2d_t name##_img, \
+ __global uchar *name##_ptr, \
+ uint name##_stride_y, \
+ uint name##_stride_z, \
+ uint name##_w, \
+ uint name##_h, \
+ uint name##_n, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR3D_T_BUFFER(name) \
+ __global uchar *name##_ptr, \
+ uint name##_stride_y, \
+ uint name##_stride_z, \
+ uint name##_w, \
+ uint name##_h, \
+ uint name##_n, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR3D_T_STR(name, type) TENSOR3D_T_##type(name)
+#define TENSOR3D_T(name, type) TENSOR3D_T_STR(name, type)
+
#if !defined(UNROLL_WITH_PRAGMA)
#define UNROLL_INCR(idx, step, macro) idx += (step); (macro)
@@ -448,6 +496,42 @@
}) \
})
+/** Load a tile from global memory (tensor) using an indirect Y index tile and conditionally use a different length for the load
+ *
+ * @note If WIDTH1_CONDITION is true, the load uses the WIDTH1 length instead of WIDTH0
+ * @note The vectors are loaded in reverse order so the invalid rows are overwritten by the valid ones
+ *
+ * @param[in] DATA_TYPE Data type
+ * @param[in] HEIGHT Number of dst rows
+ * @param[in] WIDTH0 Load width to use if WIDTH1_CONDITION = false
+ * @param[in] WIDTH1 Load width to use if WIDTH1_CONDITION = true
+ * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).
+ * In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)
+ * @param[in] TENSOR Tensor basename
+ * @param[in] X Starting X position
+ * @param[in] STRIDE_Y Stride Y (in bytes) used to load each row.
+ * @param[in] WIDTH1_CONDITION Condition to select the WIDTH1 load
+ * @param[out] dst Output tile
+ * @param[out] indirect_y Indirect Y index tile
+ */
+#define T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, dst, indirect_y) \
+ ({ \
+ if(WIDTH1_CONDITION) \
+ { \
+ LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
+ { \
+ VLOAD_PARTIAL(WIDTH0, WIDTH1) \
+ (dst[HEIGHT - 1 - _i].v, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
+ }) \
+ } \
+ else \
+ { \
+ LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
+ { \
+ dst[HEIGHT - 1 - _i].v = V_LOAD(DATA_TYPE, WIDTH0, TENSOR_TYPE, TENSOR, X, (indirect_y[HEIGHT - 1 - _i].v), STRIDE_Y); \
+ }) \
+ } \
+ })
/** Load a tile from global memory (tensor) when the tensor is stored using a NHWC layout
*
* @param[in] DATA_TYPE Data type
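The new T_LOAD_INDIRECT_WIDTH_SELECT mirrors the existing indirect store helper: rows are gathered through an indirect Y index tile and the boundary work-item loads only the shorter WIDTH1 length. A scalar sketch of that behaviour, assuming plain C with simplified types and example data:

    #include <stdio.h>
    #include <string.h>

    #define WIDTH0 4 /* full load width           */
    #define WIDTH1 3 /* partial width at boundary */

    /* Scalar sketch, assuming plain C: gather one row through an indirect row
     * index and load either WIDTH0 or WIDTH1 elements depending on the
     * boundary condition, as the new macro does for each tile row. */
    static void load_row(float *dst, const float *tensor, int indirect_y, int width1_condition)
    {
        const int n = width1_condition ? WIDTH1 : WIDTH0;
        memcpy(dst, tensor + indirect_y * WIDTH0, (size_t)n * sizeof(float));
    }

    int main(void)
    {
        const float tensor[2][WIDTH0] = { { 1, 2, 3, 4 }, { 5, 6, 7, 8 } };
        float tile[WIDTH0] = { 0 };
        load_row(tile, &tensor[0][0], 1, 1); /* indirect row 1, boundary case: 3 elements */
        printf("%g %g %g %g\n", tile[0], tile[1], tile[2], tile[3]); /* prints: 5 6 7 0 */
        return 0;
    }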
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
index 2b74f91a0..61c8d90f7 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
@@ -215,15 +215,11 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(conv_info.act_info.activation())));
build_opts.add_option("-DDEPTH_MULTIPLIER=" + support::cpp11::to_string(conv_info.depth_multiplier));
build_opts.add_option("-DSRC_TENSOR_TYPE=BUFFER");
- build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(_input->info()->dimension(1)));
- build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(_input->info()->dimension(2)));
// Note: SRC_DATA_TYPE must have the same data type of WEI_DATA_TYPE. In quantized, we could
// have a case where the data types for the activation and weights are different. However, since the implementation
// only works when both have same data type, we have to change the offset to take into account this aspect
build_opts.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type()));
build_opts.add_option("-DDST_TENSOR_TYPE=BUFFER");
- build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(_output->info()->dimension(1)));
- build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(_output->info()->dimension(2)));
build_opts.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(_output->info()->data_type()));
build_opts.add_option_if_else(_export_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER");
build_opts.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(_weights->info()->dimension(1)));
@@ -290,7 +286,6 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
{
kernel_name = "dwc_native_fp_nhwc";
build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DZERO_VALUE=" + support::cpp11::to_string(0));
build_opts.add_option_if(conv_info.act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(conv_info.act_info.a()));
build_opts.add_option_if(conv_info.act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(conv_info.act_info.b()));
}
@@ -358,8 +353,9 @@ void CLDepthwiseConvolutionLayerNativeKernel::run(const Window &window, cl::Comm
}
unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, slice);
- add_4D_tensor_argument(idx, _output, slice);
+ add_4d_tensor_nhwc_argument(idx, _input);
+ add_4d_tensor_nhwc_argument(idx, _output);
+
if(_export_to_cl_image)
{
_kernel.setArg(idx++, weights_cl_image);
diff --git a/src/core/CL/kernels/CLRemapKernel.cpp b/src/core/CL/kernels/CLRemapKernel.cpp
deleted file mode 100644
index ea3b637e8..000000000
--- a/src/core/CL/kernels/CLRemapKernel.cpp
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLRemapKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-CLRemapKernel::CLRemapKernel()
- : _input(nullptr), _output(nullptr), _map_x(nullptr), _map_y(nullptr), _data_layout(DataLayout::NCHW)
-{
- _type = CLKernelType::ELEMENTWISE;
-}
-
-BorderSize CLRemapKernel::border_size() const
-{
- return _data_layout == DataLayout::NCHW ? BorderSize(1) : BorderSize(0);
-}
-
-template <class T>
-void CLRemapKernel::set_constant_border(unsigned int idx, const PixelValue &constant_border_value)
-{
- T value;
- constant_border_value.get(value);
- ICLKernel::add_argument<T>(idx, static_cast<T>(value));
-}
-
-Status CLRemapKernel::validate(const ITensorInfo *input, const ITensorInfo *map_x, const ITensorInfo *map_y, const ITensorInfo *output, RemapInfo info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, map_x, map_y, output);
- if(input->data_layout() == DataLayout::NCHW)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F16);
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() != output->data_type(), "Input/output have different data types");
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.policy == InterpolationPolicy::AREA, "Area interpolation is not supported!");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.border_mode != BorderMode::CONSTANT && info.border_mode != BorderMode::UNDEFINED, "Border mode not supported");
- return Status{};
-}
-
-void CLRemapKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, RemapInfo info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, map_x, map_y, output);
- ARM_COMPUTE_ERROR_THROW_ON(CLRemapKernel::validate(input->info(), map_x->info(), map_y->info(), output->info(), info));
-
- _input = input;
- _output = output;
- _map_x = map_x;
- _map_y = map_y;
- _data_layout = input->info()->data_layout();
-
- const bool is_nhwc = _data_layout == DataLayout::NHWC;
- const bool is_constant_border = info.border_mode == BorderMode::CONSTANT;
-
- // Create kernel
- CLBuildOptions build_opts;
- build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
- build_opts.add_option_if(is_nhwc, "-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
- build_opts.add_option_if(is_constant_border, "-DCONSTANT_BORDER");
-
- const std::string interpolation_name = lower_string(string_from_interpolation_policy(info.policy));
- const std::string kernel_name = "remap_" + interpolation_name + "_" + lower_string(string_from_data_layout(_data_layout));
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- const unsigned int num_elems_processed_per_iteration = is_nhwc ? 1 : 4;
- const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const int input_height = input->info()->dimension(idx_height);
- const int input_width = input->info()->dimension(idx_width);
-
- // Configure window
- Window win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration));
-
- // Update padding in NCHW case
- if(_data_layout == DataLayout::NCHW)
- {
- const int total_right = ceil_to_multiple(input_width, num_elems_processed_per_iteration);
- const int access_right = total_right + (((total_right - input_width) == 0) ? border_size().right : 0);
- AccessWindowStatic input_access(input->info(), -border_size().left, -border_size().top, access_right, input_height + border_size().bottom);
-
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, input_access, output_access);
- }
-
- ICLKernel::configure_internal(win);
-
- // Set static arguments
- unsigned int idx = 4 * (is_nhwc ? num_arguments_per_4D_tensor() : num_arguments_per_2D_tensor());
- _kernel.setArg<cl_float>(idx++, input_width);
- _kernel.setArg<cl_float>(idx++, input_height);
- if(is_nhwc && is_constant_border)
- {
- switch(input->info()->data_type())
- {
- case DataType::U8:
- set_constant_border<uint8_t>(idx, info.constant_border_value);
- break;
- case DataType::F16:
- static_assert(sizeof(cl_half) == sizeof(half), "Half must be same size as cl_half");
- static_assert(sizeof(cl_half) == 2, "Half must be 16 bit");
- set_constant_border<half>(idx, info.constant_border_value);
- break;
- default:
- ARM_COMPUTE_ERROR("Data Type not handled");
- }
- }
-}
-
-void CLRemapKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- switch(_data_layout)
- {
- case DataLayout::NCHW:
- {
- Window slice = window.first_slice_window_2D();
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, slice);
- add_2D_tensor_argument(idx, _output, slice);
- add_2D_tensor_argument(idx, _map_x, slice);
- add_2D_tensor_argument(idx, _map_y, slice);
- enqueue(queue, *this, slice, lws_hint());
-
- }
- while(window.slide_window_slice_2D(slice));
- break;
- }
- case DataLayout::NHWC:
- {
- Window collapsed = window.collapse(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_4D();
-
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, slice);
- add_4D_tensor_argument(idx, _output, slice);
- add_4D_tensor_argument(idx, _map_x, slice);
- add_4D_tensor_argument(idx, _map_y, slice);
- enqueue(queue, *this, slice, lws_hint());
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Invalid Data layout");
- }
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLRemapKernel.h b/src/core/CL/kernels/CLRemapKernel.h
deleted file mode 100644
index 93b0b4e66..000000000
--- a/src/core/CL/kernels/CLRemapKernel.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLREMAPKERNEL_H
-#define ARM_COMPUTE_CLREMAPKERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform a remap on a tensor */
-class CLRemapKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLRemapKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLRemapKernel(const CLRemapKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLRemapKernel &operator=(const CLRemapKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLRemapKernel(CLRemapKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLRemapKernel &operator=(CLRemapKernel &&) = default;
- /** Initialize the kernel's input, output and border mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8 (or F16 when layout is NHWC).
- * @param[in] map_x Map for X coordinates. Data types supported: F32.
- * @param[in] map_y Map for Y coordinates. Data types supported: F32.
- * @param[out] output Destination tensor. Data types supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane.
- * @param[in] info RemapInfo struct:
- * - policy Interpolation policy to use. Only NEAREST and BILINEAR are supported.
- * - border_mode Border mode to use on the input tensor. Only CONSTANT and UNDEFINED are supported.
- * - constant_border_value Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, RemapInfo info);
- /** Checks if the kernel's input, output and border mode will lead to a valid configuration of @ref CLRemapKernel
- *
- * Similar to @ref CLRemapKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, RemapInfo info)
- *
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *map_x, const ITensorInfo *map_y, const ITensorInfo *output, RemapInfo info);
- /** Function to set the constant value on fill border kernel depending on type.
- *
- * @param[in] idx Index of the kernel argument to set.
- * @param[in] constant_border_value Constant value to use for borders if border_mode is set to CONSTANT.
- */
- template <class T>
- void set_constant_border(unsigned int idx, const PixelValue &constant_border_value);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
- const ICLTensor *_map_x;
- const ICLTensor *_map_y;
- DataLayout _data_layout;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLREMAPKERNEL_H */
diff --git a/src/core/CPP/CPPTypes.cpp b/src/core/CPP/CPPTypes.cpp
index 44cd000ad..c197932a1 100644
--- a/src/core/CPP/CPPTypes.cpp
+++ b/src/core/CPP/CPPTypes.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,7 @@
#include "arm_compute/core/Error.h"
#include "src/common/cpuinfo/CpuInfo.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
namespace arm_compute
{
@@ -110,6 +111,11 @@ CPUModel CPUInfo::get_cpu_model(unsigned int cpuid) const
return _impl->info.cpu_model(cpuid);
}
+cpuinfo::CpuIsaInfo CPUInfo::get_isa() const
+{
+ return _impl->info.isa();
+}
+
unsigned int CPUInfo::get_L1_cache_size() const
{
return _impl->L1_cache_size;
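For illustration, a minimal stand-alone sketch of the forwarding-accessor pattern the CPPTypes.cpp hunk above extends: the public CPUInfo object delegates the ISA query to its internal implementation object. Type names and fields below are simplified stand-ins, not the library's CpuIsaInfo layout.

#include <cstdio>
#include <memory>

// Simplified stand-in for cpuinfo::CpuIsaInfo.
struct IsaInfo
{
    bool fp16 = false;
    bool dot  = false;
};

// Simplified stand-in for the internal implementation object.
class CpuInfoImpl
{
public:
    IsaInfo isa() const { return IsaInfo{ true, false }; }
};

// Public facade forwarding queries to the implementation (Pimpl idiom).
class CPUInfo
{
public:
    CPUInfo() : _impl(std::make_unique<CpuInfoImpl>()) {}
    IsaInfo get_isa() const { return _impl->isa(); }

private:
    std::unique_ptr<CpuInfoImpl> _impl;
};

int main()
{
    CPUInfo info;
    std::printf("fp16 reported: %d\n", static_cast<int>(info.get_isa().fp16));
    return 0;
}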
diff --git a/src/core/GPUTarget.cpp b/src/core/GPUTarget.cpp
index b4bd2ddf4..625c0145d 100644
--- a/src/core/GPUTarget.cpp
+++ b/src/core/GPUTarget.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -39,13 +39,13 @@ arm_compute::GPUTarget get_valhall_target(const std::string &version)
{
return arm_compute::GPUTarget::G78;
}
- else if(version.find("TODX") != std::string::npos)
+ else if(version.find("G710") != std::string::npos)
{
- return arm_compute::GPUTarget::TODX;
+ return arm_compute::GPUTarget::G710;
}
else
{
- return arm_compute::GPUTarget::VALHALL;
+ return arm_compute::GPUTarget::UNKNOWN;
}
}
@@ -136,7 +136,7 @@ const std::string &string_from_target(GPUTarget target)
{ GPUTarget::G76, "g76" },
{ GPUTarget::G77, "g77" },
{ GPUTarget::G78, "g78" },
- { GPUTarget::TODX, "todx" }
+ { GPUTarget::G710, "g710" }
};
return gpu_target_map[target];
@@ -164,11 +164,17 @@ GPUTarget get_target_from_name(const std::string &device_name)
GPUTarget gpu_target;
if(target == 'G' || is_future_gpu)
{
- // Check for Bifrost or Valhall
- gpu_target = get_bifrost_target(version);
+ // Check for Valhall or Bifrost
+ gpu_target = get_valhall_target(version);
+ if(gpu_target == GPUTarget::UNKNOWN)
+ {
+ gpu_target = get_bifrost_target(version);
+ }
+
+ // Default GPUTarget
if(gpu_target == GPUTarget::UNKNOWN)
{
- gpu_target = get_valhall_target(version);
+ gpu_target = GPUTarget::VALHALL;
}
}
else if(target == 'T')
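For illustration, a stand-alone sketch of the revised detection order in the GPUTarget.cpp hunks above: the newer Valhall family is probed first, Bifrost is the fallback, and unrecognised future devices default to a generic VALHALL target. Enum values and version strings below are illustrative stand-ins, not the library's full target tables.

#include <cstdio>
#include <string>

enum class Target { UNKNOWN, VALHALL, G78, G710, G76 };

Target get_valhall_target(const std::string &version)
{
    if(version.find("G78") != std::string::npos)  { return Target::G78; }
    if(version.find("G710") != std::string::npos) { return Target::G710; }
    return Target::UNKNOWN;
}

Target get_bifrost_target(const std::string &version)
{
    if(version.find("G76") != std::string::npos)  { return Target::G76; }
    return Target::UNKNOWN;
}

Target get_target_from_version(const std::string &version)
{
    // Check for Valhall first, then Bifrost.
    Target t = get_valhall_target(version);
    if(t == Target::UNKNOWN)
    {
        t = get_bifrost_target(version);
    }
    // Default target for unrecognised future devices.
    if(t == Target::UNKNOWN)
    {
        t = Target::VALHALL;
    }
    return t;
}

int main()
{
    std::printf("detected target id: %d\n", static_cast<int>(get_target_from_version("Mali-G710")));
    return 0;
}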
diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h
index af301c8d1..cd01659c0 100644
--- a/src/core/NEON/NEKernels.h
+++ b/src/core/NEON/NEKernels.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -55,7 +55,6 @@
#include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h"
#include "src/core/NEON/kernels/NERangeKernel.h"
#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
-#include "src/core/NEON/kernels/NERemapKernel.h"
#include "src/core/NEON/kernels/NEReorgLayerKernel.h"
#include "src/core/NEON/kernels/NEReverseKernel.h"
#include "src/core/NEON/kernels/NESelectKernel.h"
diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
index 1e0a1742f..69bfd56ce 100644
--- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
+++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,8 +28,10 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Window.h"
#include "src/core/CPP/Validate.h"
+#include "src/core/common/Registrars.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/boundingboxtransform/list.h"
#include <arm_neon.h>
@@ -37,6 +39,62 @@ namespace arm_compute
{
namespace
{
+struct BoundingBoxTransformSelectorData
+{
+ DataType dt;
+};
+
+using BoundingBoxTransformSelctorPtr = std::add_pointer<bool(const BoundingBoxTransformSelectorData &data)>::type;
+using BoundingBoxTransformUKernelPtr = std::add_pointer<void(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)>::type;
+
+struct BoundingBoxTransformKernel
+{
+ const char *name;
+ const BoundingBoxTransformSelctorPtr is_selected;
+ BoundingBoxTransformUKernelPtr ukernel;
+};
+
+static const BoundingBoxTransformKernel available_kernels[] =
+{
+ {
+ "fp32_neon_boundingboxtransform",
+ [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_boundingboxtransform)
+ },
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ {
+ "fp16_neon_boundingboxtransform",
+ [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_boundingboxtransform)
+ },
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if defined(ARM_COMPUTE_ENABLE_NEON)
+ {
+ "qu16_neon_boundingboxtransform",
+ [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::QASYMM16; },
+ REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_boundingboxtransform)
+ },
+#endif //defined(ARM_COMPUTE_ENABLE_NEON)
+};
+
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel else nullptr
+ */
+const BoundingBoxTransformKernel *get_implementation(const BoundingBoxTransformSelectorData &data)
+{
+ for(const auto &uk : available_kernels)
+ {
+ if(uk.is_selected(data))
+ {
+ return &uk;
+ }
+ }
+ return nullptr;
+}
+
Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
@@ -112,145 +170,15 @@ Status NEBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, const IT
return Status{};
}
-template <>
-void NEBoundingBoxTransformKernel::internal_run<uint16_t>(const Window &window)
-{
- const size_t num_classes = _deltas->info()->tensor_shape()[0] >> 2;
- const size_t deltas_width = _deltas->info()->tensor_shape()[0];
- const int img_h = std::floor(_bbinfo.img_height() / _bbinfo.scale() + 0.5f);
- const int img_w = std::floor(_bbinfo.img_width() / _bbinfo.scale() + 0.5f);
-
- const auto scale_after = (_bbinfo.apply_scale() ? _bbinfo.scale() : 1.f);
- const auto scale_before = _bbinfo.scale();
- const auto offset = (_bbinfo.correct_transform_coords() ? 1.f : 0.f);
-
- auto pred_ptr = reinterpret_cast<uint16_t *>(_pred_boxes->buffer() + _pred_boxes->info()->offset_first_element_in_bytes());
- auto delta_ptr = reinterpret_cast<uint8_t *>(_deltas->buffer() + _deltas->info()->offset_first_element_in_bytes());
-
- const auto boxes_qinfo = _boxes->info()->quantization_info().uniform();
- const auto deltas_qinfo = _deltas->info()->quantization_info().uniform();
- const auto pred_qinfo = _pred_boxes->info()->quantization_info().uniform();
-
- Iterator box_it(_boxes, window);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto ptr = reinterpret_cast<uint16_t *>(box_it.ptr());
- const auto b0 = dequantize_qasymm16(*ptr, boxes_qinfo);
- const auto b1 = dequantize_qasymm16(*(ptr + 1), boxes_qinfo);
- const auto b2 = dequantize_qasymm16(*(ptr + 2), boxes_qinfo);
- const auto b3 = dequantize_qasymm16(*(ptr + 3), boxes_qinfo);
- const float width = (b2 / scale_before) - (b0 / scale_before) + 1.f;
- const float height = (b3 / scale_before) - (b1 / scale_before) + 1.f;
- const float ctr_x = (b0 / scale_before) + 0.5f * width;
- const float ctr_y = (b1 / scale_before) + 0.5f * height;
- for(size_t j = 0; j < num_classes; ++j)
- {
- // Extract deltas
- const size_t delta_id = id.y() * deltas_width + 4u * j;
- const float dx = dequantize_qasymm8(delta_ptr[delta_id], deltas_qinfo) / _bbinfo.weights()[0];
- const float dy = dequantize_qasymm8(delta_ptr[delta_id + 1], deltas_qinfo) / _bbinfo.weights()[1];
- float dw = dequantize_qasymm8(delta_ptr[delta_id + 2], deltas_qinfo) / _bbinfo.weights()[2];
- float dh = dequantize_qasymm8(delta_ptr[delta_id + 3], deltas_qinfo) / _bbinfo.weights()[3];
- // Clip dw and dh
- dw = std::min(dw, _bbinfo.bbox_xform_clip());
- dh = std::min(dh, _bbinfo.bbox_xform_clip());
- // Determine the predictions
- const float pred_ctr_x = dx * width + ctr_x;
- const float pred_ctr_y = dy * height + ctr_y;
- const float pred_w = std::exp(dw) * width;
- const float pred_h = std::exp(dh) * height;
- // Store the prediction into the output tensor
- pred_ptr[delta_id] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_x - 0.5f * pred_w, 0.f, img_w - 1.f), pred_qinfo);
- pred_ptr[delta_id + 1] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_y - 0.5f * pred_h, 0.f, img_h - 1.f), pred_qinfo);
- pred_ptr[delta_id + 2] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_x + 0.5f * pred_w - offset, 0.f, img_w - 1.f), pred_qinfo);
- pred_ptr[delta_id + 3] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_y + 0.5f * pred_h - offset, 0.f, img_h - 1.f), pred_qinfo);
- }
- },
- box_it);
-}
-
-template <typename T>
-void NEBoundingBoxTransformKernel::internal_run(const Window &window)
-{
- const size_t num_classes = _deltas->info()->tensor_shape()[0] >> 2;
- const size_t deltas_width = _deltas->info()->tensor_shape()[0];
- const int img_h = std::floor(_bbinfo.img_height() / _bbinfo.scale() + 0.5f);
- const int img_w = std::floor(_bbinfo.img_width() / _bbinfo.scale() + 0.5f);
-
- const auto scale_after = (_bbinfo.apply_scale() ? T(_bbinfo.scale()) : T(1));
- const auto scale_before = T(_bbinfo.scale());
- ARM_COMPUTE_ERROR_ON(scale_before <= 0);
- const auto offset = (_bbinfo.correct_transform_coords() ? T(1.f) : T(0.f));
-
- auto pred_ptr = reinterpret_cast<T *>(_pred_boxes->buffer() + _pred_boxes->info()->offset_first_element_in_bytes());
- auto delta_ptr = reinterpret_cast<T *>(_deltas->buffer() + _deltas->info()->offset_first_element_in_bytes());
-
- Iterator box_it(_boxes, window);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto ptr = reinterpret_cast<T *>(box_it.ptr());
- const auto b0 = *ptr;
- const auto b1 = *(ptr + 1);
- const auto b2 = *(ptr + 2);
- const auto b3 = *(ptr + 3);
- const T width = (b2 / scale_before) - (b0 / scale_before) + T(1.f);
- const T height = (b3 / scale_before) - (b1 / scale_before) + T(1.f);
- const T ctr_x = (b0 / scale_before) + T(0.5f) * width;
- const T ctr_y = (b1 / scale_before) + T(0.5f) * height;
- for(size_t j = 0; j < num_classes; ++j)
- {
- // Extract deltas
- const size_t delta_id = id.y() * deltas_width + 4u * j;
- const T dx = delta_ptr[delta_id] / T(_bbinfo.weights()[0]);
- const T dy = delta_ptr[delta_id + 1] / T(_bbinfo.weights()[1]);
- T dw = delta_ptr[delta_id + 2] / T(_bbinfo.weights()[2]);
- T dh = delta_ptr[delta_id + 3] / T(_bbinfo.weights()[3]);
- // Clip dw and dh
- dw = std::min(dw, T(_bbinfo.bbox_xform_clip()));
- dh = std::min(dh, T(_bbinfo.bbox_xform_clip()));
- // Determine the predictions
- const T pred_ctr_x = dx * width + ctr_x;
- const T pred_ctr_y = dy * height + ctr_y;
- const T pred_w = std::exp(dw) * width;
- const T pred_h = std::exp(dh) * height;
- // Store the prediction into the output tensor
- pred_ptr[delta_id] = scale_after * utility::clamp<T>(pred_ctr_x - T(0.5f) * pred_w, T(0), T(img_w - 1));
- pred_ptr[delta_id + 1] = scale_after * utility::clamp<T>(pred_ctr_y - T(0.5f) * pred_h, T(0), T(img_h - 1));
- pred_ptr[delta_id + 2] = scale_after * utility::clamp<T>(pred_ctr_x + T(0.5f) * pred_w - offset, T(0), T(img_w - 1));
- pred_ptr[delta_id + 3] = scale_after * utility::clamp<T>(pred_ctr_y + T(0.5f) * pred_h - offset, T(0), T(img_h - 1));
- }
- },
- box_it);
-}
-
void NEBoundingBoxTransformKernel::run(const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- switch(_boxes->info()->data_type())
- {
- case DataType::F32:
- {
- internal_run<float>(window);
- break;
- }
- case DataType::QASYMM16:
- {
- internal_run<uint16_t>(window);
- break;
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- {
- internal_run<float16_t>(window);
- break;
- }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- default:
- {
- ARM_COMPUTE_ERROR("Data type not supported");
- }
- }
+
+ const auto *uk = get_implementation(BoundingBoxTransformSelectorData{ _boxes->info()->data_type() });
+ ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+ uk->ukernel(_boxes, _pred_boxes, _deltas, _bbinfo, window);
}
} // namespace arm_compute
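For illustration, a minimal stand-alone sketch of the run-time micro-kernel selection pattern that this and the following hunks introduce: a table of named kernels, a selection predicate per entry, and a first-match lookup at run() time. Type names and the example kernel body are simplified stand-ins, not the library's.

#include <cstdio>

enum class DataType { F16, F32 };

struct SelectorData
{
    DataType dt;
};

using SelectorPtr = bool (*)(const SelectorData &);
using UKernelPtr  = void (*)(const float *in, float *out, int n);

struct MicroKernel
{
    const char *name;
    SelectorPtr is_selected;
    UKernelPtr  ukernel;
};

// Example micro-kernel body; real entries would dispatch to NEON implementations.
void fp32_copy(const float *in, float *out, int n)
{
    for(int i = 0; i < n; ++i)
    {
        out[i] = in[i];
    }
}

// First match wins, so more specific entries go first.
static const MicroKernel available_kernels[] =
{
    { "fp32_example", [](const SelectorData &d) { return d.dt == DataType::F32; }, fp32_copy },
};

// Return the first kernel whose predicate accepts the selection data, else nullptr.
const MicroKernel *get_implementation(const SelectorData &data)
{
    for(const auto &uk : available_kernels)
    {
        if(uk.is_selected(data))
        {
            return &uk;
        }
    }
    return nullptr;
}

int main()
{
    const float in[4] = { 1.f, 2.f, 3.f, 4.f };
    float       out[4];
    const auto *uk = get_implementation(SelectorData{ DataType::F32 });
    if(uk != nullptr)
    {
        uk->ukernel(in, out, 4);
        std::printf("%s -> %f\n", uk->name, static_cast<double>(out[3]));
    }
    return 0;
}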
diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h
index c080ce6a5..def827836 100644
--- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h
+++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -83,9 +83,6 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
private:
- template <typename T>
- void internal_run(const Window &window);
-
const ITensor *_boxes;
ITensor *_pred_boxes;
const ITensor *_deltas;
diff --git a/src/core/NEON/kernels/NECropKernel.cpp b/src/core/NEON/kernels/NECropKernel.cpp
index fabbd6430..729402116 100644
--- a/src/core/NEON/kernels/NECropKernel.cpp
+++ b/src/core/NEON/kernels/NECropKernel.cpp
@@ -31,136 +31,92 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/CPP/Validate.h"
#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/common/Registrars.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/core/utils/helpers/bit_ops.h"
+#include "src/cpu/kernels/crop/generic/neon/list.h"
namespace arm_compute
{
namespace
{
-template <typename T>
-inline float32x4_t load_as_f32(T *ptr)
+struct CropSelectorData
{
- ARM_COMPUTE_UNUSED(ptr);
- ARM_COMPUTE_ERROR("Type not supported.");
-}
-
-template <>
-inline float32x4_t load_as_f32(float *ptr)
-{
- return wrapper::vloadq(ptr);
-}
-
-template <>
-inline float32x4_t load_as_f32(int32_t *ptr)
-{
- return vcvtq_f32_s32(wrapper::vloadq(ptr));
-}
+ DataType dt;
+};
-template <>
-inline float32x4_t load_as_f32(uint32_t *ptr)
-{
- return vcvtq_f32_u32(wrapper::vloadq(ptr));
-}
+using CropSelectorPtr = std::add_pointer<bool(const CropSelectorData &data)>::type;
+using CropUKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool)>::type;
-template <>
-inline float32x4_t load_as_f32(int16_t *ptr)
+struct CropUKernel
{
- return vcvtq_f32_s32(vmovl_s16(wrapper::vload(ptr)));
-}
+ const char *name;
+ const CropSelectorPtr is_selected;
+ CropUKernelPtr ukernel;
+};
-template <>
-inline float32x4_t load_as_f32(uint16_t *ptr)
+static const CropUKernel available_kernels[] =
{
- return vcvtq_f32_u32(vmovl_u16(wrapper::vload(ptr)));
-}
-
-template <>
-inline float32x4_t load_as_f32(uint8_t *ptr)
-{
- return vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(wrapper::vload(ptr)))));
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <>
-inline float32x4_t load_as_f32(float16_t *ptr)
-{
- return vcvt_f32_f16(wrapper::vload(ptr));
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-template <typename T>
-inline void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
- int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
-{
- // Reverse elements if width flipped.
- if(is_width_flipped)
{
- // Collapse first dimension if possible.
- if(input_has_single_channel)
- {
- int32_t x = output_width_start;
- Coordinates negative_offset(input_offset);
- negative_offset.set(1, negative_offset[1] - window_step_x + 1);
- for(; x <= output_width_limit - window_step_x; x += window_step_x, negative_offset[1] -= window_step_x)
- {
- auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(negative_offset)));
-
- in = wrapper::vrev64(in);
- in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ "fp16_neon_crop",
+ [](const CropSelectorData & data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::fp16_in_bounds_crop_window)
+ },
+ {
+ "f32_neon_crop",
+ [](const CropSelectorData & data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::fp32_in_bounds_crop_window)
+ },
+ {
+ "u8_neon_crop",
+ [](const CropSelectorData & data) { return data.dt == DataType::U8; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u8_in_bounds_crop_window)
+ },
+ {
+ "u16_neon_crop",
+ [](const CropSelectorData & data) { return data.dt == DataType::U16; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u16_in_bounds_crop_window)
+ },
+ {
+ "u32_neon_crop",
+ [](const CropSelectorData & data) { return data.dt == DataType::U32; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u32_in_bounds_crop_window)
+ },
+ {
+ "s8_neon_crop",
+ [](const CropSelectorData & data) { return data.dt == DataType::S8; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s8_in_bounds_crop_window)
+ },
+ {
+ "s16_neon_crop",
+ [](const CropSelectorData & data) { return data.dt == DataType::S16; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s16_in_bounds_crop_window)
+ },
+ {
+ "s32_neon_crop",
+ [](const CropSelectorData & data) { return data.dt == DataType::S32; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s32_in_bounds_crop_window)
+ },
+};
- wrapper::vstore(output_ptr + x, in);
- }
- input_offset[1] = negative_offset[1] + window_step_x - 1;
- for(; x < output_width_limit; ++x, --input_offset[1])
- {
- *(output_ptr + x) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
- }
- }
- else
- {
- for(int32_t x = output_width_start; x < output_width_limit; ++x, --input_offset[1])
- {
- input_offset.set(0, 0);
- int32_t c = 0;
- for(; c <= static_cast<int32_t>(input->info()->dimension(0)) - window_step_x; c += window_step_x, input_offset[0] += window_step_x)
- {
- auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
- wrapper::vstore(output_ptr + x * output->info()->dimension(0) + c, in);
- }
- for(; c < static_cast<int32_t>(input->info()->dimension(0)); ++c, ++input_offset[0])
- {
- *(output_ptr + x * output->info()->dimension(0) + c) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
- }
- }
- }
- }
- else
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel else nullptr
+ */
+const CropUKernel *get_implementation(const CropSelectorData &data)
+{
+ for(const auto &uk : available_kernels)
{
- // Use memcpy if the elements don't need converting to float.
- if(std::is_same<T, float>::value)
+ if(uk.is_selected(data))
{
- memcpy(static_cast<void *>(output_ptr + output_width_start * output->info()->dimension(0)),
- reinterpret_cast<const void *>(input->ptr_to_element(input_offset)),
- (output_width_limit - output_width_start) * output->info()->dimension(0) * output->info()->element_size());
- }
- else
- {
- int32_t x = 0;
- int32_t limit = (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0));
- float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0);
- for(; x <= limit - window_step_x; x += window_step_x, input_offset[0] += window_step_x)
- {
- auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
- wrapper::vstore(output_start_ptr + x, in);
- }
- for(; x < limit; ++x, ++input_offset[0])
- {
- *(output_start_ptr + x) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
- }
+ return &uk;
}
}
+
+ return nullptr;
}
inline void out_of_bounds_crop_window(const ITensor *output, float *output_ptr, float extrapolation_value,
@@ -234,8 +190,7 @@ inline void execute_window(const ITensor *input, const ITensor *output, Coordina
} // namespace
NECropKernel::NECropKernel()
- : _input(nullptr), _crop_boxes(nullptr), _box_ind(nullptr), _output(nullptr), _start(), _end(), _crop_box_ind(0), _extrapolation_value(0), _rows_out_of_bounds(), _cols_out_of_bounds(),
- _in_bounds_crop_function(nullptr)
+ : _input(nullptr), _crop_boxes(nullptr), _box_ind(nullptr), _output(nullptr), _start(), _end(), _crop_box_ind(0), _extrapolation_value(0), _rows_out_of_bounds(), _cols_out_of_bounds()
{
}
@@ -250,40 +205,14 @@ void NECropKernel::configure(const ITensor *input, const ITensor *crop_boxes, co
_output = output;
_crop_box_ind = crop_box_ind;
_extrapolation_value = extrapolation_value;
-
- switch(input->info()->data_type())
- {
- case DataType::F32:
- _in_bounds_crop_function = &in_bounds_crop_window<float>;
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- _in_bounds_crop_function = &in_bounds_crop_window<float16_t>;
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::U32:
- _in_bounds_crop_function = &in_bounds_crop_window<uint32_t>;
- break;
- case DataType::S32:
- _in_bounds_crop_function = &in_bounds_crop_window<int32_t>;
- break;
- case DataType::U16:
- _in_bounds_crop_function = &in_bounds_crop_window<uint16_t>;
- break;
- case DataType::S16:
- _in_bounds_crop_function = &in_bounds_crop_window<int16_t>;
- break;
- case DataType::U8:
- _in_bounds_crop_function = &in_bounds_crop_window<uint8_t>;
- break;
- default:
- ARM_COMPUTE_ERROR("Datatype not supported");
- }
}
Status NECropKernel::validate(const ITensorInfo *input, const ITensorInfo *crop_boxes, const ITensorInfo *box_ind, const ITensorInfo *output, uint32_t crop_box_ind, float extrapolation_value)
{
ARM_COMPUTE_UNUSED(extrapolation_value);
+ const auto *uk = get_implementation(CropSelectorData{ input->data_type() });
+ ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::U16, DataType::S16, DataType::F16, DataType::U32, DataType::S32, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
@@ -369,10 +298,12 @@ void NECropKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON(_input->info()->has_padding());
ARM_COMPUTE_ERROR_ON(_output->info()->has_padding());
+ const auto *uk = get_implementation(CropSelectorData{ _input->info()->data_type() });
+
uint32_t batch_index = *(reinterpret_cast<int32_t *>(_box_ind->ptr_to_element(Coordinates(_crop_box_ind))));
Coordinates input_offset(0, _end[0] < _start[0] ? _start[0] - _cols_out_of_bounds[0] : _start[0] + _cols_out_of_bounds[0],
_end[1] < _start[1] ? _start[1] - _rows_out_of_bounds[0] : _start[1] + _rows_out_of_bounds[0], batch_index);
- execute_window(_input, _output, input_offset, _extrapolation_value, _rows_out_of_bounds, _cols_out_of_bounds, _in_bounds_crop_function, _end[1] < _start[1],
+ execute_window(_input, _output, input_offset, _extrapolation_value, _rows_out_of_bounds, _cols_out_of_bounds, uk->ukernel, _end[1] < _start[1],
_cols_out_of_bounds[0] + _cols_out_of_bounds[1] < _output->info()->dimension(1), _cols_out_of_bounds[0] > 0, _cols_out_of_bounds[1] > 0,
_start[0] <= _end[0], _end[0] < _start[0]);
}
diff --git a/src/core/NEON/kernels/NECropKernel.h b/src/core/NEON/kernels/NECropKernel.h
index 742215e22..6c989c1d2 100644
--- a/src/core/NEON/kernels/NECropKernel.h
+++ b/src/core/NEON/kernels/NECropKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -107,8 +107,6 @@ private:
std::array<uint32_t, 2> _rows_out_of_bounds;
/** The number of columns out of bounds at the start and end of output. */
std::array<uint32_t, 2> _cols_out_of_bounds;
-
- NECropKernel::InBoundsCropFunction *_in_bounds_crop_function;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_NEON_CROP_KERNEL_H */
diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
index 56aed0ca2..7bba136e8 100644
--- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,15 +28,72 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Window.h"
#include "src/core/CPP/Validate.h"
+#include "src/core/common/Registrars.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
+#include "src/cpu/kernels/genproposals/list.h"
#include <arm_neon.h>
namespace arm_compute
{
namespace
{
+struct ComputeAllAnchorsData
+{
+ DataType dt;
+};
+
+using ComputeAllAnchorsSelectorPtr = std::add_pointer<bool(const ComputeAllAnchorsData &data)>::type;
+using ComputeAllAnchorsUKernelPtr = std::add_pointer<void(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)>::type;
+
+struct ComputeAllAnchorsKernel
+{
+ const char *name;
+ const ComputeAllAnchorsSelectorPtr is_selected;
+ ComputeAllAnchorsUKernelPtr ukernel;
+};
+
+static const ComputeAllAnchorsKernel available_kernels[] =
+{
+#if defined(ARM_COMPUTE_ENABLE_NEON)
+ {
+ "neon_qu16_computeallanchors",
+ [](const ComputeAllAnchorsData & data) { return data.dt == DataType::QSYMM16; },
+ REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_computeallanchors)
+ },
+#endif //defined(ARM_COMPUTE_ENABLE_NEON)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ {
+ "neon_fp16_computeallanchors",
+ [](const ComputeAllAnchorsData & data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_computeallanchors)
+ },
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ {
+ "neon_fp32_computeallanchors",
+ [](const ComputeAllAnchorsData & data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_computeallanchors)
+ },
+};
+
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel else nullptr
+ */
+const ComputeAllAnchorsKernel *get_implementation(const ComputeAllAnchorsData &data)
+{
+ for(const auto &uk : available_kernels)
+ {
+ if(uk.is_selected(data))
+ {
+ return &uk;
+ }
+ }
+ return nullptr;
+}
+
Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(anchors, all_anchors);
@@ -100,100 +157,15 @@ Status NEComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, const ITe
return Status{};
}
-template <>
-void NEComputeAllAnchorsKernel::internal_run<int16_t>(const Window &window)
-{
- Iterator all_anchors_it(_all_anchors, window);
- Iterator anchors_it(_all_anchors, window);
-
- const size_t num_anchors = _anchors->info()->dimension(1);
- const float stride = 1.f / _anchors_info.spatial_scale();
- const size_t feat_width = _anchors_info.feat_width();
-
- const UniformQuantizationInfo qinfo = _anchors->info()->quantization_info().uniform();
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const size_t anchor_offset = id.y() % num_anchors;
-
- const auto out_anchor_ptr = reinterpret_cast<int16_t *>(all_anchors_it.ptr());
- const auto anchor_ptr = reinterpret_cast<int16_t *>(_anchors->ptr_to_element(Coordinates(0, anchor_offset)));
-
- const size_t shift_idy = id.y() / num_anchors;
- const float shiftx = (shift_idy % feat_width) * stride;
- const float shifty = (shift_idy / feat_width) * stride;
-
- const float new_anchor_x1 = dequantize_qsymm16(*anchor_ptr, qinfo.scale) + shiftx;
- const float new_anchor_y1 = dequantize_qsymm16(*(1 + anchor_ptr), qinfo.scale) + shifty;
- const float new_anchor_x2 = dequantize_qsymm16(*(2 + anchor_ptr), qinfo.scale) + shiftx;
- const float new_anchor_y2 = dequantize_qsymm16(*(3 + anchor_ptr), qinfo.scale) + shifty;
-
- *out_anchor_ptr = quantize_qsymm16(new_anchor_x1, qinfo.scale);
- *(out_anchor_ptr + 1) = quantize_qsymm16(new_anchor_y1, qinfo.scale);
- *(out_anchor_ptr + 2) = quantize_qsymm16(new_anchor_x2, qinfo.scale);
- *(out_anchor_ptr + 3) = quantize_qsymm16(new_anchor_y2, qinfo.scale);
- },
- all_anchors_it);
-}
-
-template <typename T>
-void NEComputeAllAnchorsKernel::internal_run(const Window &window)
-{
- Iterator all_anchors_it(_all_anchors, window);
- Iterator anchors_it(_all_anchors, window);
-
- const size_t num_anchors = _anchors->info()->dimension(1);
- const T stride = 1.f / _anchors_info.spatial_scale();
- const size_t feat_width = _anchors_info.feat_width();
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const size_t anchor_offset = id.y() % num_anchors;
-
- const auto out_anchor_ptr = reinterpret_cast<T *>(all_anchors_it.ptr());
- const auto anchor_ptr = reinterpret_cast<T *>(_anchors->ptr_to_element(Coordinates(0, anchor_offset)));
-
- const size_t shift_idy = id.y() / num_anchors;
- const T shiftx = (shift_idy % feat_width) * stride;
- const T shifty = (shift_idy / feat_width) * stride;
-
- *out_anchor_ptr = *anchor_ptr + shiftx;
- *(out_anchor_ptr + 1) = *(1 + anchor_ptr) + shifty;
- *(out_anchor_ptr + 2) = *(2 + anchor_ptr) + shiftx;
- *(out_anchor_ptr + 3) = *(3 + anchor_ptr) + shifty;
- },
- all_anchors_it);
-}
-
void NEComputeAllAnchorsKernel::run(const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- switch(_anchors->info()->data_type())
- {
- case DataType::QSYMM16:
- {
- internal_run<int16_t>(window);
- break;
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- {
- internal_run<float16_t>(window);
- break;
- }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- {
- internal_run<float>(window);
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Data type not supported");
- }
- }
+ const auto *uk = get_implementation(ComputeAllAnchorsData{ _anchors->info()->data_type() });
+ ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+ uk->ukernel(_anchors, _all_anchors, _anchors_info, window);
}
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
index f6d39e50a..297d6d4ab 100644
--- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
+++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -74,9 +74,6 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
private:
- template <typename T>
- void internal_run(const Window &window);
-
const ITensor *_anchors;
ITensor *_all_anchors;
ComputeAnchorsInfo _anchors_info;
diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
index d33431a8d..71641404b 100644
--- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,8 +34,10 @@
#include "src/core/CPP/Validate.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/common/Registrars.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/instancenorm/list.h"
#include <arm_neon.h>
@@ -43,137 +45,53 @@ namespace arm_compute
{
namespace
{
-template <typename InputType, typename AccType = InputType>
-void vector_float_sum(AccType &result, AccType &result_square, const InputType &inputs)
+struct InstanceNormSelectorData
{
- result = wrapper::vadd(result, inputs);
- result_square = wrapper::vadd(result_square, wrapper::vmul(inputs, inputs));
-}
+ DataType dt;
+};
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <>
-inline void vector_float_sum(float32x4_t &result, float32x4_t &result_square, const float16x8_t &inputs)
-{
- vector_float_sum(result, result_square, wrapper::vcvt<float>(wrapper::vgetlow(inputs)));
- vector_float_sum(result, result_square, wrapper::vcvt<float>(wrapper::vgethigh(inputs)));
-}
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+using InstanceNormSelctorPtr = std::add_pointer<bool(const InstanceNormSelectorData &data)>::type;
+using InstanceNormUKernelPtr = std::add_pointer<void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, const Window &window)>::type;
-template <typename InputType, typename AccType = InputType>
-InputType vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta)
+struct InstanceNormKernel
{
- return wrapper::vadd(wrapper::vmul(wrapper::vsub(inputs, vec_mean), vec_multip), vec_beta);
-}
+ const char *name;
+ const InstanceNormSelctorPtr is_selected;
+ InstanceNormUKernelPtr ukernel;
+};
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <>
-inline float16x8_t vector_float_norm(const float16x8_t &inputs, const float32x4_t &vec_mean, const float32x4_t &vec_multip, const float32x4_t &vec_beta)
+static const InstanceNormKernel available_kernels[] =
{
- const auto input_low = wrapper::vcvt<float>(wrapper::vgetlow(inputs));
- const auto input_high = wrapper::vcvt<float>(wrapper::vgethigh(inputs));
- const auto result_low = wrapper::vcvt<float16_t>(vector_float_norm(input_low, vec_mean, vec_multip, vec_beta));
- const auto result_high = wrapper::vcvt<float16_t>(vector_float_norm(input_high, vec_mean, vec_multip, vec_beta));
- float16x8_t result = wrapper::vcombine(result_low, result_high);
-
- return result;
-}
+ {
+ "fp32_neon_instancenorm",
+ [](const InstanceNormSelectorData & data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_instancenorm)
+ },
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ {
+ "fp16_neon_instancenorm",
+ [](const InstanceNormSelectorData & data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_instancenorm)
+ },
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+};
-template <typename T, typename AccType = T>
-void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window)
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel else nullptr
+ */
+const InstanceNormKernel *get_implementation(const InstanceNormSelectorData &data)
{
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
-
- // Clear X/Y dimensions on execution window as we handle the planes manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- win.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- constexpr int window_step_x = 16 / sizeof(T);
- const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1);
-
- Iterator input_it(input, win);
- execute_window_loop(win, [&](const Coordinates & id)
+ for(const auto &uk : available_kernels)
{
- Window win_plane = window;
- win_plane.set(Window::DimX, Window::Dimension(0, 1, 1));
- win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1));
- win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1));
-
- Iterator input_plane_it(input, win_plane);
- Iterator output_plane_it(output, win_plane);
-
- auto sum_h_w = static_cast<AccType>(0.f);
- auto sum_squares_h_w = static_cast<AccType>(0.f);
-
- execute_window_loop(win_plane, [&](const Coordinates &)
+ if(uk.is_selected(data))
{
- const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr());
-
- auto vec_sum_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
- auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
-
- // Compute S elements per iteration
- int x = window.x().start();
- for(; x <= (window.x().end() - window_step_x); x += window_step_x)
- {
- auto vec_input_val = wrapper::vloadq(input_ptr + x);
- vector_float_sum(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val);
- }
-
- auto vec2_sum_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w));
- auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w));
-
- vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w);
- vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w);
-
- sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0);
- sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0);
-
- // Compute left-over elements
- for(; x < window.x().end(); ++x)
- {
- const auto value = static_cast<AccType>(*(input_ptr + x));
- sum_h_w += value;
- sum_squares_h_w += value * value;
- }
- },
- input_plane_it, output_plane_it);
-
- const auto mean_h_w = sum_h_w / elements_plane;
- const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w;
-
- const auto multip_h_w = gamma / std::sqrt(var_h_w + epsilon);
- const auto vec_mean_h_w = wrapper::vdup_n(static_cast<AccType>(mean_h_w), ExactTagType{});
- const auto vec_multip_h_w = wrapper::vdup_n(static_cast<AccType>(multip_h_w), ExactTagType{});
- const auto vec_beta = wrapper::vdup_n(static_cast<AccType>(beta), ExactTagType{});
-
- execute_window_loop(win_plane, [&](const Coordinates &)
- {
- auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr());
- auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr());
-
- // Compute S elements per iteration
- int x = window.x().start();
- //auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{});
- for(; x <= (window.x().end() - window_step_x); x += window_step_x)
- {
- const auto vec_val = wrapper::vloadq(input_ptr + x);
- const auto normalized_vec = vector_float_norm(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta);
- wrapper::vstore(output_ptr + x, normalized_vec);
- }
-
- // Compute left-over elements
- for(; x < window.x().end(); ++x)
- {
- const auto val = static_cast<AccType>(*(input_ptr + x));
- *(output_ptr + x) = static_cast<T>((val - mean_h_w) * multip_h_w + beta);
- }
- },
- input_plane_it, output_plane_it);
- },
- input_it);
+ return &uk;
+ }
+ }
+ return nullptr;
}
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon)
@@ -210,7 +128,7 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe
} // namespace
NEInstanceNormalizationLayerKernel::NEInstanceNormalizationLayerKernel()
- : _func(nullptr), _input(nullptr), _output(nullptr), _gamma(1), _beta(0), _epsilon(1e-12)
+ : _input(nullptr), _output(nullptr), _gamma(1), _beta(0), _epsilon(1e-12)
{
}
@@ -227,28 +145,6 @@ void NEInstanceNormalizationLayerKernel::configure(ITensor *input, ITensor *outp
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _output->info(), _gamma, _beta, _epsilon));
- if(_input->info()->data_type() == DataType::F32)
- {
- _func = &instance_normalization_nchw<float>;
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- else if(_input->info()->data_type() == DataType::F16)
- {
- if(_use_mixed_precision)
- {
- _func = &instance_normalization_nchw<float16_t, float>;
- }
- else
- {
- _func = &instance_normalization_nchw<float16_t>;
- }
- }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- else
- {
- ARM_COMPUTE_ERROR("Unsupported data type");
- }
-
// Configure kernel window
auto win_config = validate_and_configure_window(_input->info(), _output->info());
ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
@@ -268,6 +164,10 @@ void NEInstanceNormalizationLayerKernel::run(const Window &window, const ThreadI
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- (*_func)(_input, _output, _gamma, _beta, _epsilon, window);
+
+ const auto *uk = get_implementation(InstanceNormSelectorData{ _input->info()->data_type() });
+ ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+ uk->ukernel(_input, _output, _gamma, _beta, _epsilon, _use_mixed_precision, window);
}
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
index 96c011971..f166ce205 100644
--- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
+++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -84,13 +84,12 @@ private:
*/
using NormalizationFunction = void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
- NormalizationFunction *_func;
- ITensor *_input;
- ITensor *_output;
- float _gamma;
- float _beta;
- float _epsilon;
- bool _use_mixed_precision{ true };
+ ITensor *_input;
+ ITensor *_output;
+ float _gamma;
+ float _beta;
+ float _epsilon;
+ bool _use_mixed_precision{ true };
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp
index 761fa1523..93da8a24c 100644
--- a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,9 +28,10 @@
#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/CPP/Validate.h"
+#include "src/core/common/Registrars.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
+#include "src/cpu/kernels/maxunpool/list.h"
#include "support/ToolchainSupport.h"
namespace arm_compute
@@ -39,6 +40,67 @@ using namespace misc::shape_calculator;
namespace
{
+struct MaxUnpoolingSelectorData
+{
+ DataType dt;
+};
+
+using MaxUnpoolingSelctorPtr = std::add_pointer<bool(const MaxUnpoolingSelectorData &data)>::type;
+using MaxUnpoolingUKernelPtr = std::add_pointer<void(const ITensor *input, ITensor *output, const ITensor *indices, const Window &window)>::type;
+
+struct MaxUnpoolingKernel
+{
+ const char *name;
+ const MaxUnpoolingSelctorPtr is_selected;
+ MaxUnpoolingUKernelPtr ukernel;
+};
+
+static const MaxUnpoolingKernel available_kernels[] =
+{
+ {
+ "fp32_neon_maxunpooling",
+ [](const MaxUnpoolingSelectorData & data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_maxunpooling)
+ },
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ {
+ "fp16_neon_maxunpooling",
+ [](const MaxUnpoolingSelectorData & data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_maxunpooling)
+ },
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if defined(ARM_COMPUTE_ENABLE_NEON)
+ {
+ "qs8_neon_maxunpooling",
+ [](const MaxUnpoolingSelectorData & data) { return data.dt == DataType::QASYMM8; },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qs8_maxunpooling)
+ },
+ {
+ "qu8_neon_maxunpooling",
+ [](const MaxUnpoolingSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qu8_maxunpooling)
+ },
+#endif //defined(ARM_COMPUTE_ENABLE_NEON)
+};
+
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel else nullptr
+ */
+const MaxUnpoolingKernel *get_implementation(const MaxUnpoolingSelectorData &data)
+{
+ for(const auto &uk : available_kernels)
+ {
+ if(uk.is_selected(data))
+ {
+ return &uk;
+ }
+ }
+ return nullptr;
+}
+
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, indices);
@@ -69,7 +131,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
} // namespace
NEMaxUnpoolingLayerKernel::NEMaxUnpoolingLayerKernel()
- : _func(nullptr), _input(nullptr), _output(nullptr), _indices(nullptr)
+ : _input(nullptr), _output(nullptr), _indices(nullptr)
{
}
@@ -82,46 +144,12 @@ void NEMaxUnpoolingLayerKernel::configure(const ITensor *input, const ITensor *i
_output = output;
_indices = indices;
- switch(input->info()->data_type())
- {
- case DataType::F32:
- _func = &NEMaxUnpoolingLayerKernel::unpooling2<float>;
- break;
- case DataType::QASYMM8:
- _func = &NEMaxUnpoolingLayerKernel::unpooling2<uint8_t>;
- break;
- case DataType::QASYMM8_SIGNED:
- _func = &NEMaxUnpoolingLayerKernel::unpooling2<int8_t>;
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- _func = &NEMaxUnpoolingLayerKernel::unpooling2<float16_t>;
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- break;
- }
const TensorShape output_shape = compute_unpool_shape(*input->info(), pool_info);
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
auto window = calculate_max_window(*input->info(), Steps());
INEKernel::configure(window);
}
-template <typename T>
-void NEMaxUnpoolingLayerKernel::unpooling2(const Window &window)
-{
- Iterator input(_input, window);
- Iterator indices(_indices, window);
- auto out_ptr = reinterpret_cast<T *>(_output->buffer());
- const int out_stride_w = static_cast<int>(_output->info()->strides_in_bytes()[3]);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- auto vindices = reinterpret_cast<uint32_t *>(indices.ptr());
- auto vinput = reinterpret_cast<T *>(input.ptr());
- out_ptr[id[3] * out_stride_w / sizeof(T) + *vindices] = *vinput;
- },
- input, indices);
-}
Status NEMaxUnpoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
{
@@ -135,8 +163,9 @@ void NEMaxUnpoolingLayerKernel::run(const Window &window, const ThreadInfo &info
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
- // Run function
- (this->*_func)(window);
+ const auto *uk = get_implementation(MaxUnpoolingSelectorData{ _input->info()->data_type() });
+ ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+ uk->ukernel(_input, _output, _indices, window);
}
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h
index ecc116e58..f7f9a31f6 100644
--- a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h
+++ b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,10 +88,9 @@ private:
using UnpoolingFunction = void (NEMaxUnpoolingLayerKernel::*)(const Window &window);
private:
- UnpoolingFunction _func;
- const ITensor *_input;
- ITensor *_output;
- const ITensor *_indices;
+ const ITensor *_input;
+ ITensor *_output;
+ const ITensor *_indices;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_NEMAXUNPOOLINGLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
index d1c7d4eb9..7d8fc7ec7 100644
--- a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,13 +31,64 @@
#include "src/core/CPP/Validate.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/common/Registrars.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/meanstddevnorm/list.h"
namespace arm_compute
{
namespace
{
+struct MeanStdDevNormSelectorData
+{
+ DataType dt;
+};
+
+using MeanStdDevNormSelctorPtr = std::add_pointer<bool(const MeanStdDevNormSelectorData &data)>::type;
+using MeanStdDevNormUKernelPtr = std::add_pointer<void(ITensor *input, ITensor *output, float epsilon, const Window &window)>::type;
+
+struct MeanStdDevNormKernel
+{
+ const char *name;
+ const MeanStdDevNormSelctorPtr is_selected;
+ MeanStdDevNormUKernelPtr ukernel;
+};
+
+static const MeanStdDevNormKernel available_kernels[] =
+{
+ {
+ "fp32_neon_meanstddevnorm",
+ [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_meanstddevnorm)
+ },
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ {
+ "fp16_neon_meanstddevnorm",
+ [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_meanstddevnorm)
+ },
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+};
+
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel else nullptr
+ */
+const MeanStdDevNormKernel *get_implementation(const MeanStdDevNormSelectorData &data)
+{
+ for(const auto &uk : available_kernels)
+ {
+ if(uk.is_selected(data))
+ {
+ return &uk;
+ }
+ }
+ return nullptr;
+}
+
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, float epsilon)
{
ARM_COMPUTE_UNUSED(epsilon);
@@ -72,80 +123,8 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
}
} // namespace
-template <typename ScalarType, int size>
-void NEMeanStdDevNormalizationKernel::mean_stddev_normalization(const Window &window)
-{
- using ExactTagType = typename wrapper::traits::neon_vector<ScalarType, size>::tag_type;
-
- // Set build options
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = size;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Iterator input(_input, win);
- Iterator output(_output, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- auto in_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
- auto out_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-
- auto sum_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{});
- auto sum_sq_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{});
-
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- auto data = wrapper::vloadq(in_ptr + x);
- sum_vec = wrapper::vadd(sum_vec, data);
- sum_sq_vec = wrapper::vadd(sum_sq_vec, wrapper::vmul(data, data));
- }
-
- auto sum_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_vec), wrapper::vgetlow(sum_vec));
- auto sum_sq_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_sq_vec), wrapper::vgetlow(sum_sq_vec));
- for(int i = 0; i < size / 4; ++i)
- {
- sum_carry_res = wrapper::vpadd(sum_carry_res, sum_carry_res);
- sum_sq_carry_res = wrapper::vpadd(sum_sq_carry_res, sum_sq_carry_res);
- }
-
- auto sum = wrapper::vgetlane(sum_carry_res, 0);
- auto sum_sq = wrapper::vgetlane(sum_sq_carry_res, 0);
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- ScalarType data = *(in_ptr + x);
- sum += data;
- sum_sq += data * data;
- }
-
- ScalarType mean = sum / _input->info()->dimension(0);
- ScalarType var = (sum_sq / _input->info()->dimension(0)) - (mean * mean);
- ScalarType stddev_inv = 1.f / sqrt(var + _epsilon);
-
- auto mean_vec = wrapper::vdup_n(mean, ExactTagType{});
- auto stddev_inv_vec = wrapper::vdup_n(stddev_inv, ExactTagType{});
- for(x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- auto data = wrapper::vloadq(in_ptr + x);
- auto res = wrapper::vmul(wrapper::vsub(data, mean_vec), stddev_inv_vec);
- // Store results
- wrapper::vstore(out_ptr + x, res);
- }
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv;
- }
- },
- input, output);
-}
-
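[Editor's sketch, not part of the patch] The member template deleted above is the vectorised form of per-row mean/standard-deviation normalisation; the arithmetic it implements is out[i] = (in[i] - mean) * 1/sqrt(var + epsilon), with mean and variance accumulated along dimension 0. A scalar reference of that computation over one row, assuming the row is non-empty (mean_stddev_normalize is a hypothetical name):

#include <cmath>
#include <vector>

// Normalises one row in place to zero mean and unit variance; epsilon keeps the
// divisor finite for (near-)constant rows, mirroring the kernel's _epsilon member.
void mean_stddev_normalize(std::vector<float> &row, float epsilon = 1e-8f)
{
    float sum    = 0.f;
    float sum_sq = 0.f;
    for(float v : row)
    {
        sum    += v;
        sum_sq += v * v;
    }
    const float n          = static_cast<float>(row.size());
    const float mean       = sum / n;
    const float var        = (sum_sq / n) - (mean * mean);
    const float stddev_inv = 1.f / std::sqrt(var + epsilon);
    for(float &v : row)
    {
        v = (v - mean) * stddev_inv;
    }
}

The deleted NEON path computes exactly these quantities, accumulating sum and sum_sq in vector lanes and reducing them with pairwise adds before the second pass.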
NEMeanStdDevNormalizationKernel::NEMeanStdDevNormalizationKernel()
- : _input(nullptr), _output(nullptr), _epsilon(1e-8f), _func(nullptr)
+ : _input(nullptr), _output(nullptr), _epsilon(1e-8f)
{
}
@@ -163,23 +142,6 @@ void NEMeanStdDevNormalizationKernel::configure(ITensor *input, ITensor *output,
auto win_config = validate_and_configure_window(input->info(), (output == nullptr) ? nullptr : output->info());
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICPPKernel::configure(win_config.second);
-
- // Configure function to run based on different data types
- const DataType data_type = input->info()->data_type();
- switch(data_type)
- {
- case DataType::F32:
- _func = &NEMeanStdDevNormalizationKernel::mean_stddev_normalization<float, 4>;
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- _func = &NEMeanStdDevNormalizationKernel::mean_stddev_normalization<float16_t, 8>;
- break;
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- default:
- ARM_COMPUTE_ERROR("Not Supported");
- break;
- }
}
Status NEMeanStdDevNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, float epsilon)
@@ -194,8 +156,10 @@ void NEMeanStdDevNormalizationKernel::run(const Window &window, const ThreadInfo
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
- (this->*_func)(window);
+ const auto *uk = get_implementation(MeanStdDevNormSelectorData{ _output->info()->data_type() });
+ ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+ uk->ukernel(_input, _output, _epsilon, window);
}
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h
index 59d073ada..844f0efdc 100644
--- a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h
+++ b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -91,8 +91,6 @@ private:
float _epsilon;
using MeanStdDevNormFunction = void (NEMeanStdDevNormalizationKernel::*)(const Window &window);
-
- MeanStdDevNormFunction _func;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEMEANSTDDEVNORMALIZATIONKERNEL_H */
diff --git a/src/core/NEON/kernels/NEPadLayerKernel.cpp b/src/core/NEON/kernels/NEPadLayerKernel.cpp
index 60986812b..734510b63 100644
--- a/src/core/NEON/kernels/NEPadLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPadLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -261,9 +261,10 @@ void NEPadLayerKernel::run(const Window &window, const ThreadInfo &info)
size_t NEPadLayerKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
{
- ARM_COMPUTE_UNUSED(platform, thread_count);
-
- return ICPPKernel::small_network_mws;
+ ARM_COMPUTE_UNUSED(thread_count);
+ ARM_COMPUTE_UNUSED(platform);
+
+ return ICPPKernel::default_mws;
}
} // namespace arm_compute
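[Editor's note, not part of the patch] The change above only swaps which minimum-workload-size (MWS) hint NEPadLayerKernel::get_mws reports; the scheduling policy that consumes the hint is outside this hunk. One plausible use of such a hint, shown purely as an illustration (threads_to_use is a hypothetical helper), is to cap the thread count so that every thread receives at least mws units of work:

#include <algorithm>
#include <cstddef>

// Number of threads worth launching so that each one gets at least 'mws' work units.
std::size_t threads_to_use(std::size_t total_work, std::size_t mws, std::size_t available_threads)
{
    if(total_work == 0 || mws == 0)
    {
        return 1;
    }
    const std::size_t useful = (total_work + mws - 1) / mws; // ceil(total_work / mws)
    return std::max<std::size_t>(1, std::min(useful, available_threads));
}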
diff --git a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
index ece7e40e3..802aebb52 100644
--- a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,8 +30,10 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/misc/Utility.h"
#include "src/core/CPP/Validate.h"
+#include "src/core/common/Registrars.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/roialign/list.h"
#include <arm_neon.h>
@@ -41,6 +43,67 @@ namespace arm_compute
{
namespace
{
+struct ROIAlignSelectorData
+{
+ DataType dt;
+};
+
+using ROIAlignSelectorPtr = std::add_pointer<bool(const ROIAlignSelectorData &data)>::type;
+using ROIAlignUKernelPtr = std::add_pointer<void(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)>::type;
+
+struct ROIAlignKernel
+{
+ const char *name;
+ const ROIAlignSelectorPtr is_selected;
+ ROIAlignUKernelPtr ukernel;
+};
+
+static const ROIAlignKernel available_kernels[] =
+{
+ {
+ "fp32_neon_roialign",
+ [](const ROIAlignSelectorData & data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_roialign)
+ },
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ {
+ "fp16_neon_roialign",
+ [](const ROIAlignSelectorData & data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_roialign)
+ },
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if defined(ARM_COMPUTE_ENABLE_NEON)
+ {
+ "qu8_neon_roialign",
+ [](const ROIAlignSelectorData & data) { return data.dt == DataType::QASYMM8; },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qu8_roialign)
+ },
+ {
+ "qs8_neon_roialign",
+ [](const ROIAlignSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qs8_roialign)
+ },
+#endif //defined(ARM_COMPUTE_ENABLE_NEON)
+};
+
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel, or nullptr if none is selected
+ */
+const ROIAlignKernel *get_implementation(const ROIAlignSelectorData &data)
+{
+ for(const auto &uk : available_kernels)
+ {
+ if(uk.is_selected(data))
+ {
+ return &uk;
+ }
+ }
+ return nullptr;
+}
+
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output);
@@ -110,328 +173,19 @@ Status NEROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorIn
return Status{};
}
-/** Average pooling over an aligned window */
-template <typename input_data_type>
-inline input_data_type roi_align_1x1(const ITensor *input,
- unsigned int roi_batch,
- float region_start_x,
- float bin_size_x,
- int grid_size_x,
- float region_end_x,
- float region_start_y,
- float bin_size_y,
- int grid_size_y,
- float region_end_y,
- int pz)
-{
- if((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
- {
- return input_data_type(0);
- }
- else
- {
- const DataLayout data_layout = input->info()->data_layout();
- float avg = 0;
- // Iterate through the aligned pooling region
- for(int iy = 0; iy < grid_size_y; ++iy)
- {
- for(int ix = 0; ix < grid_size_x; ++ix)
- {
- // Align the window in the middle of every bin
- float y = region_start_y + (iy + 0.5) * bin_size_y / float(grid_size_y);
- float x = region_start_x + (ix + 0.5) * bin_size_x / float(grid_size_x);
-
- // Interpolation in the [0,0] [0,1] [1,0] [1,1] square
- const int y_low = y;
- const int x_low = x;
- const int y_high = y_low + 1;
- const int x_high = x_low + 1;
-
- const float ly = y - y_low;
- const float lx = x - x_low;
- const float hy = 1. - ly;
- const float hx = 1. - lx;
-
- const float w1 = hy * hx;
- const float w2 = hy * lx;
- const float w3 = ly * hx;
- const float w4 = ly * lx;
- if(data_layout == DataLayout::NCHW)
- {
- const auto data1 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch)));
- const auto data2 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch)));
- const auto data3 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch)));
- const auto data4 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch)));
- avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
- }
- else
- {
- const auto data1 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch)));
- const auto data2 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch)));
- const auto data3 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch)));
- const auto data4 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch)));
- avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
- }
- }
- }
-
- avg /= grid_size_x * grid_size_y;
- return input_data_type(avg);
- }
-}
-
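[Editor's sketch, not part of the patch] roi_align_1x1, deleted above, averages bilinearly interpolated samples over the bin's sampling grid. Each sample is the standard 2x2 bilinear blend: split the sampling point into its containing cell and the fractional offsets (lx, ly), then weight the four neighbours by the opposite areas. The blend in isolation (bilinear_sample is a hypothetical name; data1..data4 follow the deleted code's corner order):

#include <cmath>

// data1 = value at (x_low, y_low), data2 at (x_high, y_low),
// data3 = value at (x_low, y_high), data4 at (x_high, y_high).
float bilinear_sample(float data1, float data2, float data3, float data4, float x, float y)
{
    const float lx = x - std::floor(x); // fractional offset towards x_high
    const float ly = y - std::floor(y); // fractional offset towards y_high
    const float hx = 1.f - lx;
    const float hy = 1.f - ly;

    // The four weights are the areas of the opposite sub-rectangles and sum to 1.
    const float w1 = hy * hx;
    const float w2 = hy * lx;
    const float w3 = ly * hx;
    const float w4 = ly * lx;

    return w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
}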
-/** Average pooling over an aligned window */
-template <typename input_data_type>
-inline input_data_type roi_align_1x1_qasymm8(const ITensor *input,
- unsigned int roi_batch,
- float region_start_x,
- float bin_size_x,
- int grid_size_x,
- float region_end_x,
- float region_start_y,
- float bin_size_y,
- int grid_size_y,
- float region_end_y,
- int pz,
- const QuantizationInfo &out_qinfo)
-{
- if((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
- {
- return input_data_type(out_qinfo.uniform().offset);
- }
- else
- {
- float avg = 0;
- const UniformQuantizationInfo input_qinfo = input->info()->quantization_info().uniform();
- const bool is_qasymm_signed = is_data_type_quantized_asymmetric_signed(input->info()->data_type());
- const DataLayout data_layout = input->info()->data_layout();
-
- // Iterate through the aligned pooling region
- for(int iy = 0; iy < grid_size_y; ++iy)
- {
- for(int ix = 0; ix < grid_size_x; ++ix)
- {
- // Align the window in the middle of every bin
- float y = region_start_y + (iy + 0.5) * bin_size_y / float(grid_size_y);
- float x = region_start_x + (ix + 0.5) * bin_size_x / float(grid_size_x);
-
- // Interpolation in the [0,0] [0,1] [1,0] [1,1] square
- const int y_low = y;
- const int x_low = x;
- const int y_high = y_low + 1;
- const int x_high = x_low + 1;
-
- const float ly = y - y_low;
- const float lx = x - x_low;
- const float hy = 1. - ly;
- const float hx = 1. - lx;
-
- const float w1 = hy * hx;
- const float w2 = hy * lx;
- const float w3 = ly * hx;
- const float w4 = ly * lx;
-
- if(data_layout == DataLayout::NCHW)
- {
- if(is_qasymm_signed)
- {
- float data1 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))), input_qinfo);
- float data2 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))), input_qinfo);
- float data3 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))), input_qinfo);
- float data4 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))), input_qinfo);
- avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
- }
- else
- {
- float data1 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))), input_qinfo);
- float data2 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))), input_qinfo);
- float data3 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))), input_qinfo);
- float data4 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))), input_qinfo);
- avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
- }
- }
- else
- {
- if(is_qasymm_signed)
- {
- const auto data1 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))), input_qinfo);
- const auto data2 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))), input_qinfo);
- const auto data3 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))), input_qinfo);
- const auto data4 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))), input_qinfo);
- avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
- }
- else
- {
- const auto data1 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))), input_qinfo);
- const auto data2 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))), input_qinfo);
- const auto data3 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))), input_qinfo);
- const auto data4 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))), input_qinfo);
- avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
- }
- }
- }
- }
-
- avg /= grid_size_x * grid_size_y;
-
- input_data_type res = 0;
- if(is_qasymm_signed)
- {
- res = quantize_qasymm8_signed(avg, out_qinfo);
- }
- else
- {
- res = quantize_qasymm8(avg, out_qinfo);
- }
- return res;
- }
-}
-
-inline float compute_region_coordinate(int p, float bin_size, float roi_anchor, float max_value)
-{
- const float region_start = p * bin_size + roi_anchor;
- return utility::clamp(region_start, 0.0f, max_value);
-}
-
void NEROIAlignLayerKernel::run(const Window &window, const ThreadInfo &info)
{
const DataLayout data_layout = _input->info()->data_layout();
if(data_layout == DataLayout::NCHW || data_layout == DataLayout::NHWC)
{
- switch(_input->info()->data_type())
- {
- case DataType::QASYMM8:
- {
- NEROIAlignLayerKernel::internal_run<uint8_t, uint16_t>(window, info);
- break;
- }
- case DataType::QASYMM8_SIGNED:
- {
- NEROIAlignLayerKernel::internal_run<int8_t, uint16_t>(window, info);
- break;
- }
- case DataType::F32:
- {
- NEROIAlignLayerKernel::internal_run<float>(window, info);
- break;
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- {
- NEROIAlignLayerKernel::internal_run<float16_t>(window, info);
- break;
- }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- default:
- {
- ARM_COMPUTE_ERROR("DataType not supported");
- break;
- }
- }
+ const auto *uk = get_implementation(ROIAlignSelectorData{ _input->info()->data_type() });
+ ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+ uk->ukernel(_input, _output, _rois, _pool_info, window, info);
}
else
{
ARM_COMPUTE_ERROR("Invalid layout");
}
}
-
-template <typename input_data_type, typename roi_data_type>
-void NEROIAlignLayerKernel::internal_run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- const DataLayout data_layout = _input->info()->data_layout();
- const size_t values_per_roi = _rois->info()->dimension(0);
-
- const int roi_list_start = window.x().start();
- const int roi_list_end = window.x().end();
-
- const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const unsigned int idx_depth = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-
- const int input_width = _input->info()->dimension(idx_width);
- const int input_height = _input->info()->dimension(idx_height);
- const int input_chanels = _input->info()->dimension(idx_depth);
- const int pooled_w = _pool_info.pooled_width();
- const int pooled_h = _pool_info.pooled_height();
-
- const DataType data_type = _input->info()->data_type();
- const bool is_qasymm = is_data_type_quantized_asymmetric(data_type);
-
- const auto *rois_ptr = reinterpret_cast<const roi_data_type *>(_rois->buffer());
- const QuantizationInfo &rois_qinfo = _rois->info()->quantization_info();
- for(int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx)
- {
- const unsigned int roi_batch = rois_ptr[values_per_roi * roi_indx];
-
- roi_data_type qx1 = rois_ptr[values_per_roi * roi_indx + 1];
- roi_data_type qy1 = rois_ptr[values_per_roi * roi_indx + 2];
- roi_data_type qx2 = rois_ptr[values_per_roi * roi_indx + 3];
- roi_data_type qy2 = rois_ptr[values_per_roi * roi_indx + 4];
- float x1(qx1);
- float x2(qx2);
- float y1(qy1);
- float y2(qy2);
- if(is_qasymm)
- {
- x1 = dequantize_qasymm16(qx1, rois_qinfo);
- x2 = dequantize_qasymm16(qx2, rois_qinfo);
- y1 = dequantize_qasymm16(qy1, rois_qinfo);
- y2 = dequantize_qasymm16(qy2, rois_qinfo);
- }
- const float roi_anchor_x = x1 * _pool_info.spatial_scale();
- const float roi_anchor_y = y1 * _pool_info.spatial_scale();
- const float roi_dims_x = std::max((x2 - x1) * _pool_info.spatial_scale(), 1.0f);
- const float roi_dims_y = std::max((y2 - y1) * _pool_info.spatial_scale(), 1.0f);
- float bin_size_x = roi_dims_x / _pool_info.pooled_width();
- float bin_size_y = roi_dims_y / _pool_info.pooled_height();
-
- // Iterate through all feature maps
- for(int ch = 0; ch < input_chanels; ++ch)
- {
- // Iterate through all output pixels
- for(int py = 0; py < pooled_h; ++py)
- {
- for(int px = 0; px < pooled_w; ++px)
- {
- const float region_start_x = compute_region_coordinate(px, bin_size_x, roi_anchor_x, input_width);
- const float region_start_y = compute_region_coordinate(py, bin_size_y, roi_anchor_y, input_height);
- const float region_end_x = compute_region_coordinate(px + 1, bin_size_x, roi_anchor_x, input_width);
- const float region_end_y = compute_region_coordinate(py + 1, bin_size_y, roi_anchor_y, input_height);
- const int roi_bin_grid_x = (_pool_info.sampling_ratio() > 0) ? _pool_info.sampling_ratio() : int(ceil(bin_size_x));
- const int roi_bin_grid_y = (_pool_info.sampling_ratio() > 0) ? _pool_info.sampling_ratio() : int(ceil(bin_size_y));
- input_data_type out_val(0);
- if(is_qasymm)
- {
- out_val = roi_align_1x1_qasymm8<input_data_type>(
- _input, roi_batch, region_start_x, bin_size_x,
- roi_bin_grid_x, region_end_x, region_start_y, bin_size_y,
- roi_bin_grid_y, region_end_y, ch, _output->info()->quantization_info());
- }
- else
- {
- out_val = roi_align_1x1<input_data_type>(
- _input, roi_batch, region_start_x, bin_size_x,
- roi_bin_grid_x, region_end_x, region_start_y, bin_size_y,
- roi_bin_grid_y, region_end_y, ch);
- }
-
- if(data_layout == DataLayout::NCHW)
- {
- auto out_ptr = reinterpret_cast<input_data_type *>(_output->ptr_to_element(Coordinates(px, py, ch, roi_indx)));
- *out_ptr = out_val;
- }
- else
- {
- auto out_ptr = reinterpret_cast<input_data_type *>(_output->ptr_to_element(Coordinates(ch, px, py, roi_indx)));
- *out_ptr = out_val;
- }
- }
- }
- }
- }
-}
} // namespace arm_compute
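[Editor's sketch, not part of the patch] The deleted internal_run mapped each ROI from box coordinates to pooled bins: scale the box by spatial_scale, force a minimum size of 1, split it into pooled_w x pooled_h bins, clamp each bin edge into the image, and choose the per-bin sampling grid (sampling_ratio when positive, otherwise the ceiling of the bin size). The coordinate arithmetic, stripped of tensor access and assuming pooled_w and pooled_h are positive (Bin and roi_bin are hypothetical names):

#include <algorithm>
#include <cmath>

struct Bin
{
    float start_x, end_x, start_y, end_y;
    int   grid_x, grid_y; // number of interpolated samples per bin in each direction
};

// Geometry of pooled bin (px, py) for an ROI box (x1, y1, x2, y2) given in input coordinates.
Bin roi_bin(float x1, float y1, float x2, float y2, float spatial_scale,
            int pooled_w, int pooled_h, int sampling_ratio,
            float input_w, float input_h, int px, int py)
{
    const float anchor_x = x1 * spatial_scale;
    const float anchor_y = y1 * spatial_scale;
    const float roi_w    = std::max((x2 - x1) * spatial_scale, 1.0f);
    const float roi_h    = std::max((y2 - y1) * spatial_scale, 1.0f);
    const float bin_w    = roi_w / pooled_w;
    const float bin_h    = roi_h / pooled_h;

    auto clamp_coord = [](float v, float max_v) { return std::min(std::max(v, 0.0f), max_v); };

    Bin b{};
    b.start_x = clamp_coord(px * bin_w + anchor_x, input_w);
    b.end_x   = clamp_coord((px + 1) * bin_w + anchor_x, input_w);
    b.start_y = clamp_coord(py * bin_h + anchor_y, input_h);
    b.end_y   = clamp_coord((py + 1) * bin_h + anchor_y, input_h);
    b.grid_x  = (sampling_ratio > 0) ? sampling_ratio : static_cast<int>(std::ceil(bin_w));
    b.grid_y  = (sampling_ratio > 0) ? sampling_ratio : static_cast<int>(std::ceil(bin_h));
    return b;
}

Each bin's grid_x * grid_y samples are then averaged, which is what the deleted roi_align_1x1 variants did for the floating-point and quantized cases.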
diff --git a/src/core/NEON/kernels/NEROIAlignLayerKernel.h b/src/core/NEON/kernels/NEROIAlignLayerKernel.h
index fa31a879b..48a3de728 100644
--- a/src/core/NEON/kernels/NEROIAlignLayerKernel.h
+++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -89,9 +89,6 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
private:
- template <typename input_data_type, typename roi_data_type = input_data_type>
- void internal_run(const Window &window, const ThreadInfo &info);
-
const ITensor *_input;
ITensor *_output;
const ITensor *_rois;
diff --git a/src/core/NEON/kernels/NERangeKernel.cpp b/src/core/NEON/kernels/NERangeKernel.cpp
index 0395e0bd3..82d1403c5 100644
--- a/src/core/NEON/kernels/NERangeKernel.cpp
+++ b/src/core/NEON/kernels/NERangeKernel.cpp
@@ -30,68 +30,96 @@
#include "arm_compute/core/Validate.h"
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/common/Registrars.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
-#include "arm_compute/core/Utils.h"
+#include "src/cpu/kernels/range/list.h"
namespace arm_compute
{
namespace
{
-template <typename T>
-void range_function(ITensor *output, float start, float step, const Window &window)
+struct RangeSelectorData
{
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>::tag_type;
-
- const auto step_vec = wrapper::vdup_n(static_cast<T>(step), ExactTagType{});
- const auto start_vec = wrapper::vdup_n(static_cast<T>(start), ExactTagType{});
- auto id_vec = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ DataType dt;
+};
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const int window_step_x = 16 / sizeof(T);
+using RangeSelectorPtr = std::add_pointer<bool(const RangeSelectorData &data)>::type;
+using RangeUKernelPtr = std::add_pointer<void(ITensor *, float, float, const Window &)>::type;
- Window win{ window };
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator output_it(output, win);
+struct RangeUKernel
+{
+ const char *name;
+ const RangeSelectorPtr is_selected;
+ RangeUKernelPtr ukernel;
+};
- execute_window_loop(win, [&](const Coordinates &)
+static const RangeUKernel available_kernels[] =
+{
{
- int x = window_start_x;
- const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- for(int count = 0; count < window_step_x; ++count)
- {
- id_vec = wrapper::vsetlane(static_cast<T>(x + count), id_vec, count);
- }
-
- // start + step * id
- const auto res_vec = wrapper::vmla(start_vec, id_vec, step_vec);
- wrapper::vstore(out_ptr + x, res_vec);
- }
+ "fp16_neon_range",
+ [](const RangeSelectorData & data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_range_function)
+ },
+ {
+ "f32_neon_range",
+ [](const RangeSelectorData & data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_range_function)
+ },
+ {
+ "u8_neon_range",
+ [](const RangeSelectorData & data) { return data.dt == DataType::U8; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_range_function)
+ },
+ {
+ "u16_neon_range",
+ [](const RangeSelectorData & data) { return data.dt == DataType::U16; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u16_neon_range_function)
+ },
+ {
+ "u32_neon_range",
+ [](const RangeSelectorData & data) { return data.dt == DataType::U32; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u32_neon_range_function)
+ },
+ {
+ "s8_neon_range",
+ [](const RangeSelectorData & data) { return data.dt == DataType::S8; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_range_function)
+ },
+ {
+ "s16_neon_range",
+ [](const RangeSelectorData & data) { return data.dt == DataType::S16; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_range_function)
+ },
+ {
+ "s32_neon_range",
+ [](const RangeSelectorData & data) { return data.dt == DataType::S32; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s32_neon_range_function)
+ },
+};
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel, or nullptr if none is selected
+ */
+const RangeUKernel *get_implementation(const RangeSelectorData &data)
+{
+ for(const auto &uk : available_kernels)
+ {
+ if(uk.is_selected(data))
{
- const auto res = start + x * step;
- *(out_ptr + x) = res;
+ return &uk;
}
-
- },
- output_it);
+ }
+ return nullptr;
}
Status validate_arguments(const ITensorInfo &output, const float start, const float end, const float step)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output,
- 1,
- DataType::U8, DataType::S8,
- DataType::U16, DataType::S16,
- DataType::U32, DataType::S32,
- DataType::F16, DataType::F32);
+ const auto *uk = get_implementation(RangeSelectorData{ output.data_type() });
+ ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start < end) && (step <= 0)), "step must be greater than 0 when start < end");
@@ -111,7 +139,7 @@ Status validate_arguments(const ITensorInfo &output, const float start, const fl
} // namespace
NERangeKernel::NERangeKernel()
- : _func(nullptr), _start(0), _end(1), _step(1), _output(nullptr)
+ : _start(0), _end(1), _step(1), _output(nullptr)
{
}
@@ -131,38 +159,6 @@ void NERangeKernel::configure(ITensor *output, float start, float end, float ste
_end = end;
_step = step;
_output = output;
- switch(_output->info()->data_type())
- {
- case DataType::U8:
- _func = &range_function<uint8_t>;
- break;
- case DataType::U16:
- _func = &range_function<uint16_t>;
- break;
- case DataType::U32:
- _func = &range_function<uint32_t>;
- break;
- case DataType::S8:
- _func = &range_function<int8_t>;
- break;
- case DataType::S16:
- _func = &range_function<int16_t>;
- break;
- case DataType::S32:
- _func = &range_function<int32_t>;
- break;
- case DataType::F32:
- _func = &range_function<float>;
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- _func = &range_function<float16_t>;
- break;
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- default:
- ARM_COMPUTE_ERROR("Unsupported data type.");
- break;
- }
INEKernel::configure(win);
}
@@ -181,8 +177,8 @@ void NERangeKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
+ const auto *uk = get_implementation(RangeSelectorData{ _output->info()->data_type() });
- (*_func)(_output, _start, _step, window);
+ uk->ukernel(_output, _start, _step, window);
}
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NERangeKernel.h b/src/core/NEON/kernels/NERangeKernel.h
index 7c42ef11d..90560995e 100644
--- a/src/core/NEON/kernels/NERangeKernel.h
+++ b/src/core/NEON/kernels/NERangeKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -80,11 +80,10 @@ public:
private:
using RangeFunction = void(ITensor *output, float start, float step, const Window &window);
- RangeFunction *_func; /**< Range function to be called */
- float _start; /**< Start of sequence */
- float _end; /**< End of sequence */
- float _step; /**< Increment/step value */
- ITensor *_output; /**< Destination tensor */
+ float _start; /**< Start of sequence */
+ float _end; /**< End of sequence */
+ float _step; /**< Increment/step value */
+ ITensor *_output; /**< Destination tensor */
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NERANGEKERNEL_H */
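[Editor's sketch, not part of the patch] Behind whichever micro-kernel the table selects, NERangeKernel computes the arithmetic sequence out[i] = start + i * step; the validation shown earlier rejects start == end and a step whose sign cannot reach end from start. A scalar reference of the sequence, assuming a non-zero step of consistent sign (num_elements and make_range are hypothetical names):

#include <cmath>
#include <cstddef>
#include <vector>

// Length of the half-open sequence [start, end) with the given non-zero step.
std::size_t num_elements(float start, float end, float step)
{
    return static_cast<std::size_t>(std::ceil((end - start) / step));
}

std::vector<float> make_range(float start, float end, float step)
{
    std::vector<float> out(num_elements(start, end, step));
    for(std::size_t i = 0; i < out.size(); ++i)
    {
        out[i] = start + static_cast<float>(i) * step; // the expression the deleted NEON path vectorised with vmla
    }
    return out;
}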
diff --git a/src/core/NEON/kernels/NERemapKernel.cpp b/src/core/NEON/kernels/NERemapKernel.cpp
deleted file mode 100644
index a1ba29e4c..000000000
--- a/src/core/NEON/kernels/NERemapKernel.cpp
+++ /dev/null
@@ -1,326 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NERemapKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-using namespace arm_compute::scale_helpers;
-
-namespace arm_compute
-{
-class Coordinates;
-
-namespace
-{
-inline int32_t num_out_of_tensor(const float *mapx_ptr, const float *mapy_ptr, const int32x4_t &width_1, const int32x4_t &height_1)
-{
- const int32x4_t mapx_s32 = vcvtq_s32_f32(vld1q_f32(mapx_ptr));
- const int32x4_t mapy_s32 = vcvtq_s32_f32(vld1q_f32(mapy_ptr));
-
- const int32x4_t outbx_s32 = vminq_s32(vmaxq_s32(vminq_s32(vsubq_s32(width_1, mapx_s32), mapx_s32), vdupq_n_s32(-1)), vdupq_n_s32(0)); // Contains -1 if out of border in x, 0 otherwise
- const int32x4_t outby_s32 = vminq_s32(vmaxq_s32(vminq_s32(vsubq_s32(height_1, mapy_s32), mapy_s32), vdupq_n_s32(-1)), vdupq_n_s32(0)); // Contains -1 if out of border in y, 0 otherwise
-
- const int32x4_t out_of_tensor_v = vminq_s32(outbx_s32, outby_s32);
-#if defined(__aarch64__)
- // only AArch64 supports vaddv
- return vaddvq_s32(out_of_tensor_v);
-#else // __aarch64__
- return vgetq_lane_s32(out_of_tensor_v, 0) + vgetq_lane_s32(out_of_tensor_v, 1) + vgetq_lane_s32(out_of_tensor_v, 2) + vgetq_lane_s32(out_of_tensor_v, 3);
-#endif // __aarch64__
-}
-
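[Editor's sketch, not part of the patch] num_out_of_tensor, deleted above, encodes each of four (mapx, mapy) coordinates as -1 if it falls outside the inclusive bounds passed in (width-1 and height-1 on the nearest-neighbour path) and 0 otherwise, then sums the lanes; over two such vectors, -8 means every pixel is out of bounds, any other negative value is the mixed case handled by the scalar fallback, and 0 is the fast all-in-bounds path. The per-lane predicate in scalar form (out_of_tensor_flag is a hypothetical name):

#include <cstdint>

// -1 when (x, y) lies outside [0, max_x] x [0, max_y], 0 otherwise.
int32_t out_of_tensor_flag(int32_t x, int32_t y, int32_t max_x, int32_t max_y)
{
    const int32_t flag_x = (x >= 0 && x <= max_x) ? 0 : -1;
    const int32_t flag_y = (y >= 0 && y <= max_y) ? 0 : -1;
    return (flag_x < flag_y) ? flag_x : flag_y; // min: -1 if either coordinate is out
}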
-inline void serial_remap_nearest_interpolation(const uint8_t *in_ptr, const float *mapx_ptr, const float *mapy_ptr, uint8_t *out_ptr,
- int32_t width_val, int32_t height_val, int32_t in_stride_val, uint8_t constant_border_value)
-{
- const auto x_s32 = static_cast<int32_t>(*mapx_ptr);
- const auto y_s32 = static_cast<int32_t>(*mapy_ptr);
- if(x_s32 < 0 || y_s32 < 0 || x_s32 >= width_val || y_s32 >= height_val)
- {
- *(out_ptr) = constant_border_value;
- }
- else
- {
- *(out_ptr) = in_ptr[x_s32 + y_s32 * in_stride_val];
- }
-}
-
-inline int32x4_t offset_nearest_interpolation(const float *mapx_ptr, const float *mapy_ptr, const int32x4_t &stride)
-{
- const int32x4_t mapx_s32 = vcvtq_s32_f32(vld1q_f32(mapx_ptr));
- const int32x4_t mapy_s32 = vcvtq_s32_f32(vld1q_f32(mapy_ptr));
- return vmlaq_s32(mapx_s32, mapy_s32, stride);
-}
-
-inline uint8_t pixel_bilinear_c1_clamp(const uint8_t *pixel_ptr, int32_t stride, int32_t width, int32_t height, float x, float y, uint8_t constant_border_value)
-{
- x = std::max(-1.f, std::min(x, static_cast<float>(width)));
- y = std::max(-1.f, std::min(y, static_cast<float>(height)));
-
- const int32_t xi = static_cast<int32_t>(std::floor(x));
- const int32_t yi = static_cast<int32_t>(std::floor(y));
-
- const float dx = x - static_cast<float>(xi);
- const float dy = y - static_cast<float>(yi);
-
- // Calculating the address won't trigger a segfault in case the value is outside the tensor
- // The ternary operator resolves the values in both conditions
- const uint8_t *a00 = (xi < 0 || xi >= width || yi < 0 || yi >= height) ? &constant_border_value : (pixel_ptr + xi + yi * stride);
- const uint8_t *a01 = (xi + 1 >= width || yi < 0 || yi >= height) ? &constant_border_value : (pixel_ptr + xi + 1 + yi * stride);
- const uint8_t *a10 = (xi < 0 || xi >= width || yi + 1 >= height) ? &constant_border_value : (pixel_ptr + xi + yi * stride + stride);
- const uint8_t *a11 = (xi + 1 >= width || yi + 1 >= height) ? &constant_border_value : (pixel_ptr + xi + 1 + yi * stride + stride);
-
- const float dx1 = 1.0f - dx;
- const float dy1 = 1.0f - dy;
- const float w1 = dx1 * dy1;
- const float w2 = dx * dy1;
- const float w3 = dx1 * dy;
- const float w4 = dx * dy;
-
- return static_cast<uint8_t>((*a00) * w1 + (*a01) * w2 + (*a10) * w3 + (*a11) * w4);
-}
-} // namespace
-
-NERemapKernel::NERemapKernel()
- : _func(nullptr), _input(nullptr), _output(nullptr), _map_x(nullptr), _map_y(nullptr), _border_mode(BorderMode::UNDEFINED), _constant_border_value(0)
-{
-}
-
-void NERemapKernel::configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
-
- _input = input;
- _output = output;
- _map_x = map_x;
- _map_y = map_y;
- _border_mode = border_mode;
- _constant_border_value = constant_border_value;
-
- switch(policy)
- {
- case InterpolationPolicy::NEAREST_NEIGHBOR:
- {
- _func = &NERemapKernel::remap_nearest;
- break;
- }
- case InterpolationPolicy::BILINEAR:
- {
- _func = &NERemapKernel::remap_bilinear;
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported interpolation mode");
- break;
- }
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info(), Steps());
- INEKernel::configure(win);
-}
-
-void NERemapKernel::remap_nearest(const Window &window)
-{
- // Don't increment in X and Y direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- const auto window_start_x = static_cast<int32_t>(window.x().start());
- const auto window_end_x = static_cast<int32_t>(window.x().end());
- const int32_t window_step_x = 8;
-
- // Don't increment in X direction for the output, mapx, mapy tensors
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(_input, win_in);
- Iterator out(_output, win);
- Iterator mapx(_map_x, win);
- Iterator mapy(_map_y, win);
-
- const int32_t width_val = static_cast<int32_t>(_input->info()->dimension(0));
- const int32_t height_val = static_cast<int32_t>(_input->info()->dimension(1));
- const int32_t in_stride_val = static_cast<int32_t>(_input->info()->strides_in_bytes()[1]);
- const int32x4_t width_1 = vdupq_n_s32(width_val - 1);
- const int32x4_t height_1 = vdupq_n_s32(height_val - 1);
- const int32x4_t in_stride = vdupq_n_s32(in_stride_val);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto mapx_ptr = reinterpret_cast<const float *>(mapx.ptr());
- auto mapy_ptr = reinterpret_cast<const float *>(mapy.ptr());
- const uint8_t *in_ptr = in.ptr();
- uint8_t *out_ptr = out.ptr();
- int32_t x = window_start_x;
- for(; x < window_end_x - window_step_x; x += window_step_x, mapx_ptr += window_step_x, mapy_ptr += window_step_x, out_ptr += window_step_x)
- {
- const int32_t out_of_tensor0 = num_out_of_tensor(mapx_ptr, mapy_ptr + 0, width_1, height_1);
- const int32_t out_of_tensor1 = num_out_of_tensor(mapx_ptr + 4, mapy_ptr + 4, width_1, height_1);
- const int32_t out_of_tensor = out_of_tensor0 + out_of_tensor1;
-
- if(out_of_tensor == -8)
- {
- // All elements are out of xy plane
- uint8x8_t tmp = vdup_n_u8(_constant_border_value);
- vst1_u8(out_ptr, tmp);
- }
- else if(out_of_tensor < 0)
- {
- // Some elements are out of xy plane
- serial_remap_nearest_interpolation(in_ptr, mapx_ptr, mapy_ptr, out_ptr, width_val, height_val, in_stride_val, _constant_border_value);
- serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 1, mapy_ptr + 1, out_ptr + 1, width_val, height_val, in_stride_val, _constant_border_value);
- serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 2, mapy_ptr + 2, out_ptr + 2, width_val, height_val, in_stride_val, _constant_border_value);
- serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 3, mapy_ptr + 3, out_ptr + 3, width_val, height_val, in_stride_val, _constant_border_value);
- serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 4, mapy_ptr + 4, out_ptr + 4, width_val, height_val, in_stride_val, _constant_border_value);
- serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 5, mapy_ptr + 5, out_ptr + 5, width_val, height_val, in_stride_val, _constant_border_value);
- serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 6, mapy_ptr + 6, out_ptr + 6, width_val, height_val, in_stride_val, _constant_border_value);
- serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 7, mapy_ptr + 7, out_ptr + 7, width_val, height_val, in_stride_val, _constant_border_value);
- }
- else
- {
- // All elements are in xy plane
- uint8x8_t tmp = vdup_n_u8(0);
- const int32x4_t offset0 = offset_nearest_interpolation(mapx_ptr, mapy_ptr, in_stride);
- const int32x4_t offset1 = offset_nearest_interpolation(mapx_ptr + 4, mapy_ptr + 4, in_stride);
- tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 0)], tmp, 0);
- tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 1)], tmp, 1);
- tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 2)], tmp, 2);
- tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 3)], tmp, 3);
- tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 0)], tmp, 4);
- tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 1)], tmp, 5);
- tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 2)], tmp, 6);
- tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 3)], tmp, 7);
- vst1_u8(out_ptr, tmp);
- }
- }
- for(; x < window_end_x; ++x, ++mapx_ptr, ++mapy_ptr, ++out_ptr)
- {
- serial_remap_nearest_interpolation(in_ptr, mapx_ptr, mapy_ptr, out_ptr, width_val, height_val, in_stride_val, _constant_border_value);
- }
- },
- in, out, mapx, mapy);
-}
-
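[Editor's sketch, not part of the patch] remap_nearest above implements dst(x, y) = src(map_x(x, y), map_y(x, y)) with nearest-neighbour rounding; coordinates that land outside the source are replaced by the constant border value. A scalar reference for one output pixel, matching the deleted serial path for U8 single-channel data (remap_nearest_pixel is a hypothetical name):

#include <cstdint>

// Samples one output pixel: truncates the mapped coordinates to integers and either
// reads src[x + y * stride] or falls back to the constant border value.
uint8_t remap_nearest_pixel(const uint8_t *src, int32_t stride, int32_t width, int32_t height,
                            float map_x, float map_y, uint8_t constant_border_value)
{
    const auto x = static_cast<int32_t>(map_x);
    const auto y = static_cast<int32_t>(map_y);
    if(x < 0 || y < 0 || x >= width || y >= height)
    {
        return constant_border_value;
    }
    return src[x + y * stride];
}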
-void NERemapKernel::remap_bilinear(const Window &window)
-{
- // Don't increment in X and Y direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- const auto window_start_x = static_cast<int32_t>(window.x().start());
- const auto window_end_x = static_cast<int32_t>(window.x().end());
- const int32_t window_step_x = 8;
-
- // Don't increment in X direction for the output, mapx, mapy tensors
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(_input, win_in);
- Iterator out(_output, win);
- Iterator mapx(_map_x, win);
- Iterator mapy(_map_y, win);
-
- const int32_t width_val = static_cast<int32_t>(_input->info()->dimension(0));
- const int32_t height_val = static_cast<int32_t>(_input->info()->dimension(1));
- const int32x4_t width_2 = vdupq_n_s32(width_val - 2);
- const int32x4_t height_2 = vdupq_n_s32(height_val - 2);
- const int32_t in_stride_val = static_cast<int32_t>(_input->info()->strides_in_bytes()[1]);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto mapx_ptr = reinterpret_cast<const float *>(mapx.ptr());
- auto mapy_ptr = reinterpret_cast<const float *>(mapy.ptr());
- const uint8_t *in_ptr = in.ptr();
- uint8_t *out_ptr = out.ptr();
- int32_t x = window_start_x;
- for(; x < window_end_x - window_step_x; x += window_step_x, mapx_ptr += window_step_x, mapy_ptr += window_step_x, out_ptr += window_step_x)
- {
- const int32_t out_of_tensor0 = num_out_of_tensor(mapx_ptr, mapy_ptr + 0, width_2, height_2);
- const int32_t out_of_tensor1 = num_out_of_tensor(mapx_ptr + 4, mapy_ptr + 4, width_2, height_2);
- const int32_t out_of_tensor = out_of_tensor0 + out_of_tensor1;
-
- if(out_of_tensor < 0)
- {
- // Elements are out of xy plane
- *(out_ptr) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[0], mapy_ptr[0], _constant_border_value);
- *(out_ptr + 1) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[1], mapy_ptr[1], _constant_border_value);
- *(out_ptr + 2) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[2], mapy_ptr[2], _constant_border_value);
- *(out_ptr + 3) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[3], mapy_ptr[3], _constant_border_value);
- *(out_ptr + 4) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[4], mapy_ptr[4], _constant_border_value);
- *(out_ptr + 5) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[5], mapy_ptr[5], _constant_border_value);
- *(out_ptr + 6) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[6], mapy_ptr[6], _constant_border_value);
- *(out_ptr + 7) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[7], mapy_ptr[7], _constant_border_value);
- }
- else
- {
- // All elements are in xy plane
- uint8x8_t tmp = vdup_n_u8(0);
- tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[0], mapy_ptr[0], _constant_border_value), tmp, 0);
- tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[1], mapy_ptr[1], _constant_border_value), tmp, 1);
- tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[2], mapy_ptr[2], _constant_border_value), tmp, 2);
- tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[3], mapy_ptr[3], _constant_border_value), tmp, 3);
- tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[4], mapy_ptr[4], _constant_border_value), tmp, 4);
- tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[5], mapy_ptr[5], _constant_border_value), tmp, 5);
- tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[6], mapy_ptr[6], _constant_border_value), tmp, 6);
- tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[7], mapy_ptr[7], _constant_border_value), tmp, 7);
- vst1_u8(out_ptr, tmp);
- }
- }
- for(; x < window_end_x; ++x, ++mapx_ptr, ++mapy_ptr, ++out_ptr)
- {
- *(out_ptr) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[0], mapy_ptr[0], _constant_border_value);
- }
- },
- in, out, mapx, mapy);
-}
-
-void NERemapKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
- (this->*_func)(window);
-}
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/NEON/kernels/NERemapKernel.h b/src/core/NEON/kernels/NERemapKernel.h
deleted file mode 100644
index 33e929805..000000000
--- a/src/core/NEON/kernels/NERemapKernel.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEREMAPKERNEL_H
-#define ARM_COMPUTE_NEREMAPKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Kernel to perform a remap on a tensor */
-class NERemapKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NERemapKernel";
- }
- /** Default constructor */
- NERemapKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NERemapKernel(const NERemapKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NERemapKernel &operator=(const NERemapKernel &) = delete;
- /** Allow instances of this class to be moved */
- NERemapKernel(NERemapKernel &&) = default;
- /** Allow instances of this class to be moved */
- NERemapKernel &operator=(NERemapKernel &&) = default;
- /** Default destructor */
- ~NERemapKernel() = default;
-
- /** Initialize the kernel's input, output and border mode.
- *
- * @param[in] input Source tensor. Data type supported: U8.
- * @param[in] map_x Map for X coordinates. Data type supported: F32.
- * @param[in] map_y Map for Y coordinates. Data type supported: F32.
- * @param[out] output Destination tensor. Data types supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane.
- * @param[in] policy The interpolation type.
- * @param[in] border_mode Border mode to use on the input tensor.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. Defaults to 0.
- */
- void configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** function to perform nearest interpolation on the given window */
- void remap_nearest(const Window &window);
- /** function to perform bilinear interpolation on the given window */
- void remap_bilinear(const Window &window);
- /** Remap function to use for the particular interpolation type passed to configure() */
- void (NERemapKernel::*_func)(const Window &window);
-
- const ITensor *_input; /**< Input image */
- ITensor *_output; /**< Output image */
- const ITensor *_map_x; /**< Input remap x coordinates */
- const ITensor *_map_y; /**< Input remap y coordinates */
- BorderMode _border_mode; /**< Border mode */
- uint8_t _constant_border_value; /**< Border value to use */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEREMAPKERNEL_H */
\ No newline at end of file
diff --git a/src/core/NEON/kernels/NESelectKernel.cpp b/src/core/NEON/kernels/NESelectKernel.cpp
index 7c988e9fa..b8c9b244e 100644
--- a/src/core/NEON/kernels/NESelectKernel.cpp
+++ b/src/core/NEON/kernels/NESelectKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,6 +34,10 @@
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/common/Registrars.h"
+
+#include "src/cpu/kernels/select/list.h"
+
#include <arm_neon.h>
#include <map>
#include <string>
@@ -42,125 +46,123 @@ namespace arm_compute
{
namespace
{
-template <typename ScalarType, typename VectorType>
-void select_op(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- const int window_step_x, const int window_start_x, const int window_end_x, const int limit, VectorType (*condition_conversion)(const uint8_t *))
-{
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator condition(cond, win);
- Iterator input1(in1, win);
- Iterator input2(in2, win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
- const auto condition_ptr = reinterpret_cast<const uint8_t *>(condition.ptr());
- const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
- int x = window_start_x;
- for(; x <= limit; x += window_step_x)
- {
- const auto c = (*condition_conversion)(condition_ptr + x);
- const auto a = wrapper::vloadq(input1_ptr + x);
- const auto b = wrapper::vloadq(input2_ptr + x);
- wrapper::vstore(output_ptr + x, wrapper::vbsl(c, a, b));
- }
- for(; x < window_end_x; ++x)
- {
- const auto c = *(condition_ptr + x);
- const auto a = *(input1_ptr + x);
- const auto b = *(input2_ptr + x);
- *(output_ptr + x) = static_cast<bool>(c) ? a : b;
- }
- },
- condition, input1, input2, output);
-}
-
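[Editor's sketch, not part of the patch] The deleted select_op templates implement an element-wise select, out[i] = cond[i] ? x[i] : y[i], with the condition widened to the element width so NEON's vbsl can blend whole vectors. The scalar semantics the micro-kernels must reproduce (select is a hypothetical standalone function over flat buffers):

#include <cstddef>
#include <cstdint>

// Element-wise select over flat buffers of equal length: a non-zero condition byte
// picks the value from x, a zero byte picks the value from y.
template <typename T>
void select(const uint8_t *cond, const T *x, const T *y, T *out, std::size_t n)
{
    for(std::size_t i = 0; i < n; ++i)
    {
        out[i] = (cond[i] != 0) ? x[i] : y[i];
    }
}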
-template <typename ScalarType, typename VectorType>
-void select_op_8(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+struct SelectKernelSelectorData
{
- const auto window_step_x = 16 / sizeof(ScalarType);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
+ DataType dt;
+ bool is_same_rank;
+};
- select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType
- {
- static const auto zero = wrapper::vdup_n(static_cast<uint8_t>(0), arm_compute::wrapper::traits::vector_128_tag());
- return wrapper::vcgt(wrapper::vloadq(condition_ptr), zero);
- });
-}
+using SelectorPtr = std::add_pointer<bool(const SelectKernelSelectorData &data)>::type;
+using KernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, const ITensor *, ITensor *, const Window &)>::type;
-template <typename ScalarType, typename VectorType>
-void select_op_16(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+struct SelectKernelSelector
{
- const auto window_step_x = 16 / sizeof(ScalarType);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
+ const char *name;
+ const SelectorPtr is_selected;
+ KernelPtr ukernel;
+};
- select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType
- {
- static const auto zero = wrapper::vdup_n(static_cast<uint16_t>(0), arm_compute::wrapper::traits::vector_128_tag());
- return wrapper::vcgt(wrapper::vmovl(wrapper::vload(condition_ptr)), zero);
- });
-}
-
-template <typename ScalarType, typename VectorType>
-void select_op_32(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+static const SelectKernelSelector available_kernels[] =
{
- const auto window_step_x = 16 / sizeof(ScalarType);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType
{
- static const auto zero = wrapper::vdup_n(static_cast<uint32_t>(0), arm_compute::wrapper::traits::vector_128_tag());
- return wrapper::vcgt(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vload(condition_ptr)))), zero);
- });
-}
+ "neon_s8_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::S8 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_same_rank)
+ },
+ {
+ "neon_s16_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::S16 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_same_rank)
+ },
+ {
+ "neon_s32_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::S32 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_same_rank)
+ },
+ {
+ "neon_u8_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::U8 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_same_rank)
+ },
+ {
+ "neon_u16_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::U16 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_same_rank)
+ },
+ {
+ "neon_u32_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::U32 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_same_rank)
+ },
+ {
+ "neon_s8_not_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::S8 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_not_same_rank)
+ },
+ {
+ "neon_s16_not_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::S16 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_not_same_rank)
+ },
+ {
+ "neon_s32_not_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::S32 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_not_same_rank)
+ },
+ {
+ "neon_u8_not_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::U8 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_not_same_rank)
+ },
+ {
+ "neon_u16_not_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::U16 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_not_same_rank)
+ },
+ {
+ "neon_u32_not_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::U32 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_not_same_rank)
+ },
+ {
+ "neon_f16_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::F16 && data.is_same_rank == true; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_same_rank)
+ },
+ {
+ "neon_f16_not_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::F16 && data.is_same_rank == false; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_not_same_rank)
+ },
+ {
+ "neon_f32_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::F32 && data.is_same_rank == true; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_same_rank)
+ },
+ {
+ "neon_f32_not_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::F32 && data.is_same_rank == false; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_not_same_rank)
+ },
+};
-template <typename ScalarType>
-void select_op_not_same_rank(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+const SelectKernelSelector *get_implementation(const SelectKernelSelectorData &data)
{
- ARM_COMPUTE_UNUSED(window);
-
- auto output_ptr = reinterpret_cast<ScalarType *>(out->buffer());
- const auto condition_ptr = reinterpret_cast<const uint8_t *>(cond->buffer());
- const auto input1_ptr = reinterpret_cast<const ScalarType *>(in1->buffer());
- const auto input2_ptr = reinterpret_cast<const ScalarType *>(in2->buffer());
-
- const int outer_size = cond->info()->total_size() / cond->info()->element_size();
- const int inner_size = (in1->info()->total_size() / in1->info()->element_size()) / outer_size;
- int offset = 0;
- const int step = 16 / in1->info()->element_size();
-
- for(int i = 0; i < outer_size; ++i)
+ for(const auto &uk : available_kernels)
{
- int x = offset;
- const auto input_ptr = static_cast<bool>(*(condition_ptr + i)) ? input1_ptr : input2_ptr;
- for(; x <= offset + inner_size - step; x += step)
- {
- wrapper::vstore(output_ptr + x, wrapper::vloadq(input_ptr + x));
- }
- if(x <= offset + inner_size - (step / 2))
- {
- wrapper::vstore(output_ptr + x, wrapper::vload(input_ptr + x));
- x += step / 2;
- }
- for(; x < offset + inner_size; ++x)
+ if(uk.is_selected(data))
{
- *(output_ptr + x) = *(input_ptr + x);
+ return &uk;
}
- offset += inner_size;
}
+ return nullptr;
}
+
} // namespace
NESelectKernel::NESelectKernel()
- : _function(nullptr), _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false)
+    : _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false)
{
}
@@ -178,51 +180,6 @@ void NESelectKernel::configure(const ITensor *c, const ITensor *x, const ITensor
_output = output;
_has_same_rank = (c->info()->tensor_shape().num_dimensions() == x->info()->tensor_shape().num_dimensions());
- std::string function_to_call("op_");
- function_to_call += string_from_data_type(x->info()->data_type());
-
- static std::map<std::string, SelectFunction *> map_function;
-
- if(_has_same_rank)
- {
- map_function =
- {
- { "op_S8", &select_op_8<int8_t, uint8x16_t> },
- { "op_S16", &select_op_16<int16_t, uint16x8_t> },
- { "op_S32", &select_op_32<int32_t, uint32x4_t> },
- { "op_U8", &select_op_8<uint8_t, uint8x16_t> },
- { "op_U16", &select_op_16<uint16_t, uint16x8_t> },
- { "op_U32", &select_op_32<uint32_t, uint32x4_t> },
- { "op_F32", &select_op_32<float, uint32x4_t> }
- };
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- map_function["op_F16"] = &select_op_16<float16_t, uint16x8_t>;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- }
- else
- {
- map_function =
- {
- { "op_S8", &select_op_not_same_rank<int8_t> },
- { "op_S16", &select_op_not_same_rank<int16_t> },
- { "op_S32", &select_op_not_same_rank<int32_t> },
- { "op_U8", &select_op_not_same_rank<uint8_t> },
- { "op_U16", &select_op_not_same_rank<uint16_t> },
- { "op_U32", &select_op_not_same_rank<uint32_t> },
- { "op_F32", &select_op_not_same_rank<float> }
- };
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- map_function["op_F16"] = &select_op_not_same_rank<float16_t>;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- }
-
- auto it = map_function.find(function_to_call);
-
- if(it != map_function.end())
- {
- _function = it->second;
- }
-
Window win = calculate_max_window(*x->info());
INEKernel::configure(win);
}
@@ -254,7 +211,12 @@ void NESelectKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_function == nullptr);
- _function(_c, _x, _y, _output, window);
+ ARM_COMPUTE_ERROR_ON(_output == nullptr);
+ ARM_COMPUTE_ERROR_ON(_output->info() == nullptr);
+
+ const auto *uk = get_implementation(SelectKernelSelectorData{ _output->info()->data_type(), _has_same_rank });
+ ARM_COMPUTE_ERROR_ON(uk == nullptr);
+ ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
+ uk->ukernel(_c, _x, _y, _output, window);
}
} // namespace arm_compute
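
The NESelectKernel rework above replaces the string-keyed std::map of template instantiations built in configure() with a static table of { name, predicate, micro-kernel } entries that run() resolves through get_implementation() from the output data type and the rank flag, which is why run() now asserts that _output and its info are non-null. The following is a minimal stand-alone sketch of that dispatch pattern; the SelectorData fields, kernel names and kernel bodies are illustrative placeholders, not arm_compute types.

    // Stand-alone sketch of the { name, predicate, ukernel } selector pattern.
    // Every name below is a placeholder, not part of the arm_compute API.
    #include <iostream>

    enum class DataType { U8, S8, F32 };

    struct SelectorData
    {
        DataType dt;
        bool     is_same_rank;
    };

    using KernelFn = void (*)();

    struct SelectorEntry
    {
        const char *name;
        bool (*is_selected)(const SelectorData &);
        KernelFn    ukernel;
    };

    void u8_same_rank_kernel() { std::cout << "u8, same rank\n"; }
    void f32_any_rank_kernel() { std::cout << "f32, any rank\n"; }

    // Scanned top to bottom; the first entry whose predicate matches wins.
    static const SelectorEntry available_kernels[] =
    {
        { "u8_same_rank", [](const SelectorData &d) { return d.dt == DataType::U8 && d.is_same_rank; }, &u8_same_rank_kernel },
        { "f32_any_rank", [](const SelectorData &d) { return d.dt == DataType::F32; }, &f32_any_rank_kernel },
    };

    const SelectorEntry *get_implementation(const SelectorData &data)
    {
        for(const auto &uk : available_kernels)
        {
            if(uk.is_selected(data))
            {
                return &uk;
            }
        }
        return nullptr; // the caller treats this as a configuration error
    }

    int main()
    {
        const auto *uk = get_implementation({ DataType::F32, false });
        if(uk != nullptr)
        {
            uk->ukernel(); // prints "f32, any rank"
        }
        return 0;
    }
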
diff --git a/src/core/NEON/kernels/NESelectKernel.h b/src/core/NEON/kernels/NESelectKernel.h
index f7142feff..e82105a68 100644
--- a/src/core/NEON/kernels/NESelectKernel.h
+++ b/src/core/NEON/kernels/NESelectKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -82,22 +82,12 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
private:
- /** Common signature for all the specialised select functions
- *
- * @param[in] c Condition input tensor. Data types supported: U8.
- * @param[in] x First input tensor. Data types supported: All.
- * @param[in] y Second input tensor. Data types supported: Same as @p x
- * @param[in] output Output tensor. Data types supported: Same as @p x.
- */
- using SelectFunction = void(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window);
- /** Select function to use for the particular tensor types passed to configure() */
- SelectFunction *_function;
- const ITensor *_c; /**< Condition tensor */
- const ITensor *_x; /**< Source tensor 1 */
- const ITensor *_y; /**< Source tensor 2 */
- ITensor *_output; /**< Destination tensor */
- bool _has_same_rank; /**< Flag that indicates if condition tensor and other inputs have the same rank */
+ const ITensor *_c; /**< Condition tensor */
+ const ITensor *_x; /**< Source tensor 1 */
+ const ITensor *_y; /**< Source tensor 2 */
+ ITensor *_output; /**< Destination tensor */
+ bool _has_same_rank; /**< Flag that indicates if condition tensor and other inputs have the same rank */
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NESELECTKERNEL_H */
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
index 1d52b56d3..ea41529d8 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
@@ -136,7 +136,14 @@ UniqueDepthwiseCommon<TInput, TWeight, TOutput> depthwise(const DepthwiseArgs &a
{
const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> *impl = nullptr;
const bool success = find_implementation<TInput, TWeight, TOutput, OutputStage>(args, os, impl);
- return UniqueDepthwiseCommon<TInput, TWeight, TOutput>(success ? impl->get_instance(args, os) : nullptr);
+
+ if(success)
+ {
+ auto i = impl->get_instance(args, os);
+ i->set_name(impl->name);
+ return UniqueDepthwiseCommon<TInput, TWeight, TOutput>(i);
+ }
+ return nullptr;
}
} // namespace depthwise
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
index 20c823014..79fc65e56 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
@@ -23,7 +23,9 @@
*/
#pragma once
+#if !defined(__OpenBSD__)
#include <alloca.h>
+#endif /* !defined(__OpenBSD__) */
#include <algorithm>
#include <cassert>
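
The !defined(__OpenBSD__) guards above (and the matching ones in interleave_indirect.cpp and transform.cpp later in this patch) account for OpenBSD shipping no <alloca.h>; alloca() is normally reachable through <stdlib.h> or the compiler built-in there. A hedged, stand-alone sketch of the portable include pattern follows; the stdlib.h fallback is an assumption about that platform, not something this patch adds.

    // Portable alloca include; the OpenBSD branch is an assumption about that
    // platform's headers, the non-OpenBSD branch mirrors the patch above.
    #if !defined(__OpenBSD__)
    #include <alloca.h>
    #else
    #include <stdlib.h> /* alloca() is expected to be declared here on OpenBSD */
    #endif

    #include <cstdio>
    #include <cstring>

    int main()
    {
        // alloca() allocates on the stack; the memory vanishes when this returns.
        char *buf = static_cast<char *>(alloca(32));
        std::strcpy(buf, "stack buffer");
        std::printf("%s\n", buf);
        return 0;
    }
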
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
index d3857a50e..809946f10 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
@@ -144,7 +144,7 @@ struct GemmImplementation<Top, Tret, Nothing> {
/* "Master" function implemented for each valid combination of types.
* Returns a list of GEMM implementation descriptors for processing by the
- * other functions, terminated by an implementation with
+ * other functions, ended by an implementation with
* method==GemmMethod::DEFAULT. */
template<typename Top, typename Tret, class OutputStage = Nothing>
const GemmImplementation<Top, Tret, OutputStage> *gemm_implementation_list();
diff --git a/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp b/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp
index d5003e4a1..91988e8c3 100644
--- a/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp
+++ b/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp
@@ -28,7 +28,9 @@
#include "interleave_indirect.hpp"
#include "bfloat.hpp"
+#if !defined(__OpenBSD__)
#include <alloca.h>
+#endif /* !defined(__OpenBSD__) */
#include <algorithm>
#include <cstddef>
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
index 758f2b1f8..9af1b4df1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
@@ -72,7 +72,7 @@ public:
return { 15.361, 0.9341, 0.1636 };
case CPUModel::V1:
- return { 62.40, 4.71, 0.67 };
+ return { 51.14, 7.38, 0.65 };
default:
return { 29.0698, 3.9793, 0.4003 };
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
index 21c9f5966..6d333f344 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
@@ -80,7 +80,7 @@ public:
return { 15.361, 0.9341, 0.1636 };
case CPUModel::V1:
- return { 62.40, 4.71, 0.67 };
+ return { 51.14, 7.38, 0.65 };
default:
return { 29.0698, 3.9793, 0.4003 };
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp
index 090dd5855..e6e795097 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp
@@ -82,7 +82,7 @@ public:
case CPUModel::A510:
return { 6.81 };
case CPUModel::V1:
- return { 28.40 };
+ return { 22.33 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp
index f5e9009f6..39ffcbef1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp
@@ -82,7 +82,7 @@ public:
case CPUModel::A510:
return { 6.70 };
case CPUModel::V1:
- return { 26.64 };
+ return { 21.28 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
index 94f578368..905a60265 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
@@ -84,7 +84,7 @@ public:
case CPUModel::A510:
return { 14.81 };
case CPUModel::V1:
- return { 48.34 };
+ return { 44.54 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp
index bc933afd9..69ea87bc9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp
@@ -81,7 +81,7 @@ public:
case CPUModel::A510:
return { 27.99 };
case CPUModel::V1:
- return { 68.76 };
+ return { 62.26 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
index c5105a6d4..ce96c1b28 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
@@ -75,29 +75,29 @@ public:
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
- if (std::is_same<T, uint8_t>::value) {
+ if (std::is_same<T, uint32_t>::value) {
switch (ci->get_cpu_model()) {
- case CPUModel::A55r1:
- return { 9.5238, 2.0799, 0.2279 };
default:
- return { 29.6736, 11.4025, 0.5591 };
+ return { 31.63 };
case CPUModel::A510:
- return { 16.65, 3.92, 0.48 };
+ return { 15.89 };
case CPUModel::V1:
- return { 55.42, 19.29, 0.92 };
+ return { 53.87 };
+ case CPUModel::A55r1:
+ return { 9.217 };
}
}
- if (std::is_same<T, uint32_t>::value) {
+ if (std::is_same<T, uint8_t>::value) {
switch (ci->get_cpu_model()) {
- default:
- return { 31.63 };
case CPUModel::A55r1:
- return { 9.217 };
+ return { 9.5238, 2.0799, 0.2279 };
+ default:
+ return { 29.6736, 11.4025, 0.5591 };
case CPUModel::A510:
- return { 15.89 };
+ return { 16.65, 3.92, 0.48 };
case CPUModel::V1:
- return { 53.87 };
+ return { 42.62, 16.32, 0.83 };
}
}
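
The a64_hybrid_u8u32_dot_6x16 hunk above reorders the std::is_same<T, ...> branches so the uint32_t table is checked first, and retunes the Neoverse V1 entry of the uint8_t table. Reduced to a stand-alone sketch, with a single-field parameter struct and per-model numbers copied from the uint32_t branch of the diff, the dispatch looks like this:

    // Compile-time type dispatch for per-CPU-model tuning tables (sketch only;
    // the real PerformanceParameters carries more than one figure).
    #include <cstdint>
    #include <iostream>
    #include <type_traits>

    enum class CpuModel { GENERIC, A55r1, A510, V1 };

    struct PerformanceParameters
    {
        float macs_per_cycle;
    };

    template <typename T>
    PerformanceParameters get_performance_parameters(CpuModel model)
    {
        if (std::is_same<T, std::uint32_t>::value)
        {
            switch (model)
            {
                case CpuModel::V1:    return { 53.87f };
                case CpuModel::A510:  return { 15.89f };
                case CpuModel::A55r1: return { 9.217f };
                default:              return { 31.63f };
            }
        }
        // Any other type (uint8_t in the real kernel) falls back to this table.
        switch (model)
        {
            case CpuModel::V1: return { 42.62f };
            default:           return { 29.67f };
        }
    }

    int main()
    {
        std::cout << get_performance_parameters<std::uint32_t>(CpuModel::V1).macs_per_cycle << "\n";
        return 0;
    }
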
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp
index 24bad3c63..b5cedc7e9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp
@@ -92,7 +92,7 @@ public:
case CPUModel::A510:
return { 33.64, 3.92, 0.48 };
case CPUModel::V1:
- return { 86.71, 19.00, 0.93 };
+ return { 63.94, 16.18, 0.83 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
index 9b3517a80..6ec6bd2ed 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
@@ -36,6 +36,7 @@ namespace arm_gemm
{
// Actual kernel implementations
void a64_interleaved_bf16fp32_mmla_8x12( ARGLIST );
+void a64_interleaved_bf16fp32_mmla_8x12_a510( ARGLIST );
class cls_a64_interleaved_bf16fp32_mmla_8x12
{
@@ -78,7 +79,7 @@ public:
default:
return { 31.54, 4.30, 7.33 };
case CPUModel::V1:
- return { 59.94, 5.08, 9.83 };
+ return { 41.44, 5.01, 5.64 };
case CPUModel::A510:
return { 7.82, 4.05, 3.07 };
}
@@ -101,8 +102,15 @@ public:
// Default to the generic kernel
kern_type kernel=a64_interleaved_bf16fp32_mmla_8x12;
- cls_a64_interleaved_bf16fp32_mmla_8x12(const CPUInfo *)
+ cls_a64_interleaved_bf16fp32_mmla_8x12(const CPUInfo *ci)
{
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A510:
+ kernel=a64_interleaved_bf16fp32_mmla_8x12_a510;
+ break;
+ }
}
};
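
The constructor change above is how the new Cortex-A510 micro-kernel (added in the a510.cpp file that follows) is actually selected: the class keeps the generic kernel as the default function pointer and swaps it for the tuned variant when the detected CPU model matches. A minimal stand-alone sketch of that idiom, with placeholder model and kernel names:

    // Constructor-time kernel selection by CPU model (illustrative names only).
    #include <iostream>

    enum class CpuModel { GENERIC, A510, V1 };

    void kernel_generic() { std::cout << "generic MMLA kernel\n"; }
    void kernel_a510()    { std::cout << "A510-tuned MMLA kernel\n"; }

    struct cls_example_mmla
    {
        using kern_type = void (*)();

        // Default to the generic kernel; models with a tuned variant override it.
        kern_type kernel = kernel_generic;

        explicit cls_example_mmla(CpuModel model)
        {
            switch(model)
            {
                case CpuModel::A510:
                    kernel = kernel_a510;
                    break;
                default:
                    break;
            }
        }
    };

    int main()
    {
        cls_example_mmla k(CpuModel::A510);
        k.kernel(); // prints "A510-tuned MMLA kernel"
        return 0;
    }
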
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp
new file mode 100644
index 000000000..0235e91bf
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+#include "../../bfloat.hpp"
+
+namespace arm_gemm {
+
+void a64_interleaved_bf16fp32_mmla_8x12_a510(
+ const bfloat16 *Apanel, const bfloat16 *Bpanel,
+ float *Cpanel, int ablocks, int bblocks, int K) {
+
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const bfloat16 *Bpanel = {};
+ } ka;
+
+ ka.bblocks = bblocks;
+ ka.K = (K/4) - 1;
+ ka.Bpanel = Bpanel;
+
+ __asm__ __volatile__(
+
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "mov %x[Apanel], x21\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "ld1 { v0.8h }, [%x[Apanel]], #0x10\n"
+ "ld1 { v4.8h }, [x20], #0x10\n"
+ "ld1 { v1.8h }, [%x[Apanel]], #0x10\n"
+ "cmp x19, #0x2\n"
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "ld1 { v5.8h }, [x20], #0x10\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "ld1 { v2.8h }, [%x[Apanel]], #0x10\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "ld1 { v3.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x6e45ec0b // bfmmla v11.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec31 // bfmmla v17.4s, v1.8h, v5.8h\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+ "sub x19, x19, #0x2\n"
+ ".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
+ "cmp x19, #0x2\n"
+ ".inst 0x6e45ec7d // bfmmla v29.4s, v3.8h, v5.8h\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
+ "ld1 { v0.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec7b // bfmmla v27.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e47ec32 // bfmmla v18.4s, v1.8h, v7.8h\n"
+ "ld1 { v6.8h }, [x20], #0x10\n"
+ ".inst 0x6e44ec30 // bfmmla v16.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e45ec33 // bfmmla v19.4s, v1.8h, v5.8h\n"
+ "ld1 { v1.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e47ec58 // bfmmla v24.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec7e // bfmmla v30.4s, v3.8h, v7.8h\n"
+ "ld1 { v7.8h }, [x20], #0x10\n"
+ ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n"
+ "ld1 { v2.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e44ec7c // bfmmla v28.4s, v3.8h, v4.8h\n"
+ ".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n"
+ "ld1 { v3.8h }, [%x[Apanel]], #0x10\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec31 // bfmmla v17.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e46ec7a // bfmmla v26.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e47ec7d // bfmmla v29.4s, v3.8h, v7.8h\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x6e44ec09 // bfmmla v9.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ "ld1 { v0.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e44ec2f // bfmmla v15.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e44ec55 // bfmmla v21.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e44ec7b // bfmmla v27.4s, v3.8h, v4.8h\n"
+ ".inst 0x6e45ec32 // bfmmla v18.4s, v1.8h, v5.8h\n"
+ "ld1 { v4.8h }, [x20], #0x10\n"
+ ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e47ec33 // bfmmla v19.4s, v1.8h, v7.8h\n"
+ "ld1 { v1.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e45ec7e // bfmmla v30.4s, v3.8h, v5.8h\n"
+ "ld1 { v5.8h }, [x20], #0x10\n"
+ ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n"
+ "ld1 { v2.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "ld1 { v3.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x6e45ec0b // bfmmla v11.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec31 // bfmmla v17.4s, v1.8h, v5.8h\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
+ ".inst 0x6e45ec7d // bfmmla v29.4s, v3.8h, v5.8h\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec7b // bfmmla v27.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e47ec32 // bfmmla v18.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e44ec30 // bfmmla v16.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e45ec33 // bfmmla v19.4s, v1.8h, v5.8h\n"
+ ".inst 0x6e47ec58 // bfmmla v24.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec7e // bfmmla v30.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec7c // bfmmla v28.4s, v3.8h, v4.8h\n"
+ ".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n"
+ "cbz x19, 5f\n"
+ "ld1 { v0.8h }, [%x[Apanel]], #0x10\n"
+ "ld1 { v6.8h }, [x20], #0x10\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "ld1 { v1.8h }, [%x[Apanel]], #0x10\n"
+ "ld1 { v7.8h }, [x20], #0x10\n"
+ ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
+ "ld1 { v2.8h }, [%x[Apanel]], #0x10\n"
+ "ld1 { v3.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x6e47ec31 // bfmmla v17.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e46ec7a // bfmmla v26.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e47ec7d // bfmmla v29.4s, v3.8h, v7.8h\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x6e44ec09 // bfmmla v9.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec2f // bfmmla v15.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e44ec55 // bfmmla v21.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e44ec7b // bfmmla v27.4s, v3.8h, v4.8h\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e45ec32 // bfmmla v18.4s, v1.8h, v5.8h\n"
+ ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e47ec33 // bfmmla v19.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e45ec7e // bfmmla v30.4s, v3.8h, v5.8h\n"
+ ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n"
+ "5:" // multiply loop done
+ "subs x22, x22, #0x1\n"
+ "uzp1 v4.2d, v8.2d, v11.2d\n"
+ "uzp2 v8.2d, v8.2d, v11.2d\n"
+ "uzp1 v11.2d, v9.2d, v12.2d\n"
+ "uzp2 v9.2d, v9.2d, v12.2d\n"
+ "str q4, [%x[Cpanel], #0x0]\n"
+ "uzp1 v12.2d, v10.2d, v13.2d\n"
+ "uzp2 v10.2d, v10.2d, v13.2d\n"
+ "str q11, [%x[Cpanel], #0x10]\n"
+ "str q12, [%x[Cpanel], #0x20]\n"
+ "uzp1 v13.2d, v14.2d, v17.2d\n"
+ "uzp2 v14.2d, v14.2d, v17.2d\n"
+ "str q8, [%x[Cpanel], #0x30]\n"
+ "uzp1 v17.2d, v15.2d, v18.2d\n"
+ "uzp2 v15.2d, v15.2d, v18.2d\n"
+ "str q9, [%x[Cpanel], #0x40]\n"
+ "uzp1 v18.2d, v16.2d, v19.2d\n"
+ "uzp2 v16.2d, v16.2d, v19.2d\n"
+ "str q10, [%x[Cpanel], #0x50]\n"
+ "uzp1 v19.2d, v20.2d, v23.2d\n"
+ "uzp2 v20.2d, v20.2d, v23.2d\n"
+ "str q13, [%x[Cpanel], #0x60]\n"
+ "uzp1 v23.2d, v21.2d, v24.2d\n"
+ "uzp2 v21.2d, v21.2d, v24.2d\n"
+ "str q17, [%x[Cpanel], #0x70]\n"
+ "uzp1 v24.2d, v22.2d, v25.2d\n"
+ "uzp2 v22.2d, v22.2d, v25.2d\n"
+ "str q18, [%x[Cpanel], #0x80]\n"
+ "uzp1 v25.2d, v26.2d, v29.2d\n"
+ "uzp2 v26.2d, v26.2d, v29.2d\n"
+ "str q14, [%x[Cpanel], #0x90]\n"
+ "uzp1 v29.2d, v27.2d, v30.2d\n"
+ "uzp2 v27.2d, v27.2d, v30.2d\n"
+ "str q15, [%x[Cpanel], #0xa0]\n"
+ "uzp1 v30.2d, v28.2d, v31.2d\n"
+ "uzp2 v28.2d, v28.2d, v31.2d\n"
+ "str q16, [%x[Cpanel], #0xb0]\n"
+ "str q19, [%x[Cpanel], #0xc0]\n"
+ "str q23, [%x[Cpanel], #0xd0]\n"
+ "str q24, [%x[Cpanel], #0xe0]\n"
+ "str q20, [%x[Cpanel], #0xf0]\n"
+ "str q21, [%x[Cpanel], #0x100]\n"
+ "str q22, [%x[Cpanel], #0x110]\n"
+ "str q25, [%x[Cpanel], #0x120]\n"
+ "str q29, [%x[Cpanel], #0x130]\n"
+ "str q30, [%x[Cpanel], #0x140]\n"
+ "str q26, [%x[Cpanel], #0x150]\n"
+ "str q27, [%x[Cpanel], #0x160]\n"
+ "str q28, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
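
The new A510 kernel above (like the s8s32 and u8u32 variants that follow) uses the usual arm_gemm convention of packing panel pointers and loop counts into a local KernelArgs struct, so the assembly needs only one pointer operand and reads the fields through offsetof immediates. The fragment below is a heavily reduced, illustrative sketch of that calling pattern, not code from the library, and it keeps a plain C++ fallback so it still builds off aarch64.

    // Args-struct + offsetof inline-asm pattern (sketch; the aarch64 path only
    // exercises the mechanism, the fallback keeps it portable).
    #include <cstddef>
    #include <cstdio>

    struct KernelArgs
    {
        std::size_t count = {};
        const int  *data  = {};
    };

    long first_element(const KernelArgs *args_ptr)
    {
        long result = 0;
    #if defined(__aarch64__)
        __asm__ __volatile__(
            "ldr x1, [%x[args], %[off_data]]\n" // load KernelArgs::data
            "ldrsw %x[res], [x1]\n"             // result = data[0]
            : [res] "=r" (result)
            : [args] "r" (args_ptr), [off_data] "I" (offsetof(KernelArgs, data))
            : "x1", "memory");
    #else
        result = args_ptr->data[0];
    #endif
        return result;
    }

    int main()
    {
        const int values[] = { 7, 8, 9 };
        KernelArgs ka;
        ka.count = 3;
        ka.data  = values;
        std::printf("%ld\n", first_element(&ka)); // prints 7
        return 0;
    }
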
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
index ff69bc8f5..4cc3ed040 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
@@ -35,6 +35,7 @@ namespace arm_gemm
{
// Actual kernel implementations
void a64_interleaved_s8s32_mmla_8x12( ARGLIST );
+void a64_interleaved_s8s32_mmla_8x12_a510( ARGLIST );
class cls_a64_interleaved_s8s32_mmla_8x12
{
@@ -91,7 +92,7 @@ public:
case CPUModel::A510:
return { 48.22, 2.49, 0.29 };
case CPUModel::V1:
- return { 116.76, 4.67, 0.60 };
+ return { 75.54, 8.06, 0.63 };
}
}
@@ -100,9 +101,17 @@ public:
// Default to the generic kernel
kern_type kernel=a64_interleaved_s8s32_mmla_8x12;
- cls_a64_interleaved_s8s32_mmla_8x12(const CPUInfo *)
+ cls_a64_interleaved_s8s32_mmla_8x12(const CPUInfo *ci)
{
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A510:
+ kernel=a64_interleaved_s8s32_mmla_8x12_a510;
+ break;
+ }
}
+
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp
new file mode 100644
index 000000000..a4d8c0ace
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_interleaved_s8s32_mmla_8x12_a510(
+ const int8_t *Apanel, const int8_t *Bpanel,
+ int32_t *Cpanel, int ablocks, int bblocks, int K) {
+
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const int8_t *Bpanel = {};
+ } ka;
+
+ ka.bblocks = bblocks;
+ ka.K = (K/8) - 1;
+ ka.Bpanel = Bpanel;
+
+ __asm__ __volatile__(
+
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "mov %x[Apanel], x21\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v4.16b }, [x20], #0x10\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ "cmp x19, #0x2\n"
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "ld1 { v5.16b }, [x20], #0x10\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x4e85a40b // smmla v11.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e85a431 // smmla v17.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n"
+ "sub x19, x19, #0x2\n"
+ ".inst 0x4e85a457 // smmla v23.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n"
+ "cmp x19, #0x2\n"
+ ".inst 0x4e85a47d // smmla v29.4s, v3.16b, v5.16b\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e84a40a // smmla v10.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e85a40d // smmla v13.4s, v0.16b, v5.16b\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a47b // smmla v27.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e87a432 // smmla v18.4s, v1.16b, v7.16b\n"
+ "ld1 { v6.16b }, [x20], #0x10\n"
+ ".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e87a458 // smmla v24.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a47e // smmla v30.4s, v3.16b, v7.16b\n"
+ "ld1 { v7.16b }, [x20], #0x10\n"
+ ".inst 0x4e84a456 // smmla v22.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e84a47c // smmla v28.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e85a47f // smmla v31.4s, v3.16b, v5.16b\n"
+ "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a431 // smmla v17.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e86a47a // smmla v26.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e87a47d // smmla v29.4s, v3.16b, v7.16b\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x4e84a409 // smmla v9.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e85a40c // smmla v12.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e84a42f // smmla v15.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e84a455 // smmla v21.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e84a47b // smmla v27.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e85a432 // smmla v18.4s, v1.16b, v5.16b\n"
+ "ld1 { v4.16b }, [x20], #0x10\n"
+ ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e87a433 // smmla v19.4s, v1.16b, v7.16b\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e85a47e // smmla v30.4s, v3.16b, v5.16b\n"
+ "ld1 { v5.16b }, [x20], #0x10\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e86a47c // smmla v28.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e87a47f // smmla v31.4s, v3.16b, v7.16b\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x4e85a40b // smmla v11.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e85a431 // smmla v17.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e85a457 // smmla v23.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e85a47d // smmla v29.4s, v3.16b, v5.16b\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a47b // smmla v27.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e84a40a // smmla v10.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e85a40d // smmla v13.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e87a432 // smmla v18.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e87a458 // smmla v24.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a47e // smmla v30.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e84a456 // smmla v22.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e84a47c // smmla v28.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e85a47f // smmla v31.4s, v3.16b, v5.16b\n"
+ "cbz x19, 5f\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v6.16b }, [x20], #0x10\n"
+ ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v7.16b }, [x20], #0x10\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x4e87a431 // smmla v17.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e86a47a // smmla v26.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e87a47d // smmla v29.4s, v3.16b, v7.16b\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x4e84a409 // smmla v9.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e84a42f // smmla v15.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e84a455 // smmla v21.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e84a47b // smmla v27.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e85a40c // smmla v12.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e85a432 // smmla v18.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e87a433 // smmla v19.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e85a47e // smmla v30.4s, v3.16b, v5.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e86a47c // smmla v28.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e87a47f // smmla v31.4s, v3.16b, v7.16b\n"
+ "5:" // multiply loop done
+ "subs x22, x22, #0x1\n"
+ "uzp1 v4.2d, v8.2d, v11.2d\n"
+ "uzp2 v8.2d, v8.2d, v11.2d\n"
+ "uzp1 v11.2d, v9.2d, v12.2d\n"
+ "uzp2 v9.2d, v9.2d, v12.2d\n"
+ "str q4, [%x[Cpanel], #0x0]\n"
+ "uzp1 v12.2d, v10.2d, v13.2d\n"
+ "uzp2 v10.2d, v10.2d, v13.2d\n"
+ "str q11, [%x[Cpanel], #0x10]\n"
+ "str q12, [%x[Cpanel], #0x20]\n"
+ "uzp1 v13.2d, v14.2d, v17.2d\n"
+ "uzp2 v14.2d, v14.2d, v17.2d\n"
+ "str q8, [%x[Cpanel], #0x30]\n"
+ "uzp1 v17.2d, v15.2d, v18.2d\n"
+ "uzp2 v15.2d, v15.2d, v18.2d\n"
+ "str q9, [%x[Cpanel], #0x40]\n"
+ "uzp1 v18.2d, v16.2d, v19.2d\n"
+ "uzp2 v16.2d, v16.2d, v19.2d\n"
+ "str q10, [%x[Cpanel], #0x50]\n"
+ "uzp1 v19.2d, v20.2d, v23.2d\n"
+ "uzp2 v20.2d, v20.2d, v23.2d\n"
+ "str q13, [%x[Cpanel], #0x60]\n"
+ "uzp1 v23.2d, v21.2d, v24.2d\n"
+ "uzp2 v21.2d, v21.2d, v24.2d\n"
+ "str q17, [%x[Cpanel], #0x70]\n"
+ "uzp1 v24.2d, v22.2d, v25.2d\n"
+ "uzp2 v22.2d, v22.2d, v25.2d\n"
+ "str q18, [%x[Cpanel], #0x80]\n"
+ "uzp1 v25.2d, v26.2d, v29.2d\n"
+ "uzp2 v26.2d, v26.2d, v29.2d\n"
+ "str q14, [%x[Cpanel], #0x90]\n"
+ "uzp1 v29.2d, v27.2d, v30.2d\n"
+ "uzp2 v27.2d, v27.2d, v30.2d\n"
+ "str q15, [%x[Cpanel], #0xa0]\n"
+ "uzp1 v30.2d, v28.2d, v31.2d\n"
+ "uzp2 v28.2d, v28.2d, v31.2d\n"
+ "str q16, [%x[Cpanel], #0xb0]\n"
+ "str q19, [%x[Cpanel], #0xc0]\n"
+ "str q23, [%x[Cpanel], #0xd0]\n"
+ "str q24, [%x[Cpanel], #0xe0]\n"
+ "str q20, [%x[Cpanel], #0xf0]\n"
+ "str q21, [%x[Cpanel], #0x100]\n"
+ "str q22, [%x[Cpanel], #0x110]\n"
+ "str q25, [%x[Cpanel], #0x120]\n"
+ "str q29, [%x[Cpanel], #0x130]\n"
+ "str q30, [%x[Cpanel], #0x140]\n"
+ "str q26, [%x[Cpanel], #0x150]\n"
+ "str q27, [%x[Cpanel], #0x160]\n"
+ "str q28, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
index f492a474a..fa93c1d90 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
@@ -35,6 +35,7 @@ namespace arm_gemm
{
// Actual kernel implementations
void a64_interleaved_u8u32_mmla_8x12( ARGLIST );
+void a64_interleaved_u8u32_mmla_8x12_a510( ARGLIST );
class cls_a64_interleaved_u8u32_mmla_8x12
{
@@ -91,7 +92,7 @@ public:
case CPUModel::A510:
return { 47.66, 2.47, 0.29 };
case CPUModel::V1:
- return { 111.60, 4.95, 0.66 };
+ return { 75.54, 8.06, 0.63 };
}
}
@@ -100,8 +101,15 @@ public:
// Default to the generic kernel
kern_type kernel=a64_interleaved_u8u32_mmla_8x12;
- cls_a64_interleaved_u8u32_mmla_8x12(const CPUInfo *)
+ cls_a64_interleaved_u8u32_mmla_8x12(const CPUInfo *ci)
{
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A510:
+ kernel=a64_interleaved_u8u32_mmla_8x12_a510;
+ break;
+ }
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp
new file mode 100644
index 000000000..3fe1a9bd0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_interleaved_u8u32_mmla_8x12_a510(
+ const uint8_t *Apanel, const uint8_t *Bpanel,
+ uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const uint8_t *Bpanel = {};
+ } ka;
+
+ ka.bblocks = bblocks;
+ ka.K = (K/8) - 1;
+ ka.Bpanel = Bpanel;
+
+ __asm__ __volatile__(
+
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "mov %x[Apanel], x21\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v4.16b }, [x20], #0x10\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ "cmp x19, #0x2\n"
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "ld1 { v5.16b }, [x20], #0x10\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x6e85a40b // ummla v11.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e85a431 // ummla v17.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
+ "sub x19, x19, #0x2\n"
+ ".inst 0x6e85a457 // ummla v23.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
+ "cmp x19, #0x2\n"
+ ".inst 0x6e85a47d // ummla v29.4s, v3.16b, v5.16b\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e84a40a // ummla v10.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e85a40d // ummla v13.4s, v0.16b, v5.16b\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a47b // ummla v27.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e87a432 // ummla v18.4s, v1.16b, v7.16b\n"
+ "ld1 { v6.16b }, [x20], #0x10\n"
+ ".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e87a458 // ummla v24.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a47e // ummla v30.4s, v3.16b, v7.16b\n"
+ "ld1 { v7.16b }, [x20], #0x10\n"
+ ".inst 0x6e84a456 // ummla v22.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e84a47c // ummla v28.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e85a47f // ummla v31.4s, v3.16b, v5.16b\n"
+ "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a431 // ummla v17.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e86a47a // ummla v26.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e87a47d // ummla v29.4s, v3.16b, v7.16b\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x6e84a409 // ummla v9.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e85a40c // ummla v12.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e84a42f // ummla v15.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e84a455 // ummla v21.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e84a47b // ummla v27.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e85a432 // ummla v18.4s, v1.16b, v5.16b\n"
+ "ld1 { v4.16b }, [x20], #0x10\n"
+ ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e87a433 // ummla v19.4s, v1.16b, v7.16b\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e85a47e // ummla v30.4s, v3.16b, v5.16b\n"
+ "ld1 { v5.16b }, [x20], #0x10\n"
+ ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e86a47c // ummla v28.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e87a47f // ummla v31.4s, v3.16b, v7.16b\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x6e85a40b // ummla v11.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e85a431 // ummla v17.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e85a457 // ummla v23.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e85a47d // ummla v29.4s, v3.16b, v5.16b\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a47b // ummla v27.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e84a40a // ummla v10.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e85a40d // ummla v13.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e87a432 // ummla v18.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e87a458 // ummla v24.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a47e // ummla v30.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e84a456 // ummla v22.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e84a47c // ummla v28.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e85a47f // ummla v31.4s, v3.16b, v5.16b\n"
+ "cbz x19, 5f\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v6.16b }, [x20], #0x10\n"
+ ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v7.16b }, [x20], #0x10\n"
+ ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x6e87a431 // ummla v17.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e86a47a // ummla v26.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e87a47d // ummla v29.4s, v3.16b, v7.16b\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x6e84a409 // ummla v9.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e84a42f // ummla v15.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e84a455 // ummla v21.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e84a47b // ummla v27.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e85a40c // ummla v12.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e85a432 // ummla v18.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e87a433 // ummla v19.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e85a47e // ummla v30.4s, v3.16b, v5.16b\n"
+ ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e86a47c // ummla v28.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e87a47f // ummla v31.4s, v3.16b, v7.16b\n"
+ "5:" // multiply loop done
+ "subs x22, x22, #0x1\n"
+ "uzp1 v4.2d, v8.2d, v11.2d\n"
+ "uzp2 v8.2d, v8.2d, v11.2d\n"
+ "uzp1 v11.2d, v9.2d, v12.2d\n"
+ "uzp2 v9.2d, v9.2d, v12.2d\n"
+ "str q4, [%x[Cpanel], #0x0]\n"
+ "uzp1 v12.2d, v10.2d, v13.2d\n"
+ "uzp2 v10.2d, v10.2d, v13.2d\n"
+ "str q11, [%x[Cpanel], #0x10]\n"
+ "str q12, [%x[Cpanel], #0x20]\n"
+ "uzp1 v13.2d, v14.2d, v17.2d\n"
+ "uzp2 v14.2d, v14.2d, v17.2d\n"
+ "str q8, [%x[Cpanel], #0x30]\n"
+ "uzp1 v17.2d, v15.2d, v18.2d\n"
+ "uzp2 v15.2d, v15.2d, v18.2d\n"
+ "str q9, [%x[Cpanel], #0x40]\n"
+ "uzp1 v18.2d, v16.2d, v19.2d\n"
+ "uzp2 v16.2d, v16.2d, v19.2d\n"
+ "str q10, [%x[Cpanel], #0x50]\n"
+ "uzp1 v19.2d, v20.2d, v23.2d\n"
+ "uzp2 v20.2d, v20.2d, v23.2d\n"
+ "str q13, [%x[Cpanel], #0x60]\n"
+ "uzp1 v23.2d, v21.2d, v24.2d\n"
+ "uzp2 v21.2d, v21.2d, v24.2d\n"
+ "str q17, [%x[Cpanel], #0x70]\n"
+ "uzp1 v24.2d, v22.2d, v25.2d\n"
+ "uzp2 v22.2d, v22.2d, v25.2d\n"
+ "str q18, [%x[Cpanel], #0x80]\n"
+ "uzp1 v25.2d, v26.2d, v29.2d\n"
+ "uzp2 v26.2d, v26.2d, v29.2d\n"
+ "str q14, [%x[Cpanel], #0x90]\n"
+ "uzp1 v29.2d, v27.2d, v30.2d\n"
+ "uzp2 v27.2d, v27.2d, v30.2d\n"
+ "str q15, [%x[Cpanel], #0xa0]\n"
+ "uzp1 v30.2d, v28.2d, v31.2d\n"
+ "uzp2 v28.2d, v28.2d, v31.2d\n"
+ "str q16, [%x[Cpanel], #0xb0]\n"
+ "str q19, [%x[Cpanel], #0xc0]\n"
+ "str q23, [%x[Cpanel], #0xd0]\n"
+ "str q24, [%x[Cpanel], #0xe0]\n"
+ "str q20, [%x[Cpanel], #0xf0]\n"
+ "str q21, [%x[Cpanel], #0x100]\n"
+ "str q22, [%x[Cpanel], #0x110]\n"
+ "str q25, [%x[Cpanel], #0x120]\n"
+ "str q29, [%x[Cpanel], #0x130]\n"
+ "str q30, [%x[Cpanel], #0x140]\n"
+ "str q26, [%x[Cpanel], #0x150]\n"
+ "str q27, [%x[Cpanel], #0x160]\n"
+ "str q28, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp
index 30e265fbc..ab175a375 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp
@@ -83,7 +83,7 @@ public:
case CPUModel::A510:
return { 5.42 };
case CPUModel::V1:
- return { 34.56 };
+ return { 20.83 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp
index 61c7ad17e..b7c9aca9d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp
@@ -83,7 +83,7 @@ public:
case CPUModel::A510:
return { 5.31 };
case CPUModel::V1:
- return { 28.93 };
+ return { 17.32 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
index b8ca7c545..28057aa96 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
@@ -94,7 +94,7 @@ public:
case CPUModel::A510:
return { 22.77, 3.90, 0.47 };
case CPUModel::V1:
- return { 62.97, 19.14, 0.92 };
+ return { 48.09, 16.24, 0.83 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp
index b88ef14f2..c08977570 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp
@@ -94,7 +94,7 @@ public:
case CPUModel::A510:
return { 23.87, 3.89, 0.37 };
case CPUModel::V1:
- return { 107.63, 19.24, 0.92 };
+ return { 75.14, 15.87, 0.83 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
index d870711c6..901cc6d63 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
@@ -94,7 +94,7 @@ public:
case CPUModel::A510:
return { 22.75, 3.90, 0.47 };
case CPUModel::V1:
- return { 62.97, 19.27, 0.92 };
+ return { 48.09, 16.24, 0.83 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp
index 7f8eadc52..c0d089278 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp
@@ -94,7 +94,7 @@ public:
case CPUModel::A510:
return { 26.80, 3.89, 0.47 };
case CPUModel::V1:
- return { 108.33, 18.66, 0.92 };
+ return { 75.14, 15.87, 0.83 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
index fa44bdbd3..fc91dd71a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
@@ -80,7 +80,7 @@ public:
case CPUModel::A510:
return { 7.78, 4.01, 2.43 };
case CPUModel::V1:
- return { 62.50, 5.09, 11.32 };
+ return { 47.63, 5.11, 6.80 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
index 1924b2cf7..0d707b039 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
@@ -89,7 +89,7 @@ public:
default:
return { 31.67, 3.57, 0.50 };
case CPUModel::V1:
- return { 63.35, 4.76, 0.77 };
+ return { 52.24, 7.49, 0.80 };
case CPUModel::A510:
return { 27.47, 1.70, 0.28 };
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
index bd1764bb7..4e65296f8 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
@@ -89,7 +89,7 @@ public:
default:
return { 61.97, 3.64, 0.50 };
case CPUModel::V1:
- return { 123.84, 4.93, 0.76 };
+ return { 95.28, 7.99, 0.79 };
case CPUModel::A510:
return { 43.36, 1.86, 0.28 };
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
index f66a9bf51..0afcdd2ce 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
@@ -91,7 +91,7 @@ public:
case CPUModel::A510:
return { 27.45, 1.65, 0.28 };
case CPUModel::V1:
- return { 63.35, 4.96, 0.77 };
+ return { 52.24, 7.49, 0.80 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
index b530202bd..58d21d6c4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
@@ -91,7 +91,7 @@ public:
case CPUModel::A510:
return { 38.02, 1.85, 0.28 };
case CPUModel::V1:
- return { 123.84, 4.98, 0.76 };
+ return { 95.28, 7.99, 0.79 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/transform.cpp b/src/core/NEON/kernels/arm_gemm/transform.cpp
index 60376ab80..c6a3bc0ed 100644
--- a/src/core/NEON/kernels/arm_gemm/transform.cpp
+++ b/src/core/NEON/kernels/arm_gemm/transform.cpp
@@ -25,7 +25,9 @@
#include "bfloat.hpp"
+#if !defined(__OpenBSD__)
#include <alloca.h>
+#endif /* !defined(__OpenBSD__) */
namespace arm_gemm {
diff --git a/src/core/NEON/kernels/assembly/depthwise.hpp b/src/core/NEON/kernels/assembly/depthwise.hpp
index eadf48d00..9262ea05a 100644
--- a/src/core/NEON/kernels/assembly/depthwise.hpp
+++ b/src/core/NEON/kernels/assembly/depthwise.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -78,10 +78,20 @@ struct DepthwiseArgs
template <typename TInput, typename TWeight, typename TOutput>
class DepthwiseCommon : public IDepthwiseCommon
{
+private:
+ std::string _name{};
+
protected:
const DepthwiseArgs m_args; // Copy of arguments
-
public:
+ std::string name() const
+ {
+ return _name;
+ }
+ void set_name(const std::string &n)
+ {
+ _name = n;
+ }
DepthwiseCommon(const DepthwiseArgs &args)
: m_args(args) {};
DepthwiseCommon(DepthwiseCommon &) = delete;
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index b81b498ae..ef16cb5e7 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -232,6 +232,18 @@ const std::string &string_from_pooling_type(PoolingType type)
return pool_type_map[type];
}
+bool is_pool_region_entirely_outside_input(const PoolingLayerInfo &info)
+{
+ if(info.is_global_pooling || info.exclude_padding || info.pool_size.x() == 0 || info.pool_size.y() == 0)
+ {
+ return false;
+ }
+ const auto ps = info.pad_stride_info;
+ const auto pool_le_padding_x = info.pool_size.x() <= std::max({ ps.pad_left(), ps.pad_right() });
+ const auto pool_le_padding_y = info.pool_size.y() <= std::max({ ps.pad_top(), ps.pad_bottom() });
+ return pool_le_padding_x || pool_le_padding_y;
+}
+
const std::string &string_from_gemmlowp_output_stage(GEMMLowpOutputStageType output_stage)
{
static std::map<GEMMLowpOutputStageType, const std::string> output_stage_map =
@@ -330,6 +342,13 @@ std::string lower_string(const std::string &val)
return res;
}
+std::string upper_string(const std::string &val)
+{
+ std::string res = val;
+ std::transform(res.begin(), res.end(), res.begin(), ::toupper);
+ return res;
+}
+
PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info, DataLayout data_layout, const Size2D &dilation,
const DimensionRoundingType &rounding_type)
{
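
The new is_pool_region_entirely_outside_input() helper only fires when padding is counted as part of the pooling region (exclude_padding == false) and the pool window is small enough to fit entirely inside the left/right or top/bottom padding. A minimal sketch of the situation it detects, assuming the usual PoolingLayerInfo constructor taking a pool type, Size2D pool size, DataLayout and PadStrideInfo (declared in arm_compute/core/Types.h):

    // Hypothetical configuration: 2x2 average pool, stride 1, 3 pixels of padding
    // on the left/right, padding included in the average. The 2x2 window can sit
    // entirely inside the pad area, so the helper returns true and the pooling
    // kernels have to define what such a region produces.
    const PoolingLayerInfo info(PoolingType::AVG,
                                Size2D(2, 2),
                                DataLayout::NHWC,
                                PadStrideInfo(1 /* stride_x */, 1 /* stride_y */,
                                              3 /* pad_x */, 0 /* pad_y */),
                                false /* exclude_padding */);
    const bool degenerate = is_pool_region_entirely_outside_input(info); // true here
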
diff --git a/src/core/common/Registrars.h b/src/core/common/Registrars.h
index 65f6c7093..c7fbf7f83 100644
--- a/src/core/common/Registrars.h
+++ b/src/core/common/Registrars.h
@@ -32,6 +32,12 @@
#define REGISTER_FP16_SVE(func_name) nullptr
#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
+#if defined(ARM_COMPUTE_ENABLE_SVE2)
+#define REGISTER_FP16_SVE2(func_name) &(func_name)
+#else /* !defined(ARM_COMPUTE_ENABLE_SVE2) */
+#define REGISTER_FP16_SVE2(func_name) nullptr
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
+
#if defined(ARM_COMPUTE_ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
#define REGISTER_FP16_NEON(func_name) &(func_name)
#else /* !defined(ARM_COMPUTE_ENABLE_NEON) */
@@ -41,6 +47,7 @@
#else /* !defined(ENABLE_FP16_KERNELS) */
#define REGISTER_FP16_NEON(func_name) nullptr
#define REGISTER_FP16_SVE(func_name) nullptr
+#define REGISTER_FP16_SVE2(func_name) nullptr
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
#if defined(ENABLE_FP32_KERNELS)
@@ -51,6 +58,12 @@
#define REGISTER_FP32_SVE(func_name) nullptr
#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
+#if defined(ARM_COMPUTE_ENABLE_SVE2)
+#define REGISTER_FP32_SVE2(func_name) &(func_name)
+#else /* !defined(ARM_COMPUTE_ENABLE_SVE2) */
+#define REGISTER_FP32_SVE2(func_name) nullptr
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
+
#if defined(ARM_COMPUTE_ENABLE_NEON)
#define REGISTER_FP32_NEON(func_name) &(func_name)
#else /* !defined(ARM_COMPUTE_ENABLE_NEON) */
@@ -60,6 +73,7 @@
#else /* defined(ENABLE_FP32_KERNELS) */
#define REGISTER_FP32_NEON(func_name) nullptr
#define REGISTER_FP32_SVE(func_name) nullptr
+#define REGISTER_FP32_SVE2(func_name) nullptr
#endif /* defined(ENABLE_FP32_KERNELS) */
#if defined(ENABLE_QASYMM8_SIGNED_KERNELS)
@@ -72,9 +86,16 @@
#define REGISTER_QASYMM8_SIGNED_SVE(func_name) nullptr
#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
+#if defined(ARM_COMPUTE_ENABLE_SVE2)
+#define REGISTER_QASYMM8_SIGNED_SVE2(func_name) &(func_name)
+#else /* !defined(ARM_COMPUTE_ENABLE_SVE2) */
+#define REGISTER_QASYMM8_SIGNED_SVE2(func_name) nullptr
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
+
#else /* defined(ENABLE_QASYMM8_SIGNED_KERNELS) */
#define REGISTER_QASYMM8_SIGNED_NEON(func_name) nullptr
#define REGISTER_QASYMM8_SIGNED_SVE(func_name) nullptr
+#define REGISTER_QASYMM8_SIGNED_SVE2(func_name) nullptr
#endif /* defined(ENABLE_QASYMM8_SIGNED_KERNELS) */
#if defined(ENABLE_QASYMM8_KERNELS)
@@ -86,9 +107,16 @@
#define REGISTER_QASYMM8_SVE(func_name) nullptr
#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
+#if defined(ARM_COMPUTE_ENABLE_SVE2)
+#define REGISTER_QASYMM8_SVE2(func_name) &(func_name)
+#else /* !defined(ARM_COMPUTE_ENABLE_SVE2) */
+#define REGISTER_QASYMM8_SVE2(func_name) nullptr
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
+
#else /* defined(ENABLE_QASYMM8_KERNELS) */
#define REGISTER_QASYMM8_NEON(func_name) nullptr
#define REGISTER_QASYMM8_SVE(func_name) nullptr
+#define REGISTER_QASYMM8_SVE2(func_name) nullptr
#endif /* defined(ENABLE_QASYMM8_KERNELS) */
#if defined(ENABLE_QSYMM16_KERNELS)
@@ -101,9 +129,16 @@
#define REGISTER_QSYMM16_SVE(func_name) nullptr
#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
+#if defined(ARM_COMPUTE_ENABLE_SVE2)
+#define REGISTER_QSYMM16_SVE2(func_name) &(func_name)
+#else /* !defined(ARM_COMPUTE_ENABLE_SVE2) */
+#define REGISTER_QSYMM16_SVE2(func_name) nullptr
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
+
#else /* defined(ENABLE_QSYMM16_KERNELS) */
#define REGISTER_QSYMM16_NEON(func_name) nullptr
#define REGISTER_QSYMM16_SVE(func_name) nullptr
+#define REGISTER_QSYMM16_SVE2(func_name) nullptr
#endif /* defined(ENABLE_QSYMM16_KERNELS) */
#if defined(ENABLE_INTEGER_KERNELS)
@@ -114,6 +149,12 @@
#define REGISTER_INTEGER_SVE(func_name) nullptr
#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
+#if defined(ARM_COMPUTE_ENABLE_SVE2)
+#define REGISTER_INTEGER_SVE2(func_name) &(func_name)
+#else /* !defined(ARM_COMPUTE_ENABLE_SVE2) */
+#define REGISTER_INTEGER_SVE2(func_name) nullptr
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
+
#if defined(ARM_COMPUTE_ENABLE_NEON)
#define REGISTER_INTEGER_NEON(func_name) &(func_name)
#else /* !defined(ARM_COMPUTE_ENABLE_NEON) */
@@ -123,6 +164,7 @@
#else /* defined(ENABLE_INTEGER_KERNELS) */
#define REGISTER_INTEGER_NEON(func_name) nullptr
#define REGISTER_INTEGER_SVE(func_name) nullptr
+#define REGISTER_INTEGER_SVE2(func_name) nullptr
#endif /* defined(ENABLE_INTEGER_KERNELS) */
#endif /* SRC_CORE_COMMON_REGISTRARS_H */
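
The added REGISTER_*_SVE2 registrars follow the same convention as the existing NEON/SVE ones: they expand to the address of the given function when both the data-type switch and ARM_COMPUTE_ENABLE_SVE2 are enabled, and to nullptr otherwise. That keeps SVE2 entries permanently present in the kernel tables while builds without SVE2 simply fall through to the next matching implementation. A short illustration, using a hypothetical sve2_qasymm8_foo micro-kernel:

    // Built with ENABLE_QASYMM8_KERNELS and ARM_COMPUTE_ENABLE_SVE2:
    //   REGISTER_QASYMM8_SVE2(sve2_qasymm8_foo)   // -> &(sve2_qasymm8_foo)
    // Built without SVE2 support (or without QASYMM8 kernels):
    //   REGISTER_QASYMM8_SVE2(sve2_qasymm8_foo)   // -> nullptr
    //
    // A nullptr ukernel is exactly what KernelSelectionType::Supported filters out
    // in ICpuKernel::get_implementation(), while Preferred still returns the entry
    // so the name of the best implementation for the ISA can be reported.
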
diff --git a/src/cpu/CpuContext.cpp b/src/cpu/CpuContext.cpp
index 1a971a6a1..0cc507091 100644
--- a/src/cpu/CpuContext.cpp
+++ b/src/cpu/CpuContext.cpp
@@ -28,7 +28,10 @@
#include "src/cpu/CpuTensor.h"
#include <cstdlib>
+#if !defined(__APPLE__) && !defined(__OpenBSD__)
#include <malloc.h>
+#endif // !defined(__APPLE__) && !defined(__OpenBSD__)
+
namespace arm_compute
{
@@ -50,11 +53,11 @@ void *default_aligned_allocate(void *user_data, size_t size, size_t alignment)
{
ARM_COMPUTE_UNUSED(user_data);
void *ptr = nullptr;
-#if defined(BARE_METAL) || defined(__APPLE__)
+#if defined(BARE_METAL)
size_t rem = size % alignment;
size_t real_size = (rem) ? (size + alignment - rem) : size;
ptr = memalign(alignment, real_size);
-#else /* defined(BARE_METAL) || defined(__APPLE__) */
+#else /* defined(BARE_METAL) */
if(posix_memalign(&ptr, alignment, size) != 0)
{
// posix_memalign returns non-zero on failures, the return values will be
@@ -62,7 +65,7 @@ void *default_aligned_allocate(void *user_data, size_t size, size_t alignment)
// - ENOMEM: insufficient memory
ARM_COMPUTE_LOG_ERROR_ACL("posix_memalign failed, the returned pointer will be invalid");
}
-#endif /* defined(BARE_METAL) || defined(__APPLE__) */
+#endif /* defined(BARE_METAL) */
return ptr;
}
void default_aligned_free(void *user_data, void *ptr)
diff --git a/src/cpu/ICpuKernel.h b/src/cpu/ICpuKernel.h
index 650b3a7d0..8f4106240 100644
--- a/src/cpu/ICpuKernel.h
+++ b/src/cpu/ICpuKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,12 +25,46 @@
#define ARM_COMPUTE_ICPUKERNEL_H
#include "arm_compute/core/CPP/ICPPKernel.h"
+#include "src/cpu/kernels/CpuKernelSelectionTypes.h"
namespace arm_compute
{
namespace cpu
{
-using ICpuKernel = arm_compute::ICPPKernel;
+enum class KernelSelectionType
+{
+ Preferred, /**< Retrieve the best implementation available for the given Cpu ISA, ignoring the build flags */
+ Supported /**< Retrieve the best implementation available for the given Cpu ISA that is supported by the current build */
+};
+
+template <class Derived>
+class ICpuKernel : public ICPPKernel
+{
+public:
+ /** Micro-kernel selector
+ *
+ * @param[in] selector Selection struct containing the information needed to pick the appropriate micro-kernel
+ * @param[in] selection_type (Optional) Decides whether to get the best implementation for the given hardware or for the given build
+ *
+ * @return A matching micro-kernel else nullptr
+ */
+
+ template <typename SelectorType>
+ static const auto *get_implementation(const SelectorType &selector, KernelSelectionType selection_type = KernelSelectionType::Supported)
+ {
+ using kernel_type = typename std::remove_reference<decltype(Derived::get_available_kernels())>::type::value_type;
+
+ for(const auto &uk : Derived::get_available_kernels())
+ {
+ if(uk.is_selected(selector) && (selection_type == KernelSelectionType::Preferred || uk.ukernel != nullptr))
+ {
+ return &uk;
+ }
+ }
+
+ return static_cast<kernel_type *>(nullptr);
+ }
+};
} // namespace cpu
} // namespace arm_compute
#endif /* ARM_COMPUTE_ICPUKERNEL_H */
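
The ICpuKernel rework replaces the old type alias with a CRTP base: each derived kernel publishes a static get_available_kernels() table and inherits get_implementation() to search it. A minimal sketch of the contract, using a hypothetical CpuFooKernel and the DataTypeISASelectorData/DataTypeISASelectorPtr types that the refactored kernels below rely on:

    class CpuFooKernel : public ICpuKernel<CpuFooKernel>
    {
    public:
        // Table entry layout mirrors the ones declared by CpuActivationKernel and CpuAddKernel.
        struct FooKernel
        {
            const char                  *name;
            const DataTypeISASelectorPtr is_selected;
            void (*ukernel)(const ITensor *, ITensor *, const Window &);
        };

        static const std::vector<FooKernel> &get_available_kernels()
        {
            static const std::vector<FooKernel> kernels =
            {
                {
                    "neon_fp32_foo",
                    [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; },
                    nullptr // a real kernel wraps its function pointer in a REGISTER_*_NEON/SVE/SVE2 registrar
                },
            };
            return kernels;
        }
    };

    // Selection then looks like the configure()/validate() calls further down:
    //   const auto *uk = CpuFooKernel::get_implementation(
    //       DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
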
diff --git a/src/cpu/kernels/CpuActivationKernel.cpp b/src/cpu/kernels/CpuActivationKernel.cpp
index 70ab06fc8..cfe732ab5 100644
--- a/src/cpu/kernels/CpuActivationKernel.cpp
+++ b/src/cpu/kernels/CpuActivationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,94 +43,60 @@ namespace kernels
{
namespace
{
-struct ActivationSelectorData
+static const std::vector<CpuActivationKernel::ActivationKernel> available_kernels =
{
- DataType dt;
- const CPUInfo &ci;
-};
-
-using ActivationSelectorPtr = std::add_pointer<bool(const ActivationSelectorData &data)>::type;
-using ActivationKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const ActivationLayerInfo &, const Window &)>::type;
-
-struct ActivationKernel
-{
- const char *name;
- const ActivationSelectorPtr is_selected;
- ActivationKernelPtr ukernel;
-};
-
-static const ActivationKernel available_kernels[] =
-{
-#if defined(ARM_COMPUTE_ENABLE_SVE)
{
"sve_fp16_activation",
- [](const ActivationSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); },
- REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_activation)
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.sve; },
+ REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation)
},
{
"sve_fp32_activation",
- [](const ActivationSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); },
- REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_activation)
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::F32 && data.isa.sve; },
+ REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_activation)
},
-#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
-#if defined(ARM_COMPUTE_ENABLE_NEON)
{
"neon_fp16_activation",
- [](const ActivationSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_activation)
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_activation)
},
{
"neon_fp32_activation",
- [](const ActivationSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_activation)
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_activation)
},
-#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
-#if defined(ARM_COMPUTE_ENABLE_SVE2)
{
- "sve_qu8_activation",
- [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve2(); },
- REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_activation)
+ "sve2_qu8_activation",
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8 && data.isa.sve2; },
+ REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_qasymm8_activation)
},
{
- "sve_qs8_activation",
- [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve2(); },
- REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_activation)
+ "sve2_qs8_activation",
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2; },
+ REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_qasymm8_signed_activation)
},
{
- "sve_qs16_activation",
- [](const ActivationSelectorData & data) { return data.dt == DataType::QSYMM16 && data.ci.has_sve2(); },
- REGISTER_QSYMM16_SVE(arm_compute::cpu::qsymm16_sve_activation)
+ "sve2_qs16_activation",
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::QSYMM16 && data.isa.sve2; },
+ REGISTER_QSYMM16_SVE2(arm_compute::cpu::sve2_qsymm16_activation)
},
-#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
{
"neon_qu8_activation",
- [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8; },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_activation)
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8; },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_activation)
},
{
"neon_qs8_activation",
- [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_activation)
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_activation)
},
{
"neon_qs16_activation",
- [](const ActivationSelectorData & data) { return data.dt == DataType::QSYMM16; },
- REGISTER_QSYMM16_NEON(arm_compute::cpu::qsymm16_neon_activation)
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::QSYMM16; },
+ REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qsymm16_activation)
},
};
-const ActivationKernel *get_implementation(const ActivationSelectorData &data)
-{
- for(const auto &uk : available_kernels)
- {
- if(uk.is_selected(data))
- {
- return &uk;
- }
- }
- return nullptr;
-}
-
/* Supported activation in the 8-bit integer domain */
static const std::array<ActivationLayerInfo::ActivationFunction, 7> qasymm8_activations =
{
@@ -155,7 +121,8 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::QSYMM16, DataType::F16, DataType::F32);
- const auto *uk = get_implementation(ActivationSelectorData{ src->data_type(), CPUInfo::get() });
+ const auto *uk = CpuActivationKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
const DataType data_type = src->data_type();
@@ -208,7 +175,8 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac
ARM_COMPUTE_ERROR_ON_NULLPTR(src);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, activation_info));
- const auto uk = get_implementation(ActivationSelectorData{ src->data_type(), CPUInfo::get() });
+ const auto uk = CpuActivationKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+
ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
_act_info = activation_info;
@@ -232,9 +200,10 @@ Status CpuActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *
size_t CpuActivationKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
{
- ARM_COMPUTE_UNUSED(platform, thread_count);
+ ARM_COMPUTE_UNUSED(thread_count);
+ ARM_COMPUTE_UNUSED(platform);
- return ICPPKernel::small_network_mws;
+ return ICPPKernel::default_mws;
}
void CpuActivationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
@@ -262,6 +231,11 @@ const char *CpuActivationKernel::name() const
{
return _name.c_str();
}
+
+const std::vector<CpuActivationKernel::ActivationKernel> &CpuActivationKernel::get_available_kernels()
+{
+ return available_kernels;
+}
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
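
Exposing the table through get_available_kernels() also makes the selector usable outside configure()/validate(): KernelSelectionType::Preferred reports the best micro-kernel for the machine's ISA even when the current build compiled it out. A hedged sketch, assuming it runs inside the arm_compute::cpu::kernels namespace:

    // Preferred ignores the build flags, so ukernel may legitimately be nullptr
    // here (e.g. an FP16/SVE kernel in a build without ENABLE_FP16_KERNELS).
    const auto *preferred = CpuActivationKernel::get_implementation(
        DataTypeISASelectorData{ DataType::F16, CPUInfo::get().get_isa() },
        KernelSelectionType::Preferred);

    if(preferred != nullptr)
    {
        const char *best_name = preferred->name; // e.g. "sve_fp16_activation" on an SVE machine
        ARM_COMPUTE_UNUSED(best_name);
    }
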
diff --git a/src/cpu/kernels/CpuActivationKernel.h b/src/cpu/kernels/CpuActivationKernel.h
index 8e78d8601..b0476303f 100644
--- a/src/cpu/kernels/CpuActivationKernel.h
+++ b/src/cpu/kernels/CpuActivationKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,8 +34,11 @@ namespace cpu
namespace kernels
{
/** Interface for the activation kernel */
-class CpuActivationKernel : public ICpuKernel
+class CpuActivationKernel : public ICpuKernel<CpuActivationKernel>
{
+private:
+ using ActivationKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const ActivationLayerInfo &, const Window &)>::type;
+
public:
CpuActivationKernel() = default;
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuActivationKernel);
@@ -70,8 +73,14 @@ public:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
-private:
- using ActivationKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const ActivationLayerInfo &, const Window &)>::type;
+ struct ActivationKernel
+ {
+ const char *name;
+ const DataTypeISASelectorPtr is_selected;
+ ActivationKernelPtr ukernel;
+ };
+
+ static const std::vector<ActivationKernel> &get_available_kernels();
private:
ActivationLayerInfo _act_info{};
diff --git a/src/cpu/kernels/CpuAddKernel.cpp b/src/cpu/kernels/CpuAddKernel.cpp
index 77aa1ffb3..d06621fae 100644
--- a/src/cpu/kernels/CpuAddKernel.cpp
+++ b/src/cpu/kernels/CpuAddKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,9 +30,7 @@
#include "src/core/common/Registrars.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-#include "src/cpu/kernels/add/neon/list.h"
-#include "src/cpu/kernels/add/sve/list.h"
-
+#include "src/cpu/kernels/add/list.h"
#include <array>
namespace arm_compute
@@ -43,159 +41,116 @@ namespace kernels
{
namespace
{
-struct AddSelectorData
-{
- DataType dt;
- const CPUInfo &ci;
-};
-
-using AddSelectorPtr = std::add_pointer<bool(const AddSelectorData &data)>::type;
-using AddKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type;
-struct AddKernel
+static const std::vector<CpuAddKernel::AddKernel> available_kernels =
{
- const char *name;
- const AddSelectorPtr is_selected;
- AddKernelPtr ukernel;
-};
-
-static const AddKernel available_kernels[] =
-{
-#if defined(ARM_COMPUTE_ENABLE_SVE2)
{
"sve2_qu8_add",
- [](const AddSelectorData & data)
+ [](const DataTypeISASelectorData & data)
{
- return (data.dt == DataType::QASYMM8) && data.ci.has_sve2();
+ return (data.dt == DataType::QASYMM8) && data.isa.sve2;
},
- REGISTER_QASYMM8_SVE(arm_compute::cpu::add_qasymm8_sve)
+ REGISTER_QASYMM8_SVE2(arm_compute::cpu::add_qasymm8_sve2)
},
{
"sve2_qs8_add",
- [](const AddSelectorData & data)
+ [](const DataTypeISASelectorData & data)
{
- return (data.dt == DataType::QASYMM8_SIGNED) && data.ci.has_sve2();
+ return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2;
},
- REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::add_qasymm8_signed_sve)
+ REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::add_qasymm8_signed_sve2)
},
{
"sve2_qs16_add",
- [](const AddSelectorData & data)
+ [](const DataTypeISASelectorData & data)
{
- return (data.dt == DataType::QSYMM16) && data.ci.has_sve2();
+ return (data.dt == DataType::QSYMM16) && data.isa.sve2;
},
- REGISTER_QSYMM16_SVE(arm_compute::cpu::add_qsymm16_sve)
+ REGISTER_QSYMM16_SVE2(arm_compute::cpu::add_qsymm16_sve2)
},
-#endif /* !defined(ARM_COMPUTE_ENABLE_SVE2) */
-#if defined(ARM_COMPUTE_ENABLE_SVE)
{
"sve_fp32_add",
- [](const AddSelectorData & data)
+ [](const DataTypeISASelectorData & data)
{
- return (data.dt == DataType::F32) && data.ci.has_sve();
+ return (data.dt == DataType::F32) && data.isa.sve;
},
- REGISTER_FP32_SVE(arm_compute::cpu::add_same_sve<float>)
+ REGISTER_FP32_SVE(arm_compute::cpu::add_fp32_sve)
},
{
"sve_fp16_add",
- [](const AddSelectorData & data)
+ [](const DataTypeISASelectorData & data)
{
- return (data.dt == DataType::F16) && data.ci.has_sve();
+ return (data.dt == DataType::F16) && data.isa.sve;
},
- REGISTER_FP16_SVE(arm_compute::cpu::add_same_sve<float16_t>)
+ REGISTER_FP16_SVE(arm_compute::cpu::add_fp16_sve)
},
{
"sve_u8_add",
- [](const AddSelectorData & data)
+ [](const DataTypeISASelectorData & data)
{
- return (data.dt == DataType::U8) && data.ci.has_sve();
+ return (data.dt == DataType::U8) && data.isa.sve;
},
- REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve<uint8_t>)
+ REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_sve)
},
{
"sve_s16_add",
- [](const AddSelectorData & data)
+ [](const DataTypeISASelectorData & data)
{
- return (data.dt == DataType::S16) && data.ci.has_sve();
+ return (data.dt == DataType::S16) && data.isa.sve;
},
- REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve<int16_t>)
+ REGISTER_INTEGER_SVE(arm_compute::cpu::add_s16_sve)
},
{
"sve_s32_add",
- [](const AddSelectorData & data)
+ [](const DataTypeISASelectorData & data)
{
- return (data.dt == DataType::S32) && data.ci.has_sve();
+ return (data.dt == DataType::S32) && data.isa.sve;
},
- REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve<int32_t>)
+ REGISTER_INTEGER_SVE(arm_compute::cpu::add_s32_sve)
},
-#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
-#if defined(ARM_COMPUTE_ENABLE_NEON)
{
"neon_fp32_add",
- [](const AddSelectorData & data) { return (data.dt == DataType::F32); },
- REGISTER_FP32_NEON(arm_compute::cpu::add_same_neon<float>)
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(arm_compute::cpu::add_fp32_neon)
},
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
{
"neon_fp16_add",
- [](const AddSelectorData & data)
+ [](const DataTypeISASelectorData & data)
{
- return (data.dt == DataType::F16) && data.ci.has_fp16();
+ return (data.dt == DataType::F16) && data.isa.fp16;
},
- REGISTER_FP16_NEON(arm_compute::cpu::add_same_neon<float16_t>)
+ REGISTER_FP16_NEON(arm_compute::cpu::add_fp16_neon)
},
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
{
"neon_u8_add",
- [](const AddSelectorData & data) { return (data.dt == DataType::U8); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon<uint8_t>)
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::U8); },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_neon)
},
{
"neon_s16_add",
- [](const AddSelectorData & data) { return (data.dt == DataType::S16); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon<int16_t>)
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::S16); },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::add_s16_neon)
},
{
"neon_s32_add",
- [](const AddSelectorData & data) { return (data.dt == DataType::S32); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon<int32_t>)
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::S32); },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::add_s32_neon)
},
-#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
-#if defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE)
{
"neon_qu8_add",
- [](const AddSelectorData & data) { return (data.dt == DataType::QASYMM8); },
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); },
REGISTER_QASYMM8_NEON(arm_compute::cpu::add_qasymm8_neon)
},
{
"neon_qs8_add",
- [](const AddSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_qasymm8_signed_neon)
},
{
"neon_qs16_add",
- [](const AddSelectorData & data) { return (data.dt == DataType::QSYMM16); },
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QSYMM16); },
REGISTER_QSYMM16_NEON(arm_compute::cpu::add_qsymm16_neon)
- },
-#endif /* defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) */
-};
-
-/** Micro-kernel selector
- *
- * @param[in] data Selection data passed to help pick the appropriate micro-kernel
- *
- * @return A matching micro-kernel else nullptr
- */
-const AddKernel *get_implementation(const CPUInfo &cpuinfo, DataType dt)
-{
- for(const auto &uk : available_kernels)
- {
- if(uk.is_selected({ dt, cpuinfo }))
- {
- return &uk;
- }
}
- return nullptr;
-}
+};
Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy)
{
@@ -222,7 +177,7 @@ Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, cons
"Wrong shape for dst");
}
- const auto *uk = get_implementation(CPUInfo::get(), src0.data_type());
+ const auto *uk = CpuAddKernel::get_implementation(DataTypeISASelectorData{ src0.data_type(), CPUInfo::get().get_isa() });
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
return Status{};
@@ -248,7 +203,8 @@ void CpuAddKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I
ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, policy));
- const auto uk = get_implementation(CPUInfo::get(), src0->data_type());
+ const auto uk = CpuAddKernel::get_implementation(DataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa() });
+
ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
_policy = policy;
@@ -292,11 +248,17 @@ const char *CpuAddKernel::name() const
return _name.c_str();
}
+const std::vector<CpuAddKernel::AddKernel> &CpuAddKernel::get_available_kernels()
+{
+ return available_kernels;
+}
+
size_t CpuAddKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
{
- ARM_COMPUTE_UNUSED(platform, thread_count);
+ ARM_COMPUTE_UNUSED(thread_count);
+ ARM_COMPUTE_UNUSED(platform);
- return ICPPKernel::small_network_mws;
+ return ICPPKernel::default_mws;
}
} // namespace kernels
diff --git a/src/cpu/kernels/CpuAddKernel.h b/src/cpu/kernels/CpuAddKernel.h
index a0c7e497d..663813558 100644
--- a/src/cpu/kernels/CpuAddKernel.h
+++ b/src/cpu/kernels/CpuAddKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,9 +34,19 @@ namespace cpu
namespace kernels
{
/** Interface for the kernel to perform addition between two tensors */
-class CpuAddKernel : public ICpuKernel
+class CpuAddKernel : public ICpuKernel<CpuAddKernel>
{
+private:
+ using AddKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type;
+
public:
+ struct AddKernel
+ {
+ const char *name;
+ const DataTypeISASelectorPtr is_selected;
+ AddKernelPtr ukernel;
+ };
+
CpuAddKernel() = default;
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuAddKernel);
/** Initialise the kernel's input, dst and border mode.
@@ -79,8 +89,7 @@ public:
*/
size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
-private:
- using AddKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type;
+ static const std::vector<AddKernel> &get_available_kernels();
private:
ConvertPolicy _policy{};
diff --git a/src/cpu/kernels/CpuCastKernel.h b/src/cpu/kernels/CpuCastKernel.h
index a8ce97230..7679178fa 100644
--- a/src/cpu/kernels/CpuCastKernel.h
+++ b/src/cpu/kernels/CpuCastKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,7 +37,7 @@ namespace kernels
*
* @note When casting between quantized types the scale and zeroPoint are ignored
*/
-class CpuCastKernel : public ICpuKernel
+class CpuCastKernel : public ICpuKernel<CpuCastKernel>
{
public:
CpuCastKernel() = default;
diff --git a/src/cpu/kernels/CpuCol2ImKernel.h b/src/cpu/kernels/CpuCol2ImKernel.h
index 8e09a2b68..deafcc14d 100644
--- a/src/cpu/kernels/CpuCol2ImKernel.h
+++ b/src/cpu/kernels/CpuCol2ImKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -52,7 +52,7 @@ namespace kernels
* \end{array} \right)
* @f]
*/
-class CpuCol2ImKernel : public ICpuKernel
+class CpuCol2ImKernel : public ICpuKernel<CpuCol2ImKernel>
{
public:
/** Default constructor */
diff --git a/src/cpu/kernels/CpuConcatenateBatchKernel.h b/src/cpu/kernels/CpuConcatenateBatchKernel.h
index 91f2808f8..0de68a5d6 100644
--- a/src/cpu/kernels/CpuConcatenateBatchKernel.h
+++ b/src/cpu/kernels/CpuConcatenateBatchKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,7 +36,7 @@ namespace kernels
/** Interface for the batch concatenate kernel.
* The input tensor will be concatenated into the output tensor.
*/
-class CpuConcatenateBatchKernel : public ICpuKernel
+class CpuConcatenateBatchKernel : public ICpuKernel<CpuConcatenateBatchKernel>
{
public:
CpuConcatenateBatchKernel() = default;
diff --git a/src/cpu/kernels/CpuConcatenateDepthKernel.h b/src/cpu/kernels/CpuConcatenateDepthKernel.h
index 063118b33..5a0edb95b 100644
--- a/src/cpu/kernels/CpuConcatenateDepthKernel.h
+++ b/src/cpu/kernels/CpuConcatenateDepthKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,7 +40,7 @@ namespace kernels
/** Interface for the depth concatenate kernel.
* The input tensor will be concatenated into the output tensor.
*/
-class CpuConcatenateDepthKernel : public ICpuKernel
+class CpuConcatenateDepthKernel : public ICpuKernel<CpuConcatenateDepthKernel>
{
public:
CpuConcatenateDepthKernel() = default;
diff --git a/src/cpu/kernels/CpuConcatenateHeightKernel.h b/src/cpu/kernels/CpuConcatenateHeightKernel.h
index 883c59a20..74d5d0c2c 100644
--- a/src/cpu/kernels/CpuConcatenateHeightKernel.h
+++ b/src/cpu/kernels/CpuConcatenateHeightKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,7 +36,7 @@ namespace kernels
/** Interface for the height concatenate kernel.
* The source tensor will be concatenated into the destination tensor.
*/
-class CpuConcatenateHeightKernel : public ICpuKernel
+class CpuConcatenateHeightKernel : public ICpuKernel<CpuConcatenateHeightKernel>
{
public:
CpuConcatenateHeightKernel() = default;
diff --git a/src/cpu/kernels/CpuConcatenateWidthKernel.h b/src/cpu/kernels/CpuConcatenateWidthKernel.h
index 3b4612ab0..418bc51b3 100644
--- a/src/cpu/kernels/CpuConcatenateWidthKernel.h
+++ b/src/cpu/kernels/CpuConcatenateWidthKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,7 +37,7 @@ namespace kernels
/** Interface for the width concatenate kernel.
* The source tensor will be concatenated into the destination tensor.
*/
-class CpuConcatenateWidthKernel : public ICPPKernel
+class CpuConcatenateWidthKernel : public ICpuKernel<CpuConcatenateWidthKernel>
{
public:
CpuConcatenateWidthKernel() = default;
diff --git a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h
index 70f0a742f..9a1393323 100644
--- a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h
+++ b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,7 +41,7 @@ namespace kernels
*
* @note This function assumes the weights are already reshaped (transposed)
*/
-class CpuConvertFullyConnectedWeightsKernel : public ICpuKernel
+class CpuConvertFullyConnectedWeightsKernel : public ICpuKernel<CpuConvertFullyConnectedWeightsKernel>
{
public:
CpuConvertFullyConnectedWeightsKernel() = default;
diff --git a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h
index 8cce1eaf1..b5eaf6548 100644
--- a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h
+++ b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,7 +34,7 @@ namespace cpu
namespace kernels
{
/** Kernel to convert asymmetric signed to asymmetric signed and vice-versa */
-class CpuConvertQuantizedSignednessKernel : public ICpuKernel
+class CpuConvertQuantizedSignednessKernel : public ICpuKernel<CpuConvertQuantizedSignednessKernel>
{
public:
CpuConvertQuantizedSignednessKernel() = default;
diff --git a/src/cpu/kernels/CpuCopyKernel.h b/src/cpu/kernels/CpuCopyKernel.h
index 193f38078..c9ef8eba7 100644
--- a/src/cpu/kernels/CpuCopyKernel.h
+++ b/src/cpu/kernels/CpuCopyKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,7 +34,7 @@ namespace cpu
namespace kernels
{
/** Kernel to perform a copy between two tensors */
-class CpuCopyKernel : public ICpuKernel
+class CpuCopyKernel : public ICpuKernel<CpuCopyKernel>
{
public:
CpuCopyKernel() = default;
diff --git a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h
index 1afb6bed4..e23a0fac8 100644
--- a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h
+++ b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,7 +40,7 @@ namespace cpu
namespace kernels
{
/** Interface for the kernel to run a depthwise convolution native on a tensor. */
-class CpuDepthwiseConv2dNativeKernel : public ICpuKernel
+class CpuDepthwiseConv2dNativeKernel : public ICpuKernel<CpuDepthwiseConv2dNativeKernel>
{
public:
CpuDepthwiseConv2dNativeKernel() = default;
diff --git a/src/cpu/kernels/CpuDequantizeKernel.h b/src/cpu/kernels/CpuDequantizeKernel.h
index f515cd36f..cfa991dc7 100644
--- a/src/cpu/kernels/CpuDequantizeKernel.h
+++ b/src/cpu/kernels/CpuDequantizeKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,7 +34,7 @@ namespace cpu
namespace kernels
{
/** Interface for the dequantization layer kernel. */
-class CpuDequantizeKernel : public ICpuKernel
+class CpuDequantizeKernel : public ICpuKernel<CpuDequantizeKernel>
{
public:
CpuDequantizeKernel() = default;
diff --git a/src/cpu/kernels/CpuDirectConv2dKernel.cpp b/src/cpu/kernels/CpuDirectConv2dKernel.cpp
index 68de9803e..f3560156b 100644
--- a/src/cpu/kernels/CpuDirectConv2dKernel.cpp
+++ b/src/cpu/kernels/CpuDirectConv2dKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -52,769 +52,6 @@ namespace kernels
{
namespace
{
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <unsigned int stridex>
-float16x8_t internal_vld1q(const float16_t *in);
-
-template <>
-float16x8_t internal_vld1q<1>(const float16_t *in)
-{
- return vld1q_f16(in);
-}
-
-template <>
-float16x8_t internal_vld1q<2>(const float16_t *in)
-{
- const float16x8x2_t tmp = vld2q_f16(in);
- return tmp.val[0];
-}
-
-template <>
-float16x8_t internal_vld1q<3>(const float16_t *in)
-{
- const float16x8x3_t tmp = vld3q_f16(in);
- return tmp.val[0];
-}
-
-inline float16x8_t internal_vdupq_n(float16_t v)
-{
- return vdupq_n_f16(v);
-}
-
-inline void internal_vst1q(float16_t *p, const float16x8_t &v)
-{
- vst1q_f16(p, v);
-}
-
-float16x8_t internal_vmull(const float16x8_t &x, const float16x8_t &y)
-{
- return vmulq_f16(x, y);
-}
-
-inline float16x8_t internal_vmlal(const float16x8_t &x, const float16x8_t &y, const float16x8_t &z)
-{
- return vaddq_f16(x, vmulq_f16(y, z));
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-template <unsigned int stridex>
-float32x4_t internal_vld1q(const float *in);
-
-template <>
-float32x4_t internal_vld1q<1>(const float *in)
-{
- return vld1q_f32(in);
-}
-
-template <>
-float32x4_t internal_vld1q<2>(const float *in)
-{
- const float32x4x2_t tmp = vld2q_f32(in);
- return tmp.val[0];
-}
-
-template <>
-float32x4_t internal_vld1q<3>(const float *in)
-{
- const float32x4x3_t tmp = vld3q_f32(in);
- return tmp.val[0];
-}
-
-inline float32x4_t internal_vdupq_n(float v)
-{
- return vdupq_n_f32(v);
-}
-
-inline void internal_vst1q(float *p, const float32x4_t &v)
-{
- vst1q_f32(p, v);
-}
-
-float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y)
-{
- return vmulq_f32(x, y);
-}
-
-inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z)
-{
- return vmlaq_f32(x, y, z);
-}
-
-constexpr int small_tensor_size_optim = 8;
-inline bool run_optim_small_tensor_info(const ITensorInfo *t)
-{
- return t->dimension(Window::DimX) <= small_tensor_size_optim && t->dimension(Window::DimY) <= small_tensor_size_optim;
-}
-
-inline bool run_optim_small_tensor(const ITensor *t)
-{
- return run_optim_small_tensor_info(t->info());
-}
-
-// Optimized convolver for 1x1 kernels used only where input width and height are both <= 8
-// For big Z as in Input=7x7x832, this implementation is faster than the general code because it doesn't need to
-// store intermediate results in memory. Temporary results are stored in SIMD registers directly and then written to the output buffer.
-template <unsigned int stridex>
-class convolver_w1x1_i8x8_f32
-{
-public:
- static void convolve(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
- {
- ARM_COMPUTE_ERROR_ON(src->info()->dimension(Window::DimX) > small_tensor_size_optim);
- ARM_COMPUTE_ERROR_ON(src->info()->dimension(Window::DimY) > small_tensor_size_optim);
-
- const int input_stride_x = src->info()->strides_in_bytes().x();
- const int input_stride_y = src->info()->strides_in_bytes().y();
- const int input_stride_z = src->info()->strides_in_bytes().z();
- const int output_stride_y = dst->info()->strides_in_bytes().y();
- const int output_stride_z = dst->info()->strides_in_bytes().z();
- const int kernel_stride_z = weights->info()->strides_in_bytes().z();
- const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
- const int output_h = dst->info()->dimension(1);
- const int range_z = window.z().end() - window.z().start();
- const int kernel_depth = weights->info()->dimension(Window::DimZ);
- const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
- const unsigned int conv_pad_left = conv_info.pad_left();
- const unsigned int conv_pad_top = conv_info.pad_top();
-
- // setup output window for the iterator
- Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX)));
- window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY)));
- window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z));
-
- // setup input window for the iterator
- Window window_in = window;
- // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
- window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Window window_k = calculate_max_window(*weights->info(), Steps(1u));
- Iterator out(dst, window_out);
- Iterator in(src, window_in);
- Iterator k(weights, window_k);
-
- const uint8_t *k_ptr = k.ptr();
-
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y;
- uint8_t *out_ptr = out.ptr();
- int ih = 0;
- int oh = 0;
- std::array<float32x4_t, 8> accum0 = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
- std::array<float32x4_t, 8> accum1 = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
- for(int oz = 0; oz < range_z; ++oz)
- {
- accum0[0] = accum0[1] = accum0[2] = accum0[3] = accum0[4] = accum0[5] = accum0[6] = accum0[7] = vdupq_n_f32(0.f);
- accum1[0] = accum1[1] = accum1[2] = accum1[3] = accum1[4] = accum1[5] = accum1[6] = accum1[7] = vdupq_n_f32(0.f);
- auto p_out_base = out_ptr + oz * output_stride_z;
- for(int p = 0; p < kernel_depth; ++p)
- {
- const auto k_val = reinterpret_cast<const float *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w);
- const auto vk0 = internal_vdupq_n(*k_val);
- for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
- {
- const int offset_xy = ih * input_stride_y;
- auto in_val = reinterpret_cast<const float *>(input_ptr + p * input_stride_z + offset_xy);
- auto v_in0 = internal_vld1q<stridex>(in_val);
- auto v_in1 = internal_vld1q<stridex>(in_val + 4);
- accum0[oh] = vmlaq_f32(accum0[oh], vk0, v_in0);
- accum1[oh] = vmlaq_f32(accum1[oh], vk0, v_in1);
- }
- }
- for(oh = 0; oh < output_h; ++oh)
- {
- auto p_out = reinterpret_cast<float *>(p_out_base + oh * output_stride_y);
- vst1q_f32(p_out, accum0[oh]);
- vst1q_f32(p_out + 4, accum1[oh]);
- }
- }
- },
- in, out);
- }
-};
-
-template <typename T1, typename T2, unsigned int stridex>
-class convolver_1x1
-{
-public:
- static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
- const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
- {
- const int input_stride_x = src->info()->strides_in_bytes().x();
- const int input_stride_y = src->info()->strides_in_bytes().y();
- const int input_stride_z = src->info()->strides_in_bytes().z();
- const int output_stride_y = dst->info()->strides_in_bytes().y();
- const int output_stride_z = dst->info()->strides_in_bytes().z();
- const int kernel_stride_z = weights->info()->strides_in_bytes().z();
- const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
- const int output_w = dst->info()->dimension(0);
- const int output_h = dst->info()->dimension(1);
- const int range_z = window.z().end() - window.z().start();
- const int kernel_depth = weights->info()->dimension(Window::DimZ);
- const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
- const unsigned int conv_pad_left = conv_info.pad_left();
- const unsigned int conv_pad_top = conv_info.pad_top();
-
- // setup output window for the iterator
- Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX)));
- window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY)));
- window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z));
-
- // setup input window for the iterator
- Window window_in = window;
- // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
- window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Window window_k = calculate_max_window(*weights->info(), Steps(1u));
- Iterator out(dst, window_out);
- Iterator in(src, window_in);
- Iterator k(weights, window_k);
-
- const uint8_t *k_ptr = k.ptr();
-
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- /*
- For a detailed explanation of how the algorithm works, refer to template <> class convolver_3x3<1>
- */
- const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y;
- uint8_t *out_ptr = out.ptr();
- int ih = 0;
- int oh = 0;
- for(int oz = 0; oz < range_z; ++oz)
- {
- auto p_out_base = out_ptr + oz * output_stride_z;
- // Step 1
- {
- const auto k_val = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w);
- const auto vk = internal_vdupq_n(*k_val);
- for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
- {
- const int offset_xy = ih * input_stride_y;
- auto in_val = reinterpret_cast<const T1 *>(input_ptr + (0 * input_stride_z + offset_xy));
- auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
- for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)
- {
- internal_vst1q(p_out, internal_vmull(vk, internal_vld1q<stridex>(in_val)));
- }
- }
- }
-
- // Step 2
- for(int p = 1; p < kernel_depth; ++p)
- {
- const auto k_val = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w);
- const auto vk = internal_vdupq_n(*k_val);
- for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
- {
- const int offset_xy = ih * input_stride_y;
- auto in_val = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + offset_xy);
- auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
- for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)
- {
- internal_vst1q(p_out, internal_vmlal(internal_vld1q<1>(p_out), vk, internal_vld1q<stridex>(in_val)));
- }
- }
- }
- }
- },
- in, out);
- }
-};
-
-template <unsigned int stridex>
-float32x4x2_t convolve_5x5(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
- const float *m0, const float *m1, const float *m2, const float *m3, const float *m4);
-
-inline float32x4x3_t load_matrix_hi(const float *const m0, const float *const m1, const float *const m2)
-{
- const float32x4x3_t m00 =
- {
- {
- vld1q_dup_f32(m0),
- vld1q_dup_f32(m1),
- vld1q_dup_f32(m2)
- }
- };
- return m00;
-}
-
-inline float32x4x2_t load_matrix_lo(const float *const m3, const float *const m4)
-{
- const float32x4x2_t m00 =
- {
- {
- vld1q_dup_f32(m3),
- vld1q_dup_f32(m4)
- }
- };
- return m00;
-}
-
-inline float32x4x3_t load_input(const float *const in)
-{
- const float32x4x3_t vin =
- {
- {
- vld1q_f32(in),
- vld1q_f32(in + 4),
- vld1q_f32(in + 8)
- }
- };
- return vin;
-}
-
-template <>
-inline float32x4x2_t convolve_5x5<1>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
- const float *m0, const float *m1, const float *m2, const float *m3, const float *m4)
-{
- const float32x4x3_t vin0 = load_input(in_0);
- const float32x4x3_t vin1 = load_input(in_1);
- const float32x4x3_t vin2 = load_input(in_2);
- const float32x4x3_t vin3 = load_input(in_3);
- const float32x4x3_t vin4 = load_input(in_4);
- const float32x4x3_t m00 = load_matrix_hi(m0, 1 + m0, 2 + m0);
- const float32x4x2_t m01 = load_matrix_lo(3 + m0, 4 + m0);
- const float32x4x3_t m10 = load_matrix_hi(m1, 1 + m1, 2 + m1);
- const float32x4x2_t m11 = load_matrix_lo(3 + m1, 4 + m1);
- const float32x4x3_t m20 = load_matrix_hi(m2, 1 + m2, 2 + m2);
- const float32x4x2_t m21 = load_matrix_lo(3 + m2, 4 + m2);
- const float32x4x3_t m30 = load_matrix_hi(m3, 1 + m3, 2 + m3);
- const float32x4x2_t m31 = load_matrix_lo(3 + m3, 4 + m3);
- const float32x4x3_t m40 = load_matrix_hi(m4, 1 + m4, 2 + m4);
- const float32x4x2_t m41 = load_matrix_lo(3 + m4, 4 + m4);
-
- float32x4x2_t out =
- {
- {
- vmulq_f32(vin0.val[0], m00.val[0]),
- vmulq_f32(vin0.val[1], m00.val[0])
- }
- };
-
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 1), m00.val[1]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 2), m00.val[2]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 3), m01.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vin0.val[1], m01.val[1]);
-
- out.val[0] = vmlaq_f32(out.val[0], vin1.val[0], m10.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 1), m10.val[1]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 2), m10.val[2]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 3), m11.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vin1.val[1], m11.val[1]);
-
- out.val[0] = vmlaq_f32(out.val[0], vin2.val[0], m20.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 1), m20.val[1]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 2), m20.val[2]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 3), m21.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vin2.val[1], m21.val[1]);
-
- out.val[0] = vmlaq_f32(out.val[0], vin3.val[0], m30.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 1), m30.val[1]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 2), m30.val[2]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 3), m31.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vin3.val[1], m31.val[1]);
-
- out.val[0] = vmlaq_f32(out.val[0], vin4.val[0], m40.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 1), m40.val[1]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 2), m40.val[2]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 3), m41.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vin4.val[1], m41.val[1]);
-
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 1), m00.val[1]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 2), m00.val[2]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 3), m01.val[0]);
- out.val[1] = vmlaq_f32(out.val[1], vin0.val[2], m01.val[1]);
-
- out.val[1] = vmlaq_f32(out.val[1], vin1.val[1], m10.val[0]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 1), m10.val[1]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 2), m10.val[2]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 3), m11.val[0]);
- out.val[1] = vmlaq_f32(out.val[1], vin1.val[2], m11.val[1]);
-
- out.val[1] = vmlaq_f32(out.val[1], vin2.val[1], m20.val[0]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 1), m20.val[1]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 2), m20.val[2]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 3), m21.val[0]);
- out.val[1] = vmlaq_f32(out.val[1], vin2.val[2], m21.val[1]);
-
- out.val[1] = vmlaq_f32(out.val[1], vin3.val[1], m30.val[0]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 1), m30.val[1]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 2), m30.val[2]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 3), m31.val[0]);
- out.val[1] = vmlaq_f32(out.val[1], vin3.val[2], m31.val[1]);
-
- out.val[1] = vmlaq_f32(out.val[1], vin4.val[1], m40.val[0]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 1), m40.val[1]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 2), m40.val[2]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 3), m41.val[0]);
- out.val[1] = vmlaq_f32(out.val[1], vin4.val[2], m41.val[1]);
-
- return out;
-}
-
-template <>
-inline float32x4x2_t convolve_5x5<2>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
- const float *m0, const float *m1, const float *m2, const float *m3, const float *m4)
-{
- float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4);
- out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
- out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
- out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
- return out;
-}
-
-template <>
-inline float32x4x2_t convolve_5x5<3>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
- const float *m0, const float *m1, const float *m2, const float *m3, const float *m4)
-{
- float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4);
- out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
- return out;
-}
-
-template <typename T1, typename T2, unsigned int stridex>
-class convolver_3x3
-{
-public:
- static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
- const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
- {
- ARM_COMPUTE_UNUSED(num_elems_read_per_iteration);
- const int input_stride_x = src->info()->strides_in_bytes().x();
- const int input_stride_y = src->info()->strides_in_bytes().y();
- const int input_stride_z = src->info()->strides_in_bytes().z();
- const int output_stride_y = dst->info()->strides_in_bytes().y();
- const int output_stride_z = dst->info()->strides_in_bytes().z();
- const int kernel_stride_x = weights->info()->strides_in_bytes().x();
- const int kernel_stride_y = weights->info()->strides_in_bytes().y();
- const int kernel_stride_z = weights->info()->strides_in_bytes().z();
- const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
- const int output_w = dst->info()->dimension(0);
- const int output_h = dst->info()->dimension(1);
- const int num_planes_z = window.z().end() - window.z().start();
- const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration, stridex);
- const int kernel_depth = weights->info()->dimension(Window::DimZ);
- const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
- const unsigned int conv_pad_left = conv_info.pad_left();
- const unsigned int conv_pad_top = conv_info.pad_top();
-
- // setup output window for the iterator
- Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX)));
- window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY)));
- window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z));
-
- // setup input window for the iterator
- Window window_in = window;
- // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
- window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Window window_k = calculate_max_window(*weights->info(), Steps(1u));
-
- Iterator out(dst, window_out);
- Iterator in(src, window_in);
- Iterator k(weights, window_k);
-
- const uint8_t *k_ptr = k.ptr();
-
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y;
- uint8_t *out_ptr = out.ptr();
- int ih = 0;
- int oh = 0;
- /*
- Each thread executing this kernel computes one or more planes of the output volume.
-
- Let's say the 3rd dimension of the output volume is 32: the first thread will compute the output for Z = [0,7], the second thread for Z = [8,15],
- the third thread for Z = [16,23] and the fourth thread for Z = [24,31].
-
- The algorithm's outer loop iterates over Z, P, Y, X, where P is the depth/3rd dimension of each kernel. This order is not arbitrary: the main benefit is that we set up
- the Neon registers containing the kernel's values only once and then compute each XY using the preloaded registers, as opposed to doing this for every XY value.
-
- The algorithm does not require allocating any additional memory and computes the results directly in-place in two stages:
- 1) Convolve plane 0 with kernel 0 and initialize the corresponding output plane with these values.
- 2) Convolve the remaining planes and accumulate the results in the output's plane which has been initialized in step 1.
- */
- for(int oz = 0; oz < num_planes_z; ++oz)
- {
- const int zoffset = id.z() + oz;
- uint8_t *p_out_base = out_ptr + oz * output_stride_z;
- // Step 1
- {
- const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
- const auto vk_r0 = load_matrix_row(ptr_k_r0);
- const auto vk_r1 = load_matrix_row(ptr_k_r1);
- const auto vk_r2 = load_matrix_row(ptr_k_r2);
- for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
- {
- auto in_top = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y);
- auto in_mid = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y);
- auto in_low = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y);
- auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
- for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
- in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
- {
- convolve_3x3<false>(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex);
- }
- }
- }
- // Step 2
- for(int p = 1; p < kernel_depth; ++p)
- {
- const uint8_t *ptr_k_base = k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w;
- const uint8_t *input_base = input_ptr + p * input_stride_z;
- const auto ptr_k_r0 = reinterpret_cast<const T1 *>(ptr_k_base);
- const auto ptr_k_r1 = reinterpret_cast<const T1 *>(ptr_k_base + kernel_stride_y);
- const auto ptr_k_r2 = reinterpret_cast<const T1 *>(ptr_k_base + kernel_stride_y * 2);
- const auto vk_r0 = load_matrix_row(ptr_k_r0);
- const auto vk_r1 = load_matrix_row(ptr_k_r1);
- const auto vk_r2 = load_matrix_row(ptr_k_r2);
- for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
- {
- auto in_top = reinterpret_cast<const T1 *>(input_base + (ih + 0) * input_stride_y);
- auto in_mid = reinterpret_cast<const T1 *>(input_base + (ih + 1) * input_stride_y);
- auto in_low = reinterpret_cast<const T1 *>(input_base + (ih + 2) * input_stride_y);
- auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
- for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
- in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
- {
- convolve_3x3<true>(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex);
- }
- }
- }
- }
- },
- in, out);
- }
-};
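For reference, the two-stage in-place accumulation described in the comment above can be summarised as a small standalone sketch in plain scalar C++. The function name and parameters (convolve_planes, C/H/W/K) are illustrative assumptions, not the library's API; the point is only the init-then-accumulate structure.

#include <cstddef>
#include <vector>

// Direct convolution of C input planes into one output plane (valid padding,
// stride 1, square K x K kernel). Stage 1: plane 0 writes the output directly.
// Stage 2: planes 1..C-1 accumulate into the already written output, so no
// scratch buffer or separate zero-fill pass is needed.
void convolve_planes(const std::vector<float> &in, int C, int H, int W,
                     const std::vector<float> &wei, int K,
                     std::vector<float> &out)
{
    const int out_h = H - K + 1;
    const int out_w = W - K + 1;
    out.resize(static_cast<std::size_t>(out_h) * out_w);

    for(int c = 0; c < C; ++c)
    {
        const float *in_c  = &in[static_cast<std::size_t>(c) * H * W];
        const float *wei_c = &wei[static_cast<std::size_t>(c) * K * K];
        for(int y = 0; y < out_h; ++y)
        {
            for(int x = 0; x < out_w; ++x)
            {
                float acc = 0.f;
                for(int ky = 0; ky < K; ++ky)
                {
                    for(int kx = 0; kx < K; ++kx)
                    {
                        acc += in_c[(y + ky) * W + (x + kx)] * wei_c[ky * K + kx];
                    }
                }
                float &dst = out[static_cast<std::size_t>(y) * out_w + x];
                dst = (c == 0) ? acc : dst + acc; // initialize on plane 0, accumulate afterwards
            }
        }
    }
}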
-
-template <typename T1, typename T2, unsigned int stridex>
-class convolver_5x5
-{
-public:
- static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
- const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
- {
- ARM_COMPUTE_UNUSED(num_elems_read_per_iteration);
- const int input_stride_x = src->info()->strides_in_bytes().x();
- const int input_stride_y = src->info()->strides_in_bytes().y();
- const int input_stride_z = src->info()->strides_in_bytes().z();
- const int output_stride_y = dst->info()->strides_in_bytes().y();
- const int output_stride_z = dst->info()->strides_in_bytes().z();
- const int kernel_stride_x = weights->info()->strides_in_bytes().x();
- const int kernel_stride_y = weights->info()->strides_in_bytes().y();
- const int kernel_stride_z = weights->info()->strides_in_bytes().z();
- const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
- const int output_w = dst->info()->dimension(0);
- const int output_h = dst->info()->dimension(1);
- const int num_planes_z = window.z().end() - window.z().start();
- const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration, stridex);
- const int kernel_depth = weights->info()->dimension(Window::DimZ);
- const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
- const unsigned int conv_pad_left = conv_info.pad_left();
- const unsigned int conv_pad_top = conv_info.pad_top();
-
- // setup output window for the iterator
- Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX)));
- window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY)));
- window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z));
-
- // setup input window for the iterator
- Window window_in = window;
- // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
- window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Window window_k = calculate_max_window(*weights->info(), Steps(1u));
-
- Iterator out(dst, window_out);
- Iterator in(src, window_in);
- Iterator k(weights, window_k);
-
- const uint8_t *k_ptr = k.ptr();
-
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y;
- uint8_t *out_ptr = out.ptr();
- int ih = 0;
- int oh = 0;
- for(int oz = 0; oz < num_planes_z; ++oz)
- {
- const int zoffset = id.z() + oz;
- uint8_t *p_out_base = out_ptr + oz * output_stride_z;
- // Step 1
- {
- const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r3 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r4 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 4 * kernel_stride_y + 0 * kernel_stride_x);
- for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
- {
- auto in_0 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y);
- auto in_1 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y);
- auto in_2 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y);
- auto in_3 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 3) * input_stride_y);
- auto in_4 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 4) * input_stride_y);
- auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
- for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
- in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration)
- {
- auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4);
- store_results<stridex>(p_out, vres);
- }
- }
- }
- // Step 2
- for(int p = 1; p < kernel_depth; ++p)
- {
- const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r3 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r4 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 4 * kernel_stride_y + 0 * kernel_stride_x);
-
- for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
- {
- auto in_0 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 0) * input_stride_y);
- auto in_1 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 1) * input_stride_y);
- auto in_2 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 2) * input_stride_y);
- auto in_3 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 3) * input_stride_y);
- auto in_4 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 4) * input_stride_y);
- auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
- for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
- in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration)
- {
- auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4);
- accumulate_results<stridex>(p_out, vres);
- }
- }
- }
- }
- },
- in, out);
- }
-};
-
-template <typename T1, typename T2>
-inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
- const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
-{
- const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
- switch(conv_stride_x)
- {
- case 1:
- convolver_1x1<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- case 2:
- convolver_1x1<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- case 3:
- convolver_1x1<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-
-template <>
-inline void convolve_1x1<float, float>(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
- const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
-{
- const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
- if(run_optim_small_tensor(src))
- {
- switch(conv_stride_x)
- {
- case 1:
- convolver_w1x1_i8x8_f32<1>::convolve(window, src, weights, dst, conv_info);
- break;
- case 2:
- convolver_w1x1_i8x8_f32<2>::convolve(window, src, weights, dst, conv_info);
- break;
- case 3:
- convolver_w1x1_i8x8_f32<3>::convolve(window, src, weights, dst, conv_info);
- break;
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- }
- }
- else
- {
- switch(conv_stride_x)
- {
- case 1:
- convolver_1x1<float, float, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- case 2:
- convolver_1x1<float, float, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- case 3:
- convolver_1x1<float, float, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- }
- }
-}
-
-template <typename T1, typename T2>
-inline void convolve_3x3(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
- const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
-{
- const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
- switch(conv_stride_x)
- {
- case 1:
- convolver_3x3<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- case 2:
- convolver_3x3<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- case 3:
- convolver_3x3<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-
-template <typename T1, typename T2>
-inline void convolve_5x5(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
- const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
-{
- const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
- switch(conv_stride_x)
- {
- case 1:
- convolver_5x5<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- case 2:
- convolver_5x5<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- case 3:
- convolver_5x5<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
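The removed convolve_1x1 / convolve_3x3 / convolve_5x5 dispatchers above all follow one pattern: a stride known only at run time is turned into a compile-time template argument through a switch, so each stride value gets its own specialised inner loop. A minimal sketch of that dispatch pattern, with hypothetical names (run_convolution, dispatch_convolution) standing in for the real kernels:

#include <stdexcept>

// Kernel body specialised on the stride; StrideX is a compile-time constant
// here, so the compiler can unroll and vectorise the inner loop per stride.
template <unsigned int StrideX>
void run_convolution(/* tensors, window, conv_info, ... */)
{
}

// The runtime stride is mapped onto a template argument exactly once.
inline void dispatch_convolution(unsigned int conv_stride_x)
{
    switch(conv_stride_x)
    {
        case 1:
            run_convolution<1>();
            break;
        case 2:
            run_convolution<2>();
            break;
        case 3:
            run_convolution<3>();
            break;
        default:
            throw std::runtime_error("Stride not implemented");
    }
}

The same switch shape was repeated once per kernel size, which is part of what the generic convolve_nchw path introduced later in this patch does away with.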
-
Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
@@ -828,13 +65,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co
const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported.");
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(channel_idx) != src->dimension(channel_idx));
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx));
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::NHWC && src->data_type() != DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(width_idx) > 3) && (src->data_type() == DataType::F16));
-
+ ARM_COMPUTE_UNUSED(width_idx);
// Checks performed when output is configured
if(dst->total_size() != 0)
{
@@ -849,128 +84,16 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info, unsigned int &num_weight_elems_read_per_row,
- unsigned int &num_elems_read_per_iteration, unsigned int &num_elems_written_per_iteration, BorderSize &border_size)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst)
{
ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
-
- const DataLayout data_layout = src->data_layout();
- const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-
- // Calculate right and bottom border
- unsigned int kernel_size = weights->dimension(width_idx);
- const int conv_stride_x = std::get<0>(conv_info.stride());
- const int conv_stride_y = std::get<1>(conv_info.stride());
- const int input_width = src->dimension(width_idx);
+ ARM_COMPUTE_UNUSED(src);
Window win{};
bool window_changed = false;
- if(data_layout == DataLayout::NCHW)
- {
- switch(kernel_size)
- {
- case 1:
- {
- switch(src->data_type())
- {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- num_elems_written_per_iteration = 8;
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::F32:
- if(run_optim_small_tensor_info(src))
- {
- num_elems_written_per_iteration = 8;
- }
- else
- {
- num_elems_written_per_iteration = 4;
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Data type not supported.");
- break;
- }
- num_weight_elems_read_per_row = kernel_size;
- num_elems_read_per_iteration = conv_stride_x * num_elems_written_per_iteration;
- break;
- }
- case 3:
- switch(src->data_type())
- {
- case DataType::F32:
- num_weight_elems_read_per_row = 4 + kernel_size - 1;
- num_elems_read_per_iteration = 12;
- num_elems_written_per_iteration = 16 >> conv_stride_x;
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- num_weight_elems_read_per_row = 8 + kernel_size - 1;
- num_elems_read_per_iteration = 24;
- num_elems_written_per_iteration = 32 >> conv_stride_x;
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- ARM_COMPUTE_ERROR("Data type not supported.");
- break;
- }
- break;
- case 5:
- {
- switch(src->data_type())
- {
- case DataType::F32:
- num_weight_elems_read_per_row = 4 + kernel_size - 1;
- num_elems_read_per_iteration = 12;
- num_elems_written_per_iteration = 16 >> conv_stride_x;
- break;
- default:
- ARM_COMPUTE_ERROR("Data type not supported.");
- break;
- }
- }
- break;
- default:
- {
- ARM_COMPUTE_ERROR("Not implemented");
- break;
- }
- }
-
- // Calculate right pad
- int start_x = kernel_size / 2 - static_cast<int>(conv_info.pad_left());
- int end_x = ceil_to_multiple(static_cast<int>(dst->dimension(0)), num_elems_written_per_iteration) * conv_stride_x;
- int upper_bound_w = ceil_to_multiple(start_x + end_x, num_elems_read_per_iteration) - input_width;
-
- // Calculate border
- const unsigned int conv_pad_left = conv_info.pad_left();
- const unsigned int conv_pad_top = conv_info.pad_top();
- const unsigned int conv_pad_right = std::max(upper_bound_w, 0);
- const unsigned int conv_pad_bottom = conv_info.pad_bottom();
-
- border_size.left = conv_pad_left;
- border_size.top = conv_pad_top;
- border_size.right = conv_pad_right;
- border_size.bottom = conv_pad_bottom;
-
- // Configure window
- win = calculate_max_window(*dst, Steps(num_elems_written_per_iteration));
-
- AccessWindowRectangle input_access(src, -conv_pad_left, -conv_pad_top,
- num_elems_read_per_iteration, kernel_size,
- conv_stride_x, conv_stride_y);
- AccessWindowStatic weights_access(weights, 0, 0, num_weight_elems_read_per_row, kernel_size);
- AccessWindowHorizontal output_access(dst, 0, num_elems_written_per_iteration);
- window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape()));
- }
- else
- {
- // Configure window NHWC without any padding
- win = calculate_max_window(*dst, Steps());
- }
+ // Configure window without any padding
+ win = calculate_max_window(*dst, Steps());
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
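For context on what the removed NCHW window logic above was computing: the right border was sized so that a full vector read issued for the last processed output column could not run past the end of the input row. A small sketch of that calculation, assuming non-negative intermediate values and a hypothetical ceil_to_multiple_int helper in place of the library's ceil_to_multiple:

#include <algorithm>

// Round 'value' up to the next multiple of 'divisor' (non-negative inputs assumed).
inline int ceil_to_multiple_int(int value, int divisor)
{
    return ((value + divisor - 1) / divisor) * divisor;
}

// Extra right padding needed so vector reads of 'num_elems_read' elements stay in bounds.
inline int right_border(int kernel_size, int pad_left, int dst_width, int conv_stride_x,
                        int num_elems_written, int num_elems_read, int input_width)
{
    const int start_x       = kernel_size / 2 - pad_left;
    const int end_x         = ceil_to_multiple_int(dst_width, num_elems_written) * conv_stride_x;
    const int upper_bound_w = ceil_to_multiple_int(start_x + end_x, num_elems_read) - input_width;
    return std::max(upper_bound_w, 0);
}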
@@ -1208,9 +331,112 @@ void CpuDirectConv2dKernel::convolve_nhwc(const Window &window, const ITensor *s
out);
}
-BorderSize CpuDirectConv2dKernel::border_size() const
+template <typename T>
+void CpuDirectConv2dKernel::convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst)
{
- return _border_size;
+ // Declare useful types
+ using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
+ using vector_type = typename vtype::type;
+ using tag_type = typename vtype::tag_type;
+
+ // Scalar quantities
+ const int element_size = src->info()->element_size();
+ const int input_stride_w = src->info()->strides_in_bytes()[0] / element_size;
+ const int input_stride_h = src->info()->strides_in_bytes()[1] / element_size;
+ const int input_stride_c = src->info()->strides_in_bytes()[2] / element_size;
+ const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size;
+
+ const int input_dim_w = src->info()->dimension(0);
+ const int input_dim_h = src->info()->dimension(1);
+
+ const int output_stride_c = dst->info()->strides_in_bytes()[2];
+
+ const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().x() / element_size;
+ const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().y() / element_size;
+ const unsigned int kernel_stride_c = weights->info()->strides_in_bytes().z() / element_size;
+
+ const int kernel_dim_w = weights->info()->dimension(0);
+ const int kernel_dim_h = weights->info()->dimension(1);
+
+ const int conv_pad_top = _conv_info.pad_top();
+ const int conv_pad_left = _conv_info.pad_left();
+ const int conv_stride_w = std::get<0>(_conv_info.stride());
+ const int conv_stride_h = std::get<1>(_conv_info.stride());
+
+ // Setup input window for the output iterator
+ Window window_out = window;
+ window_out.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+ // Setup input window for the weights iterator
+ Window window_w = calculate_max_window(*weights->info(), Steps());
+ window_w.set(Window::DimX, Window::Dimension(0, 1, 1));
+ window_w.set(Window::DimY, Window::Dimension(0, 1, 1));
+ window_w.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+ Iterator out(dst, window_out);
+ Iterator wei(weights, window_w);
+
+ constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
+
+ execute_window_loop(window_out, [&](const Coordinates & id)
+ {
+ // We are computing the theoretical starting input points
+ const int in_w_start_t = static_cast<int>(id.x()) * conv_stride_w - conv_pad_left;
+ const int in_h_start_t = static_cast<int>(id.y()) * conv_stride_h - conv_pad_top;
+ const int in_w_end_t = in_w_start_t + kernel_dim_w;
+ const int in_h_end_t = in_h_start_t + kernel_dim_h;
+
+ // We are computing the valid initial and ending input points by checking the borders
+ const int in_w_start = std::max(in_w_start_t, 0);
+ const int in_h_start = std::max(in_h_start_t, 0);
+ const int in_w_end = std::min(in_w_end_t, input_dim_w);
+ const int in_h_end = std::min(in_h_end_t, input_dim_h);
+
+ // We use the input points to select the valid weight points to use
+ const int wei_w_start = in_w_start - in_w_start_t;
+ const int wei_h_start = in_h_start - in_h_start_t;
+ const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
+
+ const int index_c_end = weights->info()->dimension(2);
+ const T *const in_ptr_start = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n;
+ execute_window_loop(window_w, [&](const Coordinates & id_w)
+ {
+ const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
+ uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
+ T out_temp = static_cast<T>(0);
+
+ for(int index_wei_c = 0, index_in_c = 0; index_wei_c < index_c_end; ++index_wei_c, ++index_in_c)
+ {
+ const T *const in_ptr_row_0 = in_ptr_start + index_in_c * input_stride_c;
+ const T *const weights_ptr_row_0 = weights_ptr_start + index_wei_c * kernel_stride_c;
+ for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h)
+ {
+ const T *in_ptr_row = in_ptr_row_0 + index_in_h * input_stride_h;
+ const T *weights_ptr_row = weights_ptr_row_0 + index_wei_h * kernel_stride_h;
+ int index_w = in_w_start;
+ int index_wei_w = wei_w_start;
+ vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
+ for(; index_w <= ((in_w_end - num_elems_read_per_iteration)); index_w += num_elems_read_per_iteration, index_wei_w += num_elems_read_per_iteration)
+ {
+ const auto src_vec = wrapper::vloadq(in_ptr_row + index_w * input_stride_w);
+ const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wei_w * kernel_stride_w);
+ out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
+ }
+ out_temp += vreduce(out_temp_vec);
+ for(; index_w < in_w_end; ++index_w, ++index_wei_w)
+ {
+ const auto src_val = *(in_ptr_row + index_w * input_stride_w);
+ const auto w_val = *(weights_ptr_row + index_wei_w * kernel_stride_w);
+ out_temp += src_val * w_val;
+ }
+ }
+ }
+ *(reinterpret_cast<T *>(out_ptr)) = out_temp;
+
+ },
+ wei);
+ },
+ out);
}
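The new convolve_nchw above replaces implicit padding with explicit clamping: for each output point the theoretical input window is intersected with the tensor borders, and the surviving span is reduced with a vectorised loop followed by a scalar tail. A small hedged sketch of that clamp-then-reduce idea for a single row, using plain AArch64 Neon intrinsics (row_dot_clamped and its parameters are illustrative, not part of the library):

#include <arm_neon.h>
#include <algorithm>

// Dot product of one kernel row against one input row, clamped to the valid
// input range [0, input_w). 'in_x_start_t' may be negative when the output
// point lies in the (logical) left padding region. AArch64 only: vaddvq_f32
// is an AArch64 intrinsic.
inline float row_dot_clamped(const float *in_row, int input_w,
                             const float *wei_row, int kernel_w,
                             int in_x_start_t)
{
    const int in_x_start = std::max(in_x_start_t, 0);
    const int in_x_end   = std::min(in_x_start_t + kernel_w, input_w);
    int       wei_x      = in_x_start - in_x_start_t; // skip weights that fall into the padding

    float       acc   = 0.f;
    float32x4_t acc_v = vdupq_n_f32(0.f);

    int x = in_x_start;
    for(; x <= in_x_end - 4; x += 4, wei_x += 4) // vector body, 4 floats at a time
    {
        acc_v = vmlaq_f32(acc_v, vld1q_f32(in_row + x), vld1q_f32(wei_row + wei_x));
    }
    acc += vaddvq_f32(acc_v); // horizontal reduction of the vector accumulator
    for(; x < in_x_end; ++x, ++wei_x) // scalar tail for the leftover elements
    {
        acc += in_row[x] * wei_row[wei_x];
    }
    return acc;
}

Clamping the window instead of reading from padded borders is what allows the kernel to drop border_size() and the padding-dependent window configuration elsewhere in this patch.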
void CpuDirectConv2dKernel::configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info)
@@ -1221,19 +447,6 @@ void CpuDirectConv2dKernel::configure(ITensorInfo *src, ITensorInfo *weights, IT
_data_layout = src->data_layout();
_kernel_size = weights->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH));
- const unsigned int conv_pad_left = conv_info.pad_left();
- const unsigned int conv_pad_top = conv_info.pad_top();
- const unsigned int conv_pad_right = conv_info.pad_right();
- const unsigned int conv_pad_bottom = conv_info.pad_bottom();
- if(_data_layout == DataLayout::NCHW)
- {
- _border_size = BorderSize(conv_pad_top, conv_pad_right, conv_pad_bottom, conv_pad_left);
- }
- else
- {
- _border_size = BorderSize(0);
- }
-
// Get convolved dimensions
TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
@@ -1246,27 +459,16 @@ void CpuDirectConv2dKernel::configure(ITensorInfo *src, ITensorInfo *weights, IT
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, dst, conv_info));
// Configure kernel window
- auto win_config = validate_and_configure_window(src, weights, dst, conv_info, _num_weight_elems_read_per_row,
- _num_elems_read_per_iteration, _num_elems_written_per_iteration, _border_size);
+ auto win_config = validate_and_configure_window(src, dst);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICpuKernel::configure(win_config.second);
}
Status CpuDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info)
{
- unsigned int num_weight_elems_read_per_row = 0;
- unsigned int num_elems_read_per_iteration = 0;
- unsigned int num_elems_written_per_iteration = 0;
- BorderSize border_size = {};
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, dst, conv_info));
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(),
- weights->clone().get(),
- dst->clone().get(),
- conv_info,
- num_weight_elems_read_per_row,
- num_elems_read_per_iteration,
- num_elems_written_per_iteration,
- border_size)
+ dst->clone().get())
.first);
return Status{};
@@ -1278,69 +480,29 @@ void CpuDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, c
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
- auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
- const int kernel_size = weights->info()->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH));
+ auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
if(_data_layout == DataLayout::NCHW)
{
- switch(kernel_size)
+ switch(src->info()->data_type())
{
- case 1:
- {
- switch(src->info()->data_type())
- {
- case DataType::F32:
- convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info);
- break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- convolve_1x1<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info);
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- ARM_COMPUTE_ERROR("Data type not supported");
- break;
- }
- break;
- }
- case 3:
+ case DataType::F16:
{
- switch(src->info()->data_type())
- {
- case DataType::F32:
- convolve_3x3<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info);
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- convolve_3x3<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info);
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- ARM_COMPUTE_ERROR("Data type not supported");
- break;
- }
+ convolve_nchw<float16_t>(window, src, weights, dst);
break;
}
- case 5:
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ case DataType::F32:
{
- switch(src->info()->data_type())
- {
- case DataType::F32:
- convolve_5x5<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info);
- break;
- default:
- ARM_COMPUTE_ERROR("Data type not supported");
- break;
- }
+ convolve_nchw<float>(window, src, weights, dst);
break;
}
default:
- {
- ARM_COMPUTE_ERROR("Only kernel sizes 1x1, 3x3 and 5x5 are supported.");
+ ARM_COMPUTE_ERROR("Data type not supported");
break;
- }
}
}
else
diff --git a/src/cpu/kernels/CpuDirectConv2dKernel.h b/src/cpu/kernels/CpuDirectConv2dKernel.h
index 3ba7f7ed5..6ec4d4ee0 100644
--- a/src/cpu/kernels/CpuDirectConv2dKernel.h
+++ b/src/cpu/kernels/CpuDirectConv2dKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,7 +34,7 @@ namespace cpu
namespace kernels
{
/** Interface for the kernel to perform Direct Convolution Layer. */
-class CpuDirectConv2dKernel : public ICpuKernel
+class CpuDirectConv2dKernel : public ICpuKernel<CpuDirectConv2dKernel>
{
public:
CpuDirectConv2dKernel() = default;
@@ -66,7 +66,6 @@ public:
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
- BorderSize border_size() const override;
private:
/* Template function for optimized convolution NHWC */
@@ -77,12 +76,12 @@ private:
template <typename T>
void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst);
+ /* Template function for convolution NCHW */
+ template <typename T>
+ void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst);
+
PadStrideInfo _conv_info{};
- BorderSize _border_size{};
unsigned int _kernel_size{ 0 };
- unsigned int _num_weight_elems_read_per_row{ 0 };
- unsigned int _num_elems_read_per_iteration{ 0 };
- unsigned int _num_elems_written_per_iteration{ 0 };
DataLayout _data_layout{ DataLayout::UNKNOWN };
};
} // namespace kernels
diff --git a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h
index a68936bba..d3ef17b7c 100644
--- a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h
+++ b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,7 +40,7 @@ namespace kernels
* @note For quantized computations (i.e. @p src of S32 type) the output data type for auto-initialization must be passed as part
* of the @ref DirectConvolutionLayerOutputStageKernelInfo.
*/
-class CpuDirectConv2dOutputStageKernel : public ICpuKernel
+class CpuDirectConv2dOutputStageKernel : public ICpuKernel<CpuDirectConv2dOutputStageKernel>
{
public:
CpuDirectConv2dOutputStageKernel() = default;
diff --git a/src/cpu/kernels/CpuDirectConv3dKernel.cpp b/src/cpu/kernels/CpuDirectConv3dKernel.cpp
index 36764a625..22c60cd99 100644
--- a/src/cpu/kernels/CpuDirectConv3dKernel.cpp
+++ b/src/cpu/kernels/CpuDirectConv3dKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,64 +49,32 @@ namespace kernels
{
namespace
{
-struct DirectConv3dSelectorData
-{
- DataType dt;
- const CPUInfo &ci;
-};
-using DirectConv3dSelectorPtr = std::add_pointer<bool(const DirectConv3dSelectorData &data)>::type;
-using DirectConv3dKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, const ITensor *, ITensor *, const Conv3dInfo &, const Window &)>::type;
-struct DirectConv3dKernel
-{
- const char *name;
- const DirectConv3dSelectorPtr is_selected;
- DirectConv3dKernelPtr ukernel;
-};
-
-static const DirectConv3dKernel available_kernels[] =
+static const std::vector<CpuDirectConv3dKernel::DirectConv3dKernel> available_kernels =
{
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
{
"neon_fp16_directconv3d",
- [](const DirectConv3dSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_fp16(); },
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; },
REGISTER_FP16_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc<float16_t>)
},
#endif /* !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
{
"neon_fp32_directconv3d",
- [](const DirectConv3dSelectorData & data) { return data.dt == DataType::F32; },
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::F32; },
REGISTER_FP32_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc<float>)
},
{
"neon_qasymm8_directconv3d",
- [](const DirectConv3dSelectorData & data) { return data.dt == DataType::QASYMM8; },
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8; },
REGISTER_QASYMM8_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc<uint8_t>)
},
{
"neon_qasymm8_signed_directconv3d",
- [](const DirectConv3dSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc<int8_t>)
}
};
-/** Micro-kernel selector
- *
- * @param[in] data Selection data passed to help pick the appropriate micro-kernel
- *
- * @return A matching micro-kernel else nullptr
- */
-const DirectConv3dKernel *get_implementation(const DirectConv3dSelectorData &data)
-{
- for(const auto &uk : available_kernels)
- {
- if(uk.is_selected(data))
- {
- return &uk;
- }
- }
- return nullptr;
-}
-
Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
@@ -117,7 +85,8 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation != Size3D(1U, 1U, 1U));
- const auto *uk = get_implementation(DirectConv3dSelectorData{ src0->data_type(), CPUInfo::get() });
+ const auto *uk = CpuDirectConv3dKernel::get_implementation(DataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa() });
+
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
const DataLayout data_layout = src0->data_layout();
@@ -161,7 +130,8 @@ void CpuDirectConv3dKernel::configure(const ITensorInfo *src0, const ITensorInfo
ARM_COMPUTE_UNUSED(src2);
ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
- const auto *uk = get_implementation(DirectConv3dSelectorData{ src0->data_type(), CPUInfo::get() });
+ const auto *uk = CpuDirectConv3dKernel::get_implementation(DataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa() });
+
ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
_conv_info = conv_info;
@@ -210,6 +180,12 @@ const char *CpuDirectConv3dKernel::name() const
{
return _name.c_str();
}
+
+const std::vector<CpuDirectConv3dKernel::DirectConv3dKernel> &CpuDirectConv3dKernel::get_available_kernels()
+{
+ return available_kernels;
+}
+
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
\ No newline at end of file
diff --git a/src/cpu/kernels/CpuDirectConv3dKernel.h b/src/cpu/kernels/CpuDirectConv3dKernel.h
index ff3b30f8a..688f368b9 100644
--- a/src/cpu/kernels/CpuDirectConv3dKernel.h
+++ b/src/cpu/kernels/CpuDirectConv3dKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,6 +27,7 @@
#include "arm_compute/runtime/FunctionDescriptors.h"
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
+
namespace arm_compute
{
namespace cpu
@@ -34,8 +35,12 @@ namespace cpu
namespace kernels
{
/** Interface for the kernel to perform 3D Direct Convolution Layer. */
-class CpuDirectConv3dKernel : public ICpuKernel
+class CpuDirectConv3dKernel : public ICpuKernel<CpuDirectConv3dKernel>
{
+private:
+ /* Template function for convolution 3d NDHWC */
+ using DirectConv3dKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, const ITensor *, ITensor *, const Conv3dInfo &, const Window &)>::type;
+
public:
CpuDirectConv3dKernel() = default;
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConv3dKernel);
@@ -71,14 +76,21 @@ public:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
-private:
- /* Template function for convolution 3d NDHWC */
- using DirectConv3dKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, const ITensor *, ITensor *, const Conv3dInfo &, const Window &)>::type;
+ struct DirectConv3dKernel
+ {
+ const char *name;
+ const DataTypeISASelectorPtr is_selected;
+ DirectConv3dKernelPtr ukernel;
+ };
+
+ static const std::vector<DirectConv3dKernel> &get_available_kernels();
+private:
Conv3dInfo _conv_info{};
DirectConv3dKernelPtr _run_method{ nullptr };
std::string _name{};
};
+
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
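From here on, the patch applies the same refactor to several kernels: the file-local get_implementation helpers are removed, each kernel derives from ICpuKernel<KernelType> and publishes a table of micro-kernel entries (name, selector predicate, function pointer) via get_available_kernels(), and selection picks the first entry whose predicate matches the data type and ISA. A generic hedged sketch of that selection pattern; SelectorData, UKernelEntry and get_implementation below are illustrative stand-ins, not the library's types:

#include <functional>
#include <string>
#include <vector>

// Simplified view of what a selector predicate can inspect.
struct SelectorData
{
    std::string data_type; // e.g. "F32", "F16", "QASYMM8"
    bool        has_fp16;
    bool        has_sve;
};

using UKernelFn = void (*)(/* tensors, window, ... */);

// One row of the micro-kernel table.
struct UKernelEntry
{
    const char                               *name;
    std::function<bool(const SelectorData &)> is_selected;
    UKernelFn                                 ukernel;
};

// First matching entry wins, so more specialised kernels are listed first.
inline const UKernelEntry *get_implementation(const std::vector<UKernelEntry> &table,
                                              const SelectorData              &data)
{
    for(const auto &entry : table)
    {
        if(entry.is_selected(data))
        {
            return &entry;
        }
    }
    return nullptr;
}

Ordering matters here: SVE/SVE2 entries appear before the generic Neon fallbacks in the tables below, so they are chosen first when the detected ISA supports them.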
diff --git a/src/cpu/kernels/CpuElementwiseKernel.cpp b/src/cpu/kernels/CpuElementwiseKernel.cpp
index 91de24b85..53179ae95 100644
--- a/src/cpu/kernels/CpuElementwiseKernel.cpp
+++ b/src/cpu/kernels/CpuElementwiseKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,10 +28,7 @@
#include "src/core/common/Registrars.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-#include "src/cpu/kernels/elementwise/neon/elementwise_list.h"
-#include "src/cpu/kernels/elementwise/neon/elementwise_quantized_list.h"
-#include "src/cpu/kernels/elementwise/sve/elementwise_list.h"
-#include "src/cpu/kernels/elementwise/sve/elementwise_quantized_list.h"
+#include "src/cpu/kernels/elementwise_binary/list.h"
#include <arm_neon.h>
@@ -68,76 +65,73 @@ CpuElementwiseKernel::UKernelInfo configure_arithm_func(const ITensorInfo *src0,
{
"sve_fp32_elementwise",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); },
- REGISTER_FP32_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, float32_t>))
+ REGISTER_FP32_SVE((arm_compute::cpu::sve_fp32_elementwise_binary<op>))
},
{
"sve_s32_elementwise",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::S32 && data.ci.has_sve(); },
- REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, int32_t>))
+ REGISTER_INTEGER_SVE((arm_compute::cpu::sve_s32_elementwise_binary<op>))
},
{
"sve_s16_elementwise",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::S16 && data.ci.has_sve(); },
- REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, int16_t>))
+ REGISTER_INTEGER_SVE((arm_compute::cpu::sve_s16_elementwise_binary<op>))
+ },
+ {
+ "sve_fp16_elementwise",
+ [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); },
+ REGISTER_FP16_SVE((arm_compute::cpu::sve_fp16_elementwise_binary<op>))
},
#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
#if defined(ARM_COMPUTE_ENABLE_NEON)
{
"neon_fp32_elementwise",
+
[](const ElementwiseSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float, 4>>))
+ REGISTER_FP32_NEON((arm_compute::cpu::neon_fp32_elementwise_binary<op>))
},
{
"neon_s32_elementwise",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::S32; },
- REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int32_t, 4>>))
+ REGISTER_INTEGER_NEON((arm_compute::cpu::neon_s32_elementwise_binary<op>))
+ },
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+ {
+ "neon_fp16_elementwise",
+ [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_fp16(); },
+ REGISTER_FP16_NEON((arm_compute::cpu::neon_fp16_elementwise_binary<op>))
+ },
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
+ {
+ "neon_s16_elementwise",
+ [](const ElementwiseSelectorData & data) { return data.dt == DataType::S16; },
+ REGISTER_INTEGER_NEON((arm_compute::cpu::neon_s16_elementwise_binary<op>))
},
#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
#if defined(ARM_COMPUTE_ENABLE_SVE2)
{
"sve2_qu8_elementwise",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve2(); },
- REGISTER_QASYMM8_SVE((arm_compute::cpu::elementwise_arithmetic_quantized_op<op, uint8_t>))
+ REGISTER_QASYMM8_SVE2((arm_compute::cpu::sve2_qasymm8_elementwise_binary<op>))
},
{
"sve2_qs8_elementwise",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve2(); },
- REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::elementwise_arithmetic_quantized_op<op, int8_t>))
+ REGISTER_QASYMM8_SIGNED_SVE2((arm_compute::cpu::sve2_qasymm8_signed_elementwise_binary<op>))
},
#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
#if defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE)
{
"neon_qu8_elementwise",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8; },
- REGISTER_QASYMM8_NEON((arm_compute::cpu::elementwise_arithm_op_quantized<op>))
+ REGISTER_QASYMM8_NEON((arm_compute::cpu::neon_qasymm8_elementwise_binary<op>))
},
{
"neon_qs8_elementwise",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
- REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::elementwise_arithm_op_quantized_signed<op>))
+ REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::neon_qasymm8_signed_elementwise_binary<op>))
},
-#endif /* defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) */
-#if defined(ARM_COMPUTE_ENABLE_SVE)
- {
- "sve_fp16_elementwise",
- [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); },
- REGISTER_FP16_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, float16_t>))
- },
-#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
-#if defined(ARM_COMPUTE_ENABLE_NEON)
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- {
- "neon_fp16_elementwise",
- [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_fp16(); },
- REGISTER_FP16_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float16_t, 8>>))
- },
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
- {
- "neon_s16_elementwise",
- [](const ElementwiseSelectorData & data) { return data.dt == DataType::S16; },
- REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int16_t, 8>>))
- },
-#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
+#endif /* defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) */
};
for(const auto &uk : kernels)
@@ -161,82 +155,82 @@ CpuElementwiseKernel::UKernelInfo configure_comp_func(const ITensorInfo *src0, c
{
"sve_u8_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::U8 && data.ci.has_sve(); },
- REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op<op, uint8_t>))
+ REGISTER_INTEGER_SVE(arm_compute::cpu::sve_u8_comparison_elementwise_binary<op>)
},
{
"sve_fp32_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); },
- REGISTER_FP32_SVE((arm_compute::cpu::elementwise_comparison_op<op, float>))
+ REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_comparison_elementwise_binary<op>)
},
{
"sve_s16_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::S16 && data.ci.has_sve(); },
- REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op<op, int16_t>))
+ REGISTER_INTEGER_SVE(arm_compute::cpu::sve_s16_comparison_elementwise_binary<op>)
},
{
"sve_s32_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::S32 && data.ci.has_sve(); },
- REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op<op, int32_t>))
+ REGISTER_INTEGER_SVE(arm_compute::cpu::sve_s32_comparison_elementwise_binary<op>)
},
#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
#if defined(ARM_COMPUTE_ENABLE_NEON)
{
"neon_u8_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::U8; },
- REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_8<op, uint8_t, uint8x16_t>))
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_comparison_elementwise_binary<op>)
},
{
"neon_fp32_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON((arm_compute::cpu::elementwise_comp_op_32<op, float, float32x4_t>))
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_comparison_elementwise_binary<op>)
},
{
"neon_s16_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::S16; },
- REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_16<op, int16_t, int16x8_t>))
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_comparison_elementwise_binary<op>)
},
{
"neon_s32_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::S32; },
- REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_32<op, int32_t, int32x4_t>))
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_comparison_elementwise_binary<op>)
},
#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
#if defined(ARM_COMPUTE_ENABLE_SVE2)
{
"sve2_qu8_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve2(); },
- REGISTER_QASYMM8_SVE((arm_compute::cpu::elementwise_comparison_quantized_op<op, uint8_t>))
+ REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_qasymm8_comparison_elementwise_binary<op>)
},
{
"sve2_qs8_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve2(); },
- REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::elementwise_comparison_quantized_op<op, int8_t>))
+ REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_qasymm8_signed_comparison_elementwise_binary<op>)
},
#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
#if defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE)
{
"neon_qu8_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8; },
- REGISTER_QASYMM8_NEON((arm_compute::cpu::elementwise_comp_op_quantized<op>))
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_comparison_elementwise_binary<op>)
},
{
"neon_qs8_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
- REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::elementwise_comp_op_quantized_signed<op>))
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_comparison_elementwise_binary<op>)
},
-#endif /* defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) */
+#endif /* defined(ARM_COMPUTE_ENABLE_NEON ||ARM_COMPUTE_ENABLE_SVE) */
#if defined(ARM_COMPUTE_ENABLE_SVE)
{
"sve_fp16_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); },
- REGISTER_FP16_SVE((arm_compute::cpu::elementwise_comparison_op<op, float16_t>))
+ REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_comparison_elementwise_binary<op>)
},
#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
#if defined(ARM_COMPUTE_ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
{
"neon_fp16_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_fp16(); },
- REGISTER_FP16_NEON((arm_compute::cpu::elementwise_comp_op_16<op, float16_t, float16x8_t>))
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_comparison_elementwise_binary<op>)
},
#endif /* defined(ARM_COMPUTE_ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
};
diff --git a/src/cpu/kernels/CpuElementwiseKernel.h b/src/cpu/kernels/CpuElementwiseKernel.h
index f323fe447..8cd5d58a9 100644
--- a/src/cpu/kernels/CpuElementwiseKernel.h
+++ b/src/cpu/kernels/CpuElementwiseKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -39,7 +39,7 @@ namespace kernels
* @f[ dst(x,y) = OP(src0(x,y), src1(x,y))@f]
*
*/
-class CpuElementwiseKernel : public ICpuKernel
+class CpuElementwiseKernel : public ICpuKernel<CpuElementwiseKernel>
{
public:
CpuElementwiseKernel() = default;
diff --git a/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp
index 3573fa081..e8211fe93 100644
--- a/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp
+++ b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,8 +31,7 @@
#include "src/core/common/Registrars.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-#include "src/cpu/kernels/elementwise/neon/elementwise_unary_list.h"
-#include "src/cpu/kernels/elementwise/sve/elementwise_unary_list.h"
+#include "src/cpu/kernels/elementwise_unary/list.h"
#include "support/ToolchainSupport.h"
namespace arm_compute
@@ -43,77 +42,58 @@ namespace kernels
{
namespace
{
-struct ElementwiseUnarySelectorData
-{
- DataType dt;
- const CPUInfo &ci;
-};
-using ElementwiseUnarySelector = std::add_pointer<bool(const ElementwiseUnarySelectorData &)>::type;
-
-struct ElementwiseUnaryKernel
-{
- const char *name;
- const ElementwiseUnarySelector is_selected;
- CpuElementwiseUnaryKernel::ElementwiseUnaryUkernelPtr ukernel;
-};
-
-static const ElementwiseUnaryKernel available_kernels[] =
+static const std::vector<CpuElementwiseUnaryKernel::ElementwiseUnaryKernel> available_kernels =
{
#if defined(ARM_COMPUTE_ENABLE_SVE)
{
"sve_fp32_elementwise_unary",
- [](const ElementwiseUnarySelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); },
- REGISTER_FP32_SVE(arm_compute::cpu::elementwise_sve_op<float>),
+ [](const DataTypeISASelectorData & data)
+ {
+ return data.dt == DataType::F32 && data.isa.sve;
+ },
+ REGISTER_FP32_SVE(sve_fp32_elementwise_unary)
},
{
"sve_fp16_elementwise_unary",
- [](const ElementwiseUnarySelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); },
- REGISTER_FP16_SVE(arm_compute::cpu::elementwise_sve_op<__fp16>),
+ [](const DataTypeISASelectorData & data)
+ {
+ return (data.dt == DataType::F16) && data.isa.sve;
+ },
+ REGISTER_FP16_SVE(sve_fp16_elementwise_unary),
},
{
"sve_s32_elementwise_unary",
- [](const ElementwiseUnarySelectorData & data) { return data.dt == DataType::S32 && data.ci.has_sve(); },
- REGISTER_INTEGER_SVE(arm_compute::cpu::elementwise_sve_op<int32_t>),
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::S32 && data.isa.sve; },
+ REGISTER_INTEGER_SVE(sve_s32_elementwise_unary),
},
#endif // defined(ARM_COMPUTE_ENABLE_SVE)
#if defined(ARM_COMPUTE_ENABLE_NEON)
{
"neon_fp32_elementwise_unary",
- [](const ElementwiseUnarySelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::elementwise_op<float>),
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(neon_fp32_elementwise_unary),
},
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
{
"neon_fp16_elementwise_unary",
- [](const ElementwiseUnarySelectorData & data) { return data.dt == DataType::F16 && data.ci.has_fp16(); },
- REGISTER_FP32_NEON(arm_compute::cpu::elementwise_op<__fp16>),
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(neon_fp16_elementwise_unary),
},
#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
{
"neon_s32_elementwise_unary",
- [](const ElementwiseUnarySelectorData & data) { return data.dt == DataType::S32; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::elementwise_op<int32_t>),
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::S32; },
+ REGISTER_INTEGER_NEON(neon_s32_elementwise_unary),
},
#endif // defined(ARM_COMPUTE_ENABLE_NEON)
};
-const ElementwiseUnaryKernel *get_implementation(DataType dt)
-{
- for(const auto &uk : available_kernels)
- {
- if(uk.is_selected({ dt, CPUInfo::get() }))
- {
- return &uk;
- }
- }
- return nullptr;
-}
} // namespace
void CpuElementwiseUnaryKernel::configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst)
{
ARM_COMPUTE_ERROR_THROW_ON(validate(op, src, dst));
- const auto uk = get_implementation(src.data_type());
+ const auto uk = CpuElementwiseUnaryKernel::get_implementation(DataTypeISASelectorData{ src.data_type(), CPUInfo::get().get_isa() });
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
_op = op;
@@ -135,7 +115,8 @@ Status CpuElementwiseUnaryKernel::validate(ElementWiseUnary op, const ITensorInf
{
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src);
- const auto *uk = get_implementation(src.data_type());
+ const auto *uk = CpuElementwiseUnaryKernel::get_implementation(DataTypeISASelectorData{ src.data_type(), CPUInfo::get().get_isa() });
+
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
switch(op)
@@ -177,6 +158,12 @@ const char *CpuElementwiseUnaryKernel::name() const
{
return _name.c_str();
}
+
+const std::vector<CpuElementwiseUnaryKernel::ElementwiseUnaryKernel> &CpuElementwiseUnaryKernel::get_available_kernels()
+{
+ return available_kernels;
+}
+
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuElementwiseUnaryKernel.h b/src/cpu/kernels/CpuElementwiseUnaryKernel.h
index f72eddf73..138049a60 100644
--- a/src/cpu/kernels/CpuElementwiseUnaryKernel.h
+++ b/src/cpu/kernels/CpuElementwiseUnaryKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -39,8 +39,11 @@ namespace kernels
* Element-wise operation is computed by:
* @f[ dst(x) = OP(src(x))@f]
*/
-class CpuElementwiseUnaryKernel : public ICpuKernel
+class CpuElementwiseUnaryKernel : public ICpuKernel<CpuElementwiseUnaryKernel>
{
+private:
+ using ElementwiseUnaryUkernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const Window &, ElementWiseUnary)>::type;
+
public:
CpuElementwiseUnaryKernel() = default;
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuElementwiseUnaryKernel);
@@ -64,11 +67,14 @@ public:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
- /** Common signature for all the specialised elementwise unary micro-kernels
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using ElementwiseUnaryUkernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const Window &, ElementWiseUnary)>::type;
+ struct ElementwiseUnaryKernel
+ {
+ const char *name;
+ const DataTypeISASelectorPtr is_selected;
+ ElementwiseUnaryUkernelPtr ukernel;
+ };
+
+ static const std::vector<ElementwiseUnaryKernel> &get_available_kernels();
private:
ElementWiseUnary _op{};
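
With the kernel table now exposed through get_available_kernels(), callers such as tests can enumerate the registered micro-kernels. A small usage sketch, assuming the include path below and that the header above is visible to the caller:

#include <cstdio>
#include "src/cpu/kernels/CpuElementwiseUnaryKernel.h" // assumed include path

inline void print_elementwise_unary_kernels()
{
    using arm_compute::cpu::kernels::CpuElementwiseUnaryKernel;
    for(const auto &uk : CpuElementwiseUnaryKernel::get_available_kernels())
    {
        std::printf("registered micro-kernel: %s\n", uk.name);
    }
}
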
diff --git a/src/cpu/kernels/CpuFillKernel.h b/src/cpu/kernels/CpuFillKernel.h
index 3bc6a4030..ce41afc46 100644
--- a/src/cpu/kernels/CpuFillKernel.h
+++ b/src/cpu/kernels/CpuFillKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,7 +35,7 @@ namespace cpu
namespace kernels
{
/** Kernel for filling a tensor with a given constant value */
-class CpuFillKernel : public ICpuKernel
+class CpuFillKernel : public ICpuKernel<CpuFillKernel>
{
public:
CpuFillKernel() = default;
diff --git a/src/cpu/kernels/CpuFloorKernel.cpp b/src/cpu/kernels/CpuFloorKernel.cpp
index bcac1a41f..65e390a81 100644
--- a/src/cpu/kernels/CpuFloorKernel.cpp
+++ b/src/cpu/kernels/CpuFloorKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,58 +42,25 @@ namespace kernels
{
namespace
{
-struct FloorSelectorData
-{
- DataType dt;
-};
-
-using FloorSelectorPtr = std::add_pointer<bool(const FloorSelectorData &data)>::type;
-using FloorUKernelPtr = std::add_pointer<void(const void *, void *, int)>::type;
-
-struct FloorUKernel
-{
- const char *name;
- const FloorSelectorPtr is_selected;
- FloorUKernelPtr ukernel;
-};
-
-static const FloorUKernel available_kernels[] =
+static const std::vector<CpuFloorKernel::FloorKernel> available_kernels =
{
{
"neon_fp16_floor",
- [](const FloorSelectorData & data) { return data.dt == DataType::F16; },
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; },
REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_floor)
},
{
"neon_fp32_floor",
- [](const FloorSelectorData & data) { return data.dt == DataType::F32; },
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::F32; },
REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_floor)
- },
-};
-
-/** Micro-kernel selector
- *
- * @param[in] data Selection data passed to help pick the appropriate micro-kernel
- *
- * @return A matching micro-kernel else nullptr
- */
-const FloorUKernel *get_implementation(const FloorSelectorData &data)
-{
- for(const auto &uk : available_kernels)
- {
- if(uk.is_selected(data))
- {
- return &uk;
- }
}
- return nullptr;
-}
+};
Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- const auto *uk = get_implementation(FloorSelectorData{ src->data_type() });
+ const auto *uk = CpuFloorKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
// Validate in case of configured output
@@ -114,7 +81,7 @@ void CpuFloorKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
auto_init_if_empty(*dst, src->tensor_shape(), 1, src->data_type());
- const auto *uk = get_implementation(FloorSelectorData{ src->data_type() });
+ const auto *uk = CpuFloorKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
_run_method = uk->ukernel;
@@ -172,6 +139,12 @@ const char *CpuFloorKernel::name() const
{
return _name.c_str();
}
+
+const std::vector<CpuFloorKernel::FloorKernel> &CpuFloorKernel::get_available_kernels()
+{
+ return available_kernels;
+}
+
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuFloorKernel.h b/src/cpu/kernels/CpuFloorKernel.h
index ffb965819..35ab534ca 100644
--- a/src/cpu/kernels/CpuFloorKernel.h
+++ b/src/cpu/kernels/CpuFloorKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,8 +34,11 @@ namespace cpu
namespace kernels
{
/** Cpu accelerated kernel to perform a floor operation */
-class CpuFloorKernel : public ICpuKernel
+class CpuFloorKernel : public ICpuKernel<CpuFloorKernel>
{
+private:
+ using FloorKernelPtr = std::add_pointer<void(const void *, void *, int)>::type;
+
public:
CpuFloorKernel() = default;
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuFloorKernel);
@@ -65,12 +68,18 @@ public:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
-private:
- using FloorUKernelPtr = std::add_pointer<void(const void *, void *, int)>::type;
+ struct FloorKernel
+ {
+ const char *name;
+ const DataTypeISASelectorPtr is_selected;
+ FloorKernelPtr ukernel;
+ };
+
+ static const std::vector<FloorKernel> &get_available_kernels();
private:
- FloorUKernelPtr _run_method{ nullptr };
- std::string _name{};
+ FloorKernelPtr _run_method{ nullptr };
+ std::string _name{};
};
} // namespace kernels
} // namespace cpu
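
The floor kernel follows the same pattern: selection is keyed on both the data type and the runtime ISA report, so the F16 entry is only picked when data.isa.fp16 is set in addition to the compile-time guard. A sketch of resolving the micro-kernel by hand, mirroring what configure() does above (include paths are assumptions):

#include "arm_compute/core/CPP/CPPTypes.h"   // CPUInfo
#include "arm_compute/core/Types.h"          // DataType
#include "src/cpu/kernels/CpuFloorKernel.h"  // assumed include path

inline const char *selected_floor_kernel_name(arm_compute::DataType dt)
{
    using arm_compute::cpu::kernels::CpuFloorKernel;
    using arm_compute::cpu::kernels::DataTypeISASelectorData;

    const auto *uk = CpuFloorKernel::get_implementation(
        DataTypeISASelectorData{ dt, arm_compute::CPUInfo::get().get_isa() });
    return (uk != nullptr && uk->ukernel != nullptr) ? uk->name : "none";
}
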
diff --git a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h
index 047776bd1..4fb6a52a8 100644
--- a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h
+++ b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -52,7 +52,7 @@ namespace kernels
*
* After this operation, the dst matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ]
*/
-class CpuGemmInterleave4x4Kernel : public ICpuKernel
+class CpuGemmInterleave4x4Kernel : public ICpuKernel<CpuGemmInterleave4x4Kernel>
{
public:
CpuGemmInterleave4x4Kernel() = default;
diff --git a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h
index 083ee187e..2cc789d6d 100644
--- a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,7 +43,7 @@ namespace kernels
* -# Compute the int32 matrix product of the resulting a * b and store the result as int32
*
*/
-class CpuGemmLowpMatrixMultiplyKernel : public ICpuKernel
+class CpuGemmLowpMatrixMultiplyKernel : public ICpuKernel<CpuGemmLowpMatrixMultiplyKernel>
{
public:
/** Default constructor */
diff --git a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h
index 918f8c89d..e469629cd 100644
--- a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,7 +40,7 @@ namespace kernels
* @note This stage is needed to handle the offset of matrix product
* https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
*/
-class CpuGemmLowpMatrixAReductionKernel : public ICpuKernel
+class CpuGemmLowpMatrixAReductionKernel : public ICpuKernel<CpuGemmLowpMatrixAReductionKernel>
{
public:
/** Default constructor */
@@ -98,7 +98,7 @@ private:
* @note This stage is needed to handle the offset of matrix product
* https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
*/
-class CpuGemmLowpMatrixBReductionKernel : public ICpuKernel
+class CpuGemmLowpMatrixBReductionKernel : public ICpuKernel<CpuGemmLowpMatrixBReductionKernel>
{
public:
/** Default constructor */
diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h
index 1ec969be9..3514ca811 100644
--- a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,7 +46,7 @@ namespace kernels
* (a_offset * b_offset * k)
*
*/
-class CpuGemmLowpOffsetContributionKernel : public ICpuKernel
+class CpuGemmLowpOffsetContributionKernel : public ICpuKernel<CpuGemmLowpOffsetContributionKernel>
{
public:
/** Default constructor */
diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h
index d97727dd0..ad8b05e49 100644
--- a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -63,7 +63,7 @@ namespace kernels
* (a_offset * b_offset * k)
*/
-class CpuGemmLowpOffsetContributionOutputStageKernel : public ICpuKernel
+class CpuGemmLowpOffsetContributionOutputStageKernel : public ICpuKernel<CpuGemmLowpOffsetContributionOutputStageKernel>
{
public:
/** Default constructor */
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h
index ae13e760f..c7813edcd 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -51,7 +51,7 @@ namespace kernels
* -# -to the [-128..127] range and cast to QASYMM8_SIGNED.
*
*/
-class CpuGemmLowpQuantizeDownInt32ScaleKernel : public ICpuKernel
+class CpuGemmLowpQuantizeDownInt32ScaleKernel : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ScaleKernel>
{
public:
CpuGemmLowpQuantizeDownInt32ScaleKernel() = default;
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
index 53a9d34ed..681d09969 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -48,7 +48,7 @@ namespace kernels
* -# Clamp the resulting int32 values to the [-32768, 32767] range and cast to QSYMM16.
*
*/
-class CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel : public ICpuKernel
+class CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>
{
public:
CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel() = default;
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
index 67829e777..3e615b935 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,7 +49,7 @@ namespace kernels
* -# Clamp the resulting int32 values to the [-128..127] range and cast to QASYMM8_SIGNED.
*
*/
-class CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel : public ICpuKernel
+class CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>
{
public:
CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel() = default;
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
index b62cac481..b773fdfdc 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,7 +49,7 @@ namespace kernels
* -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
*
*/
-class CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel : public ICpuKernel
+class CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>
{
public:
CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel() = default;
diff --git a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h
index c9798fc24..4a748218d 100644
--- a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h
+++ b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,7 +41,7 @@ namespace kernels
* - MTX_0 = A * B * alpha, where MTX_0 is the output of @ref CpuGemmMatrixMultiplyKernel
* - MTX_1 = C
*/
-class CpuGemmMatrixAdditionKernel : public ICpuKernel
+class CpuGemmMatrixAdditionKernel : public ICpuKernel<CpuGemmMatrixAdditionKernel>
{
public:
CpuGemmMatrixAdditionKernel() = default;
diff --git a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h
index 0b4e01579..9c3dc8b1a 100644
--- a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h
+++ b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -39,7 +39,7 @@ namespace kernels
* @note If the output tensor is a vector and the data type is F32, the implementation assumes that the first input tensor @p lhs is a vector and the second input tensor @p rhs a matrix. The implementation also assumes that both tensors have not been reshaped
*
*/
-class CpuGemmMatrixMultiplyKernel : public ICpuKernel
+class CpuGemmMatrixMultiplyKernel : public ICpuKernel<CpuGemmMatrixMultiplyKernel>
{
public:
CpuGemmMatrixMultiplyKernel() = default;
diff --git a/src/cpu/kernels/CpuGemmTranspose1xWKernel.h b/src/cpu/kernels/CpuGemmTranspose1xWKernel.h
index de920b5ed..0ca92641b 100644
--- a/src/cpu/kernels/CpuGemmTranspose1xWKernel.h
+++ b/src/cpu/kernels/CpuGemmTranspose1xWKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -68,7 +68,7 @@ namespace kernels
* @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
*
*/
-class CpuGemmTranspose1xWKernel : public ICpuKernel
+class CpuGemmTranspose1xWKernel : public ICpuKernel<CpuGemmTranspose1xWKernel>
{
public:
CpuGemmTranspose1xWKernel() = default;
diff --git a/src/cpu/kernels/CpuIm2ColKernel.cpp b/src/cpu/kernels/CpuIm2ColKernel.cpp
index 5e3385d4a..875d66594 100644
--- a/src/cpu/kernels/CpuIm2ColKernel.cpp
+++ b/src/cpu/kernels/CpuIm2ColKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -446,9 +446,10 @@ const char *CpuIm2ColKernel::name() const
size_t CpuIm2ColKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
{
- ARM_COMPUTE_UNUSED(platform, thread_count);
+ ARM_COMPUTE_UNUSED(thread_count);
+ ARM_COMPUTE_UNUSED(platform);
- return ICPPKernel::small_network_mws;
+ return ICPPKernel::default_mws;
}
} // namespace kernels
} // namespace cpu
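
get_mws() for this kernel (and for CpuReshapeKernel below) now reports ICPPKernel::default_mws instead of small_network_mws. The minimum workload size is a hint for how finely a kernel's window can be split across threads; the snippet below only illustrates that idea, assuming a scheduler caps the thread count so each thread receives at least mws units of work (the actual IScheduler logic is not shown in this patch):

#include <algorithm>
#include <cstddef>

// Illustrative only: pick a thread count so that no thread gets less than
// the kernel's minimum workload size.
inline std::size_t threads_for_workload(std::size_t total_work, std::size_t mws, std::size_t max_threads)
{
    if(mws == 0 || max_threads == 0)
    {
        return 1;
    }
    const std::size_t capped_by_mws = (total_work + mws - 1) / mws; // ceil(total_work / mws)
    return std::max<std::size_t>(1, std::min(capped_by_mws, max_threads));
}
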
diff --git a/src/cpu/kernels/CpuIm2ColKernel.h b/src/cpu/kernels/CpuIm2ColKernel.h
index 797d54c95..8160310da 100644
--- a/src/cpu/kernels/CpuIm2ColKernel.h
+++ b/src/cpu/kernels/CpuIm2ColKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -58,7 +58,7 @@ namespace kernels
* \end{array} \right)
* @f]
*/
-class CpuIm2ColKernel : public ICpuKernel
+class CpuIm2ColKernel : public ICpuKernel<CpuIm2ColKernel>
{
public:
/** Default constructor */
diff --git a/src/cpu/kernels/CpuKernelSelectionTypes.h b/src/cpu/kernels/CpuKernelSelectionTypes.h
new file mode 100644
index 000000000..60dcea4a3
--- /dev/null
+++ b/src/cpu/kernels/CpuKernelSelectionTypes.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_KERNEL_SELECTION_TYPES_H
+#define ARM_COMPUTE_CPU_KERNEL_SELECTION_TYPES_H
+
+#include "arm_compute/core/Types.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+// Selector data types
+struct DataTypeISASelectorData
+{
+ DataType dt;
+ cpuinfo::CpuIsaInfo isa;
+};
+
+struct PoolDataTypeISASelectorData
+{
+ DataType dt;
+ DataLayout dl;
+ int pool_stride_x;
+ Size2D pool_size;
+ cpuinfo::CpuIsaInfo isa;
+};
+
+// Selector pointer types
+using DataTypeISASelectorPtr = std::add_pointer<bool(const DataTypeISASelectorData &data)>::type;
+using PoolDataTypeISASelectorPtr = std::add_pointer<bool(const PoolDataTypeISASelectorData &data)>::type;
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ARM_COMPUTE_CPU_KERNEL_SELECTION_TYPES_H
\ No newline at end of file
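
The selector predicates stored in the kernel tables are capture-free lambdas, which convert to the plain function-pointer types declared in this new header. A minimal sketch of writing one such predicate (the surrounding namespace and variable are illustrative):

#include "src/cpu/kernels/CpuKernelSelectionTypes.h" // assumed include path

namespace example_selectors
{
// Accept F16 only when the runtime ISA snapshot reports FP16 support,
// matching the data.isa.fp16 checks used in the tables above.
static const arm_compute::cpu::kernels::DataTypeISASelectorPtr fp16_selector =
    [](const arm_compute::cpu::kernels::DataTypeISASelectorData &data)
{
    return data.dt == arm_compute::DataType::F16 && data.isa.fp16;
};
} // namespace example_selectors
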
diff --git a/src/cpu/kernels/CpuMulKernel.h b/src/cpu/kernels/CpuMulKernel.h
index b65ec2004..85fcf88a9 100644
--- a/src/cpu/kernels/CpuMulKernel.h
+++ b/src/cpu/kernels/CpuMulKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,7 +34,7 @@ namespace cpu
namespace kernels
{
/** Interface for the kernel to perform multiplication between two tensors */
-class CpuMulKernel : public ICpuKernel
+class CpuMulKernel : public ICpuKernel<CpuMulKernel>
{
public:
CpuMulKernel() = default;
@@ -118,7 +118,7 @@ private:
};
/** Interface for the complex pixelwise multiplication kernel. */
-class CpuComplexMulKernel : public ICpuKernel
+class CpuComplexMulKernel : public ICpuKernel<CpuComplexMulKernel>
{
public:
CpuComplexMulKernel() = default;
diff --git a/src/cpu/kernels/CpuPermuteKernel.h b/src/cpu/kernels/CpuPermuteKernel.h
index 1b2672b5b..9e1b93318 100644
--- a/src/cpu/kernels/CpuPermuteKernel.h
+++ b/src/cpu/kernels/CpuPermuteKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,7 +34,7 @@ namespace cpu
namespace kernels
{
/** Kernel to perform tensor permutation given a permutation vector */
-class CpuPermuteKernel : public ICpuKernel
+class CpuPermuteKernel : public ICpuKernel<CpuPermuteKernel>
{
public:
CpuPermuteKernel() = default;
diff --git a/src/cpu/kernels/CpuPool2dKernel.cpp b/src/cpu/kernels/CpuPool2dKernel.cpp
index 00b1ac76f..d0ca2d285 100644
--- a/src/cpu/kernels/CpuPool2dKernel.cpp
+++ b/src/cpu/kernels/CpuPool2dKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -52,136 +52,101 @@ namespace
{
using namespace misc::shape_calculator;
-struct PoolingSelectorData
-{
- DataType dt;
- DataLayout dl;
- int pool_stride_x;
- Size2D pool_size;
-};
-
-using PoolingSelectorPtr = std::add_pointer<bool(const PoolingSelectorData &data)>::type;
-using PoolingKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, ITensor *, PoolingLayerInfo &, const Window &, const Window &)>::type;
-struct PoolingKernel
-{
- const char *name;
- const PoolingSelectorPtr is_selected;
- PoolingKernelPtr ukernel;
-};
-
-static const PoolingKernel available_kernels[] =
+static const std::vector<CpuPool2dKernel::PoolingKernel> available_kernels =
{
{
"neon_qu8_nhwc_poolMxN",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8)); },
+ [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8)); },
REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_qasymm8_neon_nhwc)
},
{
"neon_qs8_nhwc_poolMxN",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8_SIGNED)); },
+ [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8_SIGNED)); },
REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_qasymm8_signed_neon_nhwc)
},
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
{
"neon_f16_nhwc_poolMxN",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F16)); },
+ [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F16)); },
REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nhwc)
},
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
{
"neon_fp32_nhwc_poolMxN",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F32)); },
+ [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F32)); },
REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nhwc)
},
#if defined(ENABLE_NCHW_KERNELS)
{
"neon_qu8_nchw_pool2",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); },
+ [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); },
REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw<uint8_t>)
},
{
"neon_qu8_nchw_pool3",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); },
+ [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); },
REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw<uint8_t>)
},
{
"neon_qu8_nchw_poolMxN",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8)); },
+ [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8)); },
REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw<uint8_t>)
},
{
"neon_qs8_nchw_pool2",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); },
+ [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); },
REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw<int8_t>)
},
{
"neon_qs8_nchw_pool3",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); },
+ [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); },
REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw<int8_t>)
},
{
"neon_qs8_nchw_poolMxN",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED)); },
+ [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED)); },
REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw<int8_t>)
},
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
{
"neon_fp16_nchw_pool2",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); },
+ [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); },
REGISTER_FP16_NEON(arm_compute::cpu::pooling2_fp16_neon_nchw)
},
{
"neon_fp16_nchw_pool3",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); },
+ [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); },
REGISTER_FP16_NEON(arm_compute::cpu::pooling3_fp16_neon_nchw)
},
{
"neon_fp16_nchw_poolMxN",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16)); },
+ [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16)); },
REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nchw)
},
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
{
"neon_fp32_nchw_pool2",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); },
+ [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); },
REGISTER_FP32_NEON(arm_compute::cpu::pooling2_fp32_neon_nchw)
},
{
"neon_fp32_nchw_pool3",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); },
+ [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); },
REGISTER_FP32_NEON(arm_compute::cpu::pooling3_fp32_neon_nchw)
},
{
"neon_fp32_nchw_pool7",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 7)); },
+ [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 7)); },
REGISTER_FP32_NEON(arm_compute::cpu::pooling7_fp32_neon_nchw)
},
{
"neon_fp32_nchw_poolMxN",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32)); },
+ [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32)); },
REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nchw)
},
#endif /* defined(ENABLE_NCHW_KERNELS) */
};
-/** Micro-kernel selector
- *
- * @param[in] data Selection data passed to help pick the appropriate micro-kernel
- *
- * @return A matching micro-kernel else nullptr
- */
-const PoolingKernel *get_implementation(DataType dt, DataLayout dl, int pool_stride_x, Size2D pool_size)
-{
- for(const auto &uk : available_kernels)
- {
- if(uk.is_selected({ dt, dl, pool_stride_x, pool_size }))
- {
- return &uk;
- }
- }
- return nullptr;
-}
-
Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info,
const ITensorInfo *indices, Size2D pool_size)
{
@@ -199,6 +164,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type()))
+ && (is_pool_region_entirely_outside_input(pool_info)),
+ "Pooling region that is entirely outside input tensor is unsupported for non-float types");
+
std::tie(output_width, output_height) = scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height],
pool_size.x(), pool_size.y(), pool_info.pad_stride_info);
ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), "Calculated output dimension size is invalid");
@@ -231,7 +200,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
}
}
- const auto *uk = get_implementation(src->data_type(), src->data_layout(), pool_stride_x, pool_size);
+ const auto *uk = CpuPool2dKernel::get_implementation(PoolDataTypeISASelectorData{ src->data_type(), src->data_layout(), pool_stride_x, pool_size, CPUInfo::get().get_isa() });
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
return Status{};
@@ -331,7 +300,7 @@ void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const Poolin
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices, pool_size));
- const auto *uk = get_implementation(src->data_type(), src->data_layout(), pad_stride_info.stride().first, pool_size);
+ const auto *uk = CpuPool2dKernel::get_implementation(PoolDataTypeISASelectorData{ src->data_type(), src->data_layout(), (int)pad_stride_info.stride().first, pool_size, CPUInfo::get().get_isa() });
ARM_COMPUTE_ERROR_ON(uk == nullptr);
// Set instance variables
@@ -443,6 +412,12 @@ const char *CpuPool2dKernel::name() const
{
return _name.c_str();
}
+
+const std::vector<CpuPool2dKernel::PoolingKernel> &CpuPool2dKernel::get_available_kernels()
+{
+ return available_kernels;
+}
+
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
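
Pooling selection carries more context than the data type alone: the new PoolDataTypeISASelectorData bundles layout, stride, pool window size and the ISA snapshot. A sketch of querying the table directly, mirroring the configure()/validate() calls above (include paths and the wrapper are assumptions):

#include "arm_compute/core/CPP/CPPTypes.h"  // CPUInfo
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Types.h"         // DataType, DataLayout
#include "src/cpu/kernels/CpuPool2dKernel.h"

inline const char *selected_pool2d_kernel_name(arm_compute::DataType dt, arm_compute::DataLayout dl,
                                               int pool_stride_x, const arm_compute::Size2D &pool_size)
{
    using arm_compute::cpu::kernels::CpuPool2dKernel;
    using arm_compute::cpu::kernels::PoolDataTypeISASelectorData;

    const auto *uk = CpuPool2dKernel::get_implementation(
        PoolDataTypeISASelectorData{ dt, dl, pool_stride_x, pool_size, arm_compute::CPUInfo::get().get_isa() });
    return (uk != nullptr && uk->ukernel != nullptr) ? uk->name : "none";
}
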
diff --git a/src/cpu/kernels/CpuPool2dKernel.h b/src/cpu/kernels/CpuPool2dKernel.h
index aedeb7fbe..c952ea839 100644
--- a/src/cpu/kernels/CpuPool2dKernel.h
+++ b/src/cpu/kernels/CpuPool2dKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,8 +35,11 @@ namespace cpu
namespace kernels
{
/** Interface for the pooling layer kernel */
-class CpuPool2dKernel : public ICpuKernel
+class CpuPool2dKernel : public ICpuKernel<CpuPool2dKernel>
{
+private:
+ using PoolingKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, ITensor *, PoolingLayerInfo &, const Window &, const Window &)>::type;
+
public:
CpuPool2dKernel() = default;
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2dKernel);
@@ -62,8 +65,14 @@ public:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
-private:
- using PoolingKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, ITensor *, PoolingLayerInfo &, const Window &, const Window &)>::type;
+ struct PoolingKernel
+ {
+ const char *name;
+ const PoolDataTypeISASelectorPtr is_selected;
+ PoolingKernelPtr ukernel;
+ };
+
+ static const std::vector<PoolingKernel> &get_available_kernels();
private:
PoolingLayerInfo _pool_info{};
diff --git a/src/cpu/kernels/CpuQuantizeKernel.h b/src/cpu/kernels/CpuQuantizeKernel.h
index eb0814926..28690bea5 100644
--- a/src/cpu/kernels/CpuQuantizeKernel.h
+++ b/src/cpu/kernels/CpuQuantizeKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,7 +37,7 @@ namespace kernels
*
* @note The implementation supports only 3D input tensors
*/
-class CpuQuantizeKernel : public ICpuKernel
+class CpuQuantizeKernel : public ICpuKernel<CpuQuantizeKernel>
{
public:
CpuQuantizeKernel() = default;
diff --git a/src/cpu/kernels/CpuReshapeKernel.cpp b/src/cpu/kernels/CpuReshapeKernel.cpp
index 91c549643..068ff07ef 100644
--- a/src/cpu/kernels/CpuReshapeKernel.cpp
+++ b/src/cpu/kernels/CpuReshapeKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -137,9 +137,10 @@ const char *CpuReshapeKernel::name() const
size_t CpuReshapeKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
{
- ARM_COMPUTE_UNUSED(platform, thread_count);
+ ARM_COMPUTE_UNUSED(thread_count);
+ ARM_COMPUTE_UNUSED(platform);
- return ICPPKernel::small_network_mws;
+ return ICPPKernel::default_mws;
}
} // namespace kernels
diff --git a/src/cpu/kernels/CpuReshapeKernel.h b/src/cpu/kernels/CpuReshapeKernel.h
index d4e2b44b5..17302c673 100644
--- a/src/cpu/kernels/CpuReshapeKernel.h
+++ b/src/cpu/kernels/CpuReshapeKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,7 +34,7 @@ namespace cpu
namespace kernels
{
/** Interface for the kernel to perform tensor reshaping */
-class CpuReshapeKernel : public ICpuKernel
+class CpuReshapeKernel : public ICpuKernel<CpuReshapeKernel>
{
public:
CpuReshapeKernel() = default;
diff --git a/src/cpu/kernels/CpuScaleKernel.cpp b/src/cpu/kernels/CpuScaleKernel.cpp
index 3063d8f68..60564a97d 100644
--- a/src/cpu/kernels/CpuScaleKernel.cpp
+++ b/src/cpu/kernels/CpuScaleKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -48,52 +48,37 @@ namespace kernels
{
namespace
{
-struct ScaleSelectorData
-{
- DataType dt;
- const CPUInfo &ci;
-};
-using ScaleSelectorPtr = std::add_pointer<bool(const ScaleSelectorData &data)>::type;
-using ScaleKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *,
- InterpolationPolicy, BorderMode, PixelValue, float, bool, const Window &)>::type;
-struct ScaleKernel
-{
- const char *name;
- const ScaleSelectorPtr is_selected;
- ScaleKernelPtr ukernel;
-};
-
-static const ScaleKernel available_kernels[] =
+static const std::vector<CpuScaleKernel::ScaleKernel> available_kernels =
{
#if defined(ARM_COMPUTE_ENABLE_SVE)
{
"sve_fp16_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); },
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.sve; },
REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_scale)
},
{
"sve_fp32_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); },
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::F32 && data.isa.sve; },
REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_scale)
},
{
"sve_qu8_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve(); },
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8 && data.isa.sve; },
REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_scale)
},
{
"sve_qs8_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve(); },
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve; },
REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_scale)
},
{
"sve_u8_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::U8 && data.ci.has_sve(); },
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::U8 && data.isa.sve; },
REGISTER_INTEGER_SVE(arm_compute::cpu::u8_sve_scale)
},
{
"sve_s16_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::S16 && data.ci.has_sve(); },
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::S16 && data.isa.sve; },
REGISTER_INTEGER_SVE(arm_compute::cpu::s16_sve_scale)
},
#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
@@ -101,60 +86,43 @@ static const ScaleKernel available_kernels[] =
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
{
"neon_fp16_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_fp16(); },
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; },
REGISTER_FP16_NEON(arm_compute::cpu::common_neon_scale<float16_t>)
},
#endif /* !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
{
"neon_fp32_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::F32; },
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::F32; },
REGISTER_FP32_NEON(arm_compute::cpu::common_neon_scale<float>)
},
{
"neon_qu8_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8; },
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8; },
REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_scale)
},
{
"neon_qs8_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_scale)
},
{
"neon_u8_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::U8; },
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::U8; },
REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_scale)
},
{
"neon_s16_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::S16; },
+ [](const DataTypeISASelectorData & data) { return data.dt == DataType::S16; },
REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_scale)
},
#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
};
-/** Micro-kernel selector
- *
- * @param[in] data Selection data passed to help pick the appropriate micro-kernel
- *
- * @return A matching micro-kernel else nullptr
- */
-const ScaleKernel *get_implementation(const ScaleSelectorData &data)
-{
- for(const auto &uk : available_kernels)
- {
- if(uk.is_selected(data))
- {
- return &uk;
- }
- }
- return nullptr;
-}
-
Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy,
const ITensorInfo *offsets, ITensorInfo *dst, const ScaleKernelInfo &info)
{
- const auto *uk = get_implementation(ScaleSelectorData{ src->data_type(), CPUInfo::get() });
+ const auto *uk = CpuScaleKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
@@ -212,7 +180,7 @@ void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, co
dst,
info));
- const auto *uk = get_implementation(ScaleSelectorData{ src->data_type(), CPUInfo::get() });
+ const auto *uk = CpuScaleKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
_run_method = uk->ukernel;
@@ -618,6 +586,12 @@ const char *CpuScaleKernel::name() const
{
return _name.c_str();
}
+
+const std::vector<CpuScaleKernel::ScaleKernel> &CpuScaleKernel::get_available_kernels()
+{
+ return available_kernels;
+}
+
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
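
Across these scale kernels (and the others in this patch) the CPUInfo-based checks (ci.has_sve(), ci.has_fp16()) are replaced by fields on the CpuIsaInfo value returned by CPUInfo::get().get_isa(). A small sketch of the new style, assuming only the field names that appear in the tables above:

#include "arm_compute/core/CPP/CPPTypes.h" // CPUInfo, assumed to expose get_isa()

inline bool can_select_sve_fp16_scale()
{
    const auto isa = arm_compute::CPUInfo::get().get_isa();
    // Previously: ci.has_sve() && ci.has_fp16()
    return isa.sve && isa.fp16;
}
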
diff --git a/src/cpu/kernels/CpuScaleKernel.h b/src/cpu/kernels/CpuScaleKernel.h
index 913b5a559..e0e9e387b 100644
--- a/src/cpu/kernels/CpuScaleKernel.h
+++ b/src/cpu/kernels/CpuScaleKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,8 +35,14 @@ namespace cpu
namespace kernels
{
/** Arm(R) Neon(TM) kernel to perform scaling on a tensor */
-class CpuScaleKernel : public ICpuKernel
+class CpuScaleKernel : public ICpuKernel<CpuScaleKernel>
{
+private:
+ /** Scale function to use for a particular interpolation type */
+ using ScaleFunctionPtr = void (CpuScaleKernel::*)(const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *, const Window &window);
+ using ScaleKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *,
+ InterpolationPolicy, BorderMode, PixelValue, float, bool, const Window &)>::type;
+
public:
CpuScaleKernel() = default;
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuScaleKernel);
@@ -67,6 +73,15 @@ public:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
+ struct ScaleKernel
+ {
+ const char *name;
+ const DataTypeISASelectorPtr is_selected;
+ ScaleKernelPtr ukernel;
+ };
+
+ static const std::vector<ScaleKernel> &get_available_kernels();
+
private:
#ifdef ENABLE_NCHW_KERNELS
/** function to perform scale using area interpolation on the given window
@@ -87,11 +102,6 @@ private:
void scale_nearest_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window);
#endif // ENABLE_NCHW_KERNELS
- /** Scale function to use for the particular function to use */
- using ScaleFunctionPtr = void (CpuScaleKernel::*)(const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *, const Window &window);
- using ScaleKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *,
- InterpolationPolicy, BorderMode, PixelValue, float, bool, const Window &)>::type;
-
ScaleFunctionPtr _func{ nullptr };
InterpolationPolicy _policy{};
BorderMode _border_mode{};
diff --git a/src/cpu/kernels/CpuSoftmaxKernel.cpp b/src/cpu/kernels/CpuSoftmaxKernel.cpp
index cbf3773dd..6766b1012 100644
--- a/src/cpu/kernels/CpuSoftmaxKernel.cpp
+++ b/src/cpu/kernels/CpuSoftmaxKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,6 @@
* SOFTWARE.
*/
#include "src/cpu/kernels/CpuSoftmaxKernel.h"
-
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
@@ -30,13 +29,10 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
#include "src/core/CPP/Validate.h"
+#include "src/core/common/Registrars.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/common/Registrars.h"
-#include "src/cpu/kernels/softmax/impl/neon/list.h"
-#include "src/cpu/kernels/softmax/impl/sve/list.h"
-
+#include "src/cpu/kernels/softmax/list.h"
namespace arm_compute
{
namespace cpu
@@ -45,162 +41,60 @@ namespace kernels
{
namespace
{
-struct SoftmaxSelectorData
-{
- DataType dt;
- const CPUInfo &ci;
-};
-using SoftmaxSelectorPtr = std::add_pointer<bool(const SoftmaxSelectorData &data)>::type;
-using SoftmaxLogits1DMaxKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const Window &)>::type;
-using SoftmaxLogits1DKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, void *const, ITensor *, float, bool, const Window &)>::type;
-
-struct SoftmaxLogits1DKernel
-{
- const char *name;
- const SoftmaxSelectorPtr is_selected;
- SoftmaxLogits1DKernelPtr ukernel;
-};
-
-struct SoftmaxLogits1DMaxKernel
-{
- const char *name;
- const SoftmaxSelectorPtr is_selected;
- SoftmaxLogits1DMaxKernelPtr ukernel;
-};
-
-static const SoftmaxLogits1DKernel available_logits_1d_kernels[] =
-{
-#if defined(ARM_COMPUTE_ENABLE_SVE)
- {
- "sve_fp32_softmax_logits_1d",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32) && data.ci.has_sve(); },
- REGISTER_FP32_SVE(arm_compute::cpu::sve_softmax_logits_1d_float<float>)
- },
- {
- "sve_fp16_softmax_logits_1d",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16) && data.ci.has_sve(); },
- REGISTER_FP16_SVE(arm_compute::cpu::sve_softmax_logits_1d_float<float16_t>)
- },
-#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
-
-#if defined(ARM_COMPUTE_ENABLE_NEON)
- {
- "neon_fp32_softmax_logits_1d",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_softmax_logits_1d_float<float>)
- },
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- {
- "neon_fp16_softmax_logits_1d",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_softmax_logits_1d_float<float16_t>)
- },
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
-#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
-
-#if defined(ARM_COMPUTE_ENABLE_SVE2)
- {
- "sve2_qu8_softmax_logits_1d",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8) && data.ci.has_sve2(); },
- REGISTER_QASYMM8_SVE(arm_compute::cpu::sve_softmax_logits_1d_quantized<qasymm8_t>)
- },
- {
- "sve2_qs8_softmax_logits_1d",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.ci.has_sve2(); },
- REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::sve_softmax_logits_1d_quantized<qasymm8_signed_t>)
- },
-#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
- {
- "neon_qu8_softmax_logits_1d",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_softmax_logits_1d_quantized<qasymm8_t>)
- },
- {
- "neon_qs8_softmax_logits_1d",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_softmax_logits_1d_quantized<qasymm8_signed_t>)
- },
-};
-
-static const SoftmaxLogits1DMaxKernel available_logits_1d_max_kernels[] =
+/* Softmax Logits 1D Max - identifying the max value of 1D Logits */
+static const std::vector<CpuLogits1DMaxKernel::SoftmaxLogits1DMaxKernel> available_kernels_max_logits =
{
#if defined(ARM_COMPUTE_ENABLE_SVE)
{
"sve_fp32_logits_1d_max",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32) && data.ci.has_sve(); },
- REGISTER_FP32_SVE(arm_compute::cpu::sve_logits_1d_max<float>)
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32) && data.isa.sve; },
+ REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_logits)
},
{
"sve_fp16_logits_1d_max",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16) && data.ci.has_sve(); },
- REGISTER_FP16_SVE(arm_compute::cpu::sve_logits_1d_max<float16_t>)
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.sve; },
+ REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_logits)
},
{
"sve_qu8_logits_1d_max",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8) && data.ci.has_sve(); },
- REGISTER_QASYMM8_SVE(arm_compute::cpu::sve_logits_1d_max<qasymm8_t>)
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8) && data.isa.sve; },
+ REGISTER_QASYMM8_SVE(arm_compute::cpu::sve_qasymm8_logits)
},
{
"sve_qs8_logits_1d_max",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.ci.has_sve(); },
- REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::sve_logits_1d_max<qasymm8_signed_t>)
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve; },
+ REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::sve_qasymm8_signed_logits)
},
#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
#if defined(ARM_COMPUTE_ENABLE_NEON)
{
"neon_fp32_logits_1d_max",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_logits_1d_max<float>)
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_logits)
},
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
{
"neon_fp16_logits_1d_max",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_logits_1d_max<float16_t>)
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16); },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_logits)
},
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
{
"neon_qu8_logits_1d_max",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_logits_1d_max<qasymm8_t>)
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_logits)
},
{
"neon_qs8_logits_1d_max",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_logits_1d_max<qasymm8_signed_t>)
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_singed_logits)
},
#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
};
-
-const SoftmaxLogits1DKernel *get_implementation_logits(const SoftmaxSelectorData &data)
-{
- for(const auto &uk : available_logits_1d_kernels)
- {
- if(uk.is_selected({ data.dt, CPUInfo::get() }))
- {
- return &uk;
- }
- }
- return nullptr;
-}
-
-const SoftmaxLogits1DMaxKernel *get_implementation_logits_max(const SoftmaxSelectorData &data)
-{
- for(const auto &uk : available_logits_1d_max_kernels)
- {
- if(uk.is_selected({ data.dt, CPUInfo::get() }))
- {
- return &uk;
- }
- }
- return nullptr;
-}
-
Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output)
{
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-
// Validate in case of configured output
if(output.total_size() != 0)
{
@@ -208,58 +102,104 @@ Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorI
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(), TensorShape(input.tensor_shape()).set(0, 1));
}
-
return Status{};
}
-
-} // namespace
-
+} //namespace
+const std::vector<CpuLogits1DMaxKernel::SoftmaxLogits1DMaxKernel> &CpuLogits1DMaxKernel::get_available_kernels()
+{
+ return available_kernels_max_logits;
+}
void CpuLogits1DMaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_max(*src, *dst));
-
// Softmax across the x dimension
const TensorShape output_shape = TensorShape(src->tensor_shape()).set(0, 1);
// Output auto initialization if not yet initialized
auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info());
-
- const auto *uk = get_implementation_logits_max(SoftmaxSelectorData{ src->data_type(), CPUInfo::get() });
+ const auto *uk = get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
-
_run_method = uk->ukernel;
_name = std::string("CpuLogits1DMaxKernel").append("/").append(uk->name);
-
- Window win = calculate_max_window(*src, Steps());
+ Window win = calculate_max_window(*src, Steps());
ICpuKernel::configure(win);
}
-
Status CpuLogits1DMaxKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_max(*src, *dst));
-
return Status{};
}
-
void CpuLogits1DMaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
-
const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
_run_method(src, dst, window);
}
-
const char *CpuLogits1DMaxKernel::name() const
{
return _name.c_str();
}
+/* Softmax Logits 1D - softmax computation using a pre-computed max value */
+template <bool IS_LOG>
+static const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> available_kernels_logits =
+{
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+ {
+ "sve_fp32_softmax_logits_1d",
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32) && data.isa.sve; },
+ REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_softmax)
+ },
+ {
+ "sve_fp16_softmax_logits_1d",
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.sve; },
+ REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_softmax)
+ },
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
+#if defined(ARM_COMPUTE_ENABLE_NEON)
+ {
+ "neon_fp32_softmax_logits_1d",
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_softmax)
+ },
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+ {
+ "neon_fp16_softmax_logits_1d",
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16); },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_softmax)
+ },
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
+#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
+#if defined(ARM_COMPUTE_ENABLE_SVE2)
+ {
+ "sve2_qu8_softmax_logits_1d",
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8) && data.isa.sve2; },
+ REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_qasymm8_softmax)
+ },
+ {
+ "sve2_qs8_softmax_logits_1d",
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; },
+ REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_qasymm8_signed_softmax)
+ },
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
+#if defined(ARM_COMPUTE_ENABLE_NEON)
+ {
+ "neon_qu8_softmax_logits_1d",
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax)
+ },
+ {
+ "neon_qs8_softmax_logits_1d",
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax)
+ },
+#endif //defined(ARM_COMPUTE_ENABLE_NEON)
+};
namespace
{
Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorInfo &max,
@@ -269,14 +209,11 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorIn
// Check input
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-
const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type());
-
// Check max
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &max);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(TensorShape(src.tensor_shape()).set(0, 1), max.tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&src, &max);
-
// Check output if configured
if(dst.total_size() != 0)
{
@@ -285,7 +222,6 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorIn
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst);
ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() != output_quantization);
}
-
// Check tmp if configured
if(tmp.total_size() != 0)
{
@@ -295,84 +231,69 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorIn
// on the maximum number of threads that will run in parallel.
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &tmp);
}
-
return Status{};
}
} // namespace
-
+template <bool IS_LOG>
+const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> &CpuLogits1DSoftmaxKernel<IS_LOG>::get_available_kernels()
+{
+ return available_kernels_logits<IS_LOG>;
+}
template <bool IS_LOG>
void CpuLogits1DSoftmaxKernel<IS_LOG>::configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG));
-
// Configure kernel window
const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type());
-
// Output auto initialization if not yet initialized
const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), IS_LOG) : dst->quantization_info();
auto_init_if_empty(*dst, TensorInfo(*src).set_quantization_info(output_quantization).reset_padding());
-
// Tmp auto initialization if not yet initialized
const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type();
auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding());
-
- const auto *uk = get_implementation_logits(SoftmaxSelectorData{ src->data_type(), CPUInfo::get() });
+ const auto *uk = CpuLogits1DSoftmaxKernel<IS_LOG>::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
-
std::string kernel_name = IS_LOG ? std::string("CpuLogits1DLogSoftmaxKernel") : std::string("CpuLogits1DSoftmaxKernel");
-
- _beta = beta;
- _run_method = uk->ukernel;
- _name = kernel_name.append("/").append(uk->name);
-
+ _beta = beta;
+ _run_method = uk->ukernel;
+ _name = kernel_name.append("/").append(uk->name);
// Configure kernel window
Window win = calculate_max_window(*max, Steps());
-
- ICpuKernel::configure(win);
+ ICPPKernel::configure(win);
}
-
template <bool IS_LOG>
Status CpuLogits1DSoftmaxKernel<IS_LOG>::validate(const ITensorInfo *src, const ITensorInfo *max,
const ITensorInfo *dst, const float beta, const ITensorInfo *tmp)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG));
-
return Status{};
}
-
template <bool IS_LOG>
void CpuLogits1DSoftmaxKernel<IS_LOG>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
-
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- auto max = tensors.get_tensor(TensorType::ACL_SRC_1);
- auto dst = tensors.get_tensor(TensorType::ACL_DST_0);
- auto tmp = tensors.get_tensor(TensorType::ACL_DST_1);
-
+ const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ auto max = tensors.get_tensor(TensorType::ACL_SRC_1);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST_0);
+ auto tmp = tensors.get_tensor(TensorType::ACL_DST_1);
const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x();
const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration;
-
ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread));
-
void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread);
_run_method(src, max, tmp_for_thread, dst, _beta, IS_LOG, window);
}
-
template <bool IS_LOG>
const char *CpuLogits1DSoftmaxKernel<IS_LOG>::name() const
{
return _name.c_str();
}
-
template class CpuLogits1DSoftmaxKernel<true>;
template class CpuLogits1DSoftmaxKernel<false>;
-
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
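
In run_op above, the shared tmp workspace is partitioned per thread: each thread owns tmp_size_for_thread = element_size() * valid_region().shape.x() bytes starting at offset thread_id * tmp_size_for_thread, and the assertion checks that tmp covers info.num_threads such slices, which is why the tmp tensor has to be sized against the maximum number of threads that will run in parallel. A small worked example under assumed values (the numbers are illustrative, not taken from this patch):

// Assumed values: F32 tmp elements, a row of 1000 elements, 8 worker threads.
const size_t element_size        = 4;
const size_t row_length          = 1000;                      // valid_region().shape.x()
const size_t tmp_size_for_thread = element_size * row_length; // 4000 bytes per thread
const size_t required_tmp_bytes  = 8 * tmp_size_for_thread;   // at least 32000 bytes in total
// thread_id == 3 writes into bytes [12000, 16000) of the tmp buffer.
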
diff --git a/src/cpu/kernels/CpuSoftmaxKernel.h b/src/cpu/kernels/CpuSoftmaxKernel.h
index 8073a677d..df7d3f7d9 100644
--- a/src/cpu/kernels/CpuSoftmaxKernel.h
+++ b/src/cpu/kernels/CpuSoftmaxKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,10 +23,8 @@
*/
#ifndef ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H
#define ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H
-
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
-
namespace arm_compute
{
namespace cpu
@@ -34,8 +32,11 @@ namespace cpu
namespace kernels
{
 /** Interface for identifying the max value of 1D Logits */
-class CpuLogits1DMaxKernel : public ICpuKernel
+class CpuLogits1DMaxKernel : public ICpuKernel<CpuLogits1DMaxKernel>
{
+private:
+ using SoftmaxLogits1DMaxKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const Window &)>::type;
+
public:
CpuLogits1DMaxKernel() = default;
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DMaxKernel);
@@ -52,27 +53,31 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
-
-private:
- using SoftmaxLogits1DMaxKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const Window &)>::type;
+ struct SoftmaxLogits1DMaxKernel
+ {
+ const char *name;
+ const DataTypeISASelectorPtr is_selected;
+ SoftmaxLogits1DMaxKernelPtr ukernel;
+ };
+ static const std::vector<SoftmaxLogits1DMaxKernel> &get_available_kernels();
private:
SoftmaxLogits1DMaxKernelPtr _run_method{ nullptr };
std::string _name{};
};
-
/** Interface for softmax computation for QASYMM8 with pre-computed max. */
template <bool IS_LOG = false>
-class CpuLogits1DSoftmaxKernel : public ICpuKernel
+class CpuLogits1DSoftmaxKernel : public ICpuKernel<CpuLogits1DSoftmaxKernel<IS_LOG>>
{
+private:
+ using SoftmaxLogits1DKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, void *const, ITensor *, float, bool, const Window &)>::type;
+
public:
CpuLogits1DSoftmaxKernel() = default;
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DSoftmaxKernel);
-
/** Set the input and output tensors.
*
* @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -92,13 +97,16 @@ public:
*/
static Status validate(const ITensorInfo *src, const ITensorInfo *max,
const ITensorInfo *dst, const float beta, const ITensorInfo *tmp);
-
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
-
-private:
- using SoftmaxLogits1DKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, void *const, ITensor *, float, bool, const Window &)>::type;
+ struct SoftmaxLogits1DKernel
+ {
+ const char *name;
+ const DataTypeISASelectorPtr is_selected;
+ SoftmaxLogits1DKernelPtr ukernel;
+ };
+ static const std::vector<SoftmaxLogits1DKernel> &get_available_kernels();
private:
float _beta{ 1.0f };
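
Throughout this patch the kernels switch from the plain ICpuKernel base to the CRTP form ICpuKernel<Derived> (CpuLogits1DMaxKernel and CpuLogits1DSoftmaxKernel above, CpuSubKernel, CpuTransposeKernel, CpuWeightsReshapeKernel and the Winograd transform interfaces further down). The apparent intent is that the templated base can host a single, statically typed get_implementation() that walks Derived::get_available_kernels(). The sketch below shows one plausible way such a base could look; it is an assumption about the design, not the library's actual ICpuKernel, and all names in it are placeholders:

#include <cstdio>
#include <vector>

struct SelectorSketch { bool fp16; };

// Hypothetical CRTP base: the selection loop is written once, typed on whatever
// kernel-entry struct and table the derived kernel class exposes.
template <class Derived>
struct ICpuKernelSketch
{
    static const auto *get_implementation(const SelectorSketch &data)
    {
        const auto &kernels             = Derived::get_available_kernels();
        decltype(kernels.data()) found  = nullptr; // typed "no match" result
        for(const auto &uk : kernels)
        {
            if(uk.is_selected(data))
            {
                return &uk;
            }
        }
        return found;
    }
};

// A derived kernel only declares its entry struct and its table.
struct CpuDemoKernel : ICpuKernelSketch<CpuDemoKernel>
{
    struct DemoKernel
    {
        const char *name;
        bool (*is_selected)(const SelectorSketch &);
    };
    static const std::vector<DemoKernel> &get_available_kernels()
    {
        static const std::vector<DemoKernel> kernels =
        {
            { "fp16_path", [](const SelectorSketch &d) { return d.fp16; } },
            { "fp32_path", [](const SelectorSketch &) { return true; } },
        };
        return kernels;
    }
};

int main()
{
    const auto *uk = CpuDemoKernel::get_implementation(SelectorSketch{ false });
    std::printf("%s\n", uk->name); // prints "fp32_path": the fp16 entry is rejected
    return 0;
}
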
diff --git a/src/cpu/kernels/CpuSubKernel.cpp b/src/cpu/kernels/CpuSubKernel.cpp
index ec65f12df..c12feb433 100644
--- a/src/cpu/kernels/CpuSubKernel.cpp
+++ b/src/cpu/kernels/CpuSubKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -39,85 +39,52 @@ namespace kernels
{
namespace
{
-struct SubSelectorData
-{
- DataType dt;
-};
-
-using SubSelectorPtr = std::add_pointer<bool(const SubSelectorData &data)>::type;
-using SubKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type;
-
-struct SubKernel
-{
- const char *name;
- const SubSelectorPtr is_selected;
- SubKernelPtr ukernel;
-};
-
-static const SubKernel available_kernels[] =
+static const std::vector<CpuSubKernel::SubKernel> available_kernels =
{
{
"neon_fp32_sub",
- [](const SubSelectorData & data) { return (data.dt == DataType::F32); },
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); },
REGISTER_FP32_NEON(arm_compute::cpu::sub_same_neon<float>)
},
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
{
"neon_fp16_sub",
- [](const SubSelectorData & data) { return (data.dt == DataType::F16); },
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.fp16; },
REGISTER_FP16_NEON(arm_compute::cpu::sub_same_neon<float16_t>)
},
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
{
"neon_u8_sub",
- [](const SubSelectorData & data) { return (data.dt == DataType::U8); },
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::U8); },
REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<uint8_t>)
},
{
"neon_s16_sub",
- [](const SubSelectorData & data) { return (data.dt == DataType::S16); },
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::S16); },
REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int16_t>)
},
{
"neon_s32_sub",
- [](const SubSelectorData & data) { return (data.dt == DataType::S32); },
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::S32); },
REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int32_t>)
},
{
"neon_qu8_sub",
- [](const SubSelectorData & data) { return (data.dt == DataType::QASYMM8); },
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); },
REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon)
},
{
"neon_qs8_sub",
- [](const SubSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon)
},
{
"neon_qs16_sub",
- [](const SubSelectorData & data) { return (data.dt == DataType::QSYMM16); },
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QSYMM16); },
REGISTER_QSYMM16_NEON(arm_compute::cpu::sub_qsymm16_neon)
},
};
-/** Micro-kernel selector
- *
- * @param[in] data Selection data passed to help pick the appropriate micro-kernel
- *
- * @return A matching micro-kernel else nullptr
- */
-const SubKernel *get_implementation(DataType dt)
-{
- for(const auto &uk : available_kernels)
- {
- if(uk.is_selected({ dt }))
- {
- return &uk;
- }
- }
- return nullptr;
-}
-
inline Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy)
{
ARM_COMPUTE_UNUSED(policy);
@@ -126,7 +93,8 @@ inline Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src
DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1);
- const auto *uk = get_implementation(src0.data_type());
+ const auto *uk = CpuSubKernel::get_implementation(DataTypeISASelectorData{ src0.data_type(), CPUInfo::get().get_isa() });
+
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape());
@@ -157,7 +125,7 @@ void CpuSubKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I
set_shape_if_empty(*dst, out_shape);
set_data_type_if_unknown(*dst, src0->data_type());
- const auto *uk = get_implementation(src0->data_type());
+ const auto *uk = CpuSubKernel::get_implementation(DataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa() });
ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
_policy = policy;
@@ -196,6 +164,12 @@ const char *CpuSubKernel::name() const
{
return _name.c_str();
}
+
+const std::vector<CpuSubKernel::SubKernel> &CpuSubKernel::get_available_kernels()
+{
+ return available_kernels;
+}
+
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuSubKernel.h b/src/cpu/kernels/CpuSubKernel.h
index 80d6be68b..323a3f131 100644
--- a/src/cpu/kernels/CpuSubKernel.h
+++ b/src/cpu/kernels/CpuSubKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,8 +34,11 @@ namespace cpu
namespace kernels
{
/** Interface for the kernel to perform subtraction between two tensors */
-class CpuSubKernel : public ICpuKernel
+class CpuSubKernel : public ICpuKernel<CpuSubKernel>
{
+private:
+ using SubKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type;
+
public:
CpuSubKernel() = default;
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuSubKernel);
@@ -70,8 +73,14 @@ public:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
-private:
- using SubKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type;
+ struct SubKernel
+ {
+ const char *name;
+ const DataTypeISASelectorPtr is_selected;
+ SubKernelPtr ukernel;
+ };
+
+ static const std::vector<SubKernel> &get_available_kernels();
private:
ConvertPolicy _policy{};
diff --git a/src/cpu/kernels/CpuTransposeKernel.h b/src/cpu/kernels/CpuTransposeKernel.h
index 6805eac64..cb85daeb4 100644
--- a/src/cpu/kernels/CpuTransposeKernel.h
+++ b/src/cpu/kernels/CpuTransposeKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,7 +34,7 @@ namespace cpu
namespace kernels
{
/** Kernel which transposes the elements of a matrix */
-class CpuTransposeKernel : public ICpuKernel
+class CpuTransposeKernel : public ICpuKernel<CpuTransposeKernel>
{
public:
CpuTransposeKernel() = default;
diff --git a/src/cpu/kernels/CpuWeightsReshapeKernel.h b/src/cpu/kernels/CpuWeightsReshapeKernel.h
index c80bf3b25..1a260edc9 100644
--- a/src/cpu/kernels/CpuWeightsReshapeKernel.h
+++ b/src/cpu/kernels/CpuWeightsReshapeKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -56,7 +56,7 @@ namespace kernels
* \end{array} \right)
* @f]
*/
-class CpuWeightsReshapeKernel : public ICpuKernel
+class CpuWeightsReshapeKernel : public ICpuKernel<CpuWeightsReshapeKernel>
{
public:
/** Default constructor */
diff --git a/src/cpu/kernels/CpuWinogradConv2dKernel.h b/src/cpu/kernels/CpuWinogradConv2dKernel.h
index db2d8acfd..6909216d9 100644
--- a/src/cpu/kernels/CpuWinogradConv2dKernel.h
+++ b/src/cpu/kernels/CpuWinogradConv2dKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,7 +35,7 @@ namespace arm_compute
namespace cpu
{
/** Interface for the kernel to perform Winograd input transform. */
-class ICpuWinogradConv2dTransformInputKernel : public ICpuKernel
+class ICpuWinogradConv2dTransformInputKernel : public ICpuKernel<ICpuWinogradConv2dTransformInputKernel>
{
public:
/** Get the working space required to perform the transformation.
@@ -216,7 +216,7 @@ private:
};
/** Interface for the kernel to perform Winograd output transform. */
-class ICpuWinogradConv2dTransformOutputKernel : public ICpuKernel
+class ICpuWinogradConv2dTransformOutputKernel : public ICpuKernel<ICpuWinogradConv2dTransformOutputKernel>
{
public:
/** Get the working space required to perform the transformation.
@@ -418,7 +418,7 @@ private:
};
/** Interface for the kernel to perform Winograd weights transform. */
-class ICpuWinogradConv2dTransformWeightsKernel : public ICpuKernel
+class ICpuWinogradConv2dTransformWeightsKernel : public ICpuKernel<ICpuWinogradConv2dTransformWeightsKernel>
{
public:
/** Prevent instances of this class from being copied (As this class contains pointers) */
diff --git a/src/cpu/kernels/activation/generic/neon/fp16.cpp b/src/cpu/kernels/activation/generic/neon/fp16.cpp
new file mode 100644
index 000000000..e51b5b342
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/neon/fp16.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "src/cpu/kernels/activation/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+constexpr ActFpImplParams Fp16Params = { static_cast<float16_t>(1e-7), 8 };
+} // namespace
+
+void neon_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+ fp_neon_activation_impl<float16_t, Fp16Params>(src, dst, act_info, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ \ No newline at end of file
diff --git a/src/cpu/kernels/activation/generic/neon/fp32.cpp b/src/cpu/kernels/activation/generic/neon/fp32.cpp
new file mode 100644
index 000000000..2a3b8a0bf
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/neon/fp32.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/activation/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+constexpr ActFpImplParams Fp32Params = { static_cast<float>(1e-24), 4 };
+} // namespace
+void neon_fp32_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+ fp_neon_activation_impl<float, Fp32Params>(src, dst, act_info, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/activation/neon/fp32.cpp b/src/cpu/kernels/activation/generic/neon/impl.h
index 54301d45a..2dd239e3a 100644
--- a/src/cpu/kernels/activation/neon/fp32.cpp
+++ b/src/cpu/kernels/activation/generic/neon/impl.h
@@ -22,72 +22,73 @@
* SOFTWARE.
*/
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
-
-#include <arm_neon.h>
-#include <cmath>
-#include <cstddef>
-
namespace arm_compute
{
namespace cpu
{
-namespace
+/** Constant parameters needed by the activation implementation.
+ * These parameters differ for each floating-point type.
+ *
+ * @note These are passed as a struct because C++ does not allow float as a non-type template parameter until C++20.
+ **/
+struct ActFpImplParams
{
+ float delta; /**< Minimum delta needed to avoid NaN on corner-cases of elementary functions */
+ int step_x; /**< Window step at the x dimension */
+};
+
#ifndef __aarch64__
inline float32x4_t mask_float_vector(const float32x4_t &in, const uint32x4_t &mask)
{
auto int_in = vreinterpretq_u32_f32(in);
return vreinterpretq_f32_u32(wrapper::vand(int_in, mask));
}
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+inline float16x8_t mask_float_vector(const float16x8_t &in, const uint16x8_t &mask)
+{
+ auto int_in = vreinterpretq_u16_f16(in);
+ return vreinterpretq_f16_u16(wrapper::vand(int_in, mask));
+}
+#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
#endif /* __aarch64__ */
-} // namespace
-void fp32_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+template <typename T, const ActFpImplParams &P>
+void fp_neon_activation_impl(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
{
/** SIMD vector tag type. */
- using ExactTagType = typename arm_compute::wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
-
- constexpr int window_step_x = 4;
+ using ExactTagType = typename arm_compute::wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+ constexpr int window_step_x = P.step_x;
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
const ActivationLayerInfo::ActivationFunction act = act_info.activation();
-
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
Iterator input(src, win_collapsed);
Iterator output(dst, win_collapsed);
-
// In case of non-aarch64, a small delta value is added to the input
// to prevent NAN values caused by zeros in inputs to SQRT.
 // In case of aarch64, we call vsqrt directly, so we don't use delta.
#ifndef __aarch64__
- const auto delta = wrapper::vdup_n(static_cast<float>(1e-24), ExactTagType {});
+ const auto delta = wrapper::vdup_n(static_cast<T>(P.delta), ExactTagType {});
#endif /* __aarch64__ */
- const auto const_1 = wrapper::vdup_n(static_cast<float>(1.f), ExactTagType {});
- const auto const_0 = wrapper::vdup_n(static_cast<float>(0.f), ExactTagType{});
- const auto const_6 = wrapper::vdup_n(static_cast<float>(6.f), ExactTagType{});
- const auto const_3 = wrapper::vdup_n(static_cast<float>(3.f), ExactTagType{});
- const auto const_inv_6 = wrapper::vdup_n(static_cast<float>(0.166666667f), ExactTagType{});
-
+ const auto const_1 = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType {});
+ const auto const_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ const auto const_6 = wrapper::vdup_n(static_cast<T>(6.f), ExactTagType{});
+ const auto const_3 = wrapper::vdup_n(static_cast<T>(3.f), ExactTagType{});
+ const auto const_inv_6 = wrapper::vdup_n(static_cast<T>(0.166666667f), ExactTagType{});
constexpr float soft_relu_thresh = 12.f;
- const auto vsoft_relu_thresh = wrapper::vdup_n(static_cast<float>(soft_relu_thresh), ExactTagType{});
-
- const auto va = wrapper::vdup_n(static_cast<float>(act_info.a()), ExactTagType{});
- const auto vb = wrapper::vdup_n(static_cast<float>(act_info.b()), ExactTagType{});
- const auto a = static_cast<float>(act_info.a());
- const auto b = static_cast<float>(act_info.b());
+ const auto vsoft_relu_thresh = wrapper::vdup_n(static_cast<T>(soft_relu_thresh), ExactTagType{});
+ const auto va = wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{});
+ const auto vb = wrapper::vdup_n(static_cast<T>(act_info.b()), ExactTagType{});
+ const auto a = static_cast<T>(act_info.a());
+ const auto b = static_cast<T>(act_info.b());
execute_window_loop(win_collapsed, [&](const Coordinates &)
{
- const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float *>(output.ptr());
-
- wrapper::traits::neon_bitvector_t<float, wrapper::traits::BitWidth::W128> tmp;
-
+ const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+ wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp;
// Compute S elements per iteration
int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
@@ -150,12 +151,11 @@ void fp32_neon_activation(const ITensor *src, ITensor *dst, const ActivationLaye
}
wrapper::vstore(output_ptr + x, tmp);
}
-
// Compute left-over elements
for(; x < window_end_x; ++x)
{
- const float in = *(reinterpret_cast<const float *>(input_ptr + x));
- float tmp;
+ const T in = *(reinterpret_cast<const T *>(input_ptr + x));
+ T tmp;
switch(act)
{
case ActivationLayerInfo::ActivationFunction::ABS:
@@ -165,22 +165,22 @@ void fp32_neon_activation(const ITensor *src, ITensor *dst, const ActivationLaye
tmp = a * in + b;
break;
case ActivationLayerInfo::ActivationFunction::LOGISTIC:
- tmp = static_cast<float>(1) / (static_cast<float>(1) + std::exp(-in));
+ tmp = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-in));
break;
case ActivationLayerInfo::ActivationFunction::RELU:
- tmp = std::max<float>(static_cast<float>(0), in);
+ tmp = std::max<T>(static_cast<T>(0), in);
break;
case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- tmp = std::min<float>(a, std::max(static_cast<float>(0), in));
+ tmp = std::min<T>(a, std::max(static_cast<T>(0), in));
break;
case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- tmp = std::min<float>(a, std::max<float>(b, in));
+ tmp = std::min<T>(a, std::max<T>(b, in));
break;
case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
tmp = (in > 0) ? in : a * in;
break;
case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
- tmp = (in > soft_relu_thresh) ? in : std::log(static_cast<float>(1) + std::exp(in));
+ tmp = (in > soft_relu_thresh) ? in : std::log(static_cast<T>(1) + std::exp(in));
break;
case ActivationLayerInfo::ActivationFunction::ELU:
tmp = (in >= 0) ? in : a * (std::exp(in) - 1);
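
The templated implementation above passes ActFpImplParams by const reference as a non-type template parameter, which is what the @note above alludes to: before C++20 a bare float cannot be a template parameter, but a reference to a namespace-scope constexpr object can. A small self-contained sketch of the same idiom; the struct, values and function names here are illustrative only:

#include <cstdio>

// Compile-time parameter bundle: floats cannot be non-type template
// parameters before C++20, but a reference to a constexpr object can.
struct FpParams
{
    float delta;
    int   step_x;
};

constexpr FpParams kFp32Params = { 1e-24f, 4 };
constexpr FpParams kFp16Params = { 1e-7f, 8 };

template <typename T, const FpParams &P>
T add_delta(T x)
{
    // P.delta and P.step_x are constant expressions here, exactly like
    // the window step and SQRT delta in fp_neon_activation_impl.
    return x + static_cast<T>(P.delta);
}

int main()
{
    std::printf("%g\n", add_delta<float, kFp32Params>(1.0f));
    std::printf("%d\n", kFp16Params.step_x);
    return 0;
}
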
diff --git a/src/cpu/kernels/activation/neon/qasymm8.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
index a1217435b..62e329e69 100644
--- a/src/cpu/kernels/activation/neon/qasymm8.cpp
+++ b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
@@ -36,7 +36,7 @@ namespace arm_compute
{
namespace cpu
{
-void qasymm8_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void neon_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
{
constexpr int window_step_x = 16;
const auto window_start_x = static_cast<int>(window.x().start());
diff --git a/src/cpu/kernels/activation/neon/qasymm8_signed.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp
index 8b40bf8e7..4dca1ba79 100644
--- a/src/cpu/kernels/activation/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp
@@ -35,7 +35,7 @@ namespace arm_compute
{
namespace cpu
{
-void qasymm8_signed_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void neon_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
{
constexpr int window_step_x = 16;
const auto window_start_x = static_cast<int>(window.x().start());
diff --git a/src/cpu/kernels/activation/neon/qsymm16.cpp b/src/cpu/kernels/activation/generic/neon/qsymm16.cpp
index 54b41820f..865b9f114 100644
--- a/src/cpu/kernels/activation/neon/qsymm16.cpp
+++ b/src/cpu/kernels/activation/generic/neon/qsymm16.cpp
@@ -37,7 +37,7 @@ namespace arm_compute
{
namespace cpu
{
-void qsymm16_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void neon_qsymm16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
{
constexpr int window_step_x = 8;
const auto window_start_x = static_cast<int>(window.x().start());
diff --git a/src/cpu/kernels/activation/sve/fp16.cpp b/src/cpu/kernels/activation/generic/sve/fp16.cpp
index 5e76e82c5..47d9fabb5 100644
--- a/src/cpu/kernels/activation/sve/fp16.cpp
+++ b/src/cpu/kernels/activation/generic/sve/fp16.cpp
@@ -36,7 +36,7 @@ namespace arm_compute
{
namespace cpu
{
-void fp16_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void sve_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
{
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
diff --git a/src/cpu/kernels/activation/sve/fp32.cpp b/src/cpu/kernels/activation/generic/sve/fp32.cpp
index cb9f82eb3..1685b0f66 100644
--- a/src/cpu/kernels/activation/sve/fp32.cpp
+++ b/src/cpu/kernels/activation/generic/sve/fp32.cpp
@@ -36,7 +36,7 @@ namespace arm_compute
{
namespace cpu
{
-void fp32_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void sve_fp32_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
{
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
diff --git a/src/cpu/kernels/activation/sve/qasymm8.cpp b/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp
index 69fffd96c..3b99c0f12 100644
--- a/src/cpu/kernels/activation/sve/qasymm8.cpp
+++ b/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp
@@ -36,7 +36,7 @@ namespace arm_compute
{
namespace cpu
{
-void qasymm8_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void sve2_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
{
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
diff --git a/src/cpu/kernels/activation/sve/qasymm8_signed.cpp b/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp
index 53ee515ff..24415145d 100644
--- a/src/cpu/kernels/activation/sve/qasymm8_signed.cpp
+++ b/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp
@@ -37,7 +37,7 @@ namespace arm_compute
{
namespace cpu
{
-void qasymm8_signed_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void sve2_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
{
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
diff --git a/src/cpu/kernels/activation/sve/qsymm16.cpp b/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp
index ac549770a..0eecfa618 100644
--- a/src/cpu/kernels/activation/sve/qsymm16.cpp
+++ b/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp
@@ -38,7 +38,7 @@ namespace arm_compute
{
namespace cpu
{
-void qsymm16_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void sve2_qsymm16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
{
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
diff --git a/src/cpu/kernels/activation/list.h b/src/cpu/kernels/activation/list.h
index 409d025db..bf9aa0f37 100644
--- a/src/cpu/kernels/activation/list.h
+++ b/src/cpu/kernels/activation/list.h
@@ -31,16 +31,16 @@ namespace cpu
#define DECLARE_ACTIVATION_KERNEL(func_name) \
void func_name(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
-DECLARE_ACTIVATION_KERNEL(qasymm8_neon_activation);
-DECLARE_ACTIVATION_KERNEL(qasymm8_sve_activation);
-DECLARE_ACTIVATION_KERNEL(qasymm8_signed_neon_activation);
-DECLARE_ACTIVATION_KERNEL(qasymm8_signed_sve_activation);
-DECLARE_ACTIVATION_KERNEL(qsymm16_neon_activation);
-DECLARE_ACTIVATION_KERNEL(qsymm16_sve_activation);
-DECLARE_ACTIVATION_KERNEL(fp16_neon_activation);
-DECLARE_ACTIVATION_KERNEL(fp16_sve_activation);
-DECLARE_ACTIVATION_KERNEL(fp32_neon_activation);
-DECLARE_ACTIVATION_KERNEL(fp32_sve_activation);
+DECLARE_ACTIVATION_KERNEL(neon_qasymm8_activation);
+DECLARE_ACTIVATION_KERNEL(sve2_qasymm8_activation);
+DECLARE_ACTIVATION_KERNEL(neon_qasymm8_signed_activation);
+DECLARE_ACTIVATION_KERNEL(sve2_qasymm8_signed_activation);
+DECLARE_ACTIVATION_KERNEL(neon_qsymm16_activation);
+DECLARE_ACTIVATION_KERNEL(sve2_qsymm16_activation);
+DECLARE_ACTIVATION_KERNEL(sve_fp16_activation);
+DECLARE_ACTIVATION_KERNEL(sve_fp32_activation);
+DECLARE_ACTIVATION_KERNEL(neon_fp16_activation);
+DECLARE_ACTIVATION_KERNEL(neon_fp32_activation);
#undef DECLARE_ACTIVATION_KERNEL
} // namespace cpu
diff --git a/src/cpu/kernels/activation/neon/fp16.cpp b/src/cpu/kernels/activation/neon/fp16.cpp
deleted file mode 100644
index 6f2d5d853..000000000
--- a/src/cpu/kernels/activation/neon/fp16.cpp
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/NEMath.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
-#include <arm_neon.h>
-#include <cmath>
-#include <cstddef>
-
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace
-{
-#ifndef __aarch64__
-inline float16x8_t mask_float_vector(const float16x8_t &in, const uint16x8_t &mask)
-{
- auto int_in = vreinterpretq_u16_f16(in);
- return vreinterpretq_f16_u16(wrapper::vand(int_in, mask));
-}
-#endif /* __aarch64__ */
-} // namespace
-
-void fp16_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
-{
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float16_t, wrapper::traits::BitWidth::W128>;
- const ActivationLayerInfo::ActivationFunction act = act_info.activation();
-
- constexpr int window_step_x = 8;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(src, win_collapsed);
- Iterator output(dst, win_collapsed);
-
- // In case of non-aarch64, a small delta value is added to the input
- // to prevent NAN values caused by zeros in inputs to SQRT.
- // In case of aarh64, we call vsqrt directly, so we don't use delta.
-#ifndef __aarch64__
- const auto delta = wrapper::vdup_n(static_cast<float16_t>((1e-7), ExactTagType {}));
-#endif /* __aarch64__ */
-
- const auto const_1 = wrapper::vdup_n(static_cast<float16_t>(1.f), ExactTagType{});
- const auto const_0 = wrapper::vdup_n(static_cast<float16_t>(0.f), ExactTagType{});
- const auto const_6 = wrapper::vdup_n(static_cast<float16_t>(6.f), ExactTagType{});
- const auto const_3 = wrapper::vdup_n(static_cast<float16_t>(3.f), ExactTagType{});
- const auto const_inv_6 = wrapper::vdup_n(static_cast<float16_t>(0.166666667f), ExactTagType{});
-
- constexpr float soft_relu_thresh = 12.f;
- const auto vsoft_relu_thresh = wrapper::vdup_n(static_cast<float16_t>(soft_relu_thresh), ExactTagType{});
-
- const auto va = wrapper::vdup_n(static_cast<float16_t>(act_info.a()), ExactTagType{});
- const auto vb = wrapper::vdup_n(static_cast<float16_t>(act_info.b()), ExactTagType{});
- const auto a = static_cast<float16_t>(act_info.a());
- const auto b = static_cast<float16_t>(act_info.b());
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
-
- wrapper::traits::neon_bitvector_t<float16_t, wrapper::traits::BitWidth::W128> tmp;
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(input_ptr + x);
- switch(act)
- {
- case ActivationLayerInfo::ActivationFunction::ABS:
- tmp = wrapper::vabs(vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LINEAR:
- tmp = wrapper::vmla(vb, va, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LOGISTIC:
- tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin))));
- break;
- case ActivationLayerInfo::ActivationFunction::RELU:
- tmp = wrapper::vmax(const_0, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- tmp = wrapper::vmin(va, wrapper::vmax(vb, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
- tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
- tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin, wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin))));
- break;
- case ActivationLayerInfo::ActivationFunction::ELU:
- tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1)));
- break;
- case ActivationLayerInfo::ActivationFunction::SQRT:
-#ifdef __aarch64__
- tmp = wrapper::vsqrt(vin);
-#else /* __aarch64__ */
- {
- const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0, ExactTagType{}));
- tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask))));
- tmp = mask_float_vector(tmp, wrapper::vnot(bitmask));
- }
-#endif /* __aarch64__ */
- break;
- case ActivationLayerInfo::ActivationFunction::SQUARE:
- tmp = wrapper::vmul(vin, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::TANH:
- tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin)));
- break;
- case ActivationLayerInfo::ActivationFunction::IDENTITY:
- tmp = vin;
- break;
- case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
- tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_6, wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3)))));
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- wrapper::vstore(output_ptr + x, tmp);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float16_t in = *(reinterpret_cast<const float16_t *>(input_ptr + x));
- float16_t tmp;
- switch(act)
- {
- case ActivationLayerInfo::ActivationFunction::ABS:
- tmp = std::abs(in);
- break;
- case ActivationLayerInfo::ActivationFunction::LINEAR:
- tmp = a * in + b;
- break;
- case ActivationLayerInfo::ActivationFunction::LOGISTIC:
- tmp = static_cast<float16_t>(1) / (static_cast<float16_t>(1) + std::exp(-in));
- break;
- case ActivationLayerInfo::ActivationFunction::RELU:
- tmp = std::max<float16_t>(static_cast<float16_t>(0), in);
- break;
- case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- tmp = std::min<float16_t>(a, std::max(static_cast<float16_t>(0), in));
- break;
- case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- tmp = std::min<float16_t>(a, std::max<float16_t>(b, in));
- break;
- case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
- tmp = (in > 0) ? in : a * in;
- break;
- case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
- tmp = (in > soft_relu_thresh) ? in : std::log(static_cast<float16_t>(1) + std::exp(in));
- break;
- case ActivationLayerInfo::ActivationFunction::ELU:
- tmp = (in >= 0) ? in : a * (std::exp(in) - 1);
- break;
- case ActivationLayerInfo::ActivationFunction::SQRT:
- tmp = std::sqrt(in);
- break;
- case ActivationLayerInfo::ActivationFunction::SQUARE:
- tmp = in * in;
- break;
- case ActivationLayerInfo::ActivationFunction::TANH:
- tmp = a * std::tanh(b * in);
- break;
- case ActivationLayerInfo::ActivationFunction::IDENTITY:
- tmp = in;
- break;
- case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
- tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f);
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- *(output_ptr + x) = tmp;
- }
- },
- input, output);
-}
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/add/generic/neon/fp16.cpp b/src/cpu/kernels/add/generic/neon/fp16.cpp
new file mode 100644
index 000000000..12d4a467b
--- /dev/null
+++ b/src/cpu/kernels/add/generic/neon/fp16.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "src/cpu/kernels/add/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void add_fp16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ return add_same_neon<float16_t>(src0, src1, dst, policy, window);
+}
+}
+} // namespace arm_compute
+#endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/add/generic/neon/fp32.cpp b/src/cpu/kernels/add/generic/neon/fp32.cpp
new file mode 100644
index 000000000..3563162fc
--- /dev/null
+++ b/src/cpu/kernels/add/generic/neon/fp32.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/add/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void add_fp32_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ return add_same_neon<float>(src0, src1, dst, policy, window);
+}
+}
+} // namespace arm_compute
diff --git a/src/cpu/kernels/add/neon/list.h b/src/cpu/kernels/add/generic/neon/impl.cpp
index 379bd32fb..ad3e445ab 100644
--- a/src/cpu/kernels/add/neon/list.h
+++ b/src/cpu/kernels/add/generic/neon/impl.cpp
@@ -21,26 +21,15 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_CORE_NEON_KERNELS_ADD_LIST_H
-#define SRC_CORE_NEON_KERNELS_ADD_LIST_H
-#include "arm_compute/core/Types.h"
+#include "src/cpu/kernels/add/generic/neon/impl.h"
+#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/utils/misc/Traits.h"
#include "src/core/NEON/wrapper/wrapper.h"
-
namespace arm_compute
{
namespace cpu
{
-#define DECLARE_ADD_KERNEL(func_name) \
- void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-
-DECLARE_ADD_KERNEL(add_qasymm8_neon);
-DECLARE_ADD_KERNEL(add_qasymm8_signed_neon);
-DECLARE_ADD_KERNEL(add_qsymm16_neon);
-
-#undef DECLARE_ADD_KERNEL
-
template <typename ScalarType>
void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
@@ -138,6 +127,15 @@ void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const
input1, input2, output);
}
}
+
+template void add_same_neon<float>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_neon<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_neon<int32_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_neon<int16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+template void add_same_neon<float16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+#endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
+
} // namespace cpu
} // namespace arm_compute
-#endif // SRC_CORE_NEON_KERNELS_ADD_LIST_H
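
With this move the add_same_neon template is declared in impl.h but defined only in impl.cpp, so the per-type translation units (fp32.cpp, fp16.cpp, integer.cpp) never see its definition; the explicit instantiations added at the end of impl.cpp exist to emit the symbols those files link against. The same split in miniature, with illustrative file names and a placeholder function:

// impl.h - declaration only; callers include this.
template <typename ScalarType>
ScalarType scale_value(ScalarType v);

// impl.cpp - definition plus explicit instantiations for every type callers need.
template <typename ScalarType>
ScalarType scale_value(ScalarType v)
{
    return v * ScalarType(2);
}
template float scale_value<float>(float);
template int   scale_value<int>(int);

// fp32.cpp - sees only the declaration and links against the instantiation above.
float scale_fp32(float v)
{
    return scale_value<float>(v);
}
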
diff --git a/src/cpu/kernels/add/generic/neon/impl.h b/src/cpu/kernels/add/generic/neon/impl.h
new file mode 100644
index 000000000..07afdda22
--- /dev/null
+++ b/src/cpu/kernels/add/generic/neon/impl.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_ADD_IMPL_H
+#define SRC_CORE_NEON_KERNELS_ADD_IMPL_H
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Window.h"
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename ScalarType>
+void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+} // namespace cpu
+} // namespace arm_compute
+#endif // SRC_CORE_NEON_KERNELS_ADD_IMPL_H \ No newline at end of file
diff --git a/src/cpu/kernels/add/generic/neon/integer.cpp b/src/cpu/kernels/add/generic/neon/integer.cpp
new file mode 100644
index 000000000..62c19e66b
--- /dev/null
+++ b/src/cpu/kernels/add/generic/neon/integer.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/add/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void add_u8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ return add_same_neon<uint8_t>(src0, src1, dst, policy, window);
+}
+
+void add_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ return add_same_neon<int16_t>(src0, src1, dst, policy, window);
+}
+
+void add_s32_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ return add_same_neon<int32_t>(src0, src1, dst, policy, window);
+}
+}
+} // namespace arm_compute
diff --git a/src/cpu/kernels/add/neon/qasymm8.cpp b/src/cpu/kernels/add/generic/neon/qasymm8.cpp
index e357a7ef7..e357a7ef7 100644
--- a/src/cpu/kernels/add/neon/qasymm8.cpp
+++ b/src/cpu/kernels/add/generic/neon/qasymm8.cpp
diff --git a/src/cpu/kernels/add/neon/qasymm8_signed.cpp b/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp
index d62d0739f..d62d0739f 100644
--- a/src/cpu/kernels/add/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp
diff --git a/src/cpu/kernels/add/neon/qsymm16.cpp b/src/cpu/kernels/add/generic/neon/qsymm16.cpp
index e76e408d6..e76e408d6 100644
--- a/src/cpu/kernels/add/neon/qsymm16.cpp
+++ b/src/cpu/kernels/add/generic/neon/qsymm16.cpp
diff --git a/src/cpu/kernels/add/generic/sve/fp16.cpp b/src/cpu/kernels/add/generic/sve/fp16.cpp
new file mode 100644
index 000000000..7dc3142d3
--- /dev/null
+++ b/src/cpu/kernels/add/generic/sve/fp16.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "src/cpu/kernels/add/generic/sve/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void add_fp16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ return add_same_sve<float16_t>(src0, src1, dst, policy, window);
+}
+}
+} // namespace arm_compute
+#endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
+#endif /* #if defined(ARM_COMPUTE_ENABLE_SVE) */ \ No newline at end of file
diff --git a/src/cpu/kernels/add/generic/sve/fp32.cpp b/src/cpu/kernels/add/generic/sve/fp32.cpp
new file mode 100644
index 000000000..5383b887b
--- /dev/null
+++ b/src/cpu/kernels/add/generic/sve/fp32.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "src/cpu/kernels/add/generic/sve/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void add_fp32_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ return add_same_sve<float>(src0, src1, dst, policy, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif //ARM_COMPUTE_ENABLE_SVE \ No newline at end of file
diff --git a/src/cpu/kernels/add/sve/impl.cpp b/src/cpu/kernels/add/generic/sve/impl.cpp
index f8e16a508..615f346df 100644
--- a/src/cpu/kernels/add/sve/impl.cpp
+++ b/src/cpu/kernels/add/generic/sve/impl.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,17 +21,13 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#include "src/cpu/kernels/add/generic/sve/impl.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-
#include "src/core/NEON/SVEMath.h"
-#include "src/cpu/kernels/add/sve/impl.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
#include <arm_sve.h>
-
namespace arm_compute
{
namespace cpu
@@ -128,12 +124,13 @@ void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const
input1, input2, output);
}
}
-
template void add_same_sve<float>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-template void add_same_sve<float16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
template void add_same_sve<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
template void add_same_sve<int16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
template void add_same_sve<int32_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+template void add_same_sve<float16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
} // namespace cpu
} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE) */ \ No newline at end of file
+#endif // ARM_COMPUTE_ENABLE_SVE \ No newline at end of file
diff --git a/src/cpu/kernels/add/sve/impl.h b/src/cpu/kernels/add/generic/sve/impl.h
index 32ff5d049..219dbc9b2 100644
--- a/src/cpu/kernels/add/sve/impl.h
+++ b/src/cpu/kernels/add/generic/sve/impl.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,12 +21,12 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#ifndef SRC_CORE_SVE_KERNELS_ADD_IMPL_H
#define SRC_CORE_SVE_KERNELS_ADD_IMPL_H
-
-#if defined(ARM_COMPUTE_ENABLE_SVE)
+#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
+#include "arm_compute/core/Window.h"
namespace arm_compute
{
@@ -36,5 +36,5 @@ template <typename ScalarType>
void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
} // namespace cpu
} // namespace arm_compute
-#endif // defined(ARM_COMPUTE_ENABLE_SVE)
-#endif // SRC_CORE_SVE_KERNELS_ADD_IMPL_H \ No newline at end of file
+#endif // SRC_CORE_SVE_KERNELS_ADD_IMPL_H
+#endif // ARM_COMPUTE_ENABLE_SVE \ No newline at end of file
diff --git a/src/cpu/kernels/add/generic/sve/integer.cpp b/src/cpu/kernels/add/generic/sve/integer.cpp
new file mode 100644
index 000000000..1d25d3463
--- /dev/null
+++ b/src/cpu/kernels/add/generic/sve/integer.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "src/cpu/kernels/add/generic/sve/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void add_u8_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ return add_same_sve<uint8_t>(src0, src1, dst, policy, window);
+}
+
+void add_s16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ return add_same_sve<int16_t>(src0, src1, dst, policy, window);
+}
+
+void add_s32_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+ return add_same_sve<int32_t>(src0, src1, dst, policy, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif //ARM_COMPUTE_ENABLE_SVE \ No newline at end of file
diff --git a/src/cpu/kernels/add/sve/qasymm8.cpp b/src/cpu/kernels/add/generic/sve2/qasymm8.cpp
index 888ad878c..5fe9b95a4 100644
--- a/src/cpu/kernels/add/sve/qasymm8.cpp
+++ b/src/cpu/kernels/add/generic/sve2/qasymm8.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,7 +34,7 @@ namespace arm_compute
{
namespace cpu
{
-void add_qasymm8_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_qasymm8_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
ARM_COMPUTE_UNUSED(policy);
@@ -179,4 +179,4 @@ void add_qasymm8_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, con
}
} // namespace cpu
} // namespace arm_compute
-#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ \ No newline at end of file
+#endif //ARM_COMPUTE_ENABLE_SVE2 \ No newline at end of file
diff --git a/src/cpu/kernels/add/sve/qasymm8_signed.cpp b/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp
index 3b922c6c2..9135dfdcf 100644
--- a/src/cpu/kernels/add/sve/qasymm8_signed.cpp
+++ b/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,7 +34,7 @@ namespace arm_compute
{
namespace cpu
{
-void add_qasymm8_signed_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_qasymm8_signed_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
ARM_COMPUTE_UNUSED(policy);
@@ -178,4 +178,4 @@ void add_qasymm8_signed_sve(const ITensor *src0, const ITensor *src1, ITensor *d
}
} // namespace cpu
} // namespace arm_compute
-#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ \ No newline at end of file
+#endif //ARM_COMPUTE_ENABLE_SVE2 \ No newline at end of file
diff --git a/src/cpu/kernels/add/sve/qsymm16.cpp b/src/cpu/kernels/add/generic/sve2/qsymm16.cpp
index eef5d245d..723d2a6c9 100644
--- a/src/cpu/kernels/add/sve/qsymm16.cpp
+++ b/src/cpu/kernels/add/generic/sve2/qsymm16.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,7 +34,7 @@ namespace arm_compute
{
namespace cpu
{
-void add_qsymm16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_qsymm16_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
ARM_COMPUTE_UNUSED(policy);
@@ -153,4 +153,4 @@ void add_qsymm16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, con
}
} // namespace cpu
} // namespace arm_compute
-#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ \ No newline at end of file
+#endif //ARM_COMPUTE_ENABLE_SVE2 \ No newline at end of file
diff --git a/src/cpu/kernels/add/sve/list.h b/src/cpu/kernels/add/list.h
index 4529a9f7c..9d7c9a67f 100644
--- a/src/cpu/kernels/add/sve/list.h
+++ b/src/cpu/kernels/add/list.h
@@ -21,16 +21,11 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_CORE_SVE_KERNELS_ADD_LIST_H
-#define SRC_CORE_SVE_KERNELS_ADD_LIST_H
+#ifndef SRC_CORE_KERNELS_ADD_LIST_H
+#define SRC_CORE_KERNELS_ADD_LIST_H
-#if defined(ARM_COMPUTE_ENABLE_SVE)
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/SVEMath.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/cpu/kernels/add/sve/impl.h"
-#include <arm_sve.h>
+#include "src/cpu/kernels/add/generic/neon/impl.h"
+#include "src/cpu/kernels/add/generic/sve/impl.h"
namespace arm_compute
{
@@ -39,13 +34,25 @@ namespace cpu
#define DECLARE_ADD_KERNEL(func_name) \
void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-DECLARE_ADD_KERNEL(add_qasymm8_sve);
-DECLARE_ADD_KERNEL(add_qasymm8_signed_sve);
-DECLARE_ADD_KERNEL(add_qsymm16_sve);
+DECLARE_ADD_KERNEL(add_qasymm8_neon);
+DECLARE_ADD_KERNEL(add_qasymm8_signed_neon);
+DECLARE_ADD_KERNEL(add_qsymm16_neon);
+DECLARE_ADD_KERNEL(add_fp32_neon);
+DECLARE_ADD_KERNEL(add_fp16_neon);
+DECLARE_ADD_KERNEL(add_u8_neon);
+DECLARE_ADD_KERNEL(add_s16_neon);
+DECLARE_ADD_KERNEL(add_s32_neon);
+DECLARE_ADD_KERNEL(add_fp32_sve);
+DECLARE_ADD_KERNEL(add_fp16_sve);
+DECLARE_ADD_KERNEL(add_u8_sve);
+DECLARE_ADD_KERNEL(add_s16_sve);
+DECLARE_ADD_KERNEL(add_s32_sve);
+DECLARE_ADD_KERNEL(add_qasymm8_sve2);
+DECLARE_ADD_KERNEL(add_qasymm8_signed_sve2);
+DECLARE_ADD_KERNEL(add_qsymm16_sve2);
#undef DECLARE_ADD_KERNEL
} // namespace cpu
} // namespace arm_compute
-#endif // defined(ARM_COMPUTE_ENABLE_SVE)
-#endif // SRC_CORE_SVE_KERNELS_ADD_LIST_H \ No newline at end of file
+#endif // SRC_CORE_KERNELS_ADD_LIST_H \ No newline at end of file
diff --git a/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
index ff8b0b143..10bf8e4ff 100644
--- a/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
+++ b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -120,13 +120,14 @@ public:
* @param[in] platform The CPU platform used to create the context.
* @param[in] thread_count Number of threads in the execution.
*
- * @return[out] small_network_mws Minimum workload size for requsted configuration.
+ * @return[out] small_network_mws Minimum workload size for requested configuration.
*/
size_t get_mws(const CPUInfo &platform, size_t thread_count) const override
{
- ARM_COMPUTE_UNUSED(platform, thread_count);
+ ARM_COMPUTE_UNUSED(thread_count);
+ ARM_COMPUTE_UNUSED(platform);
- return ICPPKernel::small_network_mws;
+ return ICPPKernel::default_mws;
}
private:
diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp
new file mode 100644
index 000000000..6826ff669
--- /dev/null
+++ b/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+#include "src/cpu/kernels/boundingboxtransform/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp16_boundingboxtransform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)
+{
+ return bounding_box_transform<float16_t>(boxes, pred_boxes, deltas, bbinfo, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp
new file mode 100644
index 000000000..34ff9224d
--- /dev/null
+++ b/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/boundingboxtransform/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp32_boundingboxtransform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)
+{
+ return bounding_box_transform<float>(boxes, pred_boxes, deltas, bbinfo, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp
new file mode 100644
index 000000000..2d08c879c
--- /dev/null
+++ b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2019-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/boundingboxtransform/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void bounding_box_transform_qsymm16(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)
+
+{
+ const size_t num_classes = deltas->info()->tensor_shape()[0] >> 2;
+ const size_t deltas_width = deltas->info()->tensor_shape()[0];
+ const int img_h = std::floor(bbinfo.img_height() / bbinfo.scale() + 0.5f);
+ const int img_w = std::floor(bbinfo.img_width() / bbinfo.scale() + 0.5f);
+
+ const auto scale_after = (bbinfo.apply_scale() ? bbinfo.scale() : 1.f);
+ const auto scale_before = bbinfo.scale();
+ const auto offset = (bbinfo.correct_transform_coords() ? 1.f : 0.f);
+
+ auto pred_ptr = reinterpret_cast<uint16_t *>(pred_boxes->buffer() + pred_boxes->info()->offset_first_element_in_bytes());
+ auto delta_ptr = reinterpret_cast<uint8_t *>(deltas->buffer() + deltas->info()->offset_first_element_in_bytes());
+
+ const auto boxes_qinfo = boxes->info()->quantization_info().uniform();
+ const auto deltas_qinfo = deltas->info()->quantization_info().uniform();
+ const auto pred_qinfo = pred_boxes->info()->quantization_info().uniform();
+
+ Iterator box_it(boxes, window);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto ptr = reinterpret_cast<uint16_t *>(box_it.ptr());
+ const auto b0 = dequantize_qasymm16(*ptr, boxes_qinfo);
+ const auto b1 = dequantize_qasymm16(*(ptr + 1), boxes_qinfo);
+ const auto b2 = dequantize_qasymm16(*(ptr + 2), boxes_qinfo);
+ const auto b3 = dequantize_qasymm16(*(ptr + 3), boxes_qinfo);
+ const float width = (b2 / scale_before) - (b0 / scale_before) + 1.f;
+ const float height = (b3 / scale_before) - (b1 / scale_before) + 1.f;
+ const float ctr_x = (b0 / scale_before) + 0.5f * width;
+ const float ctr_y = (b1 / scale_before) + 0.5f * height;
+ for(size_t j = 0; j < num_classes; ++j)
+ {
+ // Extract deltas
+ const size_t delta_id = id.y() * deltas_width + 4u * j;
+ const float dx = dequantize_qasymm8(delta_ptr[delta_id], deltas_qinfo) / bbinfo.weights()[0];
+ const float dy = dequantize_qasymm8(delta_ptr[delta_id + 1], deltas_qinfo) / bbinfo.weights()[1];
+ float dw = dequantize_qasymm8(delta_ptr[delta_id + 2], deltas_qinfo) / bbinfo.weights()[2];
+ float dh = dequantize_qasymm8(delta_ptr[delta_id + 3], deltas_qinfo) / bbinfo.weights()[3];
+ // Clip dw and dh
+ dw = std::min(dw, bbinfo.bbox_xform_clip());
+ dh = std::min(dh, bbinfo.bbox_xform_clip());
+ // Determine the predictions
+ const float pred_ctr_x = dx * width + ctr_x;
+ const float pred_ctr_y = dy * height + ctr_y;
+ const float pred_w = std::exp(dw) * width;
+ const float pred_h = std::exp(dh) * height;
+ // Store the prediction into the output tensor
+ pred_ptr[delta_id] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_x - 0.5f * pred_w, 0.f, img_w - 1.f), pred_qinfo);
+ pred_ptr[delta_id + 1] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_y - 0.5f * pred_h, 0.f, img_h - 1.f), pred_qinfo);
+ pred_ptr[delta_id + 2] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_x + 0.5f * pred_w - offset, 0.f, img_w - 1.f), pred_qinfo);
+ pred_ptr[delta_id + 3] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_y + 0.5f * pred_h - offset, 0.f, img_h - 1.f), pred_qinfo);
+ }
+ },
+ box_it);
+}
+
+template <typename T>
+void bounding_box_transform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)
+{
+ const size_t num_classes = deltas->info()->tensor_shape()[0] >> 2;
+ const size_t deltas_width = deltas->info()->tensor_shape()[0];
+ const int img_h = std::floor(bbinfo.img_height() / bbinfo.scale() + 0.5f);
+ const int img_w = std::floor(bbinfo.img_width() / bbinfo.scale() + 0.5f);
+
+ const auto scale_after = (bbinfo.apply_scale() ? T(bbinfo.scale()) : T(1));
+ const auto scale_before = T(bbinfo.scale());
+ ARM_COMPUTE_ERROR_ON(scale_before <= 0);
+ const auto offset = (bbinfo.correct_transform_coords() ? T(1.f) : T(0.f));
+
+ auto pred_ptr = reinterpret_cast<T *>(pred_boxes->buffer() + pred_boxes->info()->offset_first_element_in_bytes());
+ auto delta_ptr = reinterpret_cast<T *>(deltas->buffer() + deltas->info()->offset_first_element_in_bytes());
+
+ Iterator box_it(boxes, window);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto ptr = reinterpret_cast<T *>(box_it.ptr());
+ const auto b0 = *ptr;
+ const auto b1 = *(ptr + 1);
+ const auto b2 = *(ptr + 2);
+ const auto b3 = *(ptr + 3);
+ const T width = (b2 / scale_before) - (b0 / scale_before) + T(1.f);
+ const T height = (b3 / scale_before) - (b1 / scale_before) + T(1.f);
+ const T ctr_x = (b0 / scale_before) + T(0.5f) * width;
+ const T ctr_y = (b1 / scale_before) + T(0.5f) * height;
+ for(size_t j = 0; j < num_classes; ++j)
+ {
+ // Extract deltas
+ const size_t delta_id = id.y() * deltas_width + 4u * j;
+ const T dx = delta_ptr[delta_id] / T(bbinfo.weights()[0]);
+ const T dy = delta_ptr[delta_id + 1] / T(bbinfo.weights()[1]);
+ T dw = delta_ptr[delta_id + 2] / T(bbinfo.weights()[2]);
+ T dh = delta_ptr[delta_id + 3] / T(bbinfo.weights()[3]);
+ // Clip dw and dh
+ dw = std::min(dw, T(bbinfo.bbox_xform_clip()));
+ dh = std::min(dh, T(bbinfo.bbox_xform_clip()));
+ // Determine the predictions
+ const T pred_ctr_x = dx * width + ctr_x;
+ const T pred_ctr_y = dy * height + ctr_y;
+ const T pred_w = std::exp(dw) * width;
+ const T pred_h = std::exp(dh) * height;
+ // Store the prediction into the output tensor
+ pred_ptr[delta_id] = scale_after * utility::clamp<T>(pred_ctr_x - T(0.5f) * pred_w, T(0), T(img_w - 1));
+ pred_ptr[delta_id + 1] = scale_after * utility::clamp<T>(pred_ctr_y - T(0.5f) * pred_h, T(0), T(img_h - 1));
+ pred_ptr[delta_id + 2] = scale_after * utility::clamp<T>(pred_ctr_x + T(0.5f) * pred_w - offset, T(0), T(img_w - 1));
+ pred_ptr[delta_id + 3] = scale_after * utility::clamp<T>(pred_ctr_y + T(0.5f) * pred_h - offset, T(0), T(img_h - 1));
+ }
+ },
+ box_it);
+}
+
+template void bounding_box_transform<float>(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window);
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+template void bounding_box_transform<float16_t>(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window);
+#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+} // namespace cpu
+} // namespace arm_compute \ No newline at end of file
diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h
new file mode 100644
index 000000000..d9ff694ae
--- /dev/null
+++ b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_SVE_KERNELS_BOUNDINGBOXTRANFORM_IMPL_H
+#define SRC_CORE_SVE_KERNELS_BOUNDINGBOXTRANFORM_IMPL_H
+#include "arm_compute/core/Helpers.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename T>
+void bounding_box_transform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window);
+
+void bounding_box_transform_qsymm16(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window);
+} // namespace cpu
+} // namespace arm_compute
+#endif // SRC_CORE_SVE_KERNELS_BOUNDINGBOXTRANFORM_IMPL_H
diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp
new file mode 100644
index 000000000..b27c187df
--- /dev/null
+++ b/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/boundingboxtransform/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_qu16_boundingboxtransform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)
+{
+ return bounding_box_transform_qsymm16(boxes, pred_boxes, deltas, bbinfo, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/boundingboxtransform/list.h b/src/cpu/kernels/boundingboxtransform/list.h
new file mode 100644
index 000000000..8f06acc8a
--- /dev/null
+++ b/src/cpu/kernels/boundingboxtransform/list.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_BOUNDINGBOXTRANFORM_LIST_H
+#define SRC_CORE_NEON_KERNELS_BOUNDINGBOXTRANFORM_LIST_H
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_BOUNDINGBOXTRANFORM_KERNEL(func_name) \
+ void func_name(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)
+DECLARE_BOUNDINGBOXTRANFORM_KERNEL(neon_fp32_boundingboxtransform);
+DECLARE_BOUNDINGBOXTRANFORM_KERNEL(neon_fp16_boundingboxtransform);
+DECLARE_BOUNDINGBOXTRANFORM_KERNEL(neon_qu16_boundingboxtransform);
+#undef DECLARE_BOUNDINGBOXTRANFORM_KERNEL
+} // namespace cpu
+} // namespace arm_compute
+#endif //SRC_CORE_NEON_KERNELS_BOUNDINGBOXTRANFORM_LIST_H
diff --git a/src/cpu/kernels/conv3d/neon/list.h b/src/cpu/kernels/conv3d/neon/list.h
index 3e2db664d..3bfa124dc 100644
--- a/src/cpu/kernels/conv3d/neon/list.h
+++ b/src/cpu/kernels/conv3d/neon/list.h
@@ -150,7 +150,7 @@ void directconv3d_float_neon_ndhwc(const ITensor *src0, const ITensor *src1, con
{
const auto src_vec = wrapper::vloadq(in_ptr_mover);
//Load Cin weights
- for(unsigned int k = 0; k < num_elems_read_per_iteration; ++k, weights_ptr_mover += index_c_out_end)
+ for(int k = 0; k < num_elems_read_per_iteration; ++k, weights_ptr_mover += index_c_out_end)
{
w_vec = wrapper::vsetlane(*weights_ptr_mover, w_vec, k);
}
diff --git a/src/cpu/kernels/conv3d/neon/quantized.h b/src/cpu/kernels/conv3d/neon/quantized.h
index cdbc45e20..a8165b494 100644
--- a/src/cpu/kernels/conv3d/neon/quantized.h
+++ b/src/cpu/kernels/conv3d/neon/quantized.h
@@ -171,7 +171,7 @@ void directconv3d_quantized_neon_ndhwc(const ITensor *src0, const ITensor *src1,
{
const auto src_vec = wrapper::vloadq(in_ptr_mover);
//Load Cin weights
- for(unsigned int k = 0; k < num_elems_read_per_iteration; ++k, weights_ptr_mover += index_c_out_end)
+ for(int k = 0; k < num_elems_read_per_iteration; ++k, weights_ptr_mover += index_c_out_end)
{
w_vec = wrapper::vsetlane(*weights_ptr_mover, w_vec, k);
}
diff --git a/src/cpu/kernels/crop/generic/neon/crop_helper.h b/src/cpu/kernels/crop/generic/neon/crop_helper.h
new file mode 100644
index 000000000..1fe8e11e9
--- /dev/null
+++ b/src/cpu/kernels/crop/generic/neon/crop_helper.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_CROP_CROP_HELPER_H
+#define SRC_CORE_NEON_KERNELS_CROP_CROP_HELPER_H
+
+#include "src/core/NEON/wrapper/wrapper.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename T>
+inline float32x4_t load_as_f32(T *ptr)
+{
+ ARM_COMPUTE_UNUSED(ptr);
+ ARM_COMPUTE_ERROR("Type not supported.");
+}
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+template <>
+inline float32x4_t load_as_f32(float16_t *ptr)
+{
+ return vcvt_f32_f16(wrapper::vload(ptr));
+}
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
+
+template <>
+inline float32x4_t load_as_f32(float *ptr)
+{
+ return wrapper::vloadq(ptr);
+}
+
+template <>
+inline float32x4_t load_as_f32(int32_t *ptr)
+{
+ return vcvtq_f32_s32(wrapper::vloadq(ptr));
+}
+
+template <>
+inline float32x4_t load_as_f32(uint32_t *ptr)
+{
+ return vcvtq_f32_u32(wrapper::vloadq(ptr));
+}
+
+template <>
+inline float32x4_t load_as_f32(int16_t *ptr)
+{
+ return vcvtq_f32_s32(vmovl_s16(wrapper::vload(ptr)));
+}
+
+template <>
+inline float32x4_t load_as_f32(uint16_t *ptr)
+{
+ return vcvtq_f32_u32(vmovl_u16(wrapper::vload(ptr)));
+}
+
+template <>
+inline float32x4_t load_as_f32(uint8_t *ptr)
+{
+ return vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(wrapper::vload(ptr)))));
+}
+} // namespace cpu
+} // namespace arm_compute
+
+#endif //SRC_CORE_NEON_KERNELS_CROP_CROP_HELPER_H \ No newline at end of file
diff --git a/src/cpu/kernels/crop/generic/neon/fp16.cpp b/src/cpu/kernels/crop/generic/neon/fp16.cpp
new file mode 100644
index 000000000..218ebba19
--- /dev/null
+++ b/src/cpu/kernels/crop/generic/neon/fp16.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "src/cpu/kernels/crop/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void fp16_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
+ int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+{
+ return in_bounds_crop_window<float16_t>(input, output, output_ptr, input_offset,
+ window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/crop/generic/neon/fp32.cpp b/src/cpu/kernels/crop/generic/neon/fp32.cpp
new file mode 100644
index 000000000..16d0218fc
--- /dev/null
+++ b/src/cpu/kernels/crop/generic/neon/fp32.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/crop/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void fp32_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
+ int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+{
+ return in_bounds_crop_window<float32_t>(input, output, output_ptr, input_offset,
+ window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/crop/generic/neon/impl.cpp b/src/cpu/kernels/crop/generic/neon/impl.cpp
new file mode 100644
index 000000000..95ab80494
--- /dev/null
+++ b/src/cpu/kernels/crop/generic/neon/impl.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/crop/generic/neon/impl.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/common/Registrars.h"
+#include "src/cpu/kernels/crop/generic/neon/crop_helper.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename T>
+void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
+ int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+{
+ // Reverse elements if width flipped.
+ if(is_width_flipped)
+ {
+ // Collapse first dimension if possible.
+ if(input_has_single_channel)
+ {
+ int32_t x = output_width_start;
+ Coordinates negative_offset(input_offset);
+ negative_offset.set(1, negative_offset[1] - window_step_x + 1);
+ for(; x <= output_width_limit - window_step_x; x += window_step_x, negative_offset[1] -= window_step_x)
+ {
+ auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(negative_offset)));
+
+ in = wrapper::vrev64(in);
+ in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in));
+
+ wrapper::vstore(output_ptr + x, in);
+ }
+ input_offset[1] = negative_offset[1] + window_step_x - 1;
+ for(; x < output_width_limit; ++x, --input_offset[1])
+ {
+ *(output_ptr + x) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
+ }
+ }
+ else
+ {
+ for(int32_t x = output_width_start; x < output_width_limit; ++x, --input_offset[1])
+ {
+ input_offset.set(0, 0);
+ int32_t c = 0;
+ for(; c <= static_cast<int32_t>(input->info()->dimension(0)) - window_step_x; c += window_step_x, input_offset[0] += window_step_x)
+ {
+ auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
+ wrapper::vstore(output_ptr + x * output->info()->dimension(0) + c, in);
+ }
+ for(; c < static_cast<int32_t>(input->info()->dimension(0)); ++c, ++input_offset[0])
+ {
+ *(output_ptr + x * output->info()->dimension(0) + c) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
+ }
+ }
+ }
+ }
+ else
+ {
+ // Use memcpy if the elements don't need converting to float.
+ if(std::is_same<T, float>::value)
+ {
+ memcpy(static_cast<void *>(output_ptr + output_width_start * output->info()->dimension(0)),
+ reinterpret_cast<const void *>(input->ptr_to_element(input_offset)),
+ (output_width_limit - output_width_start) * output->info()->dimension(0) * output->info()->element_size());
+ }
+ else
+ {
+ int32_t x = 0;
+ int32_t limit = (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0));
+ float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0);
+ for(; x <= limit - window_step_x; x += window_step_x, input_offset[0] += window_step_x)
+ {
+ auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
+ wrapper::vstore(output_start_ptr + x, in);
+ }
+ for(; x < limit; ++x, ++input_offset[0])
+ {
+ *(output_start_ptr + x) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
+ }
+ }
+ }
+}
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+template void in_bounds_crop_window<float16_t>(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
+ int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped);
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
+template void in_bounds_crop_window<float32_t>(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
+ int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped);
+template void in_bounds_crop_window<uint8_t>(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
+ int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped);
+template void in_bounds_crop_window<uint16_t>(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
+ int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped);
+template void in_bounds_crop_window<uint32_t>(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
+ int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped);
+template void in_bounds_crop_window<int8_t>(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
+ int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped);
+template void in_bounds_crop_window<int16_t>(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
+ int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped);
+template void in_bounds_crop_window<int32_t>(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
+ int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped);
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/crop/generic/neon/impl.h b/src/cpu/kernels/crop/generic/neon/impl.h
new file mode 100644
index 000000000..50f889705
--- /dev/null
+++ b/src/cpu/kernels/crop/generic/neon/impl.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_CROP_IMPL_H
+#define SRC_CORE_NEON_KERNELS_CROP_IMPL_H
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/common/Registrars.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename T>
+void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
+ int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped);
+} // namespace cpu
+} // namespace arm_compute
+#endif //SRC_CORE_NEON_KERNELS_CROP_IMPL_H
diff --git a/src/cpu/kernels/crop/generic/neon/integer.cpp b/src/cpu/kernels/crop/generic/neon/integer.cpp
new file mode 100644
index 000000000..7dbf0a7f5
--- /dev/null
+++ b/src/cpu/kernels/crop/generic/neon/integer.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/crop/generic/neon/impl.h"
+#include "src/cpu/kernels/crop/generic/neon/list.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void u8_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
+ int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+{
+ return in_bounds_crop_window<uint8_t>(input, output, output_ptr, input_offset,
+ window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
+}
+
+void u16_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
+ int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+{
+ return in_bounds_crop_window<uint16_t>(input, output, output_ptr, input_offset,
+ window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
+}
+
+void u32_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
+ int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+{
+ return in_bounds_crop_window<uint32_t>(input, output, output_ptr, input_offset,
+ window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
+}
+
+void s8_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
+ int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+{
+ return in_bounds_crop_window<int8_t>(input, output, output_ptr, input_offset,
+ window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
+}
+
+void s16_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
+ int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+{
+ return in_bounds_crop_window<int16_t>(input, output, output_ptr, input_offset,
+ window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
+}
+
+void s32_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
+ int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+{
+ return in_bounds_crop_window<int32_t>(input, output, output_ptr, input_offset,
+ window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/crop/generic/neon/list.h b/src/cpu/kernels/crop/generic/neon/list.h
new file mode 100644
index 000000000..f33049304
--- /dev/null
+++ b/src/cpu/kernels/crop/generic/neon/list.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_CROP_LIST_H
+#define SRC_CORE_NEON_KERNELS_CROP_LIST_H
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/common/Registrars.h"
+#include "src/cpu/kernels/crop/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_CROP_KERNEL(func_name) \
+ void func_name(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, \
+ int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+
+DECLARE_CROP_KERNEL(fp16_in_bounds_crop_window);
+DECLARE_CROP_KERNEL(fp32_in_bounds_crop_window);
+DECLARE_CROP_KERNEL(s8_in_bounds_crop_window);
+DECLARE_CROP_KERNEL(s16_in_bounds_crop_window);
+DECLARE_CROP_KERNEL(s32_in_bounds_crop_window);
+DECLARE_CROP_KERNEL(u8_in_bounds_crop_window);
+DECLARE_CROP_KERNEL(u16_in_bounds_crop_window);
+DECLARE_CROP_KERNEL(u32_in_bounds_crop_window);
+
+#undef DECLARE_CROP_KERNEL
+
+} // namespace cpu
+} // namespace arm_compute
+#endif //SRC_CORE_NEON_KERNELS_CROP_LIST_H
diff --git a/src/cpu/kernels/elementwise/neon/elementwise_list.h b/src/cpu/kernels/elementwise/neon/elementwise_list.h
deleted file mode 100644
index 43e44be5e..000000000
--- a/src/cpu/kernels/elementwise/neon/elementwise_list.h
+++ /dev/null
@@ -1,486 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_LIST_H
-#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_LIST_H
-
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-template <typename InputScalarType, typename OutputScalarType, typename InputVectorType>
-void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &),
- int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool),
- int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *))
-{
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = std::min(16 / static_cast<int>(sizeof(OutputScalarType)), 8);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
- const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
- const InputScalarType broadcast_value = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
-
- int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_value, output_ptr, !is_broadcast_input_2);
- for(; x < window_end_x; ++x)
- {
- const auto a = *(non_broadcast_input_ptr + x);
- *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, !is_broadcast_input_2 ? a : broadcast_value);
- }
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(in1, input1_win);
- Iterator input2(in2, input2_win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
- const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
-
- int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr);
- for(; x < window_end_x; ++x)
- {
- const auto a = *(input1_ptr + x);
- const auto b = *(input2_ptr + x);
- *(output_ptr + x) = (*scalar_func)(a, b);
- }
- },
- input1, input2, output);
- }
-}
-
-template <ArithmeticOperation op, typename ScalarType>
-inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const ScalarType &b)
-{
- auto res = ScalarType(0);
-
- switch(op)
- {
- case ArithmeticOperation::MAX:
- res = std::max(a, b);
- break;
- case ArithmeticOperation::MIN:
- res = std::min(a, b);
- break;
- case ArithmeticOperation::SQUARED_DIFF:
- {
- res = (a - b) * (a - b);
- break;
- }
- case ArithmeticOperation::PRELU:
- {
- res = (a > 0 ? a : a * b);
- break;
- }
- case ArithmeticOperation::DIV:
- {
- res = a / b;
- if(std::is_integral<ScalarType>::value)
- {
- res = (b == 0) ? 0 : res;
- if(static_cast<int32_t>(a) % static_cast<int32_t>(b) != 0 && ((a < 0) != (b < 0)))
- {
- --res;
- }
- }
- break;
- }
- case ArithmeticOperation::POWER:
- {
- res = std::pow(a, b);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
- return res;
-}
-
-template <ArithmeticOperation op, typename VectorType>
-inline typename VectorType::type elementwise_arithm_op(const typename VectorType::type &a, const typename VectorType::type &b)
-{
- using vec_type = typename VectorType::type;
- using scalar_type = typename VectorType::scalar_type;
- using tag_type = typename VectorType::tag_type;
-
- vec_type res = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{});
-
- switch(op)
- {
- case ArithmeticOperation::MAX:
- res = wrapper::vmax(a, b);
- break;
- case ArithmeticOperation::MIN:
- res = wrapper::vmin(a, b);
- break;
- case ArithmeticOperation::SQUARED_DIFF:
- {
- const vec_type tmp = wrapper::vsub(a, b);
- res = wrapper::vmul(tmp, tmp);
- break;
- }
- case ArithmeticOperation::PRELU:
- {
- const vec_type zero = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{});
- const vec_type tmp = wrapper::vmul(a, b);
- const auto gt = wrapper::vcgt(a, zero);
-
- res = wrapper::vbsl(gt, a, tmp);
- break;
- }
-
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
-
- return res;
-}
-
-template <>
-inline int32x4_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<int32_t, 4>>(const int32x4_t &a, const int32x4_t &b)
-{
- return vcvtq_s32_f32(vfloorq_f32(wrapper::vdiv(vcvtq_f32_s32(a), vcvtq_f32_s32(b))));
-}
-
-template <>
-inline float32x4_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a, const float32x4_t &b)
-{
- return wrapper::vdiv(a, b);
-}
-
-template <>
-inline float32x4_t elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a, const float32x4_t &b)
-{
- return wrapper::vpow(a, b);
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <>
-inline float16x8_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float16_t, 8>>(const float16x8_t &a, const float16x8_t &b)
-{
- return wrapper::vdiv(a, b);
-}
-
-template <>
-inline float16x8_t elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float16_t, 8>>(const float16x8_t &a, const float16x8_t &b)
-{
- return wrapper::vpow(a, b);
-}
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-template <ArithmeticOperation op, typename ScalarType, typename VectorType>
-inline typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorType::type &a, const ScalarType &broadcast_value, const bool reorder)
-{
- using tag_type = typename VectorType::tag_type;
- using vec_type = typename VectorType::type;
-
- vec_type broadcast_vector = wrapper::vdup_n(broadcast_value, tag_type{});
- return elementwise_arithm_op<op, VectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
-}
-
-template <ArithmeticOperation op, typename ScalarType, typename VectorType>
-inline int elementwise_arithm_op_loop(int window_start_x, int window_end_x, int window_step_x,
- const ScalarType *input1_ptr, const ScalarType *input2_ptr, ScalarType *output_ptr)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq(input1_ptr + x);
- const auto b = wrapper::vloadq(input2_ptr + x);
- wrapper::vstore(output_ptr + x, elementwise_arithm_op<op, VectorType>(a, b));
- }
- return x;
-}
-
-template <ArithmeticOperation op, typename ScalarType, typename VectorType>
-inline int elementwise_arithm_op_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
- const ScalarType *non_broadcast_input_ptr, const ScalarType &broadcast_value, ScalarType *output_ptr, const bool reorder)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq((non_broadcast_input_ptr + x));
- wrapper::vstore(output_ptr + x, elementwise_arithm_op_broadcast<op, ScalarType, VectorType>(a, broadcast_value, reorder));
- }
- return x;
-}
-
-template <ArithmeticOperation op, typename VectorType>
-void elementwise_arithm_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- using scalar_type = typename VectorType::scalar_type;
-
- elementwise_op<scalar_type, scalar_type, VectorType>(in1, in2, out, window,
- &elementwise_arithm_op_scalar<op, scalar_type>,
- &elementwise_arithm_op_broadcast_loop<op, scalar_type, VectorType>,
- &elementwise_arithm_op_loop<op, scalar_type, VectorType>);
-}
-
-template <ComparisonOperation op, typename InputScalarType>
-inline uint8_t elementwise_comp_op_scalar(const InputScalarType &a, const InputScalarType &b)
-{
- bool res = false;
-
- switch(op)
- {
- case ComparisonOperation::Equal:
- res = (a == b);
- break;
- case ComparisonOperation::NotEqual:
- res = (a != b);
- break;
- case ComparisonOperation::Greater:
- res = (a > b);
- break;
- case ComparisonOperation::GreaterEqual:
- res = (a >= b);
- break;
- case ComparisonOperation::Less:
- res = (a < b);
- break;
- case ComparisonOperation::LessEqual:
- res = (a <= b);
- break;
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
- return res ? ~static_cast<uint8_t>(0) : static_cast<uint8_t>(0);
-}
-
-template <ComparisonOperation op, typename InputVectorType, typename OutputVectorType>
-inline OutputVectorType elementwise_comp_op(const InputVectorType &a, const InputVectorType &b)
-{
- OutputVectorType res = { 0, 0, 0, 0 };
-
- switch(op)
- {
- case ComparisonOperation::Equal:
- res = wrapper::vceq(a, b);
- break;
- case ComparisonOperation::NotEqual:
- res = wrapper::vnot(wrapper::vceq(a, b));
- break;
- case ComparisonOperation::Greater:
- res = wrapper::vcgt(a, b);
- break;
- case ComparisonOperation::GreaterEqual:
- res = wrapper::vcge(a, b);
- break;
- case ComparisonOperation::Less:
- res = wrapper::vcgt(b, a);
- break;
- case ComparisonOperation::LessEqual:
- res = wrapper::vcge(b, a);
- break;
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
-
- return res;
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType, typename OutputVectorType>
-inline OutputVectorType elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder)
-{
- InputVectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag());
- return elementwise_comp_op<op, InputVectorType, OutputVectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_broadcast_8_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint8x16_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
- wrapper::vstore(output_ptr + x, a);
- }
- return x;
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_broadcast_16_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint16x8_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
- wrapper::vstore(output_ptr + x, wrapper::vmovn(a));
- }
- return x;
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_broadcast_32_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder);
- const auto b = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder);
- wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(a), wrapper::vmovn(b))));
- }
- if(x <= window_end_x - 4)
- {
- const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
- for(int i = 0; i < 4; i++)
- {
- *(output_ptr + x + i) = wrapper::vgetlane(a, i);
- }
- x += 4;
- }
- return x;
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_8_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq(input1_ptr + x);
- const auto b = wrapper::vloadq(input2_ptr + x);
- const auto res = elementwise_comp_op<op, InputVectorType, uint8x16_t>(a, b);
- wrapper::vstore(output_ptr + x, res);
- }
- return x;
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_16_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq(input1_ptr + x);
- const auto b = wrapper::vloadq(input2_ptr + x);
- const auto res = elementwise_comp_op<op, InputVectorType, uint16x8_t>(a, b);
- wrapper::vstore(output_ptr + x, wrapper::vmovn(res));
- }
- return x;
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- auto a = wrapper::vloadq(input1_ptr + x);
- auto b = wrapper::vloadq(input2_ptr + x);
- const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
- a = wrapper::vloadq(input1_ptr + x + 4);
- b = wrapper::vloadq(input2_ptr + x + 4);
- const auto res2 = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
- wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(res), wrapper::vmovn(res2))));
- }
- if(x <= window_end_x - 4)
- {
- const auto a = wrapper::vloadq(input1_ptr + x);
- const auto b = wrapper::vloadq(input2_ptr + x);
- const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
- for(int i = 0; i < 4; i++)
- {
- *(output_ptr + x + i) = wrapper::vgetlane(res, i);
- }
- x += 4;
- }
- return x;
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-void elementwise_comp_op_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
- &elementwise_comp_op_scalar<op, InputScalarType>,
- &elementwise_comp_op_broadcast_8_loop<op, InputScalarType, InputVectorType>,
- &elementwise_comp_op_8_loop<op, InputScalarType, InputVectorType>);
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-void elementwise_comp_op_16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
- &elementwise_comp_op_scalar<op, InputScalarType>,
- &elementwise_comp_op_broadcast_16_loop<op, InputScalarType, InputVectorType>,
- &elementwise_comp_op_16_loop<op, InputScalarType, InputVectorType>);
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-void elementwise_comp_op_32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
- &elementwise_comp_op_scalar<op, InputScalarType>,
- &elementwise_comp_op_broadcast_32_loop<op, InputScalarType, InputVectorType>,
- &elementwise_comp_op_32_loop<op, InputScalarType, InputVectorType>);
-}
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* SRC_CORE_NEON_KERNELS_ELEMENTWISE_LIST_H */
\ No newline at end of file
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp
new file mode 100644
index 000000000..6091ef215
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
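+// Thin FP16 entry points that forward to the shared templates in impl.h; the explicit
+// instantiations below are what other translation units link against.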
+template <ArithmeticOperation op>
+void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float16_t, 8>>(in1, in2, out, window);
+}
+
+template void neon_fp16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ComparisonOperation op>
+void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comp_op_16<op, float16_t, float16x8_t>(in1, in2, out, window);
+}
+
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+} // namespace cpu
+} // namespace arm_compute
+#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp
new file mode 100644
index 000000000..2d8fec91c
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <ArithmeticOperation op>
+void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float, 4>>(in1, in2, out, window);
+}
+
+template void neon_fp32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ComparisonOperation op>
+void neon_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comp_op_32<op, float, float32x4_t>(in1, in2, out, window);
+}
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise/neon/elementwise_quantized_list.h b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h
index 3b4c11277..ead54ab14 100644
--- a/src/cpu/kernels/elementwise/neon/elementwise_quantized_list.h
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,16 +21,464 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H
-#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H
+#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_IMPL_H
+#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_IMPL_H
-#include "src/cpu/kernels/elementwise/neon/elementwise_list.h"
+#include "src/core/NEON/NEAsymm.h"
namespace arm_compute
{
namespace cpu
{
-float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
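+// Vectorised arithmetic kernel: maps each ArithmeticOperation onto the matching NEON
+// wrapper intrinsic (vmax, vmin, squared difference via vsub/vmul, PRELU via a
+// compare-and-select). DIV and POWER are only available through the type-specific
+// specialisations further down.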
+template <ArithmeticOperation op, typename VectorType>
+typename VectorType::type elementwise_arithm_op(const typename VectorType::type &a, const typename VectorType::type &b)
+{
+ using vec_type = typename VectorType::type;
+ using scalar_type = typename VectorType::scalar_type;
+ using tag_type = typename VectorType::tag_type;
+
+ vec_type res = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{});
+
+ switch(op)
+ {
+ case ArithmeticOperation::MAX:
+ res = wrapper::vmax(a, b);
+ break;
+ case ArithmeticOperation::MIN:
+ res = wrapper::vmin(a, b);
+ break;
+ case ArithmeticOperation::SQUARED_DIFF:
+ {
+ const vec_type tmp = wrapper::vsub(a, b);
+ res = wrapper::vmul(tmp, tmp);
+ break;
+ }
+ case ArithmeticOperation::PRELU:
+ {
+ const vec_type zero = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{});
+ const vec_type tmp = wrapper::vmul(a, b);
+ const auto gt = wrapper::vcgt(a, zero);
+
+ res = wrapper::vbsl(gt, a, tmp);
+ break;
+ }
+
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+
+ return res;
+}
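+// Broadcast variant: duplicates the scalar operand into a full vector and forwards to
+// the kernel above; 'reorder' preserves the operand order for non-commutative ops.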
+template <ArithmeticOperation op, typename ScalarType, typename VectorType>
+typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorType::type &a, const ScalarType &broadcast_value, const bool reorder)
+{
+ using tag_type = typename VectorType::tag_type;
+ using vec_type = typename VectorType::type;
+
+ vec_type broadcast_vector = wrapper::vdup_n(broadcast_value, tag_type{});
+ return elementwise_arithm_op<op, VectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
+}
+
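+// Generic execution loop shared by all element-wise binary kernels. It takes the
+// broadcast path when the two inputs differ in their X extent, runs the supplied
+// vectorised function over whole steps of window_step_x, and finishes the remaining
+// elements with the scalar function.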
+template <typename InputScalarType, typename OutputScalarType, typename InputVectorType>
+void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+ OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &),
+ int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool),
+ int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *))
+{
+ // Create input windows
+ Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const int window_step_x = std::min(16 / static_cast<int>(sizeof(OutputScalarType)), 8);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
+
+ if(is_broadcast_across_x)
+ {
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator output(out, win);
+
+ execute_window_loop(win, [&](const Coordinates &)
+ {
+ auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+ const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
+ const InputScalarType broadcast_value = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
+
+ int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_value, output_ptr, !is_broadcast_input_2);
+ for(; x < window_end_x; ++x)
+ {
+ const auto a = *(non_broadcast_input_ptr + x);
+ *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, !is_broadcast_input_2 ? a : broadcast_value);
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
+ }
+ else
+ {
+ // Clear X Dimension on execution window as we handle manually
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(in1, input1_win);
+ Iterator input2(in2, input2_win);
+ Iterator output(out, win);
+
+ execute_window_loop(win, [&](const Coordinates &)
+ {
+ auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+ const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
+
+ int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr);
+ for(; x < window_end_x; ++x)
+ {
+ const auto a = *(input1_ptr + x);
+ const auto b = *(input2_ptr + x);
+ *(output_ptr + x) = (*scalar_func)(a, b);
+ }
+ },
+ input1, input2, output);
+ }
+}
+
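+// Scalar fallback used for the elements left over after the vector loop. For integral
+// types DIV is a floor division: the quotient is adjusted towards negative infinity
+// when the operands' signs differ, and division by zero yields zero.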
+template <ArithmeticOperation op, typename ScalarType>
+inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const ScalarType &b)
+{
+ auto res = ScalarType(0);
+
+ switch(op)
+ {
+ case ArithmeticOperation::MAX:
+ res = std::max(a, b);
+ break;
+ case ArithmeticOperation::MIN:
+ res = std::min(a, b);
+ break;
+ case ArithmeticOperation::SQUARED_DIFF:
+ {
+ res = (a - b) * (a - b);
+ break;
+ }
+ case ArithmeticOperation::PRELU:
+ {
+ res = (a > 0 ? a : a * b);
+ break;
+ }
+ case ArithmeticOperation::DIV:
+ {
+ res = a / b;
+ if(std::is_integral<ScalarType>::value)
+ {
+ res = (b == 0) ? 0 : res;
+ if(static_cast<int32_t>(a) % static_cast<int32_t>(b) != 0 && ((a < 0) != (b < 0)))
+ {
+ --res;
+ }
+ }
+ break;
+ }
+ case ArithmeticOperation::POWER:
+ {
+ res = std::pow(a, b);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+ return res;
+}
+
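+// The generic kernel above rejects DIV and POWER, so they are provided only through
+// these specialisations: S32 division is emulated with an F32 divide followed by a
+// floor, while the floating-point variants map directly onto vdiv/vpow.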
+template <>
+inline int32x4_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<int32_t, 4>>(const int32x4_t &a, const int32x4_t &b)
+{
+ return vcvtq_s32_f32(vfloorq_f32(wrapper::vdiv(vcvtq_f32_s32(a), vcvtq_f32_s32(b))));
+}
+
+template <>
+inline float32x4_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a, const float32x4_t &b)
+{
+ return wrapper::vdiv(a, b);
+}
+
+template <>
+inline float32x4_t elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a, const float32x4_t &b)
+{
+ return wrapper::vpow(a, b);
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <>
+inline float16x8_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float16_t, 8>>(const float16x8_t &a, const float16x8_t &b)
+{
+ return wrapper::vdiv(a, b);
+}
+
+template <>
+inline float16x8_t elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float16_t, 8>>(const float16x8_t &a, const float16x8_t &b)
+{
+ return wrapper::vpow(a, b);
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+template <ArithmeticOperation op, typename ScalarType, typename VectorType>
+inline int elementwise_arithm_op_loop(int window_start_x, int window_end_x, int window_step_x,
+ const ScalarType *input1_ptr, const ScalarType *input2_ptr, ScalarType *output_ptr)
+{
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = wrapper::vloadq(input1_ptr + x);
+ const auto b = wrapper::vloadq(input2_ptr + x);
+ wrapper::vstore(output_ptr + x, elementwise_arithm_op<op, VectorType>(a, b));
+ }
+ return x;
+}
+
+template <ArithmeticOperation op, typename ScalarType, typename VectorType>
+inline int elementwise_arithm_op_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
+ const ScalarType *non_broadcast_input_ptr, const ScalarType &broadcast_value, ScalarType *output_ptr, const bool reorder)
+{
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = wrapper::vloadq((non_broadcast_input_ptr + x));
+ wrapper::vstore(output_ptr + x, elementwise_arithm_op_broadcast<op, ScalarType, VectorType>(a, broadcast_value, reorder));
+ }
+ return x;
+}
+
+template <ArithmeticOperation op, typename VectorType>
+void elementwise_arithm_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ using scalar_type = typename VectorType::scalar_type;
+
+ elementwise_op<scalar_type, scalar_type, VectorType>(in1, in2, out, window,
+ &elementwise_arithm_op_scalar<op, scalar_type>,
+ &elementwise_arithm_op_broadcast_loop<op, scalar_type, VectorType>,
+ &elementwise_arithm_op_loop<op, scalar_type, VectorType>);
+}
+
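+// Comparison kernels write an all-ones (255) or all-zeros U8 mask per element; this
+// scalar version handles the loop tail for every input type.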
+template <ComparisonOperation op, typename InputScalarType>
+inline uint8_t elementwise_comp_op_scalar(const InputScalarType &a, const InputScalarType &b)
+{
+ bool res = false;
+
+ switch(op)
+ {
+ case ComparisonOperation::Equal:
+ res = (a == b);
+ break;
+ case ComparisonOperation::NotEqual:
+ res = (a != b);
+ break;
+ case ComparisonOperation::Greater:
+ res = (a > b);
+ break;
+ case ComparisonOperation::GreaterEqual:
+ res = (a >= b);
+ break;
+ case ComparisonOperation::Less:
+ res = (a < b);
+ break;
+ case ComparisonOperation::LessEqual:
+ res = (a <= b);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+ return res ? ~static_cast<uint8_t>(0) : static_cast<uint8_t>(0);
+}
+
+template <ComparisonOperation op, typename InputVectorType, typename OutputVectorType>
+inline OutputVectorType elementwise_comp_op(const InputVectorType &a, const InputVectorType &b)
+{
+ OutputVectorType res = { 0, 0, 0, 0 };
+
+ switch(op)
+ {
+ case ComparisonOperation::Equal:
+ res = wrapper::vceq(a, b);
+ break;
+ case ComparisonOperation::NotEqual:
+ res = wrapper::vnot(wrapper::vceq(a, b));
+ break;
+ case ComparisonOperation::Greater:
+ res = wrapper::vcgt(a, b);
+ break;
+ case ComparisonOperation::GreaterEqual:
+ res = wrapper::vcge(a, b);
+ break;
+ case ComparisonOperation::Less:
+ res = wrapper::vcgt(b, a);
+ break;
+ case ComparisonOperation::LessEqual:
+ res = wrapper::vcge(b, a);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+
+ return res;
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType, typename OutputVectorType>
+inline OutputVectorType elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder)
+{
+ InputVectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag());
+ return elementwise_comp_op<op, InputVectorType, OutputVectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
+}
+
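+// The _8/_16/_32 loop variants differ only in how the comparison mask is narrowed to
+// the U8 output: 8-bit inputs store the mask directly, 16-bit inputs narrow once, and
+// 32-bit inputs process two vectors per step, narrow twice, and handle a remaining
+// block of four elements by extracting lanes individually.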
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+inline int elementwise_comp_op_broadcast_8_loop(int window_start_x, int window_end_x, int window_step_x,
+ const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
+{
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint8x16_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
+ wrapper::vstore(output_ptr + x, a);
+ }
+ return x;
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+inline int elementwise_comp_op_broadcast_16_loop(int window_start_x, int window_end_x, int window_step_x,
+ const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
+{
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint16x8_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
+ wrapper::vstore(output_ptr + x, wrapper::vmovn(a));
+ }
+ return x;
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+inline int elementwise_comp_op_broadcast_32_loop(int window_start_x, int window_end_x, int window_step_x,
+ const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
+{
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder);
+ const auto b = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder);
+ wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(a), wrapper::vmovn(b))));
+ }
+ if(x <= window_end_x - 4)
+ {
+ const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
+ for(int i = 0; i < 4; i++)
+ {
+ *(output_ptr + x + i) = wrapper::vgetlane(a, i);
+ }
+ x += 4;
+ }
+ return x;
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+inline int elementwise_comp_op_8_loop(int window_start_x, int window_end_x, int window_step_x,
+ const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
+{
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = wrapper::vloadq(input1_ptr + x);
+ const auto b = wrapper::vloadq(input2_ptr + x);
+ const auto res = elementwise_comp_op<op, InputVectorType, uint8x16_t>(a, b);
+ wrapper::vstore(output_ptr + x, res);
+ }
+ return x;
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+inline int elementwise_comp_op_16_loop(int window_start_x, int window_end_x, int window_step_x,
+ const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
+{
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = wrapper::vloadq(input1_ptr + x);
+ const auto b = wrapper::vloadq(input2_ptr + x);
+ const auto res = elementwise_comp_op<op, InputVectorType, uint16x8_t>(a, b);
+ wrapper::vstore(output_ptr + x, wrapper::vmovn(res));
+ }
+ return x;
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+inline int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int window_step_x,
+ const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
+{
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ auto a = wrapper::vloadq(input1_ptr + x);
+ auto b = wrapper::vloadq(input2_ptr + x);
+ const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
+ a = wrapper::vloadq(input1_ptr + x + 4);
+ b = wrapper::vloadq(input2_ptr + x + 4);
+ const auto res2 = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
+ wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(res), wrapper::vmovn(res2))));
+ }
+ if(x <= window_end_x - 4)
+ {
+ const auto a = wrapper::vloadq(input1_ptr + x);
+ const auto b = wrapper::vloadq(input2_ptr + x);
+ const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
+ for(int i = 0; i < 4; i++)
+ {
+ *(output_ptr + x + i) = wrapper::vgetlane(res, i);
+ }
+ x += 4;
+ }
+ return x;
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+void elementwise_comp_op_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
+ &elementwise_comp_op_scalar<op, InputScalarType>,
+ &elementwise_comp_op_broadcast_8_loop<op, InputScalarType, InputVectorType>,
+ &elementwise_comp_op_8_loop<op, InputScalarType, InputVectorType>);
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+void elementwise_comp_op_16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
+ &elementwise_comp_op_scalar<op, InputScalarType>,
+ &elementwise_comp_op_broadcast_16_loop<op, InputScalarType, InputVectorType>,
+ &elementwise_comp_op_16_loop<op, InputScalarType, InputVectorType>);
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+void elementwise_comp_op_32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
+ &elementwise_comp_op_scalar<op, InputScalarType>,
+ &elementwise_comp_op_broadcast_32_loop<op, InputScalarType, InputVectorType>,
+ &elementwise_comp_op_32_loop<op, InputScalarType, InputVectorType>);
+}
+
+inline float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
{
qasymm8x16_t x = vld1q_u8(input1_ptr);
const float32x4x4_t out =
@@ -45,7 +493,7 @@ float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset,
return out;
}
-float32x4x4_t load_quantized_signed(const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
+inline float32x4x4_t load_quantized_signed(const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
{
qasymm8x16_signed_t x = vld1q_s8(input1_ptr);
const float32x4x4_t out =
@@ -60,21 +508,21 @@ float32x4x4_t load_quantized_signed(const int8_t *input1_ptr, const int32x4_t &o
return out;
}
-void store_quantized(uint8_t *output_ptr, const uint32x4x4_t &out)
+inline void store_quantized(uint8_t *output_ptr, const uint32x4x4_t &out)
{
const uint8x8_t pa = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[0]), vqmovn_u32(out.val[1])));
const uint8x8_t pb = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[2]), vqmovn_u32(out.val[3])));
vst1q_u8(output_ptr, vcombine_u8(pa, pb));
}
-void store_quantized(uint8_t *output_ptr, const int32x4x4_t &out)
+inline void store_quantized(uint8_t *output_ptr, const int32x4x4_t &out)
{
const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1])));
const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3])));
vst1q_u8(output_ptr, vcombine_u8(pa, pb));
}
-void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
+inline void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
{
int32x4x4_t out =
{
@@ -88,14 +536,14 @@ void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32
store_quantized(output_ptr, out);
}
-void store_quantized_signed(int8_t *output_ptr, const int32x4x4_t &out)
+inline void store_quantized_signed(int8_t *output_ptr, const int32x4x4_t &out)
{
const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1])));
const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3])));
vst1q_s8(output_ptr, vcombine_s8(pa, pb));
}
-void store_quantized_signed(int8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
+inline void store_quantized_signed(int8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
{
int32x4x4_t out =
{
@@ -122,7 +570,7 @@ inline int8_t elementwise_arithm_op_quantized_signed_scalar(const float &a, cons
}
template <ArithmeticOperation op>
-inline float32x4x4_t elementwise_arithm_op(const float32x4x4_t &a, const float32x4x4_t &b)
+float32x4x4_t elementwise_arithm_op(const float32x4x4_t &a, const float32x4x4_t &b)
{
using neon_vector_float = wrapper::traits::neon_vector<float, 4>;
float32x4x4_t out =
@@ -296,13 +744,13 @@ inline int elementwise_comp_op_quantized_signed_broadcast_loop(int window_start_
return x;
}
-void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
- int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t,
- float32x4_t, float32x4_t, const bool),
- int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *,
- int32x4_t, int32x4_t, float32x4_t, float32x4_t,
- float32x4_t, float32x4_t))
+inline void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+ uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
+ int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t,
+ float32x4_t, float32x4_t, const bool),
+ int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *,
+ int32x4_t, int32x4_t, float32x4_t, float32x4_t,
+ float32x4_t, float32x4_t))
{
// Create input windows
Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
@@ -404,13 +852,13 @@ void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *o
}
}
-void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
- int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t,
- float32x4_t, float32x4_t, const bool),
- int (*neon_func)(int, int, int, const int8_t *, const int8_t *, uint8_t *,
- int32x4_t, int32x4_t, float32x4_t, float32x4_t,
- float32x4_t, float32x4_t))
+inline void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+ uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
+ int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t,
+ float32x4_t, float32x4_t, const bool),
+ int (*neon_func)(int, int, int, const int8_t *, const int8_t *, uint8_t *,
+ int32x4_t, int32x4_t, float32x4_t, float32x4_t,
+ float32x4_t, float32x4_t))
{
// Create input windows
Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
@@ -511,13 +959,13 @@ void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor *in2, I
}
}
-void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- int8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
- int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, int8_t *, int32x4_t, float32x4_t,
- float32x4_t, float32x4_t, const bool),
- int (*neon_func)(int, int, int, const int8_t *, const int8_t *, int8_t *,
- int32x4_t, int32x4_t, float32x4_t, float32x4_t,
- float32x4_t, float32x4_t))
+inline void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+ int8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
+ int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, int8_t *, int32x4_t, float32x4_t,
+ float32x4_t, float32x4_t, const bool),
+ int (*neon_func)(int, int, int, const int8_t *, const int8_t *, int8_t *,
+ int32x4_t, int32x4_t, float32x4_t, float32x4_t,
+ float32x4_t, float32x4_t))
{
// Create input windows
Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
@@ -625,6 +1073,7 @@ void elementwise_arithm_op_quantized(const ITensor *in1, const ITensor *in2, ITe
&elementwise_arithm_op_quantized_broadcast_loop<op>,
&elementwise_arithm_op_quantized_loop<op>);
}
+
template <ArithmeticOperation op>
void elementwise_arithm_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
@@ -651,4 +1100,4 @@ void elementwise_comp_op_quantized_signed(const ITensor *in1, const ITensor *in2
} // namespace cpu
} // namespace arm_compute
-#endif /* SRC_CORE_NEON_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H */
+#endif /* SRC_CORE_NEON_KERNELS_ELEMENTWISE_IMPL_H */
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp
new file mode 100644
index 000000000..c5c528d3f
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+template <ArithmeticOperation op>
+void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int32_t, 4>>(in1, in2, out, window);
+}
+
+template void neon_s32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ArithmeticOperation op>
+void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int16_t, 8>>(in1, in2, out, window);
+}
+template void neon_s16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ComparisonOperation op>
+void neon_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comp_op_8<op, uint8_t, uint8x16_t>(in1, in2, out, window);
+}
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ComparisonOperation op>
+void neon_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comp_op_16<op, int16_t, int16x8_t>(in1, in2, out, window);
+}
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ComparisonOperation op>
+void neon_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comp_op_32<op, int32_t, int32x4_t>(in1, in2, out, window);
+}
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+} // namespace cpu
+} // namespace arm_compute
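Each per-type source file above follows the same pattern: a thin wrapper template (here neon_s32_elementwise_binary and friends) forwards to the shared implementation in impl.h, and the file then explicitly instantiates the wrapper once per supported ArithmeticOperation/ComparisonOperation so concrete symbols are emitted from this one translation unit and the heavy templates stay out of the headers. A minimal caller sketch, assuming tensors src0, src1, dst and a Window win prepared elsewhere (none of these names come from this patch), showing how one of those instantiations can be reached through a plain function pointer:

    using BinaryKernelFn = void (*)(const ITensor *, const ITensor *, ITensor *, const Window &);
    // Resolves to the S32 ADD instantiation emitted by the file above.
    BinaryKernelFn fn = &arm_compute::cpu::neon_s32_elementwise_binary<ArithmeticOperation::ADD>;
    fn(src0, src1, dst, win);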
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp
new file mode 100644
index 000000000..fa8e08745
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+template <ArithmeticOperation op>
+void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithm_op_quantized<op>(in1, in2, out, window);
+}
+
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ComparisonOperation op>
+void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comp_op_quantized<op>(in1, in2, out, window);
+}
+
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+} // namespace cpu
+} // namespace arm_compute
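The QASYMM8 kernel above forwards to elementwise_arithm_op_quantized, which operates on dequantized values: each input lane is converted to float with its tensor's scale and offset, the arithmetic operation is applied in float, and the result is requantized with the output's quantization parameters. A scalar sketch of that flow, with illustrative scale/offset values that are not taken from this patch (the real kernel does the same thing lane-wise with NEON vectors):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    const float   in1_scale = 0.5f,  out_scale = 0.5f;
    const float   in2_scale = 0.25f;
    const int     in1_offset = 10, in2_offset = 3, out_offset = 0;
    const uint8_t a_q = 20, b_q = 11;

    const float a = (static_cast<int>(a_q) - in1_offset) * in1_scale;              // 5.0f
    const float b = (static_cast<int>(b_q) - in2_offset) * in2_scale;              // 2.0f
    const float r = a + b;                                                         // ArithmeticOperation::ADD
    const int   r_q = static_cast<int>(std::lround(r / out_scale)) + out_offset;   // 14
    const uint8_t out_q = static_cast<uint8_t>(std::min(std::max(r_q, 0), 255));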
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp
new file mode 100644
index 000000000..abfdf93b7
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <ArithmeticOperation op>
+void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithm_op_quantized_signed<op>(in1, in2, out, window);
+}
+
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ComparisonOperation op>
+void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comp_op_quantized_signed<op>(in1, in2, out, window);
+}
+
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp
new file mode 100644
index 000000000..d764f5662
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+template <ArithmeticOperation op>
+void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithmetic_op<op, float16_t>(in1, in2, out, window);
+}
+
+template void sve_fp16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ComparisonOperation op>
+void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comparison_op<op, float16_t>(in1, in2, out, window);
+}
+
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+} // namespace cpu
+} // namespace arm_compute
+#endif //defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp
new file mode 100644
index 000000000..bb33fd281
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+template <ArithmeticOperation op>
+void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithmetic_op<op, float32_t>(in1, in2, out, window);
+}
+
+template void sve_fp32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ComparisonOperation op>
+void sve_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comparison_op<op, float>(in1, in2, out, window);
+}
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+} // namespace cpu
+} // namespace arm_compute
+#endif //defined(ARM_COMPUTE_ENABLE_SVE)
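The SVE files above delegate to elementwise_arithmetic_op / elementwise_comparison_op in impl.cpp, which iterate with predicated vectors rather than a fixed vector width, so the loop tail needs no scalar epilogue. A minimal sketch of that loop shape for an FP32 ADD over n contiguous elements (the free function, the raw pointers and the absence of broadcasting/window handling are simplifications, not the library's actual code):

    #include <arm_sve.h> // requires an SVE-enabled compiler target

    void sve_add_f32(const float *a, const float *b, float *c, int64_t n)
    {
        for (int64_t i = 0; i < n; i += svcntw())
        {
            const svbool_t    pg = svwhilelt_b32(i, n); // active lanes, covers the tail
            const svfloat32_t va = svld1(pg, a + i);
            const svfloat32_t vb = svld1(pg, b + i);
            svst1(pg, c + i, svadd_z(pg, va, vb));
        }
    }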
diff --git a/src/cpu/kernels/elementwise/sve/elementwise.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp
index 2f9a7998d..b3046e90a 100644
--- a/src/cpu/kernels/elementwise/sve/elementwise.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,10 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "src/cpu/kernels/elementwise/sve/elementwise_list.h"
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h"
+#include "src/core/NEON/SVEMath.h"
#include <arm_sve.h>
namespace arm_compute
@@ -209,6 +208,41 @@ void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *
&arithmetic_op_loop<ScalarType, ScalarType>,
&arithmetic_op_broadcast_loop<ScalarType, ScalarType>);
}
+template void elementwise_arithmetic_op<ArithmeticOperation::ADD, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::SUB, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::DIV, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::MIN, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::MAX, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::POWER, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_arithmetic_op<ArithmeticOperation::ADD, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::SUB, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::DIV, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::MIN, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::MAX, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::POWER, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_arithmetic_op<ArithmeticOperation::ADD, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::SUB, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::DIV, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::MIN, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::MAX, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::POWER, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_arithmetic_op<ArithmeticOperation::ADD, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::SUB, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::DIV, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::MIN, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::MAX, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::POWER, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
template <ComparisonOperation op, typename InputScalarType, typename OutputScalarType = uint8_t>
void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
@@ -222,6 +256,41 @@ void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *
&comparison_op_broadcast_loop<InputScalarType, OutputScalarType>);
}
+template void elementwise_comparison_op<ComparisonOperation::Equal, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::NotEqual, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Greater, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Less, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::LessEqual, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_comparison_op<ComparisonOperation::Equal, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::NotEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Greater, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Less, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::LessEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_comparison_op<ComparisonOperation::Equal, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::NotEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Greater, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Less, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::LessEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_comparison_op<ComparisonOperation::Equal, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::NotEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Greater, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Less, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::LessEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_comparison_op<ComparisonOperation::Equal, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::NotEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Greater, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Less, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::LessEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
template <>
svint32_t elementwise_pow<svint32_t>(svbool_t &pg, const svint32_t &a, const svint32_t &b)
{
@@ -241,71 +310,6 @@ svint16_t elementwise_div<svint16_t>(svbool_t &pg, const svint16_t &a, const svi
ARM_COMPUTE_ERROR("Not supported");
}
-template void elementwise_arithmetic_op<ArithmeticOperation::MAX, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::MAX, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::MAX, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::MAX, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_arithmetic_op<ArithmeticOperation::MIN, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::MIN, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::MIN, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::MIN, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_arithmetic_op<ArithmeticOperation::DIV, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::DIV, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::DIV, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::DIV, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_arithmetic_op<ArithmeticOperation::POWER, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::POWER, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::POWER, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::POWER, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_comparison_op<ComparisonOperation::Equal, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Equal, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Equal, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Equal, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Equal, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_comparison_op<ComparisonOperation::NotEqual, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::NotEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::NotEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::NotEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::NotEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_comparison_op<ComparisonOperation::Greater, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Greater, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Greater, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Greater, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Greater, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_comparison_op<ComparisonOperation::Less, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Less, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Less, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Less, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Less, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_comparison_op<ComparisonOperation::LessEqual, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::LessEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::LessEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::LessEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::LessEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
} // namespace cpu
} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE) */
\ No newline at end of file
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
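The hunk above also swaps the guard from the compiler-defined feature macro __ARM_FEATURE_SVE to the library's own build flag ARM_COMPUTE_ENABLE_SVE. The former is only defined when the translation unit itself is compiled with an SVE-enabled -march, whereas the latter is defined by the build system whenever SVE kernels are requested, so they can be compiled into a multi-ISA binary and dispatched at run time. A small illustration of the distinction (the comments describe intent; the exact build wiring is not shown in this hunk):

    #if defined(__ARM_FEATURE_SVE)
    // Defined by the compiler itself, only if this file is built with SVE in its -march/-mcpu.
    #endif

    #if defined(ARM_COMPUTE_ENABLE_SVE)
    // Defined by the library's build scripts when SVE support is requested, independently of
    // the architecture flags used for any single translation unit.
    #endif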
diff --git a/src/cpu/kernels/elementwise/sve/elementwise_list.h b/src/cpu/kernels/elementwise_binary/generic/sve/impl.h
index f762587ce..b7425c862 100644
--- a/src/cpu/kernels/elementwise/sve/elementwise_list.h
+++ b/src/cpu/kernels/elementwise_binary/generic/sve/impl.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,16 +24,10 @@
#ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H
#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H
#if defined(ARM_COMPUTE_ENABLE_SVE)
+
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/SVEMath.h"
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
#include "src/core/NEON/wrapper/svtraits.h"
-#include "src/cpu/kernels/elementwise/sve/elementwise_list.h"
-#include <arm_sve.h>
namespace arm_compute
{
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp
new file mode 100644
index 000000000..a4f4d0fc8
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+template <ArithmeticOperation op>
+void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithmetic_op<op, int32_t>(in1, in2, out, window);
+}
+template void sve_s32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ArithmeticOperation op>
+void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithmetic_op<op, int16_t>(in1, in2, out, window);
+}
+template void sve_s16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ComparisonOperation op>
+void sve_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comparison_op<op, uint8_t>(in1, in2, out, window);
+}
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ComparisonOperation op>
+void sve_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comparison_op<op, int16_t>(in1, in2, out, window);
+}
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ComparisonOperation op>
+void sve_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comparison_op<op, int32_t>(in1, in2, out, window);
+}
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+} // namespace cpu
+} // namespace arm_compute
+#endif //defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/cpu/kernels/elementwise/sve/elementwise_quantized_list.h b/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h
index a5d17a86a..c35ca2d6c 100644
--- a/src/cpu/kernels/elementwise/sve/elementwise_quantized_list.h
+++ b/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,10 +25,7 @@
#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H
#if defined(ARM_COMPUTE_ENABLE_SVE2)
-
-#include "src/core/NEON/wrapper/svtraits.h"
-#include "src/cpu/kernels/elementwise/sve/elementwise_list.h"
-
+#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h"
namespace arm_compute
{
namespace cpu
@@ -66,7 +63,7 @@ struct BroadcastQuantizedLoopArguments
const svfloat32_t &out_scale;
};
-svfloat32x4_t load_quantized(const int8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale)
+inline svfloat32x4_t load_quantized(const int8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale)
{
auto x = svld1(pg, ptr);
@@ -85,7 +82,7 @@ svfloat32x4_t load_quantized(const int8_t *ptr, svbool_t pg, const svint32_t &of
svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 3), offset)), scale));
}
-svfloat32x4_t load_quantized(const uint8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale)
+inline svfloat32x4_t load_quantized(const uint8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale)
{
auto x = svld1(pg, ptr);
@@ -106,7 +103,7 @@ svfloat32x4_t load_quantized(const uint8_t *ptr, svbool_t pg, const svint32_t &o
svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 3)), offset)), scale));
}
-void store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale)
+inline void store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale)
{
const auto quantized = svcreate4(
svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset),
@@ -120,7 +117,7 @@ void store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint3
svst1(pg, ptr, narrowed);
}
-void store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale)
+inline void store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale)
{
const auto quantized = svcreate4(
svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset),
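Besides the rename, the only change to these quantized helpers is the added inline. The header is now included from more than one translation unit (the sve2/qasymm8.cpp and sve2/qasymm8_signed.cpp files in this patch), so non-inline function definitions in it would be emitted once per including .cpp and collide at link time. A stripped-down illustration with a hypothetical helper (not a name from this patch):

    // helpers.h, included from two .cpp files:
    inline int dequant(int q, int offset) { return q - offset; } // OK: inline permits a
                                                                  // definition in every TU.
    // int dequant(int q, int offset) { return q - offset; }     // Without inline: multiple
                                                                  // definition error at link time.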
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp
new file mode 100644
index 000000000..63c75c3d4
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ARM_COMPUTE_ENABLE_SVE2)
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_binary/generic/sve2/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+template <ArithmeticOperation op>
+void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithmetic_quantized_op<op, uint8_t>(in1, in2, out, window);
+}
+
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ComparisonOperation op>
+void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comparison_quantized_op<op, uint8_t>(in1, in2, out, window);
+}
+
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+} // namespace cpu
+} // namespace arm_compute
+#endif //defined(ARM_COMPUTE_ENABLE_SVE2)
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp
new file mode 100644
index 000000000..fe332df38
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ARM_COMPUTE_ENABLE_SVE2)
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_binary/generic/sve2/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+template <ArithmeticOperation op>
+void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithmetic_quantized_op<op, int8_t>(in1, in2, out, window);
+}
+
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ComparisonOperation op>
+void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comparison_quantized_op<op, int8_t>(in1, in2, out, window);
+}
+
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+} // namespace cpu
+} // namespace arm_compute
+#endif //defined(ARM_COMPUTE_ENABLE_SVE2)
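
The SVE2 files above only instantiate elementwise_arithmetic_quantized_op / elementwise_comparison_quantized_op for uint8_t and int8_t. For reference, the stand-alone sketch below shows the scalar dequantise -> operate -> requantise flow that quantized elementwise kernels of this kind vectorise; the QInfo struct, helper names and example values are illustrative stand-ins, not the library's own types or code path.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

struct QInfo
{
    float   scale;
    int32_t offset;
};

static float dequantize(int8_t v, QInfo q)
{
    return (static_cast<int32_t>(v) - q.offset) * q.scale;
}

static int8_t quantize(float v, QInfo q)
{
    const int32_t r = static_cast<int32_t>(std::lround(v / q.scale)) + q.offset;
    return static_cast<int8_t>(std::min(127, std::max(-128, r)));
}

int main()
{
    const QInfo  qa{ 0.5f, 10 }, qb{ 0.25f, -4 }, qo{ 0.5f, 0 };
    const int8_t a = 30, b = 20; // encode 10.0f and 6.0f
    // Dequantize both operands, add in float, requantize with the output qinfo.
    const int8_t out = quantize(dequantize(a, qa) + dequantize(b, qb), qo);
    std::printf("out = %d (%.2f)\n", out, dequantize(out, qo)); // out = 32 (16.00)
    return 0;
}
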
diff --git a/src/cpu/kernels/elementwise_binary/list.h b/src/cpu/kernels/elementwise_binary/list.h
new file mode 100644
index 000000000..78a098e7b
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/list.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_KERNELS_ELEMENTWISE_BINARY_LIST_H
+#define SRC_CORE_KERNELS_ELEMENTWISE_BINARY_LIST_H
+
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_ELEMENTWISE_BINARY_KERNEL(func_name) \
+ template <ArithmeticOperation op> \
+ void func_name(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+
+DECLARE_ELEMENTWISE_BINARY_KERNEL(sve_fp16_elementwise_binary);
+DECLARE_ELEMENTWISE_BINARY_KERNEL(sve_fp32_elementwise_binary);
+DECLARE_ELEMENTWISE_BINARY_KERNEL(sve_s32_elementwise_binary);
+DECLARE_ELEMENTWISE_BINARY_KERNEL(sve_s16_elementwise_binary);
+DECLARE_ELEMENTWISE_BINARY_KERNEL(sve2_qasymm8_signed_elementwise_binary);
+DECLARE_ELEMENTWISE_BINARY_KERNEL(sve2_qasymm8_elementwise_binary);
+DECLARE_ELEMENTWISE_BINARY_KERNEL(neon_qasymm8_signed_elementwise_binary);
+DECLARE_ELEMENTWISE_BINARY_KERNEL(neon_qasymm8_elementwise_binary);
+DECLARE_ELEMENTWISE_BINARY_KERNEL(neon_fp16_elementwise_binary);
+DECLARE_ELEMENTWISE_BINARY_KERNEL(neon_fp32_elementwise_binary);
+DECLARE_ELEMENTWISE_BINARY_KERNEL(neon_s16_elementwise_binary);
+DECLARE_ELEMENTWISE_BINARY_KERNEL(neon_s32_elementwise_binary);
+
+#undef DECLARE_ELEMENTWISE_BINARY_KERNEL
+
+#define DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL(func_name) \
+ template <ComparisonOperation op> \
+ void func_name(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+
+DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL(sve_u8_comparison_elementwise_binary);
+DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL(sve_s16_comparison_elementwise_binary);
+DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL(sve_s32_comparison_elementwise_binary);
+DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL(sve_fp32_comparison_elementwise_binary);
+DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL(sve_fp16_comparison_elementwise_binary);
+DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL(sve2_qasymm8_signed_comparison_elementwise_binary);
+DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL(sve2_qasymm8_comparison_elementwise_binary);
+DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL(neon_qasymm8_comparison_elementwise_binary);
+DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL(neon_qasymm8_signed_comparison_elementwise_binary);
+DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL(neon_fp16_comparison_elementwise_binary);
+DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL(neon_u8_comparison_elementwise_binary);
+DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL(neon_s16_comparison_elementwise_binary);
+DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL(neon_s32_comparison_elementwise_binary);
+DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL(neon_fp32_comparison_elementwise_binary);
+#undef DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL
+
+} // namespace cpu
+} // namespace arm_compute
+#endif // SRC_CORE_KERNELS_ELEMENTWISE_BINARY_LIST_H
\ No newline at end of file
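
list.h only declares the per-backend entry points; the definitions live in the per-data-type .cpp files above, and the choice between them is made elsewhere in the kernel. The self-contained sketch below, with stand-in Tensor/Win types, shows how a macro of this shape expands into templated declarations and how the resulting instantiations can be gathered into a lookup table; it is an illustration of the pattern, not the library's dispatch code.

#include <cstdio>

struct Tensor
{
};
struct Win
{
};
enum class ArithmeticOperation
{
    ADD,
    SUB
};

// Same shape as the macro in list.h: one templated declaration per entry point.
#define DECLARE_ELEMENTWISE_BINARY_KERNEL(func_name) \
    template <ArithmeticOperation op>                \
    void func_name(const Tensor *in1, const Tensor *in2, Tensor *out, const Win &window)

DECLARE_ELEMENTWISE_BINARY_KERNEL(neon_fp32_elementwise_binary);
#undef DECLARE_ELEMENTWISE_BINARY_KERNEL

// Definition matching the declaration above (normally in a separate .cpp file).
template <ArithmeticOperation op>
void neon_fp32_elementwise_binary(const Tensor *, const Tensor *, Tensor *, const Win &)
{
    std::printf("running %s\n", op == ArithmeticOperation::ADD ? "ADD" : "SUB");
}

using KernelFn = void (*)(const Tensor *, const Tensor *, Tensor *, const Win &);

int main()
{
    // One instantiation per operation, selectable at run time through a table.
    const KernelFn table[] = { &neon_fp32_elementwise_binary<ArithmeticOperation::ADD>,
                               &neon_fp32_elementwise_binary<ArithmeticOperation::SUB> };
    Tensor a, b, o;
    table[0](&a, &b, &o, Win{}); // prints "running ADD"
    return 0;
}
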
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp
new file mode 100644
index 000000000..976d006f1
--- /dev/null
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp16_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
+{
+ return elementwise_op<__fp16>(in, out, window, op);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp
new file mode 100644
index 000000000..21f4d9d32
--- /dev/null
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
+{
+ return elementwise_op<float>(in, out, window, op);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise/neon/elementwise_unary_list.h b/src/cpu/kernels/elementwise_unary/generic/neon/impl.h
index 307e95fae..fd930ae7c 100644
--- a/src/cpu/kernels/elementwise/neon/elementwise_unary_list.h
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/impl.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp
new file mode 100644
index 000000000..ef3120e20
--- /dev/null
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_s32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
+{
+ return elementwise_op<int32_t>(in, out, window, op);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp
new file mode 100644
index 000000000..4dd4a1905
--- /dev/null
+++ b/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void sve_fp16_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
+{
+ return elementwise_sve_op<float16_t>(in, out, window, op);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif //defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp
new file mode 100644
index 000000000..3498a0b1e
--- /dev/null
+++ b/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void sve_fp32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
+{
+ return elementwise_sve_op<float32_t>(in, out, window, op);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif //defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/cpu/kernels/elementwise/sve/elementwise_unary.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp
index ddf1febd6..0c04a56be 100644
--- a/src/cpu/kernels/elementwise/sve/elementwise_unary.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,15 +21,10 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/SVEMath.h"
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include <arm_sve.h>
namespace arm_compute
{
@@ -108,6 +103,7 @@ void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, E
template void elementwise_sve_op<float16_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op);
template void elementwise_sve_op<float32_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op);
template void elementwise_sve_op<int32_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op);
+
} // namespace cpu
} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE) */
\ No newline at end of file
+#endif //defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/cpu/kernels/elementwise/sve/elementwise_unary_list.h b/src/cpu/kernels/elementwise_unary/generic/sve/impl.h
index c2b495f27..08f443869 100644
--- a/src/cpu/kernels/elementwise/sve/elementwise_unary_list.h
+++ b/src/cpu/kernels/elementwise_unary/generic/sve/impl.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,10 +23,7 @@
*/
#ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H
#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H
-
-#include "arm_compute/core/Types.h"
#if defined(ARM_COMPUTE_ENABLE_SVE)
-
namespace arm_compute
{
namespace cpu
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp
new file mode 100644
index 000000000..c3e3adfc9
--- /dev/null
+++ b/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void sve_s32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
+{
+ return elementwise_sve_op<int32_t>(in, out, window, op);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif //defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/cpu/kernels/elementwise_unary/list.h b/src/cpu/kernels/elementwise_unary/list.h
new file mode 100644
index 000000000..2a41b74c5
--- /dev/null
+++ b/src/cpu/kernels/elementwise_unary/list.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_KERNELS_ELEMENTWISE_UNARY_LIST_H
+#define SRC_CORE_KERNELS_ELEMENTWISE_UNARY_LIST_H
+
+#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
+#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_ELEMENTWISE_UNARY_KERNEL(func_name) \
+ void func_name(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
+
+DECLARE_ELEMENTWISE_UNARY_KERNEL(sve_fp32_elementwise_unary);
+DECLARE_ELEMENTWISE_UNARY_KERNEL(sve_fp16_elementwise_unary);
+DECLARE_ELEMENTWISE_UNARY_KERNEL(sve_s32_elementwise_unary);
+DECLARE_ELEMENTWISE_UNARY_KERNEL(neon_fp32_elementwise_unary);
+DECLARE_ELEMENTWISE_UNARY_KERNEL(neon_fp16_elementwise_unary);
+DECLARE_ELEMENTWISE_UNARY_KERNEL(neon_s32_elementwise_unary);
+
+#undef DECLARE_ELEMENTWISE_UNARY_KERNEL
+
+} // namespace cpu
+} // namespace arm_compute
+#endif // SRC_CORE_KERNELS_ELEMENTWISE_UNARY_LIST_H
\ No newline at end of file
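
Unlike the binary kernels, the elementwise-unary entry points declared here take the operation as a runtime ElementWiseUnary argument rather than a template parameter. A minimal scalar illustration of that pattern, using a stand-in UnaryOp enum and plain loops instead of the library's Window/Iterator machinery:

#include <cmath>
#include <cstdio>

// Stand-in for the library's ElementWiseUnary enum; values chosen for illustration.
enum class UnaryOp
{
    RSQRT,
    EXP,
    NEG,
    ABS
};

static float apply(UnaryOp op, float x)
{
    switch(op)
    {
        case UnaryOp::RSQRT:
            return 1.f / std::sqrt(x);
        case UnaryOp::EXP:
            return std::exp(x);
        case UnaryOp::NEG:
            return -x;
        case UnaryOp::ABS:
            return std::fabs(x);
    }
    return x;
}

int main()
{
    float data[4] = { 1.f, 4.f, 9.f, 16.f };
    for(float &v : data)
    {
        v = apply(UnaryOp::RSQRT, v); // the op is a runtime argument, as in the kernels above
    }
    std::printf("%.3f %.3f %.3f %.3f\n", data[0], data[1], data[2], data[3]); // 1.000 0.500 0.333 0.250
    return 0;
}
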
diff --git a/src/cpu/kernels/genproposals/generic/neon/fp16.cpp b/src/cpu/kernels/genproposals/generic/neon/fp16.cpp
new file mode 100644
index 000000000..d4e469b69
--- /dev/null
+++ b/src/cpu/kernels/genproposals/generic/neon/fp16.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+#include "src/cpu/kernels/genproposals/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp16_computeallanchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)
+{
+ return compute_all_anchors<float16_t>(anchors, all_anchors, anchors_info, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
diff --git a/src/cpu/kernels/genproposals/generic/neon/fp32.cpp b/src/cpu/kernels/genproposals/generic/neon/fp32.cpp
new file mode 100644
index 000000000..09aa6ecec
--- /dev/null
+++ b/src/cpu/kernels/genproposals/generic/neon/fp32.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/genproposals/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp32_computeallanchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)
+{
+ return compute_all_anchors<float>(anchors, all_anchors, anchors_info, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/genproposals/generic/neon/impl.cpp b/src/cpu/kernels/genproposals/generic/neon/impl.cpp
new file mode 100644
index 000000000..824e85ada
--- /dev/null
+++ b/src/cpu/kernels/genproposals/generic/neon/impl.cpp
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2019-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/genproposals/generic/neon/impl.h"
+namespace arm_compute
+{
+class ITensor;
+class Window;
+namespace cpu
+{
+template <typename T>
+void compute_all_anchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)
+{
+ Iterator all_anchors_it(all_anchors, window);
+ Iterator anchors_it(all_anchors, window);
+
+ const size_t num_anchors = anchors->info()->dimension(1);
+ const T stride = 1.f / anchors_info.spatial_scale();
+ const size_t feat_width = anchors_info.feat_width();
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const size_t anchor_offset = id.y() % num_anchors;
+
+ const auto out_anchor_ptr = reinterpret_cast<T *>(all_anchors_it.ptr());
+ const auto anchor_ptr = reinterpret_cast<T *>(anchors->ptr_to_element(Coordinates(0, anchor_offset)));
+
+ const size_t shift_idy = id.y() / num_anchors;
+ const T shiftx = (shift_idy % feat_width) * stride;
+ const T shifty = (shift_idy / feat_width) * stride;
+
+ *out_anchor_ptr = *anchor_ptr + shiftx;
+ *(out_anchor_ptr + 1) = *(1 + anchor_ptr) + shifty;
+ *(out_anchor_ptr + 2) = *(2 + anchor_ptr) + shiftx;
+ *(out_anchor_ptr + 3) = *(3 + anchor_ptr) + shifty;
+ },
+ all_anchors_it);
+}
+
+template void compute_all_anchors<float>(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window);
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+template void compute_all_anchors<float16_t>(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window);
+#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+void compute_all_anchors_qasymm16(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)
+{
+ Iterator all_anchors_it(all_anchors, window);
+ Iterator anchors_it(all_anchors, window);
+
+ const size_t num_anchors = anchors->info()->dimension(1);
+ const float stride = 1.f / anchors_info.spatial_scale();
+ const size_t feat_width = anchors_info.feat_width();
+
+ const UniformQuantizationInfo qinfo = anchors->info()->quantization_info().uniform();
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const size_t anchor_offset = id.y() % num_anchors;
+
+ const auto out_anchor_ptr = reinterpret_cast<int16_t *>(all_anchors_it.ptr());
+ const auto anchor_ptr = reinterpret_cast<int16_t *>(anchors->ptr_to_element(Coordinates(0, anchor_offset)));
+
+ const size_t shift_idy = id.y() / num_anchors;
+ const float shiftx = (shift_idy % feat_width) * stride;
+ const float shifty = (shift_idy / feat_width) * stride;
+
+ const float new_anchor_x1 = dequantize_qsymm16(*anchor_ptr, qinfo.scale) + shiftx;
+ const float new_anchor_y1 = dequantize_qsymm16(*(1 + anchor_ptr), qinfo.scale) + shifty;
+ const float new_anchor_x2 = dequantize_qsymm16(*(2 + anchor_ptr), qinfo.scale) + shiftx;
+ const float new_anchor_y2 = dequantize_qsymm16(*(3 + anchor_ptr), qinfo.scale) + shifty;
+
+ *out_anchor_ptr = quantize_qsymm16(new_anchor_x1, qinfo.scale);
+ *(out_anchor_ptr + 1) = quantize_qsymm16(new_anchor_y1, qinfo.scale);
+ *(out_anchor_ptr + 2) = quantize_qsymm16(new_anchor_x2, qinfo.scale);
+ *(out_anchor_ptr + 3) = quantize_qsymm16(new_anchor_y2, qinfo.scale);
+ },
+ all_anchors_it);
+}
+} // namespace cpu
+} // namespace arm_compute
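
In compute_all_anchors, each output row y is the base anchor (y % num_anchors) translated by the feature-map cell (y / num_anchors), with cells walked row-major across feat_width and scaled by stride = 1 / spatial_scale. The small worked example below shows just that index arithmetic with illustrative parameter values; it is not the kernel itself.

#include <cstdio>

int main()
{
    const int   num_anchors   = 3;     // base anchors per feature-map cell
    const int   feat_width    = 4;     // feature-map width in cells
    const float spatial_scale = 0.25f; // feature-map pixels per input pixel
    const float stride        = 1.f / spatial_scale; // input pixels per cell

    for(int y = 0; y < 9; ++y)
    {
        const int   anchor_idx = y % num_anchors; // which base anchor
        const int   cell       = y / num_anchors; // which feature-map cell, row-major
        const float shiftx     = (cell % feat_width) * stride;
        const float shifty     = (cell / feat_width) * stride;
        std::printf("row %d -> anchor %d shifted by (%.0f, %.0f)\n", y, anchor_idx, shiftx, shifty);
    }
    // Rows 0-2 stay at cell (0,0); rows 3-5 shift x by 4; rows 6-8 shift x by 8; and so on.
    return 0;
}
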
diff --git a/src/cpu/kernels/genproposals/generic/neon/impl.h b/src/cpu/kernels/genproposals/generic/neon/impl.h
new file mode 100644
index 000000000..88f5e5202
--- /dev/null
+++ b/src/cpu/kernels/genproposals/generic/neon/impl.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_SVE_KERNELS_NEGENERATEPROPOSALSLAYERKERNEL_IMPL_H
+#define SRC_CORE_SVE_KERNELS_NEGENERATEPROPOSALSLAYERKERNEL_IMPL_H
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Window.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+namespace arm_compute
+{
+class ITensor;
+class Window;
+namespace cpu
+{
+template <typename T>
+void compute_all_anchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window);
+
+void compute_all_anchors_qasymm16(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window);
+} // namespace cpu
+} // namespace arm_compute
+#endif // SRC_CORE_SVE_KERNELS_NEGENERATEPROPOSALSLAYERKERNEL_IMPL_H
diff --git a/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp b/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp
new file mode 100644
index 000000000..cfb5a41d6
--- /dev/null
+++ b/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/genproposals/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_qu16_computeallanchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)
+{
+ return compute_all_anchors_qasymm16(anchors, all_anchors, anchors_info, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/genproposals/list.h b/src/cpu/kernels/genproposals/list.h
new file mode 100644
index 000000000..570c686e8
--- /dev/null
+++ b/src/cpu/kernels/genproposals/list.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_NEGENERATEPROPOSALSLAYERKERNEL_LIST_H
+#define SRC_CORE_NEON_KERNELS_NEGENERATEPROPOSALSLAYERKERNEL_LIST_H
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_NEGENERATEPROPOSALSLAYERKERNEL_KERNEL(func_name) \
+ void func_name(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)
+
+DECLARE_NEGENERATEPROPOSALSLAYERKERNEL_KERNEL(neon_qu16_computeallanchors);
+DECLARE_NEGENERATEPROPOSALSLAYERKERNEL_KERNEL(neon_fp16_computeallanchors);
+DECLARE_NEGENERATEPROPOSALSLAYERKERNEL_KERNEL(neon_fp32_computeallanchors);
+
+#undef DECLARE_NEGENERATEPROPOSALSLAYERKERNEL_KERNEL
+} // namespace cpu
+} // namespace arm_compute
+#endif /* SRC_CORE_NEON_KERNELS_NEGENERATEPROPOSALSLAYERKERNEL_LIST_H */
diff --git a/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp b/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp
new file mode 100644
index 000000000..e9fcc84b3
--- /dev/null
+++ b/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+#include "src/cpu/kernels/instancenorm/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp16_instancenorm(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, const Window &window)
+{
+ if(use_mixed_precision)
+ {
+ return instance_normalization_nchw<float16_t, float>(input, output, gamma, beta, epsilon, window);
+ }
+ else
+ {
+ return instance_normalization_nchw<float16_t>(input, output, gamma, beta, epsilon, window);
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp b/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp
new file mode 100644
index 000000000..061dd9585
--- /dev/null
+++ b/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/instancenorm/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp32_instancenorm(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, const Window &window)
+{
+ ARM_COMPUTE_UNUSED(use_mixed_precision);
+ return instance_normalization_nchw<float>(input, output, gamma, beta, epsilon, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/instancenorm/generic/neon/impl.cpp b/src/cpu/kernels/instancenorm/generic/neon/impl.cpp
new file mode 100644
index 000000000..e35cf9760
--- /dev/null
+++ b/src/cpu/kernels/instancenorm/generic/neon/impl.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2019-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/instancenorm/generic/neon/impl.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+namespace arm_compute
+{
+class ITensor;
+class Window;
+namespace cpu
+{
+template <typename InputType, typename AccType>
+void vector_float_sum(AccType &result, AccType &result_square, const InputType &inputs)
+{
+ result = wrapper::vadd(result, inputs);
+ result_square = wrapper::vadd(result_square, wrapper::vmul(inputs, inputs));
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <>
+inline void vector_float_sum(float32x4_t &result, float32x4_t &result_square, const float16x8_t &inputs)
+{
+ vector_float_sum(result, result_square, wrapper::vcvt<float>(wrapper::vgetlow(inputs)));
+ vector_float_sum(result, result_square, wrapper::vcvt<float>(wrapper::vgethigh(inputs)));
+}
+template <>
+inline float16x8_t vector_float_norm(const float16x8_t &inputs, const float32x4_t &vec_mean, const float32x4_t &vec_multip, const float32x4_t &vec_beta)
+{
+ const auto input_low = wrapper::vcvt<float>(wrapper::vgetlow(inputs));
+ const auto input_high = wrapper::vcvt<float>(wrapper::vgethigh(inputs));
+ const auto result_low = wrapper::vcvt<float16_t>(vector_float_norm(input_low, vec_mean, vec_multip, vec_beta));
+ const auto result_high = wrapper::vcvt<float16_t>(vector_float_norm(input_high, vec_mean, vec_multip, vec_beta));
+ float16x8_t result = wrapper::vcombine(result_low, result_high);
+
+ return result;
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+template <typename InputType, typename AccType>
+InputType vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta)
+{
+ return wrapper::vadd(wrapper::vmul(wrapper::vsub(inputs, vec_mean), vec_multip), vec_beta);
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <typename T, typename AccType>
+void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window)
+{
+ /** SIMD vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
+ // Clear X/Y dimensions on execution window as we handle the planes manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ constexpr int window_step_x = 16 / sizeof(T);
+ const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1);
+
+ Iterator input_it(input, win);
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ Window win_plane = window;
+ win_plane.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1));
+ win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1));
+
+ Iterator input_plane_it(input, win_plane);
+ Iterator output_plane_it(output, win_plane);
+
+ auto sum_h_w = static_cast<AccType>(0.f);
+ auto sum_squares_h_w = static_cast<AccType>(0.f);
+
+ execute_window_loop(win_plane, [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr());
+
+ auto vec_sum_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
+ auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
+
+ // Compute S elements per iteration
+ int x = window.x().start();
+ for(; x <= (window.x().end() - window_step_x); x += window_step_x)
+ {
+ auto vec_input_val = wrapper::vloadq(input_ptr + x);
+ vector_float_sum(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val);
+ }
+
+ auto vec2_sum_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w));
+ auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w));
+
+ vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w);
+ vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w);
+
+ sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0);
+ sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0);
+
+ // Compute left-over elements
+ for(; x < window.x().end(); ++x)
+ {
+ const auto value = static_cast<AccType>(*(input_ptr + x));
+ sum_h_w += value;
+ sum_squares_h_w += value * value;
+ }
+ },
+ input_plane_it, output_plane_it);
+
+ const auto mean_h_w = sum_h_w / elements_plane;
+ const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w;
+
+ const auto multip_h_w = gamma / std::sqrt(var_h_w + epsilon);
+ const auto vec_mean_h_w = wrapper::vdup_n(static_cast<AccType>(mean_h_w), ExactTagType{});
+ const auto vec_multip_h_w = wrapper::vdup_n(static_cast<AccType>(multip_h_w), ExactTagType{});
+ const auto vec_beta = wrapper::vdup_n(static_cast<AccType>(beta), ExactTagType{});
+
+ execute_window_loop(win_plane, [&](const Coordinates &)
+ {
+ auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr());
+ auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr());
+
+ // Compute S elements per iteration
+ int x = window.x().start();
+ //auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{});
+ for(; x <= (window.x().end() - window_step_x); x += window_step_x)
+ {
+ const auto vec_val = wrapper::vloadq(input_ptr + x);
+ const auto normalized_vec = vector_float_norm(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta);
+ wrapper::vstore(output_ptr + x, normalized_vec);
+ }
+
+ // Compute left-over elements
+ for(; x < window.x().end(); ++x)
+ {
+ const auto val = static_cast<AccType>(*(input_ptr + x));
+ *(output_ptr + x) = static_cast<T>((val - mean_h_w) * multip_h_w + beta);
+ }
+ },
+ input_plane_it, output_plane_it);
+ },
+ input_it);
+}
+
+template void instance_normalization_nchw<float>(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+template void instance_normalization_nchw<float16_t, float>(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
+template void instance_normalization_nchw<float16_t>(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
+#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+} // namespace cpu
+} // namespace arm_compute
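
instance_normalization_nchw computes the mean and variance of each H x W plane and then maps every element to (x - mean) * gamma / sqrt(var + epsilon) + beta; the NEON code does this with vector partial sums plus a scalar tail. Below is a plain scalar reference of the same per-plane maths, without the vectorisation or Window handling, for cross-checking the arithmetic:

#include <cmath>
#include <cstdio>
#include <vector>

// Normalise one H x W plane in place: x -> (x - mean) * gamma / sqrt(var + eps) + beta.
static void instance_norm_plane(std::vector<float> &plane, float gamma, float beta, float epsilon)
{
    double sum = 0.0, sum_sq = 0.0;
    for(float v : plane)
    {
        sum += v;
        sum_sq += static_cast<double>(v) * v;
    }
    const double mean   = sum / plane.size();
    const double var    = sum_sq / plane.size() - mean * mean;
    const double multip = gamma / std::sqrt(var + epsilon);
    for(float &v : plane)
    {
        v = static_cast<float>((v - mean) * multip + beta);
    }
}

int main()
{
    std::vector<float> plane = { 1.f, 2.f, 3.f, 4.f }; // a single 2x2 plane
    instance_norm_plane(plane, /*gamma=*/1.f, /*beta=*/0.f, /*epsilon=*/1e-12f);
    for(float v : plane)
    {
        std::printf("%.4f ", v); // approx -1.3416 -0.4472 0.4472 1.3416
    }
    std::printf("\n");
    return 0;
}
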
diff --git a/src/cpu/kernels/instancenorm/generic/neon/impl.h b/src/cpu/kernels/instancenorm/generic/neon/impl.h
new file mode 100644
index 000000000..fa4b4b656
--- /dev/null
+++ b/src/cpu/kernels/instancenorm/generic/neon/impl.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_SVE_KERNELS_INSTANCENORM_IMPL_H
+#define SRC_CORE_SVE_KERNELS_INSTANCENORM_IMPL_H
+#include "arm_compute/core/Helpers.h"
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename T, typename AccType = T>
+void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
+
+template <typename InputType, typename AccType = InputType>
+void vector_float_sum(AccType &result, AccType &result_square, const InputType &inputs);
+
+template <typename InputType, typename AccType = InputType>
+InputType vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta);
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+template <>
+inline void vector_float_sum(float32x4_t &result, float32x4_t &result_square, const float16x8_t &inputs);
+
+template <>
+inline float16x8_t vector_float_norm(const float16x8_t &inputs, const float32x4_t &vec_mean, const float32x4_t &vec_multip, const float32x4_t &vec_beta);
+#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+} // namespace cpu
+} // namespace arm_compute
+#endif // SRC_CORE_SVE_KERNELS_INSTANCENORM_IMPL_H
diff --git a/src/cpu/kernels/instancenorm/list.h b/src/cpu/kernels/instancenorm/list.h
new file mode 100644
index 000000000..54f1d3213
--- /dev/null
+++ b/src/cpu/kernels/instancenorm/list.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_INSTANCENORM_LIST_H
+#define SRC_CORE_NEON_KERNELS_INSTANCENORM_LIST_H
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_INSTANCENORM_KERNEL(func_name) \
+ void func_name(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, const Window &window)
+DECLARE_INSTANCENORM_KERNEL(neon_fp32_instancenorm);
+DECLARE_INSTANCENORM_KERNEL(neon_fp16_instancenorm);
+#undef DECLARE_INSTANCENORM_KERNEL
+} // namespace cpu
+} // namespace arm_compute
+#endif //SRC_CORE_NEON_KERNELS_INSTANCENORM_LIST_H
diff --git a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp
index a71864c10..73bf7dcb8 100644
--- a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp
+++ b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -56,7 +56,7 @@ constexpr unsigned int idx_batches = 3;
template <typename TSrc, typename TWeights, typename TDst>
void create_arm_dwc(const ITensorInfo *src, const ITensorInfo *weights, ITensorInfo *dst,
const ConvolutionInfo &info, const CPUInfo &cpu_info,
- std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel)
+ std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel, std::string &_name)
{
unsigned int stride_cols{};
unsigned int stride_rows{};
@@ -88,6 +88,7 @@ void create_arm_dwc(const ITensorInfo *src, const ITensorInfo *weights, ITensorI
return;
}
+ _name = dwc_kernel_asm->name();
kernel = std::move(dwc_kernel_asm);
}
@@ -95,7 +96,8 @@ template <typename TSrc, typename TWeights, typename TDst>
void create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, ITensorInfo *dst,
const ConvolutionInfo &info, const CPUInfo &cpu_info,
std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel,
- std::vector<int32_t> &multipliers, std::vector<int32_t> &right_shifts, std::vector<int32_t> &left_shifts)
+ std::vector<int32_t> &multipliers, std::vector<int32_t> &right_shifts, std::vector<int32_t> &left_shifts,
+ std::string &_name)
{
unsigned int stride_cols{};
unsigned int stride_rows{};
@@ -189,7 +191,7 @@ void create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, IT
// Configuration not supported: Leave function unconfigured:
return;
}
-
+ _name = dwc_kernel_asm->name();
kernel = std::move(dwc_kernel_asm);
}
} // namespace
@@ -198,7 +200,8 @@ CpuDepthwiseConv2dAssemblyWrapperKernel::CpuDepthwiseConv2dAssemblyWrapperKernel
: _kernel_asm(nullptr),
_multipliers(),
_left_shifts(),
- _right_shifts()
+ _right_shifts(),
+ _name()
{
}
@@ -213,30 +216,31 @@ void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src,
// Destination initialization if not yet initialized
const TensorShape dst_shape = compute_depthwise_convolution_shape(*src, *weights, info);
auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape));
-
+ _name = "CpuDepthwiseConv2dAssemblyWrapperKernel";
+ std::string asm_kernel_name("");
#if defined(__aarch64__)
switch(src->data_type())
{
case DataType::QASYMM8:
if(is_data_type_quantized_per_channel(weights->data_type()))
{
- create_arm_dwc_quant<uint8_t, int8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts);
+ create_arm_dwc_quant<uint8_t, int8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts, asm_kernel_name);
}
else
{
- create_arm_dwc_quant<uint8_t, uint8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts);
+ create_arm_dwc_quant<uint8_t, uint8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts, asm_kernel_name);
}
break;
case DataType::QASYMM8_SIGNED:
- create_arm_dwc_quant<int8_t, int8_t, int8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts);
+ create_arm_dwc_quant<int8_t, int8_t, int8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts, asm_kernel_name);
break;
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
case DataType::F16:
- create_arm_dwc<float16_t, float16_t, float16_t>(src, weights, dst, info, cpu_info, _kernel_asm);
+ create_arm_dwc<float16_t, float16_t, float16_t>(src, weights, dst, info, cpu_info, _kernel_asm, asm_kernel_name);
break;
#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
case DataType::F32:
- create_arm_dwc<float, float, float>(src, weights, dst, info, cpu_info, _kernel_asm);
+ create_arm_dwc<float, float, float>(src, weights, dst, info, cpu_info, _kernel_asm, asm_kernel_name);
break;
default:
break;
@@ -245,6 +249,10 @@ void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src,
Window win = calculate_max_window(*dst, Steps());
ICpuKernel::configure(win);
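+ // Append the selected assembly micro-kernel's name so that name() reports the exact implementation in use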
+ if(_kernel_asm != nullptr)
+ {
+ _name += "/" + asm_kernel_name;
+ }
}
Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info)
@@ -352,14 +360,15 @@ bool CpuDepthwiseConv2dAssemblyWrapperKernel::is_configured() const
const char *CpuDepthwiseConv2dAssemblyWrapperKernel::name() const
{
- return "CpuDepthwiseConv2dAssemblyWrapperKernel";
+ return _name.c_str();
}
size_t CpuDepthwiseConv2dAssemblyWrapperKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
{
- ARM_COMPUTE_UNUSED(platform, thread_count);
+ ARM_COMPUTE_UNUSED(thread_count);
+ ARM_COMPUTE_UNUSED(platform);
- return ICPPKernel::small_network_mws;
+ return ICPPKernel::default_mws;
}
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h
index 898092294..a32a7a3ec 100644
--- a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h
+++ b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,6 +27,7 @@
#include "arm_compute/core/Types.h"
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
+#include "src/cpu/kernels/CpuKernelSelectionTypes.h"
namespace arm_conv
{
@@ -44,7 +45,7 @@ namespace cpu
namespace kernels
{
/** This class is a wrapper for the depthwise convolution assembly kernels. */
-class CpuDepthwiseConv2dAssemblyWrapperKernel final : public ICpuKernel
+class CpuDepthwiseConv2dAssemblyWrapperKernel final : public ICpuKernel<CpuDepthwiseConv2dAssemblyWrapperKernel>
{
public:
/** Default constructor */
@@ -122,6 +123,7 @@ private:
std::vector<int32_t> _multipliers{};
std::vector<int32_t> _left_shifts{};
std::vector<int32_t> _right_shifts{};
+ std::string _name{};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
index f9c11fd4b..aad1daaf5 100644
--- a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
+++ b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -104,6 +104,8 @@ Status CpuPool2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const IT
ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.pool_type != PoolingType::AVG) && (info.pool_type != PoolingType::MAX),
"Only AVG and MAX pooling are supported by assembly kernels");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_region_entirely_outside_input(info), "Pooling region that is entirely outside input tensor is unsupported by assembly kernels");
+
if(dst->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
@@ -277,9 +279,20 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInf
size_t CpuPool2dAssemblyWrapperKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
{
- ARM_COMPUTE_UNUSED(platform, thread_count);
-
- return ICPPKernel::small_network_mws;
+ ARM_COMPUTE_UNUSED(thread_count);
+ // Tuning results that gave optimized results in performance investigation
+ if(platform.get_cpu_model() == CPUModel::A73)
+ {
+ return 10240;
+ }
+ else if(platform.get_cpu_model() == CPUModel::A53 || platform.get_cpu_model() == CPUModel::A55r0 || platform.get_cpu_model() == CPUModel::A55r1)
+ {
+ return 1;
+ }
+ else
+ {
+ return 9216;
+ }
}
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h
index 8625fd96b..8713d5c54 100644
--- a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h
+++ b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,6 +28,7 @@
#include "src/core/NEON/kernels/assembly/pooling.hpp"
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
+#include "src/cpu/kernels/CpuKernelSelectionTypes.h"
#include "pool_common.hpp"
@@ -45,7 +46,7 @@ namespace kernels
* execute a single assembly kernel in the context of an NEFunction.
*
*/
-class CpuPool2dAssemblyWrapperKernel final : public ICpuKernel
+class CpuPool2dAssemblyWrapperKernel final : public ICpuKernel<CpuPool2dAssemblyWrapperKernel>
{
public:
/** Constructor
diff --git a/src/cpu/kernels/maxunpool/generic/neon/fp16.cpp b/src/cpu/kernels/maxunpool/generic/neon/fp16.cpp
new file mode 100644
index 000000000..d43503aa2
--- /dev/null
+++ b/src/cpu/kernels/maxunpool/generic/neon/fp16.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+#include "src/cpu/kernels/maxunpool/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp16_maxunpooling(const ITensor *input, ITensor *output, const ITensor *indices, const Window &window)
+{
+ return max_unpooling<float16_t>(input, output, indices, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/maxunpool/generic/neon/fp32.cpp b/src/cpu/kernels/maxunpool/generic/neon/fp32.cpp
new file mode 100644
index 000000000..2f96e8669
--- /dev/null
+++ b/src/cpu/kernels/maxunpool/generic/neon/fp32.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/maxunpool/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp32_maxunpooling(const ITensor *input, ITensor *output, const ITensor *indices, const Window &window)
+{
+ return max_unpooling<float>(input, output, indices, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/maxunpool/generic/neon/impl.cpp b/src/cpu/kernels/maxunpool/generic/neon/impl.cpp
new file mode 100644
index 000000000..8bbc8d128
--- /dev/null
+++ b/src/cpu/kernels/maxunpool/generic/neon/impl.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2020-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/maxunpool/generic/neon/impl.h"
+namespace arm_compute
+{
+class ITensor;
+class Window;
+namespace cpu
+{
+template <typename T>
+void max_unpooling(const ITensor *input, ITensor *output, const ITensor *indices, const Window &window)
+{
+ Iterator input_itr(input, window);
+ Iterator indices_itr(indices, window);
+ auto out_ptr = reinterpret_cast<T *>(output->buffer());
+ const int out_stride_w = static_cast<int>(output->info()->strides_in_bytes()[3]);
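+ // Scatter: each input value is written to the output element indicated by the corresponding max-pooling index,
+ // offset by the batch (4th dimension) stride of the output tensor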
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ auto vindices = reinterpret_cast<uint32_t *>(indices_itr.ptr());
+ auto vinput = reinterpret_cast<T *>(input_itr.ptr());
+ out_ptr[id[3] * out_stride_w / sizeof(T) + *vindices] = *vinput;
+ },
+ input_itr, indices_itr);
+}
+template void max_unpooling<float>(const ITensor *input, ITensor *output, const ITensor *indices, const Window &window);
+template void max_unpooling<int8_t>(const ITensor *input, ITensor *output, const ITensor *indices, const Window &window);
+template void max_unpooling<uint8_t>(const ITensor *input, ITensor *output, const ITensor *indices, const Window &window);
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+template void max_unpooling<float16_t>(const ITensor *input, ITensor *output, const ITensor *indices, const Window &window);
+#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/maxunpool/generic/neon/impl.h b/src/cpu/kernels/maxunpool/generic/neon/impl.h
new file mode 100644
index 000000000..6a14c66b3
--- /dev/null
+++ b/src/cpu/kernels/maxunpool/generic/neon/impl.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_SVE_KERNELS_MAXUNPOOLING_IMPL_H
+#define SRC_CORE_SVE_KERNELS_MAXUNPOOLING_IMPL_H
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Window.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+namespace arm_compute
+{
+class ITensor;
+class Window;
+namespace cpu
+{
+template <typename T>
+void max_unpooling(const ITensor *input, ITensor *output, const ITensor *indices, const Window &window);
+} // namespace cpu
+} // namespace arm_compute
+#endif // SRC_CORE_SVE_KERNELS_MAXUNPOOLING_IMPL_H
diff --git a/src/cpu/kernels/maxunpool/generic/neon/qasymm8.cpp b/src/cpu/kernels/maxunpool/generic/neon/qasymm8.cpp
new file mode 100644
index 000000000..b6d0f48fd
--- /dev/null
+++ b/src/cpu/kernels/maxunpool/generic/neon/qasymm8.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/maxunpool/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_qs8_maxunpooling(const ITensor *input, ITensor *output, const ITensor *indices, const Window &window)
+{
+ return max_unpooling<int8_t>(input, output, indices, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/maxunpool/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/maxunpool/generic/neon/qasymm8_signed.cpp
new file mode 100644
index 000000000..79f301380
--- /dev/null
+++ b/src/cpu/kernels/maxunpool/generic/neon/qasymm8_signed.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/maxunpool/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_qu8_maxunpooling(const ITensor *input, ITensor *output, const ITensor *indices, const Window &window)
+{
+ return max_unpooling<uint8_t>(input, output, indices, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/maxunpool/list.h b/src/cpu/kernels/maxunpool/list.h
new file mode 100644
index 000000000..0f9bb499d
--- /dev/null
+++ b/src/cpu/kernels/maxunpool/list.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_MAXUNPOOL_LIST_H
+#define SRC_CORE_NEON_KERNELS_MAXUNPOOL_LIST_H
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_MAXUNPOOL_KERNEL(func_name) \
+ void func_name(const ITensor *input, ITensor *output, const ITensor *indices, const Window &window)
+DECLARE_MAXUNPOOL_KERNEL(neon_fp32_maxunpooling);
+DECLARE_MAXUNPOOL_KERNEL(neon_fp16_maxunpooling);
+DECLARE_MAXUNPOOL_KERNEL(neon_qs8_maxunpooling);
+DECLARE_MAXUNPOOL_KERNEL(neon_qu8_maxunpooling);
+#undef DECLARE_MAXUNPOOL_KERNEL
+} // namespace cpu
+} // namespace arm_compute
+#endif //SRC_CORE_NEON_KERNELS_MAXUNPOOL_LIST_H
diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp
new file mode 100644
index 000000000..3e712b559
--- /dev/null
+++ b/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+#include "src/cpu/kernels/meanstddevnorm/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp16_meanstddevnorm(ITensor *input, ITensor *output, float epsilon, const Window &window)
+{
+ return mean_stddev_normalization<float16_t, 8>(input, output, epsilon, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/fp32.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/fp32.cpp
new file mode 100644
index 000000000..4bff26b03
--- /dev/null
+++ b/src/cpu/kernels/meanstddevnorm/generic/neon/fp32.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/meanstddevnorm/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp32_meanstddevnorm(ITensor *input, ITensor *output, float epsilon, const Window &window)
+{
+ return mean_stddev_normalization<float, 4>(input, output, epsilon, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp
new file mode 100644
index 000000000..be07ea78e
--- /dev/null
+++ b/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2019-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/meanstddevnorm/generic/neon/impl.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename ScalarType, int size>
+void mean_stddev_normalization(ITensor *input, ITensor *output, float epsilon, const Window &window)
+{
+ using ExactTagType = typename wrapper::traits::neon_vector<ScalarType, size>::tag_type;
+
+ // Collapse the window along X; the X dimension is traversed manually in the loop below
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const int window_step_x = size;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ Iterator input_itr(input, win);
+ Iterator output_itr(output, win);
+
+ execute_window_loop(win, [&](const Coordinates &)
+ {
+ int x = window_start_x;
+ auto in_ptr = reinterpret_cast<const ScalarType *>(input_itr.ptr());
+ auto out_ptr = reinterpret_cast<ScalarType *>(output_itr.ptr());
+
+ auto sum_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{});
+ auto sum_sq_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{});
+
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ auto data = wrapper::vloadq(in_ptr + x);
+ sum_vec = wrapper::vadd(sum_vec, data);
+ sum_sq_vec = wrapper::vadd(sum_sq_vec, wrapper::vmul(data, data));
+ }
+
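+ // Reduce the vector accumulators to scalars with pairwise additions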
+ auto sum_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_vec), wrapper::vgetlow(sum_vec));
+ auto sum_sq_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_sq_vec), wrapper::vgetlow(sum_sq_vec));
+ for(int i = 0; i < size / 4; ++i)
+ {
+ sum_carry_res = wrapper::vpadd(sum_carry_res, sum_carry_res);
+ sum_sq_carry_res = wrapper::vpadd(sum_sq_carry_res, sum_sq_carry_res);
+ }
+
+ auto sum = wrapper::vgetlane(sum_carry_res, 0);
+ auto sum_sq = wrapper::vgetlane(sum_sq_carry_res, 0);
+
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ ScalarType data = *(in_ptr + x);
+ sum += data;
+ sum_sq += data * data;
+ }
+
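+ // Compute mean and variance of the row, then normalise: out = (in - mean) / sqrt(var + epsilon)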
+ ScalarType mean = sum / input->info()->dimension(0);
+ ScalarType var = (sum_sq / input->info()->dimension(0)) - (mean * mean);
+ ScalarType stddev_inv = 1.f / sqrt(var + epsilon);
+
+ auto mean_vec = wrapper::vdup_n(mean, ExactTagType{});
+ auto stddev_inv_vec = wrapper::vdup_n(stddev_inv, ExactTagType{});
+ for(x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ auto data = wrapper::vloadq(in_ptr + x);
+ auto res = wrapper::vmul(wrapper::vsub(data, mean_vec), stddev_inv_vec);
+ // Store results
+ wrapper::vstore(out_ptr + x, res);
+ }
+ for(; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv;
+ }
+ },
+ input_itr, output_itr);
+}
+template void mean_stddev_normalization<float, 4>(ITensor *input, ITensor *output, float epsilon, const Window &window);
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+template void mean_stddev_normalization<float16_t, 8>(ITensor *input, ITensor *output, float epsilon, const Window &window);
+#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/impl.h b/src/cpu/kernels/meanstddevnorm/generic/neon/impl.h
new file mode 100644
index 000000000..6466506f0
--- /dev/null
+++ b/src/cpu/kernels/meanstddevnorm/generic/neon/impl.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_SVE_KERNELS_MEANSTDDEVNORM_IMPL_H
+#define SRC_CORE_SVE_KERNELS_MEANSTDDEVNORM_IMPL_H
+#include "arm_compute/core/Helpers.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename ScalarType, int size>
+void mean_stddev_normalization(ITensor *input, ITensor *output, float epsilon, const Window &window);
+
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // SRC_CORE_SVE_KERNELS_MEANSTDDEVNORM_IMPL_H
diff --git a/src/cpu/kernels/meanstddevnorm/list.h b/src/cpu/kernels/meanstddevnorm/list.h
new file mode 100644
index 000000000..ac9cb37d2
--- /dev/null
+++ b/src/cpu/kernels/meanstddevnorm/list.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_MEANSTDDEVNORM_LIST_H
+#define SRC_CORE_NEON_KERNELS_MEANSTDDEVNORM_LIST_H
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_MEANSTDDEVNORM_KERNEL(func_name) \
+ void func_name(ITensor *input, ITensor *output, float epsilon, const Window &window)
+
+DECLARE_MEANSTDDEVNORM_KERNEL(neon_fp32_meanstddevnorm);
+DECLARE_MEANSTDDEVNORM_KERNEL(neon_fp16_meanstddevnorm);
+
+#undef DECLARE_MEANSTDDEVNORM_KERNEL
+} // namespace cpu
+} // namespace arm_compute
+#endif //SRC_CORE_NEON_KERNELS_MEANSTDDEVNORM_LIST_H
diff --git a/src/cpu/kernels/pool2d/neon/fp16.cpp b/src/cpu/kernels/pool2d/neon/fp16.cpp
index 534d24ab4..72f63af3b 100644
--- a/src/cpu/kernels/pool2d/neon/fp16.cpp
+++ b/src/cpu/kernels/pool2d/neon/fp16.cpp
@@ -228,7 +228,7 @@ void poolingMxN_fp16_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1,
}
else
{
- vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
+ vres = vdupq_n_f16(-std::numeric_limits<float>::infinity());
for(int y = pool_start_y; y < pool_end_y; ++y)
{
@@ -287,7 +287,7 @@ void poolingMxN_fp16_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1,
}
else
{
- res = std::numeric_limits<float>::lowest();
+ res = -std::numeric_limits<float>::infinity();
for(int y = pool_start_y; y < pool_end_y; ++y)
{
for(int x = pool_start_x; x < pool_end_x; ++x)
diff --git a/src/cpu/kernels/pool2d/neon/fp32.cpp b/src/cpu/kernels/pool2d/neon/fp32.cpp
index 26a32ed9d..e4261f746 100644
--- a/src/cpu/kernels/pool2d/neon/fp32.cpp
+++ b/src/cpu/kernels/pool2d/neon/fp32.cpp
@@ -223,7 +223,7 @@ void poolingMxN_fp32_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1,
}
else
{
- vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
+ vres = vdupq_n_f32(-std::numeric_limits<float>::infinity());
for(int y = pool_start_y; y < pool_end_y; ++y)
{
for(int x = pool_start_x; x < pool_end_x; ++x)
@@ -285,7 +285,7 @@ void poolingMxN_fp32_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1,
}
else
{
- res = std::numeric_limits<float>::lowest();
+ res = -std::numeric_limits<float>::infinity();
for(int y = pool_start_y; y < pool_end_y; ++y)
{
for(int x = pool_start_x; x < pool_end_x; ++x)
diff --git a/src/cpu/kernels/pool2d/neon/nchw/all.cpp b/src/cpu/kernels/pool2d/neon/nchw/all.cpp
index 109fc1b28..10cbfc56a 100644
--- a/src/cpu/kernels/pool2d/neon/nchw/all.cpp
+++ b/src/cpu/kernels/pool2d/neon/nchw/all.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -93,7 +93,7 @@ void pooling3_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, P
const int src_h = src->info()->dimension(1);
const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
- constexpr float16_t fp16_min = -100.0f;
+ const float16_t fp16_min = -std::numeric_limits<half_float::half>::infinity();
const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.f;
const unsigned char *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
const unsigned char *const src_middle_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
@@ -142,7 +142,7 @@ void pooling3_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, P
// Calculate square-root in case of l2 pooling
if(pool_info.pool_type == PoolingType::L2)
{
- res = vinv_f16(vinvsqrt_f16(res));
+ res = vsqrt_f16(res);
}
*(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0);
@@ -203,7 +203,7 @@ void pooling2_nchw_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *d
const int pad_left = src->info()->padding().left;
const int pad_right = src->info()->padding().right;
const int in_stride_y = static_cast<int>(src->info()->strides_in_bytes().y());
- constexpr T float_min = -100.0f;
+ constexpr T float_min = -std::numeric_limits<float>::infinity();
const T fill_value = (pool_info.pool_type == PoolingType::MAX) ? float_min : 0.f;
execute_window_loop(window, [&](const Coordinates & id)
@@ -255,12 +255,12 @@ void pooling2_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, P
const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
int pool_stride_x, pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int src_w = src->info()->dimension(0);
- const int src_h = src->info()->dimension(1);
- const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
- constexpr float16_t fp16_min = -100.0f;
- const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f;
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const float16_t fp16_min = -std::numeric_limits<half_float::half>::infinity();
+ const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f;
const unsigned char *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
const unsigned char *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
@@ -304,7 +304,7 @@ void pooling2_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, P
// Calculate square-root in case of l2 pooling
if(pool_info.pool_type == PoolingType::L2)
{
- res = vinv_f16(vinvsqrt_f16(res));
+ res = vsqrt_f16(res);
}
// Store result
@@ -329,12 +329,12 @@ void poolingMxN_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1,
int pool_stride_x = 0;
int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int src_w = src->info()->dimension(0);
- const int src_h = src->info()->dimension(1);
- const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
- constexpr float16_t fp16_min = -100.0f;
- const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f;
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const float16_t fp16_min = -std::numeric_limits<half_float::half>::infinity();
+ const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f;
execute_window_loop(window, [&](const Coordinates & id)
{
@@ -421,7 +421,7 @@ void poolingMxN_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1,
const int src_h = src->info()->dimension(1);
const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
- const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? -std::numeric_limits<float>::max() : 0.0f;
+ const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? -std::numeric_limits<float>::infinity() : 0.0f;
execute_window_loop(window, [&](const Coordinates & id)
{
@@ -459,7 +459,7 @@ void poolingMxN_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1,
}
else // if max pooling
{
- res = std::numeric_limits<float>::lowest();
+ res = -std::numeric_limits<float>::infinity();
for(int y = 0; y < pool_size_y; ++y)
{
@@ -510,7 +510,7 @@ void pooling2_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, P
const int src_h = src->info()->dimension(1);
const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
- const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? -std::numeric_limits<float>::max() : 0.0f;
+ const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? -std::numeric_limits<float>::infinity() : 0.0f;
const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
@@ -584,7 +584,7 @@ void pooling3_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, P
const int src_h = src->info()->dimension(1);
const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
- const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? -std::numeric_limits<float>::max() : 0.0f;
+ const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? -std::numeric_limits<float>::infinity() : 0.0f;
const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
const uint8_t *const src_middle_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
@@ -630,7 +630,7 @@ void pooling3_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, P
else
{
const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
- res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
+ res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::infinity(), max_data, 3)), vget_low_f32(max_data));
res = vpmax_f32(res, res);
}
final_res = vget_lane_f32(res, 0);
@@ -665,7 +665,7 @@ void pooling7_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, P
const int src_h = src->info()->dimension(1);
const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
- const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? -std::numeric_limits<float>::max() : 0.0f;
+ const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? -std::numeric_limits<float>::infinity() : 0.0f;
std::array<const uint8_t *, pool_size> src_ptrs{ {} };
for(int i = 0; i < pool_size; ++i)
@@ -728,7 +728,7 @@ void pooling7_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, P
float32x4x2_t temp = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value);
data = vmax2q_f32(data, temp);
}
- res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), data.val[1], 3)), vget_low_f32(data.val[1]));
+ res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::infinity(), data.val[1], 3)), vget_low_f32(data.val[1]));
res = vpmax_f32(res, vpmax_f32(vget_high_f32(data.val[0]), vget_low_f32(data.val[0])));
res = vpmax_f32(res, res);
}
diff --git a/src/cpu/kernels/range/generic/neon/fp16.cpp b/src/cpu/kernels/range/generic/neon/fp16.cpp
new file mode 100644
index 000000000..5d50dce90
--- /dev/null
+++ b/src/cpu/kernels/range/generic/neon/fp16.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "src/cpu/kernels/range/generic/neon/impl.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void fp16_neon_range_function(ITensor *output, float start, float step, const Window &window)
+{
+ return neon_range_function<float16_t>(output, start, step, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/range/generic/neon/fp32.cpp b/src/cpu/kernels/range/generic/neon/fp32.cpp
new file mode 100644
index 000000000..6044f0f88
--- /dev/null
+++ b/src/cpu/kernels/range/generic/neon/fp32.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/range/generic/neon/impl.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void fp32_neon_range_function(ITensor *output, float start, float step, const Window &window)
+{
+ return neon_range_function<float32_t>(output, start, step, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/range/generic/neon/impl.cpp b/src/cpu/kernels/range/generic/neon/impl.cpp
new file mode 100644
index 000000000..f91251c91
--- /dev/null
+++ b/src/cpu/kernels/range/generic/neon/impl.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/range/generic/neon/impl.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/common/Registrars.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename T>
+void neon_range_function(ITensor *output, float start, float step, const Window &window)
+{
+ /** SIMD vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>::tag_type;
+
+ const auto step_vec = wrapper::vdup_n(static_cast<T>(step), ExactTagType{});
+ const auto start_vec = wrapper::vdup_n(static_cast<T>(start), ExactTagType{});
+ auto id_vec = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const int window_step_x = 16 / sizeof(T);
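+ // Number of elements processed per iteration: one 128-bit NEON vector's worth of T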
+
+ Window win{ window };
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Iterator output_it(output, win);
+
+ execute_window_loop(win, [&](const Coordinates &)
+ {
+ int x = window_start_x;
+ const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ for(int count = 0; count < window_step_x; ++count)
+ {
+ id_vec = wrapper::vsetlane(static_cast<T>(x + count), id_vec, count);
+ }
+
+ // start + step * id
+ const auto res_vec = wrapper::vmla(start_vec, id_vec, step_vec);
+ wrapper::vstore(out_ptr + x, res_vec);
+ }
+
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ const auto res = start + x * step;
+ *(out_ptr + x) = res;
+ }
+
+ },
+ output_it);
+}
+
+template void neon_range_function<uint8_t>(ITensor *output, float start, float step, const Window &window);
+template void neon_range_function<uint16_t>(ITensor *output, float start, float step, const Window &window);
+template void neon_range_function<uint32_t>(ITensor *output, float start, float step, const Window &window);
+template void neon_range_function<int8_t>(ITensor *output, float start, float step, const Window &window);
+template void neon_range_function<int16_t>(ITensor *output, float start, float step, const Window &window);
+template void neon_range_function<int32_t>(ITensor *output, float start, float step, const Window &window);
+template void neon_range_function<float32_t>(ITensor *output, float start, float step, const Window &window);
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+template void neon_range_function<float16_t>(ITensor *output, float start, float step, const Window &window);
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/tests/validation/reference/Remap.h b/src/cpu/kernels/range/generic/neon/impl.h
index 0726f7596..7ac2fc9c1 100644
--- a/tests/validation/reference/Remap.h
+++ b/src/cpu/kernels/range/generic/neon/impl.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,24 +21,18 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_TEST_REMAP_H
-#define ARM_COMPUTE_TEST_REMAP_H
-
-#include "tests/SimpleTensor.h"
+#ifndef SRC_CORE_NEON_KERNELS_RANGE_IMPL_H
+#define SRC_CORE_NEON_KERNELS_RANGE_IMPL_H
namespace arm_compute
{
-namespace test
-{
-namespace validation
-{
-namespace reference
+class ITensor;
+class Window;
+
+namespace cpu
{
template <typename T>
-SimpleTensor<T> remap(const SimpleTensor<T> &in, SimpleTensor<float> &map_x, SimpleTensor<float> &map_y, SimpleTensor<T> &valid_mask, InterpolationPolicy policy, BorderMode border_mode,
- T constant_border_value = 0);
-} // namespace reference
-} // namespace validation
-} // namespace test
+void neon_range_function(ITensor *output, float start, float step, const Window &window);
+} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_REMAP_H */
+#endif //SRC_CORE_NEON_KERNELS_RANGE_IMPL_H
diff --git a/src/cpu/kernels/range/generic/neon/integer.cpp b/src/cpu/kernels/range/generic/neon/integer.cpp
new file mode 100644
index 000000000..0f3ff89b7
--- /dev/null
+++ b/src/cpu/kernels/range/generic/neon/integer.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/range/generic/neon/impl.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void u8_neon_range_function(ITensor *output, float start, float step, const Window &window)
+{
+ return neon_range_function<uint8_t>(output, start, step, window);
+}
+
+void u16_neon_range_function(ITensor *output, float start, float step, const Window &window)
+{
+ return neon_range_function<uint16_t>(output, start, step, window);
+}
+
+void u32_neon_range_function(ITensor *output, float start, float step, const Window &window)
+{
+ return neon_range_function<uint32_t>(output, start, step, window);
+}
+
+void s8_neon_range_function(ITensor *output, float start, float step, const Window &window)
+{
+ return neon_range_function<int8_t>(output, start, step, window);
+}
+
+void s16_neon_range_function(ITensor *output, float start, float step, const Window &window)
+{
+ return neon_range_function<int16_t>(output, start, step, window);
+}
+
+void s32_neon_range_function(ITensor *output, float start, float step, const Window &window)
+{
+ return neon_range_function<int32_t>(output, start, step, window);
+}
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/range/list.h b/src/cpu/kernels/range/list.h
new file mode 100644
index 000000000..25d52bfe7
--- /dev/null
+++ b/src/cpu/kernels/range/list.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_RANGE_LIST_H
+#define SRC_CORE_NEON_KERNELS_RANGE_LIST_H
+
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_RANGE_KERNEL(func_name) \
+ void func_name(ITensor *output, float start, float step, const Window &window)
+
+DECLARE_RANGE_KERNEL(fp16_neon_range_function);
+DECLARE_RANGE_KERNEL(fp32_neon_range_function);
+DECLARE_RANGE_KERNEL(s8_neon_range_function);
+DECLARE_RANGE_KERNEL(s16_neon_range_function);
+DECLARE_RANGE_KERNEL(s32_neon_range_function);
+DECLARE_RANGE_KERNEL(u8_neon_range_function);
+DECLARE_RANGE_KERNEL(u16_neon_range_function);
+DECLARE_RANGE_KERNEL(u32_neon_range_function);
+
+#undef DECLARE_RANGE_KERNEL
+
+} // namespace cpu
+} // namespace arm_compute
+#endif //SRC_CORE_NEON_KERNELS_RANGE_LIST_H
diff --git a/src/cpu/kernels/roialign/generic/neon/fp16.cpp b/src/cpu/kernels/roialign/generic/neon/fp16.cpp
new file mode 100644
index 000000000..6e585a4df
--- /dev/null
+++ b/src/cpu/kernels/roialign/generic/neon/fp16.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+#include "src/cpu/kernels/roialign/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp16_roialign(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
+{
+ return roi_align<float16_t, float16_t>(input, output, rois, pool_info, window, info);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/roialign/generic/neon/fp32.cpp b/src/cpu/kernels/roialign/generic/neon/fp32.cpp
new file mode 100644
index 000000000..51355aaef
--- /dev/null
+++ b/src/cpu/kernels/roialign/generic/neon/fp32.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/roialign/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp32_roialign(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
+{
+ return roi_align<float, float>(input, output, rois, pool_info, window, info);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/roialign/generic/neon/impl.cpp b/src/cpu/kernels/roialign/generic/neon/impl.cpp
new file mode 100644
index 000000000..a4502e703
--- /dev/null
+++ b/src/cpu/kernels/roialign/generic/neon/impl.cpp
@@ -0,0 +1,316 @@
+/*
+ * Copyright (c) 2019-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/roialign/generic/neon/impl.h"
+#include "src/core/NEON/INEKernel.h"
+namespace arm_compute
+{
+namespace cpu
+{
+/** Average pooling over an aligned window */
+template <typename input_data_type>
+inline input_data_type roi_align_1x1(const ITensor *input,
+ unsigned int roi_batch,
+ float region_start_x,
+ float bin_size_x,
+ int grid_size_x,
+ float region_end_x,
+ float region_start_y,
+ float bin_size_y,
+ int grid_size_y,
+ float region_end_y,
+ int pz)
+{
+ if((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
+ {
+ return input_data_type(0);
+ }
+ else
+ {
+ const DataLayout data_layout = input->info()->data_layout();
+ float avg = 0;
+ // Iterate through the aligned pooling region
+ for(int iy = 0; iy < grid_size_y; ++iy)
+ {
+ for(int ix = 0; ix < grid_size_x; ++ix)
+ {
+ // Align the window in the middle of every bin
+ float y = region_start_y + (iy + 0.5) * bin_size_y / float(grid_size_y);
+ float x = region_start_x + (ix + 0.5) * bin_size_x / float(grid_size_x);
+
+ // Interpolation in the [0,0] [0,1] [1,0] [1,1] square
+ const int y_low = y;
+ const int x_low = x;
+ const int y_high = y_low + 1;
+ const int x_high = x_low + 1;
+
+ const float ly = y - y_low;
+ const float lx = x - x_low;
+ const float hy = 1. - ly;
+ const float hx = 1. - lx;
+
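+                // Bilinear interpolation weights of the four neighbouring samples (w1..w4 sum to 1)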
+ const float w1 = hy * hx;
+ const float w2 = hy * lx;
+ const float w3 = ly * hx;
+ const float w4 = ly * lx;
+ if(data_layout == DataLayout::NCHW)
+ {
+ const auto data1 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch)));
+ const auto data2 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch)));
+ const auto data3 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch)));
+ const auto data4 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch)));
+ avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
+ }
+ else
+ {
+ const auto data1 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch)));
+ const auto data2 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch)));
+ const auto data3 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch)));
+ const auto data4 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch)));
+ avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
+ }
+ }
+ }
+
+ avg /= grid_size_x * grid_size_y;
+ return input_data_type(avg);
+ }
+}
+
+/** Average pooling over an aligned window for quantized (QASYMM8/QASYMM8_SIGNED) inputs */
+template <typename input_data_type>
+inline input_data_type roi_align_1x1_qasymm8(const ITensor *input,
+ unsigned int roi_batch,
+ float region_start_x,
+ float bin_size_x,
+ int grid_size_x,
+ float region_end_x,
+ float region_start_y,
+ float bin_size_y,
+ int grid_size_y,
+ float region_end_y,
+ int pz,
+ const QuantizationInfo &out_qinfo)
+{
+ if((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
+ {
+ return input_data_type(out_qinfo.uniform().offset);
+ }
+ else
+ {
+ float avg = 0;
+ const UniformQuantizationInfo input_qinfo = input->info()->quantization_info().uniform();
+ const bool is_qasymm_signed = is_data_type_quantized_asymmetric_signed(input->info()->data_type());
+ const DataLayout data_layout = input->info()->data_layout();
+
+ // Iterate through the aligned pooling region
+ for(int iy = 0; iy < grid_size_y; ++iy)
+ {
+ for(int ix = 0; ix < grid_size_x; ++ix)
+ {
+ // Align the window in the middle of every bin
+ float y = region_start_y + (iy + 0.5) * bin_size_y / float(grid_size_y);
+ float x = region_start_x + (ix + 0.5) * bin_size_x / float(grid_size_x);
+
+ // Interpolation in the [0,0] [0,1] [1,0] [1,1] square
+ const int y_low = y;
+ const int x_low = x;
+ const int y_high = y_low + 1;
+ const int x_high = x_low + 1;
+
+ const float ly = y - y_low;
+ const float lx = x - x_low;
+ const float hy = 1. - ly;
+ const float hx = 1. - lx;
+
+ const float w1 = hy * hx;
+ const float w2 = hy * lx;
+ const float w3 = ly * hx;
+ const float w4 = ly * lx;
+
+ if(data_layout == DataLayout::NCHW)
+ {
+ if(is_qasymm_signed)
+ {
+ float data1 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))), input_qinfo);
+ float data2 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))), input_qinfo);
+ float data3 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))), input_qinfo);
+ float data4 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))), input_qinfo);
+ avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
+ }
+ else
+ {
+ float data1 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))), input_qinfo);
+ float data2 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))), input_qinfo);
+ float data3 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))), input_qinfo);
+ float data4 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))), input_qinfo);
+ avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
+ }
+ }
+ else
+ {
+ if(is_qasymm_signed)
+ {
+ const auto data1 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))), input_qinfo);
+ const auto data2 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))), input_qinfo);
+ const auto data3 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))), input_qinfo);
+ const auto data4 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))), input_qinfo);
+ avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
+ }
+ else
+ {
+ const auto data1 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))), input_qinfo);
+ const auto data2 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))), input_qinfo);
+ const auto data3 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))), input_qinfo);
+ const auto data4 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))), input_qinfo);
+ avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
+ }
+ }
+ }
+ }
+
+ avg /= grid_size_x * grid_size_y;
+
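+        // Requantize the accumulated floating-point average back into the output quantization space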
+ input_data_type res = 0;
+ if(is_qasymm_signed)
+ {
+ res = quantize_qasymm8_signed(avg, out_qinfo);
+ }
+ else
+ {
+ res = quantize_qasymm8(avg, out_qinfo);
+ }
+ return res;
+ }
+}
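+
+/** Map a bin index to a start/end coordinate in the input feature map, clamped to [0, max_value] */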
+inline float compute_region_coordinate(int p, float bin_size, float roi_anchor, float max_value)
+{
+ const float region_start = p * bin_size + roi_anchor;
+ return utility::clamp(region_start, 0.0f, max_value);
+}
+
+template <typename input_data_type, typename roi_data_type>
+void roi_align(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+
+ const DataLayout data_layout = input->info()->data_layout();
+ const size_t values_per_roi = rois->info()->dimension(0);
+
+ const int roi_list_start = window.x().start();
+ const int roi_list_end = window.x().end();
+
+ const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const unsigned int idx_depth = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+    const int input_width    = input->info()->dimension(idx_width);
+    const int input_height   = input->info()->dimension(idx_height);
+    const int input_channels = input->info()->dimension(idx_depth);
+ const int pooled_w = pool_info.pooled_width();
+ const int pooled_h = pool_info.pooled_height();
+
+ const DataType data_type = input->info()->data_type();
+ const bool is_qasymm = is_data_type_quantized_asymmetric(data_type);
+
+ const auto *rois_ptr = reinterpret_cast<const roi_data_type *>(rois->buffer());
+ const QuantizationInfo &rois_qinfo = rois->info()->quantization_info();
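+    // Each ROI is laid out along dimension 0 as [batch_index, x1, y1, x2, y2]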
+ for(int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx)
+ {
+ const unsigned int roi_batch = rois_ptr[values_per_roi * roi_indx];
+
+ roi_data_type qx1 = rois_ptr[values_per_roi * roi_indx + 1];
+ roi_data_type qy1 = rois_ptr[values_per_roi * roi_indx + 2];
+ roi_data_type qx2 = rois_ptr[values_per_roi * roi_indx + 3];
+ roi_data_type qy2 = rois_ptr[values_per_roi * roi_indx + 4];
+ float x1(qx1);
+ float x2(qx2);
+ float y1(qy1);
+ float y2(qy2);
+ if(is_qasymm)
+ {
+ x1 = dequantize_qasymm16(qx1, rois_qinfo);
+ x2 = dequantize_qasymm16(qx2, rois_qinfo);
+ y1 = dequantize_qasymm16(qy1, rois_qinfo);
+ y2 = dequantize_qasymm16(qy2, rois_qinfo);
+ }
+ const float roi_anchor_x = x1 * pool_info.spatial_scale();
+ const float roi_anchor_y = y1 * pool_info.spatial_scale();
+ const float roi_dims_x = std::max((x2 - x1) * pool_info.spatial_scale(), 1.0f);
+ const float roi_dims_y = std::max((y2 - y1) * pool_info.spatial_scale(), 1.0f);
+ float bin_size_x = roi_dims_x / pool_info.pooled_width();
+ float bin_size_y = roi_dims_y / pool_info.pooled_height();
+
+ // Iterate through all feature maps
+        for(int ch = 0; ch < input_channels; ++ch)
+ {
+ // Iterate through all output pixels
+ for(int py = 0; py < pooled_h; ++py)
+ {
+ for(int px = 0; px < pooled_w; ++px)
+ {
+ const float region_start_x = compute_region_coordinate(px, bin_size_x, roi_anchor_x, input_width);
+ const float region_start_y = compute_region_coordinate(py, bin_size_y, roi_anchor_y, input_height);
+ const float region_end_x = compute_region_coordinate(px + 1, bin_size_x, roi_anchor_x, input_width);
+ const float region_end_y = compute_region_coordinate(py + 1, bin_size_y, roi_anchor_y, input_height);
+ const int roi_bin_grid_x = (pool_info.sampling_ratio() > 0) ? pool_info.sampling_ratio() : int(ceil(bin_size_x));
+ const int roi_bin_grid_y = (pool_info.sampling_ratio() > 0) ? pool_info.sampling_ratio() : int(ceil(bin_size_y));
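+                    // Number of sampling points per bin: the fixed sampling ratio if set, otherwise ceil(bin_size)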
+ input_data_type out_val(0);
+ if(is_qasymm)
+ {
+ out_val = roi_align_1x1_qasymm8<input_data_type>(
+ input, roi_batch, region_start_x, bin_size_x,
+ roi_bin_grid_x, region_end_x, region_start_y, bin_size_y,
+ roi_bin_grid_y, region_end_y, ch, output->info()->quantization_info());
+ }
+ else
+ {
+ out_val = roi_align_1x1<input_data_type>(
+ input, roi_batch, region_start_x, bin_size_x,
+ roi_bin_grid_x, region_end_x, region_start_y, bin_size_y,
+ roi_bin_grid_y, region_end_y, ch);
+ }
+
+ if(data_layout == DataLayout::NCHW)
+ {
+ auto out_ptr = reinterpret_cast<input_data_type *>(output->ptr_to_element(Coordinates(px, py, ch, roi_indx)));
+ *out_ptr = out_val;
+ }
+ else
+ {
+ auto out_ptr = reinterpret_cast<input_data_type *>(output->ptr_to_element(Coordinates(ch, px, py, roi_indx)));
+ *out_ptr = out_val;
+ }
+ }
+ }
+ }
+ }
+}
+template void roi_align<float, float>(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info);
+template void roi_align<uint8_t, uint16_t>(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info);
+template void roi_align<int8_t, uint16_t>(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info);
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+template void roi_align<float16_t, float16_t>(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info);
+#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/roialign/generic/neon/impl.h b/src/cpu/kernels/roialign/generic/neon/impl.h
new file mode 100644
index 000000000..ff96b8eda
--- /dev/null
+++ b/src/cpu/kernels/roialign/generic/neon/impl.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_ROIALIGN_IMPL_H
+#define SRC_CORE_NEON_KERNELS_ROIALIGN_IMPL_H
+#include "arm_compute/core/CPP/CPPTypes.h"
+#include "arm_compute/core/Helpers.h"
+
+namespace arm_compute
+{
+class ITensor;
+class Window;
+namespace cpu
+{
+template <typename input_data_type, typename roi_data_type>
+void roi_align(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info);
+
+} // namespace cpu
+} // namespace arm_compute
+#endif //SRC_CORE_NEON_KERNELS_ROIALIGN_IMPL_H
diff --git a/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp b/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp
new file mode 100644
index 000000000..d6bd9a95c
--- /dev/null
+++ b/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/roialign/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_qu8_roialign(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
+{
+ return roi_align<uint8_t, uint16_t>(input, output, rois, pool_info, window, info);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp
new file mode 100644
index 000000000..a839581af
--- /dev/null
+++ b/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/roialign/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_qs8_roialign(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
+{
+ return roi_align<int8_t, uint16_t>(input, output, rois, pool_info, window, info);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/roialign/list.h b/src/cpu/kernels/roialign/list.h
new file mode 100644
index 000000000..1c71b0248
--- /dev/null
+++ b/src/cpu/kernels/roialign/list.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_ROIALIGN_LIST_H
+#define SRC_CORE_NEON_KERNELS_ROIALIGN_LIST_H
+namespace arm_compute
+{
+namespace cpu
+{
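+// Declares the signature shared by all data-type-specific ROI align kernels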
+#define DECLARE_ROIALIGN_KERNEL(func_name) \
+ void func_name(const ITensor *input, ITensor *output, const ITensor *rois, \
+ ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
+DECLARE_ROIALIGN_KERNEL(neon_fp32_roialign);
+DECLARE_ROIALIGN_KERNEL(neon_fp16_roialign);
+DECLARE_ROIALIGN_KERNEL(neon_qu8_roialign);
+DECLARE_ROIALIGN_KERNEL(neon_qs8_roialign);
+#undef DECLARE_ROIALIGN_KERNEL
+} // namespace cpu
+} // namespace arm_compute
+#endif /* SRC_CORE_NEON_KERNELS_ROIALIGN_LIST_H */
diff --git a/src/cpu/kernels/select/generic/neon/fp16.cpp b/src/cpu/kernels/select/generic/neon/fp16.cpp
new file mode 100644
index 000000000..b460213c7
--- /dev/null
+++ b/src/cpu/kernels/select/generic/neon/fp16.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "src/cpu/kernels/select/generic/neon/impl.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_f16_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_16<float16_t, uint16x8_t>(c, x, y, output, window);
+}
+void neon_f16_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_not_same_rank<float16_t>(c, x, y, output, window);
+}
+
+} // namespace cpu
+
+} // namespace arm_compute
+
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
\ No newline at end of file
diff --git a/src/cpu/kernels/select/generic/neon/fp32.cpp b/src/cpu/kernels/select/generic/neon/fp32.cpp
new file mode 100644
index 000000000..63fd59490
--- /dev/null
+++ b/src/cpu/kernels/select/generic/neon/fp32.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/select/generic/neon/impl.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_f32_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_32<float, uint32x4_t>(c, x, y, output, window);
+}
+void neon_f32_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_not_same_rank<float>(c, x, y, output, window);
+}
+
+} // namespace cpu
+
+} // namespace arm_compute
diff --git a/src/cpu/kernels/select/generic/neon/impl.cpp b/src/cpu/kernels/select/generic/neon/impl.cpp
new file mode 100644
index 000000000..62c46f3c1
--- /dev/null
+++ b/src/cpu/kernels/select/generic/neon/impl.cpp
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/TensorInfo.h"
+#include "src/cpu/kernels/select/generic/neon/impl.h"
+#include "src/core/NEON/NEAsymm.h"
+
+#include <arm_neon.h>
+#include <map>
+#include <string>
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename ScalarType, typename VectorType>
+void select_op(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+ const int window_step_x, const int window_start_x, const int window_end_x, const int limit, VectorType (*condition_conversion)(const uint8_t *))
+{
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator condition(cond, win);
+ Iterator input1(in1, win);
+ Iterator input2(in2, win);
+ Iterator output(out, win);
+
+ execute_window_loop(win, [&](const Coordinates &)
+ {
+ auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ const auto condition_ptr = reinterpret_cast<const uint8_t *>(condition.ptr());
+ const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
+
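+        // Vectorised main loop: convert the condition to a full-width lane mask and blend the inputs with a bitwise select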
+ int x = window_start_x;
+ for(; x <= limit; x += window_step_x)
+ {
+ const auto c = (*condition_conversion)(condition_ptr + x);
+ const auto a = wrapper::vloadq(input1_ptr + x);
+ const auto b = wrapper::vloadq(input2_ptr + x);
+ wrapper::vstore(output_ptr + x, wrapper::vbsl(c, a, b));
+ }
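+        // Scalar tail for elements that do not fill a complete vector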
+ for(; x < window_end_x; ++x)
+ {
+ const auto c = *(condition_ptr + x);
+ const auto a = *(input1_ptr + x);
+ const auto b = *(input2_ptr + x);
+ *(output_ptr + x) = static_cast<bool>(c) ? a : b;
+ }
+ },
+ condition, input1, input2, output);
+}
+
+template <typename ScalarType, typename VectorType>
+void select_op_8(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ const auto window_step_x = 16 / sizeof(ScalarType);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType
+ {
+ static const auto zero = wrapper::vdup_n(static_cast<uint8_t>(0), arm_compute::wrapper::traits::vector_128_tag());
+ return wrapper::vcgt(wrapper::vloadq(condition_ptr), zero);
+ });
+}
+
+template <typename ScalarType, typename VectorType>
+void select_op_16(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ const auto window_step_x = 16 / sizeof(ScalarType);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
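+    // Widen the uint8 condition to 16-bit lanes and compare against zero to build the selection mask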
+ select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType
+ {
+ static const auto zero = wrapper::vdup_n(static_cast<uint16_t>(0), arm_compute::wrapper::traits::vector_128_tag());
+ return wrapper::vcgt(wrapper::vmovl(wrapper::vload(condition_ptr)), zero);
+ });
+}
+
+template <typename ScalarType, typename VectorType>
+void select_op_32(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ const auto window_step_x = 16 / sizeof(ScalarType);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
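+    // Widen the uint8 condition twice (u8 -> u16 -> u32) and compare against zero to build the selection mask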
+ select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType
+ {
+ static const auto zero = wrapper::vdup_n(static_cast<uint32_t>(0), arm_compute::wrapper::traits::vector_128_tag());
+ return wrapper::vcgt(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vload(condition_ptr)))), zero);
+ });
+}
+
+template <typename ScalarType>
+void select_op_not_same_rank(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ ARM_COMPUTE_UNUSED(window);
+
+ auto output_ptr = reinterpret_cast<ScalarType *>(out->buffer());
+ const auto condition_ptr = reinterpret_cast<const uint8_t *>(cond->buffer());
+ const auto input1_ptr = reinterpret_cast<const ScalarType *>(in1->buffer());
+ const auto input2_ptr = reinterpret_cast<const ScalarType *>(in2->buffer());
+
+ const int outer_size = cond->info()->total_size() / cond->info()->element_size();
+ const int inner_size = (in1->info()->total_size() / in1->info()->element_size()) / outer_size;
+ int offset = 0;
+ const int step = 16 / in1->info()->element_size();
+
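+    // The condition tensor has a lower rank: each condition value selects an entire inner_size block from in1 or in2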
+ for(int i = 0; i < outer_size; ++i)
+ {
+ int x = offset;
+ const auto input_ptr = static_cast<bool>(*(condition_ptr + i)) ? input1_ptr : input2_ptr;
+ for(; x <= offset + inner_size - step; x += step)
+ {
+ wrapper::vstore(output_ptr + x, wrapper::vloadq(input_ptr + x));
+ }
+ if(x <= offset + inner_size - (step / 2))
+ {
+ wrapper::vstore(output_ptr + x, wrapper::vload(input_ptr + x));
+ x += step / 2;
+ }
+ for(; x < offset + inner_size; ++x)
+ {
+ *(output_ptr + x) = *(input_ptr + x);
+ }
+ offset += inner_size;
+ }
+}
+
+template void select_op_32<float, uint32x4_t>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void select_op_not_same_rank<float>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void select_op_8<int8_t, uint8x16_t>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void select_op_16<int16_t, uint16x8_t>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+template void select_op_16<float16_t, uint16x8_t>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+#endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
+
+template void select_op_32<int32_t, uint32x4_t>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void select_op_not_same_rank<int8_t>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void select_op_not_same_rank<int16_t>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+template void select_op_not_same_rank<float16_t>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+#endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
+
+template void select_op_not_same_rank<int32_t>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void select_op_8<uint8_t, uint8x16_t>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void select_op_16<uint16_t, uint16x8_t>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void select_op_32<uint32_t, uint32x4_t>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void select_op_not_same_rank<uint8_t>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void select_op_not_same_rank<uint16_t>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void select_op_not_same_rank<uint32_t>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+} // namespace cpu
+
+} // namespace arm_compute
diff --git a/src/cpu/kernels/select/generic/neon/impl.h b/src/cpu/kernels/select/generic/neon/impl.h
new file mode 100644
index 000000000..2bbc38b74
--- /dev/null
+++ b/src/cpu/kernels/select/generic/neon/impl.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_SELECT_IMPL_H
+#define SRC_CORE_NEON_KERNELS_SELECT_IMPL_H
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+class Window;
+
+namespace cpu
+{
+template <typename ScalarType>
+void select_op_not_same_rank(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <typename ScalarType, typename VectorType>
+void select_op_32(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <typename ScalarType, typename VectorType>
+void select_op_16(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <typename ScalarType, typename VectorType>
+void select_op_8(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <typename ScalarType, typename VectorType>
+void select_op(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+ const int window_step_x, const int window_start_x, const int window_end_x, const int limit, VectorType (*condition_conversion)(const uint8_t *));
+
+} // namespace cpu
+} // namespace arm_compute
+#endif //SRC_CORE_NEON_KERNELS_SELECT_IMPL_H
\ No newline at end of file
diff --git a/src/cpu/kernels/select/generic/neon/integer.cpp b/src/cpu/kernels/select/generic/neon/integer.cpp
new file mode 100644
index 000000000..71b2f0b93
--- /dev/null
+++ b/src/cpu/kernels/select/generic/neon/integer.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+
+#include <arm_neon.h>
+
+#include "src/cpu/kernels/select/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_s8_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_8<int8_t, uint8x16_t>(c, x, y, output, window);
+}
+void neon_s16_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_16<int16_t, uint16x8_t>(c, x, y, output, window);
+}
+void neon_s32_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_32<int32_t, uint32x4_t>(c, x, y, output, window);
+}
+void neon_s8_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_not_same_rank<int8_t>(c, x, y, output, window);
+}
+void neon_s16_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_not_same_rank<int16_t>(c, x, y, output, window);
+}
+void neon_s32_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_not_same_rank<int32_t>(c, x, y, output, window);
+}
+void neon_u8_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_8<uint8_t, uint8x16_t>(c, x, y, output, window);
+}
+void neon_u16_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_16<uint16_t, uint16x8_t>(c, x, y, output, window);
+}
+void neon_u32_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_32<uint32_t, uint32x4_t>(c, x, y, output, window);
+}
+void neon_u8_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_not_same_rank<uint8_t>(c, x, y, output, window);
+}
+void neon_u16_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_not_same_rank<uint16_t>(c, x, y, output, window);
+}
+void neon_u32_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+{
+ return select_op_not_same_rank<uint32_t>(c, x, y, output, window);
+}
+
+} // namespace cpu
+
+} // namespace arm_compute
diff --git a/src/cpu/kernels/select/list.h b/src/cpu/kernels/select/list.h
new file mode 100644
index 000000000..c33a25f6d
--- /dev/null
+++ b/src/cpu/kernels/select/list.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_SELECT_LIST_H
+#define SRC_CORE_NEON_KERNELS_SELECT_LIST_H
+
+#include "arm_compute/core/ITensor.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_SELECT_KERNEL(func_name) \
+ void func_name(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+
+DECLARE_SELECT_KERNEL(neon_s8_select_same_rank);
+DECLARE_SELECT_KERNEL(neon_s16_select_same_rank);
+DECLARE_SELECT_KERNEL(neon_s32_select_same_rank);
+DECLARE_SELECT_KERNEL(neon_u8_select_same_rank);
+DECLARE_SELECT_KERNEL(neon_u16_select_same_rank);
+DECLARE_SELECT_KERNEL(neon_u32_select_same_rank);
+DECLARE_SELECT_KERNEL(neon_f16_select_same_rank);
+DECLARE_SELECT_KERNEL(neon_f32_select_same_rank);
+
+DECLARE_SELECT_KERNEL(neon_s8_select_not_same_rank);
+DECLARE_SELECT_KERNEL(neon_s16_select_not_same_rank);
+DECLARE_SELECT_KERNEL(neon_s32_select_not_same_rank);
+DECLARE_SELECT_KERNEL(neon_u8_select_not_same_rank);
+DECLARE_SELECT_KERNEL(neon_u16_select_not_same_rank);
+DECLARE_SELECT_KERNEL(neon_u32_select_not_same_rank);
+DECLARE_SELECT_KERNEL(neon_f16_select_not_same_rank);
+DECLARE_SELECT_KERNEL(neon_f32_select_not_same_rank);
+
+#undef DECLARE_SELECT_KERNEL
+
+} // namespace cpu
+} // namespace arm_compute
+#endif //SRC_CORE_NEON_KERNELS_SELECT_LIST_H
\ No newline at end of file
diff --git a/src/cpu/kernels/softmax/generic/neon/fp16.cpp b/src/cpu/kernels/softmax/generic/neon/fp16.cpp
new file mode 100644
index 000000000..3cb1cd683
--- /dev/null
+++ b/src/cpu/kernels/softmax/generic/neon/fp16.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/softmax/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp16_softmax(const ITensor *in, const ITensor *max, void *const tmp,
+ ITensor *out, const float beta, bool is_log, const Window &window)
+{
+ return neon_softmax_logits_1d_float<float16_t>(in, max, tmp, out, beta, is_log, window);
+}
+
+void neon_fp16_logits(const ITensor *in, ITensor *out, const Window &window)
+{
+ return neon_logits_1d_max<float16_t>(in, out, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/cpu/kernels/softmax/generic/neon/fp32.cpp b/src/cpu/kernels/softmax/generic/neon/fp32.cpp
new file mode 100644
index 000000000..ddd270ae7
--- /dev/null
+++ b/src/cpu/kernels/softmax/generic/neon/fp32.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/softmax/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp32_softmax(const ITensor *in, const ITensor *max, void *const tmp,
+ ITensor *out, const float beta, bool is_log, const Window &window)
+{
+ return neon_softmax_logits_1d_float<float>(in, max, tmp, out, beta, is_log, window);
+}
+
+void neon_fp32_logits(const ITensor *in, ITensor *out, const Window &window)
+{
+ return neon_logits_1d_max<float>(in, out, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/impl/neon/list.h b/src/cpu/kernels/softmax/generic/neon/impl.h
index 5ebee3127..325e127f3 100644
--- a/src/cpu/kernels/softmax/impl/neon/list.h
+++ b/src/cpu/kernels/softmax/generic/neon/impl.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,10 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H
-#define SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H
+#ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H
+#define SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H
-#include "src/core/NEON/NEFixedPoint.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include "support/SaturateCast.h"
@@ -385,4 +384,4 @@ void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *c
} // namespace cpu
} // namespace arm_compute
-#endif /* SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H */
+#endif /* SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H */
diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp
new file mode 100644
index 000000000..a57289156
--- /dev/null
+++ b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/softmax/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_qasymm8_softmax(const ITensor *in, const ITensor *max, void *const tmp,
+ ITensor *out, const float beta, bool is_log, const Window &window)
+{
+ return neon_softmax_logits_1d_quantized<qasymm8_t>(in, max, tmp, out, beta, is_log, window);
+}
+
+void neon_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window)
+{
+ return neon_logits_1d_max<qasymm8_t>(in, out, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp
new file mode 100644
index 000000000..7d3fe6e04
--- /dev/null
+++ b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/softmax/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_qasymm8_signed_softmax(const ITensor *in, const ITensor *max, void *const tmp,
+ ITensor *out, const float beta, bool is_log, const Window &window)
+{
+ return neon_softmax_logits_1d_quantized<qasymm8_signed_t>(in, max, tmp, out, beta, is_log, window);
+}
+
+void neon_qasymm8_signed_logits(const ITensor *in, ITensor *out, const Window &window)
+{
+ return neon_logits_1d_max<qasymm8_signed_t>(in, out, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/fp16.cpp b/src/cpu/kernels/softmax/generic/sve/fp16.cpp
new file mode 100644
index 000000000..89be6c524
--- /dev/null
+++ b/src/cpu/kernels/softmax/generic/sve/fp16.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/softmax/generic/sve/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void sve_fp16_softmax(const ITensor *in, const ITensor *max, void *const tmp,
+ ITensor *out, const float beta, bool is_log, const Window &window)
+{
+ return sve_softmax_logits_1d_float<float16_t>(in, max, tmp, out, beta, is_log, window);
+}
+
+void sve_fp16_logits(const ITensor *in, ITensor *out, const Window &window)
+{
+ return sve_logits_1d_max<float16_t>(in, out, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif //ARM_COMPUTE_ENABLE_SVE
diff --git a/src/cpu/kernels/softmax/generic/sve/fp32.cpp b/src/cpu/kernels/softmax/generic/sve/fp32.cpp
new file mode 100644
index 000000000..79130bf35
--- /dev/null
+++ b/src/cpu/kernels/softmax/generic/sve/fp32.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/softmax/generic/sve/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void sve_fp32_softmax(const ITensor *in, const ITensor *max, void *const tmp,
+ ITensor *out, const float beta, bool is_log, const Window &window)
+{
+ return sve_softmax_logits_1d_float<float>(in, max, tmp, out, beta, is_log, window);
+}
+
+void sve_fp32_logits(const ITensor *in, ITensor *out, const Window &window)
+{
+ return sve_logits_1d_max<float>(in, out, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif //ARM_COMPUTE_ENABLE_SVE
diff --git a/src/cpu/kernels/softmax/impl/sve/impl.cpp b/src/cpu/kernels/softmax/generic/sve/impl.cpp
index 7a577fd56..f17e50e77 100644
--- a/src/cpu/kernels/softmax/impl/sve/impl.cpp
+++ b/src/cpu/kernels/softmax/generic/sve/impl.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,14 +22,8 @@
* SOFTWARE.
*/
#if defined(ARM_COMPUTE_ENABLE_SVE)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/SVEMath.h"
+#include "src/cpu/kernels/softmax/generic/sve/impl.h"
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include <arm_sve.h>
namespace arm_compute
{
diff --git a/src/cpu/kernels/softmax/generic/sve/impl.h b/src/cpu/kernels/softmax/generic/sve/impl.h
new file mode 100644
index 000000000..1051f59ff
--- /dev/null
+++ b/src/cpu/kernels/softmax/generic/sve/impl.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_SVE_KERNELS_SOFTMAX_IMPL_H
+#define SRC_CORE_SVE_KERNELS_SOFTMAX_IMPL_H
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#include "arm_compute/core/Helpers.h"
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename ScalarType>
+void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window);
+
+template <typename ScalarType>
+void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp,
+ ITensor *out, const float beta, bool is_log, const Window &window);
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
+
+#endif /* SRC_CORE_SVE_KERNELS_SOFTMAX_IMPL_H */
diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp
new file mode 100644
index 000000000..62afe4bf7
--- /dev/null
+++ b/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/softmax/generic/sve/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void sve_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window)
+{
+ return sve_logits_1d_max<qasymm8_t>(in, out, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif //defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp
new file mode 100644
index 000000000..5547cc902
--- /dev/null
+++ b/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/softmax/generic/sve/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void sve_qasymm8_signed_logits(const ITensor *in, ITensor *out, const Window &window)
+{
+ return sve_logits_1d_max<qasymm8_signed_t>(in, out, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif //defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/cpu/kernels/softmax/impl/sve/list.h b/src/cpu/kernels/softmax/generic/sve2/impl.h
index b4e1e1b18..16dde2b11 100644
--- a/src/cpu/kernels/softmax/impl/sve/list.h
+++ b/src/cpu/kernels/softmax/generic/sve2/impl.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,31 +21,20 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H
-#define SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H
+#ifndef SRC_CORE_SVE2_KERNELS_SOFTMAX_IMPL_H
+#define SRC_CORE_SVE2_KERNELS_SOFTMAX_IMPL_H
-#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE2)
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/SVEMath.h"
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include <arm_sve.h>
namespace arm_compute
{
namespace cpu
{
template <typename ScalarType>
-void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window);
-
-template <typename ScalarType>
-void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window);
-
-#if defined(ARM_COMPUTE_ENABLE_SVE2)
-template <typename ScalarType>
-void sve_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, float beta, bool is_log, const Window &window)
+void sve2_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp,
+ ITensor *out, float beta, bool is_log, const Window &window)
{
const int start_x = in->info()->valid_region().anchor.x();
const int input_width = in->info()->valid_region().shape.x();
@@ -215,9 +204,7 @@ void sve_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void
},
in_it, max_it, out_it);
}
-#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
} // namespace cpu
} // namespace arm_compute
-#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
-
-#endif /* SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H */
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
+#endif /* SRC_CORE_SVE2_KERNELS_SOFTMAX_IMPL_H */
diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp
new file mode 100644
index 000000000..8566a5143
--- /dev/null
+++ b/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ARM_COMPUTE_ENABLE_SVE2)
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/softmax/generic/sve2/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void sve2_qasymm8_softmax(const ITensor *in, const ITensor *max, void *const tmp,
+ ITensor *out, const float beta, bool is_log, const Window &window)
+{
+ return sve2_softmax_logits_1d_quantized<qasymm8_t>(in, max, tmp, out, beta, is_log, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif //defined(ARM_COMPUTE_ENABLE_SVE2)
diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp
new file mode 100644
index 000000000..c2bdc5011
--- /dev/null
+++ b/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ARM_COMPUTE_ENABLE_SVE2)
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/softmax/generic/sve2/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void sve2_qasymm8_signed_softmax(const ITensor *in, const ITensor *max, void *const tmp,
+ ITensor *out, const float beta, bool is_log, const Window &window)
+{
+ return sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(in, max, tmp, out, beta, is_log, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif //defined(ARM_COMPUTE_ENABLE_SVE2)
diff --git a/src/cpu/kernels/softmax/list.h b/src/cpu/kernels/softmax/list.h
new file mode 100644
index 000000000..ed3515f41
--- /dev/null
+++ b/src/cpu/kernels/softmax/list.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H
+#define SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H
+
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_SOFTMAX_KERNEL(func_name) \
+ void func_name(const ITensor *in, const ITensor *max, void *const tmp, \
+ ITensor *out, const float beta, bool is_log, const Window &window)
+
+DECLARE_SOFTMAX_KERNEL(neon_fp32_softmax);
+DECLARE_SOFTMAX_KERNEL(neon_fp16_softmax);
+DECLARE_SOFTMAX_KERNEL(neon_qasymm8_softmax);
+DECLARE_SOFTMAX_KERNEL(neon_qasymm8_signed_softmax);
+DECLARE_SOFTMAX_KERNEL(sve_fp32_softmax);
+DECLARE_SOFTMAX_KERNEL(sve_fp16_softmax);
+DECLARE_SOFTMAX_KERNEL(sve2_qasymm8_signed_softmax);
+DECLARE_SOFTMAX_KERNEL(sve2_qasymm8_softmax);
+
+#undef DECLARE_SOFTMAX_KERNEL
+
+#define DECLARE_LOGITS_KERNEL(func_name) \
+ void func_name(const ITensor *in, ITensor *out, const Window &window)
+
+DECLARE_LOGITS_KERNEL(neon_fp32_logits);
+DECLARE_LOGITS_KERNEL(neon_fp16_logits);
+DECLARE_LOGITS_KERNEL(neon_qasymm8_logits);
+DECLARE_LOGITS_KERNEL(neon_qasymm8_signed_logits);
+DECLARE_LOGITS_KERNEL(sve_fp32_logits);
+DECLARE_LOGITS_KERNEL(sve_fp16_logits);
+DECLARE_LOGITS_KERNEL(sve_qasymm8_logits);
+DECLARE_LOGITS_KERNEL(sve_qasymm8_signed_logits);
+
+#undef DECLARE_LOGITS_KERNEL
+} // namespace cpu
+} // namespace arm_compute
+
+#endif /* SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H */
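The DECLARE_SOFTMAX_KERNEL and DECLARE_LOGITS_KERNEL macros above only stamp out uniform prototypes for the per-architecture implementations added in this commit. A minimal sketch of how such declarations are typically consumed, via a data-type dispatch table, follows; the helper names softmax_fn and pick_softmax are illustrative and not part of the library.

    #include "arm_compute/core/ITensor.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/Window.h"
    #include "src/cpu/kernels/softmax/list.h"

    namespace
    {
    // Pointer type matching the signature produced by DECLARE_SOFTMAX_KERNEL.
    using softmax_fn = void (*)(const arm_compute::ITensor *, const arm_compute::ITensor *, void *const,
                                arm_compute::ITensor *, const float, bool, const arm_compute::Window &);

    // Hypothetical selection helper: map a data type to one of the declared kernels.
    softmax_fn pick_softmax(arm_compute::DataType dt)
    {
        switch(dt)
        {
            case arm_compute::DataType::F32:
                return arm_compute::cpu::neon_fp32_softmax;
            case arm_compute::DataType::QASYMM8:
                return arm_compute::cpu::neon_qasymm8_softmax;
            default:
                return nullptr;
        }
    }
    } // namespace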
diff --git a/src/cpu/operators/CpuConcatenate.h b/src/cpu/operators/CpuConcatenate.h
index 001ac6816..eb11926b4 100644
--- a/src/cpu/operators/CpuConcatenate.h
+++ b/src/cpu/operators/CpuConcatenate.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -67,7 +67,7 @@ public:
void run(ITensorPack &tensors) override;
private:
- std::vector<std::unique_ptr<ICpuKernel>> _concat_kernels{};
+ std::vector<std::unique_ptr<ICPPKernel>> _concat_kernels{};
unsigned int _num_srcs{ 0 };
unsigned int _axis{ 0 };
};
diff --git a/src/cpu/operators/CpuSoftmax.h b/src/cpu/operators/CpuSoftmax.h
index 20f3f006d..64df8704f 100644
--- a/src/cpu/operators/CpuSoftmax.h
+++ b/src/cpu/operators/CpuSoftmax.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -92,8 +92,8 @@ private:
CpuPermute _permute_input;
CpuPermute _permute_output;
- std::unique_ptr<ICpuKernel> _max_kernel;
- std::unique_ptr<ICpuKernel> _softmax_kernel;
+ std::unique_ptr<ICPPKernel> _max_kernel;
+ std::unique_ptr<ICPPKernel> _softmax_kernel;
TensorInfo _max;
TensorInfo _tmp;
diff --git a/src/gpu/cl/ClKernelLibrary.cpp b/src/gpu/cl/ClKernelLibrary.cpp
index c47cf8ef1..bab534216 100644
--- a/src/gpu/cl/ClKernelLibrary.cpp
+++ b/src/gpu/cl/ClKernelLibrary.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -290,10 +290,10 @@ const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map =
{ "gemm_mm_reshaped_only_rhs_t_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" },
{ "gemm_mm_reshaped_only_rhs_t_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" },
{ "gemm_lc_vm_f32", "common/gemm.cl" },
- { "gemm_reshape_lhs_matrix_nt", "common/gemm.cl" },
- { "gemm_reshape_lhs_matrix_t", "common/gemm.cl" },
- { "gemm_reshape_rhs_matrix_nt", "common/gemm.cl" },
- { "gemm_reshape_rhs_matrix_t", "common/gemm.cl" },
+ { "gemm_reshape_lhs_matrix_nt", "common/gemm_utils.cl" },
+ { "gemm_reshape_lhs_matrix_t", "common/gemm_utils.cl" },
+ { "gemm_reshape_rhs_matrix_nt", "common/gemm_utils.cl" },
+ { "gemm_reshape_rhs_matrix_t", "common/gemm_utils.cl" },
{ "gemmlowp_matrix_a_reduction", "common/gemmlowp.cl" },
{ "gemmlowp_matrix_a_reduction_dot8", "common/gemmlowp.cl" },
{ "gemmlowp_matrix_b_reduction", "common/gemmlowp.cl" },
@@ -363,12 +363,8 @@ const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map =
{ "depth_to_space_nchw", "nchw/depth_to_space.cl" },
{ "dequantization_layer_per_channel_nchw", "nchw/dequantization_layer.cl" },
{ "direct_convolution1x1", "nchw/direct_convolution1x1.cl" },
- { "direct_convolution1x1_f32_bifrost", "nchw/direct_convolution1x1.cl" },
- { "direct_convolution3x3", "nchw/direct_convolution3x3.cl" },
- { "direct_convolution3x3_f32_bifrost", "nchw/direct_convolution3x3.cl" },
- { "direct_convolution5x5", "nchw/direct_convolution5x5.cl" },
- { "direct_convolution5x5_f32_bifrost", "nchw/direct_convolution5x5.cl" },
- { "direct_convolution_quantized", "nchw/direct_convolution_quantized.cl" },
+ { "direct_convolution_nchw", "nchw/direct_convolution.cl" },
+
{ "im2col1x1_stridex1_nchw", "nchw/im2col.cl" },
{ "im2col3x3_nchw", "nchw/im2col.cl" },
{ "im2col5x5_nchw", "nchw/im2col.cl" },
@@ -382,8 +378,6 @@ const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map =
{ "pooling_layer_MxN_nchw", "nchw/pooling_layer.cl" },
{ "pooling_layer_2_nchw_indices", "nchw/pooling_layer.cl" },
{ "prior_box_layer_nchw", "nchw/prior_box_layer.cl" },
- { "remap_nearest_neighbour_nchw", "nchw/remap.cl" },
- { "remap_bilinear_nchw", "nchw/remap.cl" },
{ "reorg_layer_nchw", "nchw/reorg_layer.cl" },
{ "scale_nearest_neighbour_nchw", "nchw/scale.cl" },
{ "scale_bilinear_nchw", "nchw/scale.cl" },
@@ -443,8 +437,6 @@ const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map =
{ "pooling_layer_MxN_nhwc", "nhwc/pooling_layer.cl" },
{ "pooling_layer_2x2_nhwc", "nhwc/pooling_layer.cl" },
{ "pooling_layer_MxN_quantized_nhwc", "nhwc/pooling_layer_quantized.cl" },
- { "remap_nearest_neighbour_nhwc", "nhwc/remap.cl" },
- { "remap_bilinear_nhwc", "nhwc/remap.cl" },
{ "reorg_layer_nhwc", "nhwc/reorg_layer.cl" },
{ "scale_nearest_neighbour_nhwc", "nhwc/scale.cl" },
{ "scale_bilinear_nhwc", "nhwc/scale.cl" },
@@ -590,6 +582,10 @@ const std::map<std::string, std::string> ClKernelLibrary::_program_source_map =
#include "./cl_kernels/common/gemm.clembed"
},
{
+ "common/gemm_utils.cl",
+#include "./cl_kernels/common/gemm_utils.clembed"
+ },
+ {
"common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl",
#include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.clembed"
},
@@ -763,20 +759,8 @@ const std::map<std::string, std::string> ClKernelLibrary::_program_source_map =
#include "./cl_kernels/nchw/dequantization_layer.clembed"
},
{
- "nchw/direct_convolution1x1.cl",
-#include "./cl_kernels/nchw/direct_convolution1x1.clembed"
- },
- {
- "nchw/direct_convolution3x3.cl",
-#include "./cl_kernels/nchw/direct_convolution3x3.clembed"
- },
- {
- "nchw/direct_convolution5x5.cl",
-#include "./cl_kernels/nchw/direct_convolution5x5.clembed"
- },
- {
- "nchw/direct_convolution_quantized.cl",
-#include "./cl_kernels/nchw/direct_convolution_quantized.clembed"
+ "nchw/direct_convolution.cl",
+#include "./cl_kernels/nchw/direct_convolution.clembed"
},
{
"nchw/im2col.cl",
@@ -807,10 +791,6 @@ const std::map<std::string, std::string> ClKernelLibrary::_program_source_map =
#include "./cl_kernels/nchw/prior_box_layer.clembed"
},
{
- "nchw/remap.cl",
-#include "./cl_kernels/nchw/remap.clembed"
- },
- {
"nchw/reorg_layer.cl",
#include "./cl_kernels/nchw/reorg_layer.clembed"
},
@@ -906,10 +886,6 @@ const std::map<std::string, std::string> ClKernelLibrary::_program_source_map =
#include "./cl_kernels/nhwc/pooling_layer_quantized.clembed"
},
{
- "nhwc/remap.cl",
-#include "./cl_kernels/nhwc/remap.clembed"
- },
- {
"nhwc/reorg_layer.cl",
#include "./cl_kernels/nhwc/reorg_layer.clembed"
},
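The two maps edited above work as a pair: the kernel-to-program map resolves a kernel name to the .cl program that defines it, and the program-to-source map resolves that program path to its embedded source. A rough sketch of the two-step lookup, written as an illustrative free function rather than the library's actual accessor:

    #include <map>
    #include <string>

    // e.g. "direct_convolution_nchw" -> "nchw/direct_convolution.cl" -> embedded source text
    std::string source_for_kernel(const std::string &kernel_name,
                                  const std::map<std::string, std::string> &kernel_program_map,
                                  const std::map<std::string, std::string> &program_source_map)
    {
        const auto prog = kernel_program_map.find(kernel_name);
        if(prog == kernel_program_map.end())
        {
            return {};
        }
        const auto src = program_source_map.find(prog->second);
        return src != program_source_map.end() ? src->second : std::string{};
    }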
diff --git a/src/gpu/cl/kernels/ClCastKernel.cpp b/src/gpu/cl/kernels/ClCastKernel.cpp
index 48caf21d1..bfcd15229 100644
--- a/src/gpu/cl/kernels/ClCastKernel.cpp
+++ b/src/gpu/cl/kernels/ClCastKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -52,7 +52,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Conver
ARM_COMPUTE_RETURN_ERROR_ON(src == dst);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src,
1,
- DataType::U8, DataType::S8, DataType::QSYMM8_PER_CHANNEL, DataType::S16,
+ DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::S16,
DataType::U16, DataType::U32, DataType::S32, DataType::F16,
DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst,
diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
index 2d851a698..ff8c2c32a 100644
--- a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
+++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
@@ -122,209 +122,6 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co
return Status{};
}
-inline bool can_run_optimized_kernel_for_bifrost_nchw(GPUTarget gpu_target, unsigned int conv_stride_x, unsigned int conv_stride_y, unsigned int kernel_size,
- DataType data_type, DataLayout data_layout)
-{
- return gpu_target_is_in(gpu_target,
- GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
- GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
- GPUTarget::G52, GPUTarget::G52LIT)
- && (kernel_size <= 5)
- && (conv_stride_x == 1) && (conv_stride_y == 1)
- && (data_type == DataType::F32)
- && (data_layout == DataLayout::NCHW);
-}
-
-inline void setup_num_elems_nchw(unsigned int &num_elems_read_per_iteration_x, unsigned int &num_elems_read_per_iteration_y,
- unsigned int &num_elems_written_per_iteration_x, unsigned int &num_elems_written_per_iteration_y,
- unsigned int kernel_size, const PadStrideInfo &conv_info, const GPUTarget target, ITensorInfo *src)
-{
- const DataType data_type = src->data_type();
- const DataLayout data_layout = src->data_layout();
- unsigned int conv_stride_x = std::get<0>(conv_info.stride());
- unsigned int conv_stride_y = std::get<1>(conv_info.stride());
-
- const bool run_optimized_bifrost = can_run_optimized_kernel_for_bifrost_nchw(target, conv_stride_x, conv_stride_y, kernel_size, data_type, data_layout);
-
- if(run_optimized_bifrost)
- {
- // Configure kernel window
- switch(kernel_size)
- {
- case 1:
- {
- num_elems_read_per_iteration_x = 4;
- num_elems_read_per_iteration_y = 4;
- num_elems_written_per_iteration_x = 4;
- num_elems_written_per_iteration_y = 4;
- break;
- }
- case 3:
- {
- num_elems_read_per_iteration_x = 6;
- num_elems_read_per_iteration_y = 5;
- num_elems_written_per_iteration_x = 4;
- num_elems_written_per_iteration_y = 3;
- break;
- }
- case 5:
- {
- num_elems_read_per_iteration_x = 8;
- num_elems_read_per_iteration_y = 6;
- num_elems_written_per_iteration_x = 4;
- num_elems_written_per_iteration_y = 2;
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Kernel size not optimized for Bifrost");
- }
- }
- }
- else
- {
- num_elems_read_per_iteration_y = kernel_size;
- num_elems_written_per_iteration_x = 8;
- num_elems_written_per_iteration_y = 1;
- switch(kernel_size)
- {
- case 1:
- switch(conv_stride_x)
- {
- case 1:
- num_elems_read_per_iteration_x = 8;
- break;
- case 2:
- num_elems_read_per_iteration_x = 16;
- break;
- case 3:
- switch(src->element_size())
- {
- case 1:
- num_elems_read_per_iteration_x = 28;
- break;
- case 2:
- num_elems_read_per_iteration_x = 24;
- break;
- case 4:
- num_elems_read_per_iteration_x = 22;
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid data size");
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid convolution stride X");
- }
- break;
- case 3:
- switch(conv_stride_x)
- {
- case 1:
- num_elems_read_per_iteration_x = 10;
- break;
- case 2:
- num_elems_read_per_iteration_x = 17;
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid convolution stride X");
- }
- break;
- case 5:
- switch(conv_stride_x)
- {
- case 1:
- num_elems_read_per_iteration_x = 12;
- break;
- case 2:
- num_elems_read_per_iteration_x = 20;
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid convolution stride X");
- }
- break;
- case 9:
- switch(conv_stride_x)
- {
- case 1:
- num_elems_read_per_iteration_x = 16;
- break;
- case 2:
- num_elems_read_per_iteration_x = 24;
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid convolution stride X");
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid direct convolution size");
- }
- }
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info, const GPUTarget target)
-{
- const DataLayout data_layout = src->data_layout();
-
- // Get dst shape
- TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
-
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*dst, output_shape,
- 1,
- src->data_type(),
- src->quantization_info());
-
- if(data_layout == DataLayout::NHWC)
- {
- const unsigned int vec_size = std::min(static_cast<unsigned int>(dst->tensor_shape()[0]), 4u);
- unsigned int num_rows = 1U;
- if(dst->tensor_shape()[0] > 16)
- {
- num_rows = src->data_type() == DataType::F32 ? 2U : 4U;
- }
-
- // Create window and update padding
- Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows));
- return std::make_pair(Status{}, win);
- }
- else if(data_layout == DataLayout::NCHW)
- {
- const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const unsigned int kernel_size = weights->dimension(width_idx);
-
- unsigned int num_elems_read_per_iteration_x = 0;
- unsigned int num_elems_read_per_iteration_y = 0;
- unsigned int num_elems_written_per_iteration_x = 0;
- unsigned int num_elems_written_per_iteration_y = 0;
-
- unsigned int conv_pad_left = conv_info.pad_left();
- unsigned int conv_pad_top = conv_info.pad_top();
- unsigned int conv_stride_x = std::get<0>(conv_info.stride());
- unsigned int conv_stride_y = std::get<1>(conv_info.stride());
-
- setup_num_elems_nchw(num_elems_read_per_iteration_x, num_elems_read_per_iteration_y,
- num_elems_written_per_iteration_x, num_elems_written_per_iteration_y,
- kernel_size, conv_info, target, src);
-
- // Create window and update padding
- bool window_changed = false;
- Window win = calculate_max_window(*dst, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
-
- AccessWindowRectangle input_access(src, -conv_pad_left, -conv_pad_top, num_elems_read_per_iteration_x, num_elems_read_per_iteration_y, conv_stride_x, conv_stride_y);
- AccessWindowStatic weights_access(weights, 0, 0, kernel_size, kernel_size);
- AccessWindowRectangle output_access(dst, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
- window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape()));
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not supported");
- }
-}
-
bool export_to_cl_image_support(ITensorInfo *tensor, GPUTarget gpu_target, DataLayout data_layout)
{
if(tensor->tensor_shape()[0] % 4 || (data_layout != DataLayout::NHWC))
@@ -370,11 +167,6 @@ bool export_to_cl_image_support(ITensorInfo *tensor, GPUTarget gpu_target, DataL
} // namespace
-BorderSize ClDirectConv2dKernel::border_size() const
-{
- return _border_size;
-}
-
ClDirectConv2dKernel::ClDirectConv2dKernel()
{
_type = CLKernelType::DIRECT;
@@ -400,24 +192,49 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT
const unsigned int kernel_size = weights->dimension(width_idx);
const DataType data_type = src->data_type();
- const GPUTarget gpu_target = get_target();
+ const GPUTarget gpu_target = get_target();
+ unsigned int _num_elems_processed_per_iteration = 0;
+
+ // Get dst shape
+ TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*dst, output_shape,
+ 1,
+ src->data_type(),
+ src->quantization_info());
// Configure kernel window
- auto win_config = validate_and_configure_window(src, weights, dst, conv_info, gpu_target);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
+ Window win;
+ if(_data_layout == DataLayout::NHWC)
+ {
+ const unsigned int vec_size = std::min(static_cast<unsigned int>(dst->tensor_shape()[0]), 4u);
+ unsigned int num_rows = 1U;
+ if(dst->tensor_shape()[0] > 16)
+ {
+ num_rows = src->data_type() == DataType::F32 ? 2U : 4U;
+ }
+
+ // Create window and update padding
+ win = calculate_max_window(output_shape, Steps(vec_size, num_rows));
+ }
+ else if(_data_layout == DataLayout::NCHW)
+ {
+ _num_elems_processed_per_iteration = 1u;
+ win = calculate_max_window(*dst, Steps(_num_elems_processed_per_iteration));
+ }
+
+ ICLKernel::configure_internal(win);
std::stringstream kernel_name;
CLBuildOptions build_options;
if(_data_layout == DataLayout::NHWC)
{
- _border_size = BorderSize();
-
kernel_name << "direct_convolution_nhwc";
- const unsigned int n0 = win_config.second.x().step();
- const unsigned int m0 = win_config.second.y().step();
+ const unsigned int n0 = win.x().step();
+ const unsigned int m0 = win.y().step();
const unsigned int k0 = adjust_vec_size(is_data_type_quantized(data_type) ? 16u : 8u, src->dimension(channel_idx));
const unsigned int partial_store_n0 = dst->dimension(channel_idx) % n0;
const unsigned int pad_left = conv_info.pad_left();
@@ -438,14 +255,8 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT
build_options.add_option("-cl-fast-relaxed-math");
build_options.add_option("-DSRC_TENSOR_TYPE=BUFFER");
- build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(width_idx)));
- build_options.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(height_idx)));
- build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src->dimension(channel_idx)));
build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
build_options.add_option("-DDST_TENSOR_TYPE=BUFFER");
- build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(width_idx)));
- build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(height_idx)));
- build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(channel_idx)));
build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
build_options.add_option_if_else(export_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER");
build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights->dimension(width_idx)));
@@ -459,6 +270,7 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT
build_options.add_option("-DM0=" + support::cpp11::to_string(m0));
build_options.add_option("-DK0=" + support::cpp11::to_string(k0));
build_options.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
+ build_options.add_option_if((src->dimension(channel_idx) % k0) != 0, "-DLEFTOVER_LOOP");
build_options.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
if(is_data_type_quantized(data_type))
@@ -497,47 +309,42 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT
}
else
{
- _border_size = BorderSize(src->padding());
-
- kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size;
-
+ kernel_name << "direct_convolution_nchw";
build_options.add_option_if(biases != nullptr, std::string("-DHAS_BIAS"));
+ build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(width_idx)));
+ build_options.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(height_idx)));
+ build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src->dimension(channel_idx)));
+ build_options.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
+ build_options.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
+ build_options.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x));
+ build_options.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_stride_y));
+ build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights->dimension(width_idx)));
+ build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx)));
+ build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)));
+ build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type)));
+ build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx))));
+ build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x)));
+ build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type)));
+ build_options.add_option(std::string("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration)));
+ build_options.add_option(std::string("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration)));
- const bool run_optimized_for_bifrost = can_run_optimized_kernel_for_bifrost_nchw(gpu_target, conv_stride_x, conv_stride_y, kernel_size, data_type, _data_layout);
-
- if(run_optimized_for_bifrost)
+ if(is_data_type_quantized(data_type))
{
- build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx))));
+ const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
+ const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
+ const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
- kernel_name << "_f32_bifrost";
- }
- else
- {
- build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)));
- build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type)));
- build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx))));
- build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x)));
- build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type)));
-
- if(is_data_type_quantized(data_type))
- {
- const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
- const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
- const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
-
- float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
- int output_multiplier = 0;
- int output_shift = 0;
- quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
- build_options.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
- build_options.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
- build_options.add_option("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size));
- build_options.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iqinfo.offset));
- build_options.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wqinfo.offset));
- build_options.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oqinfo.offset));
-
- kernel_name.str("direct_convolution_quantized");
- }
+ float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
+ build_options.add_option("-DIS_QUANTIZED");
+ build_options.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+ build_options.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
+ build_options.add_option("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size));
+ build_options.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iqinfo.offset));
+ build_options.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wqinfo.offset));
+ build_options.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oqinfo.offset));
}
}
@@ -570,11 +377,9 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT
}
Status ClDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const GPUTarget target)
+ const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), weights->clone().get(), dst->clone().get(), conv_info, target).first);
-
return Status{};
}
@@ -613,13 +418,13 @@ void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl
}
unsigned int idx = 0;
- add_4D_tensor_argument(idx, src, slice);
- add_4D_tensor_argument(idx, dst, slice);
+ add_4d_tensor_nhwc_argument(idx, src);
+ add_4d_tensor_nhwc_argument(idx, dst);
if(export_to_cl_image)
{
_kernel.setArg(idx++, weights_cl_image);
}
- add_4D_tensor_argument(idx, weights, slice);
+ add_4d_tensor_nhwc_argument(idx, weights);
if(biases != nullptr)
{
add_1D_tensor_argument(idx, biases, slice);
@@ -628,22 +433,7 @@ void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl
}
else
{
- Window win_in = window;
-
- win_in.adjust(Window::DimX, -_conv_info.pad_left(), true);
- win_in.adjust(Window::DimY, -_conv_info.pad_top(), true);
-
- const int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-
- const int conv_stride_x = std::get<0>(_conv_info.stride());
- const int conv_stride_y = std::get<1>(_conv_info.stride());
-
- win_in.set_dimension_step(width_idx, window[width_idx].step() * conv_stride_x);
- win_in.set_dimension_step(height_idx, window[height_idx].step() * conv_stride_y);
-
- Window slice_in = win_in.first_slice_window_3D();
- unsigned int idx1 = 2 * num_arguments_per_3D_tensor();
+ unsigned int idx1 = 2 * num_arguments_per_3D_tensor();
add_3D_tensor_argument(idx1, weights, slice);
if(biases != nullptr)
@@ -658,11 +448,11 @@ void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl
do
{
unsigned int idx = 0;
- add_3D_tensor_argument(idx, src, slice_in);
+ add_3D_tensor_argument(idx, src, slice);
add_3D_tensor_argument(idx, dst, slice);
enqueue(queue, *this, slice, lws_hint());
}
- while(window.slide_window_slice_3D(slice) && win_in.slide_window_slice_3D(slice_in));
+ while(window.slide_window_slice_3D(slice));
}
}
} // namespace kernels
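In the refactored configure() above, the NHWC branch sizes the execution window purely from the output shape. A quick worked example under the same rules, with values chosen only for illustration:

    // Assume an F32 NHWC destination with 20 channels.
    const unsigned int vec_size = std::min(20u, 4u); // 4 channels processed per work-item
    unsigned int       num_rows = 1u;
    if(20u > 16u)
    {
        num_rows = 2u;                               // F32 with more than 16 channels -> 2 rows
    }
    // The window is then built as Steps(vec_size, num_rows) == Steps(4, 2).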
diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.h b/src/gpu/cl/kernels/ClDirectConv2dKernel.h
index 5624f3a0a..568192781 100644
--- a/src/gpu/cl/kernels/ClDirectConv2dKernel.h
+++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.h
@@ -72,15 +72,13 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const GPUTarget target);
+ const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
public:
DataLayout _data_layout{};
- BorderSize _border_size{};
PadStrideInfo _conv_info{};
};
} // namespace kernels
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
index af794354c..05988997e 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
@@ -275,6 +275,9 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile
// Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
// NOTE: This might have implications on heuristics and performance
const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
+ _m = internal_m;
+ _n = gemm_info.n;
+ _k = gemm_info.k;
// Create build options
CLBuildOptions build_opts;
@@ -289,9 +292,6 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile
build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
- build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
- build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
- build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
@@ -312,6 +312,9 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile
std::string kernel_name("gemm_mm_native");
post_op_utils.set_post_ops_cl_kernel_name(kernel_name, gemm_info.post_ops);
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+
// Create kernel
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
@@ -392,11 +395,11 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window
unsigned int idx0;
if(_add_bias)
{
- idx0 = (4 + _num_post_op_args) * num_arguments_per_2D_tensor() + (4 + _num_post_op_args);
+ idx0 = (4 + _num_post_op_args) * num_arguments_per_2D_tensor() + (7 + _num_post_op_args);
}
else
{
- idx0 = (3 + _num_post_op_args) * num_arguments_per_2D_tensor() + (3 + _num_post_op_args);
+ idx0 = (3 + _num_post_op_args) * num_arguments_per_2D_tensor() + (6 + _num_post_op_args);
}
const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom;
_kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
@@ -408,11 +411,11 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window
unsigned int idx0;
if(_add_bias)
{
- idx0 = (4 + _num_post_op_args) * num_arguments_per_2D_tensor() + 4 + (_reinterpret_input_as_3d ? 1 : 0) + _num_post_op_args;
+ idx0 = (4 + _num_post_op_args) * num_arguments_per_2D_tensor() + 7 + (_reinterpret_input_as_3d ? 1 : 0) + _num_post_op_args;
}
else
{
- idx0 = (3 + _num_post_op_args) * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0) + _num_post_op_args;
+ idx0 = (3 + _num_post_op_args) * num_arguments_per_2D_tensor() + 6 + (_reinterpret_input_as_3d ? 1 : 0) + _num_post_op_args;
}
const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom;
_kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
@@ -455,6 +458,12 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window
const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i)));
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(post_op_arg->info()->strides_in_bytes()[2]));
}
+
+ // Pass m, n and k at runtime
+ _kernel.setArg<cl_int>(idx++, _m);
+ _kernel.setArg<cl_int>(idx++, _n);
+ _kernel.setArg<cl_int>(idx++, _k);
+
enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
}
while(window.slide_window_slice_3D(slice));
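Two related changes appear in this hunk: the kernel name is turned into a compile-time macro guard so the OpenCL program only builds the variant actually requested, and M, N and K are supplied as runtime kernel arguments instead of -DM/-DN/-DK build options. A condensed sketch of the pattern, reusing the names from the hunk; the .cl-side guard in the comment is an assumption, since the kernel source is not part of this hunk:

    // Host side: derive a guard macro from the kernel name, e.g. -DGEMM_MM_NATIVE.
    std::string kernel_name("gemm_mm_native");
    build_opts.add_option("-D" + upper_string(kernel_name));

    // Presumed device side: each variant compiles only when its guard is defined.
    //   #if defined(GEMM_MM_NATIVE)
    //   __kernel void gemm_mm_native(...) { /* ... */ }
    //   #endif

    // Dimensions are passed per enqueue rather than baked into the program binary.
    _kernel.setArg<cl_int>(idx++, _m);
    _kernel.setArg<cl_int>(idx++, _n);
    _kernel.setArg<cl_int>(idx++, _k);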
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
index 415eb7bf3..e478df727 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
@@ -81,6 +81,9 @@ private:
bool _reinterpret_output_as_3d{ false };
bool _use_dummy_work_items{ false };
bool _add_bias{ false };
+ signed int _m{ 1 };
+ signed int _n{ 1 };
+ signed int _k{ 1 };
unsigned int _num_post_op_args{ 0 }; // (EXPERIMENTAL_POST_OPS) total number of post op arguments
};
} // namespace kernels
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
index 64e99332f..6a450b652 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
@@ -201,7 +201,6 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi
_use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
_add_bias = src2 != nullptr;
_export_to_cl_image = rhs_info.export_to_cl_image;
- _k = gemm_info.k;
_num_post_op_args = gemm_info.post_ops.total_num_arguments();
// Check if we need to slide the matrix B
@@ -230,6 +229,9 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi
const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
+ _m = gemm_info.m;
+ _n = gemm_info.n;
+ _k = gemm_info.k;
// Create build options
CLBuildOptions build_opts;
@@ -250,9 +252,6 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi
build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(src1->dimension(1)));
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
build_opts.add_option("-DDATA_TYPE_ACCUMULATOR=" + (enable_mixed_precision ? get_cl_type_from_data_type(DataType::F32) : get_cl_type_from_data_type(data_type)));
- build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m));
- build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
- build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
@@ -278,6 +277,9 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi
kernel_name += rhs_info.export_to_cl_image ? "_texture" : "";
post_op_utils.set_post_ops_cl_kernel_name(kernel_name, gemm_info.post_ops);
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+
// Create kernel
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
@@ -399,9 +401,6 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind
add_2D_tensor_argument(idx, post_op_arg, slice);
}
- // K dimension (not used if _export_to_cl_image == true)
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_k));
-
// LHS stride_z
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
@@ -429,6 +428,13 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad));
}
+ // Pass m, n and k at runtime
+ _kernel.setArg<cl_int>(idx++, _m);
+ _kernel.setArg<cl_int>(idx++, _n);
+
+ // K dimension (not used if _export_to_cl_image == true)
+ _kernel.setArg<cl_int>(idx++, _k);
+
// Dispatch kernel
enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
}
@@ -436,4 +442,4 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind
}
} // namespace kernels
} // namespace opencl
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
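The switch from -DM/-DN/-DK compile-time defines to runtime kernel arguments also changes their signedness: m, n and k are now passed as signed ints so that any subtraction they take part in inside the kernels stays signed. A minimal stand-alone C++ sketch of the pitfall being avoided (illustrative only; the variable names are not from the library):

#include <cstdio>

int main()
{
    const unsigned int m_u = 4, m0_u = 8; // a tail block smaller than the block size M0
    const signed int   m_s = 4, m0_s = 8;

    // With unsigned operands the subtraction wraps around to a huge value...
    std::printf("unsigned: %u\n", m_u - m0_u); // 4294967292
    // ...while with signed operands it stays negative, which is what
    // boundary handling inside a kernel would expect.
    std::printf("signed:   %d\n", m_s - m0_s); // -4
    return 0;
}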
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
index 09160ec0d..2d668b91a 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
@@ -105,7 +105,9 @@ private:
bool _use_dummy_work_items{ false };
bool _add_bias{ false };
bool _export_to_cl_image{ false };
- unsigned int _k{ 1 };
+ signed int _m{ 1 };
+ signed int _n{ 1 };
+ signed int _k{ 1 };
unsigned int _num_post_op_args{ 0 }; // (EXPERIMENTAL_POST_OPS) total number of post op arguments
};
} // namespace kernels
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
index aa806978e..a8bcf8d6a 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -240,7 +240,9 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext
// Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
const unsigned int partial_store_m0 = internal_m % internal_m0;
const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
-
+ _m = internal_m;
+ _n = gemm_info.n;
+ _k = gemm_info.k;
// Create build options
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type()));
@@ -253,9 +255,6 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext
build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT");
build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(src1->dimension(1)));
- build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
- build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
- build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
@@ -286,6 +285,9 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext
kernel_name += rhs_info.export_to_cl_image ? "_texture" : "";
post_op_utils.set_post_ops_cl_kernel_name(kernel_name, gemm_info.post_ops);
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+
// Create kernel
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
@@ -447,6 +449,11 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad_out));
}
+ // Pass m, n and k at runtime as signed ints, to ensure that the result of any subtraction they appear in as an operand remains signed.
+ _kernel.setArg<cl_int>(idx++, _m);
+ _kernel.setArg<cl_int>(idx++, _n);
+ _kernel.setArg<cl_int>(idx++, _k);
+
enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
}
while(window.slide_window_slice_3D(slice));
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
index a8f0c4c3a..00cdb299c 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -97,6 +97,9 @@ private:
bool _add_bias{ false };
bool _export_to_cl_image{ false };
bool _has_pad_y{ false };
+ signed int _m{ 1 };
+ signed int _n{ 1 };
+ signed int _k{ 1 };
unsigned int _num_post_op_args{ 0 }; // (EXPERIMENTAL_POST_OPS) total number of post op arguments
};
} // namespace kernels
diff --git a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp
index 4a01c77d0..413c70ae1 100644
--- a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp
@@ -55,6 +55,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
+ ARM_COMPUTE_RETURN_ERROR_ON((lhs_info.m0 > 4 && lhs_info.m0 < 8) && lhs_info.transpose);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
@@ -70,11 +71,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
+Window configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
{
const unsigned int num_elems_processed_per_iteration_x = lhs_info.k0;
const unsigned int num_elems_processed_per_iteration_y = lhs_info.m0;
- bool window_changed = false;
TensorInfo tmp_info(*src);
@@ -91,23 +91,13 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITenso
auto_init_if_empty(*dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(*src, lhs_info, reinterpret_input_as_3d)));
// Configure window
- Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- Window win_in = calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowStatic src_access(src, 0, 0,
- src->dimension(0),
- src->dimension(1));
- AccessWindowStatic dst_access(dst, 0, 0, dst->dimension(0), dst->dimension(1));
-
- window_changed = update_window_and_padding(win_in, src_access) || // window used by the execute_window_loop
- update_window_and_padding(win, dst_access); // window used to update the padding requirements of dst tensor
+ Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
// Collapse along the Z direction
// This collapse needs to be here in order to tune the Z dimension of LWS
Window collapsed = win.collapse(win, Window::DimZ);
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, collapsed);
+ return collapsed;
}
} // namespace
@@ -125,27 +115,20 @@ void ClGemmReshapeLhsMatrixKernel::configure(const CLCompileContext &compile_con
auto padding_info = get_padding_info({ src });
- _reinterpret_input_as_3d = reinterpret_input_as_3d;
-
- const unsigned int src_w = src->dimension(0);
- const unsigned int src_h = _reinterpret_input_as_3d ? src->dimension(1) * src->dimension(2) : src->dimension(1);
- const unsigned int partial_load_m0 = src_h % lhs_info.m0;
- const unsigned int partial_load_k0 = src_w % lhs_info.k0;
+ const unsigned int src_w = src->dimension(0);
+ const unsigned int m = reinterpret_input_as_3d ? src->dimension(1) * src->dimension(2) : src->dimension(1);
+ const unsigned int partial_m0 = m % lhs_info.m0;
+ const unsigned int partial_k0 = src_w % lhs_info.k0;
// Create build options
CLBuildOptions build_opts;
build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
- build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
- build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_w));
- build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_h));
build_opts.add_option_if(lhs_info.interleave, "-DINTERLEAVE");
- build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_input_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(src->dimension(1)));
- build_opts.add_option_if(_reinterpret_input_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(src->dimension(2)));
+ build_opts.add_option_if_else(lhs_info.transpose, "-DRESHAPE_LHS_T", "-DRESHAPE_LHS_NT");
build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size()));
- build_opts.add_option("-DPARTIAL_LOAD_M0=" + support::cpp11::to_string(partial_load_m0));
- build_opts.add_option("-DPARTIAL_LOAD_K0=" + support::cpp11::to_string(partial_load_k0));
+ build_opts.add_option("-DPARTIAL_M0=" + support::cpp11::to_string(partial_m0));
+ build_opts.add_option("-DPARTIAL_K0=" + support::cpp11::to_string(partial_k0));
std::string kernel_name("gemm_reshape_lhs_matrix_");
kernel_name += lhs_info.transpose ? "t" : "nt";
@@ -154,13 +137,16 @@ void ClGemmReshapeLhsMatrixKernel::configure(const CLCompileContext &compile_con
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Configure kernel window
- auto win_config = validate_and_configure_window(src, dst, lhs_info, reinterpret_input_as_3d);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
+ auto win_config = configure_window(src, dst, lhs_info, reinterpret_input_as_3d);
+ ICLKernel::configure_internal(win_config);
+
+ unsigned int idx = 2 * num_arguments_per_3d_tensor_nhw();
+ _kernel.setArg<cl_int>(idx++, m);
+ _kernel.setArg<cl_int>(idx++, lhs_info.v0);
// Set config_id for enabling LWS tuning
_config_id = "gemm_reshape_lhs_matrix_";
- _config_id += (_reinterpret_input_as_3d ? "3d_" : "");
+ _config_id += (reinterpret_input_as_3d ? "3d_" : "");
_config_id += lower_string(string_from_data_type(src->data_type()));
_config_id += "_";
_config_id += support::cpp11::to_string(dst->dimension(0));
@@ -185,8 +171,6 @@ void ClGemmReshapeLhsMatrixKernel::configure(const CLCompileContext &compile_con
Status ClGemmReshapeLhsMatrixKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, lhs_info, reinterpret_input_as_3d));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), lhs_info, reinterpret_input_as_3d).first);
-
return Status{};
}
@@ -202,19 +186,11 @@ void ClGemmReshapeLhsMatrixKernel::run_op(ITensorPack &tensors, const Window &wi
Window slice = window.first_slice_window_3D();
- if(_reinterpret_input_as_3d)
- {
- // Pass bottom paddings to the kernel if the src has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 2 * num_arguments_per_3D_tensor();
- const unsigned int total_cross_plane_pad = src->info()->padding().top + src->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
do
{
unsigned int idx = 0;
- add_3D_tensor_argument(idx, src, slice);
- add_3D_tensor_argument(idx, dst, slice);
+ add_3d_tensor_nhw_argument(idx, src);
+ add_3d_tensor_nhw_argument(idx, dst);
enqueue(queue, *this, slice, lws_hint());
}
while(window.slide_window_slice_3D(slice));
diff --git a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h
index 69ec8f04f..db88e0d73 100644
--- a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h
+++ b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h
@@ -68,9 +68,6 @@ public:
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-
-private:
- bool _reinterpret_input_as_3d{ false };
};
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp
index 778b9b9fa..b3a03880e 100644
--- a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp
@@ -123,10 +123,9 @@ void ClGemmReshapeRhsMatrixKernel::configure(const CLCompileContext &compile_con
CLBuildOptions build_opts;
build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
- build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
- build_opts.add_option_if(rhs_info.transpose, "-DTRANSPOSE");
build_opts.add_option_if(rhs_info.interleave, "-DINTERLEAVE");
- build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(1)));
+ build_opts.add_option_if(rhs_info.transpose, "-DRESHAPE_RHS_T");
+ build_opts.add_option_if(!rhs_info.transpose, "-DRESHAPE_RHS_NT");
build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size()));
std::string kernel_name("gemm_reshape_rhs_matrix_");
@@ -139,6 +138,9 @@ void ClGemmReshapeRhsMatrixKernel::configure(const CLCompileContext &compile_con
auto win_config = validate_and_configure_window(src, dst, rhs_info);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICLKernel::configure_internal(win_config.second);
+
+ unsigned int idx = 2 * num_arguments_per_3d_tensor_nhw();
+ _kernel.setArg<cl_int>(idx++, rhs_info.h0);
}
Status ClGemmReshapeRhsMatrixKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info)
@@ -164,8 +166,8 @@ void ClGemmReshapeRhsMatrixKernel::run_op(ITensorPack &tensors, const Window &wi
do
{
unsigned int idx = 0;
- add_3D_tensor_argument(idx, src, slice);
- add_3D_tensor_argument(idx, dst, slice);
+ add_3d_tensor_nhw_argument(idx, src);
+ add_3d_tensor_nhw_argument(idx, dst);
enqueue(queue, *this, slice, lws_hint());
}
while(window.slide_window_slice_3D(slice));
diff --git a/src/gpu/cl/kernels/ClIm2ColKernel.cpp b/src/gpu/cl/kernels/ClIm2ColKernel.cpp
index c42762b99..6d1271d24 100644
--- a/src/gpu/cl/kernels/ClIm2ColKernel.cpp
+++ b/src/gpu/cl/kernels/ClIm2ColKernel.cpp
@@ -195,10 +195,16 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D
if(kernel_dims == Size2D(3U, 3U))
{
kernel_name = "im2col3x3_";
+ build_opts.add_option("-DIM2COL_3X3");
}
else if(kernel_dims == Size2D(9U, 9U))
{
kernel_name = "im2col9x9_";
+ build_opts.add_option("-DIM2COL_9X9");
+ }
+ else
+ {
+ build_opts.add_option("-DIM2COL_GENERIC");
}
// Get boundary vector (the first/last vector with potentially a partial vector size) size
diff --git a/src/gpu/cl/kernels/ClPool2dKernel.cpp b/src/gpu/cl/kernels/ClPool2dKernel.cpp
index 5e53799f3..2c98c5940 100644
--- a/src/gpu/cl/kernels/ClPool2dKernel.cpp
+++ b/src/gpu/cl/kernels/ClPool2dKernel.cpp
@@ -57,6 +57,9 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
unsigned int pool_size_y = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
int output_width = 0;
int output_height = 0;
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_region_entirely_outside_input(pool_info), "Pooling region that is entirely outside input tensor is unsupported");
+
std::tie(output_width, output_height) = scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height],
pool_size_x, pool_size_y, pool_info.pad_stride_info);
ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), "Calculated output dimension size is invalid");
diff --git a/src/gpu/cl/kernels/ClScaleKernel.cpp b/src/gpu/cl/kernels/ClScaleKernel.cpp
index d63c0e175..6f16adc65 100644
--- a/src/gpu/cl/kernels/ClScaleKernel.cpp
+++ b/src/gpu/cl/kernels/ClScaleKernel.cpp
@@ -117,9 +117,7 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn
const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
const unsigned int src_width = src->dimension(idx_width);
const unsigned int src_height = src->dimension(idx_height);
- const unsigned int src_channel = src->dimension(idx_channel);
const unsigned int dst_width = dst->dimension(idx_width);
- const unsigned int dst_height = dst->dimension(idx_height);
const unsigned int dst_channels = dst->dimension(idx_channel);
unsigned int vec_size = 0;
unsigned int vec_size_leftover = 0;
@@ -130,20 +128,13 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn
vec_size = adjust_vec_size(src->data_type() == DataType::F32 ? 4 : 8, dst_channels);
vec_size_leftover = dst_channels % vec_size;
build_opts.add_option("-DSRC_TENSOR_TYPE=BUFFER");
- build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_width));
- build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_height));
- build_opts.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src_channel));
build_opts.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
build_opts.add_option("-DDST_TENSOR_TYPE=BUFFER");
- build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst_width));
- build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst_height));
- build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst_channels));
build_opts.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type()));
- build_opts.add_option("-DSCALE_X=" + float_to_string_with_full_precision(scale_x));
- build_opts.add_option("-DSCALE_Y=" + float_to_string_with_full_precision(scale_y));
build_opts.add_option("-DN0=" + support::cpp11::to_string(vec_size));
build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(vec_size_leftover));
+ build_opts.add_option("-DSCALE_" + string_from_interpolation_policy(interpolation_policy_to_use));
build_opts.add_option_if(src->num_dimensions() > 3, "-DBATCHED_EXECUTION");
build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE");
build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT");
@@ -203,6 +194,13 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+ // Pass scale kernel arguments
+ if(is_nhwc)
+ {
+ unsigned int idx = 2 * num_arguments_per_4d_tensor_nhwc();
+ _kernel.setArg<cl_float>(idx++, scale_x);
+ _kernel.setArg<cl_float>(idx++, scale_y);
+ }
// Set config_id for enabling LWS tuning
_config_id = "scale_";
_config_id += (info.border_mode == BorderMode::REPLICATE ? "Bord_rep" : "");
@@ -248,8 +246,8 @@ void ClScaleKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comma
Window slice = collapsed.first_slice_window_4D();
unsigned int idx = 0;
- add_4D_tensor_argument(idx, src, slice);
- add_4D_tensor_argument(idx, dst, slice);
+ add_4d_tensor_nhwc_argument(idx, src);
+ add_4d_tensor_nhwc_argument(idx, dst);
enqueue(queue, *this, slice, lws_hint());
break;
}
diff --git a/src/gpu/cl/operators/ClConv2d.cpp b/src/gpu/cl/operators/ClConv2d.cpp
index d633c8f73..23c1b8af9 100644
--- a/src/gpu/cl/operators/ClConv2d.cpp
+++ b/src/gpu/cl/operators/ClConv2d.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -255,19 +255,39 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const
// Floating-point case: GeMM/Direct/Winograd
if(is_data_type_float(src->data_type()))
{
- const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr);
- const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16;
- const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U);
+ // Get dst shape
+ TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
+ const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr);
+ const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16;
+ const bool is_ofm_lte_8 = weights->dimension(3U) <= 8;
+ const bool workload_gte_8192 = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 >= 8192;
+ const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U);
// Run Winograd if valid and IFM >= 16
if(is_wino_valid && is_ifm_ge_16)
{
return ConvolutionMethod::WINOGRAD;
}
- // Run Direct for Large kernel size
- if(is_large_kernel_sz && is_ifm_ge_16 && is_direct_valid && is_ifm_gt_ofm)
+
+ // Direct convolution case
+ if(is_direct_valid)
{
- return ConvolutionMethod::DIRECT;
+ if((gpu_target == arm_compute::GPUTarget::G71 ||
+ gpu_target == arm_compute::GPUTarget::G72 ||
+ gpu_target == arm_compute::GPUTarget::MIDGARD))
+ {
+ if(is_large_kernel_sz && is_ifm_ge_16 && is_ifm_gt_ofm)
+ {
+ return ConvolutionMethod::DIRECT;
+ }
+ }
+ else
+ {
+ if((is_large_kernel_sz && workload_gte_8192 && is_ifm_ge_16) || (is_ofm_lte_8 && is_ifm_ge_16))
+ {
+ return ConvolutionMethod::DIRECT;
+ }
+ }
}
// Default case
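Read as a predicate, the revised heuristic keeps the old rule (large kernel, IFM >= 16, IFM > OFM) on G71/G72/Midgard and, on other targets, also selects direct convolution when the estimated workload is large enough or the OFM count is at most 8. A self-contained sketch of that decision (simplified flags in place of the library's ITensorInfo queries; illustrative only):

#include <cstdio>

enum class Gpu { G71, G72, Midgard, Other };

// Simplified stand-in for the selection logic above.
bool prefer_direct_conv(Gpu gpu, bool large_kernel, bool ifm_ge_16, bool ifm_gt_ofm,
                        bool ofm_lte_8, bool workload_gte_8192)
{
    if(gpu == Gpu::G71 || gpu == Gpu::G72 || gpu == Gpu::Midgard)
    {
        return large_kernel && ifm_ge_16 && ifm_gt_ofm;
    }
    return (large_kernel && workload_gte_8192 && ifm_ge_16) || (ofm_lte_8 && ifm_ge_16);
}

int main()
{
    // Example: 32 input channels, 8 output channels, small kernel, on a recent target.
    std::printf("%d\n", prefer_direct_conv(Gpu::Other, false, true, true, true, false)); // prints 1
    return 0;
}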
diff --git a/src/gpu/cl/operators/ClDirectConv2d.cpp b/src/gpu/cl/operators/ClDirectConv2d.cpp
index d2e4049a0..53de6fc40 100644
--- a/src/gpu/cl/operators/ClDirectConv2d.cpp
+++ b/src/gpu/cl/operators/ClDirectConv2d.cpp
@@ -83,7 +83,7 @@ void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorI
Status ClDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
{
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConv2dKernel::validate(src, weights, biases, dst, conv_info, ActivationLayerInfo(), CLScheduler::get().target()));
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConv2dKernel::validate(src, weights, biases, dst, conv_info, ActivationLayerInfo()));
if(act_info.enabled())
{
ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, dst, act_info));
diff --git a/src/gpu/cl/operators/ClGemm.cpp b/src/gpu/cl/operators/ClGemm.cpp
index 50ecb214e..88f6b79b5 100644
--- a/src/gpu/cl/operators/ClGemm.cpp
+++ b/src/gpu/cl/operators/ClGemm.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -191,7 +191,6 @@ ClGemm::ClGemm()
_mm_native_kernel(std::make_unique<ClGemmMatrixMultiplyNativeKernel>()),
_mm_reshaped_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedKernel>()),
_mm_reshaped_only_rhs_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedOnlyRhsKernel>()),
- _mm_reshaped_only_rhs_fallback_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedOnlyRhsKernel>()),
_tmp_a(),
_tmp_b(),
_reshape_b_only_on_first_run(false),
@@ -303,7 +302,6 @@ void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context
// Set the target for the kernels
_mm_reshaped_only_rhs_kernel->set_target(gpu_target);
- _mm_reshaped_only_rhs_fallback_kernel->set_target(gpu_target);
GEMMLHSMatrixInfo lhs_info{};
GEMMRHSMatrixInfo rhs_info{};
@@ -322,10 +320,6 @@ void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context
kernel_info.has_pad_y = false;
_mm_reshaped_only_rhs_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
- // Configure matrix multiply kernel with y padding support
- kernel_info.has_pad_y = true;
- _mm_reshaped_only_rhs_fallback_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
-
// Request memory for RHS reshape matrix
_aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
}
@@ -625,7 +619,7 @@ void ClGemm::run(ITensorPack &tensors)
if(has_pad_y)
{
- CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_fallback_kernel, gemm_reshaped_onlyrhs_pack, true);
+ ARM_COMPUTE_ERROR_ON(has_pad_y);
}
else
{
diff --git a/src/gpu/cl/operators/ClGemm.h b/src/gpu/cl/operators/ClGemm.h
index e084e53fe..3c0cad3ca 100644
--- a/src/gpu/cl/operators/ClGemm.h
+++ b/src/gpu/cl/operators/ClGemm.h
@@ -121,7 +121,6 @@ private:
std::unique_ptr<kernels::ClGemmMatrixMultiplyNativeKernel> _mm_native_kernel;
std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedKernel> _mm_reshaped_kernel;
std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedOnlyRhsKernel> _mm_reshaped_only_rhs_kernel;
- std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedOnlyRhsKernel> _mm_reshaped_only_rhs_fallback_kernel;
TensorInfo _tmp_a;
TensorInfo _tmp_b;
bool _reshape_b_only_on_first_run;
diff --git a/src/graph/DataLayerVisitor.cpp b/src/graph/DataLayerVisitor.cpp
index 3d5d9578c..85d24b465 100644
--- a/src/graph/DataLayerVisitor.cpp
+++ b/src/graph/DataLayerVisitor.cpp
@@ -131,6 +131,14 @@ void DataLayerVisitor::visit(FusedConvolutionBatchNormalizationNode &n)
add_convolution_layer_method<FusedConvolutionBatchNormalizationNode>(_layer_data, n);
}
+void DataLayerVisitor::visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n)
+{
+ _layer_data.clear();
+ add_generic_layer_data<FusedConvolutionBatchNormalizationWithPostOpsNode>(_layer_data, n);
+ add_convolution_layer_data<FusedConvolutionBatchNormalizationWithPostOpsNode>(_layer_data, n);
+ add_convolution_layer_method<FusedConvolutionBatchNormalizationWithPostOpsNode>(_layer_data, n);
+}
+
void DataLayerVisitor::visit(FusedDepthwiseConvolutionBatchNormalizationNode &n)
{
_layer_data.clear();
diff --git a/src/graph/INodeVisitor.cpp b/src/graph/INodeVisitor.cpp
index ceaaeae7f..f067d618b 100644
--- a/src/graph/INodeVisitor.cpp
+++ b/src/graph/INodeVisitor.cpp
@@ -85,6 +85,10 @@ void DefaultNodeVisitor::visit(FusedConvolutionBatchNormalizationNode &n)
{
default_visit(n);
}
+void DefaultNodeVisitor::visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n)
+{
+ default_visit(n);
+}
void DefaultNodeVisitor::visit(FusedConvolutionWithPostOpNode &n)
{
default_visit(n);
@@ -147,4 +151,4 @@ void DefaultNodeVisitor::visit(StackLayerNode &n)
}
#endif /* DOXYGEN_SKIP_THIS */
} // namespace graph
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/backends/CL/CLFunctionsFactory.cpp b/src/graph/backends/CL/CLFunctionsFactory.cpp
index 838977e75..c67f6a538 100644
--- a/src/graph/backends/CL/CLFunctionsFactory.cpp
+++ b/src/graph/backends/CL/CLFunctionsFactory.cpp
@@ -318,6 +318,8 @@ std::unique_ptr<IFunction> CLFunctionFactory::create(INode *node, GraphContext &
return detail::create_stack_layer<CLStackLayer, CLTargetInfo>(*polymorphic_downcast<StackLayerNode *>(node));
case NodeType::StridedSliceLayer:
return detail::create_strided_slice_layer<CLStridedSlice, CLTargetInfo>(*polymorphic_downcast<StridedSliceLayerNode *>(node));
+ case NodeType::FusedConvolutionBatchNormalizationLayerWithPostOpsLayer:
+ return detail::create_fused_convolution_batch_normalization_with_post_op<CLFusedLayerTypes, CLTargetInfo>(*polymorphic_downcast<FusedConvolutionBatchNormalizationWithPostOpsNode *>(node), ctx);
default:
return nullptr;
}
diff --git a/src/graph/mutators/NodeFusionMutator.cpp b/src/graph/mutators/NodeFusionMutator.cpp
index 3f979e48e..5284fce80 100644
--- a/src/graph/mutators/NodeFusionMutator.cpp
+++ b/src/graph/mutators/NodeFusionMutator.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/graph/Utils.h"
#include "arm_compute/graph/backends/BackendRegistry.h"
#include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h"
+#include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.h"
#include "arm_compute/graph/nodes/FusedConvolutionWithPostOpNode.h"
#include "arm_compute/graph/nodes/Nodes.h"
@@ -420,88 +421,91 @@ void fuse_convolution_with_post_op(Graph &g, INode *fused_node, std::list<INode
}
}
-std::list<INode *> get_post_op_list(Graph &g, int &eltwise_operand_id, int &prev_op_dst_pos, int conv_node_id, const std::set<Activation> &supported_fused_activations)
+std::list<INode *> get_post_op_list(Graph &g, int &eltwise_operand_id, int &prev_op_dst_pos, unsigned int conv_node_id, const std::set<Activation> &supported_fused_activations)
{
std::list<INode *> post_op_node_list = {};
NodeID prev_op_dst_id = conv_node_id;
NodeType post_op_type_list[3] = { NodeType::Dummy, NodeType::Dummy, NodeType::Dummy };
int post_op_idx = 0;
- for(unsigned int i = conv_node_id + 1; i < g.nodes().size(); ++i)
+
+ // Get list of the connected nodes
+ auto current_node = g.node(conv_node_id);
+
+ while(post_op_node_list.size() < 3)
{
- auto post_op_node = g.node(i);
+ // This convolution node must have only one output edge, otherwise this function would not have been called
+
+ auto current_output_edge_id = current_node->output_edges().begin();
+ auto current_output_edge = g.edge(*current_output_edge_id);
+ auto post_op_node = current_output_edge->consumer();
+
bool fusable_post_op = false;
if(post_op_node != nullptr && post_op_node->output_edges().size() > 0)
{
- const auto post_op_output_edge_id = *post_op_node->output_edges().begin();
- const auto post_op_output_edge = g.edge(post_op_output_edge_id);
-
- if(post_op_output_edge != nullptr)
+ switch(post_op_node->type())
{
- switch(post_op_output_edge->producer()->type())
+ case EltwiseLayerNode::node_type:
{
- case EltwiseLayerNode::node_type:
+ auto *eltwise_node = arm_compute::utils::cast::polymorphic_downcast<EltwiseLayerNode *>(post_op_node);
+ ARM_COMPUTE_ERROR_ON(eltwise_node->output(0) == nullptr);
+ if(eltwise_node->output(0)->accessor() == nullptr)
{
- auto *eltwise_node = arm_compute::utils::cast::polymorphic_downcast<EltwiseLayerNode *>(post_op_output_edge->producer());
- ARM_COMPUTE_ERROR_ON(eltwise_node->output(0) == nullptr);
- if(eltwise_node->output(0)->accessor() == nullptr)
+ post_op_node_list.push_back(post_op_node);
+ fusable_post_op = true;
+ post_op_type_list[post_op_idx++] = eltwise_node->type();
+
+ // Extract elementwise inputs
+ const auto eltwise_input_id_0 = eltwise_node->input_edge(0)->producer_id();
+ const auto eltwise_input_id_1 = eltwise_node->input_edge(1)->producer_id();
+ if(eltwise_input_id_0 == prev_op_dst_id)
{
- post_op_node_list.push_back(post_op_output_edge->producer());
- fusable_post_op = true;
- post_op_type_list[post_op_idx++] = eltwise_node->type();
-
- // Extract elementwise inputs
- const auto eltwise_input_id_0 = eltwise_node->input_edge(0)->producer_id();
- const auto eltwise_input_id_1 = eltwise_node->input_edge(1)->producer_id();
- if(eltwise_input_id_0 == prev_op_dst_id)
- {
- eltwise_operand_id = eltwise_input_id_1;
- prev_op_dst_pos = 0;
- }
- else if(eltwise_input_id_1 == prev_op_dst_id)
- {
- eltwise_operand_id = eltwise_input_id_0;
- prev_op_dst_pos = 1;
- }
+ eltwise_operand_id = eltwise_input_id_1;
+ prev_op_dst_pos = 0;
}
- else
+ else if(eltwise_input_id_1 == prev_op_dst_id)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with elementwise due to the presence of an output accessor\n");
+ eltwise_operand_id = eltwise_input_id_0;
+ prev_op_dst_pos = 1;
}
- break;
}
- case ActivationLayerNode::node_type:
+ else
{
- auto *act_node = arm_compute::utils::cast::polymorphic_downcast<ActivationLayerNode *>(post_op_output_edge->producer());
- ARM_COMPUTE_ERROR_ON(act_node->output(0) == nullptr);
- // Check if activation is supported for fusion
- if(supported_fused_activations.count(act_node->activation_info().activation()) == 0)
- {
- break;
- }
- if(act_node->output(0)->accessor() == nullptr)
- {
- post_op_node_list.push_back(post_op_output_edge->producer());
- fusable_post_op = true;
- post_op_type_list[post_op_idx++] = act_node->type();
- prev_op_dst_id = act_node->id();
- }
- else
- {
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with activation due to the presence of an output accessor\n");
- }
- break;
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with elementwise due to the presence of an output accessor\n");
}
- default:
+ break;
+ }
+ case ActivationLayerNode::node_type:
+ {
+ auto *act_node = arm_compute::utils::cast::polymorphic_downcast<ActivationLayerNode *>(post_op_node);
+ ARM_COMPUTE_ERROR_ON(act_node->output(0) == nullptr);
+ // Check if activation is supported for fusion
+ if(supported_fused_activations.count(act_node->activation_info().activation()) == 0)
{
break;
}
+ if(act_node->output(0)->accessor() == nullptr)
+ {
+ post_op_node_list.push_back(post_op_node);
+ fusable_post_op = true;
+ post_op_type_list[post_op_idx++] = act_node->type();
+ prev_op_dst_id = act_node->id();
+ }
+ else
+ {
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to the presence of an output accessor\n");
+ }
+ break;
+ }
+ default:
+ {
+ break;
}
}
// Check if the node is not a branching node and current node is fusable
- if(post_op_node->output_edges().size() == 1 && fusable_post_op == true && post_op_node_list.size() < 3)
+ if(post_op_node->output_edges().size() == 1 && fusable_post_op == true)
{
- continue;
+ current_node = post_op_node;
}
else
{
@@ -534,12 +538,11 @@ std::list<INode *> get_post_op_list(Graph &g, int &eltwise_operand_id, int &prev
*
* Notes: currently, only GEMM supports fusion with post operator
*/
-template <typename N>
-void fuse_convolution(Graph &g, const Edge *output_edge, int conv_node_id, const std::set<Activation> &supported_fused_activations)
+void fuse_convolution_with_post_ops(Graph &g, const Edge *output_edge, unsigned int conv_node_id, const std::set<Activation> &supported_fused_activations)
{
ARM_COMPUTE_ERROR_ON(output_edge == nullptr);
- auto *conv_node = arm_compute::utils::cast::polymorphic_downcast<N *>(output_edge->producer());
+ auto *conv_node = arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(output_edge->producer());
ARM_COMPUTE_ERROR_ON(conv_node->output(0) == nullptr);
const ConvolutionMethod conv_algorithm = conv_node->convolution_method();
@@ -552,14 +555,14 @@ void fuse_convolution(Graph &g, const Edge *output_edge, int conv_node_id, const
// Prevent fusion if fused node has an output accessor
if(conv_node->output(0)->accessor() == nullptr)
{
- // If data type is FP32/FP16, data layout is NHWC, and filter size if 1x1, fuse convolution with post op, as Conv1x1 always leads to GEMM.
+ // If data type is FP32/FP16, data layout is NHWC, and filter size is 1x1, fuse convolution with post op, as Conv1x1 always leads to GEMM.
const Edge *input_edge = conv_node->input_edge(1);
if(input_edge != nullptr && input_edge->tensor() != nullptr)
{
const DataLayout data_layout = input_edge->tensor()->desc().layout;
const DataType data_type = input_edge->tensor()->desc().data_type;
const TensorShape tensor_shape = input_edge->tensor()->desc().shape;
- if(data_layout != DataLayout::NHWC || is_data_type_float(data_type) == false || tensor_shape.y() != 1 || tensor_shape.z() != 1)
+ if((data_layout != DataLayout::NHWC) || (is_data_type_float(data_type) == false) || (tensor_shape.y() != 1) || (tensor_shape.z() != 1))
{
ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to non GEMM convolution\n");
return;
@@ -603,7 +606,18 @@ void fuse_convolution(Graph &g, const Edge *output_edge, int conv_node_id, const
auto conv_bias_id = conv_node->input_edge(2)->producer_id();
g.add_connection(conv_bias_id, 0, fused_id, 2);
}
- g.add_connection(eltwise_operand_id, 0, fused_id, 3);
+ // Add the elementwise operand in case the post op is an elementwise operation
+ auto it = std::find_if(post_op_node_list.begin(),
+ post_op_node_list.end(),
+ [&](const INode * nd)
+ {
+ return (nd->type() == graph::NodeType::EltwiseLayer);
+ });
+
+ if(it != post_op_node_list.end())
+ {
+ g.add_connection(eltwise_operand_id, 0, fused_id, 3);
+ }
g.remove_node(conv_node->id());
// Update fused node outputs
@@ -623,6 +637,136 @@ void fuse_convolution(Graph &g, const Edge *output_edge, int conv_node_id, const
}
}
+void fuse_convolution_batch_normalization_with_post_ops(Graph &g, const Edge *output_edge, unsigned int conv_node_id, const std::set<Activation> &supported_fused_activations)
+{
+ ARM_COMPUTE_ERROR_ON(output_edge == nullptr);
+
+ auto *conv_node = arm_compute::utils::cast::polymorphic_downcast<FusedConvolutionBatchNormalizationNode *>(output_edge->producer());
+ ARM_COMPUTE_ERROR_ON(conv_node->output(0) == nullptr);
+ const ConvolutionMethod conv_algorithm = conv_node->convolution_method();
+ if(conv_algorithm != ConvolutionMethod::GEMM)
+ {
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to non GEMM convolution\n");
+ return;
+ }
+
+ // Prevent fusion if fused node has an output accessor
+ if(conv_node->output(0)->accessor() == nullptr)
+ {
+ // If data type is FP32/FP16, data layout is NHWC, and filter size is 1x1, fuse convolution with post op, as Conv1x1 always leads to GEMM.
+ const Edge *input_edge = conv_node->input_edge(1);
+ if(input_edge != nullptr && input_edge->tensor() != nullptr)
+ {
+ const DataLayout data_layout = input_edge->tensor()->desc().layout;
+ const DataType data_type = input_edge->tensor()->desc().data_type;
+ const TensorShape tensor_shape = input_edge->tensor()->desc().shape;
+ if((data_layout != DataLayout::NHWC) || (is_data_type_float(data_type) == false) || (tensor_shape.y() != 1) || (tensor_shape.z() != 1))
+ {
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to non GEMM convolution\n");
+ return;
+ }
+ }
+ else
+ {
+ return;
+ }
+
+ // Get post op list
+ int eltwise_operand_id = 0;
+ int prev_op_dst_pos = 0; // Previous operator dst's position in the current operator
+ std::list<INode *> post_op_node_list = get_post_op_list(g, eltwise_operand_id, prev_op_dst_pos, conv_node_id, supported_fused_activations);
+
+ if(post_op_node_list.size() == 0)
+ {
+ return;
+ }
+ else // Do convolution fusion with post ops if there is one (elementwise), two, or more operators
+ {
+ const Target assigned_target = conv_node->assigned_target();
+
+ // Extract conv inputs
+ const auto conv_input_id = conv_node->input_edge(0)->producer_id();
+ const auto conv_weights_id = conv_node->input_edge(1)->producer_id();
+ const auto bn_mean_id = conv_node->input_edge(3)->producer_id();
+ const auto bn_var_id = conv_node->input_edge(4)->producer_id();
+ const auto conv_info = conv_node->convolution_info();
+ const auto conv_method = conv_node->convolution_method();
+ const auto num_groups = conv_node->num_groups();
+ FastMathHint fast_math_hint = conv_node->fast_math_hint();
+
+ // Create the fused node
+
+ const float epsilon = conv_node->epsilon();
+ const NodeID fused_id = g.add_node<FusedConvolutionBatchNormalizationWithPostOpsNode>(epsilon, conv_info, num_groups, conv_method, fast_math_hint);
+
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing FusedConvolutionBatchNormalization node with ID : " << conv_node->id());
+
+ // Add connections from the conv inputs to the fused node
+ g.add_connection(conv_input_id, 0, fused_id, 0);
+ g.add_connection(conv_weights_id, 0, fused_id, 1);
+
+ if(conv_node->input_edge(2) != nullptr)
+ {
+ auto conv_bias_id = conv_node->input_edge(2)->producer_id();
+ g.add_connection(conv_bias_id, 0, fused_id, 2);
+ }
+ g.add_connection(bn_mean_id, 0, fused_id, 3);
+ g.add_connection(bn_var_id, 0, fused_id, 4);
+
+ // Move connections of old FusedConvolutionBatchNormalization to the fused node
+ if(conv_node->input_edge(5) != nullptr)
+ {
+ const auto bn_beta_id = conv_node->input_edge(5)->producer_id();
+ g.add_connection(bn_beta_id, 0, fused_id, 5);
+ }
+
+ if(conv_node->input_edge(6) != nullptr)
+ {
+ const auto bn_gamma_id = conv_node->input_edge(6)->producer_id();
+ g.add_connection(bn_gamma_id, 0, fused_id, 6);
+ }
+
+ // Add the elementwise operand in case the post op is an elementwise operation
+ auto it = std::find_if(post_op_node_list.begin(),
+ post_op_node_list.end(),
+ [&](const INode * nd)
+ {
+ return (nd->type() == graph::NodeType::EltwiseLayer);
+ });
+
+ if(it != post_op_node_list.end())
+ {
+ g.add_connection(eltwise_operand_id, 0, fused_id, 7);
+ }
+
+ // Update fused node outputs
+ auto fused_node = g.node(fused_id);
+ fused_node->set_assigned_target(assigned_target);
+
+ auto conv_node_name = conv_node->name();
+
+ // Collect the post ops' names
+ std::string post_ops_name = "";
+ for(auto &post_op : post_op_node_list)
+ {
+ post_ops_name += post_op->name();
+ }
+ fused_node->set_common_node_parameters(NodeParams{ conv_node->name() + "+" + post_ops_name, assigned_target });
+
+ // Fuse convolution with post op
+ fuse_convolution_with_post_op(g, fused_node, post_op_node_list, prev_op_dst_pos);
+
+ post_op_node_list.clear();
+ g.remove_node(conv_node->id());
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(std::endl);
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to the presence of an output accessor\n");
+ }
+}
+
template <typename N1, typename F, typename... Args>
void fuse_layer(Graph &g, std::function<bool(INode &)> const &prec, const F fuse_fcn, Args &&... optional_arguments)
{
@@ -692,16 +836,21 @@ void NodeFusionMutator::mutate(Graph &g)
// Fusion mutations
- detail::fuse_layer<ConvolutionLayerNode>(g, cl_target_prec, detail::fuse_convolution<ConvolutionLayerNode>, supported_fused_activations);
- detail::fuse_layer<ConvolutionLayerNode, ActivationLayerNode>(g, empty_prec, detail::fuse_node_with_activation<ConvolutionLayerNode>, supported_fused_activations);
- detail::fuse_layer<ConvolutionLayerNode, BatchNormalizationLayerNode>(g, empty_prec, detail::fuse_convolution_with_batch_normalization);
detail::fuse_layer<PadLayerNode, ConvolutionLayerNode>(g, empty_prec, detail::fuse_pad_with_convolution<ConvolutionLayerNode>);
detail::fuse_layer<PadLayerNode, DepthwiseConvolutionLayerNode>(g, empty_prec, detail::fuse_pad_with_convolution<DepthwiseConvolutionLayerNode>);
+ // The fusion of PostOps to ConvolutionLayer:
+ // It must occur after the fusion of PadLayer into ConvolutionLayer
+ // It must occur before the fusion of normal ActivationLayer into ConvolutionLayer as it takes precedence
+ detail::fuse_layer<ConvolutionLayerNode>(g, cl_target_prec, detail::fuse_convolution_with_post_ops, supported_fused_activations);
detail::fuse_layer<BatchNormalizationLayerNode, ActivationLayerNode>(g, empty_prec, detail::fuse_node_with_activation<BatchNormalizationLayerNode>, supported_fused_activations);
+ detail::fuse_layer<ConvolutionLayerNode, ActivationLayerNode>(g, empty_prec, detail::fuse_node_with_activation<ConvolutionLayerNode>, supported_fused_activations);
detail::fuse_layer<DepthwiseConvolutionLayerNode, ActivationLayerNode>(g, qs8_prec, detail::fuse_node_with_activation<DepthwiseConvolutionLayerNode>, supported_fused_activations);
- detail::fuse_layer<DepthwiseConvolutionLayerNode, BatchNormalizationLayerNode>(g, empty_prec, detail::fuse_depthwise_convolution_with_batch_normalization);
detail::fuse_layer<FullyConnectedLayerNode, ActivationLayerNode>(g, empty_prec, detail::fuse_node_with_activation<FullyConnectedLayerNode>, supported_fused_activations);
detail::fuse_layer<EltwiseLayerNode, ActivationLayerNode>(g, cl_target_prec, detail::fuse_node_with_activation<EltwiseLayerNode>, supported_fused_activations);
+ // The fusion of BatchNormalizationLayer must occur after the fusion of ActivationLayer, because FusedConvolutionBatchNormalizationNode assumes the BatchNormalization is already fused with the activation, if any
+ detail::fuse_layer<ConvolutionLayerNode, BatchNormalizationLayerNode>(g, empty_prec, detail::fuse_convolution_with_batch_normalization);
+ detail::fuse_layer<DepthwiseConvolutionLayerNode, BatchNormalizationLayerNode>(g, empty_prec, detail::fuse_depthwise_convolution_with_batch_normalization);
+ detail::fuse_layer<FusedConvolutionBatchNormalizationNode>(g, cl_target_prec, detail::fuse_convolution_batch_normalization_with_post_ops, supported_fused_activations);
}
} // namespace graph
} // namespace arm_compute
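The rewritten get_post_op_list no longer scans node IDs linearly; it walks the single output edge of the current node and collects up to three fusable post ops (an elementwise op or a supported activation), stopping at the first branching or non-fusable node. A simplified sketch of that traversal (a toy node type instead of the graph API; illustrative only):

#include <cstdio>
#include <list>
#include <vector>

struct Node
{
    bool                fusable;
    std::vector<Node *> consumers; // consumers of this node's single output
};

// Follow the single-consumer chain and collect up to three fusable nodes.
std::list<Node *> collect_post_ops(Node *conv)
{
    std::list<Node *> post_ops;
    Node             *current = conv;
    while(post_ops.size() < 3)
    {
        if(current->consumers.size() != 1 || !current->consumers.front()->fusable)
        {
            break;
        }
        current = current->consumers.front();
        post_ops.push_back(current);
    }
    return post_ops;
}

int main()
{
    Node act{ true, {} }, add{ true, { &act } }, conv{ true, { &add } };
    std::printf("%zu\n", collect_post_ops(&conv).size()); // prints 2 (add, then act)
    return 0;
}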
diff --git a/src/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.cpp b/src/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.cpp
new file mode 100644
index 000000000..af81f0369
--- /dev/null
+++ b/src/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.cpp
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+#include "arm_compute/graph/Utils.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+FusedConvolutionBatchNormalizationWithPostOpsNode::FusedConvolutionBatchNormalizationWithPostOpsNode(float epsilon, PadStrideInfo info,
+ unsigned int num_groups,
+ ConvolutionMethod method,
+ FastMathHint fast_math_hint)
+ : _epsilon(epsilon), _info(std::move(info)), _num_groups(num_groups), _method(method), _fast_math_hint(fast_math_hint)
+{
+ _input_edges.resize(8, EmptyEdgeID);
+ _outputs.resize(1, NullTensorID);
+}
+
+void FusedConvolutionBatchNormalizationWithPostOpsNode::set_convolution_method(ConvolutionMethod method)
+{
+ _method = method;
+}
+
+float FusedConvolutionBatchNormalizationWithPostOpsNode::epsilon() const
+{
+ return _epsilon;
+}
+
+ConvolutionMethod FusedConvolutionBatchNormalizationWithPostOpsNode::convolution_method() const
+{
+ return _method;
+}
+
+void FusedConvolutionBatchNormalizationWithPostOpsNode::set_fast_math_hint(FastMathHint hint)
+{
+ _fast_math_hint = hint;
+}
+
+FastMathHint FusedConvolutionBatchNormalizationWithPostOpsNode::fast_math_hint() const
+{
+ return _fast_math_hint;
+}
+
+PadStrideInfo FusedConvolutionBatchNormalizationWithPostOpsNode::convolution_info() const
+{
+ return _info;
+}
+
+unsigned int FusedConvolutionBatchNormalizationWithPostOpsNode::num_groups() const
+{
+ return _num_groups;
+}
+
+TensorDescriptor FusedConvolutionBatchNormalizationWithPostOpsNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
+ const TensorDescriptor &weights_descriptor,
+ const PadStrideInfo &info)
+{
+ unsigned int output_width = 0;
+ unsigned int output_height = 0;
+
+ const unsigned int input_width = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH);
+ const unsigned int input_height = get_dimension_size(input_descriptor, DataLayoutDimension::HEIGHT);
+ const unsigned int kernel_width = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH);
+ const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT);
+
+ std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
+
+ const DataLayout data_layout = input_descriptor.layout;
+ TensorDescriptor output_descriptor = input_descriptor;
+ output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width);
+ output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height);
+ output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]);
+
+ return output_descriptor;
+}
+
+bool FusedConvolutionBatchNormalizationWithPostOpsNode::forward_descriptors()
+{
+ if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
+ {
+ Tensor *dst = output(0);
+ ARM_COMPUTE_ERROR_ON(dst == nullptr);
+ dst->desc() = configure_output(0);
+ return true;
+ }
+ return false;
+}
+
+TensorDescriptor FusedConvolutionBatchNormalizationWithPostOpsNode::configure_output(size_t idx) const
+{
+ ARM_COMPUTE_UNUSED(idx);
+ const Tensor *src = input(0);
+ const Tensor *weights = input(1);
+
+ ARM_COMPUTE_ERROR_ON(src == nullptr || weights == nullptr);
+
+ TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), _info);
+
+ return output_info;
+}
+
+NodeType FusedConvolutionBatchNormalizationWithPostOpsNode::type() const
+{
+ return FusedConvolutionBatchNormalizationWithPostOpsNode::node_type;
+}
+
+void FusedConvolutionBatchNormalizationWithPostOpsNode::accept(INodeVisitor &v)
+{
+ v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/printers/DotGraphPrinter.cpp b/src/graph/printers/DotGraphPrinter.cpp
index 47371e34d..1071d5019 100644
--- a/src/graph/printers/DotGraphPrinter.cpp
+++ b/src/graph/printers/DotGraphPrinter.cpp
@@ -85,6 +85,14 @@ void DotGraphVisitor::visit(FusedConvolutionBatchNormalizationNode &n)
_info = ss.str();
}
+void DotGraphVisitor::visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n)
+{
+ ARM_COMPUTE_UNUSED(n);
+ std::stringstream ss;
+ ss << "FusedConvolutionBatchNormalizationWithPostOpsNode";
+ _info = ss.str();
+}
+
void DotGraphVisitor::visit(FusedConvolutionWithPostOpNode &n)
{
ARM_COMPUTE_UNUSED(n);
diff --git a/src/runtime/CL/functions/CLRemap.cpp b/src/runtime/CL/functions/CLRemap.cpp
deleted file mode 100644
index b9c62d848..000000000
--- a/src/runtime/CL/functions/CLRemap.cpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLRemap.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLRemapKernel.h"
-
-#include "src/common/utils/Log.h"
-
-namespace arm_compute
-{
-void CLRemap::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy,
- BorderMode border_mode, uint8_t constant_border_value)
-{
- configure(compile_context, input, map_x, map_y, output, policy, border_mode, PixelValue{ constant_border_value });
-}
-
-void CLRemap::configure(ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, map_x, map_y, output, policy, border_mode, PixelValue{ constant_border_value });
-}
-
-void CLRemap::configure(ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, map_x, map_y, output, policy, border_mode, constant_border_value);
-}
-
-void CLRemap::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy,
- BorderMode border_mode, PixelValue constant_border_value)
-{
- ARM_COMPUTE_LOG_PARAMS(input, map_x, map_y, output, policy, border_mode, constant_border_value);
- auto k = std::make_unique<CLRemapKernel>();
- k->configure(compile_context, input, map_x, map_y, output, RemapInfo{ policy, border_mode, constant_border_value });
- _kernel = std::move(k);
- _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, constant_border_value);
-}
-
-Status CLRemap::validate(const ITensorInfo *input, const ITensorInfo *map_x, const ITensorInfo *map_y, const ITensorInfo *output, const InterpolationPolicy policy, const BorderMode border_mode,
- PixelValue constant_border_value)
-{
- return CLRemapKernel::validate(input, map_x, map_y, output, RemapInfo{ policy, border_mode, constant_border_value });
-}
-} // namespace arm_compute
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index 3d7f1f16b..94a2f31d6 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -104,12 +104,12 @@ void set_thread_affinity(int core_id)
return;
}
-#if !defined(__APPLE__)
+#if !defined(__APPLE__) && !defined(__OpenBSD__)
cpu_set_t set;
CPU_ZERO(&set);
CPU_SET(core_id, &set);
ARM_COMPUTE_EXIT_ON_MSG(sched_setaffinity(0, sizeof(set), &set), "Error setting thread affinity");
-#endif /* !defined(__APPLE__) */
+#endif /* !defined(__APPLE__) && !defined(__OpenBSD__) */
}
/** There are currently 2 scheduling modes supported by CPPScheduler
diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp
index 004b8a46b..1d068c9b3 100644
--- a/src/runtime/IScheduler.cpp
+++ b/src/runtime/IScheduler.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/CPP/ICPPKernel.h"
#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Log.h"
#include "arm_compute/core/Window.h"
#include "src/common/cpuinfo/CpuInfo.h"
#include "src/runtime/SchedulerUtils.h"
@@ -138,6 +139,9 @@ void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const W
default:
ARM_COMPUTE_ERROR("Unknown strategy");
}
+ // Make sure the smallest window is larger than the minimum workload size
+ num_windows = adjust_num_of_windows(max_window, hints.split_dimension(), num_windows, *kernel, cpu_info());
+
std::vector<IScheduler::Workload> workloads(num_windows);
for(unsigned int t = 0; t < num_windows; ++t)
{
@@ -171,4 +175,37 @@ void IScheduler::run_tagged_workloads(std::vector<Workload> &workloads, const ch
run_workloads(workloads);
}
+std::size_t IScheduler::adjust_num_of_windows(const Window &window, std::size_t split_dimension, std::size_t init_num_windows, const ICPPKernel &kernel, const CPUInfo &cpu_info)
+{
+ // Mitigation of the narrow split issue, which occurs when the split dimension is too small to split (hence "narrow").
+ if(window.num_iterations(split_dimension) < init_num_windows)
+ {
+ auto recommended_split_dim = Window::DimX;
+ for(std::size_t dims = Window::DimY; dims <= Window::DimW; ++dims)
+ {
+ if(window.num_iterations(recommended_split_dim) < window.num_iterations(dims))
+ {
+ recommended_split_dim = dims;
+ }
+ }
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Dimension %lu is not a suitable dimension to split the workload. Recommended dimension: %lu", split_dimension,
+ recommended_split_dim);
+ }
+
+ for(auto t = init_num_windows; t > 0; --t) // Trying the highest number of windows, init_num_windows, first
+ {
+ // Try splitting the workload into t windows, requiring each sub-workload to contain at least the minimum workload size (mws).
+ if((window.num_iterations(split_dimension) / kernel.get_mws(cpu_info, t)) >= t)
+ {
+ if(t != init_num_windows)
+ {
+ ARM_COMPUTE_LOG_INFO_MSG_CORE("The scheduler is using a different thread count than the one assigned by the user.");
+ }
+ return t;
+ }
+ }
+ ARM_COMPUTE_LOG_INFO_MSG_CORE("The scheduler is using single thread instead of the thread count assigned by the user.");
+ return 1; // If the workload is so small that it can't be split, we should run a single thread
+}
+
} // namespace arm_compute
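
A minimal standalone sketch of the window-splitting heuristic added to IScheduler above, assuming a fixed minimum workload size (mws) per window instead of the kernel- and CPU-dependent value returned by ICPPKernel::get_mws(); the function and variable names here are illustrative only, not part of the library.

    #include <cstddef>
    #include <iostream>

    // Pick the largest window count t (t <= requested) such that each of the t
    // sub-workloads still contains at least mws iterations; otherwise fall back to 1.
    std::size_t adjust_num_of_windows_sketch(std::size_t num_iterations, std::size_t requested, std::size_t mws)
    {
        for(std::size_t t = requested; t > 0; --t)
        {
            if(num_iterations / mws >= t)
            {
                return t;
            }
        }
        return 1; // workload too small to split
    }

    int main()
    {
        // 100 iterations, 16 requested windows, at least 12 iterations per window -> 8 windows
        std::cout << adjust_num_of_windows_sketch(100, 16, 12) << '\n';
        return 0;
    }
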
diff --git a/src/runtime/NEON/functions/NEPadLayer.cpp b/src/runtime/NEON/functions/NEPadLayer.cpp
index e776e673c..8bacdd300 100644
--- a/src/runtime/NEON/functions/NEPadLayer.cpp
+++ b/src/runtime/NEON/functions/NEPadLayer.cpp
@@ -153,6 +153,11 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu
}
// Concatenate the padding before and after with the input.
ITensor *out = (i == _num_dimensions - 1) ? output : &_concat_results[i];
+ out->info()->set_quantization_info(output->info()->quantization_info());
+ for(auto &v : concat_vector)
+ {
+ v->info()->set_quantization_info(input->info()->quantization_info());
+ }
_concat_functions[i].configure(concat_vector, out, i);
if(i != _num_dimensions - 1)
{
diff --git a/src/runtime/NEON/functions/NERemap.cpp b/src/runtime/NEON/functions/NERemap.cpp
deleted file mode 100644
index 154479aed..000000000
--- a/src/runtime/NEON/functions/NERemap.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NERemap.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "src/common/utils/Log.h"
-#include "src/core/NEON/kernels/NERemapKernel.h"
-
-#include <utility>
-
-namespace arm_compute
-{
-void NERemap::configure(ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
-{
- configure(input, map_x, map_y, output, policy, border_mode, PixelValue{ constant_border_value });
-}
-void NERemap::configure(ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported");
- ARM_COMPUTE_LOG_PARAMS(input, map_x, map_y, output, policy, border_mode, constant_border_value);
-
- auto k = std::make_unique<NERemapKernel>();
- k->configure(input, map_x, map_y, output, policy, border_mode, constant_border_value.get<uint8_t>());
- _kernel = std::move(k);
-}
-} // namespace arm_compute
diff --git a/support/StringSupport.h b/support/StringSupport.h
index 5e237c7df..e8b3ca7ab 100644
--- a/support/StringSupport.h
+++ b/support/StringSupport.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -131,6 +131,12 @@ inline std::string to_string(T && value)
return stream.str();
}
+// Overload for const std::string&: return the value directly
+inline std::string to_string(const std::string &value)
+{
+ return value;
+}
+
/** Convert string values to float.
*
* @note This function implements the same behaviour as std::stof. The latter
@@ -164,6 +170,12 @@ inline std::string to_string(T &&value)
return ::std::to_string(std::forward<T>(value));
}
+// Overload for const std::string&: return the value directly
+inline std::string to_string(const std::string &value)
+{
+ return value;
+}
+
/** Convert string values to float.
*
* @note This function acts as a convenience wrapper around std::stof. The
diff --git a/support/ToolchainSupport.h b/support/ToolchainSupport.h
index d8c14411e..cddcd542c 100644
--- a/support/ToolchainSupport.h
+++ b/support/ToolchainSupport.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -143,12 +143,12 @@ inline T fma(T x, T y, T z)
* and writes the result to a character string buffer.
*
* @param[in] s Pointer to a character string to write to
- * @param[in] n Up to buf_size - 1 characters may be written, plus the null terminator
- * @param[in] fmt Pointer to a null-terminated multibyte string specifying how to interpret the data.
+ * @param[in] n Up to buf_size - 1 characters may be written, plus the null ending character
+ * @param[in] fmt Pointer to a null-ended multibyte string specifying how to interpret the data.
* @param[in] args Arguments forwarded to snprintf.
*
* @return Number of characters that would have been written for a sufficiently large buffer
- * if successful (not including the terminating null character), or a negative value if an error occurred.
+ * if successful (not including the ending null character), or a negative value if an error occurred.
*/
template <typename... Ts>
inline int snprintf(char *s, size_t n, const char *fmt, Ts &&... args)
@@ -258,12 +258,12 @@ inline T fma(T x, T y, T z)
* and writes the result to a character string buffer.
*
* @param[in] s Pointer to a character string to write to
- * @param[in] n Up to buf_size - 1 characters may be written, plus the null terminator
- * @param[in] fmt Pointer to a null-terminated multibyte string specifying how to interpret the data.
+ * @param[in] n Up to buf_size - 1 characters may be written, plus the null ending character
+ * @param[in] fmt Pointer to a null-ended multibyte string specifying how to interpret the data.
* @param[in] args Arguments forwarded to std::snprintf.
*
* @return Number of characters that would have been written for a sufficiently large buffer
- * if successful (not including the terminating null character), or a negative value if an error occurred.
+ * if successful (not including the ending null character), or a negative value if an error occurred.
*/
template <typename... Ts>
inline int snprintf(char *s, std::size_t n, const char *fmt, Ts &&... args)
@@ -309,6 +309,23 @@ inline bool isfinite(bfloat16 value)
{
return std::isfinite(float(value));
}
+
+// std::signbit
+template <typename T, typename = typename std::enable_if<std::is_arithmetic<T>::value>::type>
+inline bool signbit(T value)
+{
+ return std::signbit(value);
+}
+
+inline bool signbit(half_float::half value)
+{
+ return half_float::signbit(value);
+}
+
+inline bool signbit(bfloat16 value)
+{
+ return std::signbit(float(value));
+}
} // namespace cpp11
} // namespace support
} // namespace arm_compute
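
A small illustration of why the signbit wrappers above are needed: 16-bit float types such as half and bfloat16 are not accepted by std::signbit directly, so they are promoted to float first. The Fp16 struct below is a hypothetical stand-in, not a library type.

    #include <cmath>
    #include <iostream>

    struct Fp16 // hypothetical 16-bit float stand-in, convertible to float
    {
        float v;
        explicit operator float() const { return v; }
    };

    // Mirror of the wrapper pattern: promote to float, then query the sign bit
    inline bool signbit_sketch(Fp16 value)
    {
        return std::signbit(float(value));
    }

    int main()
    {
        std::cout << signbit_sketch(Fp16{ -0.5f }) << ' ' << signbit_sketch(Fp16{ 2.0f }) << '\n'; // prints: 1 0
        return 0;
    }
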
diff --git a/tests/framework/Framework.cpp b/tests/framework/Framework.cpp
index e59c5a45e..03c1db82c 100644
--- a/tests/framework/Framework.cpp
+++ b/tests/framework/Framework.cpp
@@ -529,7 +529,7 @@ void Framework::run_test(const TestInfo &info, TestCaseFactory &test_factory)
{
if(_stop_on_error)
{
- throw std::runtime_error("Abort on first error.");
+ throw std::runtime_error("Abandon on first error.");
}
}
diff --git a/tests/framework/Framework.h b/tests/framework/Framework.h
index 4c2e86c6e..274f03a92 100644
--- a/tests/framework/Framework.h
+++ b/tests/framework/Framework.h
@@ -230,13 +230,13 @@ public:
/** Indicates if test execution is stopped after the first failed test.
*
- * @return True if the execution is going to be aborted after the first failed test.
+ * @return True if the execution is going to be stopped after the first failed test.
*/
bool stop_on_error() const;
- /** Set whether to abort execution after the first failed test.
+ /** Set whether to stop execution after the first failed test.
*
- * @param[in] stop_on_error True if execution is going to be aborted after first failed test.
+ * @param[in] stop_on_error True if execution is going to be stopped after first failed test.
*/
void set_stop_on_error(bool stop_on_error);
diff --git a/tests/framework/instruments/Instruments.h b/tests/framework/instruments/Instruments.h
index 8a6cec0e9..28c994b11 100644
--- a/tests/framework/instruments/Instruments.h
+++ b/tests/framework/instruments/Instruments.h
@@ -24,12 +24,12 @@
#ifndef ARM_COMPUTE_TEST_INSTRUMENTS
#define ARM_COMPUTE_TEST_INSTRUMENTS
-#if !defined(BARE_METAL) && !defined(__APPLE__)
+#if !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__)
#include "MaliCounter.h"
#include "OpenCLMemoryUsage.h"
#include "OpenCLTimer.h"
#include "PMUCounter.h"
-#endif /* !defined(BARE_METAL) && !defined(__APPLE__) */
+#endif /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) */
#include "SchedulerTimer.h"
#include "WallClockTimer.h"
diff --git a/tests/main.cpp b/tests/main.cpp
index e1963b160..bc264de37 100644
--- a/tests/main.cpp
+++ b/tests/main.cpp
@@ -116,7 +116,7 @@ int main(int argc, char **argv)
auto filter_id = parser.add_option<utils::SimpleOption<std::string>>("filter-id");
filter_id->set_help("List of test ids. ... can be used to define a range.");
auto stop_on_error = parser.add_option<utils::ToggleOption>("stop-on-error");
- stop_on_error->set_help("Abort execution after the first failed test (useful for debugging)");
+ stop_on_error->set_help("Stop execution after the first failed test (useful for debugging)");
auto seed = parser.add_option<utils::SimpleOption<std::random_device::result_type>>("seed", std::random_device()());
seed->set_help("Global seed for random number generation");
auto list_tests = parser.add_option<utils::ToggleOption>("list-tests", false);
diff --git a/tests/validation/CL/Cast.cpp b/tests/validation/CL/Cast.cpp
index 2ca8b5804..84455ba94 100644
--- a/tests/validation/CL/Cast.cpp
+++ b/tests/validation/CL/Cast.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -48,6 +48,9 @@ constexpr AbsoluteTolerance<float> one_tolerance(1);
constexpr AbsoluteTolerance<float> zero_tolerance(0);
/** Input data sets **/
+// QASYMM8
+const auto CastQASYMM8toF32Dataset = combine(framework::dataset::make("DataType", DataType::QASYMM8), framework::dataset::make("DataType", DataType::F32));
+
// U8
const auto CastU8toS8Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::S8));
const auto CastU8toU16Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U16));
@@ -149,6 +152,9 @@ using CLCastToF32Fixture = CastValidationFixture<CLTensor, CLAccessor, CLCast, T
} \
TEST_SUITE_END()
+// QASYMM8
+CAST_SUITE(QASYMM8_to_F32, DataType::QASYMM8, DataType::F32, CLCastToF32Fixture<uint8_t>, CastQASYMM8toF32Dataset, zero_tolerance)
+
// U8
CAST_SUITE(U8_to_S8, DataType::U8, DataType::S8, CLCastToS8Fixture<uint8_t>, CastU8toS8Dataset, zero_tolerance)
CAST_SUITE(U8_to_U16, DataType::U8, DataType::U16, CLCastToU16Fixture<uint8_t>, CastU8toU16Dataset, zero_tolerance)
diff --git a/tests/validation/CL/DepthConvertLayer.cpp b/tests/validation/CL/DepthConvertLayer.cpp
index 8f14337b2..490b38ccf 100644
--- a/tests/validation/CL/DepthConvertLayer.cpp
+++ b/tests/validation/CL/DepthConvertLayer.cpp
@@ -62,7 +62,7 @@ TEST_SUITE(DepthConvertLayer)
// *INDENT-OFF*
// clang-format off
DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
- framework::dataset::make("InputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8), // Invalid data type combination
+ framework::dataset::make("InputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8), // Support upcasting from QASYMM8 to S16
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Invalid data type combination
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Mismatching shapes
TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8), // Invalid shift
@@ -84,7 +84,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
ConvertPolicy::WRAP,
})),
framework::dataset::make("Shift",{ 0, 0, 0, 1, 1, 0, })),
- framework::dataset::make("Expected", { false, false, false, false, false, true})),
+ framework::dataset::make("Expected", { true, false, false, false, false, true})),
input_info, output_info, policy, shift, expected)
{
ARM_COMPUTE_EXPECT(bool(CLDepthConvertLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), policy, shift)) == expected, framework::LogLevel::ERRORS);
diff --git a/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp b/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp
index ca63d3a67..cfd98bd8f 100644
--- a/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp
+++ b/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/tests/validation/CL/GEMMReshapeLHSMatrix.cpp b/tests/validation/CL/GEMMReshapeLHSMatrix.cpp
index 894b83701..0dd9b811f 100644
--- a/tests/validation/CL/GEMMReshapeLHSMatrix.cpp
+++ b/tests/validation/CL/GEMMReshapeLHSMatrix.cpp
@@ -66,8 +66,10 @@ const auto b_values = framework::dataset::make("batchsize", 1, 3);
/** M0 values to test */
const auto m0_values_s32 = framework::dataset::make("M0", { 2, 3 });
-const auto m0_values_s16 = framework::dataset::make("M0", { 4, 5 });
-const auto m0_values_s8 = framework::dataset::make("M0", { 6, 7, 8 });
+const auto m0_values_s16 = framework::dataset::make("M0", { 4 });
+const auto m0_values_s16_nt = framework::dataset::make("M0", { 5 });
+const auto m0_values_s8_nt = framework::dataset::make("M0", { 6, 7 });
+const auto m0_values_s8 = framework::dataset::make("M0", { 8 });
/** K0 values to test */
const auto k0_values_s32 = framework::dataset::make("K0", { 2, 3 });
@@ -101,6 +103,7 @@ FIXTURE_DATA_TEST_CASE(S32, CLGEMMReshapeLHSMatrixFixture<int>, framework::Datas
// Validate output
validate(CLAccessor(_target), _reference);
}
+
FIXTURE_DATA_TEST_CASE(S16, CLGEMMReshapeLHSMatrixFixture<short>, framework::DatasetMode::ALL,
combine(combine(combine(combine(combine(combine(combine(datasets::SmallGEMMReshape2DShapes(),
b_values),
@@ -114,6 +117,7 @@ FIXTURE_DATA_TEST_CASE(S16, CLGEMMReshapeLHSMatrixFixture<short>, framework::Dat
// Validate output
validate(CLAccessor(_target), _reference);
}
+
FIXTURE_DATA_TEST_CASE(S8, CLGEMMReshapeLHSMatrixFixture<char>, framework::DatasetMode::ALL,
combine(combine(combine(combine(combine(combine(combine(datasets::SmallGEMMReshape2DShapes(),
b_values),
@@ -128,6 +132,37 @@ FIXTURE_DATA_TEST_CASE(S8, CLGEMMReshapeLHSMatrixFixture<char>, framework::Datas
validate(CLAccessor(_target), _reference);
}
+TEST_SUITE(NotTransposed)
+FIXTURE_DATA_TEST_CASE(S16, CLGEMMReshapeLHSMatrixFixture<short>, framework::DatasetMode::ALL,
+ combine(combine(combine(combine(combine(combine(combine(datasets::SmallGEMMReshape2DShapes(),
+ b_values),
+ framework::dataset::make("DataType", DataType::S16)),
+ m0_values_s16_nt),
+ k0_values_s16),
+ v0_values),
+ i_values),
+ framework::dataset::make("transpose", { false })))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(S8, CLGEMMReshapeLHSMatrixFixture<char>, framework::DatasetMode::ALL,
+ combine(combine(combine(combine(combine(combine(combine(datasets::SmallGEMMReshape2DShapes(),
+ b_values),
+ framework::dataset::make("DataType", DataType::S8)),
+ m0_values_s8_nt),
+ k0_values_s8),
+ v0_values),
+ i_values),
+ framework::dataset::make("transpose", { false })))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+
+TEST_SUITE_END()
+
TEST_SUITE(ReinterpretInputAs3D)
FIXTURE_DATA_TEST_CASE(S32, CLGEMMReshapeLHSMatrix3DFixture<int>, framework::DatasetMode::ALL,
combine(combine(combine(combine(combine(combine(combine(datasets::SmallGEMMReshape3DShapes(),
diff --git a/tests/validation/CL/Remap.cpp b/tests/validation/CL/Remap.cpp
deleted file mode 100644
index 7849d7739..000000000
--- a/tests/validation/CL/Remap.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/functions/CLRemap.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/RemapFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr AbsoluteTolerance<uint8_t> tolerance_value(1);
-} // namespace
-
-TEST_SUITE(CL)
-TEST_SUITE(Remap)
-
-// *INDENT-OFF*
-// clang-format off
-
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
- framework::dataset::make("input", { TensorInfo(TensorShape(10U, 10U), 1, DataType::U8, DataLayout::NCHW),
- TensorInfo(TensorShape(10U, 10U), 1, DataType::U8, DataLayout::NHWC),
- TensorInfo(TensorShape(10U, 10U), 1, DataType::F16, DataLayout::NCHW),
- TensorInfo(TensorShape(10U, 10U), 1, DataType::F16, DataLayout::NHWC)
- }),
- framework::dataset::make("map_x", { TensorInfo(TensorShape(10U, 10U), 1, DataType::F32, DataLayout::NCHW),
- TensorInfo(TensorShape(10U, 10U), 1, DataType::F32, DataLayout::NHWC),
- TensorInfo(TensorShape(10U, 10U), 1, DataType::F32, DataLayout::NCHW),
- TensorInfo(TensorShape(10U, 10U), 1, DataType::F32, DataLayout::NHWC)
-
- })),
- framework::dataset::make("map_y", { TensorInfo(TensorShape(10U, 10U), 1, DataType::F32, DataLayout::NCHW),
- TensorInfo(TensorShape(10U, 10U), 1, DataType::F32, DataLayout::NHWC),
- TensorInfo(TensorShape(10U, 10U), 1, DataType::F32, DataLayout::NCHW),
- TensorInfo(TensorShape(10U, 10U), 1, DataType::F32, DataLayout::NHWC)
- })),
- framework::dataset::make("output", { TensorInfo(TensorShape(10U, 10U), 1, DataType::U8, DataLayout::NCHW),
- TensorInfo(TensorShape(10U, 10U), 1, DataType::U8, DataLayout::NHWC),
- TensorInfo(TensorShape(10U, 10U), 1, DataType::F16, DataLayout::NCHW),
- TensorInfo(TensorShape(10U, 10U), 1, DataType::F16, DataLayout::NHWC)
- })),
- framework::dataset::make("policy",{ InterpolationPolicy::NEAREST_NEIGHBOR,
- InterpolationPolicy::NEAREST_NEIGHBOR,
- InterpolationPolicy::NEAREST_NEIGHBOR,
- InterpolationPolicy::NEAREST_NEIGHBOR
- })),
- framework::dataset::make("border_mode",{ BorderMode::CONSTANT,
- BorderMode::CONSTANT,
- BorderMode::CONSTANT,
- BorderMode::CONSTANT
- })),
- framework::dataset::make("Expected", { true, // NCHW, U8
- true, // NHWC, U8
- false, // NCHW, F16
- true // NHWC, F16
- })),
- input, map_x, map_y, output, policy, border_mode, expected)
-{
- ARM_COMPUTE_EXPECT(bool(CLRemap::validate(&input, &map_x, &map_y, &output, policy, border_mode, PixelValue{})) == expected, framework::LogLevel::ERRORS);
-}
-// clang-format on
-// *INDENT-ON*
-template <typename T>
-using CLRemapFixture = RemapValidationFixture<CLTensor, CLAccessor, CLRemap, T>;
-template <typename T>
-using CLRemapLayoutFixture = RemapValidationMixedLayoutFixture<CLTensor, CLAccessor, CLRemap, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLRemapLayoutFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
- framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
- framework::dataset::make("DataType", DataType::U8)),
- framework::dataset::make("BorderModes", { BorderMode::UNDEFINED, BorderMode::CONSTANT })),
- framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
-{
- // Validate output
- validate(CLAccessor(_target), _reference, _valid_mask, tolerance_value);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLRemapFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
- framework::dataset::make("DataType", DataType::U8)),
- framework::dataset::make("BorderModes", { BorderMode::UNDEFINED, BorderMode::CONSTANT })))
-{
- // Validate output
- validate(CLAccessor(_target), _reference, _valid_mask, tolerance_value);
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(F16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLRemapLayoutFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
- framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
- framework::dataset::make("DataType", DataType::F16)),
- framework::dataset::make("BorderModes", { BorderMode::UNDEFINED, BorderMode::CONSTANT })),
- framework::dataset::make("DataLayout", DataLayout::NHWC)))
-{
- // Validate output
- validate(CLAccessor(_target), _reference, _valid_mask, tolerance_value);
-}
-TEST_SUITE_END() // F16
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/CL/UNIT/Multithreaded.cpp b/tests/validation/CL/UNIT/Multithreaded.cpp
new file mode 100644
index 000000000..5c75df709
--- /dev/null
+++ b/tests/validation/CL/UNIT/Multithreaded.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/RuntimeContext.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/ParametersLibrary.h"
+#include "tests/validation/Validation.h"
+#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"
+#include "tests/validation/reference/ActivationLayer.h"
+#include "tests/validation/reference/PixelWiseMultiplication.h"
+#include <thread>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+TEST_SUITE(UNIT)
+TEST_SUITE(RuntimeContext)
+// This test tries scheduling work concurrently from multiple independent threads
+TEST_CASE(MultipleThreadedScheduler, framework::DatasetMode::ALL)
+{
+ constexpr auto num_threads(16u);
+ std::array<CLActivationLayer, num_threads> func{};
+ std::array<CLPixelWiseMultiplication, num_threads> pmul{};
+ std::array<CLTensor, num_threads> s0{};
+ std::array<CLTensor, num_threads> s1{};
+
+ std::array<CLTensor, num_threads> st{};
+ std::array<CLTensor, num_threads> dt{};
+
+ const TensorShape tensor_shape(128u, 4u, 5u);
+ const ActivationLayerInfo ainfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 0.5f, 1.f);
+ std::array<std::thread, num_threads> threads;
+ auto ctx = parameters->get_ctx<CLTensor>();
+
+ for(auto i = 0u; i < num_threads; ++i)
+ {
+ s0[i] = create_tensor<CLTensor>(tensor_shape, DataType::F32, 1);
+ s1[i] = create_tensor<CLTensor>(tensor_shape, DataType::F32, 1);
+ st[i] = create_tensor<CLTensor>(tensor_shape, DataType::F32, 1);
+ dt[i] = create_tensor<CLTensor>(tensor_shape, DataType::F32, 1);
+ func[i] = CLActivationLayer(ctx);
+ pmul[i] = CLPixelWiseMultiplication();
+ threads[i] =
+ std::thread([&,i]
+ {
+ auto &s = st[i];
+ auto &t = dt[i];
+ auto &p0 = s0[i];
+ auto &p1 = s1[i];
+ pmul[i].configure(&p0, &p1, &s, 1.f, ConvertPolicy::WRAP, RoundingPolicy::TO_NEAREST_UP);
+ func[i].configure(&s, &t, ainfo);
+ s.allocator()->allocate();
+ t.allocator()->allocate();
+ p0.allocator()->allocate();
+ p1.allocator()->allocate();
+ library->fill_tensor_uniform(CLAccessor(p0), 0, -1.f, 1.f);
+ library->fill_tensor_uniform(CLAccessor(p1), 0, -1.f, 1.f);
+ pmul[i].run();
+ func[i].run();
+ });
+ }
+
+ for(auto &t : threads)
+ {
+ t.join();
+ }
+
+ SimpleTensor<float> rs{ tensor_shape, DataType::F32, 1 };
+ SimpleTensor<float> ra{ tensor_shape, DataType::F32, 1 };
+ SimpleTensor<float> rb{ tensor_shape, DataType::F32, 1 };
+ library->fill_tensor_uniform(ra, 0, -1.f, 1.f);
+ library->fill_tensor_uniform(rb, 0, -1.f, 1.f);
+ const auto mul = reference::pixel_wise_multiplication<float, float, float>(ra, rb, 1.f, ConvertPolicy::WRAP, RoundingPolicy::TO_NEAREST_UP, DataType::F32);
+ const auto golden = reference::activation_layer<float>(mul, ainfo);
+ for(auto &d : dt)
+ {
+ validate(CLAccessor(d), golden);
+ }
+}
+
+TEST_SUITE_END() // RuntimeContext
+TEST_SUITE_END() // UNIT
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/NEON/ActivationLayer.cpp b/tests/validation/NEON/ActivationLayer.cpp
index 69fe9053d..8d70ca541 100644
--- a/tests/validation/NEON/ActivationLayer.cpp
+++ b/tests/validation/NEON/ActivationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,6 +27,8 @@
#include "arm_compute/runtime/RuntimeContext.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/TensorAllocator.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+#include "src/cpu/kernels/CpuActivationKernel.h"
#include "tests/NEON/Accessor.h"
#include "tests/PaddingCalculator.h"
#include "tests/datasets/ActivationFunctionsDataset.h"
@@ -279,6 +281,43 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
bool is_valid = bool(NEActivationLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), act_info));
ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
}
+
+DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL, concat(concat(
+ combine(framework::dataset::make("CpuExt", std::string("NEON")),
+ framework::dataset::make("DataType", { DataType::F32,
+ DataType::F16,
+ DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED,
+ DataType::QSYMM16
+ })),
+ combine(framework::dataset::make("CpuExt", std::string("SVE")),
+ framework::dataset::make("DataType", { DataType::F32,
+ DataType::F16,
+ }))),
+ combine(framework::dataset::make("CpuExt", std::string("SVE2")),
+ framework::dataset::make("DataType", { DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED,
+ DataType::QSYMM16
+ }))),
+ cpu_ext, data_type)
+{
+ using namespace cpu::kernels;
+
+ cpuinfo::CpuIsaInfo cpu_isa{};
+ cpu_isa.neon = (cpu_ext == "NEON");
+ cpu_isa.sve = (cpu_ext == "SVE");
+ cpu_isa.sve2 = (cpu_ext == "SVE2");
+ cpu_isa.fp16 = (data_type == DataType::F16);
+
+ const auto *selected_impl = CpuActivationKernel::get_implementation(DataTypeISASelectorData{data_type, cpu_isa}, cpu::KernelSelectionType::Preferred);
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+
+ std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_activation";
+ std::string actual = selected_impl->name;
+
+ ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
// clang-format on
// *INDENT-ON*
diff --git a/tests/validation/NEON/ArithmeticAddition.cpp b/tests/validation/NEON/ArithmeticAddition.cpp
index f3e4dfc6e..c72e082a7 100644
--- a/tests/validation/NEON/ArithmeticAddition.cpp
+++ b/tests/validation/NEON/ArithmeticAddition.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,8 @@
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/TensorAllocator.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+#include "src/cpu/kernels/CpuAddKernel.h"
#include "tests/NEON/Accessor.h"
#include "tests/PaddingCalculator.h"
#include "tests/datasets/ConvertPolicyDataset.h"
@@ -85,6 +87,49 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
ConvertPolicy::WRAP);
ARM_COMPUTE_EXPECT(bool(s) == expected, framework::LogLevel::ERRORS);
}
+
+DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL, concat(concat(
+ combine(framework::dataset::make("CpuExt", std::string("NEON")),
+ framework::dataset::make("DataType", { DataType::F32,
+ DataType::F16,
+ DataType::U8,
+ DataType::S16,
+ DataType::S32,
+ DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED,
+ DataType::QSYMM16
+ })),
+ combine(framework::dataset::make("CpuExt", std::string("SVE")),
+ framework::dataset::make("DataType", { DataType::F32,
+ DataType::F16,
+ DataType::U8,
+ DataType::S16,
+ DataType::S32
+ }))),
+ combine(framework::dataset::make("CpuExt", std::string("SVE2")),
+ framework::dataset::make("DataType", { DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED,
+ DataType::QSYMM16
+ }))),
+ cpu_ext, data_type)
+{
+ using namespace cpu::kernels;
+
+ cpuinfo::CpuIsaInfo cpu_isa{};
+ cpu_isa.neon = (cpu_ext == "NEON");
+ cpu_isa.sve = (cpu_ext == "SVE");
+ cpu_isa.sve2 = (cpu_ext == "SVE2");
+ cpu_isa.fp16 = (data_type == DataType::F16);
+
+ const auto *selected_impl = CpuAddKernel::get_implementation(DataTypeISASelectorData{data_type, cpu_isa}, cpu::KernelSelectionType::Preferred);
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+
+ std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_add";
+ std::string actual = selected_impl->name;
+
+ ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
// clang-format on
// *INDENT-ON*
diff --git a/tests/validation/NEON/DirectConvolutionLayer.cpp b/tests/validation/NEON/DirectConvolutionLayer.cpp
index 368aef216..824741db5 100644
--- a/tests/validation/NEON/DirectConvolutionLayer.cpp
+++ b/tests/validation/NEON/DirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -70,8 +70,8 @@ const auto data_pad_f16 = concat(combine(framework::dataset::make("PadX", { 0, 1
framework::dataset::make("KernelSize", 1))));
const auto data_f32 = combine(datasets::SmallDirectConvolutionShapes(),
- combine(framework::dataset::make("StrideX", { 1, 2, 3 }),
- combine(framework::dataset::make("StrideY", { 1, 2, 3 }),
+ combine(framework::dataset::make("StrideX", { 1, 2, 3, 4 }),
+ combine(framework::dataset::make("StrideY", { 1, 2, 3, 4 }),
data_pad_f32)));
const auto data_f16 = combine(datasets::SmallDirectConvolutionShapes(),
@@ -87,17 +87,25 @@ const auto data_prec = combine(datasets::SmallDirectConvolutionShapes(),
framework::dataset::make("KernelSize", 3))))));
const auto data9x9 = combine(datasets::SmallDirectConvolutionShapes(),
- combine(framework::dataset::make("StrideX", { 1 }),
- combine(framework::dataset::make("StrideY", { 1 }),
+ combine(framework::dataset::make("StrideX", { 1, 2, 3 }),
+ combine(framework::dataset::make("StrideY", { 1, 2, 3 }),
combine(framework::dataset::make("PadX", { 0, 2 }),
combine(framework::dataset::make("PadY", { 0, 3 }),
framework::dataset::make("KernelSize", 9))))));
-const auto data_f32_nightly = combine(data_f32, framework::dataset::make("NumKernels", { 1, 4 }));
-const auto data_f16_nightly = combine(data_f16, framework::dataset::make("NumKernels", { 1, 4 }));
+const auto data8x8 = combine(datasets::SmallDirectConvolutionShapes(),
+ combine(framework::dataset::make("StrideX", { 1, 2, 3 }),
+ combine(framework::dataset::make("StrideY", { 1, 2, 3 }),
+ combine(framework::dataset::make("PadX", { 0 }),
+ combine(framework::dataset::make("PadY", { 0 }),
+ framework::dataset::make("KernelSize", 8))))));
+
+const auto data_f32_nightly = combine(data_f32, framework::dataset::make("NumKernels", { 1, 4, 5 }));
+const auto data_f16_nightly = combine(data_f16, framework::dataset::make("NumKernels", { 1, 4, 5 }));
const auto data_precommit = combine(data_prec, framework::dataset::make("NumKernels", { 1 }));
const auto data_precommit9x9 = combine(data9x9, framework::dataset::make("NumKernels", { 4 }));
+const auto data_precommit8x8 = combine(data8x8, framework::dataset::make("NumKernels", { 4 }));
/* The following tests is from real use-case that made DirectConvolution
* overflows in terms of its tensor indexing. This test case is using
@@ -319,13 +327,23 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectConvolutionLayerFixture<float>, framewo
validate(Accessor(_target), _reference, tolerance_fp32);
}
FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEDirectConvolutionLayerMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(data_precommit,
- framework::dataset::make("DataType", DataType::F32)),
- ActivationFunctionsDataset),
- framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+ framework::dataset::make("DataType", DataType::F32)),
+ ActivationFunctionsDataset),
+ framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_fp32);
}
+
+FIXTURE_DATA_TEST_CASE(RunSmall8x8, NEDirectConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(data_precommit8x8, framework::dataset::make("DataType",
+ DataType::F32)),
+ ActivationFunctionsDataset),
+ framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+ // Validate output
+ validate(Accessor(_target), _reference, tolerance_fp32);
+}
+
FIXTURE_DATA_TEST_CASE(RunSmall9x9, NEDirectConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(data_precommit9x9, framework::dataset::make("DataType",
DataType::F32)),
ActivationFunctionsDataset),
diff --git a/tests/validation/NEON/Floor.cpp b/tests/validation/NEON/Floor.cpp
index 419ce56e4..d3bd3e0b1 100644
--- a/tests/validation/NEON/Floor.cpp
+++ b/tests/validation/NEON/Floor.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,8 @@
#include "arm_compute/runtime/NEON/functions/NEFloor.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/TensorAllocator.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+#include "src/cpu/kernels/CpuFloorKernel.h"
#include "tests/NEON/Accessor.h"
#include "tests/PaddingCalculator.h"
#include "tests/datasets/ShapeDatasets.h"
@@ -62,6 +64,30 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
const Status status = NEFloor::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false));
ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
}
+
+
+DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL,
+ combine(framework::dataset::make("CpuExt", std::string("NEON")),
+ framework::dataset::make("DataType", { DataType::F32,
+ DataType::F16,
+ })),
+ cpu_ext, data_type)
+{
+ using namespace cpu::kernels;
+
+ cpuinfo::CpuIsaInfo cpu_isa{};
+ cpu_isa.neon = (cpu_ext == "NEON");
+ cpu_isa.fp16 = (data_type == DataType::F16);
+
+ const auto *selected_impl = CpuFloorKernel::get_implementation(DataTypeISASelectorData{data_type, cpu_isa}, cpu::KernelSelectionType::Preferred);
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+
+ std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_floor";
+ std::string actual = selected_impl->name;
+
+ ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
// clang-format on
// *INDENT-ON*
diff --git a/tests/validation/NEON/PoolingLayer.cpp b/tests/validation/NEON/PoolingLayer.cpp
index 77a501582..457610f2b 100644
--- a/tests/validation/NEON/PoolingLayer.cpp
+++ b/tests/validation/NEON/PoolingLayer.cpp
@@ -81,6 +81,14 @@ const auto qasymm8_signed_out_qinfo_dataset = framework::dataset::make("OutputQu
QuantizationInfo(.1f, -5), // Multiplier <= 1
QuantizationInfo(2.f, -3) // Multiplier > 1
});
+
+// Cases where pooling region is completely outside the input tensor (excluding global pooling)
+const auto pool_outside_input_dataset = zip(zip(zip(zip(
+ framework::dataset::make("Shape", { TensorShape{ 2U, 2U, 1U }, TensorShape{ 2U, 2U, 4U }, TensorShape{ 3U, 5U, 2U }, TensorShape{ 10U, 20U, 3U } }),
+ framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+ framework::dataset::make("PoolingSize", { Size2D{ 2, 2 }, Size2D{ 3, 3 }, Size2D{ 2, 2 }, Size2D{ 3, 6 } })),
+ framework::dataset::make("PadStride", { PadStrideInfo{ 1, 1, 2, 2 }, PadStrideInfo{ 1, 1, 4, 4 }, PadStrideInfo{ 1, 1, 3, 3 }, PadStrideInfo{ 1, 1, 2, 5 } })),
+ framework::dataset::make("ExcludePadding", { false, false, false, false }));
} // namespace
TEST_SUITE(NEON)
@@ -186,6 +194,16 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayerFixture<float>, framework::Datase
// Validate output
validate(Accessor(_target), _reference, tolerance_f32);
}
+TEST_SUITE(CornerCases)
+FIXTURE_DATA_TEST_CASE(PoolRegionCompletelyOutsideInput, NEPoolingLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(pool_outside_input_dataset,
+ framework::dataset::make("DataType",
+ DataType::F32)),
+ pool_data_layout_dataset))
+{
+ // Validate output
+ validate(Accessor(_target), _reference, tolerance_f32);
+}
+TEST_SUITE_END() // CornerCases
TEST_SUITE_END() // FP32
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
@@ -213,6 +231,16 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayerFixture<half>, framework::Dataset
// Validate output
validate(Accessor(_target), _reference, tolerance_f16);
}
+TEST_SUITE(CornerCases)
+FIXTURE_DATA_TEST_CASE(PoolRegionCompletelyOutsideInput, NEPoolingLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(pool_outside_input_dataset,
+ framework::dataset::make("DataType",
+ DataType::F16)),
+ pool_data_layout_dataset))
+{
+ // Validate output
+ validate(Accessor(_target), _reference, tolerance_f16);
+}
+TEST_SUITE_END() // CornerCases
TEST_SUITE_END() // FP16
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
TEST_SUITE_END() // Float
diff --git a/tests/validation/NEON/Remap.cpp b/tests/validation/NEON/Remap.cpp
deleted file mode 100644
index 3c02f8eec..000000000
--- a/tests/validation/NEON/Remap.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NERemap.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/RemapFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr AbsoluteTolerance<uint8_t> tolerance_value(0);
-constexpr float tolerance_number = 0.f;
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(Remap)
-
-template <typename T>
-using NERemapFixture = RemapValidationFixture<Tensor, Accessor, NERemap, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NERemapFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
- framework::dataset::make("DataType",
- DataType::U8)),
- framework::dataset::make("BorderModes", { BorderMode::UNDEFINED, BorderMode::CONSTANT })))
-{
- // Validate output
- validate(Accessor(_target), _reference, _valid_mask, tolerance_value, tolerance_number);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NERemapFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
- framework::dataset::make("DataType",
- DataType::U8)),
- framework::dataset::make("BorderModes", { BorderMode::UNDEFINED, BorderMode::CONSTANT })))
-{
- // Validate output
- validate(Accessor(_target), _reference, _valid_mask, tolerance_value, tolerance_number);
-}
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/UNIT/GPUTarget.cpp b/tests/validation/UNIT/GPUTarget.cpp
index d2c81cf77..b5eccf623 100644
--- a/tests/validation/UNIT/GPUTarget.cpp
+++ b/tests/validation/UNIT/GPUTarget.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -52,7 +52,7 @@ TEST_CASE(GetGPUTargetFromName, framework::DatasetMode::ALL)
ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G77") == GPUTarget::G77, framework::LogLevel::ERRORS);
ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G78") == GPUTarget::G78, framework::LogLevel::ERRORS);
ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G78AE") == GPUTarget::G78, framework::LogLevel::ERRORS);
- ARM_COMPUTE_EXPECT(get_target_from_name("Mali-TODX") == GPUTarget::TODX, framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G710") == GPUTarget::G710, framework::LogLevel::ERRORS);
ARM_COMPUTE_EXPECT(get_target_from_name("Mali-T000") == GPUTarget::MIDGARD, framework::LogLevel::ERRORS);
}
diff --git a/tests/validation/Validation.h b/tests/validation/Validation.h
index 638a1c20e..289aca4d0 100644
--- a/tests/validation/Validation.h
+++ b/tests/validation/Validation.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,6 +45,17 @@ namespace test
{
namespace validation
{
+namespace
+{
+// Compare whether two values are both infinities and "equal" (i.e. have the same sign)
+template <typename T>
+inline bool are_equal_infs(T val0, T val1)
+{
+ const auto same_sign = support::cpp11::signbit(val0) == support::cpp11::signbit(val1);
+ return (!support::cpp11::isfinite(val0)) && (!support::cpp11::isfinite(val1)) && same_sign;
+}
+} // namespace
+
/** Class reprensenting an absolute tolerance value. */
template <typename T>
class AbsoluteTolerance
@@ -296,9 +307,9 @@ struct compare<AbsoluteTolerance<U>> : public compare_base<AbsoluteTolerance<U>>
/** Perform comparison */
operator bool() const
{
- if(!support::cpp11::isfinite(this->_target) || !support::cpp11::isfinite(this->_reference))
+ if(are_equal_infs(this->_target, this->_reference))
{
- return false;
+ return true;
}
else if(this->_target == this->_reference)
{
@@ -322,9 +333,9 @@ struct compare<RelativeTolerance<U>> : public compare_base<RelativeTolerance<U>>
/** Perform comparison */
operator bool() const
{
- if(!support::cpp11::isfinite(this->_target) || !support::cpp11::isfinite(this->_reference))
+ if(are_equal_infs(this->_target, this->_reference))
{
- return false;
+ return true;
}
else if(this->_target == this->_reference)
{
@@ -494,9 +505,9 @@ void validate_wrap(const IAccessor &tensor, const SimpleTensor<T> &reference, co
// check for wrapping
if(!equal)
{
- if(!support::cpp11::isfinite(target_value) || !support::cpp11::isfinite(reference_value))
+ if(are_equal_infs(target_value, reference_value))
{
- equal = false;
+ equal = true;
}
else
{
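
For reference, a self-contained sketch of the infinity comparison that the are_equal_infs() helper above introduces: two values compare equal only when both are non-finite and carry the same sign. This uses plain std functions rather than the support::cpp11 wrappers.

    #include <cmath>
    #include <iostream>
    #include <limits>

    // Treat two values as equal when both are non-finite and share the same sign bit
    bool equal_infs(float a, float b)
    {
        return !std::isfinite(a) && !std::isfinite(b) && (std::signbit(a) == std::signbit(b));
    }

    int main()
    {
        const float inf = std::numeric_limits<float>::infinity();
        std::cout << equal_infs(inf, inf) << ' '    // 1: same-sign infinities match
                  << equal_infs(inf, -inf) << ' '   // 0: opposite signs do not
                  << equal_infs(inf, 1.0f) << '\n'; // 0: finite vs infinite never matches
        return 0;
    }
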
diff --git a/tests/validation/fixtures/GEMMFixture.h b/tests/validation/fixtures/GEMMFixture.h
index 52cd6759a..8b748032f 100644
--- a/tests/validation/fixtures/GEMMFixture.h
+++ b/tests/validation/fixtures/GEMMFixture.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/tests/validation/fixtures/PoolingLayerFixture.h b/tests/validation/fixtures/PoolingLayerFixture.h
index ec4e9f80d..6e9edfbb5 100644
--- a/tests/validation/fixtures/PoolingLayerFixture.h
+++ b/tests/validation/fixtures/PoolingLayerFixture.h
@@ -213,7 +213,7 @@ public:
template <typename...>
void setup(TensorShape src_shape, PoolingLayerInfo pool_info, DataType data_type)
{
- PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(src_shape, pool_info, data_type, DataLayout::NCHW);
+ PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(src_shape, pool_info, data_type, pool_info.data_layout);
}
};
diff --git a/tests/validation/fixtures/RemapFixture.h b/tests/validation/fixtures/RemapFixture.h
deleted file mode 100644
index 03cb6aef4..000000000
--- a/tests/validation/fixtures/RemapFixture.h
+++ /dev/null
@@ -1,192 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_REMAP_FIXTURE
-#define ARM_COMPUTE_TEST_REMAP_FIXTURE
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/reference/Remap.h"
-
-#include <random>
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class RemapValidationGenericFixture : public framework::Fixture
-{
-public:
- template <typename...>
- void setup(TensorShape shape, InterpolationPolicy policy, DataType data_type, BorderMode border_mode, DataLayout data_layout = DataLayout::NCHW)
- {
- std::mt19937 gen(library->seed());
- std::uniform_int_distribution<uint8_t> distribution(0, 255);
- PixelValue constant_border_value{ static_cast<T>(distribution(gen)) };
-
- _data_layout = data_layout;
- _target = compute_target(shape, policy, data_type, border_mode, constant_border_value);
- _reference = compute_reference(shape, policy, data_type, border_mode, constant_border_value);
- }
-
-protected:
- template <typename U>
- void fill(U &&tensor, int i, int min, int max)
- {
- switch(tensor.data_type())
- {
- case DataType::F32:
- {
- // map_x,y as integer values
- std::uniform_int_distribution<int> distribution(min, max);
- library->fill(tensor, distribution, i);
- break;
- }
- case DataType::F16:
- {
- arm_compute::utils::uniform_real_distribution_16bit<half> distribution(static_cast<float>(min), static_cast<float>(max));
- library->fill(tensor, distribution, i);
- break;
- }
- case DataType::U8:
- {
- std::uniform_int_distribution<uint8_t> distribution(min, max);
- library->fill(tensor, distribution, i);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("DataType for Remap not supported");
- }
- }
-
- TensorType compute_target(TensorShape shape, InterpolationPolicy policy, DataType data_type, BorderMode border_mode, PixelValue constant_border_value)
- {
- if(_data_layout == DataLayout::NHWC)
- {
- permute(shape, PermutationVector(2U, 0U, 1U));
- }
-
- // Create tensors
- TensorType src = create_tensor<TensorType>(shape, data_type, 1, QuantizationInfo(), _data_layout);
- TensorType map_x = create_tensor<TensorType>(shape, DataType::F32, 1, QuantizationInfo(), _data_layout);
- TensorType map_y = create_tensor<TensorType>(shape, DataType::F32, 1, QuantizationInfo(), _data_layout);
- TensorType dst = create_tensor<TensorType>(shape, data_type, 1, QuantizationInfo(), _data_layout);
-
- // Create and configure function
- FunctionType remap;
- remap.configure(&src, &map_x, &map_y, &dst, policy, border_mode, constant_border_value);
-
- ARM_COMPUTE_ASSERT(src.info()->is_resizable());
- ARM_COMPUTE_ASSERT(map_x.info()->is_resizable());
- ARM_COMPUTE_ASSERT(map_y.info()->is_resizable());
- ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
-
- // Allocate tensors
- src.allocator()->allocate();
- map_x.allocator()->allocate();
- map_y.allocator()->allocate();
- dst.allocator()->allocate();
-
- ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
- ARM_COMPUTE_ASSERT(!map_x.info()->is_resizable());
- ARM_COMPUTE_ASSERT(!map_y.info()->is_resizable());
- ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
-
- // Fill tensors
- int max_val = std::max({ shape.x(), shape.y(), shape.z() });
-
- fill(AccessorType(src), 0, 0, 255);
- fill(AccessorType(map_x), 1, -5, max_val);
- fill(AccessorType(map_y), 2, -5, max_val);
-
- // Compute function
- remap.run();
-
- return dst;
- }
-
- SimpleTensor<T> compute_reference(const TensorShape shape, InterpolationPolicy policy, DataType data_type, BorderMode border_mode, PixelValue constant_border_value)
- {
- ARM_COMPUTE_ERROR_ON(data_type != DataType::U8 && data_type != DataType::F16);
-
- // Create reference
- SimpleTensor<T> src{ shape, data_type };
- SimpleTensor<float> map_x{ shape, DataType::F32 };
- SimpleTensor<float> map_y{ shape, DataType::F32 };
- T border_value{};
- constant_border_value.get(border_value);
-
- // Create the valid mask Tensor
- _valid_mask = SimpleTensor<T> { shape, data_type };
-
- // Fill reference
- int max_val = std::max({ shape.x(), shape.y(), shape.z() });
-
- fill(src, 0, 0, 255);
- fill(map_x, 1, -5, max_val);
- fill(map_y, 2, -5, max_val);
-
- // Compute reference
- return reference::remap<T>(src, map_x, map_y, _valid_mask, policy, border_mode, border_value);
- }
-
- TensorType _target{};
- SimpleTensor<T> _reference{};
- SimpleTensor<T> _valid_mask{};
- DataLayout _data_layout{};
-};
-
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class RemapValidationFixture : public RemapValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
-{
-public:
- template <typename...>
- void setup(TensorShape shape, InterpolationPolicy policy, DataType data_type, BorderMode border_mode)
- {
- RemapValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, policy, data_type, border_mode);
- }
-};
-
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class RemapValidationMixedLayoutFixture : public RemapValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
-{
-public:
- template <typename...>
- void setup(TensorShape shape, InterpolationPolicy policy, DataType data_type, BorderMode border_mode, DataLayout data_layout)
- {
- RemapValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, policy, data_type, border_mode, data_layout);
- }
-};
-
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_REMAP_FIXTURE */
diff --git a/tests/validation/reference/FullyConnectedLayer.cpp b/tests/validation/reference/FullyConnectedLayer.cpp
index 21333958f..af30e9ee5 100644
--- a/tests/validation/reference/FullyConnectedLayer.cpp
+++ b/tests/validation/reference/FullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -123,7 +123,7 @@ SimpleTensor<T> fully_connected_layer(const SimpleTensor<T> &src, const SimpleTe
// Create reference
SimpleTensor<T> dst{ TensorShape{ dst_shape }, src.data_type(), 1, out_quant_info };
- // Sanity checks
+ // Health checks
const int num_batch_dimensions = std::max(0, static_cast<int>(dst_shape.num_dimensions()) - 1);
const int num_input_dimensions = src.shape().num_dimensions() - num_batch_dimensions;
const unsigned int linear_input_size = src.shape().total_size_lower(num_input_dimensions);
diff --git a/tests/validation/reference/PoolingLayer.cpp b/tests/validation/reference/PoolingLayer.cpp
index 5f4edfe49..9e671e317 100644
--- a/tests/validation/reference/PoolingLayer.cpp
+++ b/tests/validation/reference/PoolingLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,7 +88,7 @@ SimpleTensor<T> pooling_layer_internal(const SimpleTensor<T> &src, const Pooling
int hend = std::min(hstart + pool_size_y, h_src);
wstart = std::max(wstart, 0);
hstart = std::max(hstart, 0);
- auto max_val = std::numeric_limits<ACC_T>::lowest();
+ auto max_val = -std::numeric_limits<ACC_T>::infinity();
int max_index{ 0 };
for(int y = hstart; y < hend; ++y)
{
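The pooling hunk above replaces std::numeric_limits<ACC_T>::lowest() with -infinity as the seed of the running maximum. A minimal standalone sketch (float accumulator assumed; the helper name max_reduce is hypothetical and not taken from this patch) of why -infinity is the safer identity for a floating-point max reduction:

#include <algorithm>
#include <cstdio>
#include <limits>
#include <vector>

// Illustrative only: 'init' plays the role of the reference kernel's max_val seed.
static float max_reduce(const std::vector<float> &window, float init)
{
    float acc = init;
    for(float v : window)
    {
        acc = std::max(acc, v);
    }
    return acc;
}

int main()
{
    const float neg_inf = -std::numeric_limits<float>::infinity();
    // A window consisting only of -inf values: the true maximum is -inf.
    const std::vector<float> window = { neg_inf, neg_inf };

    // Seeding with lowest() clamps the result to a finite value (lowest() > -inf)...
    std::printf("lowest seed : %f\n", max_reduce(window, std::numeric_limits<float>::lowest()));
    // ...while seeding with -inf returns the correct maximum, since -inf is the
    // identity element of max over floats.
    std::printf("-inf seed   : %f\n", max_reduce(window, neg_inf));
    return 0;
}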
diff --git a/tests/validation/reference/Remap.cpp b/tests/validation/reference/Remap.cpp
deleted file mode 100644
index dfbe1aa12..000000000
--- a/tests/validation/reference/Remap.cpp
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "Remap.h"
-
-#include "Utils.h"
-#include "tests/validation/Helpers.h"
-
-#include <algorithm>
-#include <array>
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<T> remap(const SimpleTensor<T> &in, SimpleTensor<float> &map_x, SimpleTensor<float> &map_y, SimpleTensor<T> &valid_mask, InterpolationPolicy policy, BorderMode border_mode,
- T constant_border_value)
-{
- ARM_COMPUTE_ERROR_ON_MSG(border_mode == BorderMode::REPLICATE, "BorderMode not supported");
- SimpleTensor<T> out(in.shape(), in.data_type());
- ARM_COMPUTE_ERROR_ON(out.num_elements() != map_x.num_elements());
- const int width = in.shape().x();
- const int height = in.shape().y();
- const uint32_t num_elements = out.num_elements();
- for(uint32_t idx = 0; idx < num_elements; idx++)
- {
- const Coordinates id_out = index2coord(out.shape(), idx);
- valid_mask[idx] = 1;
- Coordinates src_idx = id_out; // need to setup all coordinates and not just xy
- if((0 <= map_y[idx]) && (map_y[idx] < height) && (0 <= map_x[idx]) && (map_x[idx] < width))
- {
- switch(policy)
- {
- case InterpolationPolicy::NEAREST_NEIGHBOR:
- {
- src_idx.set(0, static_cast<int>(std::floor(map_x[idx])));
- src_idx.set(1, static_cast<int>(std::floor(map_y[idx])));
- out[idx] = in[coord2index(in.shape(), src_idx)];
- break;
- }
- case InterpolationPolicy::BILINEAR:
- {
- (valid_bilinear_policy(map_x[idx], map_y[idx], width, height, border_mode)) ?
- out[idx] = bilinear_policy(in, src_idx, map_x[idx], map_y[idx], border_mode, constant_border_value) :
- valid_mask[idx] = 0;
- break;
- }
- case InterpolationPolicy::AREA:
- default:
- ARM_COMPUTE_ERROR("Interpolation not supported");
- break;
- }
- }
- else
- {
- if(border_mode == BorderMode::UNDEFINED)
- {
- valid_mask[idx] = 0;
- }
- else
- {
- switch(policy)
- {
- case InterpolationPolicy::NEAREST_NEIGHBOR:
- out[idx] = constant_border_value;
- break;
- case InterpolationPolicy::BILINEAR:
- out[idx] = bilinear_policy(in, src_idx, map_x[idx], map_y[idx], border_mode, constant_border_value);
- break;
- case InterpolationPolicy::AREA:
- default:
- break;
- }
- }
- }
- }
- return out;
-}
-
-template SimpleTensor<uint8_t> remap(const SimpleTensor<uint8_t> &src, SimpleTensor<float> &map_x, SimpleTensor<float> &map_y, SimpleTensor<uint8_t> &valid_mask, InterpolationPolicy policy,
- BorderMode border_mode,
- uint8_t constant_border_value);
-
-template SimpleTensor<half> remap(const SimpleTensor<half> &src, SimpleTensor<float> &map_x, SimpleTensor<float> &map_y, SimpleTensor<half> &valid_mask, InterpolationPolicy policy,
- BorderMode border_mode,
- half constant_border_value);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/utils/TypePrinter.h b/utils/TypePrinter.h
index e152cf6fc..664a39d15 100644
--- a/utils/TypePrinter.h
+++ b/utils/TypePrinter.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -2266,8 +2266,8 @@ inline ::std::ostream &operator<<(::std::ostream &os, const GPUTarget &gpu_targe
case GPUTarget::G31:
os << "G31";
break;
- case GPUTarget::TODX:
- os << "TODX";
+ case GPUTarget::G710:
+ os << "G710";
break;
default:
ARM_COMPUTE_ERROR("NOT_SUPPORTED!");