53 files changed, 1570 insertions, 1699 deletions
diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h
index af301c8d1..cd01659c0 100644
--- a/src/core/NEON/NEKernels.h
+++ b/src/core/NEON/NEKernels.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -55,7 +55,6 @@
 #include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h"
 #include "src/core/NEON/kernels/NERangeKernel.h"
 #include "src/core/NEON/kernels/NEReductionOperationKernel.h"
-#include "src/core/NEON/kernels/NERemapKernel.h"
 #include "src/core/NEON/kernels/NEReorgLayerKernel.h"
 #include "src/core/NEON/kernels/NEReverseKernel.h"
 #include "src/core/NEON/kernels/NESelectKernel.h"
diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
index 1e0a1742f..69bfd56ce 100644
--- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
+++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,8 +28,10 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
 #include "src/core/CPP/Validate.h"
+#include "src/core/common/Registrars.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/boundingboxtransform/list.h"
 
 #include <arm_neon.h>
 
@@ -37,6 +39,62 @@ namespace arm_compute
 {
 namespace
 {
+struct BoundingBoxTransformSelectorData
+{
+    DataType dt;
+};
+
+using BoundingBoxTransformSelctorPtr = std::add_pointer<bool(const BoundingBoxTransformSelectorData &data)>::type;
+using BoundingBoxTransformUKernelPtr = std::add_pointer<void(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)>::type;
+
+struct BoundingBoxTransformKernel
+{
+    const char                          *name;
+    const BoundingBoxTransformSelctorPtr is_selected;
+    BoundingBoxTransformUKernelPtr       ukernel;
+};
+
+static const BoundingBoxTransformKernel available_kernels[] =
+{
+    {
+        "fp32_neon_boundingboxtransform",
+        [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::F32; },
+        REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_boundingboxtransform)
+    },
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    {
+        "fp16_neon_boundingboxtransform",
+        [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::F16; },
+        REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_boundingboxtransform)
+    },
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if defined(ARM_COMPUTE_ENABLE_NEON)
+    {
+        "qu16_neon_boundingboxtransform",
+        [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::QASYMM16; },
+        REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_boundingboxtransform)
+    },
+#endif //defined(ARM_COMPUTE_ENABLE_NEON)
+};
+
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel else nullptr
+ */
+const BoundingBoxTransformKernel *get_implementation(const BoundingBoxTransformSelectorData &data)
+{
+    for(const auto &uk : available_kernels)
+    {
+        if(uk.is_selected(data))
+        {
+            return &uk;
+        }
+    }
+    return nullptr;
+}
+
 Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
@@ -112,145 +170,15 @@ Status NEBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, const IT
     return Status{};
 }
 
-template <>
-void NEBoundingBoxTransformKernel::internal_run<uint16_t>(const Window &window)
-{
-    const size_t num_classes  = _deltas->info()->tensor_shape()[0] >> 2;
-    const size_t deltas_width = _deltas->info()->tensor_shape()[0];
-    const int    img_h        = std::floor(_bbinfo.img_height() / _bbinfo.scale() + 0.5f);
-    const int    img_w        = std::floor(_bbinfo.img_width() / _bbinfo.scale() + 0.5f);
-
-    const auto scale_after  = (_bbinfo.apply_scale() ? _bbinfo.scale() : 1.f);
-    const auto scale_before = _bbinfo.scale();
-    const auto offset       = (_bbinfo.correct_transform_coords() ? 1.f : 0.f);
-
-    auto pred_ptr  = reinterpret_cast<uint16_t *>(_pred_boxes->buffer() + _pred_boxes->info()->offset_first_element_in_bytes());
-    auto delta_ptr = reinterpret_cast<uint8_t *>(_deltas->buffer() + _deltas->info()->offset_first_element_in_bytes());
-
-    const auto boxes_qinfo  = _boxes->info()->quantization_info().uniform();
-    const auto deltas_qinfo = _deltas->info()->quantization_info().uniform();
-    const auto pred_qinfo   = _pred_boxes->info()->quantization_info().uniform();
-
-    Iterator box_it(_boxes, window);
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const auto  ptr    = reinterpret_cast<uint16_t *>(box_it.ptr());
-        const auto  b0     = dequantize_qasymm16(*ptr, boxes_qinfo);
-        const auto  b1     = dequantize_qasymm16(*(ptr + 1), boxes_qinfo);
-        const auto  b2     = dequantize_qasymm16(*(ptr + 2), boxes_qinfo);
-        const auto  b3     = dequantize_qasymm16(*(ptr + 3), boxes_qinfo);
-        const float width  = (b2 / scale_before) - (b0 / scale_before) + 1.f;
-        const float height = (b3 / scale_before) - (b1 / scale_before) + 1.f;
-        const float ctr_x  = (b0 / scale_before) + 0.5f * width;
-        const float ctr_y  = (b1 / scale_before) + 0.5f * height;
-        for(size_t j = 0; j < num_classes; ++j)
-        {
-            // Extract deltas
-            const size_t delta_id = id.y() * deltas_width + 4u * j;
-            const float  dx       = dequantize_qasymm8(delta_ptr[delta_id], deltas_qinfo) / _bbinfo.weights()[0];
-            const float  dy       = dequantize_qasymm8(delta_ptr[delta_id + 1], deltas_qinfo) / _bbinfo.weights()[1];
-            float        dw       = dequantize_qasymm8(delta_ptr[delta_id + 2], deltas_qinfo) / _bbinfo.weights()[2];
-            float        dh       = dequantize_qasymm8(delta_ptr[delta_id + 3], deltas_qinfo) / _bbinfo.weights()[3];
-            // Clip dw and dh
-            dw = std::min(dw, _bbinfo.bbox_xform_clip());
-            dh = std::min(dh, _bbinfo.bbox_xform_clip());
-            // Determine the predictions
-            const float pred_ctr_x = dx * width + ctr_x;
-            const float pred_ctr_y = dy * height + ctr_y;
-            const float pred_w     = std::exp(dw) * width;
-            const float pred_h     = std::exp(dh) * height;
-            // Store the prediction into the output tensor
-            pred_ptr[delta_id]     = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_x - 0.5f * pred_w, 0.f, img_w - 1.f), pred_qinfo);
-            pred_ptr[delta_id + 1] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_y - 0.5f * pred_h, 0.f, img_h - 1.f), pred_qinfo);
-            pred_ptr[delta_id + 2] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_x + 0.5f * pred_w - offset, 0.f, img_w - 1.f), pred_qinfo);
-            pred_ptr[delta_id + 3] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_y + 0.5f * pred_h - offset, 0.f, img_h - 1.f), pred_qinfo);
-        }
-    },
-    box_it);
-}
-
-template <typename T>
-void NEBoundingBoxTransformKernel::internal_run(const Window &window)
-{
-    const size_t num_classes  = _deltas->info()->tensor_shape()[0] >> 2;
-    const size_t deltas_width = _deltas->info()->tensor_shape()[0];
-    const int    img_h        = std::floor(_bbinfo.img_height() / _bbinfo.scale() + 0.5f);
-    const int    img_w        = std::floor(_bbinfo.img_width() / _bbinfo.scale() + 0.5f);
-
-    const auto scale_after  = (_bbinfo.apply_scale() ? T(_bbinfo.scale()) : T(1));
-    const auto scale_before = T(_bbinfo.scale());
-    ARM_COMPUTE_ERROR_ON(scale_before <= 0);
-    const auto offset = (_bbinfo.correct_transform_coords() ? T(1.f) : T(0.f));
-
-    auto pred_ptr  = reinterpret_cast<T *>(_pred_boxes->buffer() + _pred_boxes->info()->offset_first_element_in_bytes());
-    auto delta_ptr = reinterpret_cast<T *>(_deltas->buffer() + _deltas->info()->offset_first_element_in_bytes());
-
-    Iterator box_it(_boxes, window);
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const auto ptr    = reinterpret_cast<T *>(box_it.ptr());
-        const auto b0     = *ptr;
-        const auto b1     = *(ptr + 1);
-        const auto b2     = *(ptr + 2);
-        const auto b3     = *(ptr + 3);
-        const T    width  = (b2 / scale_before) - (b0 / scale_before) + T(1.f);
-        const T    height = (b3 / scale_before) - (b1 / scale_before) + T(1.f);
-        const T    ctr_x  = (b0 / scale_before) + T(0.5f) * width;
-        const T    ctr_y  = (b1 / scale_before) + T(0.5f) * height;
-        for(size_t j = 0; j < num_classes; ++j)
-        {
-            // Extract deltas
-            const size_t delta_id = id.y() * deltas_width + 4u * j;
-            const T      dx       = delta_ptr[delta_id] / T(_bbinfo.weights()[0]);
-            const T      dy       = delta_ptr[delta_id + 1] / T(_bbinfo.weights()[1]);
-            T            dw       = delta_ptr[delta_id + 2] / T(_bbinfo.weights()[2]);
-            T            dh       = delta_ptr[delta_id + 3] / T(_bbinfo.weights()[3]);
-            // Clip dw and dh
-            dw = std::min(dw, T(_bbinfo.bbox_xform_clip()));
-            dh = std::min(dh, T(_bbinfo.bbox_xform_clip()));
-            // Determine the predictions
-            const T pred_ctr_x = dx * width + ctr_x;
-            const T pred_ctr_y = dy * height + ctr_y;
-            const T pred_w     = std::exp(dw) * width;
-            const T pred_h     = std::exp(dh) * height;
-            // Store the prediction into the output tensor
-            pred_ptr[delta_id]     = scale_after * utility::clamp<T>(pred_ctr_x - T(0.5f) * pred_w, T(0), T(img_w - 1));
-            pred_ptr[delta_id + 1] = scale_after * utility::clamp<T>(pred_ctr_y - T(0.5f) * pred_h, T(0), T(img_h - 1));
-            pred_ptr[delta_id + 2] = scale_after * utility::clamp<T>(pred_ctr_x + T(0.5f) * pred_w - offset, T(0), T(img_w - 1));
-            pred_ptr[delta_id + 3] = scale_after * utility::clamp<T>(pred_ctr_y + T(0.5f) * pred_h - offset, T(0), T(img_h - 1));
-        }
-    },
-    box_it);
-}
-
 void NEBoundingBoxTransformKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    switch(_boxes->info()->data_type())
-    {
-        case DataType::F32:
-        {
-            internal_run<float>(window);
-            break;
-        }
-        case DataType::QASYMM16:
-        {
-            internal_run<uint16_t>(window);
-            break;
-        }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::F16:
-        {
-            internal_run<float16_t>(window);
-            break;
-        }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        default:
-        {
-            ARM_COMPUTE_ERROR("Data type not supported");
-        }
-    }
+
+    const auto *uk = get_implementation(BoundingBoxTransformSelectorData{ _boxes->info()->data_type() });
+    ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+    uk->ukernel(_boxes, _pred_boxes, _deltas, _bbinfo, window);
 }
 } // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h
index c080ce6a5..def827836 100644
--- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h
+++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -83,9 +83,6 @@ public:
     void run(const Window &window, const ThreadInfo &info) override;
 
 private:
-    template <typename T>
-    void internal_run(const Window &window);
-
     const ITensor           *_boxes;
     ITensor                 *_pred_boxes;
     const ITensor           *_deltas;
diff --git a/src/core/NEON/kernels/NECropKernel.cpp b/src/core/NEON/kernels/NECropKernel.cpp
index fabbd6430..729402116 100644
--- a/src/core/NEON/kernels/NECropKernel.cpp
+++ b/src/core/NEON/kernels/NECropKernel.cpp
@@ -31,136 +31,92 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "src/core/CPP/Validate.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/common/Registrars.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/core/utils/helpers/bit_ops.h"
+#include "src/cpu/kernels/crop/generic/neon/list.h"
 
 namespace arm_compute
 {
 namespace
 {
-template <typename T>
-inline float32x4_t load_as_f32(T *ptr)
+struct CropSelectorData
 {
-    ARM_COMPUTE_UNUSED(ptr);
-    ARM_COMPUTE_ERROR("Type not supported.");
-}
-
-template <>
-inline float32x4_t load_as_f32(float *ptr)
-{
-    return wrapper::vloadq(ptr);
-}
-
-template <>
-inline float32x4_t load_as_f32(int32_t *ptr)
-{
-    return vcvtq_f32_s32(wrapper::vloadq(ptr));
-}
+    DataType dt;
+};
 
-template <>
-inline float32x4_t load_as_f32(uint32_t *ptr)
-{
-    return vcvtq_f32_u32(wrapper::vloadq(ptr));
-}
+using CropSelectorPtr = std::add_pointer<bool(const CropSelectorData &data)>::type;
+using CropUKernelPtr  = std::add_pointer<void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool)>::type;
 
-template <>
-inline float32x4_t load_as_f32(int16_t *ptr)
+struct CropUKernel
 {
-    return vcvtq_f32_s32(vmovl_s16(wrapper::vload(ptr)));
-}
+    const char           *name;
+    const CropSelectorPtr is_selected;
+    CropUKernelPtr        ukernel;
+};
 
-template <>
-inline float32x4_t load_as_f32(uint16_t *ptr)
+static const CropUKernel available_kernels[] =
 {
-    return vcvtq_f32_u32(vmovl_u16(wrapper::vload(ptr)));
-}
-
-template <>
-inline float32x4_t load_as_f32(uint8_t *ptr)
-{
-    return vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(wrapper::vload(ptr)))));
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <>
-inline float32x4_t load_as_f32(float16_t *ptr)
-{
-    return vcvt_f32_f16(wrapper::vload(ptr));
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-template <typename T>
-inline void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
-                                  int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
-{
-    // Reverse elements if width flipped.
-    if(is_width_flipped)
     {
-        // Collapse first dimension if possible.
-        if(input_has_single_channel)
-        {
-            int32_t     x = output_width_start;
-            Coordinates negative_offset(input_offset);
-            negative_offset.set(1, negative_offset[1] - window_step_x + 1);
-            for(; x <= output_width_limit - window_step_x; x += window_step_x, negative_offset[1] -= window_step_x)
-            {
-                auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(negative_offset)));
-
-                in = wrapper::vrev64(in);
-                in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in));
+        "fp16_neon_crop",
+        [](const CropSelectorData & data) { return data.dt == DataType::F16; },
+        REGISTER_FP16_NEON(arm_compute::cpu::fp16_in_bounds_crop_window)
+    },
+    {
+        "f32_neon_crop",
+        [](const CropSelectorData & data) { return data.dt == DataType::F32; },
+        REGISTER_FP32_NEON(arm_compute::cpu::fp32_in_bounds_crop_window)
+    },
+    {
+        "u8_neon_crop",
+        [](const CropSelectorData & data) { return data.dt == DataType::U8; },
+        REGISTER_INTEGER_NEON(arm_compute::cpu::u8_in_bounds_crop_window)
+    },
+    {
+        "u16_neon_crop",
+        [](const CropSelectorData & data) { return data.dt == DataType::U16; },
+        REGISTER_INTEGER_NEON(arm_compute::cpu::u16_in_bounds_crop_window)
+    },
+    {
+        "u32_neon_crop",
+        [](const CropSelectorData & data) { return data.dt == DataType::U32; },
+        REGISTER_INTEGER_NEON(arm_compute::cpu::u32_in_bounds_crop_window)
+    },
+    {
+        "s8_neon_crop",
+        [](const CropSelectorData & data) { return data.dt == DataType::S8; },
+        REGISTER_INTEGER_NEON(arm_compute::cpu::s8_in_bounds_crop_window)
+    },
+    {
+        "s16_neon_crop",
+        [](const CropSelectorData & data) { return data.dt == DataType::S16; },
+        REGISTER_INTEGER_NEON(arm_compute::cpu::s16_in_bounds_crop_window)
+    },
+    {
+        "s32_neon_crop",
+        [](const CropSelectorData & data) { return data.dt == DataType::S32; },
+        REGISTER_INTEGER_NEON(arm_compute::cpu::s32_in_bounds_crop_window)
+    },
+};
 
-                wrapper::vstore(output_ptr + x, in);
-            }
-            input_offset[1] = negative_offset[1] + window_step_x - 1;
-            for(; x < output_width_limit; ++x, --input_offset[1])
-            {
-                *(output_ptr + x) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
-            }
-        }
-        else
-        {
-            for(int32_t x = output_width_start; x < output_width_limit; ++x, --input_offset[1])
-            {
-                input_offset.set(0, 0);
-                int32_t c = 0;
-                for(; c <= static_cast<int32_t>(input->info()->dimension(0)) - window_step_x; c += window_step_x, input_offset[0] += window_step_x)
-                {
-                    auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
-                    wrapper::vstore(output_ptr + x * output->info()->dimension(0) + c, in);
-                }
-                for(; c < static_cast<int32_t>(input->info()->dimension(0)); ++c, ++input_offset[0])
-                {
-                    *(output_ptr + x * output->info()->dimension(0) + c) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
-                }
-            }
-        }
-    }
-    else
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel else nullptr
+ */
+const CropUKernel *get_implementation(const CropSelectorData &data)
+{
+    for(const auto &uk : available_kernels)
     {
-        // Use memcpy if the elements don't need converting to float.
-        if(std::is_same<T, float>::value)
+        if(uk.is_selected(data))
         {
-            memcpy(static_cast<void *>(output_ptr + output_width_start * output->info()->dimension(0)),
-                   reinterpret_cast<const void *>(input->ptr_to_element(input_offset)),
-                   (output_width_limit - output_width_start) * output->info()->dimension(0) * output->info()->element_size());
-        }
-        else
-        {
-            int32_t x                = 0;
-            int32_t limit            = (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0));
-            float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0);
-            for(; x <= limit - window_step_x; x += window_step_x, input_offset[0] += window_step_x)
-            {
-                auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
-                wrapper::vstore(output_start_ptr + x, in);
-            }
-            for(; x < limit; ++x, ++input_offset[0])
-            {
-                *(output_start_ptr + x) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
-            }
+            return &uk;
         }
     }
+
+    return nullptr;
 }
 
 inline void out_of_bounds_crop_window(const ITensor *output, float *output_ptr, float extrapolation_value,
@@ -234,8 +190,7 @@ inline void execute_window(const ITensor *input, const ITensor *output, Coordina
 } // namespace
 
 NECropKernel::NECropKernel()
-    : _input(nullptr), _crop_boxes(nullptr), _box_ind(nullptr), _output(nullptr), _start(), _end(), _crop_box_ind(0), _extrapolation_value(0), _rows_out_of_bounds(), _cols_out_of_bounds(),
-      _in_bounds_crop_function(nullptr)
+    : _input(nullptr), _crop_boxes(nullptr), _box_ind(nullptr), _output(nullptr), _start(), _end(), _crop_box_ind(0), _extrapolation_value(0), _rows_out_of_bounds(), _cols_out_of_bounds()
 {
 }
 
@@ -250,40 +205,14 @@ void NECropKernel::configure(const ITensor *input, const ITensor *crop_boxes, co
     _output              = output;
     _crop_box_ind        = crop_box_ind;
     _extrapolation_value = extrapolation_value;
-
-    switch(input->info()->data_type())
-    {
-        case DataType::F32:
-            _in_bounds_crop_function = &in_bounds_crop_window<float>;
-            break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::F16:
-            _in_bounds_crop_function = &in_bounds_crop_window<float16_t>;
-            break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-        case DataType::U32:
-            _in_bounds_crop_function = &in_bounds_crop_window<uint32_t>;
-            break;
-        case DataType::S32:
-            _in_bounds_crop_function = &in_bounds_crop_window<int32_t>;
-            break;
-        case DataType::U16:
-            _in_bounds_crop_function = &in_bounds_crop_window<uint16_t>;
-            break;
-        case DataType::S16:
-            _in_bounds_crop_function = &in_bounds_crop_window<int16_t>;
-            break;
-        case DataType::U8:
-            _in_bounds_crop_function = &in_bounds_crop_window<uint8_t>;
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Datatype not supported");
-    }
 }
 
 Status NECropKernel::validate(const ITensorInfo *input, const ITensorInfo *crop_boxes, const ITensorInfo *box_ind, const ITensorInfo *output, uint32_t crop_box_ind, float extrapolation_value)
 {
     ARM_COMPUTE_UNUSED(extrapolation_value);
+    const auto *uk = get_implementation(CropSelectorData{ input->data_type() });
+    ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::U16, DataType::S16, DataType::F16, DataType::U32, DataType::S32, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
@@ -369,10 +298,12 @@ void NECropKernel::run(const Window &window, const ThreadInfo &info)
     ARM_COMPUTE_ERROR_ON(_input->info()->has_padding());
     ARM_COMPUTE_ERROR_ON(_output->info()->has_padding());
 
+    const auto *uk = get_implementation(CropSelectorData{ _input->info()->data_type() });
+
     uint32_t    batch_index = *(reinterpret_cast<int32_t *>(_box_ind->ptr_to_element(Coordinates(_crop_box_ind))));
     Coordinates input_offset(0, _end[0] < _start[0] ? _start[0] - _cols_out_of_bounds[0] : _start[0] + _cols_out_of_bounds[0],
                              _end[1] < _start[1] ? _start[1] - _rows_out_of_bounds[0] : _start[1] + _rows_out_of_bounds[0], batch_index);
-    execute_window(_input, _output, input_offset, _extrapolation_value, _rows_out_of_bounds, _cols_out_of_bounds, _in_bounds_crop_function, _end[1] < _start[1],
+    execute_window(_input, _output, input_offset, _extrapolation_value, _rows_out_of_bounds, _cols_out_of_bounds, uk->ukernel, _end[1] < _start[1],
                    _cols_out_of_bounds[0] + _cols_out_of_bounds[1] < _output->info()->dimension(1), _cols_out_of_bounds[0] > 0, _cols_out_of_bounds[1] > 0,
                    _start[0] <= _end[0], _end[0] < _start[0]);
 }
diff --git a/src/core/NEON/kernels/NECropKernel.h b/src/core/NEON/kernels/NECropKernel.h
index 742215e22..6c989c1d2 100644
--- a/src/core/NEON/kernels/NECropKernel.h
+++ b/src/core/NEON/kernels/NECropKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -107,8 +107,6 @@ private:
     std::array<uint32_t, 2> _rows_out_of_bounds;
     /** The number of columns out of bounds at the start and end of output. */
     std::array<uint32_t, 2> _cols_out_of_bounds;
-
-    NECropKernel::InBoundsCropFunction *_in_bounds_crop_function;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NEON_CROP_KERNEL_H */
diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
index 56aed0ca2..7bba136e8 100644
--- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,15 +28,72 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
 #include "src/core/CPP/Validate.h"
+#include "src/core/common/Registrars.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
+#include "src/cpu/kernels/genproposals/list.h"
 #include <arm_neon.h>
 
 namespace arm_compute
 {
 namespace
 {
+struct ComputeAllAnchorsData
+{
+    DataType dt;
+};
+
+using ComputeAllAnchorsSelectorPtr = std::add_pointer<bool(const ComputeAllAnchorsData &data)>::type;
+using ComputeAllAnchorsUKernelPtr  = std::add_pointer<void(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)>::type;
+
+struct ComputeAllAnchorsKernel
+{
+    const char                        *name;
+    const ComputeAllAnchorsSelectorPtr is_selected;
+    ComputeAllAnchorsUKernelPtr        ukernel;
+};
+
+static const ComputeAllAnchorsKernel available_kernels[] =
+{
+#if defined(ARM_COMPUTE_ENABLE_NEON)
+    {
+        "neon_qu16_computeallanchors",
+        [](const ComputeAllAnchorsData & data) { return data.dt == DataType::QSYMM16; },
+        REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_computeallanchors)
+    },
+#endif //defined(ARM_COMPUTE_ENABLE_NEON)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    {
+        "neon_fp16_computeallanchors",
+        [](const ComputeAllAnchorsData & data) { return data.dt == DataType::F16; },
+        REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_computeallanchors)
+    },
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    {
+        "neon_fp32_computeallanchors",
+        [](const ComputeAllAnchorsData & data) { return data.dt == DataType::F32; },
+        REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_computeallanchors)
+    },
+};
+
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel else nullptr
+ */
+const ComputeAllAnchorsKernel *get_implementation(const ComputeAllAnchorsData &data)
+{
+    for(const auto &uk : available_kernels)
+    {
+        if(uk.is_selected(data))
+        {
+            return &uk;
+        }
+    }
+    return nullptr;
+}
+
 Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(anchors, all_anchors);
@@ -100,100 +157,15 @@ Status NEComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, const ITe
     return Status{};
 }
 
-template <>
-void NEComputeAllAnchorsKernel::internal_run<int16_t>(const Window &window)
-{
-    Iterator all_anchors_it(_all_anchors, window);
-    Iterator anchors_it(_all_anchors, window);
-
-    const size_t num_anchors = _anchors->info()->dimension(1);
-    const float  stride      = 1.f / _anchors_info.spatial_scale();
-    const size_t feat_width  = _anchors_info.feat_width();
-
-    const UniformQuantizationInfo qinfo = _anchors->info()->quantization_info().uniform();
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const size_t anchor_offset = id.y() % num_anchors;
-
-        const auto out_anchor_ptr = reinterpret_cast<int16_t *>(all_anchors_it.ptr());
-        const auto anchor_ptr     = reinterpret_cast<int16_t *>(_anchors->ptr_to_element(Coordinates(0, anchor_offset)));
-
-        const size_t shift_idy = id.y() / num_anchors;
-        const float  shiftx    = (shift_idy % feat_width) * stride;
-        const float  shifty    = (shift_idy / feat_width) * stride;
-
-        const float new_anchor_x1 = dequantize_qsymm16(*anchor_ptr, qinfo.scale) + shiftx;
-        const float new_anchor_y1 = dequantize_qsymm16(*(1 + anchor_ptr), qinfo.scale) + shifty;
-        const float new_anchor_x2 = dequantize_qsymm16(*(2 + anchor_ptr), qinfo.scale) + shiftx;
-        const float new_anchor_y2 = dequantize_qsymm16(*(3 + anchor_ptr), qinfo.scale) + shifty;
-
-        *out_anchor_ptr       = quantize_qsymm16(new_anchor_x1, qinfo.scale);
-        *(out_anchor_ptr + 1) = quantize_qsymm16(new_anchor_y1, qinfo.scale);
-        *(out_anchor_ptr + 2) = quantize_qsymm16(new_anchor_x2, qinfo.scale);
-        *(out_anchor_ptr + 3) = quantize_qsymm16(new_anchor_y2, qinfo.scale);
-    },
-    all_anchors_it);
-}
-
-template <typename T>
-void NEComputeAllAnchorsKernel::internal_run(const Window &window)
-{
-    Iterator all_anchors_it(_all_anchors, window);
-    Iterator anchors_it(_all_anchors, window);
-
-    const size_t num_anchors = _anchors->info()->dimension(1);
-    const T      stride      = 1.f / _anchors_info.spatial_scale();
-    const size_t feat_width  = _anchors_info.feat_width();
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const size_t anchor_offset = id.y() % num_anchors;
-
-        const auto out_anchor_ptr = reinterpret_cast<T *>(all_anchors_it.ptr());
-        const auto anchor_ptr     = reinterpret_cast<T *>(_anchors->ptr_to_element(Coordinates(0, anchor_offset)));
-
-        const size_t shift_idy = id.y() / num_anchors;
-        const T      shiftx    = (shift_idy % feat_width) * stride;
-        const T      shifty    = (shift_idy / feat_width) * stride;
-
-        *out_anchor_ptr       = *anchor_ptr + shiftx;
-        *(out_anchor_ptr + 1) = *(1 + anchor_ptr) + shifty;
-        *(out_anchor_ptr + 2) = *(2 + anchor_ptr) + shiftx;
-        *(out_anchor_ptr + 3) = *(3 + anchor_ptr) + shifty;
-    },
-    all_anchors_it);
-}
-
 void NEComputeAllAnchorsKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    switch(_anchors->info()->data_type())
-    {
-        case DataType::QSYMM16:
-        {
-            internal_run<int16_t>(window);
-            break;
-        }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::F16:
-        {
-            internal_run<float16_t>(window);
-            break;
-        }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::F32:
-        {
-            internal_run<float>(window);
-            break;
-        }
-        default:
-        {
-            ARM_COMPUTE_ERROR("Data type not supported");
-        }
-    }
+    const auto *uk = get_implementation(ComputeAllAnchorsData{ _anchors->info()->data_type() });
+    ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+    uk->ukernel(_anchors, _all_anchors, _anchors_info, window);
 }
 } // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
index f6d39e50a..297d6d4ab 100644
--- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
+++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -74,9 +74,6 @@ public:
     void run(const Window &window, const ThreadInfo &info) override;
 
 private:
-    template <typename T>
-    void internal_run(const Window &window);
-
     const ITensor     *_anchors;
     ITensor           *_all_anchors;
     ComputeAnchorsInfo _anchors_info;
diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
index d33431a8d..71641404b 100644
--- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,8 +34,10 @@
 #include "src/core/CPP/Validate.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/common/Registrars.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/instancenorm/list.h"
 
 #include <arm_neon.h>
 
@@ -43,137 +45,53 @@ namespace arm_compute
 {
 namespace
 {
-template <typename InputType, typename AccType = InputType>
-void vector_float_sum(AccType &result, AccType &result_square, const InputType &inputs)
+struct InstanceNormSelectorData
 {
-    result        = wrapper::vadd(result, inputs);
-    result_square = wrapper::vadd(result_square, wrapper::vmul(inputs, inputs));
-}
+    DataType dt;
+};
 
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <>
-inline void vector_float_sum(float32x4_t &result, float32x4_t &result_square, const float16x8_t &inputs)
-{
-    vector_float_sum(result, result_square, wrapper::vcvt<float>(wrapper::vgetlow(inputs)));
-    vector_float_sum(result, result_square, wrapper::vcvt<float>(wrapper::vgethigh(inputs)));
-}
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+using InstanceNormSelctorPtr = std::add_pointer<bool(const InstanceNormSelectorData &data)>::type;
+using InstanceNormUKernelPtr = std::add_pointer<void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, const Window &window)>::type;
 
-template <typename InputType, typename AccType = InputType>
-InputType vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta)
+struct InstanceNormKernel
 {
-    return wrapper::vadd(wrapper::vmul(wrapper::vsub(inputs, vec_mean), vec_multip), vec_beta);
-}
+    const char                  *name;
+    const InstanceNormSelctorPtr is_selected;
+    InstanceNormUKernelPtr       ukernel;
+};
 
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <>
-inline float16x8_t vector_float_norm(const float16x8_t &inputs, const float32x4_t &vec_mean, const float32x4_t &vec_multip, const float32x4_t &vec_beta)
+static const InstanceNormKernel available_kernels[] =
 {
-    const auto  input_low   = wrapper::vcvt<float>(wrapper::vgetlow(inputs));
-    const auto  input_high  = wrapper::vcvt<float>(wrapper::vgethigh(inputs));
-    const auto  result_low  = wrapper::vcvt<float16_t>(vector_float_norm(input_low, vec_mean, vec_multip, vec_beta));
-    const auto  result_high = wrapper::vcvt<float16_t>(vector_float_norm(input_high, vec_mean, vec_multip, vec_beta));
-    float16x8_t result      = wrapper::vcombine(result_low, result_high);
-
-    return result;
-}
+    {
+        "fp32_neon_instancenorm",
+        [](const InstanceNormSelectorData & data) { return data.dt == DataType::F32; },
+        REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_instancenorm)
+    },
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    {
+        "fp16_neon_instancenorm",
+        [](const InstanceNormSelectorData & data) { return data.dt == DataType::F16; },
+        REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_instancenorm)
+    },
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+};
 
-template <typename T, typename AccType = T>
-void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window)
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel else nullptr
+ */
+const InstanceNormKernel *get_implementation(const InstanceNormSelectorData &data)
 {
-    /** SIMD vector tag type. */
-    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
-
-    // Clear X/Y dimensions on execution window as we handle the planes manually
-    Window win = window;
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    win.set(Window::DimY, Window::Dimension(0, 1, 1));
-
-    constexpr int      window_step_x  = 16 / sizeof(T);
-    const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1);
-
-    Iterator input_it(input, win);
-    execute_window_loop(win, [&](const Coordinates & id)
+    for(const auto &uk : available_kernels)
     {
-        Window win_plane = window;
-        win_plane.set(Window::DimX, Window::Dimension(0, 1, 1));
-        win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1));
-        win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1));
-
-        Iterator input_plane_it(input, win_plane);
-        Iterator output_plane_it(output, win_plane);
-
-        auto sum_h_w         = static_cast<AccType>(0.f);
-        auto sum_squares_h_w = static_cast<AccType>(0.f);
-
-        execute_window_loop(win_plane, [&](const Coordinates &)
+        if(uk.is_selected(data))
         {
-            const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr());
-
-            auto vec_sum_h_w         = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
-            auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
-
-            // Compute S elements per iteration
-            int x = window.x().start();
-            for(; x <= (window.x().end() - window_step_x); x += window_step_x)
-            {
-                auto vec_input_val = wrapper::vloadq(input_ptr + x);
-                vector_float_sum(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val);
-            }
-
-            auto vec2_sum_h_w         = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w));
-            auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w));
-
-            vec2_sum_h_w         = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w);
-            vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w);
-
-            sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0);
-            sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0);
-
-            // Compute left-over elements
-            for(; x < window.x().end(); ++x)
-            {
-                const auto value = static_cast<AccType>(*(input_ptr + x));
-                sum_h_w += value;
-                sum_squares_h_w += value * value;
-            }
-        },
-        input_plane_it, output_plane_it);
-
-        const auto mean_h_w = sum_h_w / elements_plane;
-        const auto var_h_w  = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w;
-
-        const auto multip_h_w     = gamma / std::sqrt(var_h_w + epsilon);
-        const auto vec_mean_h_w   = wrapper::vdup_n(static_cast<AccType>(mean_h_w), ExactTagType{});
-        const auto vec_multip_h_w = wrapper::vdup_n(static_cast<AccType>(multip_h_w), ExactTagType{});
-        const auto vec_beta       = wrapper::vdup_n(static_cast<AccType>(beta), ExactTagType{});
-
-        execute_window_loop(win_plane, [&](const Coordinates &)
-        {
-            auto input_ptr  = reinterpret_cast<T *>(input_plane_it.ptr());
-            auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr());
-
-            // Compute S elements per iteration
-            int x = window.x().start();
-            //auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{});
-            for(; x <= (window.x().end() - window_step_x); x += window_step_x)
-            {
-                const auto vec_val        = wrapper::vloadq(input_ptr + x);
-                const auto normalized_vec = vector_float_norm(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta);
-                wrapper::vstore(output_ptr + x, normalized_vec);
-            }
-
-            // Compute left-over elements
-            for(; x < window.x().end(); ++x)
-            {
-                const auto val    = static_cast<AccType>(*(input_ptr + x));
-                *(output_ptr + x) = static_cast<T>((val - mean_h_w) * multip_h_w + beta);
-            }
-        },
-        input_plane_it, output_plane_it);
-    },
-    input_it);
+            return &uk;
+        }
+    }
+    return nullptr;
 }
 
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon)
@@ -210,7 +128,7 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe
 } // namespace
 
 NEInstanceNormalizationLayerKernel::NEInstanceNormalizationLayerKernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _gamma(1), _beta(0), _epsilon(1e-12)
+    : _input(nullptr), _output(nullptr), _gamma(1), _beta(0), _epsilon(1e-12)
 {
 }
 
@@ -227,28 +145,6 @@ void NEInstanceNormalizationLayerKernel::configure(ITensor *input, ITensor *outp
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _output->info(), _gamma, _beta, _epsilon));
 
-    if(_input->info()->data_type() == DataType::F32)
-    {
-        _func = &instance_normalization_nchw<float>;
-    }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    else if(_input->info()->data_type() == DataType::F16)
-    {
-        if(_use_mixed_precision)
-        {
-            _func = &instance_normalization_nchw<float16_t, float>;
-        }
-        else
-        {
-            _func = &instance_normalization_nchw<float16_t>;
-        }
-    }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    else
-    {
-        ARM_COMPUTE_ERROR("Unsupported data type");
-    }
-
     // Configure kernel window
     auto win_config = validate_and_configure_window(_input->info(), _output->info());
     ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
@@ -268,6 +164,10 @@ void NEInstanceNormalizationLayerKernel::run(const Window &window, const ThreadI
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    (*_func)(_input, _output, _gamma, _beta, _epsilon, window);
+
+    const auto *uk = get_implementation(InstanceNormSelectorData{ _input->info()->data_type() });
+    ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+    uk->ukernel(_input, _output, _gamma, _beta, _epsilon, _use_mixed_precision, window);
 }
 } // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
index 96c011971..f166ce205 100644
--- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
+++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -84,13 +84,12 @@ private:
      */
     using NormalizationFunction = void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
 
-    NormalizationFunction *_func;
-    ITensor               *_input;
-    ITensor               *_output;
-    float                  _gamma;
-    float                  _beta;
-    float                  _epsilon;
-    bool                   _use_mixed_precision{ true };
+    ITensor *_input;
+    ITensor *_output;
+    float    _gamma;
+    float    _beta;
+    float    _epsilon;
+    bool     _use_mixed_precision{ true };
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp
index 761fa1523..93da8a24c 100644
--- a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,9 +28,10 @@
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "src/core/CPP/Validate.h"
+#include "src/core/common/Registrars.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
+#include "src/cpu/kernels/maxunpool/list.h"
 #include "support/ToolchainSupport.h"
 
 namespace arm_compute
@@ -39,6 +40,67 @@ using namespace misc::shape_calculator;
 
 namespace
 {
+struct MaxUnpoolingSelectorData
+{
+    DataType dt;
+};
+
+using MaxUnpoolingSelctorPtr = std::add_pointer<bool(const MaxUnpoolingSelectorData &data)>::type;
+using MaxUnpoolingUKernelPtr = std::add_pointer<void(const ITensor *input, ITensor *output, const ITensor *indices, const Window &window)>::type;
+
+struct MaxUnpoolingKernel
+{
+    const char                  *name;
+    const MaxUnpoolingSelctorPtr is_selected;
+    MaxUnpoolingUKernelPtr       ukernel;
+};
+
+static const MaxUnpoolingKernel available_kernels[] =
+{
+    {
+        "fp32_neon_maxunpooling",
+        [](const MaxUnpoolingSelectorData & data) { return data.dt == DataType::F32; },
+        REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_maxunpooling)
+    },
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    {
+        "fp16_neon_maxunpooling",
+        [](const MaxUnpoolingSelectorData & data) { return data.dt == DataType::F16; },
+        REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_maxunpooling)
+    },
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if defined(ARM_COMPUTE_ENABLE_NEON)
+    {
+        "qs8_neon_maxunpooling",
+        [](const MaxUnpoolingSelectorData & data) { return data.dt == DataType::QASYMM8; },
+        REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qs8_maxunpooling)
+    },
+    {
+        "qu8_neon_maxunpooling",
+        [](const MaxUnpoolingSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
+        REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qu8_maxunpooling)
+    },
+#endif //defined(ARM_COMPUTE_ENABLE_NEON)
+};
+
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel else nullptr
+ */
+const MaxUnpoolingKernel *get_implementation(const MaxUnpoolingSelectorData &data)
+{
+    for(const auto &uk : available_kernels)
+    {
+        if(uk.is_selected(data))
+        {
+            return &uk;
+        }
+    }
+    return nullptr;
+}
+
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, indices);
@@ -69,7 +131,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
 } // namespace
 
 NEMaxUnpoolingLayerKernel::NEMaxUnpoolingLayerKernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _indices(nullptr)
+    : _input(nullptr), _output(nullptr), _indices(nullptr)
 {
 }
 
@@ -82,46 +144,12 @@ void NEMaxUnpoolingLayerKernel::configure(const ITensor *input, const ITensor *i
     _output  = output;
     _indices = indices;
 
-    switch(input->info()->data_type())
-    {
-        case DataType::F32:
-            _func = &NEMaxUnpoolingLayerKernel::unpooling2<float>;
-            break;
-        case DataType::QASYMM8:
-            _func = &NEMaxUnpoolingLayerKernel::unpooling2<uint8_t>;
-            break;
-        case DataType::QASYMM8_SIGNED:
-            _func = &NEMaxUnpoolingLayerKernel::unpooling2<int8_t>;
-            break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::F16:
-            _func = &NEMaxUnpoolingLayerKernel::unpooling2<float16_t>;
-            break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-        default:
-            break;
-    }
     const TensorShape output_shape = compute_unpool_shape(*input->info(), pool_info);
     auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
     auto window = calculate_max_window(*input->info(), Steps());
     INEKernel::configure(window);
 }
-template <typename T>
-void NEMaxUnpoolingLayerKernel::unpooling2(const Window &window)
-{
-    Iterator  input(_input, window);
-    Iterator  indices(_indices, window);
-    auto      out_ptr      = reinterpret_cast<T *>(_output->buffer());
-    const int out_stride_w = static_cast<int>(_output->info()->strides_in_bytes()[3]);
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        auto vindices                                         = reinterpret_cast<uint32_t *>(indices.ptr());
-        auto vinput                                           = reinterpret_cast<T *>(input.ptr());
-        out_ptr[id[3] * out_stride_w / sizeof(T) + *vindices] = *vinput;
-    },
-    input, indices);
-}
 
 Status NEMaxUnpoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
 {
@@ -135,8 +163,9 @@ void NEMaxUnpoolingLayerKernel::run(const Window &window, const ThreadInfo &info
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-    // Run function
-    (this->*_func)(window);
+    const auto *uk = get_implementation(MaxUnpoolingSelectorData{ _input->info()->data_type() });
+    ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+    uk->ukernel(_input, _output, _indices, window);
 }
 } // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h
index ecc116e58..f7f9a31f6 100644
--- a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h
+++ b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -88,10 +88,9 @@ private:
     using UnpoolingFunction = void (NEMaxUnpoolingLayerKernel::*)(const Window &window);
 
 private:
-    UnpoolingFunction _func;
-    const ITensor    *_input;
-    ITensor          *_output;
-    const ITensor    *_indices;
+    const ITensor *_input;
+    ITensor       *_output;
+    const ITensor *_indices;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NEMAXUNPOOLINGLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
index d1c7d4eb9..7d8fc7ec7 100644
--- a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,13 +31,64 @@
 #include "src/core/CPP/Validate.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/common/Registrars.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/meanstddevnorm/list.h"
 
 namespace arm_compute
 {
 namespace
 {
+struct MeanStdDevNormSelectorData
+{
+    DataType dt;
+};
+
+using MeanStdDevNormSelctorPtr = std::add_pointer<bool(const MeanStdDevNormSelectorData &data)>::type;
+using MeanStdDevNormUKernelPtr = std::add_pointer<void(ITensor *input, ITensor *output, float epsilon, const Window &window)>::type;
+
+struct MeanStdDevNormKernel
+{
+    const char                    *name;
+    const MeanStdDevNormSelctorPtr is_selected;
+    MeanStdDevNormUKernelPtr       ukernel;
+};
+
+static const MeanStdDevNormKernel available_kernels[] =
+{
+    {
+        "fp32_neon_meanstddevnorm",
+        [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::F32; },
+        REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_meanstddevnorm)
+    },
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    {
+        "fp16_neon_meanstddevnorm",
+        [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::F16; },
+        REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_meanstddevnorm)
+    },
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+};
+
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel else nullptr
+ */
+const MeanStdDevNormKernel *get_implementation(const MeanStdDevNormSelectorData &data)
+{
+    for(const auto &uk : available_kernels)
+    {
+        if(uk.is_selected(data))
+        {
+            return &uk;
+        }
+    }
+    return nullptr;
+}
+
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, float epsilon)
 {
     ARM_COMPUTE_UNUSED(epsilon);
@@ -72,80 +123,8 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 }
 } // namespace
 
-template <typename ScalarType, int size>
-void NEMeanStdDevNormalizationKernel::mean_stddev_normalization(const Window &window)
-{
-    using ExactTagType = typename wrapper::traits::neon_vector<ScalarType, size>::tag_type;
-
-    // Set build options
-    Window win = window;
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    const int  window_step_x  = size;
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-
-    Iterator input(_input, win);
-    Iterator output(_output, win);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        int  x       = window_start_x;
-        auto in_ptr  = reinterpret_cast<const ScalarType *>(input.ptr());
-        auto out_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-
-        auto sum_vec    = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{});
-        auto sum_sq_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{});
-
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
-        {
-            auto data  = wrapper::vloadq(in_ptr + x);
-            sum_vec    = wrapper::vadd(sum_vec, data);
-            sum_sq_vec = wrapper::vadd(sum_sq_vec, wrapper::vmul(data, data));
-        }
-
-        auto sum_carry_res    = wrapper::vpadd(wrapper::vgethigh(sum_vec), wrapper::vgetlow(sum_vec));
-        auto sum_sq_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_sq_vec), wrapper::vgetlow(sum_sq_vec));
-        for(int i = 0; i < size / 4; ++i)
-        {
-            sum_carry_res    = wrapper::vpadd(sum_carry_res, sum_carry_res);
-            sum_sq_carry_res = wrapper::vpadd(sum_sq_carry_res, sum_sq_carry_res);
-        }
-
-        auto sum    = wrapper::vgetlane(sum_carry_res, 0);
-        auto sum_sq = wrapper::vgetlane(sum_sq_carry_res, 0);
-
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            ScalarType data = *(in_ptr + x);
-            sum += data;
-            sum_sq += data * data;
-        }
-
-        ScalarType mean       = sum / _input->info()->dimension(0);
-        ScalarType var        = (sum_sq / _input->info()->dimension(0)) - (mean * mean);
-        ScalarType stddev_inv = 1.f / sqrt(var + _epsilon);
-
-        auto mean_vec       = wrapper::vdup_n(mean, ExactTagType{});
-        auto stddev_inv_vec = wrapper::vdup_n(stddev_inv, ExactTagType{});
-        for(x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x)
-        {
-            auto data = wrapper::vloadq(in_ptr + x);
-            auto res  = wrapper::vmul(wrapper::vsub(data, mean_vec), stddev_inv_vec);
-            // Store results
-            wrapper::vstore(out_ptr + x, res);
-        }
-        for(; x < window_end_x; ++x)
-        {
-            *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv;
-        }
-    },
-    input, output);
-}
-
 NEMeanStdDevNormalizationKernel::NEMeanStdDevNormalizationKernel()
-    : _input(nullptr), _output(nullptr), _epsilon(1e-8f), _func(nullptr)
+    : _input(nullptr), _output(nullptr), _epsilon(1e-8f)
 {
 }
 
@@ -163,23 +142,6 @@ void NEMeanStdDevNormalizationKernel::configure(ITensor *input, ITensor *output,
     auto win_config = validate_and_configure_window(input->info(), (output == nullptr) ? nullptr : output->info());
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICPPKernel::configure(win_config.second);
-
-    // Configure function to run based on different data types
-    const DataType data_type = input->info()->data_type();
-    switch(data_type)
-    {
-        case DataType::F32:
-            _func = &NEMeanStdDevNormalizationKernel::mean_stddev_normalization<float, 4>;
-            break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::F16:
-            _func = &NEMeanStdDevNormalizationKernel::mean_stddev_normalization<float16_t, 8>;
-            break;
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        default:
-            ARM_COMPUTE_ERROR("Not Supported");
-            break;
-    }
 }
 
 Status NEMeanStdDevNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, float epsilon)
@@ -194,8 +156,10 @@ void NEMeanStdDevNormalizationKernel::run(const Window &window, const ThreadInfo
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
 
-    (this->*_func)(window);
+    const auto *uk = get_implementation(MeanStdDevNormSelectorData{ _output->info()->data_type() });
+    ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+    uk->ukernel(_input, _output, _epsilon, window);
 }
 } // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h
index 59d073ada..844f0efdc 100644
--- a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h
+++ b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -91,8 +91,6 @@ private:
     float    _epsilon;
 
     using MeanStdDevNormFunction = void (NEMeanStdDevNormalizationKernel::*)(const Window &window);
-
-    MeanStdDevNormFunction _func;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NEMEANSTDDEVNORMALIZATIONKERNEL_H */
diff --git a/src/core/NEON/kernels/NEPadLayerKernel.cpp b/src/core/NEON/kernels/NEPadLayerKernel.cpp
index 60986812b..734510b63 100644
--- a/src/core/NEON/kernels/NEPadLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPadLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -261,9 +261,10 @@ void NEPadLayerKernel::run(const Window &window, const ThreadInfo &info)
 
 size_t NEPadLayerKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
 {
-    ARM_COMPUTE_UNUSED(platform, thread_count);
-
-    return ICPPKernel::small_network_mws;
+    ARM_COMPUTE_UNUSED(thread_count);
+    ARM_COMPUTE_UNUSED(platform);
+    
+    return ICPPKernel::default_mws;
 }
 
 } // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
index ece7e40e3..802aebb52 100644
--- a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,8 +30,10 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/misc/Utility.h"
 #include "src/core/CPP/Validate.h"
+#include "src/core/common/Registrars.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/roialign/list.h"
 
 #include <arm_neon.h>
 
@@ -41,6 +43,67 @@ namespace arm_compute
 {
 namespace
 {
+struct ROIAlignSelectorData
+{
+    DataType dt;
+};
+
+using ROIAlignSelctorPtr = std::add_pointer<bool(const ROIAlignSelectorData &data)>::type;
+using ROIAlignUKernelPtr = std::add_pointer<void(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)>::type;
+
+struct ROIAlignKernel
+{
+    const char              *name;
+    const ROIAlignSelctorPtr is_selected;
+    ROIAlignUKernelPtr       ukernel;
+};
+
+static const ROIAlignKernel available_kernels[] =
+{
+    {
+        "fp32_neon_roialign",
+        [](const ROIAlignSelectorData & data) { return data.dt == DataType::F32; },
+        REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_roialign)
+    },
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    {
+        "fp16_neon_roialign",
+        [](const ROIAlignSelectorData & data) { return data.dt == DataType::F16; },
+        REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_roialign)
+    },
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if defined(ARM_COMPUTE_ENABLE_NEON)
+    {
+        "qu8_neon_roialign",
+        [](const ROIAlignSelectorData & data) { return data.dt == DataType::QASYMM8; },
+        REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qu8_roialign)
+    },
+    {
+        "qs8_neon_roialign",
+        [](const ROIAlignSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
+        REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qs8_roialign)
+    },
+#endif //defined(ARM_COMPUTE_ENABLE_NEON)
+};
+
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel else nullptr
+ */
+const ROIAlignKernel *get_implementation(const ROIAlignSelectorData &data)
+{
+    for(const auto &uk : available_kernels)
+    {
+        if(uk.is_selected(data))
+        {
+            return &uk;
+        }
+    }
+    return nullptr;
+}
+
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output);
@@ -110,328 +173,19 @@ Status NEROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorIn
     return Status{};
 }
 
-/** Average pooling over an aligned window */
-template <typename input_data_type>
-inline input_data_type roi_align_1x1(const ITensor *input,
-                                     unsigned int   roi_batch,
-                                     float          region_start_x,
-                                     float          bin_size_x,
-                                     int            grid_size_x,
-                                     float          region_end_x,
-                                     float          region_start_y,
-                                     float          bin_size_y,
-                                     int            grid_size_y,
-                                     float          region_end_y,
-                                     int            pz)
-{
-    if((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
-    {
-        return input_data_type(0);
-    }
-    else
-    {
-        const DataLayout data_layout = input->info()->data_layout();
-        float            avg         = 0;
-        // Iterate through the aligned pooling region
-        for(int iy = 0; iy < grid_size_y; ++iy)
-        {
-            for(int ix = 0; ix < grid_size_x; ++ix)
-            {
-                // Align the window in the middle of every bin
-                float y = region_start_y + (iy + 0.5) * bin_size_y / float(grid_size_y);
-                float x = region_start_x + (ix + 0.5) * bin_size_x / float(grid_size_x);
-
-                // Interpolation in the [0,0] [0,1] [1,0] [1,1] square
-                const int y_low  = y;
-                const int x_low  = x;
-                const int y_high = y_low + 1;
-                const int x_high = x_low + 1;
-
-                const float ly = y - y_low;
-                const float lx = x - x_low;
-                const float hy = 1. - ly;
-                const float hx = 1. - lx;
-
-                const float w1 = hy * hx;
-                const float w2 = hy * lx;
-                const float w3 = ly * hx;
-                const float w4 = ly * lx;
-                if(data_layout == DataLayout::NCHW)
-                {
-                    const auto data1 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch)));
-                    const auto data2 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch)));
-                    const auto data3 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch)));
-                    const auto data4 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch)));
-                    avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
-                }
-                else
-                {
-                    const auto data1 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch)));
-                    const auto data2 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch)));
-                    const auto data3 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch)));
-                    const auto data4 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch)));
-                    avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
-                }
-            }
-        }
-
-        avg /= grid_size_x * grid_size_y;
-        return input_data_type(avg);
-    }
-}
-
-/** Average pooling over an aligned window */
-template <typename input_data_type>
-inline input_data_type roi_align_1x1_qasymm8(const ITensor          *input,
-                                             unsigned int            roi_batch,
-                                             float                   region_start_x,
-                                             float                   bin_size_x,
-                                             int                     grid_size_x,
-                                             float                   region_end_x,
-                                             float                   region_start_y,
-                                             float                   bin_size_y,
-                                             int                     grid_size_y,
-                                             float                   region_end_y,
-                                             int                     pz,
-                                             const QuantizationInfo &out_qinfo)
-{
-    if((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
-    {
-        return input_data_type(out_qinfo.uniform().offset);
-    }
-    else
-    {
-        float                         avg              = 0;
-        const UniformQuantizationInfo input_qinfo      = input->info()->quantization_info().uniform();
-        const bool                    is_qasymm_signed = is_data_type_quantized_asymmetric_signed(input->info()->data_type());
-        const DataLayout              data_layout      = input->info()->data_layout();
-
-        // Iterate through the aligned pooling region
-        for(int iy = 0; iy < grid_size_y; ++iy)
-        {
-            for(int ix = 0; ix < grid_size_x; ++ix)
-            {
-                // Align the window in the middle of every bin
-                float y = region_start_y + (iy + 0.5) * bin_size_y / float(grid_size_y);
-                float x = region_start_x + (ix + 0.5) * bin_size_x / float(grid_size_x);
-
-                // Interpolation in the [0,0] [0,1] [1,0] [1,1] square
-                const int y_low  = y;
-                const int x_low  = x;
-                const int y_high = y_low + 1;
-                const int x_high = x_low + 1;
-
-                const float ly = y - y_low;
-                const float lx = x - x_low;
-                const float hy = 1. - ly;
-                const float hx = 1. - lx;
-
-                const float w1 = hy * hx;
-                const float w2 = hy * lx;
-                const float w3 = ly * hx;
-                const float w4 = ly * lx;
-
-                if(data_layout == DataLayout::NCHW)
-                {
-                    if(is_qasymm_signed)
-                    {
-                        float data1 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))), input_qinfo);
-                        float data2 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))), input_qinfo);
-                        float data3 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))), input_qinfo);
-                        float data4 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))), input_qinfo);
-                        avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
-                    }
-                    else
-                    {
-                        float data1 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))), input_qinfo);
-                        float data2 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))), input_qinfo);
-                        float data3 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))), input_qinfo);
-                        float data4 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))), input_qinfo);
-                        avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
-                    }
-                }
-                else
-                {
-                    if(is_qasymm_signed)
-                    {
-                        const auto data1 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))), input_qinfo);
-                        const auto data2 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))), input_qinfo);
-                        const auto data3 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))), input_qinfo);
-                        const auto data4 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))), input_qinfo);
-                        avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
-                    }
-                    else
-                    {
-                        const auto data1 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))), input_qinfo);
-                        const auto data2 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))), input_qinfo);
-                        const auto data3 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))), input_qinfo);
-                        const auto data4 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))), input_qinfo);
-                        avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
-                    }
-                }
-            }
-        }
-
-        avg /= grid_size_x * grid_size_y;
-
-        input_data_type res = 0;
-        if(is_qasymm_signed)
-        {
-            res = quantize_qasymm8_signed(avg, out_qinfo);
-        }
-        else
-        {
-            res = quantize_qasymm8(avg, out_qinfo);
-        }
-        return res;
-    }
-}
-
-inline float compute_region_coordinate(int p, float bin_size, float roi_anchor, float max_value)
-{
-    const float region_start = p * bin_size + roi_anchor;
-    return utility::clamp(region_start, 0.0f, max_value);
-}
-
 void NEROIAlignLayerKernel::run(const Window &window, const ThreadInfo &info)
 {
     const DataLayout data_layout = _input->info()->data_layout();
     if(data_layout == DataLayout::NCHW || data_layout == DataLayout::NHWC)
     {
-        switch(_input->info()->data_type())
-        {
-            case DataType::QASYMM8:
-            {
-                NEROIAlignLayerKernel::internal_run<uint8_t, uint16_t>(window, info);
-                break;
-            }
-            case DataType::QASYMM8_SIGNED:
-            {
-                NEROIAlignLayerKernel::internal_run<int8_t, uint16_t>(window, info);
-                break;
-            }
-            case DataType::F32:
-            {
-                NEROIAlignLayerKernel::internal_run<float>(window, info);
-                break;
-            }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-            case DataType::F16:
-            {
-                NEROIAlignLayerKernel::internal_run<float16_t>(window, info);
-                break;
-            }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-            default:
-            {
-                ARM_COMPUTE_ERROR("DataType not supported");
-                break;
-            }
-        }
+        const auto *uk = get_implementation(ROIAlignSelectorData{ _input->info()->data_type() });
+        ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+        uk->ukernel(_input, _output, _rois, _pool_info, window, info);
     }
     else
     {
         ARM_COMPUTE_ERROR("Invalid layout");
     }
 }
-
-template <typename input_data_type, typename roi_data_type>
-void NEROIAlignLayerKernel::internal_run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-    const DataLayout data_layout    = _input->info()->data_layout();
-    const size_t     values_per_roi = _rois->info()->dimension(0);
-
-    const int roi_list_start = window.x().start();
-    const int roi_list_end   = window.x().end();
-
-    const unsigned int idx_width  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const unsigned int idx_depth  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-
-    const int input_width   = _input->info()->dimension(idx_width);
-    const int input_height  = _input->info()->dimension(idx_height);
-    const int input_chanels = _input->info()->dimension(idx_depth);
-    const int pooled_w      = _pool_info.pooled_width();
-    const int pooled_h      = _pool_info.pooled_height();
-
-    const DataType data_type = _input->info()->data_type();
-    const bool     is_qasymm = is_data_type_quantized_asymmetric(data_type);
-
-    const auto             *rois_ptr   = reinterpret_cast<const roi_data_type *>(_rois->buffer());
-    const QuantizationInfo &rois_qinfo = _rois->info()->quantization_info();
-    for(int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx)
-    {
-        const unsigned int roi_batch = rois_ptr[values_per_roi * roi_indx];
-
-        roi_data_type qx1 = rois_ptr[values_per_roi * roi_indx + 1];
-        roi_data_type qy1 = rois_ptr[values_per_roi * roi_indx + 2];
-        roi_data_type qx2 = rois_ptr[values_per_roi * roi_indx + 3];
-        roi_data_type qy2 = rois_ptr[values_per_roi * roi_indx + 4];
-        float         x1(qx1);
-        float         x2(qx2);
-        float         y1(qy1);
-        float         y2(qy2);
-        if(is_qasymm)
-        {
-            x1 = dequantize_qasymm16(qx1, rois_qinfo);
-            x2 = dequantize_qasymm16(qx2, rois_qinfo);
-            y1 = dequantize_qasymm16(qy1, rois_qinfo);
-            y2 = dequantize_qasymm16(qy2, rois_qinfo);
-        }
-        const float roi_anchor_x = x1 * _pool_info.spatial_scale();
-        const float roi_anchor_y = y1 * _pool_info.spatial_scale();
-        const float roi_dims_x   = std::max((x2 - x1) * _pool_info.spatial_scale(), 1.0f);
-        const float roi_dims_y   = std::max((y2 - y1) * _pool_info.spatial_scale(), 1.0f);
-        float       bin_size_x   = roi_dims_x / _pool_info.pooled_width();
-        float       bin_size_y   = roi_dims_y / _pool_info.pooled_height();
-
-        // Iterate through all feature maps
-        for(int ch = 0; ch < input_chanels; ++ch)
-        {
-            // Iterate through all output pixels
-            for(int py = 0; py < pooled_h; ++py)
-            {
-                for(int px = 0; px < pooled_w; ++px)
-                {
-                    const float     region_start_x = compute_region_coordinate(px, bin_size_x, roi_anchor_x, input_width);
-                    const float     region_start_y = compute_region_coordinate(py, bin_size_y, roi_anchor_y, input_height);
-                    const float     region_end_x   = compute_region_coordinate(px + 1, bin_size_x, roi_anchor_x, input_width);
-                    const float     region_end_y   = compute_region_coordinate(py + 1, bin_size_y, roi_anchor_y, input_height);
-                    const int       roi_bin_grid_x = (_pool_info.sampling_ratio() > 0) ? _pool_info.sampling_ratio() : int(ceil(bin_size_x));
-                    const int       roi_bin_grid_y = (_pool_info.sampling_ratio() > 0) ? _pool_info.sampling_ratio() : int(ceil(bin_size_y));
-                    input_data_type out_val(0);
-                    if(is_qasymm)
-                    {
-                        out_val = roi_align_1x1_qasymm8<input_data_type>(
-                                      _input, roi_batch, region_start_x, bin_size_x,
-                                      roi_bin_grid_x, region_end_x, region_start_y, bin_size_y,
-                                      roi_bin_grid_y, region_end_y, ch, _output->info()->quantization_info());
-                    }
-                    else
-                    {
-                        out_val = roi_align_1x1<input_data_type>(
-                                      _input, roi_batch, region_start_x, bin_size_x,
-                                      roi_bin_grid_x, region_end_x, region_start_y, bin_size_y,
-                                      roi_bin_grid_y, region_end_y, ch);
-                    }
-
-                    if(data_layout == DataLayout::NCHW)
-                    {
-                        auto out_ptr = reinterpret_cast<input_data_type *>(_output->ptr_to_element(Coordinates(px, py, ch, roi_indx)));
-                        *out_ptr     = out_val;
-                    }
-                    else
-                    {
-                        auto out_ptr = reinterpret_cast<input_data_type *>(_output->ptr_to_element(Coordinates(ch, px, py, roi_indx)));
-                        *out_ptr     = out_val;
-                    }
-                }
-            }
-        }
-    }
-}
 } // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEROIAlignLayerKernel.h b/src/core/NEON/kernels/NEROIAlignLayerKernel.h
index fa31a879b..48a3de728 100644
--- a/src/core/NEON/kernels/NEROIAlignLayerKernel.h
+++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -89,9 +89,6 @@ public:
     void run(const Window &window, const ThreadInfo &info) override;
 
 private:
-    template <typename input_data_type, typename roi_data_type = input_data_type>
-    void internal_run(const Window &window, const ThreadInfo &info);
-
     const ITensor      *_input;
     ITensor            *_output;
     const ITensor      *_rois;
diff --git a/src/core/NEON/kernels/NERangeKernel.cpp b/src/core/NEON/kernels/NERangeKernel.cpp
index 0395e0bd3..82d1403c5 100644
--- a/src/core/NEON/kernels/NERangeKernel.cpp
+++ b/src/core/NEON/kernels/NERangeKernel.cpp
@@ -30,68 +30,96 @@
 #include "arm_compute/core/Validate.h"
 #include "src/core/NEON/NEAsymm.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/common/Registrars.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
-#include "arm_compute/core/Utils.h"
+#include "src/cpu/kernels/range/list.h"
 
 namespace arm_compute
 {
 namespace
 {
-template <typename T>
-void range_function(ITensor *output, float start, float step, const Window &window)
+struct RangeSelectorData
 {
-    /** SIMD vector tag type. */
-    using ExactTagType = typename wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>::tag_type;
-
-    const auto step_vec  = wrapper::vdup_n(static_cast<T>(step), ExactTagType{});
-    const auto start_vec = wrapper::vdup_n(static_cast<T>(start), ExactTagType{});
-    auto       id_vec    = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+    DataType dt;
+};
 
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-    const int  window_step_x  = 16 / sizeof(T);
+using RangeSelectorPtr = std::add_pointer<bool(const RangeSelectorData &data)>::type;
+using RangeUKernelPtr  = std::add_pointer<void(ITensor *, float, float, const Window &)>::type;
 
-    Window win{ window };
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    Iterator output_it(output, win);
+struct RangeUKernel
+{
+    const char            *name;
+    const RangeSelectorPtr is_selected;
+    RangeUKernelPtr        ukernel;
+};
 
-    execute_window_loop(win, [&](const Coordinates &)
+static const RangeUKernel available_kernels[] =
+{
     {
-        int        x       = window_start_x;
-        const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
-        {
-            for(int count = 0; count < window_step_x; ++count)
-            {
-                id_vec = wrapper::vsetlane(static_cast<T>(x + count), id_vec, count);
-            }
-
-            // start + step * id
-            const auto res_vec = wrapper::vmla(start_vec, id_vec, step_vec);
-            wrapper::vstore(out_ptr + x, res_vec);
-        }
+        "fp16_neon_range",
+        [](const RangeSelectorData & data) { return data.dt == DataType::F16; },
+        REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_range_function)
+    },
+    {
+        "f32_neon_range",
+        [](const RangeSelectorData & data) { return data.dt == DataType::F32; },
+        REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_range_function)
+    },
+    {
+        "u8_neon_range",
+        [](const RangeSelectorData & data) { return data.dt == DataType::U8; },
+        REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_range_function)
+    },
+    {
+        "u16_neon_range",
+        [](const RangeSelectorData & data) { return data.dt == DataType::U16; },
+        REGISTER_INTEGER_NEON(arm_compute::cpu::u16_neon_range_function)
+    },
+    {
+        "u32_neon_range",
+        [](const RangeSelectorData & data) { return data.dt == DataType::U32; },
+        REGISTER_INTEGER_NEON(arm_compute::cpu::u32_neon_range_function)
+    },
+    {
+        "s8_neon_range",
+        [](const RangeSelectorData & data) { return data.dt == DataType::S8; },
+        REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_range_function)
+    },
+    {
+        "s16_neon_range",
+        [](const RangeSelectorData & data) { return data.dt == DataType::S16; },
+        REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_range_function)
+    },
+    {
+        "s32_neon_range",
+        [](const RangeSelectorData & data) { return data.dt == DataType::S32; },
+        REGISTER_INTEGER_NEON(arm_compute::cpu::s32_neon_range_function)
+    },
+};
 
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel else nullptr
+ */
+const RangeUKernel *get_implementation(const RangeSelectorData &data)
+{
+    for(const auto &uk : available_kernels)
+    {
+        if(uk.is_selected(data))
         {
-            const auto res = start + x * step;
-            *(out_ptr + x) = res;
+            return &uk;
         }
-
-    },
-    output_it);
+    }
+    return nullptr;
 }
 
 Status validate_arguments(const ITensorInfo &output, const float start, const float end, const float step)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output,
-                                                         1,
-                                                         DataType::U8, DataType::S8,
-                                                         DataType::U16, DataType::S16,
-                                                         DataType::U32, DataType::S32,
-                                                         DataType::F16, DataType::F32);
+    const auto *uk = get_implementation(RangeSelectorData{ output.data_type() });
+    ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start < end) && (step <= 0)), "step must be greater than 0 when start < end");
@@ -111,7 +139,7 @@ Status validate_arguments(const ITensorInfo &output, const float start, const fl
 } // namespace
 
 NERangeKernel::NERangeKernel()
-    : _func(nullptr), _start(0), _end(1), _step(1), _output(nullptr)
+    : _start(0), _end(1), _step(1), _output(nullptr)
 {
 }
 
@@ -131,38 +159,6 @@ void NERangeKernel::configure(ITensor *output, float start, float end, float ste
     _end    = end;
     _step   = step;
     _output = output;
-    switch(_output->info()->data_type())
-    {
-        case DataType::U8:
-            _func = &range_function<uint8_t>;
-            break;
-        case DataType::U16:
-            _func = &range_function<uint16_t>;
-            break;
-        case DataType::U32:
-            _func = &range_function<uint32_t>;
-            break;
-        case DataType::S8:
-            _func = &range_function<int8_t>;
-            break;
-        case DataType::S16:
-            _func = &range_function<int16_t>;
-            break;
-        case DataType::S32:
-            _func = &range_function<int32_t>;
-            break;
-        case DataType::F32:
-            _func = &range_function<float>;
-            break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::F16:
-            _func = &range_function<float16_t>;
-            break;
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        default:
-            ARM_COMPUTE_ERROR("Unsupported data type.");
-            break;
-    }
 
     INEKernel::configure(win);
 }
@@ -181,8 +177,8 @@ void NERangeKernel::run(const Window &window, const ThreadInfo &info)
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
+    const auto *uk = get_implementation(RangeSelectorData{ _output->info()->data_type() });
 
-    (*_func)(_output, _start, _step, window);
+    uk->ukernel(_output, _start, _step, window);
 }
 } // namespace arm_compute
diff --git a/src/core/NEON/kernels/NERangeKernel.h b/src/core/NEON/kernels/NERangeKernel.h
index 7c42ef11d..90560995e 100644
--- a/src/core/NEON/kernels/NERangeKernel.h
+++ b/src/core/NEON/kernels/NERangeKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -80,11 +80,10 @@ public:
 private:
     using RangeFunction = void(ITensor *output, float start, float step, const Window &window);
 
-    RangeFunction *_func;   /**< Range function to be called */
-    float          _start;  /**< Start of sequence */
-    float          _end;    /**< End of sequence */
-    float          _step;   /**< Increment/step value */
-    ITensor       *_output; /**< Destination tensor */
+    float    _start;  /**< Start of sequence */
+    float    _end;    /**< End of sequence */
+    float    _step;   /**< Increment/step value */
+    ITensor *_output; /**< Destination tensor */
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NERANGEKERNEL_H */
diff --git a/src/core/NEON/kernels/NERemapKernel.cpp b/src/core/NEON/kernels/NERemapKernel.cpp
deleted file mode 100644
index a1ba29e4c..000000000
--- a/src/core/NEON/kernels/NERemapKernel.cpp
+++ /dev/null
@@ -1,326 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NERemapKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-using namespace arm_compute::scale_helpers;
-
-namespace arm_compute
-{
-class Coordinates;
-
-namespace
-{
-inline int32_t num_out_of_tensor(const float *mapx_ptr, const float *mapy_ptr, const int32x4_t &width_1, const int32x4_t &height_1)
-{
-    const int32x4_t mapx_s32 = vcvtq_s32_f32(vld1q_f32(mapx_ptr));
-    const int32x4_t mapy_s32 = vcvtq_s32_f32(vld1q_f32(mapy_ptr));
-
-    const int32x4_t outbx_s32 = vminq_s32(vmaxq_s32(vminq_s32(vsubq_s32(width_1, mapx_s32), mapx_s32), vdupq_n_s32(-1)), vdupq_n_s32(0));  // Contains -1 if out of border in x, 0 otherwise
-    const int32x4_t outby_s32 = vminq_s32(vmaxq_s32(vminq_s32(vsubq_s32(height_1, mapy_s32), mapy_s32), vdupq_n_s32(-1)), vdupq_n_s32(0)); // Contains -1 if out of border in y, 0 otherwise
-
-    const int32x4_t out_of_tensor_v = vminq_s32(outbx_s32, outby_s32);
-#if defined(__aarch64__)
-    // only AArch64 supports vaddv
-    return vaddvq_s32(out_of_tensor_v);
-#else  // __aarch64__    
-    return vgetq_lane_s32(out_of_tensor_v, 0) + vgetq_lane_s32(out_of_tensor_v, 1) + vgetq_lane_s32(out_of_tensor_v, 2)  + vgetq_lane_s32(out_of_tensor_v, 3);
-#endif // __aarch64__
-}
-
-inline void serial_remap_nearest_interpolation(const uint8_t *in_ptr, const float *mapx_ptr, const float *mapy_ptr, uint8_t *out_ptr,
-                                               int32_t width_val, int32_t height_val, int32_t in_stride_val, uint8_t constant_border_value)
-{
-    const auto x_s32 = static_cast<int32_t>(*mapx_ptr);
-    const auto y_s32 = static_cast<int32_t>(*mapy_ptr);
-    if(x_s32 < 0 || y_s32 < 0 || x_s32 >= width_val || y_s32 >= height_val)
-    {
-        *(out_ptr) = constant_border_value;
-    }
-    else
-    {
-        *(out_ptr) = in_ptr[x_s32 + y_s32 * in_stride_val];
-    }
-}
-
-inline int32x4_t offset_nearest_interpolation(const float *mapx_ptr, const float *mapy_ptr, const int32x4_t &stride)
-{
-    const int32x4_t mapx_s32 = vcvtq_s32_f32(vld1q_f32(mapx_ptr));
-    const int32x4_t mapy_s32 = vcvtq_s32_f32(vld1q_f32(mapy_ptr));
-    return vmlaq_s32(mapx_s32, mapy_s32, stride);
-}
-
-inline uint8_t pixel_bilinear_c1_clamp(const uint8_t *pixel_ptr, int32_t stride, int32_t width, int32_t height, float x, float y, uint8_t constant_border_value)
-{
-    x = std::max(-1.f, std::min(x, static_cast<float>(width)));
-    y = std::max(-1.f, std::min(y, static_cast<float>(height)));
-
-    const int32_t xi = static_cast<int32_t>(std::floor(x));
-    const int32_t yi = static_cast<int32_t>(std::floor(y));
-
-    const float dx = x - static_cast<float>(xi);
-    const float dy = y - static_cast<float>(yi);
-
-    // Calculating the address won't trigger a segfault in case the value is outside the tensor
-    // The ternary operator resolves the values in both conditions
-    const uint8_t *a00 = (xi < 0 || xi >= width || yi < 0 || yi >= height) ? &constant_border_value : (pixel_ptr + xi + yi * stride);
-    const uint8_t *a01 = (xi + 1 >= width || yi < 0 || yi >= height) ? &constant_border_value : (pixel_ptr + xi + 1 + yi * stride);
-    const uint8_t *a10 = (xi < 0 || xi >= width || yi + 1 >= height) ? &constant_border_value : (pixel_ptr + xi + yi * stride + stride);
-    const uint8_t *a11 = (xi + 1 >= width || yi + 1 >= height) ? &constant_border_value : (pixel_ptr + xi + 1 + yi * stride + stride);
-
-    const float dx1 = 1.0f - dx;
-    const float dy1 = 1.0f - dy;
-    const float w1  = dx1 * dy1;
-    const float w2  = dx * dy1;
-    const float w3  = dx1 * dy;
-    const float w4  = dx * dy;
-
-    return static_cast<uint8_t>((*a00) * w1 + (*a01) * w2 + (*a10) * w3 + (*a11) * w4);
-}
-} // namespace
-
-NERemapKernel::NERemapKernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _map_x(nullptr), _map_y(nullptr), _border_mode(BorderMode::UNDEFINED), _constant_border_value(0)
-{
-}
-
-void NERemapKernel::configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
-
-    _input                 = input;
-    _output                = output;
-    _map_x                 = map_x;
-    _map_y                 = map_y;
-    _border_mode           = border_mode;
-    _constant_border_value = constant_border_value;
-
-    switch(policy)
-    {
-        case InterpolationPolicy::NEAREST_NEIGHBOR:
-        {
-            _func = &NERemapKernel::remap_nearest;
-            break;
-        }
-        case InterpolationPolicy::BILINEAR:
-        {
-            _func = &NERemapKernel::remap_bilinear;
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Unsupported interpolation mode");
-            break;
-    }
-
-    // Configure kernel window
-    Window win = calculate_max_window(*output->info(), Steps());
-    INEKernel::configure(win);
-}
-
-void NERemapKernel::remap_nearest(const Window &window)
-{
-    // Don't increment in X and Y direction for the input tensor
-    // A pointer to the start of this plane is needed as base for the precomputed offsets
-    Window win_in(window);
-    win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
-    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-
-    const auto    window_start_x = static_cast<int32_t>(window.x().start());
-    const auto    window_end_x   = static_cast<int32_t>(window.x().end());
-    const int32_t window_step_x  = 8;
-
-    // Don't increment in X direction for the output, mapx, mapy tensors
-    Window win(window);
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator in(_input, win_in);
-    Iterator out(_output, win);
-    Iterator mapx(_map_x, win);
-    Iterator mapy(_map_y, win);
-
-    const int32_t   width_val     = static_cast<int32_t>(_input->info()->dimension(0));
-    const int32_t   height_val    = static_cast<int32_t>(_input->info()->dimension(1));
-    const int32_t   in_stride_val = static_cast<int32_t>(_input->info()->strides_in_bytes()[1]);
-    const int32x4_t width_1       = vdupq_n_s32(width_val - 1);
-    const int32x4_t height_1      = vdupq_n_s32(height_val - 1);
-    const int32x4_t in_stride     = vdupq_n_s32(in_stride_val);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        auto           mapx_ptr = reinterpret_cast<const float *>(mapx.ptr());
-        auto           mapy_ptr = reinterpret_cast<const float *>(mapy.ptr());
-        const uint8_t *in_ptr   = in.ptr();
-        uint8_t       *out_ptr  = out.ptr();
-        int32_t        x        = window_start_x;
-        for(; x < window_end_x - window_step_x; x += window_step_x, mapx_ptr += window_step_x, mapy_ptr += window_step_x, out_ptr += window_step_x)
-        {
-            const int32_t out_of_tensor0 = num_out_of_tensor(mapx_ptr, mapy_ptr + 0, width_1, height_1);
-            const int32_t out_of_tensor1 = num_out_of_tensor(mapx_ptr + 4, mapy_ptr + 4, width_1, height_1);
-            const int32_t out_of_tensor  = out_of_tensor0 + out_of_tensor1;
-
-            if(out_of_tensor == -8)
-            {
-                // All elements are out of xy plane
-                uint8x8_t tmp = vdup_n_u8(_constant_border_value);
-                vst1_u8(out_ptr, tmp);
-            }
-            else if(out_of_tensor < 0)
-            {
-                // Some elements are out of xy plane
-                serial_remap_nearest_interpolation(in_ptr, mapx_ptr, mapy_ptr, out_ptr, width_val, height_val, in_stride_val, _constant_border_value);
-                serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 1, mapy_ptr + 1, out_ptr + 1, width_val, height_val, in_stride_val, _constant_border_value);
-                serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 2, mapy_ptr + 2, out_ptr + 2, width_val, height_val, in_stride_val, _constant_border_value);
-                serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 3, mapy_ptr + 3, out_ptr + 3, width_val, height_val, in_stride_val, _constant_border_value);
-                serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 4, mapy_ptr + 4, out_ptr + 4, width_val, height_val, in_stride_val, _constant_border_value);
-                serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 5, mapy_ptr + 5, out_ptr + 5, width_val, height_val, in_stride_val, _constant_border_value);
-                serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 6, mapy_ptr + 6, out_ptr + 6, width_val, height_val, in_stride_val, _constant_border_value);
-                serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 7, mapy_ptr + 7, out_ptr + 7, width_val, height_val, in_stride_val, _constant_border_value);
-            }
-            else
-            {
-                // All elements are in xy plane
-                uint8x8_t       tmp     = vdup_n_u8(0);
-                const int32x4_t offset0 = offset_nearest_interpolation(mapx_ptr, mapy_ptr, in_stride);
-                const int32x4_t offset1 = offset_nearest_interpolation(mapx_ptr + 4, mapy_ptr + 4, in_stride);
-                tmp                     = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 0)], tmp, 0);
-                tmp                     = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 1)], tmp, 1);
-                tmp                     = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 2)], tmp, 2);
-                tmp                     = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 3)], tmp, 3);
-                tmp                     = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 0)], tmp, 4);
-                tmp                     = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 1)], tmp, 5);
-                tmp                     = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 2)], tmp, 6);
-                tmp                     = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 3)], tmp, 7);
-                vst1_u8(out_ptr, tmp);
-            }
-        }
-        for(; x < window_end_x; ++x, ++mapx_ptr, ++mapy_ptr, ++out_ptr)
-        {
-            serial_remap_nearest_interpolation(in_ptr, mapx_ptr, mapy_ptr, out_ptr, width_val, height_val, in_stride_val, _constant_border_value);
-        }
-    },
-    in, out, mapx, mapy);
-}
-
-void NERemapKernel::remap_bilinear(const Window &window)
-{
-    // Don't increment in X and Y direction for the input tensor
-    // A pointer to the start of this plane is needed as base for the precomputed offsets
-    Window win_in(window);
-    win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
-    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-
-    const auto    window_start_x = static_cast<int32_t>(window.x().start());
-    const auto    window_end_x   = static_cast<int32_t>(window.x().end());
-    const int32_t window_step_x  = 8;
-
-    // Don't increment in X direction for the output, mapx, mapy tensors
-    Window win(window);
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator in(_input, win_in);
-    Iterator out(_output, win);
-    Iterator mapx(_map_x, win);
-    Iterator mapy(_map_y, win);
-
-    const int32_t   width_val     = static_cast<int32_t>(_input->info()->dimension(0));
-    const int32_t   height_val    = static_cast<int32_t>(_input->info()->dimension(1));
-    const int32x4_t width_2       = vdupq_n_s32(width_val - 2);
-    const int32x4_t height_2      = vdupq_n_s32(height_val - 2);
-    const int32_t   in_stride_val = static_cast<int32_t>(_input->info()->strides_in_bytes()[1]);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        auto           mapx_ptr = reinterpret_cast<const float *>(mapx.ptr());
-        auto           mapy_ptr = reinterpret_cast<const float *>(mapy.ptr());
-        const uint8_t *in_ptr   = in.ptr();
-        uint8_t       *out_ptr  = out.ptr();
-        int32_t        x        = window_start_x;
-        for(; x < window_end_x - window_step_x; x += window_step_x, mapx_ptr += window_step_x, mapy_ptr += window_step_x, out_ptr += window_step_x)
-        {
-            const int32_t out_of_tensor0 = num_out_of_tensor(mapx_ptr, mapy_ptr + 0, width_2, height_2);
-            const int32_t out_of_tensor1 = num_out_of_tensor(mapx_ptr + 4, mapy_ptr + 4, width_2, height_2);
-            const int32_t out_of_tensor  = out_of_tensor0 + out_of_tensor1;
-
-            if(out_of_tensor < 0)
-            {
-                // Elements are out of xy plane
-                *(out_ptr)     = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[0], mapy_ptr[0], _constant_border_value);
-                *(out_ptr + 1) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[1], mapy_ptr[1], _constant_border_value);
-                *(out_ptr + 2) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[2], mapy_ptr[2], _constant_border_value);
-                *(out_ptr + 3) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[3], mapy_ptr[3], _constant_border_value);
-                *(out_ptr + 4) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[4], mapy_ptr[4], _constant_border_value);
-                *(out_ptr + 5) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[5], mapy_ptr[5], _constant_border_value);
-                *(out_ptr + 6) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[6], mapy_ptr[6], _constant_border_value);
-                *(out_ptr + 7) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[7], mapy_ptr[7], _constant_border_value);
-            }
-            else
-            {
-                // All elements are in xy plane
-                uint8x8_t tmp = vdup_n_u8(0);
-                tmp           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[0], mapy_ptr[0], _constant_border_value), tmp, 0);
-                tmp           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[1], mapy_ptr[1], _constant_border_value), tmp, 1);
-                tmp           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[2], mapy_ptr[2], _constant_border_value), tmp, 2);
-                tmp           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[3], mapy_ptr[3], _constant_border_value), tmp, 3);
-                tmp           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[4], mapy_ptr[4], _constant_border_value), tmp, 4);
-                tmp           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[5], mapy_ptr[5], _constant_border_value), tmp, 5);
-                tmp           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[6], mapy_ptr[6], _constant_border_value), tmp, 6);
-                tmp           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[7], mapy_ptr[7], _constant_border_value), tmp, 7);
-                vst1_u8(out_ptr, tmp);
-            }
-        }
-        for(; x < window_end_x; ++x, ++mapx_ptr, ++mapy_ptr, ++out_ptr)
-        {
-            *(out_ptr) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[0], mapy_ptr[0], _constant_border_value);
-        }
-    },
-    in, out, mapx, mapy);
-}
-
-void NERemapKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
-    (this->*_func)(window);
-}
-} // namespace arm_compute
-\ No newline at end of file
diff --git a/src/core/NEON/kernels/NERemapKernel.h b/src/core/NEON/kernels/NERemapKernel.h
deleted file mode 100644
index 33e929805..000000000
--- a/src/core/NEON/kernels/NERemapKernel.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEREMAPKERNEL_H
-#define ARM_COMPUTE_NEREMAPKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Kernel to perform a remap on a tensor */
-class NERemapKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NERemapKernel";
-    }
-    /** Default constructor */
-    NERemapKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NERemapKernel(const NERemapKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NERemapKernel &operator=(const NERemapKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NERemapKernel(NERemapKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NERemapKernel &operator=(NERemapKernel &&) = default;
-    /** Default destructor */
-    ~NERemapKernel() = default;
-
-    /** Initialize the kernel's input, output and border mode.
-     *
-     * @param[in]  input                 Source tensor. Data type supported: U8.
-     * @param[in]  map_x                 Map for X coordinates. Data type supported: F32.
-     * @param[in]  map_y                 Map for Y coordinates. Data type supported: F32.
-     * @param[out] output                Destination tensor. Data types supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane.
-     * @param[in]  policy                The interpolation type.
-     * @param[in]  border_mode           Border mode to use on the input tensor.
-     * @param[in]  constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. Defaults to 0.
-     */
-    void configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** function to perform nearest interpolation on the given window */
-    void remap_nearest(const Window &window);
-    /** function to perform bilinear interpolation on the given window */
-    void remap_bilinear(const Window &window);
-    /** Remap function to use for the particular interpolation type passed to configure() */
-    void (NERemapKernel::*_func)(const Window &window);
-
-    const ITensor *_input;                 /**< Input image */
-    ITensor       *_output;                /**< Output image */
-    const ITensor *_map_x;                 /**< Input remap x coordinates */
-    const ITensor *_map_y;                 /**< Input remap y coordinates */
-    BorderMode     _border_mode;           /**< Border mode */
-    uint8_t        _constant_border_value; /**< Border value to use */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEREMAPKERNEL_H */
-\ No newline at end of file
diff --git a/src/core/NEON/kernels/NESelectKernel.cpp b/src/core/NEON/kernels/NESelectKernel.cpp
index 7c988e9fa..b8c9b244e 100644
--- a/src/core/NEON/kernels/NESelectKernel.cpp
+++ b/src/core/NEON/kernels/NESelectKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,6 +34,10 @@
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
+#include "src/core/common/Registrars.h"
+
+#include "src/cpu/kernels/select/list.h"
+
 #include <arm_neon.h>
 #include <map>
 #include <string>
@@ -42,125 +46,123 @@ namespace arm_compute
 {
 namespace
 {
-template <typename ScalarType, typename VectorType>
-void select_op(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
-               const int window_step_x, const int window_start_x, const int window_end_x, const int limit, VectorType (*condition_conversion)(const uint8_t *))
-{
-    Window win = window;
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator condition(cond, win);
-    Iterator input1(in1, win);
-    Iterator input2(in2, win);
-    Iterator output(out, win);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        auto       output_ptr    = reinterpret_cast<ScalarType *>(output.ptr());
-        const auto condition_ptr = reinterpret_cast<const uint8_t *>(condition.ptr());
-        const auto input1_ptr    = reinterpret_cast<const ScalarType *>(input1.ptr());
-        const auto input2_ptr    = reinterpret_cast<const ScalarType *>(input2.ptr());
 
-        int x = window_start_x;
-        for(; x <= limit; x += window_step_x)
-        {
-            const auto c = (*condition_conversion)(condition_ptr + x);
-            const auto a = wrapper::vloadq(input1_ptr + x);
-            const auto b = wrapper::vloadq(input2_ptr + x);
-            wrapper::vstore(output_ptr + x, wrapper::vbsl(c, a, b));
-        }
-        for(; x < window_end_x; ++x)
-        {
-            const auto c      = *(condition_ptr + x);
-            const auto a      = *(input1_ptr + x);
-            const auto b      = *(input2_ptr + x);
-            *(output_ptr + x) = static_cast<bool>(c) ? a : b;
-        }
-    },
-    condition, input1, input2, output);
-}
-
-template <typename ScalarType, typename VectorType>
-void select_op_8(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+struct SelectKernelSelectorData
 {
-    const auto window_step_x  = 16 / sizeof(ScalarType);
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
+    DataType dt;
+    bool     is_same_rank;
+};
 
-    select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType
-    {
-        static const auto zero = wrapper::vdup_n(static_cast<uint8_t>(0), arm_compute::wrapper::traits::vector_128_tag());
-        return wrapper::vcgt(wrapper::vloadq(condition_ptr), zero);
-    });
-}
+using SelectorPtr = std::add_pointer<bool(const SelectKernelSelectorData &data)>::type;
+using KernelPtr   = std::add_pointer<void(const ITensor *, const ITensor *, const ITensor *, ITensor *, const Window &)>::type;
 
-template <typename ScalarType, typename VectorType>
-void select_op_16(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+struct SelectKernelSelector
 {
-    const auto window_step_x  = 16 / sizeof(ScalarType);
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
+    const char       *name;
+    const SelectorPtr is_selected;
+    KernelPtr         ukernel;
+};
 
-    select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType
-    {
-        static const auto zero = wrapper::vdup_n(static_cast<uint16_t>(0), arm_compute::wrapper::traits::vector_128_tag());
-        return wrapper::vcgt(wrapper::vmovl(wrapper::vload(condition_ptr)), zero);
-    });
-}
-
-template <typename ScalarType, typename VectorType>
-void select_op_32(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+static const SelectKernelSelector available_kernels[] =
 {
-    const auto window_step_x  = 16 / sizeof(ScalarType);
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-
-    select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType
     {
-        static const auto zero = wrapper::vdup_n(static_cast<uint32_t>(0), arm_compute::wrapper::traits::vector_128_tag());
-        return wrapper::vcgt(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vload(condition_ptr)))), zero);
-    });
-}
+        "neon_s8_same_rank",
+        [](const SelectKernelSelectorData & data) { return data.dt == DataType::S8 && data.is_same_rank == true; },
+        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_same_rank)
+    },
+    {
+        "neon_s16_same_rank",
+        [](const SelectKernelSelectorData & data) { return data.dt == DataType::S16 && data.is_same_rank == true; },
+        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_same_rank)
+    },
+    {
+        "neon_s32_same_rank",
+        [](const SelectKernelSelectorData & data) { return data.dt == DataType::S32 && data.is_same_rank == true; },
+        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_same_rank)
+    },
+    {
+        "neon_u8_same_rank",
+        [](const SelectKernelSelectorData & data) { return data.dt == DataType::U8 && data.is_same_rank == true; },
+        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_same_rank)
+    },
+    {
+        "neon_u16_same_rank",
+        [](const SelectKernelSelectorData & data) { return data.dt == DataType::U16 && data.is_same_rank == true; },
+        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_same_rank)
+    },
+    {
+        "neon_u32_same_rank",
+        [](const SelectKernelSelectorData & data) { return data.dt == DataType::U32 && data.is_same_rank == true; },
+        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_same_rank)
+    },
+    {
+        "neon_s8_not_same_rank",
+        [](const SelectKernelSelectorData & data) { return data.dt == DataType::S8 && data.is_same_rank == false; },
+        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_not_same_rank)
+    },
+    {
+        "neon_s16_not_same_rank",
+        [](const SelectKernelSelectorData & data) { return data.dt == DataType::S16 && data.is_same_rank == false; },
+        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_not_same_rank)
+    },
+    {
+        "neon_s32_not_same_rank",
+        [](const SelectKernelSelectorData & data) { return data.dt == DataType::S32 && data.is_same_rank == false; },
+        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_not_same_rank)
+    },
+    {
+        "neon_u8_not_same_rank",
+        [](const SelectKernelSelectorData & data) { return data.dt == DataType::U8 && data.is_same_rank == false; },
+        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_not_same_rank)
+    },
+    {
+        "neon_u16_not_same_rank",
+        [](const SelectKernelSelectorData & data) { return data.dt == DataType::U16 && data.is_same_rank == false; },
+        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_not_same_rank)
+    },
+    {
+        "neon_u32_not_same_rank",
+        [](const SelectKernelSelectorData & data) { return data.dt == DataType::U32 && data.is_same_rank == false; },
+        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_not_same_rank)
+    },
+    {
+        "neon_f16_same_rank",
+        [](const SelectKernelSelectorData & data) { return data.dt == DataType::F16 && data.is_same_rank == true; },
+        REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_same_rank)
+    },
+    {
+        "neon_f16_not_same_rank",
+        [](const SelectKernelSelectorData & data) { return data.dt == DataType::F16 && data.is_same_rank == false; },
+        REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_not_same_rank)
+    },
+    {
+        "neon_f32_same_rank",
+        [](const SelectKernelSelectorData & data) { return data.dt == DataType::F32 && data.is_same_rank == true; },
+        REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_same_rank)
+    },
+    {
+        "neon_f32_not_same_rank",
+        [](const SelectKernelSelectorData & data) { return data.dt == DataType::F32 && data.is_same_rank == false; },
+        REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_not_same_rank)
+    },
+};
 
-template <typename ScalarType>
-void select_op_not_same_rank(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+const SelectKernelSelector *get_implementation(const SelectKernelSelectorData &data)
 {
-    ARM_COMPUTE_UNUSED(window);
-
-    auto       output_ptr    = reinterpret_cast<ScalarType *>(out->buffer());
-    const auto condition_ptr = reinterpret_cast<const uint8_t *>(cond->buffer());
-    const auto input1_ptr    = reinterpret_cast<const ScalarType *>(in1->buffer());
-    const auto input2_ptr    = reinterpret_cast<const ScalarType *>(in2->buffer());
-
-    const int outer_size = cond->info()->total_size() / cond->info()->element_size();
-    const int inner_size = (in1->info()->total_size() / in1->info()->element_size()) / outer_size;
-    int       offset     = 0;
-    const int step       = 16 / in1->info()->element_size();
-
-    for(int i = 0; i < outer_size; ++i)
+    for(const auto &uk : available_kernels)
     {
-        int        x         = offset;
-        const auto input_ptr = static_cast<bool>(*(condition_ptr + i)) ? input1_ptr : input2_ptr;
-        for(; x <= offset + inner_size - step; x += step)
-        {
-            wrapper::vstore(output_ptr + x, wrapper::vloadq(input_ptr + x));
-        }
-        if(x <= offset + inner_size - (step / 2))
-        {
-            wrapper::vstore(output_ptr + x, wrapper::vload(input_ptr + x));
-            x += step / 2;
-        }
-        for(; x < offset + inner_size; ++x)
+        if(uk.is_selected(data))
         {
-            *(output_ptr + x) = *(input_ptr + x);
+            return &uk;
         }
-        offset += inner_size;
     }
+    return nullptr;
 }
+
 } // namespace
 
 NESelectKernel::NESelectKernel()
-    : _function(nullptr), _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false)
+    : /*_function(nullptr), */ _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false)
 {
 }
 
@@ -178,51 +180,6 @@ void NESelectKernel::configure(const ITensor *c, const ITensor *x, const ITensor
     _output        = output;
     _has_same_rank = (c->info()->tensor_shape().num_dimensions() == x->info()->tensor_shape().num_dimensions());
 
-    std::string function_to_call("op_");
-    function_to_call += string_from_data_type(x->info()->data_type());
-
-    static std::map<std::string, SelectFunction *> map_function;
-
-    if(_has_same_rank)
-    {
-        map_function =
-        {
-            { "op_S8", &select_op_8<int8_t, uint8x16_t> },
-            { "op_S16", &select_op_16<int16_t, uint16x8_t> },
-            { "op_S32", &select_op_32<int32_t, uint32x4_t> },
-            { "op_U8", &select_op_8<uint8_t, uint8x16_t> },
-            { "op_U16", &select_op_16<uint16_t, uint16x8_t> },
-            { "op_U32", &select_op_32<uint32_t, uint32x4_t> },
-            { "op_F32", &select_op_32<float, uint32x4_t> }
-        };
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        map_function["op_F16"] = &select_op_16<float16_t, uint16x8_t>;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-    }
-    else
-    {
-        map_function =
-        {
-            { "op_S8", &select_op_not_same_rank<int8_t> },
-            { "op_S16", &select_op_not_same_rank<int16_t> },
-            { "op_S32", &select_op_not_same_rank<int32_t> },
-            { "op_U8", &select_op_not_same_rank<uint8_t> },
-            { "op_U16", &select_op_not_same_rank<uint16_t> },
-            { "op_U32", &select_op_not_same_rank<uint32_t> },
-            { "op_F32", &select_op_not_same_rank<float> }
-        };
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        map_function["op_F16"] = &select_op_not_same_rank<float16_t>;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-    }
-
-    auto it = map_function.find(function_to_call);
-
-    if(it != map_function.end())
-    {
-        _function = it->second;
-    }
-
     Window win = calculate_max_window(*x->info());
     INEKernel::configure(win);
 }
@@ -254,7 +211,12 @@ void NESelectKernel::run(const Window &window, const ThreadInfo &info)
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_function == nullptr);
-    _function(_c, _x, _y, _output, window);
+    ARM_COMPUTE_ERROR_ON(_output == nullptr);
+    ARM_COMPUTE_ERROR_ON(_output->info() == nullptr);
+
+    const auto *uk = get_implementation(SelectKernelSelectorData{ _output->info()->data_type(), _has_same_rank });
+    ARM_COMPUTE_ERROR_ON(uk == nullptr);
+    ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
+    uk->ukernel(_c, _x, _y, _output, window);
 }
 } // namespace arm_compute
diff --git a/src/core/NEON/kernels/NESelectKernel.h b/src/core/NEON/kernels/NESelectKernel.h
index f7142feff..e82105a68 100644
--- a/src/core/NEON/kernels/NESelectKernel.h
+++ b/src/core/NEON/kernels/NESelectKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -82,22 +82,12 @@ public:
     void run(const Window &window, const ThreadInfo &info) override;
 
 private:
-    /** Common signature for all the specialised select functions
-     *
-     * @param[in] c      Condition input tensor. Data types supported: U8.
-     * @param[in] x      First input tensor. Data types supported: All.
-     * @param[in] y      Second input tensor. Data types supported: Same as @p x
-     * @param[in] output Output tensor. Data types supported: Same as @p x.
-     */
-    using SelectFunction = void(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window);
 
-    /** Select function to use for the particular tensor types passed to configure() */
-    SelectFunction *_function;
-    const ITensor *_c;              /**< Condition tensor */
-    const ITensor *_x;              /**< Source tensor 1 */
-    const ITensor *_y;              /**< Source tensor 2 */
-    ITensor        *_output;        /**< Destination tensor */
-    bool            _has_same_rank; /**< Flag that indicates if condition tensor and other inputs have the same rank */
+    const ITensor *_c;             /**< Condition tensor */
+    const ITensor *_x;             /**< Source tensor 1 */
+    const ITensor *_y;             /**< Source tensor 2 */
+    ITensor       *_output;        /**< Destination tensor */
+    bool           _has_same_rank; /**< Flag that indicates if condition tensor and other inputs have the same rank */
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NESELECTKERNEL_H */
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
index 1d52b56d3..ea41529d8 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
@@ -136,7 +136,14 @@ UniqueDepthwiseCommon<TInput, TWeight, TOutput> depthwise(const DepthwiseArgs &a
 {
   const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> *impl = nullptr;
   const bool success = find_implementation<TInput, TWeight, TOutput, OutputStage>(args, os, impl);
-  return UniqueDepthwiseCommon<TInput, TWeight, TOutput>(success ? impl->get_instance(args, os) : nullptr);
+
+  if(success)
+  {
+        auto i =  impl->get_instance(args, os);
+        i->set_name(impl->name);
+        return UniqueDepthwiseCommon<TInput, TWeight, TOutput>(i);
+  }
+  return nullptr;
 }
 
 }  // namespace depthwise
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
index 20c823014..79fc65e56 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
@@ -23,7 +23,9 @@
  */
 #pragma once
 
+#if !defined(__OpenBSD__)
 #include <alloca.h>
+#endif /* !defined(__OpenBSD__) */
 
 #include <algorithm>
 #include <cassert>
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
index d3857a50e..809946f10 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
@@ -144,7 +144,7 @@ struct GemmImplementation<Top, Tret, Nothing> {
 
 /* "Master" function implemented for each valid combination of types.
  * Returns a list of GEMM implementation descriptors for processing by the
- * other functions, terminated by an implementation with
+ * other functions, ended by an implementation with
  * method==GemmMethod::DEFAULT.  */
 template<typename Top, typename Tret, class OutputStage = Nothing>
 const GemmImplementation<Top, Tret, OutputStage> *gemm_implementation_list();
diff --git a/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp b/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp
index d5003e4a1..91988e8c3 100644
--- a/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp
+++ b/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp
@@ -28,7 +28,9 @@
 #include "interleave_indirect.hpp"
 #include "bfloat.hpp"
 
+#if !defined(__OpenBSD__)
 #include <alloca.h>
+#endif /* !defined(__OpenBSD__) */
 
 #include <algorithm>
 #include <cstddef>
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
index 758f2b1f8..9af1b4df1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
@@ -72,7 +72,7 @@ public:
                     return { 15.361, 0.9341, 0.1636 };
 
                 case CPUModel::V1:
-                    return { 62.40, 4.71, 0.67 };
+                    return { 51.14, 7.38, 0.65 };
 
                 default:
                     return { 29.0698, 3.9793, 0.4003 };
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
index 21c9f5966..6d333f344 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
@@ -80,7 +80,7 @@ public:
                     return { 15.361, 0.9341, 0.1636 };
 
                 case CPUModel::V1:
-                    return { 62.40, 4.71, 0.67 };
+                    return { 51.14, 7.38, 0.65 };
 
                 default:
                     return { 29.0698, 3.9793, 0.4003 };
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp
index 090dd5855..e6e795097 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp
@@ -82,7 +82,7 @@ public:
                 case CPUModel::A510:
                     return { 6.81 };
                 case CPUModel::V1:
-                    return { 28.40 };
+                    return { 22.33 };
             }
         }
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp
index f5e9009f6..39ffcbef1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp
@@ -82,7 +82,7 @@ public:
                 case CPUModel::A510:
                     return { 6.70 };
                 case CPUModel::V1:
-                    return { 26.64 };
+                    return { 21.28 };
             }
         }
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
index 94f578368..905a60265 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
@@ -84,7 +84,7 @@ public:
                 case CPUModel::A510:
                     return { 14.81 };
                 case CPUModel::V1:
-                    return { 48.34 };
+                    return { 44.54 };
             }
         }
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp
index bc933afd9..69ea87bc9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp
@@ -81,7 +81,7 @@ public:
                 case CPUModel::A510:
                     return { 27.99 };
                 case CPUModel::V1:
-                    return { 68.76 };
+                    return { 62.26 };
             }
         }
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
index c5105a6d4..ce96c1b28 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
@@ -75,29 +75,29 @@ public:
     template<typename T>
     static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
     {
-        if (std::is_same<T, uint8_t>::value) {
+        if (std::is_same<T, uint32_t>::value) {
             switch (ci->get_cpu_model()) {
-                case CPUModel::A55r1:
-                    return { 9.5238, 2.0799, 0.2279 };
                 default:
-                    return { 29.6736, 11.4025, 0.5591 };
+                    return { 31.63 };
                 case CPUModel::A510:
-                    return { 16.65, 3.92, 0.48 };
+                    return { 15.89 };
                 case CPUModel::V1:
-                    return { 55.42, 19.29, 0.92 };
+                    return { 53.87 };
+                case CPUModel::A55r1:
+                    return { 9.217 };
             }
         }
 
-        if (std::is_same<T, uint32_t>::value) {
+        if (std::is_same<T, uint8_t>::value) {
             switch (ci->get_cpu_model()) {
-                default:
-                    return { 31.63 };
                 case CPUModel::A55r1:
-                    return { 9.217 };
+                    return { 9.5238, 2.0799, 0.2279 };
+                default:
+                    return { 29.6736, 11.4025, 0.5591 };
                 case CPUModel::A510:
-                    return { 15.89 };
+                    return { 16.65, 3.92, 0.48 };
                 case CPUModel::V1:
-                    return { 53.87 };
+                    return { 42.62, 16.32, 0.83 };
             }
         }
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp
index 24bad3c63..b5cedc7e9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp
@@ -92,7 +92,7 @@ public:
                 case CPUModel::A510:
                     return { 33.64, 3.92, 0.48 };
                 case CPUModel::V1:
-                    return { 86.71, 19.00, 0.93 };
+                    return { 63.94, 16.18, 0.83 };
             }
         }
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
index 9b3517a80..6ec6bd2ed 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
@@ -36,6 +36,7 @@ namespace arm_gemm
 {
 // Actual kernel implementations
 void a64_interleaved_bf16fp32_mmla_8x12( ARGLIST );
+void a64_interleaved_bf16fp32_mmla_8x12_a510( ARGLIST );
 
 class cls_a64_interleaved_bf16fp32_mmla_8x12
 {
@@ -78,7 +79,7 @@ public:
                 default:
                     return { 31.54, 4.30, 7.33 };
                 case CPUModel::V1:
-                    return { 59.94, 5.08, 9.83 };
+                    return { 41.44, 5.01, 5.64 };
                 case CPUModel::A510:
                     return { 7.82, 4.05, 3.07 };
             }
@@ -101,8 +102,15 @@ public:
 
     // Default to the generic kernel
     kern_type kernel=a64_interleaved_bf16fp32_mmla_8x12;
-    cls_a64_interleaved_bf16fp32_mmla_8x12(const CPUInfo *)
+    cls_a64_interleaved_bf16fp32_mmla_8x12(const CPUInfo *ci)
     {
+        switch(ci->get_cpu_model()) {
+            default:
+                break;
+            case CPUModel::A510:
+                kernel=a64_interleaved_bf16fp32_mmla_8x12_a510;
+                break;
+        }
     }
 };
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp
new file mode 100644
index 000000000..0235e91bf
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+#include "../../bfloat.hpp"
+
+namespace arm_gemm {
+
+void a64_interleaved_bf16fp32_mmla_8x12_a510(
+    const bfloat16 *Apanel, const bfloat16 *Bpanel,
+    float *Cpanel, int ablocks, int bblocks, int K) {
+
+    struct KernelArgs {
+        size_t bblocks = {};
+        size_t K = {};
+        const bfloat16 *Bpanel = {};
+    } ka;
+
+    ka.bblocks = bblocks;
+    ka.K = (K/4) - 1;
+    ka.Bpanel = Bpanel;
+
+    __asm__ __volatile__(
+
+      "1:"  // Height loop
+      "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+      "mov x21, %x[Apanel]\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+      "2:"  // Width loop
+      "mov %x[Apanel], x21\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+      "ld1 { v0.8h }, [%x[Apanel]], #0x10\n"
+      "ld1 { v4.8h }, [x20], #0x10\n"
+      "ld1 { v1.8h }, [%x[Apanel]], #0x10\n"
+      "cmp x19, #0x2\n"
+      "movi v8.16b, #0x0\n"
+      "movi v9.16b, #0x0\n"
+      "ld1 { v5.8h }, [x20], #0x10\n"
+      "movi v10.16b, #0x0\n"
+      "movi v11.16b, #0x0\n"
+      "ld1 { v2.8h }, [%x[Apanel]], #0x10\n"
+      "movi v12.16b, #0x0\n"
+      "movi v13.16b, #0x0\n"
+      "movi v14.16b, #0x0\n"
+      "movi v15.16b, #0x0\n"
+      "movi v16.16b, #0x0\n"
+      "movi v17.16b, #0x0\n"
+      "movi v18.16b, #0x0\n"
+      "movi v19.16b, #0x0\n"
+      "movi v20.16b, #0x0\n"
+      "movi v21.16b, #0x0\n"
+      "movi v22.16b, #0x0\n"
+      "movi v23.16b, #0x0\n"
+      "movi v24.16b, #0x0\n"
+      "movi v25.16b, #0x0\n"
+      "movi v26.16b, #0x0\n"
+      "movi v27.16b, #0x0\n"
+      "movi v28.16b, #0x0\n"
+      "movi v29.16b, #0x0\n"
+      "movi v30.16b, #0x0\n"
+      "movi v31.16b, #0x0\n"
+      "blt 4f\n"
+      "3:"  // main loop head
+      "ld1 { v3.8h }, [%x[Apanel]], #0x10\n"
+      ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e44ec2e  // bfmmla v14.4s, v1.8h, v4.8h\n"
+      "ldp q6, q7, [x20], #0x20\n"
+      ".inst 0x6e45ec0b  // bfmmla v11.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e45ec31  // bfmmla v17.4s, v1.8h, v5.8h\n"
+      ".inst 0x6e44ec54  // bfmmla v20.4s, v2.8h, v4.8h\n"
+      "sub x19, x19, #0x2\n"
+      ".inst 0x6e45ec57  // bfmmla v23.4s, v2.8h, v5.8h\n"
+      ".inst 0x6e44ec7a  // bfmmla v26.4s, v3.8h, v4.8h\n"
+      "cmp x19, #0x2\n"
+      ".inst 0x6e45ec7d  // bfmmla v29.4s, v3.8h, v5.8h\n"
+      "ldp q4, q5, [x20], #0x20\n"
+      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e45ec0d  // bfmmla v13.4s, v0.8h, v5.8h\n"
+      "ld1 { v0.8h }, [%x[Apanel]], #0x10\n"
+      ".inst 0x6e46ec2f  // bfmmla v15.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
+      ".inst 0x6e46ec7b  // bfmmla v27.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e47ec32  // bfmmla v18.4s, v1.8h, v7.8h\n"
+      "ld1 { v6.8h }, [x20], #0x10\n"
+      ".inst 0x6e44ec30  // bfmmla v16.4s, v1.8h, v4.8h\n"
+      ".inst 0x6e45ec33  // bfmmla v19.4s, v1.8h, v5.8h\n"
+      "ld1 { v1.8h }, [%x[Apanel]], #0x10\n"
+      ".inst 0x6e47ec58  // bfmmla v24.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e47ec7e  // bfmmla v30.4s, v3.8h, v7.8h\n"
+      "ld1 { v7.8h }, [x20], #0x10\n"
+      ".inst 0x6e44ec56  // bfmmla v22.4s, v2.8h, v4.8h\n"
+      ".inst 0x6e45ec59  // bfmmla v25.4s, v2.8h, v5.8h\n"
+      "ld1 { v2.8h }, [%x[Apanel]], #0x10\n"
+      ".inst 0x6e44ec7c  // bfmmla v28.4s, v3.8h, v4.8h\n"
+      ".inst 0x6e45ec7f  // bfmmla v31.4s, v3.8h, v5.8h\n"
+      "ld1 { v3.8h }, [%x[Apanel]], #0x10\n"
+      "ldp q4, q5, [x20], #0x20\n"
+      ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e47ec31  // bfmmla v17.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e46ec54  // bfmmla v20.4s, v2.8h, v6.8h\n"
+      ".inst 0x6e47ec57  // bfmmla v23.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e46ec7a  // bfmmla v26.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e47ec7d  // bfmmla v29.4s, v3.8h, v7.8h\n"
+      "ldp q6, q7, [x20], #0x20\n"
+      ".inst 0x6e44ec09  // bfmmla v9.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e45ec0c  // bfmmla v12.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
+      "ld1 { v0.8h }, [%x[Apanel]], #0x10\n"
+      ".inst 0x6e44ec2f  // bfmmla v15.4s, v1.8h, v4.8h\n"
+      ".inst 0x6e44ec55  // bfmmla v21.4s, v2.8h, v4.8h\n"
+      ".inst 0x6e44ec7b  // bfmmla v27.4s, v3.8h, v4.8h\n"
+      ".inst 0x6e45ec32  // bfmmla v18.4s, v1.8h, v5.8h\n"
+      "ld1 { v4.8h }, [x20], #0x10\n"
+      ".inst 0x6e46ec30  // bfmmla v16.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e47ec33  // bfmmla v19.4s, v1.8h, v7.8h\n"
+      "ld1 { v1.8h }, [%x[Apanel]], #0x10\n"
+      ".inst 0x6e45ec58  // bfmmla v24.4s, v2.8h, v5.8h\n"
+      ".inst 0x6e45ec7e  // bfmmla v30.4s, v3.8h, v5.8h\n"
+      "ld1 { v5.8h }, [x20], #0x10\n"
+      ".inst 0x6e46ec56  // bfmmla v22.4s, v2.8h, v6.8h\n"
+      ".inst 0x6e47ec59  // bfmmla v25.4s, v2.8h, v7.8h\n"
+      "ld1 { v2.8h }, [%x[Apanel]], #0x10\n"
+      ".inst 0x6e46ec7c  // bfmmla v28.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e47ec7f  // bfmmla v31.4s, v3.8h, v7.8h\n"
+      "bge 3b\n"
+      "4:"  // main loop skip
+      "ld1 { v3.8h }, [%x[Apanel]], #0x10\n"
+      ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e44ec2e  // bfmmla v14.4s, v1.8h, v4.8h\n"
+      "ldp q6, q7, [x20], #0x20\n"
+      ".inst 0x6e45ec0b  // bfmmla v11.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e45ec31  // bfmmla v17.4s, v1.8h, v5.8h\n"
+      ".inst 0x6e44ec54  // bfmmla v20.4s, v2.8h, v4.8h\n"
+      ".inst 0x6e45ec57  // bfmmla v23.4s, v2.8h, v5.8h\n"
+      ".inst 0x6e44ec7a  // bfmmla v26.4s, v3.8h, v4.8h\n"
+      ".inst 0x6e45ec7d  // bfmmla v29.4s, v3.8h, v5.8h\n"
+      "ldp q4, q5, [x20], #0x20\n"
+      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e46ec2f  // bfmmla v15.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
+      ".inst 0x6e46ec7b  // bfmmla v27.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e45ec0d  // bfmmla v13.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e47ec32  // bfmmla v18.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e44ec30  // bfmmla v16.4s, v1.8h, v4.8h\n"
+      ".inst 0x6e45ec33  // bfmmla v19.4s, v1.8h, v5.8h\n"
+      ".inst 0x6e47ec58  // bfmmla v24.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e47ec7e  // bfmmla v30.4s, v3.8h, v7.8h\n"
+      ".inst 0x6e44ec56  // bfmmla v22.4s, v2.8h, v4.8h\n"
+      ".inst 0x6e45ec59  // bfmmla v25.4s, v2.8h, v5.8h\n"
+      ".inst 0x6e44ec7c  // bfmmla v28.4s, v3.8h, v4.8h\n"
+      ".inst 0x6e45ec7f  // bfmmla v31.4s, v3.8h, v5.8h\n"
+      "cbz x19, 5f\n"
+      "ld1 { v0.8h }, [%x[Apanel]], #0x10\n"
+      "ld1 { v6.8h }, [x20], #0x10\n"
+      ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
+      "ld1 { v1.8h }, [%x[Apanel]], #0x10\n"
+      "ld1 { v7.8h }, [x20], #0x10\n"
+      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
+      "ld1 { v2.8h }, [%x[Apanel]], #0x10\n"
+      "ld1 { v3.8h }, [%x[Apanel]], #0x10\n"
+      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
+      "ldp q4, q5, [x20], #0x20\n"
+      ".inst 0x6e47ec31  // bfmmla v17.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e46ec54  // bfmmla v20.4s, v2.8h, v6.8h\n"
+      ".inst 0x6e47ec57  // bfmmla v23.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e46ec7a  // bfmmla v26.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e47ec7d  // bfmmla v29.4s, v3.8h, v7.8h\n"
+      "ldp q6, q7, [x20], #0x20\n"
+      ".inst 0x6e44ec09  // bfmmla v9.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e44ec2f  // bfmmla v15.4s, v1.8h, v4.8h\n"
+      ".inst 0x6e44ec55  // bfmmla v21.4s, v2.8h, v4.8h\n"
+      ".inst 0x6e44ec7b  // bfmmla v27.4s, v3.8h, v4.8h\n"
+      ".inst 0x6e45ec0c  // bfmmla v12.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e45ec32  // bfmmla v18.4s, v1.8h, v5.8h\n"
+      ".inst 0x6e46ec30  // bfmmla v16.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e47ec33  // bfmmla v19.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e45ec58  // bfmmla v24.4s, v2.8h, v5.8h\n"
+      ".inst 0x6e45ec7e  // bfmmla v30.4s, v3.8h, v5.8h\n"
+      ".inst 0x6e46ec56  // bfmmla v22.4s, v2.8h, v6.8h\n"
+      ".inst 0x6e47ec59  // bfmmla v25.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e46ec7c  // bfmmla v28.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e47ec7f  // bfmmla v31.4s, v3.8h, v7.8h\n"
+      "5:"  // multiply loop done
+      "subs x22, x22, #0x1\n"
+      "uzp1 v4.2d, v8.2d, v11.2d\n"
+      "uzp2 v8.2d, v8.2d, v11.2d\n"
+      "uzp1 v11.2d, v9.2d, v12.2d\n"
+      "uzp2 v9.2d, v9.2d, v12.2d\n"
+      "str q4, [%x[Cpanel], #0x0]\n"
+      "uzp1 v12.2d, v10.2d, v13.2d\n"
+      "uzp2 v10.2d, v10.2d, v13.2d\n"
+      "str q11, [%x[Cpanel], #0x10]\n"
+      "str q12, [%x[Cpanel], #0x20]\n"
+      "uzp1 v13.2d, v14.2d, v17.2d\n"
+      "uzp2 v14.2d, v14.2d, v17.2d\n"
+      "str q8, [%x[Cpanel], #0x30]\n"
+      "uzp1 v17.2d, v15.2d, v18.2d\n"
+      "uzp2 v15.2d, v15.2d, v18.2d\n"
+      "str q9, [%x[Cpanel], #0x40]\n"
+      "uzp1 v18.2d, v16.2d, v19.2d\n"
+      "uzp2 v16.2d, v16.2d, v19.2d\n"
+      "str q10, [%x[Cpanel], #0x50]\n"
+      "uzp1 v19.2d, v20.2d, v23.2d\n"
+      "uzp2 v20.2d, v20.2d, v23.2d\n"
+      "str q13, [%x[Cpanel], #0x60]\n"
+      "uzp1 v23.2d, v21.2d, v24.2d\n"
+      "uzp2 v21.2d, v21.2d, v24.2d\n"
+      "str q17, [%x[Cpanel], #0x70]\n"
+      "uzp1 v24.2d, v22.2d, v25.2d\n"
+      "uzp2 v22.2d, v22.2d, v25.2d\n"
+      "str q18, [%x[Cpanel], #0x80]\n"
+      "uzp1 v25.2d, v26.2d, v29.2d\n"
+      "uzp2 v26.2d, v26.2d, v29.2d\n"
+      "str q14, [%x[Cpanel], #0x90]\n"
+      "uzp1 v29.2d, v27.2d, v30.2d\n"
+      "uzp2 v27.2d, v27.2d, v30.2d\n"
+      "str q15, [%x[Cpanel], #0xa0]\n"
+      "uzp1 v30.2d, v28.2d, v31.2d\n"
+      "uzp2 v28.2d, v28.2d, v31.2d\n"
+      "str q16, [%x[Cpanel], #0xb0]\n"
+      "str q19, [%x[Cpanel], #0xc0]\n"
+      "str q23, [%x[Cpanel], #0xd0]\n"
+      "str q24, [%x[Cpanel], #0xe0]\n"
+      "str q20, [%x[Cpanel], #0xf0]\n"
+      "str q21, [%x[Cpanel], #0x100]\n"
+      "str q22, [%x[Cpanel], #0x110]\n"
+      "str q25, [%x[Cpanel], #0x120]\n"
+      "str q29, [%x[Cpanel], #0x130]\n"
+      "str q30, [%x[Cpanel], #0x140]\n"
+      "str q26, [%x[Cpanel], #0x150]\n"
+      "str q27, [%x[Cpanel], #0x160]\n"
+      "str q28, [%x[Cpanel], #0x170]\n"
+      "add %x[Cpanel], %x[Cpanel], #0x180\n"
+      "bgt 2b\n"
+      "subs %x[ablocks], %x[ablocks], #0x1\n"
+      "bne 1b\n"
+      : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+      : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
+    );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
index ff69bc8f5..4cc3ed040 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
@@ -35,6 +35,7 @@ namespace arm_gemm
 {
 // Actual kernel implementations
 void a64_interleaved_s8s32_mmla_8x12( ARGLIST );
+void a64_interleaved_s8s32_mmla_8x12_a510( ARGLIST );
 
 class cls_a64_interleaved_s8s32_mmla_8x12
 {
@@ -91,7 +92,7 @@ public:
                 case CPUModel::A510:
                     return { 48.22, 2.49, 0.29 };
                 case CPUModel::V1:
-                    return { 116.76, 4.67, 0.60 };
+                    return { 75.54, 8.06, 0.63 };
             }
         }
 
@@ -100,9 +101,17 @@ public:
 
     // Default to the generic kernel
     kern_type kernel=a64_interleaved_s8s32_mmla_8x12;
-    cls_a64_interleaved_s8s32_mmla_8x12(const CPUInfo *)
+    cls_a64_interleaved_s8s32_mmla_8x12(const CPUInfo *ci)
     {
+        switch(ci->get_cpu_model()) {
+            default:
+                break;
+            case CPUModel::A510:
+                kernel=a64_interleaved_s8s32_mmla_8x12_a510;
+                break;
+        }
     }
+
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp
new file mode 100644
index 000000000..a4d8c0ace
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_interleaved_s8s32_mmla_8x12_a510(
+    const int8_t *Apanel, const int8_t *Bpanel,
+    int32_t *Cpanel, int ablocks, int bblocks, int K) {
+
+    struct KernelArgs {
+        size_t bblocks = {};
+        size_t K = {};
+        const int8_t *Bpanel = {};
+    } ka;
+
+    ka.bblocks = bblocks;
+    ka.K = (K/8) - 1;
+    ka.Bpanel = Bpanel;
+
+    __asm__ __volatile__(
+
+      "1:"  // Height loop
+      "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+      "mov x21, %x[Apanel]\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+      "2:"  // Width loop
+      "mov %x[Apanel], x21\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+      "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+      "ld1 { v4.16b }, [x20], #0x10\n"
+      "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+      "cmp x19, #0x2\n"
+      "movi v8.4s, #0x0\n"
+      "movi v9.4s, #0x0\n"
+      "ld1 { v5.16b }, [x20], #0x10\n"
+      "movi v10.4s, #0x0\n"
+      "movi v11.4s, #0x0\n"
+      "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+      "movi v12.4s, #0x0\n"
+      "movi v13.4s, #0x0\n"
+      "movi v14.4s, #0x0\n"
+      "movi v15.4s, #0x0\n"
+      "movi v16.4s, #0x0\n"
+      "movi v17.4s, #0x0\n"
+      "movi v18.4s, #0x0\n"
+      "movi v19.4s, #0x0\n"
+      "movi v20.4s, #0x0\n"
+      "movi v21.4s, #0x0\n"
+      "movi v22.4s, #0x0\n"
+      "movi v23.4s, #0x0\n"
+      "movi v24.4s, #0x0\n"
+      "movi v25.4s, #0x0\n"
+      "movi v26.4s, #0x0\n"
+      "movi v27.4s, #0x0\n"
+      "movi v28.4s, #0x0\n"
+      "movi v29.4s, #0x0\n"
+      "movi v30.4s, #0x0\n"
+      "movi v31.4s, #0x0\n"
+      "blt 4f\n"
+      "3:"  // main loop head
+      "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+      ".inst 0x4e84a408  // smmla v8.4s, v0.16b, v4.16b\n"
+      ".inst 0x4e84a42e  // smmla v14.4s, v1.16b, v4.16b\n"
+      "ldp q6, q7, [x20], #0x20\n"
+      ".inst 0x4e85a40b  // smmla v11.4s, v0.16b, v5.16b\n"
+      ".inst 0x4e85a431  // smmla v17.4s, v1.16b, v5.16b\n"
+      ".inst 0x4e84a454  // smmla v20.4s, v2.16b, v4.16b\n"
+      "sub x19, x19, #0x2\n"
+      ".inst 0x4e85a457  // smmla v23.4s, v2.16b, v5.16b\n"
+      ".inst 0x4e84a47a  // smmla v26.4s, v3.16b, v4.16b\n"
+      "cmp x19, #0x2\n"
+      ".inst 0x4e85a47d  // smmla v29.4s, v3.16b, v5.16b\n"
+      "ldp q4, q5, [x20], #0x20\n"
+      ".inst 0x4e86a409  // smmla v9.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e87a40c  // smmla v12.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e84a40a  // smmla v10.4s, v0.16b, v4.16b\n"
+      ".inst 0x4e85a40d  // smmla v13.4s, v0.16b, v5.16b\n"
+      "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+      ".inst 0x4e86a42f  // smmla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e86a455  // smmla v21.4s, v2.16b, v6.16b\n"
+      ".inst 0x4e86a47b  // smmla v27.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e87a432  // smmla v18.4s, v1.16b, v7.16b\n"
+      "ld1 { v6.16b }, [x20], #0x10\n"
+      ".inst 0x4e84a430  // smmla v16.4s, v1.16b, v4.16b\n"
+      ".inst 0x4e85a433  // smmla v19.4s, v1.16b, v5.16b\n"
+      "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+      ".inst 0x4e87a458  // smmla v24.4s, v2.16b, v7.16b\n"
+      ".inst 0x4e87a47e  // smmla v30.4s, v3.16b, v7.16b\n"
+      "ld1 { v7.16b }, [x20], #0x10\n"
+      ".inst 0x4e84a456  // smmla v22.4s, v2.16b, v4.16b\n"
+      ".inst 0x4e85a459  // smmla v25.4s, v2.16b, v5.16b\n"
+      "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+      ".inst 0x4e84a47c  // smmla v28.4s, v3.16b, v4.16b\n"
+      ".inst 0x4e85a47f  // smmla v31.4s, v3.16b, v5.16b\n"
+      "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+      "ldp q4, q5, [x20], #0x20\n"
+      ".inst 0x4e86a408  // smmla v8.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e86a42e  // smmla v14.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e87a431  // smmla v17.4s, v1.16b, v7.16b\n"
+      ".inst 0x4e86a454  // smmla v20.4s, v2.16b, v6.16b\n"
+      ".inst 0x4e87a457  // smmla v23.4s, v2.16b, v7.16b\n"
+      ".inst 0x4e86a47a  // smmla v26.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e87a47d  // smmla v29.4s, v3.16b, v7.16b\n"
+      "ldp q6, q7, [x20], #0x20\n"
+      ".inst 0x4e84a409  // smmla v9.4s, v0.16b, v4.16b\n"
+      ".inst 0x4e85a40c  // smmla v12.4s, v0.16b, v5.16b\n"
+      ".inst 0x4e86a40a  // smmla v10.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e87a40d  // smmla v13.4s, v0.16b, v7.16b\n"
+      "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+      ".inst 0x4e84a42f  // smmla v15.4s, v1.16b, v4.16b\n"
+      ".inst 0x4e84a455  // smmla v21.4s, v2.16b, v4.16b\n"
+      ".inst 0x4e84a47b  // smmla v27.4s, v3.16b, v4.16b\n"
+      ".inst 0x4e85a432  // smmla v18.4s, v1.16b, v5.16b\n"
+      "ld1 { v4.16b }, [x20], #0x10\n"
+      ".inst 0x4e86a430  // smmla v16.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e87a433  // smmla v19.4s, v1.16b, v7.16b\n"
+      "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+      ".inst 0x4e85a458  // smmla v24.4s, v2.16b, v5.16b\n"
+      ".inst 0x4e85a47e  // smmla v30.4s, v3.16b, v5.16b\n"
+      "ld1 { v5.16b }, [x20], #0x10\n"
+      ".inst 0x4e86a456  // smmla v22.4s, v2.16b, v6.16b\n"
+      ".inst 0x4e87a459  // smmla v25.4s, v2.16b, v7.16b\n"
+      "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+      ".inst 0x4e86a47c  // smmla v28.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e87a47f  // smmla v31.4s, v3.16b, v7.16b\n"
+      "bge 3b\n"
+      "4:"  // main loop skip
+      "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+      ".inst 0x4e84a408  // smmla v8.4s, v0.16b, v4.16b\n"
+      ".inst 0x4e84a42e  // smmla v14.4s, v1.16b, v4.16b\n"
+      "ldp q6, q7, [x20], #0x20\n"
+      ".inst 0x4e85a40b  // smmla v11.4s, v0.16b, v5.16b\n"
+      ".inst 0x4e85a431  // smmla v17.4s, v1.16b, v5.16b\n"
+      ".inst 0x4e84a454  // smmla v20.4s, v2.16b, v4.16b\n"
+      ".inst 0x4e85a457  // smmla v23.4s, v2.16b, v5.16b\n"
+      ".inst 0x4e84a47a  // smmla v26.4s, v3.16b, v4.16b\n"
+      ".inst 0x4e85a47d  // smmla v29.4s, v3.16b, v5.16b\n"
+      "ldp q4, q5, [x20], #0x20\n"
+      ".inst 0x4e86a409  // smmla v9.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e86a42f  // smmla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e86a455  // smmla v21.4s, v2.16b, v6.16b\n"
+      ".inst 0x4e86a47b  // smmla v27.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e87a40c  // smmla v12.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e84a40a  // smmla v10.4s, v0.16b, v4.16b\n"
+      ".inst 0x4e85a40d  // smmla v13.4s, v0.16b, v5.16b\n"
+      ".inst 0x4e87a432  // smmla v18.4s, v1.16b, v7.16b\n"
+      ".inst 0x4e84a430  // smmla v16.4s, v1.16b, v4.16b\n"
+      ".inst 0x4e85a433  // smmla v19.4s, v1.16b, v5.16b\n"
+      ".inst 0x4e87a458  // smmla v24.4s, v2.16b, v7.16b\n"
+      ".inst 0x4e87a47e  // smmla v30.4s, v3.16b, v7.16b\n"
+      ".inst 0x4e84a456  // smmla v22.4s, v2.16b, v4.16b\n"
+      ".inst 0x4e85a459  // smmla v25.4s, v2.16b, v5.16b\n"
+      ".inst 0x4e84a47c  // smmla v28.4s, v3.16b, v4.16b\n"
+      ".inst 0x4e85a47f  // smmla v31.4s, v3.16b, v5.16b\n"
+      "cbz x19, 5f\n"
+      "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+      "ld1 { v6.16b }, [x20], #0x10\n"
+      ".inst 0x4e86a408  // smmla v8.4s, v0.16b, v6.16b\n"
+      "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+      "ld1 { v7.16b }, [x20], #0x10\n"
+      ".inst 0x4e86a42e  // smmla v14.4s, v1.16b, v6.16b\n"
+      "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+      "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
+      "ldp q4, q5, [x20], #0x20\n"
+      ".inst 0x4e87a431  // smmla v17.4s, v1.16b, v7.16b\n"
+      ".inst 0x4e86a454  // smmla v20.4s, v2.16b, v6.16b\n"
+      ".inst 0x4e87a457  // smmla v23.4s, v2.16b, v7.16b\n"
+      ".inst 0x4e86a47a  // smmla v26.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e87a47d  // smmla v29.4s, v3.16b, v7.16b\n"
+      "ldp q6, q7, [x20], #0x20\n"
+      ".inst 0x4e84a409  // smmla v9.4s, v0.16b, v4.16b\n"
+      ".inst 0x4e84a42f  // smmla v15.4s, v1.16b, v4.16b\n"
+      ".inst 0x4e84a455  // smmla v21.4s, v2.16b, v4.16b\n"
+      ".inst 0x4e84a47b  // smmla v27.4s, v3.16b, v4.16b\n"
+      ".inst 0x4e85a40c  // smmla v12.4s, v0.16b, v5.16b\n"
+      ".inst 0x4e86a40a  // smmla v10.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e87a40d  // smmla v13.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e85a432  // smmla v18.4s, v1.16b, v5.16b\n"
+      ".inst 0x4e86a430  // smmla v16.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e87a433  // smmla v19.4s, v1.16b, v7.16b\n"
+      ".inst 0x4e85a458  // smmla v24.4s, v2.16b, v5.16b\n"
+      ".inst 0x4e85a47e  // smmla v30.4s, v3.16b, v5.16b\n"
+      ".inst 0x4e86a456  // smmla v22.4s, v2.16b, v6.16b\n"
+      ".inst 0x4e87a459  // smmla v25.4s, v2.16b, v7.16b\n"
+      ".inst 0x4e86a47c  // smmla v28.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e87a47f  // smmla v31.4s, v3.16b, v7.16b\n"
+      "5:"  // multiply loop done
+      "subs x22, x22, #0x1\n"
+      "uzp1 v4.2d, v8.2d, v11.2d\n"
+      "uzp2 v8.2d, v8.2d, v11.2d\n"
+      "uzp1 v11.2d, v9.2d, v12.2d\n"
+      "uzp2 v9.2d, v9.2d, v12.2d\n"
+      "str q4, [%x[Cpanel], #0x0]\n"
+      "uzp1 v12.2d, v10.2d, v13.2d\n"
+      "uzp2 v10.2d, v10.2d, v13.2d\n"
+      "str q11, [%x[Cpanel], #0x10]\n"
+      "str q12, [%x[Cpanel], #0x20]\n"
+      "uzp1 v13.2d, v14.2d, v17.2d\n"
+      "uzp2 v14.2d, v14.2d, v17.2d\n"
+      "str q8, [%x[Cpanel], #0x30]\n"
+      "uzp1 v17.2d, v15.2d, v18.2d\n"
+      "uzp2 v15.2d, v15.2d, v18.2d\n"
+      "str q9, [%x[Cpanel], #0x40]\n"
+      "uzp1 v18.2d, v16.2d, v19.2d\n"
+      "uzp2 v16.2d, v16.2d, v19.2d\n"
+      "str q10, [%x[Cpanel], #0x50]\n"
+      "uzp1 v19.2d, v20.2d, v23.2d\n"
+      "uzp2 v20.2d, v20.2d, v23.2d\n"
+      "str q13, [%x[Cpanel], #0x60]\n"
+      "uzp1 v23.2d, v21.2d, v24.2d\n"
+      "uzp2 v21.2d, v21.2d, v24.2d\n"
+      "str q17, [%x[Cpanel], #0x70]\n"
+      "uzp1 v24.2d, v22.2d, v25.2d\n"
+      "uzp2 v22.2d, v22.2d, v25.2d\n"
+      "str q18, [%x[Cpanel], #0x80]\n"
+      "uzp1 v25.2d, v26.2d, v29.2d\n"
+      "uzp2 v26.2d, v26.2d, v29.2d\n"
+      "str q14, [%x[Cpanel], #0x90]\n"
+      "uzp1 v29.2d, v27.2d, v30.2d\n"
+      "uzp2 v27.2d, v27.2d, v30.2d\n"
+      "str q15, [%x[Cpanel], #0xa0]\n"
+      "uzp1 v30.2d, v28.2d, v31.2d\n"
+      "uzp2 v28.2d, v28.2d, v31.2d\n"
+      "str q16, [%x[Cpanel], #0xb0]\n"
+      "str q19, [%x[Cpanel], #0xc0]\n"
+      "str q23, [%x[Cpanel], #0xd0]\n"
+      "str q24, [%x[Cpanel], #0xe0]\n"
+      "str q20, [%x[Cpanel], #0xf0]\n"
+      "str q21, [%x[Cpanel], #0x100]\n"
+      "str q22, [%x[Cpanel], #0x110]\n"
+      "str q25, [%x[Cpanel], #0x120]\n"
+      "str q29, [%x[Cpanel], #0x130]\n"
+      "str q30, [%x[Cpanel], #0x140]\n"
+      "str q26, [%x[Cpanel], #0x150]\n"
+      "str q27, [%x[Cpanel], #0x160]\n"
+      "str q28, [%x[Cpanel], #0x170]\n"
+      "add %x[Cpanel], %x[Cpanel], #0x180\n"
+      "bgt 2b\n"
+      "subs %x[ablocks], %x[ablocks], #0x1\n"
+      "bne 1b\n"
+      : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+      : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
+    );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
index f492a474a..fa93c1d90 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
@@ -35,6 +35,7 @@ namespace arm_gemm
 {
 // Actual kernel implementations
 void a64_interleaved_u8u32_mmla_8x12( ARGLIST );
+void a64_interleaved_u8u32_mmla_8x12_a510( ARGLIST );
 
 class cls_a64_interleaved_u8u32_mmla_8x12
 {
@@ -91,7 +92,7 @@ public:
                 case CPUModel::A510:
                     return { 47.66, 2.47, 0.29 };
                 case CPUModel::V1:
-                    return { 111.60, 4.95, 0.66 };
+                    return { 75.54, 8.06, 0.63 };
             }
         }
 
@@ -100,8 +101,15 @@ public:
 
     // Default to the generic kernel
     kern_type kernel=a64_interleaved_u8u32_mmla_8x12;
-    cls_a64_interleaved_u8u32_mmla_8x12(const CPUInfo *)
+    cls_a64_interleaved_u8u32_mmla_8x12(const CPUInfo *ci)
     {
+        switch(ci->get_cpu_model()) {
+            default:
+                break;
+            case CPUModel::A510:
+                kernel=a64_interleaved_u8u32_mmla_8x12_a510;
+                break;
+        }
     }
 };
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp
new file mode 100644
index 000000000..3fe1a9bd0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_interleaved_u8u32_mmla_8x12_a510(
+    const uint8_t *Apanel, const uint8_t *Bpanel,
+    uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+
+    struct KernelArgs {
+        size_t bblocks = {};
+        size_t K = {};
+        const uint8_t *Bpanel = {};
+    } ka;
+
+    ka.bblocks = bblocks;
+    ka.K = (K/8) - 1;
+    ka.Bpanel = Bpanel;
+
+    __asm__ __volatile__(
+
+      "1:"  // Height loop
+      "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+      "mov x21, %x[Apanel]\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+      "2:"  // Width loop
+      "mov %x[Apanel], x21\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+      "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+      "ld1 { v4.16b }, [x20], #0x10\n"
+      "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+      "cmp x19, #0x2\n"
+      "movi v8.4s, #0x0\n"
+      "movi v9.4s, #0x0\n"
+      "ld1 { v5.16b }, [x20], #0x10\n"
+      "movi v10.4s, #0x0\n"
+      "movi v11.4s, #0x0\n"
+      "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+      "movi v12.4s, #0x0\n"
+      "movi v13.4s, #0x0\n"
+      "movi v14.4s, #0x0\n"
+      "movi v15.4s, #0x0\n"
+      "movi v16.4s, #0x0\n"
+      "movi v17.4s, #0x0\n"
+      "movi v18.4s, #0x0\n"
+      "movi v19.4s, #0x0\n"
+      "movi v20.4s, #0x0\n"
+      "movi v21.4s, #0x0\n"
+      "movi v22.4s, #0x0\n"
+      "movi v23.4s, #0x0\n"
+      "movi v24.4s, #0x0\n"
+      "movi v25.4s, #0x0\n"
+      "movi v26.4s, #0x0\n"
+      "movi v27.4s, #0x0\n"
+      "movi v28.4s, #0x0\n"
+      "movi v29.4s, #0x0\n"
+      "movi v30.4s, #0x0\n"
+      "movi v31.4s, #0x0\n"
+      "blt 4f\n"
+      "3:"  // main loop head
+      "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+      ".inst 0x6e84a408  // ummla v8.4s, v0.16b, v4.16b\n"
+      ".inst 0x6e84a42e  // ummla v14.4s, v1.16b, v4.16b\n"
+      "ldp q6, q7, [x20], #0x20\n"
+      ".inst 0x6e85a40b  // ummla v11.4s, v0.16b, v5.16b\n"
+      ".inst 0x6e85a431  // ummla v17.4s, v1.16b, v5.16b\n"
+      ".inst 0x6e84a454  // ummla v20.4s, v2.16b, v4.16b\n"
+      "sub x19, x19, #0x2\n"
+      ".inst 0x6e85a457  // ummla v23.4s, v2.16b, v5.16b\n"
+      ".inst 0x6e84a47a  // ummla v26.4s, v3.16b, v4.16b\n"
+      "cmp x19, #0x2\n"
+      ".inst 0x6e85a47d  // ummla v29.4s, v3.16b, v5.16b\n"
+      "ldp q4, q5, [x20], #0x20\n"
+      ".inst 0x6e86a409  // ummla v9.4s, v0.16b, v6.16b\n"
+      ".inst 0x6e87a40c  // ummla v12.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e84a40a  // ummla v10.4s, v0.16b, v4.16b\n"
+      ".inst 0x6e85a40d  // ummla v13.4s, v0.16b, v5.16b\n"
+      "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+      ".inst 0x6e86a42f  // ummla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e86a455  // ummla v21.4s, v2.16b, v6.16b\n"
+      ".inst 0x6e86a47b  // ummla v27.4s, v3.16b, v6.16b\n"
+      ".inst 0x6e87a432  // ummla v18.4s, v1.16b, v7.16b\n"
+      "ld1 { v6.16b }, [x20], #0x10\n"
+      ".inst 0x6e84a430  // ummla v16.4s, v1.16b, v4.16b\n"
+      ".inst 0x6e85a433  // ummla v19.4s, v1.16b, v5.16b\n"
+      "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+      ".inst 0x6e87a458  // ummla v24.4s, v2.16b, v7.16b\n"
+      ".inst 0x6e87a47e  // ummla v30.4s, v3.16b, v7.16b\n"
+      "ld1 { v7.16b }, [x20], #0x10\n"
+      ".inst 0x6e84a456  // ummla v22.4s, v2.16b, v4.16b\n"
+      ".inst 0x6e85a459  // ummla v25.4s, v2.16b, v5.16b\n"
+      "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+      ".inst 0x6e84a47c  // ummla v28.4s, v3.16b, v4.16b\n"
+      ".inst 0x6e85a47f  // ummla v31.4s, v3.16b, v5.16b\n"
+      "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+      "ldp q4, q5, [x20], #0x20\n"
+      ".inst 0x6e86a408  // ummla v8.4s, v0.16b, v6.16b\n"
+      ".inst 0x6e86a42e  // ummla v14.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e87a40b  // ummla v11.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e87a431  // ummla v17.4s, v1.16b, v7.16b\n"
+      ".inst 0x6e86a454  // ummla v20.4s, v2.16b, v6.16b\n"
+      ".inst 0x6e87a457  // ummla v23.4s, v2.16b, v7.16b\n"
+      ".inst 0x6e86a47a  // ummla v26.4s, v3.16b, v6.16b\n"
+      ".inst 0x6e87a47d  // ummla v29.4s, v3.16b, v7.16b\n"
+      "ldp q6, q7, [x20], #0x20\n"
+      ".inst 0x6e84a409  // ummla v9.4s, v0.16b, v4.16b\n"
+      ".inst 0x6e85a40c  // ummla v12.4s, v0.16b, v5.16b\n"
+      ".inst 0x6e86a40a  // ummla v10.4s, v0.16b, v6.16b\n"
+      ".inst 0x6e87a40d  // ummla v13.4s, v0.16b, v7.16b\n"
+      "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+      ".inst 0x6e84a42f  // ummla v15.4s, v1.16b, v4.16b\n"
+      ".inst 0x6e84a455  // ummla v21.4s, v2.16b, v4.16b\n"
+      ".inst 0x6e84a47b  // ummla v27.4s, v3.16b, v4.16b\n"
+      ".inst 0x6e85a432  // ummla v18.4s, v1.16b, v5.16b\n"
+      "ld1 { v4.16b }, [x20], #0x10\n"
+      ".inst 0x6e86a430  // ummla v16.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e87a433  // ummla v19.4s, v1.16b, v7.16b\n"
+      "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+      ".inst 0x6e85a458  // ummla v24.4s, v2.16b, v5.16b\n"
+      ".inst 0x6e85a47e  // ummla v30.4s, v3.16b, v5.16b\n"
+      "ld1 { v5.16b }, [x20], #0x10\n"
+      ".inst 0x6e86a456  // ummla v22.4s, v2.16b, v6.16b\n"
+      ".inst 0x6e87a459  // ummla v25.4s, v2.16b, v7.16b\n"
+      "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+      ".inst 0x6e86a47c  // ummla v28.4s, v3.16b, v6.16b\n"
+      ".inst 0x6e87a47f  // ummla v31.4s, v3.16b, v7.16b\n"
+      "bge 3b\n"
+      "4:"  // main loop skip
+      "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+      ".inst 0x6e84a408  // ummla v8.4s, v0.16b, v4.16b\n"
+      ".inst 0x6e84a42e  // ummla v14.4s, v1.16b, v4.16b\n"
+      "ldp q6, q7, [x20], #0x20\n"
+      ".inst 0x6e85a40b  // ummla v11.4s, v0.16b, v5.16b\n"
+      ".inst 0x6e85a431  // ummla v17.4s, v1.16b, v5.16b\n"
+      ".inst 0x6e84a454  // ummla v20.4s, v2.16b, v4.16b\n"
+      ".inst 0x6e85a457  // ummla v23.4s, v2.16b, v5.16b\n"
+      ".inst 0x6e84a47a  // ummla v26.4s, v3.16b, v4.16b\n"
+      ".inst 0x6e85a47d  // ummla v29.4s, v3.16b, v5.16b\n"
+      "ldp q4, q5, [x20], #0x20\n"
+      ".inst 0x6e86a409  // ummla v9.4s, v0.16b, v6.16b\n"
+      ".inst 0x6e86a42f  // ummla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e86a455  // ummla v21.4s, v2.16b, v6.16b\n"
+      ".inst 0x6e86a47b  // ummla v27.4s, v3.16b, v6.16b\n"
+      ".inst 0x6e87a40c  // ummla v12.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e84a40a  // ummla v10.4s, v0.16b, v4.16b\n"
+      ".inst 0x6e85a40d  // ummla v13.4s, v0.16b, v5.16b\n"
+      ".inst 0x6e87a432  // ummla v18.4s, v1.16b, v7.16b\n"
+      ".inst 0x6e84a430  // ummla v16.4s, v1.16b, v4.16b\n"
+      ".inst 0x6e85a433  // ummla v19.4s, v1.16b, v5.16b\n"
+      ".inst 0x6e87a458  // ummla v24.4s, v2.16b, v7.16b\n"
+      ".inst 0x6e87a47e  // ummla v30.4s, v3.16b, v7.16b\n"
+      ".inst 0x6e84a456  // ummla v22.4s, v2.16b, v4.16b\n"
+      ".inst 0x6e85a459  // ummla v25.4s, v2.16b, v5.16b\n"
+      ".inst 0x6e84a47c  // ummla v28.4s, v3.16b, v4.16b\n"
+      ".inst 0x6e85a47f  // ummla v31.4s, v3.16b, v5.16b\n"
+      "cbz x19, 5f\n"
+      "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+      "ld1 { v6.16b }, [x20], #0x10\n"
+      ".inst 0x6e86a408  // ummla v8.4s, v0.16b, v6.16b\n"
+      "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+      "ld1 { v7.16b }, [x20], #0x10\n"
+      ".inst 0x6e86a42e  // ummla v14.4s, v1.16b, v6.16b\n"
+      "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+      "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+      ".inst 0x6e87a40b  // ummla v11.4s, v0.16b, v7.16b\n"
+      "ldp q4, q5, [x20], #0x20\n"
+      ".inst 0x6e87a431  // ummla v17.4s, v1.16b, v7.16b\n"
+      ".inst 0x6e86a454  // ummla v20.4s, v2.16b, v6.16b\n"
+      ".inst 0x6e87a457  // ummla v23.4s, v2.16b, v7.16b\n"
+      ".inst 0x6e86a47a  // ummla v26.4s, v3.16b, v6.16b\n"
+      ".inst 0x6e87a47d  // ummla v29.4s, v3.16b, v7.16b\n"
+      "ldp q6, q7, [x20], #0x20\n"
+      ".inst 0x6e84a409  // ummla v9.4s, v0.16b, v4.16b\n"
+      ".inst 0x6e84a42f  // ummla v15.4s, v1.16b, v4.16b\n"
+      ".inst 0x6e84a455  // ummla v21.4s, v2.16b, v4.16b\n"
+      ".inst 0x6e84a47b  // ummla v27.4s, v3.16b, v4.16b\n"
+      ".inst 0x6e85a40c  // ummla v12.4s, v0.16b, v5.16b\n"
+      ".inst 0x6e86a40a  // ummla v10.4s, v0.16b, v6.16b\n"
+      ".inst 0x6e87a40d  // ummla v13.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e85a432  // ummla v18.4s, v1.16b, v5.16b\n"
+      ".inst 0x6e86a430  // ummla v16.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e87a433  // ummla v19.4s, v1.16b, v7.16b\n"
+      ".inst 0x6e85a458  // ummla v24.4s, v2.16b, v5.16b\n"
+      ".inst 0x6e85a47e  // ummla v30.4s, v3.16b, v5.16b\n"
+      ".inst 0x6e86a456  // ummla v22.4s, v2.16b, v6.16b\n"
+      ".inst 0x6e87a459  // ummla v25.4s, v2.16b, v7.16b\n"
+      ".inst 0x6e86a47c  // ummla v28.4s, v3.16b, v6.16b\n"
+      ".inst 0x6e87a47f  // ummla v31.4s, v3.16b, v7.16b\n"
+      "5:"  // multiply loop done
+      "subs x22, x22, #0x1\n"
+      "uzp1 v4.2d, v8.2d, v11.2d\n"
+      "uzp2 v8.2d, v8.2d, v11.2d\n"
+      "uzp1 v11.2d, v9.2d, v12.2d\n"
+      "uzp2 v9.2d, v9.2d, v12.2d\n"
+      "str q4, [%x[Cpanel], #0x0]\n"
+      "uzp1 v12.2d, v10.2d, v13.2d\n"
+      "uzp2 v10.2d, v10.2d, v13.2d\n"
+      "str q11, [%x[Cpanel], #0x10]\n"
+      "str q12, [%x[Cpanel], #0x20]\n"
+      "uzp1 v13.2d, v14.2d, v17.2d\n"
+      "uzp2 v14.2d, v14.2d, v17.2d\n"
+      "str q8, [%x[Cpanel], #0x30]\n"
+      "uzp1 v17.2d, v15.2d, v18.2d\n"
+      "uzp2 v15.2d, v15.2d, v18.2d\n"
+      "str q9, [%x[Cpanel], #0x40]\n"
+      "uzp1 v18.2d, v16.2d, v19.2d\n"
+      "uzp2 v16.2d, v16.2d, v19.2d\n"
+      "str q10, [%x[Cpanel], #0x50]\n"
+      "uzp1 v19.2d, v20.2d, v23.2d\n"
+      "uzp2 v20.2d, v20.2d, v23.2d\n"
+      "str q13, [%x[Cpanel], #0x60]\n"
+      "uzp1 v23.2d, v21.2d, v24.2d\n"
+      "uzp2 v21.2d, v21.2d, v24.2d\n"
+      "str q17, [%x[Cpanel], #0x70]\n"
+      "uzp1 v24.2d, v22.2d, v25.2d\n"
+      "uzp2 v22.2d, v22.2d, v25.2d\n"
+      "str q18, [%x[Cpanel], #0x80]\n"
+      "uzp1 v25.2d, v26.2d, v29.2d\n"
+      "uzp2 v26.2d, v26.2d, v29.2d\n"
+      "str q14, [%x[Cpanel], #0x90]\n"
+      "uzp1 v29.2d, v27.2d, v30.2d\n"
+      "uzp2 v27.2d, v27.2d, v30.2d\n"
+      "str q15, [%x[Cpanel], #0xa0]\n"
+      "uzp1 v30.2d, v28.2d, v31.2d\n"
+      "uzp2 v28.2d, v28.2d, v31.2d\n"
+      "str q16, [%x[Cpanel], #0xb0]\n"
+      "str q19, [%x[Cpanel], #0xc0]\n"
+      "str q23, [%x[Cpanel], #0xd0]\n"
+      "str q24, [%x[Cpanel], #0xe0]\n"
+      "str q20, [%x[Cpanel], #0xf0]\n"
+      "str q21, [%x[Cpanel], #0x100]\n"
+      "str q22, [%x[Cpanel], #0x110]\n"
+      "str q25, [%x[Cpanel], #0x120]\n"
+      "str q29, [%x[Cpanel], #0x130]\n"
+      "str q30, [%x[Cpanel], #0x140]\n"
+      "str q26, [%x[Cpanel], #0x150]\n"
+      "str q27, [%x[Cpanel], #0x160]\n"
+      "str q28, [%x[Cpanel], #0x170]\n"
+      "add %x[Cpanel], %x[Cpanel], #0x180\n"
+      "bgt 2b\n"
+      "subs %x[ablocks], %x[ablocks], #0x1\n"
+      "bne 1b\n"
+      : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+      : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
+    );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp
index 30e265fbc..ab175a375 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp
@@ -83,7 +83,7 @@ public:
                 case CPUModel::A510:
                     return { 5.42 };
                 case CPUModel::V1:
-                    return { 34.56 };
+                    return { 20.83 };
             }
         }
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp
index 61c7ad17e..b7c9aca9d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp
@@ -83,7 +83,7 @@ public:
                 case CPUModel::A510:
                     return { 5.31 };
                 case CPUModel::V1:
-                    return { 28.93 };
+                    return { 17.32 };
             }
         }
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
index b8ca7c545..28057aa96 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
@@ -94,7 +94,7 @@ public:
                 case CPUModel::A510:
                     return { 22.77, 3.90, 0.47 };
                 case CPUModel::V1:
-                    return { 62.97, 19.14, 0.92 };
+                    return { 48.09, 16.24, 0.83 };
             }
         }
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp
index b88ef14f2..c08977570 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp
@@ -94,7 +94,7 @@ public:
                 case CPUModel::A510:
                     return { 23.87, 3.89, 0.37 };
                 case CPUModel::V1:
-                    return { 107.63, 19.24, 0.92 };
+                    return { 75.14, 15.87, 0.83 };
             }
         }
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
index d870711c6..901cc6d63 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
@@ -94,7 +94,7 @@ public:
                 case CPUModel::A510:
                     return { 22.75, 3.90, 0.47 };
                 case CPUModel::V1:
-                    return { 62.97, 19.27, 0.92 };
+                    return { 48.09, 16.24, 0.83 };
             }
         }
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp
index 7f8eadc52..c0d089278 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp
@@ -94,7 +94,7 @@ public:
                 case CPUModel::A510:
                     return { 26.80, 3.89, 0.47 };
                 case CPUModel::V1:
-                    return { 108.33, 18.66, 0.92 };
+                    return { 75.14, 15.87, 0.83 };
             }
         }
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
index fa44bdbd3..fc91dd71a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
@@ -80,7 +80,7 @@ public:
                 case CPUModel::A510:
                     return { 7.78, 4.01, 2.43 };
                 case CPUModel::V1:
-                    return { 62.50, 5.09, 11.32 };
+                    return { 47.63, 5.11, 6.80 };
             }
         }
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
index 1924b2cf7..0d707b039 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
@@ -89,7 +89,7 @@ public:
                 default:
                     return { 31.67, 3.57, 0.50 };
                 case CPUModel::V1:
-                    return { 63.35, 4.76, 0.77 };
+                    return { 52.24, 7.49, 0.80 };
                 case CPUModel::A510:
                     return { 27.47, 1.70, 0.28 };
             }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
index bd1764bb7..4e65296f8 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
@@ -89,7 +89,7 @@ public:
                 default:
                     return { 61.97, 3.64, 0.50 };
                 case CPUModel::V1:
-                    return { 123.84, 4.93, 0.76 };
+                    return {  95.28, 7.99, 0.79 };
                 case CPUModel::A510:
                     return { 43.36, 1.86, 0.28 };
             }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
index f66a9bf51..0afcdd2ce 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
@@ -91,7 +91,7 @@ public:
                 case CPUModel::A510:
                     return { 27.45, 1.65, 0.28 };
                 case CPUModel::V1:
-                    return { 63.35, 4.96, 0.77 };
+                    return { 52.24, 7.49, 0.80 };
             }
         }
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
index b530202bd..58d21d6c4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
@@ -91,7 +91,7 @@ public:
                 case CPUModel::A510:
                     return { 38.02, 1.85, 0.28 };
                 case CPUModel::V1:
-                    return { 123.84, 4.98, 0.76 };
+                    return { 95.28, 7.99, 0.79 };
             }
         }
 
diff --git a/src/core/NEON/kernels/arm_gemm/transform.cpp b/src/core/NEON/kernels/arm_gemm/transform.cpp
index 60376ab80..c6a3bc0ed 100644
--- a/src/core/NEON/kernels/arm_gemm/transform.cpp
+++ b/src/core/NEON/kernels/arm_gemm/transform.cpp
@@ -25,7 +25,9 @@
 
 #include "bfloat.hpp"
 
+#if !defined(__OpenBSD__)
 #include <alloca.h>
+#endif /* !defined(__OpenBSD__) */
 
 namespace arm_gemm {
 
diff --git a/src/core/NEON/kernels/assembly/depthwise.hpp b/src/core/NEON/kernels/assembly/depthwise.hpp
index eadf48d00..9262ea05a 100644
--- a/src/core/NEON/kernels/assembly/depthwise.hpp
+++ b/src/core/NEON/kernels/assembly/depthwise.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -78,10 +78,20 @@ struct DepthwiseArgs
 template <typename TInput, typename TWeight, typename TOutput>
 class DepthwiseCommon : public IDepthwiseCommon
 {
+private:
+    std::string _name{};
+
 protected:
     const DepthwiseArgs m_args; // Copy of arguments
-
 public:
+    std::string name() const
+    {
+        return _name;
+    }
+    void set_name(const std::string &n)
+    {
+        _name = n;
+    }
     DepthwiseCommon(const DepthwiseArgs &args)
         : m_args(args) {};
     DepthwiseCommon(DepthwiseCommon &) = delete;