summaryrefslogtreecommitdiff
path: root/src/core/NEON
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/NEON')
-rw-r--r--src/core/NEON/NEKernels.h3
-rw-r--r--src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp200
-rw-r--r--src/core/NEON/kernels/NEBoundingBoxTransformKernel.h5
-rw-r--r--src/core/NEON/kernels/NECropKernel.cpp215
-rw-r--r--src/core/NEON/kernels/NECropKernel.h4
-rw-r--r--src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp154
-rw-r--r--src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h5
-rw-r--r--src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp190
-rw-r--r--src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h15
-rw-r--r--src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp109
-rw-r--r--src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h9
-rw-r--r--src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp150
-rw-r--r--src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h4
-rw-r--r--src/core/NEON/kernels/NEPadLayerKernel.cpp9
-rw-r--r--src/core/NEON/kernels/NEROIAlignLayerKernel.cpp382
-rw-r--r--src/core/NEON/kernels/NEROIAlignLayerKernel.h5
-rw-r--r--src/core/NEON/kernels/NERangeKernel.cpp152
-rw-r--r--src/core/NEON/kernels/NERangeKernel.h11
-rw-r--r--src/core/NEON/kernels/NERemapKernel.cpp326
-rw-r--r--src/core/NEON/kernels/NERemapKernel.h86
-rw-r--r--src/core/NEON/kernels/NESelectKernel.cpp260
-rw-r--r--src/core/NEON/kernels/NESelectKernel.h22
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp9
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp24
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp12
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp275
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp13
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp275
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp12
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp275
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/transform.cpp2
-rw-r--r--src/core/NEON/kernels/assembly/depthwise.hpp14
53 files changed, 1570 insertions, 1699 deletions
diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h
index af301c8d1..cd01659c0 100644
--- a/src/core/NEON/NEKernels.h
+++ b/src/core/NEON/NEKernels.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -55,7 +55,6 @@
#include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h"
#include "src/core/NEON/kernels/NERangeKernel.h"
#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
-#include "src/core/NEON/kernels/NERemapKernel.h"
#include "src/core/NEON/kernels/NEReorgLayerKernel.h"
#include "src/core/NEON/kernels/NEReverseKernel.h"
#include "src/core/NEON/kernels/NESelectKernel.h"
diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
index 1e0a1742f..69bfd56ce 100644
--- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
+++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,8 +28,10 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Window.h"
#include "src/core/CPP/Validate.h"
+#include "src/core/common/Registrars.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/boundingboxtransform/list.h"
#include <arm_neon.h>
@@ -37,6 +39,62 @@ namespace arm_compute
{
namespace
{
+struct BoundingBoxTransformSelectorData
+{
+ DataType dt;
+};
+
+using BoundingBoxTransformSelctorPtr = std::add_pointer<bool(const BoundingBoxTransformSelectorData &data)>::type;
+using BoundingBoxTransformUKernelPtr = std::add_pointer<void(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)>::type;
+
+struct BoundingBoxTransformKernel
+{
+ const char *name;
+ const BoundingBoxTransformSelctorPtr is_selected;
+ BoundingBoxTransformUKernelPtr ukernel;
+};
+
+static const BoundingBoxTransformKernel available_kernels[] =
+{
+ {
+ "fp32_neon_boundingboxtransform",
+ [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_boundingboxtransform)
+ },
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ {
+ "fp16_neon_boundingboxtransform",
+ [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_boundingboxtransform)
+ },
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if defined(ARM_COMPUTE_ENABLE_NEON)
+ {
+ "qu16_neon_boundingboxtransform",
+ [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::QASYMM16; },
+ REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_boundingboxtransform)
+ },
+#endif //defined(ARM_COMPUTE_ENABLE_NEON)
+};
+
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel else nullptr
+ */
+const BoundingBoxTransformKernel *get_implementation(const BoundingBoxTransformSelectorData &data)
+{
+ for(const auto &uk : available_kernels)
+ {
+ if(uk.is_selected(data))
+ {
+ return &uk;
+ }
+ }
+ return nullptr;
+}
+
Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
@@ -112,145 +170,15 @@ Status NEBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, const IT
return Status{};
}
-template <>
-void NEBoundingBoxTransformKernel::internal_run<uint16_t>(const Window &window)
-{
- const size_t num_classes = _deltas->info()->tensor_shape()[0] >> 2;
- const size_t deltas_width = _deltas->info()->tensor_shape()[0];
- const int img_h = std::floor(_bbinfo.img_height() / _bbinfo.scale() + 0.5f);
- const int img_w = std::floor(_bbinfo.img_width() / _bbinfo.scale() + 0.5f);
-
- const auto scale_after = (_bbinfo.apply_scale() ? _bbinfo.scale() : 1.f);
- const auto scale_before = _bbinfo.scale();
- const auto offset = (_bbinfo.correct_transform_coords() ? 1.f : 0.f);
-
- auto pred_ptr = reinterpret_cast<uint16_t *>(_pred_boxes->buffer() + _pred_boxes->info()->offset_first_element_in_bytes());
- auto delta_ptr = reinterpret_cast<uint8_t *>(_deltas->buffer() + _deltas->info()->offset_first_element_in_bytes());
-
- const auto boxes_qinfo = _boxes->info()->quantization_info().uniform();
- const auto deltas_qinfo = _deltas->info()->quantization_info().uniform();
- const auto pred_qinfo = _pred_boxes->info()->quantization_info().uniform();
-
- Iterator box_it(_boxes, window);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto ptr = reinterpret_cast<uint16_t *>(box_it.ptr());
- const auto b0 = dequantize_qasymm16(*ptr, boxes_qinfo);
- const auto b1 = dequantize_qasymm16(*(ptr + 1), boxes_qinfo);
- const auto b2 = dequantize_qasymm16(*(ptr + 2), boxes_qinfo);
- const auto b3 = dequantize_qasymm16(*(ptr + 3), boxes_qinfo);
- const float width = (b2 / scale_before) - (b0 / scale_before) + 1.f;
- const float height = (b3 / scale_before) - (b1 / scale_before) + 1.f;
- const float ctr_x = (b0 / scale_before) + 0.5f * width;
- const float ctr_y = (b1 / scale_before) + 0.5f * height;
- for(size_t j = 0; j < num_classes; ++j)
- {
- // Extract deltas
- const size_t delta_id = id.y() * deltas_width + 4u * j;
- const float dx = dequantize_qasymm8(delta_ptr[delta_id], deltas_qinfo) / _bbinfo.weights()[0];
- const float dy = dequantize_qasymm8(delta_ptr[delta_id + 1], deltas_qinfo) / _bbinfo.weights()[1];
- float dw = dequantize_qasymm8(delta_ptr[delta_id + 2], deltas_qinfo) / _bbinfo.weights()[2];
- float dh = dequantize_qasymm8(delta_ptr[delta_id + 3], deltas_qinfo) / _bbinfo.weights()[3];
- // Clip dw and dh
- dw = std::min(dw, _bbinfo.bbox_xform_clip());
- dh = std::min(dh, _bbinfo.bbox_xform_clip());
- // Determine the predictions
- const float pred_ctr_x = dx * width + ctr_x;
- const float pred_ctr_y = dy * height + ctr_y;
- const float pred_w = std::exp(dw) * width;
- const float pred_h = std::exp(dh) * height;
- // Store the prediction into the output tensor
- pred_ptr[delta_id] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_x - 0.5f * pred_w, 0.f, img_w - 1.f), pred_qinfo);
- pred_ptr[delta_id + 1] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_y - 0.5f * pred_h, 0.f, img_h - 1.f), pred_qinfo);
- pred_ptr[delta_id + 2] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_x + 0.5f * pred_w - offset, 0.f, img_w - 1.f), pred_qinfo);
- pred_ptr[delta_id + 3] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_y + 0.5f * pred_h - offset, 0.f, img_h - 1.f), pred_qinfo);
- }
- },
- box_it);
-}
-
-template <typename T>
-void NEBoundingBoxTransformKernel::internal_run(const Window &window)
-{
- const size_t num_classes = _deltas->info()->tensor_shape()[0] >> 2;
- const size_t deltas_width = _deltas->info()->tensor_shape()[0];
- const int img_h = std::floor(_bbinfo.img_height() / _bbinfo.scale() + 0.5f);
- const int img_w = std::floor(_bbinfo.img_width() / _bbinfo.scale() + 0.5f);
-
- const auto scale_after = (_bbinfo.apply_scale() ? T(_bbinfo.scale()) : T(1));
- const auto scale_before = T(_bbinfo.scale());
- ARM_COMPUTE_ERROR_ON(scale_before <= 0);
- const auto offset = (_bbinfo.correct_transform_coords() ? T(1.f) : T(0.f));
-
- auto pred_ptr = reinterpret_cast<T *>(_pred_boxes->buffer() + _pred_boxes->info()->offset_first_element_in_bytes());
- auto delta_ptr = reinterpret_cast<T *>(_deltas->buffer() + _deltas->info()->offset_first_element_in_bytes());
-
- Iterator box_it(_boxes, window);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto ptr = reinterpret_cast<T *>(box_it.ptr());
- const auto b0 = *ptr;
- const auto b1 = *(ptr + 1);
- const auto b2 = *(ptr + 2);
- const auto b3 = *(ptr + 3);
- const T width = (b2 / scale_before) - (b0 / scale_before) + T(1.f);
- const T height = (b3 / scale_before) - (b1 / scale_before) + T(1.f);
- const T ctr_x = (b0 / scale_before) + T(0.5f) * width;
- const T ctr_y = (b1 / scale_before) + T(0.5f) * height;
- for(size_t j = 0; j < num_classes; ++j)
- {
- // Extract deltas
- const size_t delta_id = id.y() * deltas_width + 4u * j;
- const T dx = delta_ptr[delta_id] / T(_bbinfo.weights()[0]);
- const T dy = delta_ptr[delta_id + 1] / T(_bbinfo.weights()[1]);
- T dw = delta_ptr[delta_id + 2] / T(_bbinfo.weights()[2]);
- T dh = delta_ptr[delta_id + 3] / T(_bbinfo.weights()[3]);
- // Clip dw and dh
- dw = std::min(dw, T(_bbinfo.bbox_xform_clip()));
- dh = std::min(dh, T(_bbinfo.bbox_xform_clip()));
- // Determine the predictions
- const T pred_ctr_x = dx * width + ctr_x;
- const T pred_ctr_y = dy * height + ctr_y;
- const T pred_w = std::exp(dw) * width;
- const T pred_h = std::exp(dh) * height;
- // Store the prediction into the output tensor
- pred_ptr[delta_id] = scale_after * utility::clamp<T>(pred_ctr_x - T(0.5f) * pred_w, T(0), T(img_w - 1));
- pred_ptr[delta_id + 1] = scale_after * utility::clamp<T>(pred_ctr_y - T(0.5f) * pred_h, T(0), T(img_h - 1));
- pred_ptr[delta_id + 2] = scale_after * utility::clamp<T>(pred_ctr_x + T(0.5f) * pred_w - offset, T(0), T(img_w - 1));
- pred_ptr[delta_id + 3] = scale_after * utility::clamp<T>(pred_ctr_y + T(0.5f) * pred_h - offset, T(0), T(img_h - 1));
- }
- },
- box_it);
-}
-
void NEBoundingBoxTransformKernel::run(const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- switch(_boxes->info()->data_type())
- {
- case DataType::F32:
- {
- internal_run<float>(window);
- break;
- }
- case DataType::QASYMM16:
- {
- internal_run<uint16_t>(window);
- break;
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- {
- internal_run<float16_t>(window);
- break;
- }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- default:
- {
- ARM_COMPUTE_ERROR("Data type not supported");
- }
- }
+
+ const auto *uk = get_implementation(BoundingBoxTransformSelectorData{ _boxes->info()->data_type() });
+ ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+ uk->ukernel(_boxes, _pred_boxes, _deltas, _bbinfo, window);
}
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h
index c080ce6a5..def827836 100644
--- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h
+++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -83,9 +83,6 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
private:
- template <typename T>
- void internal_run(const Window &window);
-
const ITensor *_boxes;
ITensor *_pred_boxes;
const ITensor *_deltas;
diff --git a/src/core/NEON/kernels/NECropKernel.cpp b/src/core/NEON/kernels/NECropKernel.cpp
index fabbd6430..729402116 100644
--- a/src/core/NEON/kernels/NECropKernel.cpp
+++ b/src/core/NEON/kernels/NECropKernel.cpp
@@ -31,136 +31,92 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/CPP/Validate.h"
#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/common/Registrars.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/core/utils/helpers/bit_ops.h"
+#include "src/cpu/kernels/crop/generic/neon/list.h"
namespace arm_compute
{
namespace
{
-template <typename T>
-inline float32x4_t load_as_f32(T *ptr)
+struct CropSelectorData
{
- ARM_COMPUTE_UNUSED(ptr);
- ARM_COMPUTE_ERROR("Type not supported.");
-}
-
-template <>
-inline float32x4_t load_as_f32(float *ptr)
-{
- return wrapper::vloadq(ptr);
-}
-
-template <>
-inline float32x4_t load_as_f32(int32_t *ptr)
-{
- return vcvtq_f32_s32(wrapper::vloadq(ptr));
-}
+ DataType dt;
+};
-template <>
-inline float32x4_t load_as_f32(uint32_t *ptr)
-{
- return vcvtq_f32_u32(wrapper::vloadq(ptr));
-}
+using CropSelectorPtr = std::add_pointer<bool(const CropSelectorData &data)>::type;
+using CropUKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool)>::type;
-template <>
-inline float32x4_t load_as_f32(int16_t *ptr)
+struct CropUKernel
{
- return vcvtq_f32_s32(vmovl_s16(wrapper::vload(ptr)));
-}
+ const char *name;
+ const CropSelectorPtr is_selected;
+ CropUKernelPtr ukernel;
+};
-template <>
-inline float32x4_t load_as_f32(uint16_t *ptr)
+static const CropUKernel available_kernels[] =
{
- return vcvtq_f32_u32(vmovl_u16(wrapper::vload(ptr)));
-}
-
-template <>
-inline float32x4_t load_as_f32(uint8_t *ptr)
-{
- return vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(wrapper::vload(ptr)))));
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <>
-inline float32x4_t load_as_f32(float16_t *ptr)
-{
- return vcvt_f32_f16(wrapper::vload(ptr));
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-template <typename T>
-inline void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
- int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
-{
- // Reverse elements if width flipped.
- if(is_width_flipped)
{
- // Collapse first dimension if possible.
- if(input_has_single_channel)
- {
- int32_t x = output_width_start;
- Coordinates negative_offset(input_offset);
- negative_offset.set(1, negative_offset[1] - window_step_x + 1);
- for(; x <= output_width_limit - window_step_x; x += window_step_x, negative_offset[1] -= window_step_x)
- {
- auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(negative_offset)));
-
- in = wrapper::vrev64(in);
- in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ "fp16_neon_crop",
+ [](const CropSelectorData & data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::fp16_in_bounds_crop_window)
+ },
+ {
+ "f32_neon_crop",
+ [](const CropSelectorData & data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::fp32_in_bounds_crop_window)
+ },
+ {
+ "u8_neon_crop",
+ [](const CropSelectorData & data) { return data.dt == DataType::U8; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u8_in_bounds_crop_window)
+ },
+ {
+ "u16_neon_crop",
+ [](const CropSelectorData & data) { return data.dt == DataType::U16; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u16_in_bounds_crop_window)
+ },
+ {
+ "u32_neon_crop",
+ [](const CropSelectorData & data) { return data.dt == DataType::U32; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u32_in_bounds_crop_window)
+ },
+ {
+ "s8_neon_crop",
+ [](const CropSelectorData & data) { return data.dt == DataType::S8; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s8_in_bounds_crop_window)
+ },
+ {
+ "s16_neon_crop",
+ [](const CropSelectorData & data) { return data.dt == DataType::S16; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s16_in_bounds_crop_window)
+ },
+ {
+ "s32_neon_crop",
+ [](const CropSelectorData & data) { return data.dt == DataType::S32; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s32_in_bounds_crop_window)
+ },
+};
- wrapper::vstore(output_ptr + x, in);
- }
- input_offset[1] = negative_offset[1] + window_step_x - 1;
- for(; x < output_width_limit; ++x, --input_offset[1])
- {
- *(output_ptr + x) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
- }
- }
- else
- {
- for(int32_t x = output_width_start; x < output_width_limit; ++x, --input_offset[1])
- {
- input_offset.set(0, 0);
- int32_t c = 0;
- for(; c <= static_cast<int32_t>(input->info()->dimension(0)) - window_step_x; c += window_step_x, input_offset[0] += window_step_x)
- {
- auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
- wrapper::vstore(output_ptr + x * output->info()->dimension(0) + c, in);
- }
- for(; c < static_cast<int32_t>(input->info()->dimension(0)); ++c, ++input_offset[0])
- {
- *(output_ptr + x * output->info()->dimension(0) + c) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
- }
- }
- }
- }
- else
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel else nullptr
+ */
+const CropUKernel *get_implementation(const CropSelectorData &data)
+{
+ for(const auto &uk : available_kernels)
{
- // Use memcpy if the elements don't need converting to float.
- if(std::is_same<T, float>::value)
+ if(uk.is_selected(data))
{
- memcpy(static_cast<void *>(output_ptr + output_width_start * output->info()->dimension(0)),
- reinterpret_cast<const void *>(input->ptr_to_element(input_offset)),
- (output_width_limit - output_width_start) * output->info()->dimension(0) * output->info()->element_size());
- }
- else
- {
- int32_t x = 0;
- int32_t limit = (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0));
- float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0);
- for(; x <= limit - window_step_x; x += window_step_x, input_offset[0] += window_step_x)
- {
- auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
- wrapper::vstore(output_start_ptr + x, in);
- }
- for(; x < limit; ++x, ++input_offset[0])
- {
- *(output_start_ptr + x) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
- }
+ return &uk;
}
}
+
+ return nullptr;
}
inline void out_of_bounds_crop_window(const ITensor *output, float *output_ptr, float extrapolation_value,
@@ -234,8 +190,7 @@ inline void execute_window(const ITensor *input, const ITensor *output, Coordina
} // namespace
NECropKernel::NECropKernel()
- : _input(nullptr), _crop_boxes(nullptr), _box_ind(nullptr), _output(nullptr), _start(), _end(), _crop_box_ind(0), _extrapolation_value(0), _rows_out_of_bounds(), _cols_out_of_bounds(),
- _in_bounds_crop_function(nullptr)
+ : _input(nullptr), _crop_boxes(nullptr), _box_ind(nullptr), _output(nullptr), _start(), _end(), _crop_box_ind(0), _extrapolation_value(0), _rows_out_of_bounds(), _cols_out_of_bounds()
{
}
@@ -250,40 +205,14 @@ void NECropKernel::configure(const ITensor *input, const ITensor *crop_boxes, co
_output = output;
_crop_box_ind = crop_box_ind;
_extrapolation_value = extrapolation_value;
-
- switch(input->info()->data_type())
- {
- case DataType::F32:
- _in_bounds_crop_function = &in_bounds_crop_window<float>;
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- _in_bounds_crop_function = &in_bounds_crop_window<float16_t>;
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::U32:
- _in_bounds_crop_function = &in_bounds_crop_window<uint32_t>;
- break;
- case DataType::S32:
- _in_bounds_crop_function = &in_bounds_crop_window<int32_t>;
- break;
- case DataType::U16:
- _in_bounds_crop_function = &in_bounds_crop_window<uint16_t>;
- break;
- case DataType::S16:
- _in_bounds_crop_function = &in_bounds_crop_window<int16_t>;
- break;
- case DataType::U8:
- _in_bounds_crop_function = &in_bounds_crop_window<uint8_t>;
- break;
- default:
- ARM_COMPUTE_ERROR("Datatype not supported");
- }
}
Status NECropKernel::validate(const ITensorInfo *input, const ITensorInfo *crop_boxes, const ITensorInfo *box_ind, const ITensorInfo *output, uint32_t crop_box_ind, float extrapolation_value)
{
ARM_COMPUTE_UNUSED(extrapolation_value);
+ const auto *uk = get_implementation(CropSelectorData{ input->data_type() });
+ ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::U16, DataType::S16, DataType::F16, DataType::U32, DataType::S32, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
@@ -369,10 +298,12 @@ void NECropKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON(_input->info()->has_padding());
ARM_COMPUTE_ERROR_ON(_output->info()->has_padding());
+ const auto *uk = get_implementation(CropSelectorData{ _input->info()->data_type() });
+
uint32_t batch_index = *(reinterpret_cast<int32_t *>(_box_ind->ptr_to_element(Coordinates(_crop_box_ind))));
Coordinates input_offset(0, _end[0] < _start[0] ? _start[0] - _cols_out_of_bounds[0] : _start[0] + _cols_out_of_bounds[0],
_end[1] < _start[1] ? _start[1] - _rows_out_of_bounds[0] : _start[1] + _rows_out_of_bounds[0], batch_index);
- execute_window(_input, _output, input_offset, _extrapolation_value, _rows_out_of_bounds, _cols_out_of_bounds, _in_bounds_crop_function, _end[1] < _start[1],
+ execute_window(_input, _output, input_offset, _extrapolation_value, _rows_out_of_bounds, _cols_out_of_bounds, uk->ukernel, _end[1] < _start[1],
_cols_out_of_bounds[0] + _cols_out_of_bounds[1] < _output->info()->dimension(1), _cols_out_of_bounds[0] > 0, _cols_out_of_bounds[1] > 0,
_start[0] <= _end[0], _end[0] < _start[0]);
}
diff --git a/src/core/NEON/kernels/NECropKernel.h b/src/core/NEON/kernels/NECropKernel.h
index 742215e22..6c989c1d2 100644
--- a/src/core/NEON/kernels/NECropKernel.h
+++ b/src/core/NEON/kernels/NECropKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -107,8 +107,6 @@ private:
std::array<uint32_t, 2> _rows_out_of_bounds;
/** The number of columns out of bounds at the start and end of output. */
std::array<uint32_t, 2> _cols_out_of_bounds;
-
- NECropKernel::InBoundsCropFunction *_in_bounds_crop_function;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_NEON_CROP_KERNEL_H */
diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
index 56aed0ca2..7bba136e8 100644
--- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,15 +28,72 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Window.h"
#include "src/core/CPP/Validate.h"
+#include "src/core/common/Registrars.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
+#include "src/cpu/kernels/genproposals/list.h"
#include <arm_neon.h>
namespace arm_compute
{
namespace
{
+struct ComputeAllAnchorsData
+{
+ DataType dt;
+};
+
+using ComputeAllAnchorsSelectorPtr = std::add_pointer<bool(const ComputeAllAnchorsData &data)>::type;
+using ComputeAllAnchorsUKernelPtr = std::add_pointer<void(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)>::type;
+
+struct ComputeAllAnchorsKernel
+{
+ const char *name;
+ const ComputeAllAnchorsSelectorPtr is_selected;
+ ComputeAllAnchorsUKernelPtr ukernel;
+};
+
+static const ComputeAllAnchorsKernel available_kernels[] =
+{
+#if defined(ARM_COMPUTE_ENABLE_NEON)
+ {
+ "neon_qu16_computeallanchors",
+ [](const ComputeAllAnchorsData & data) { return data.dt == DataType::QSYMM16; },
+ REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_computeallanchors)
+ },
+#endif //defined(ARM_COMPUTE_ENABLE_NEON)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ {
+ "neon_fp16_computeallanchors",
+ [](const ComputeAllAnchorsData & data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_computeallanchors)
+ },
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ {
+ "neon_fp32_computeallanchors",
+ [](const ComputeAllAnchorsData & data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_computeallanchors)
+ },
+};
+
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel else nullptr
+ */
+const ComputeAllAnchorsKernel *get_implementation(const ComputeAllAnchorsData &data)
+{
+ for(const auto &uk : available_kernels)
+ {
+ if(uk.is_selected(data))
+ {
+ return &uk;
+ }
+ }
+ return nullptr;
+}
+
Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(anchors, all_anchors);
@@ -100,100 +157,15 @@ Status NEComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, const ITe
return Status{};
}
-template <>
-void NEComputeAllAnchorsKernel::internal_run<int16_t>(const Window &window)
-{
- Iterator all_anchors_it(_all_anchors, window);
- Iterator anchors_it(_all_anchors, window);
-
- const size_t num_anchors = _anchors->info()->dimension(1);
- const float stride = 1.f / _anchors_info.spatial_scale();
- const size_t feat_width = _anchors_info.feat_width();
-
- const UniformQuantizationInfo qinfo = _anchors->info()->quantization_info().uniform();
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const size_t anchor_offset = id.y() % num_anchors;
-
- const auto out_anchor_ptr = reinterpret_cast<int16_t *>(all_anchors_it.ptr());
- const auto anchor_ptr = reinterpret_cast<int16_t *>(_anchors->ptr_to_element(Coordinates(0, anchor_offset)));
-
- const size_t shift_idy = id.y() / num_anchors;
- const float shiftx = (shift_idy % feat_width) * stride;
- const float shifty = (shift_idy / feat_width) * stride;
-
- const float new_anchor_x1 = dequantize_qsymm16(*anchor_ptr, qinfo.scale) + shiftx;
- const float new_anchor_y1 = dequantize_qsymm16(*(1 + anchor_ptr), qinfo.scale) + shifty;
- const float new_anchor_x2 = dequantize_qsymm16(*(2 + anchor_ptr), qinfo.scale) + shiftx;
- const float new_anchor_y2 = dequantize_qsymm16(*(3 + anchor_ptr), qinfo.scale) + shifty;
-
- *out_anchor_ptr = quantize_qsymm16(new_anchor_x1, qinfo.scale);
- *(out_anchor_ptr + 1) = quantize_qsymm16(new_anchor_y1, qinfo.scale);
- *(out_anchor_ptr + 2) = quantize_qsymm16(new_anchor_x2, qinfo.scale);
- *(out_anchor_ptr + 3) = quantize_qsymm16(new_anchor_y2, qinfo.scale);
- },
- all_anchors_it);
-}
-
-template <typename T>
-void NEComputeAllAnchorsKernel::internal_run(const Window &window)
-{
- Iterator all_anchors_it(_all_anchors, window);
- Iterator anchors_it(_all_anchors, window);
-
- const size_t num_anchors = _anchors->info()->dimension(1);
- const T stride = 1.f / _anchors_info.spatial_scale();
- const size_t feat_width = _anchors_info.feat_width();
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const size_t anchor_offset = id.y() % num_anchors;
-
- const auto out_anchor_ptr = reinterpret_cast<T *>(all_anchors_it.ptr());
- const auto anchor_ptr = reinterpret_cast<T *>(_anchors->ptr_to_element(Coordinates(0, anchor_offset)));
-
- const size_t shift_idy = id.y() / num_anchors;
- const T shiftx = (shift_idy % feat_width) * stride;
- const T shifty = (shift_idy / feat_width) * stride;
-
- *out_anchor_ptr = *anchor_ptr + shiftx;
- *(out_anchor_ptr + 1) = *(1 + anchor_ptr) + shifty;
- *(out_anchor_ptr + 2) = *(2 + anchor_ptr) + shiftx;
- *(out_anchor_ptr + 3) = *(3 + anchor_ptr) + shifty;
- },
- all_anchors_it);
-}
-
void NEComputeAllAnchorsKernel::run(const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- switch(_anchors->info()->data_type())
- {
- case DataType::QSYMM16:
- {
- internal_run<int16_t>(window);
- break;
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- {
- internal_run<float16_t>(window);
- break;
- }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- {
- internal_run<float>(window);
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Data type not supported");
- }
- }
+ const auto *uk = get_implementation(ComputeAllAnchorsData{ _anchors->info()->data_type() });
+ ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+ uk->ukernel(_anchors, _all_anchors, _anchors_info, window);
}
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
index f6d39e50a..297d6d4ab 100644
--- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
+++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -74,9 +74,6 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
private:
- template <typename T>
- void internal_run(const Window &window);
-
const ITensor *_anchors;
ITensor *_all_anchors;
ComputeAnchorsInfo _anchors_info;
diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
index d33431a8d..71641404b 100644
--- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,8 +34,10 @@
#include "src/core/CPP/Validate.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/common/Registrars.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/instancenorm/list.h"
#include <arm_neon.h>
@@ -43,137 +45,53 @@ namespace arm_compute
{
namespace
{
-template <typename InputType, typename AccType = InputType>
-void vector_float_sum(AccType &result, AccType &result_square, const InputType &inputs)
+struct InstanceNormSelectorData
{
- result = wrapper::vadd(result, inputs);
- result_square = wrapper::vadd(result_square, wrapper::vmul(inputs, inputs));
-}
+ DataType dt;
+};
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <>
-inline void vector_float_sum(float32x4_t &result, float32x4_t &result_square, const float16x8_t &inputs)
-{
- vector_float_sum(result, result_square, wrapper::vcvt<float>(wrapper::vgetlow(inputs)));
- vector_float_sum(result, result_square, wrapper::vcvt<float>(wrapper::vgethigh(inputs)));
-}
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+using InstanceNormSelctorPtr = std::add_pointer<bool(const InstanceNormSelectorData &data)>::type;
+using InstanceNormUKernelPtr = std::add_pointer<void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, const Window &window)>::type;
-template <typename InputType, typename AccType = InputType>
-InputType vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta)
+struct InstanceNormKernel
{
- return wrapper::vadd(wrapper::vmul(wrapper::vsub(inputs, vec_mean), vec_multip), vec_beta);
-}
+ const char *name;
+ const InstanceNormSelctorPtr is_selected;
+ InstanceNormUKernelPtr ukernel;
+};
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <>
-inline float16x8_t vector_float_norm(const float16x8_t &inputs, const float32x4_t &vec_mean, const float32x4_t &vec_multip, const float32x4_t &vec_beta)
+static const InstanceNormKernel available_kernels[] =
{
- const auto input_low = wrapper::vcvt<float>(wrapper::vgetlow(inputs));
- const auto input_high = wrapper::vcvt<float>(wrapper::vgethigh(inputs));
- const auto result_low = wrapper::vcvt<float16_t>(vector_float_norm(input_low, vec_mean, vec_multip, vec_beta));
- const auto result_high = wrapper::vcvt<float16_t>(vector_float_norm(input_high, vec_mean, vec_multip, vec_beta));
- float16x8_t result = wrapper::vcombine(result_low, result_high);
-
- return result;
-}
+ {
+ "fp32_neon_instancenorm",
+ [](const InstanceNormSelectorData & data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_instancenorm)
+ },
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ {
+ "fp16_neon_instancenorm",
+ [](const InstanceNormSelectorData & data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_instancenorm)
+ },
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+};
-template <typename T, typename AccType = T>
-void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window)
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel else nullptr
+ */
+const InstanceNormKernel *get_implementation(const InstanceNormSelectorData &data)
{
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
-
- // Clear X/Y dimensions on execution window as we handle the planes manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- win.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- constexpr int window_step_x = 16 / sizeof(T);
- const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1);
-
- Iterator input_it(input, win);
- execute_window_loop(win, [&](const Coordinates & id)
+ for(const auto &uk : available_kernels)
{
- Window win_plane = window;
- win_plane.set(Window::DimX, Window::Dimension(0, 1, 1));
- win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1));
- win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1));
-
- Iterator input_plane_it(input, win_plane);
- Iterator output_plane_it(output, win_plane);
-
- auto sum_h_w = static_cast<AccType>(0.f);
- auto sum_squares_h_w = static_cast<AccType>(0.f);
-
- execute_window_loop(win_plane, [&](const Coordinates &)
+ if(uk.is_selected(data))
{
- const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr());
-
- auto vec_sum_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
- auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
-
- // Compute S elements per iteration
- int x = window.x().start();
- for(; x <= (window.x().end() - window_step_x); x += window_step_x)
- {
- auto vec_input_val = wrapper::vloadq(input_ptr + x);
- vector_float_sum(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val);
- }
-
- auto vec2_sum_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w));
- auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w));
-
- vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w);
- vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w);
-
- sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0);
- sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0);
-
- // Compute left-over elements
- for(; x < window.x().end(); ++x)
- {
- const auto value = static_cast<AccType>(*(input_ptr + x));
- sum_h_w += value;
- sum_squares_h_w += value * value;
- }
- },
- input_plane_it, output_plane_it);
-
- const auto mean_h_w = sum_h_w / elements_plane;
- const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w;
-
- const auto multip_h_w = gamma / std::sqrt(var_h_w + epsilon);
- const auto vec_mean_h_w = wrapper::vdup_n(static_cast<AccType>(mean_h_w), ExactTagType{});
- const auto vec_multip_h_w = wrapper::vdup_n(static_cast<AccType>(multip_h_w), ExactTagType{});
- const auto vec_beta = wrapper::vdup_n(static_cast<AccType>(beta), ExactTagType{});
-
- execute_window_loop(win_plane, [&](const Coordinates &)
- {
- auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr());
- auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr());
-
- // Compute S elements per iteration
- int x = window.x().start();
- //auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{});
- for(; x <= (window.x().end() - window_step_x); x += window_step_x)
- {
- const auto vec_val = wrapper::vloadq(input_ptr + x);
- const auto normalized_vec = vector_float_norm(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta);
- wrapper::vstore(output_ptr + x, normalized_vec);
- }
-
- // Compute left-over elements
- for(; x < window.x().end(); ++x)
- {
- const auto val = static_cast<AccType>(*(input_ptr + x));
- *(output_ptr + x) = static_cast<T>((val - mean_h_w) * multip_h_w + beta);
- }
- },
- input_plane_it, output_plane_it);
- },
- input_it);
+ return &uk;
+ }
+ }
+ return nullptr;
}
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon)
@@ -210,7 +128,7 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe
} // namespace
NEInstanceNormalizationLayerKernel::NEInstanceNormalizationLayerKernel()
- : _func(nullptr), _input(nullptr), _output(nullptr), _gamma(1), _beta(0), _epsilon(1e-12)
+ : _input(nullptr), _output(nullptr), _gamma(1), _beta(0), _epsilon(1e-12)
{
}
@@ -227,28 +145,6 @@ void NEInstanceNormalizationLayerKernel::configure(ITensor *input, ITensor *outp
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _output->info(), _gamma, _beta, _epsilon));
- if(_input->info()->data_type() == DataType::F32)
- {
- _func = &instance_normalization_nchw<float>;
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- else if(_input->info()->data_type() == DataType::F16)
- {
- if(_use_mixed_precision)
- {
- _func = &instance_normalization_nchw<float16_t, float>;
- }
- else
- {
- _func = &instance_normalization_nchw<float16_t>;
- }
- }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- else
- {
- ARM_COMPUTE_ERROR("Unsupported data type");
- }
-
// Configure kernel window
auto win_config = validate_and_configure_window(_input->info(), _output->info());
ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
@@ -268,6 +164,10 @@ void NEInstanceNormalizationLayerKernel::run(const Window &window, const ThreadI
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- (*_func)(_input, _output, _gamma, _beta, _epsilon, window);
+
+ const auto *uk = get_implementation(InstanceNormSelectorData{ _input->info()->data_type() });
+ ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+ uk->ukernel(_input, _output, _gamma, _beta, _epsilon, _use_mixed_precision, window);
}
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
index 96c011971..f166ce205 100644
--- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
+++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -84,13 +84,12 @@ private:
*/
using NormalizationFunction = void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
- NormalizationFunction *_func;
- ITensor *_input;
- ITensor *_output;
- float _gamma;
- float _beta;
- float _epsilon;
- bool _use_mixed_precision{ true };
+ ITensor *_input;
+ ITensor *_output;
+ float _gamma;
+ float _beta;
+ float _epsilon;
+ bool _use_mixed_precision{ true };
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp
index 761fa1523..93da8a24c 100644
--- a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,9 +28,10 @@
#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/CPP/Validate.h"
+#include "src/core/common/Registrars.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
+#include "src/cpu/kernels/maxunpool/list.h"
#include "support/ToolchainSupport.h"
namespace arm_compute
@@ -39,6 +40,67 @@ using namespace misc::shape_calculator;
namespace
{
+struct MaxUnpoolingSelectorData
+{
+ DataType dt;
+};
+
+using MaxUnpoolingSelctorPtr = std::add_pointer<bool(const MaxUnpoolingSelectorData &data)>::type;
+using MaxUnpoolingUKernelPtr = std::add_pointer<void(const ITensor *input, ITensor *output, const ITensor *indices, const Window &window)>::type;
+
+struct MaxUnpoolingKernel
+{
+ const char *name;
+ const MaxUnpoolingSelctorPtr is_selected;
+ MaxUnpoolingUKernelPtr ukernel;
+};
+
+static const MaxUnpoolingKernel available_kernels[] =
+{
+ {
+ "fp32_neon_maxunpooling",
+ [](const MaxUnpoolingSelectorData & data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_maxunpooling)
+ },
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ {
+ "fp16_neon_maxunpooling",
+ [](const MaxUnpoolingSelectorData & data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_maxunpooling)
+ },
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if defined(ARM_COMPUTE_ENABLE_NEON)
+ {
+ "qs8_neon_maxunpooling",
+ [](const MaxUnpoolingSelectorData & data) { return data.dt == DataType::QASYMM8; },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qs8_maxunpooling)
+ },
+ {
+ "qu8_neon_maxunpooling",
+ [](const MaxUnpoolingSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qu8_maxunpooling)
+ },
+#endif //defined(ARM_COMPUTE_ENABLE_NEON)
+};
+
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel else nullptr
+ */
+const MaxUnpoolingKernel *get_implementation(const MaxUnpoolingSelectorData &data)
+{
+ for(const auto &uk : available_kernels)
+ {
+ if(uk.is_selected(data))
+ {
+ return &uk;
+ }
+ }
+ return nullptr;
+}
+
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, indices);
@@ -69,7 +131,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
} // namespace
NEMaxUnpoolingLayerKernel::NEMaxUnpoolingLayerKernel()
- : _func(nullptr), _input(nullptr), _output(nullptr), _indices(nullptr)
+ : _input(nullptr), _output(nullptr), _indices(nullptr)
{
}
@@ -82,46 +144,12 @@ void NEMaxUnpoolingLayerKernel::configure(const ITensor *input, const ITensor *i
_output = output;
_indices = indices;
- switch(input->info()->data_type())
- {
- case DataType::F32:
- _func = &NEMaxUnpoolingLayerKernel::unpooling2<float>;
- break;
- case DataType::QASYMM8:
- _func = &NEMaxUnpoolingLayerKernel::unpooling2<uint8_t>;
- break;
- case DataType::QASYMM8_SIGNED:
- _func = &NEMaxUnpoolingLayerKernel::unpooling2<int8_t>;
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- _func = &NEMaxUnpoolingLayerKernel::unpooling2<float16_t>;
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- break;
- }
const TensorShape output_shape = compute_unpool_shape(*input->info(), pool_info);
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
auto window = calculate_max_window(*input->info(), Steps());
INEKernel::configure(window);
}
-template <typename T>
-void NEMaxUnpoolingLayerKernel::unpooling2(const Window &window)
-{
- Iterator input(_input, window);
- Iterator indices(_indices, window);
- auto out_ptr = reinterpret_cast<T *>(_output->buffer());
- const int out_stride_w = static_cast<int>(_output->info()->strides_in_bytes()[3]);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- auto vindices = reinterpret_cast<uint32_t *>(indices.ptr());
- auto vinput = reinterpret_cast<T *>(input.ptr());
- out_ptr[id[3] * out_stride_w / sizeof(T) + *vindices] = *vinput;
- },
- input, indices);
-}
Status NEMaxUnpoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
{
@@ -135,8 +163,9 @@ void NEMaxUnpoolingLayerKernel::run(const Window &window, const ThreadInfo &info
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
- // Run function
- (this->*_func)(window);
+ const auto *uk = get_implementation(MaxUnpoolingSelectorData{ _input->info()->data_type() });
+ ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+ uk->ukernel(_input, _output, _indices, window);
}
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h
index ecc116e58..f7f9a31f6 100644
--- a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h
+++ b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,10 +88,9 @@ private:
using UnpoolingFunction = void (NEMaxUnpoolingLayerKernel::*)(const Window &window);
private:
- UnpoolingFunction _func;
- const ITensor *_input;
- ITensor *_output;
- const ITensor *_indices;
+ const ITensor *_input;
+ ITensor *_output;
+ const ITensor *_indices;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_NEMAXUNPOOLINGLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
index d1c7d4eb9..7d8fc7ec7 100644
--- a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,13 +31,64 @@
#include "src/core/CPP/Validate.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/common/Registrars.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/meanstddevnorm/list.h"
namespace arm_compute
{
namespace
{
+struct MeanStdDevNormSelectorData
+{
+ DataType dt;
+};
+
+using MeanStdDevNormSelctorPtr = std::add_pointer<bool(const MeanStdDevNormSelectorData &data)>::type;
+using MeanStdDevNormUKernelPtr = std::add_pointer<void(ITensor *input, ITensor *output, float epsilon, const Window &window)>::type;
+
+struct MeanStdDevNormKernel
+{
+ const char *name;
+ const MeanStdDevNormSelctorPtr is_selected;
+ MeanStdDevNormUKernelPtr ukernel;
+};
+
+static const MeanStdDevNormKernel available_kernels[] =
+{
+ {
+ "fp32_neon_meanstddevnorm",
+ [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_meanstddevnorm)
+ },
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ {
+ "fp16_neon_meanstddevnorm",
+ [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_meanstddevnorm)
+ },
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+};
+
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel else nullptr
+ */
+const MeanStdDevNormKernel *get_implementation(const MeanStdDevNormSelectorData &data)
+{
+ for(const auto &uk : available_kernels)
+ {
+ if(uk.is_selected(data))
+ {
+ return &uk;
+ }
+ }
+ return nullptr;
+}
+
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, float epsilon)
{
ARM_COMPUTE_UNUSED(epsilon);
@@ -72,80 +123,8 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
}
} // namespace
-template <typename ScalarType, int size>
-void NEMeanStdDevNormalizationKernel::mean_stddev_normalization(const Window &window)
-{
- using ExactTagType = typename wrapper::traits::neon_vector<ScalarType, size>::tag_type;
-
- // Set build options
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = size;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Iterator input(_input, win);
- Iterator output(_output, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- auto in_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
- auto out_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-
- auto sum_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{});
- auto sum_sq_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{});
-
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- auto data = wrapper::vloadq(in_ptr + x);
- sum_vec = wrapper::vadd(sum_vec, data);
- sum_sq_vec = wrapper::vadd(sum_sq_vec, wrapper::vmul(data, data));
- }
-
- auto sum_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_vec), wrapper::vgetlow(sum_vec));
- auto sum_sq_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_sq_vec), wrapper::vgetlow(sum_sq_vec));
- for(int i = 0; i < size / 4; ++i)
- {
- sum_carry_res = wrapper::vpadd(sum_carry_res, sum_carry_res);
- sum_sq_carry_res = wrapper::vpadd(sum_sq_carry_res, sum_sq_carry_res);
- }
-
- auto sum = wrapper::vgetlane(sum_carry_res, 0);
- auto sum_sq = wrapper::vgetlane(sum_sq_carry_res, 0);
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- ScalarType data = *(in_ptr + x);
- sum += data;
- sum_sq += data * data;
- }
-
- ScalarType mean = sum / _input->info()->dimension(0);
- ScalarType var = (sum_sq / _input->info()->dimension(0)) - (mean * mean);
- ScalarType stddev_inv = 1.f / sqrt(var + _epsilon);
-
- auto mean_vec = wrapper::vdup_n(mean, ExactTagType{});
- auto stddev_inv_vec = wrapper::vdup_n(stddev_inv, ExactTagType{});
- for(x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- auto data = wrapper::vloadq(in_ptr + x);
- auto res = wrapper::vmul(wrapper::vsub(data, mean_vec), stddev_inv_vec);
- // Store results
- wrapper::vstore(out_ptr + x, res);
- }
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv;
- }
- },
- input, output);
-}
-
NEMeanStdDevNormalizationKernel::NEMeanStdDevNormalizationKernel()
- : _input(nullptr), _output(nullptr), _epsilon(1e-8f), _func(nullptr)
+ : _input(nullptr), _output(nullptr), _epsilon(1e-8f)
{
}
@@ -163,23 +142,6 @@ void NEMeanStdDevNormalizationKernel::configure(ITensor *input, ITensor *output,
auto win_config = validate_and_configure_window(input->info(), (output == nullptr) ? nullptr : output->info());
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICPPKernel::configure(win_config.second);
-
- // Configure function to run based on different data types
- const DataType data_type = input->info()->data_type();
- switch(data_type)
- {
- case DataType::F32:
- _func = &NEMeanStdDevNormalizationKernel::mean_stddev_normalization<float, 4>;
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- _func = &NEMeanStdDevNormalizationKernel::mean_stddev_normalization<float16_t, 8>;
- break;
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- default:
- ARM_COMPUTE_ERROR("Not Supported");
- break;
- }
}
Status NEMeanStdDevNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, float epsilon)
@@ -194,8 +156,10 @@ void NEMeanStdDevNormalizationKernel::run(const Window &window, const ThreadInfo
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
- (this->*_func)(window);
+ const auto *uk = get_implementation(MeanStdDevNormSelectorData{ _output->info()->data_type() });
+ ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+ uk->ukernel(_input, _output, _epsilon, window);
}
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h
index 59d073ada..844f0efdc 100644
--- a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h
+++ b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -91,8 +91,6 @@ private:
float _epsilon;
using MeanStdDevNormFunction = void (NEMeanStdDevNormalizationKernel::*)(const Window &window);
-
- MeanStdDevNormFunction _func;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEMEANSTDDEVNORMALIZATIONKERNEL_H */
diff --git a/src/core/NEON/kernels/NEPadLayerKernel.cpp b/src/core/NEON/kernels/NEPadLayerKernel.cpp
index 60986812b..734510b63 100644
--- a/src/core/NEON/kernels/NEPadLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPadLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -261,9 +261,10 @@ void NEPadLayerKernel::run(const Window &window, const ThreadInfo &info)
size_t NEPadLayerKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
{
- ARM_COMPUTE_UNUSED(platform, thread_count);
-
- return ICPPKernel::small_network_mws;
+ ARM_COMPUTE_UNUSED(thread_count);
+ ARM_COMPUTE_UNUSED(platform);
+
+ return ICPPKernel::default_mws;
}
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
index ece7e40e3..802aebb52 100644
--- a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,8 +30,10 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/misc/Utility.h"
#include "src/core/CPP/Validate.h"
+#include "src/core/common/Registrars.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/roialign/list.h"
#include <arm_neon.h>
@@ -41,6 +43,67 @@ namespace arm_compute
{
namespace
{
+struct ROIAlignSelectorData
+{
+ DataType dt;
+};
+
+using ROIAlignSelctorPtr = std::add_pointer<bool(const ROIAlignSelectorData &data)>::type;
+using ROIAlignUKernelPtr = std::add_pointer<void(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)>::type;
+
+struct ROIAlignKernel
+{
+ const char *name;
+ const ROIAlignSelctorPtr is_selected;
+ ROIAlignUKernelPtr ukernel;
+};
+
+static const ROIAlignKernel available_kernels[] =
+{
+ {
+ "fp32_neon_roialign",
+ [](const ROIAlignSelectorData & data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_roialign)
+ },
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ {
+ "fp16_neon_roialign",
+ [](const ROIAlignSelectorData & data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_roialign)
+ },
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if defined(ARM_COMPUTE_ENABLE_NEON)
+ {
+ "qu8_neon_roialign",
+ [](const ROIAlignSelectorData & data) { return data.dt == DataType::QASYMM8; },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qu8_roialign)
+ },
+ {
+ "qs8_neon_roialign",
+ [](const ROIAlignSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qs8_roialign)
+ },
+#endif //defined(ARM_COMPUTE_ENABLE_NEON)
+};
+
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel else nullptr
+ */
+const ROIAlignKernel *get_implementation(const ROIAlignSelectorData &data)
+{
+ for(const auto &uk : available_kernels)
+ {
+ if(uk.is_selected(data))
+ {
+ return &uk;
+ }
+ }
+ return nullptr;
+}
+
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output);
@@ -110,328 +173,19 @@ Status NEROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorIn
return Status{};
}
-/** Average pooling over an aligned window */
-template <typename input_data_type>
-inline input_data_type roi_align_1x1(const ITensor *input,
- unsigned int roi_batch,
- float region_start_x,
- float bin_size_x,
- int grid_size_x,
- float region_end_x,
- float region_start_y,
- float bin_size_y,
- int grid_size_y,
- float region_end_y,
- int pz)
-{
- if((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
- {
- return input_data_type(0);
- }
- else
- {
- const DataLayout data_layout = input->info()->data_layout();
- float avg = 0;
- // Iterate through the aligned pooling region
- for(int iy = 0; iy < grid_size_y; ++iy)
- {
- for(int ix = 0; ix < grid_size_x; ++ix)
- {
- // Align the window in the middle of every bin
- float y = region_start_y + (iy + 0.5) * bin_size_y / float(grid_size_y);
- float x = region_start_x + (ix + 0.5) * bin_size_x / float(grid_size_x);
-
- // Interpolation in the [0,0] [0,1] [1,0] [1,1] square
- const int y_low = y;
- const int x_low = x;
- const int y_high = y_low + 1;
- const int x_high = x_low + 1;
-
- const float ly = y - y_low;
- const float lx = x - x_low;
- const float hy = 1. - ly;
- const float hx = 1. - lx;
-
- const float w1 = hy * hx;
- const float w2 = hy * lx;
- const float w3 = ly * hx;
- const float w4 = ly * lx;
- if(data_layout == DataLayout::NCHW)
- {
- const auto data1 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch)));
- const auto data2 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch)));
- const auto data3 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch)));
- const auto data4 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch)));
- avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
- }
- else
- {
- const auto data1 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch)));
- const auto data2 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch)));
- const auto data3 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch)));
- const auto data4 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch)));
- avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
- }
- }
- }
-
- avg /= grid_size_x * grid_size_y;
- return input_data_type(avg);
- }
-}
-
-/** Average pooling over an aligned window */
-template <typename input_data_type>
-inline input_data_type roi_align_1x1_qasymm8(const ITensor *input,
- unsigned int roi_batch,
- float region_start_x,
- float bin_size_x,
- int grid_size_x,
- float region_end_x,
- float region_start_y,
- float bin_size_y,
- int grid_size_y,
- float region_end_y,
- int pz,
- const QuantizationInfo &out_qinfo)
-{
- if((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
- {
- return input_data_type(out_qinfo.uniform().offset);
- }
- else
- {
- float avg = 0;
- const UniformQuantizationInfo input_qinfo = input->info()->quantization_info().uniform();
- const bool is_qasymm_signed = is_data_type_quantized_asymmetric_signed(input->info()->data_type());
- const DataLayout data_layout = input->info()->data_layout();
-
- // Iterate through the aligned pooling region
- for(int iy = 0; iy < grid_size_y; ++iy)
- {
- for(int ix = 0; ix < grid_size_x; ++ix)
- {
- // Align the window in the middle of every bin
- float y = region_start_y + (iy + 0.5) * bin_size_y / float(grid_size_y);
- float x = region_start_x + (ix + 0.5) * bin_size_x / float(grid_size_x);
-
- // Interpolation in the [0,0] [0,1] [1,0] [1,1] square
- const int y_low = y;
- const int x_low = x;
- const int y_high = y_low + 1;
- const int x_high = x_low + 1;
-
- const float ly = y - y_low;
- const float lx = x - x_low;
- const float hy = 1. - ly;
- const float hx = 1. - lx;
-
- const float w1 = hy * hx;
- const float w2 = hy * lx;
- const float w3 = ly * hx;
- const float w4 = ly * lx;
-
- if(data_layout == DataLayout::NCHW)
- {
- if(is_qasymm_signed)
- {
- float data1 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))), input_qinfo);
- float data2 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))), input_qinfo);
- float data3 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))), input_qinfo);
- float data4 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))), input_qinfo);
- avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
- }
- else
- {
- float data1 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))), input_qinfo);
- float data2 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))), input_qinfo);
- float data3 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))), input_qinfo);
- float data4 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))), input_qinfo);
- avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
- }
- }
- else
- {
- if(is_qasymm_signed)
- {
- const auto data1 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))), input_qinfo);
- const auto data2 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))), input_qinfo);
- const auto data3 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))), input_qinfo);
- const auto data4 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))), input_qinfo);
- avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
- }
- else
- {
- const auto data1 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))), input_qinfo);
- const auto data2 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))), input_qinfo);
- const auto data3 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))), input_qinfo);
- const auto data4 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))), input_qinfo);
- avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
- }
- }
- }
- }
-
- avg /= grid_size_x * grid_size_y;
-
- input_data_type res = 0;
- if(is_qasymm_signed)
- {
- res = quantize_qasymm8_signed(avg, out_qinfo);
- }
- else
- {
- res = quantize_qasymm8(avg, out_qinfo);
- }
- return res;
- }
-}
-
-inline float compute_region_coordinate(int p, float bin_size, float roi_anchor, float max_value)
-{
- const float region_start = p * bin_size + roi_anchor;
- return utility::clamp(region_start, 0.0f, max_value);
-}
-
void NEROIAlignLayerKernel::run(const Window &window, const ThreadInfo &info)
{
const DataLayout data_layout = _input->info()->data_layout();
if(data_layout == DataLayout::NCHW || data_layout == DataLayout::NHWC)
{
- switch(_input->info()->data_type())
- {
- case DataType::QASYMM8:
- {
- NEROIAlignLayerKernel::internal_run<uint8_t, uint16_t>(window, info);
- break;
- }
- case DataType::QASYMM8_SIGNED:
- {
- NEROIAlignLayerKernel::internal_run<int8_t, uint16_t>(window, info);
- break;
- }
- case DataType::F32:
- {
- NEROIAlignLayerKernel::internal_run<float>(window, info);
- break;
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- {
- NEROIAlignLayerKernel::internal_run<float16_t>(window, info);
- break;
- }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- default:
- {
- ARM_COMPUTE_ERROR("DataType not supported");
- break;
- }
- }
+ const auto *uk = get_implementation(ROIAlignSelectorData{ _input->info()->data_type() });
+ ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+ uk->ukernel(_input, _output, _rois, _pool_info, window, info);
}
else
{
ARM_COMPUTE_ERROR("Invalid layout");
}
}
-
-template <typename input_data_type, typename roi_data_type>
-void NEROIAlignLayerKernel::internal_run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- const DataLayout data_layout = _input->info()->data_layout();
- const size_t values_per_roi = _rois->info()->dimension(0);
-
- const int roi_list_start = window.x().start();
- const int roi_list_end = window.x().end();
-
- const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const unsigned int idx_depth = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-
- const int input_width = _input->info()->dimension(idx_width);
- const int input_height = _input->info()->dimension(idx_height);
- const int input_chanels = _input->info()->dimension(idx_depth);
- const int pooled_w = _pool_info.pooled_width();
- const int pooled_h = _pool_info.pooled_height();
-
- const DataType data_type = _input->info()->data_type();
- const bool is_qasymm = is_data_type_quantized_asymmetric(data_type);
-
- const auto *rois_ptr = reinterpret_cast<const roi_data_type *>(_rois->buffer());
- const QuantizationInfo &rois_qinfo = _rois->info()->quantization_info();
- for(int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx)
- {
- const unsigned int roi_batch = rois_ptr[values_per_roi * roi_indx];
-
- roi_data_type qx1 = rois_ptr[values_per_roi * roi_indx + 1];
- roi_data_type qy1 = rois_ptr[values_per_roi * roi_indx + 2];
- roi_data_type qx2 = rois_ptr[values_per_roi * roi_indx + 3];
- roi_data_type qy2 = rois_ptr[values_per_roi * roi_indx + 4];
- float x1(qx1);
- float x2(qx2);
- float y1(qy1);
- float y2(qy2);
- if(is_qasymm)
- {
- x1 = dequantize_qasymm16(qx1, rois_qinfo);
- x2 = dequantize_qasymm16(qx2, rois_qinfo);
- y1 = dequantize_qasymm16(qy1, rois_qinfo);
- y2 = dequantize_qasymm16(qy2, rois_qinfo);
- }
- const float roi_anchor_x = x1 * _pool_info.spatial_scale();
- const float roi_anchor_y = y1 * _pool_info.spatial_scale();
- const float roi_dims_x = std::max((x2 - x1) * _pool_info.spatial_scale(), 1.0f);
- const float roi_dims_y = std::max((y2 - y1) * _pool_info.spatial_scale(), 1.0f);
- float bin_size_x = roi_dims_x / _pool_info.pooled_width();
- float bin_size_y = roi_dims_y / _pool_info.pooled_height();
-
- // Iterate through all feature maps
- for(int ch = 0; ch < input_chanels; ++ch)
- {
- // Iterate through all output pixels
- for(int py = 0; py < pooled_h; ++py)
- {
- for(int px = 0; px < pooled_w; ++px)
- {
- const float region_start_x = compute_region_coordinate(px, bin_size_x, roi_anchor_x, input_width);
- const float region_start_y = compute_region_coordinate(py, bin_size_y, roi_anchor_y, input_height);
- const float region_end_x = compute_region_coordinate(px + 1, bin_size_x, roi_anchor_x, input_width);
- const float region_end_y = compute_region_coordinate(py + 1, bin_size_y, roi_anchor_y, input_height);
- const int roi_bin_grid_x = (_pool_info.sampling_ratio() > 0) ? _pool_info.sampling_ratio() : int(ceil(bin_size_x));
- const int roi_bin_grid_y = (_pool_info.sampling_ratio() > 0) ? _pool_info.sampling_ratio() : int(ceil(bin_size_y));
- input_data_type out_val(0);
- if(is_qasymm)
- {
- out_val = roi_align_1x1_qasymm8<input_data_type>(
- _input, roi_batch, region_start_x, bin_size_x,
- roi_bin_grid_x, region_end_x, region_start_y, bin_size_y,
- roi_bin_grid_y, region_end_y, ch, _output->info()->quantization_info());
- }
- else
- {
- out_val = roi_align_1x1<input_data_type>(
- _input, roi_batch, region_start_x, bin_size_x,
- roi_bin_grid_x, region_end_x, region_start_y, bin_size_y,
- roi_bin_grid_y, region_end_y, ch);
- }
-
- if(data_layout == DataLayout::NCHW)
- {
- auto out_ptr = reinterpret_cast<input_data_type *>(_output->ptr_to_element(Coordinates(px, py, ch, roi_indx)));
- *out_ptr = out_val;
- }
- else
- {
- auto out_ptr = reinterpret_cast<input_data_type *>(_output->ptr_to_element(Coordinates(ch, px, py, roi_indx)));
- *out_ptr = out_val;
- }
- }
- }
- }
- }
-}
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEROIAlignLayerKernel.h b/src/core/NEON/kernels/NEROIAlignLayerKernel.h
index fa31a879b..48a3de728 100644
--- a/src/core/NEON/kernels/NEROIAlignLayerKernel.h
+++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -89,9 +89,6 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
private:
- template <typename input_data_type, typename roi_data_type = input_data_type>
- void internal_run(const Window &window, const ThreadInfo &info);
-
const ITensor *_input;
ITensor *_output;
const ITensor *_rois;
diff --git a/src/core/NEON/kernels/NERangeKernel.cpp b/src/core/NEON/kernels/NERangeKernel.cpp
index 0395e0bd3..82d1403c5 100644
--- a/src/core/NEON/kernels/NERangeKernel.cpp
+++ b/src/core/NEON/kernels/NERangeKernel.cpp
@@ -30,68 +30,96 @@
#include "arm_compute/core/Validate.h"
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/common/Registrars.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
-#include "arm_compute/core/Utils.h"
+#include "src/cpu/kernels/range/list.h"
namespace arm_compute
{
namespace
{
-template <typename T>
-void range_function(ITensor *output, float start, float step, const Window &window)
+struct RangeSelectorData
{
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>::tag_type;
-
- const auto step_vec = wrapper::vdup_n(static_cast<T>(step), ExactTagType{});
- const auto start_vec = wrapper::vdup_n(static_cast<T>(start), ExactTagType{});
- auto id_vec = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ DataType dt;
+};
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const int window_step_x = 16 / sizeof(T);
+using RangeSelectorPtr = std::add_pointer<bool(const RangeSelectorData &data)>::type;
+using RangeUKernelPtr = std::add_pointer<void(ITensor *, float, float, const Window &)>::type;
- Window win{ window };
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator output_it(output, win);
+struct RangeUKernel
+{
+ const char *name;
+ const RangeSelectorPtr is_selected;
+ RangeUKernelPtr ukernel;
+};
- execute_window_loop(win, [&](const Coordinates &)
+static const RangeUKernel available_kernels[] =
+{
{
- int x = window_start_x;
- const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- for(int count = 0; count < window_step_x; ++count)
- {
- id_vec = wrapper::vsetlane(static_cast<T>(x + count), id_vec, count);
- }
-
- // start + step * id
- const auto res_vec = wrapper::vmla(start_vec, id_vec, step_vec);
- wrapper::vstore(out_ptr + x, res_vec);
- }
+ "fp16_neon_range",
+ [](const RangeSelectorData & data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_range_function)
+ },
+ {
+ "f32_neon_range",
+ [](const RangeSelectorData & data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_range_function)
+ },
+ {
+ "u8_neon_range",
+ [](const RangeSelectorData & data) { return data.dt == DataType::U8; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_range_function)
+ },
+ {
+ "u16_neon_range",
+ [](const RangeSelectorData & data) { return data.dt == DataType::U16; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u16_neon_range_function)
+ },
+ {
+ "u32_neon_range",
+ [](const RangeSelectorData & data) { return data.dt == DataType::U32; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u32_neon_range_function)
+ },
+ {
+ "s8_neon_range",
+ [](const RangeSelectorData & data) { return data.dt == DataType::S8; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_range_function)
+ },
+ {
+ "s16_neon_range",
+ [](const RangeSelectorData & data) { return data.dt == DataType::S16; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_range_function)
+ },
+ {
+ "s32_neon_range",
+ [](const RangeSelectorData & data) { return data.dt == DataType::S32; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s32_neon_range_function)
+ },
+};
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel else nullptr
+ */
+const RangeUKernel *get_implementation(const RangeSelectorData &data)
+{
+ for(const auto &uk : available_kernels)
+ {
+ if(uk.is_selected(data))
{
- const auto res = start + x * step;
- *(out_ptr + x) = res;
+ return &uk;
}
-
- },
- output_it);
+ }
+ return nullptr;
}
Status validate_arguments(const ITensorInfo &output, const float start, const float end, const float step)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output,
- 1,
- DataType::U8, DataType::S8,
- DataType::U16, DataType::S16,
- DataType::U32, DataType::S32,
- DataType::F16, DataType::F32);
+ const auto *uk = get_implementation(RangeSelectorData{ output.data_type() });
+ ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start < end) && (step <= 0)), "step must be greater than 0 when start < end");
@@ -111,7 +139,7 @@ Status validate_arguments(const ITensorInfo &output, const float start, const fl
} // namespace
NERangeKernel::NERangeKernel()
- : _func(nullptr), _start(0), _end(1), _step(1), _output(nullptr)
+ : _start(0), _end(1), _step(1), _output(nullptr)
{
}
@@ -131,38 +159,6 @@ void NERangeKernel::configure(ITensor *output, float start, float end, float ste
_end = end;
_step = step;
_output = output;
- switch(_output->info()->data_type())
- {
- case DataType::U8:
- _func = &range_function<uint8_t>;
- break;
- case DataType::U16:
- _func = &range_function<uint16_t>;
- break;
- case DataType::U32:
- _func = &range_function<uint32_t>;
- break;
- case DataType::S8:
- _func = &range_function<int8_t>;
- break;
- case DataType::S16:
- _func = &range_function<int16_t>;
- break;
- case DataType::S32:
- _func = &range_function<int32_t>;
- break;
- case DataType::F32:
- _func = &range_function<float>;
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- _func = &range_function<float16_t>;
- break;
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- default:
- ARM_COMPUTE_ERROR("Unsupported data type.");
- break;
- }
INEKernel::configure(win);
}
@@ -181,8 +177,8 @@ void NERangeKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
+ const auto *uk = get_implementation(RangeSelectorData{ _output->info()->data_type() });
- (*_func)(_output, _start, _step, window);
+ uk->ukernel(_output, _start, _step, window);
}
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NERangeKernel.h b/src/core/NEON/kernels/NERangeKernel.h
index 7c42ef11d..90560995e 100644
--- a/src/core/NEON/kernels/NERangeKernel.h
+++ b/src/core/NEON/kernels/NERangeKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -80,11 +80,10 @@ public:
private:
using RangeFunction = void(ITensor *output, float start, float step, const Window &window);
- RangeFunction *_func; /**< Range function to be called */
- float _start; /**< Start of sequence */
- float _end; /**< End of sequence */
- float _step; /**< Increment/step value */
- ITensor *_output; /**< Destination tensor */
+ float _start; /**< Start of sequence */
+ float _end; /**< End of sequence */
+ float _step; /**< Increment/step value */
+ ITensor *_output; /**< Destination tensor */
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NERANGEKERNEL_H */
diff --git a/src/core/NEON/kernels/NERemapKernel.cpp b/src/core/NEON/kernels/NERemapKernel.cpp
deleted file mode 100644
index a1ba29e4c..000000000
--- a/src/core/NEON/kernels/NERemapKernel.cpp
+++ /dev/null
@@ -1,326 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NERemapKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-using namespace arm_compute::scale_helpers;
-
-namespace arm_compute
-{
-class Coordinates;
-
-namespace
-{
-inline int32_t num_out_of_tensor(const float *mapx_ptr, const float *mapy_ptr, const int32x4_t &width_1, const int32x4_t &height_1)
-{
- const int32x4_t mapx_s32 = vcvtq_s32_f32(vld1q_f32(mapx_ptr));
- const int32x4_t mapy_s32 = vcvtq_s32_f32(vld1q_f32(mapy_ptr));
-
- const int32x4_t outbx_s32 = vminq_s32(vmaxq_s32(vminq_s32(vsubq_s32(width_1, mapx_s32), mapx_s32), vdupq_n_s32(-1)), vdupq_n_s32(0)); // Contains -1 if out of border in x, 0 otherwise
- const int32x4_t outby_s32 = vminq_s32(vmaxq_s32(vminq_s32(vsubq_s32(height_1, mapy_s32), mapy_s32), vdupq_n_s32(-1)), vdupq_n_s32(0)); // Contains -1 if out of border in y, 0 otherwise
-
- const int32x4_t out_of_tensor_v = vminq_s32(outbx_s32, outby_s32);
-#if defined(__aarch64__)
- // only AArch64 supports vaddv
- return vaddvq_s32(out_of_tensor_v);
-#else // __aarch64__
- return vgetq_lane_s32(out_of_tensor_v, 0) + vgetq_lane_s32(out_of_tensor_v, 1) + vgetq_lane_s32(out_of_tensor_v, 2) + vgetq_lane_s32(out_of_tensor_v, 3);
-#endif // __aarch64__
-}
-
-inline void serial_remap_nearest_interpolation(const uint8_t *in_ptr, const float *mapx_ptr, const float *mapy_ptr, uint8_t *out_ptr,
- int32_t width_val, int32_t height_val, int32_t in_stride_val, uint8_t constant_border_value)
-{
- const auto x_s32 = static_cast<int32_t>(*mapx_ptr);
- const auto y_s32 = static_cast<int32_t>(*mapy_ptr);
- if(x_s32 < 0 || y_s32 < 0 || x_s32 >= width_val || y_s32 >= height_val)
- {
- *(out_ptr) = constant_border_value;
- }
- else
- {
- *(out_ptr) = in_ptr[x_s32 + y_s32 * in_stride_val];
- }
-}
-
-inline int32x4_t offset_nearest_interpolation(const float *mapx_ptr, const float *mapy_ptr, const int32x4_t &stride)
-{
- const int32x4_t mapx_s32 = vcvtq_s32_f32(vld1q_f32(mapx_ptr));
- const int32x4_t mapy_s32 = vcvtq_s32_f32(vld1q_f32(mapy_ptr));
- return vmlaq_s32(mapx_s32, mapy_s32, stride);
-}
-
-inline uint8_t pixel_bilinear_c1_clamp(const uint8_t *pixel_ptr, int32_t stride, int32_t width, int32_t height, float x, float y, uint8_t constant_border_value)
-{
- x = std::max(-1.f, std::min(x, static_cast<float>(width)));
- y = std::max(-1.f, std::min(y, static_cast<float>(height)));
-
- const int32_t xi = static_cast<int32_t>(std::floor(x));
- const int32_t yi = static_cast<int32_t>(std::floor(y));
-
- const float dx = x - static_cast<float>(xi);
- const float dy = y - static_cast<float>(yi);
-
- // Calculating the address won't trigger a segfault in case the value is outside the tensor
- // The ternary operator resolves the values in both conditions
- const uint8_t *a00 = (xi < 0 || xi >= width || yi < 0 || yi >= height) ? &constant_border_value : (pixel_ptr + xi + yi * stride);
- const uint8_t *a01 = (xi + 1 >= width || yi < 0 || yi >= height) ? &constant_border_value : (pixel_ptr + xi + 1 + yi * stride);
- const uint8_t *a10 = (xi < 0 || xi >= width || yi + 1 >= height) ? &constant_border_value : (pixel_ptr + xi + yi * stride + stride);
- const uint8_t *a11 = (xi + 1 >= width || yi + 1 >= height) ? &constant_border_value : (pixel_ptr + xi + 1 + yi * stride + stride);
-
- const float dx1 = 1.0f - dx;
- const float dy1 = 1.0f - dy;
- const float w1 = dx1 * dy1;
- const float w2 = dx * dy1;
- const float w3 = dx1 * dy;
- const float w4 = dx * dy;
-
- return static_cast<uint8_t>((*a00) * w1 + (*a01) * w2 + (*a10) * w3 + (*a11) * w4);
-}
-} // namespace
-
-NERemapKernel::NERemapKernel()
- : _func(nullptr), _input(nullptr), _output(nullptr), _map_x(nullptr), _map_y(nullptr), _border_mode(BorderMode::UNDEFINED), _constant_border_value(0)
-{
-}
-
-void NERemapKernel::configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
-
- _input = input;
- _output = output;
- _map_x = map_x;
- _map_y = map_y;
- _border_mode = border_mode;
- _constant_border_value = constant_border_value;
-
- switch(policy)
- {
- case InterpolationPolicy::NEAREST_NEIGHBOR:
- {
- _func = &NERemapKernel::remap_nearest;
- break;
- }
- case InterpolationPolicy::BILINEAR:
- {
- _func = &NERemapKernel::remap_bilinear;
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported interpolation mode");
- break;
- }
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info(), Steps());
- INEKernel::configure(win);
-}
-
-void NERemapKernel::remap_nearest(const Window &window)
-{
- // Don't increment in X and Y direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- const auto window_start_x = static_cast<int32_t>(window.x().start());
- const auto window_end_x = static_cast<int32_t>(window.x().end());
- const int32_t window_step_x = 8;
-
- // Don't increment in X direction for the output, mapx, mapy tensors
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(_input, win_in);
- Iterator out(_output, win);
- Iterator mapx(_map_x, win);
- Iterator mapy(_map_y, win);
-
- const int32_t width_val = static_cast<int32_t>(_input->info()->dimension(0));
- const int32_t height_val = static_cast<int32_t>(_input->info()->dimension(1));
- const int32_t in_stride_val = static_cast<int32_t>(_input->info()->strides_in_bytes()[1]);
- const int32x4_t width_1 = vdupq_n_s32(width_val - 1);
- const int32x4_t height_1 = vdupq_n_s32(height_val - 1);
- const int32x4_t in_stride = vdupq_n_s32(in_stride_val);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto mapx_ptr = reinterpret_cast<const float *>(mapx.ptr());
- auto mapy_ptr = reinterpret_cast<const float *>(mapy.ptr());
- const uint8_t *in_ptr = in.ptr();
- uint8_t *out_ptr = out.ptr();
- int32_t x = window_start_x;
- for(; x < window_end_x - window_step_x; x += window_step_x, mapx_ptr += window_step_x, mapy_ptr += window_step_x, out_ptr += window_step_x)
- {
- const int32_t out_of_tensor0 = num_out_of_tensor(mapx_ptr, mapy_ptr + 0, width_1, height_1);
- const int32_t out_of_tensor1 = num_out_of_tensor(mapx_ptr + 4, mapy_ptr + 4, width_1, height_1);
- const int32_t out_of_tensor = out_of_tensor0 + out_of_tensor1;
-
- if(out_of_tensor == -8)
- {
- // All elements are out of xy plane
- uint8x8_t tmp = vdup_n_u8(_constant_border_value);
- vst1_u8(out_ptr, tmp);
- }
- else if(out_of_tensor < 0)
- {
- // Some elements are out of xy plane
- serial_remap_nearest_interpolation(in_ptr, mapx_ptr, mapy_ptr, out_ptr, width_val, height_val, in_stride_val, _constant_border_value);
- serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 1, mapy_ptr + 1, out_ptr + 1, width_val, height_val, in_stride_val, _constant_border_value);
- serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 2, mapy_ptr + 2, out_ptr + 2, width_val, height_val, in_stride_val, _constant_border_value);
- serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 3, mapy_ptr + 3, out_ptr + 3, width_val, height_val, in_stride_val, _constant_border_value);
- serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 4, mapy_ptr + 4, out_ptr + 4, width_val, height_val, in_stride_val, _constant_border_value);
- serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 5, mapy_ptr + 5, out_ptr + 5, width_val, height_val, in_stride_val, _constant_border_value);
- serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 6, mapy_ptr + 6, out_ptr + 6, width_val, height_val, in_stride_val, _constant_border_value);
- serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 7, mapy_ptr + 7, out_ptr + 7, width_val, height_val, in_stride_val, _constant_border_value);
- }
- else
- {
- // All elements are in xy plane
- uint8x8_t tmp = vdup_n_u8(0);
- const int32x4_t offset0 = offset_nearest_interpolation(mapx_ptr, mapy_ptr, in_stride);
- const int32x4_t offset1 = offset_nearest_interpolation(mapx_ptr + 4, mapy_ptr + 4, in_stride);
- tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 0)], tmp, 0);
- tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 1)], tmp, 1);
- tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 2)], tmp, 2);
- tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 3)], tmp, 3);
- tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 0)], tmp, 4);
- tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 1)], tmp, 5);
- tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 2)], tmp, 6);
- tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 3)], tmp, 7);
- vst1_u8(out_ptr, tmp);
- }
- }
- for(; x < window_end_x; ++x, ++mapx_ptr, ++mapy_ptr, ++out_ptr)
- {
- serial_remap_nearest_interpolation(in_ptr, mapx_ptr, mapy_ptr, out_ptr, width_val, height_val, in_stride_val, _constant_border_value);
- }
- },
- in, out, mapx, mapy);
-}
-
-void NERemapKernel::remap_bilinear(const Window &window)
-{
- // Don't increment in X and Y direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- const auto window_start_x = static_cast<int32_t>(window.x().start());
- const auto window_end_x = static_cast<int32_t>(window.x().end());
- const int32_t window_step_x = 8;
-
- // Don't increment in X direction for the output, mapx, mapy tensors
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(_input, win_in);
- Iterator out(_output, win);
- Iterator mapx(_map_x, win);
- Iterator mapy(_map_y, win);
-
- const int32_t width_val = static_cast<int32_t>(_input->info()->dimension(0));
- const int32_t height_val = static_cast<int32_t>(_input->info()->dimension(1));
- const int32x4_t width_2 = vdupq_n_s32(width_val - 2);
- const int32x4_t height_2 = vdupq_n_s32(height_val - 2);
- const int32_t in_stride_val = static_cast<int32_t>(_input->info()->strides_in_bytes()[1]);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto mapx_ptr = reinterpret_cast<const float *>(mapx.ptr());
- auto mapy_ptr = reinterpret_cast<const float *>(mapy.ptr());
- const uint8_t *in_ptr = in.ptr();
- uint8_t *out_ptr = out.ptr();
- int32_t x = window_start_x;
- for(; x < window_end_x - window_step_x; x += window_step_x, mapx_ptr += window_step_x, mapy_ptr += window_step_x, out_ptr += window_step_x)
- {
- const int32_t out_of_tensor0 = num_out_of_tensor(mapx_ptr, mapy_ptr + 0, width_2, height_2);
- const int32_t out_of_tensor1 = num_out_of_tensor(mapx_ptr + 4, mapy_ptr + 4, width_2, height_2);
- const int32_t out_of_tensor = out_of_tensor0 + out_of_tensor1;
-
- if(out_of_tensor < 0)
- {
- // Elements are out of xy plane
- *(out_ptr) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[0], mapy_ptr[0], _constant_border_value);
- *(out_ptr + 1) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[1], mapy_ptr[1], _constant_border_value);
- *(out_ptr + 2) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[2], mapy_ptr[2], _constant_border_value);
- *(out_ptr + 3) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[3], mapy_ptr[3], _constant_border_value);
- *(out_ptr + 4) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[4], mapy_ptr[4], _constant_border_value);
- *(out_ptr + 5) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[5], mapy_ptr[5], _constant_border_value);
- *(out_ptr + 6) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[6], mapy_ptr[6], _constant_border_value);
- *(out_ptr + 7) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[7], mapy_ptr[7], _constant_border_value);
- }
- else
- {
- // All elements are in xy plane
- uint8x8_t tmp = vdup_n_u8(0);
- tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[0], mapy_ptr[0], _constant_border_value), tmp, 0);
- tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[1], mapy_ptr[1], _constant_border_value), tmp, 1);
- tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[2], mapy_ptr[2], _constant_border_value), tmp, 2);
- tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[3], mapy_ptr[3], _constant_border_value), tmp, 3);
- tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[4], mapy_ptr[4], _constant_border_value), tmp, 4);
- tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[5], mapy_ptr[5], _constant_border_value), tmp, 5);
- tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[6], mapy_ptr[6], _constant_border_value), tmp, 6);
- tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[7], mapy_ptr[7], _constant_border_value), tmp, 7);
- vst1_u8(out_ptr, tmp);
- }
- }
- for(; x < window_end_x; ++x, ++mapx_ptr, ++mapy_ptr, ++out_ptr)
- {
- *(out_ptr) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[0], mapy_ptr[0], _constant_border_value);
- }
- },
- in, out, mapx, mapy);
-}
-
-void NERemapKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
- (this->*_func)(window);
-}
-} // namespace arm_compute \ No newline at end of file
diff --git a/src/core/NEON/kernels/NERemapKernel.h b/src/core/NEON/kernels/NERemapKernel.h
deleted file mode 100644
index 33e929805..000000000
--- a/src/core/NEON/kernels/NERemapKernel.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEREMAPKERNEL_H
-#define ARM_COMPUTE_NEREMAPKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Kernel to perform a remap on a tensor */
-class NERemapKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NERemapKernel";
- }
- /** Default constructor */
- NERemapKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NERemapKernel(const NERemapKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NERemapKernel &operator=(const NERemapKernel &) = delete;
- /** Allow instances of this class to be moved */
- NERemapKernel(NERemapKernel &&) = default;
- /** Allow instances of this class to be moved */
- NERemapKernel &operator=(NERemapKernel &&) = default;
- /** Default destructor */
- ~NERemapKernel() = default;
-
- /** Initialize the kernel's input, output and border mode.
- *
- * @param[in] input Source tensor. Data type supported: U8.
- * @param[in] map_x Map for X coordinates. Data type supported: F32.
- * @param[in] map_y Map for Y coordinates. Data type supported: F32.
- * @param[out] output Destination tensor. Data types supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane.
- * @param[in] policy The interpolation type.
- * @param[in] border_mode Border mode to use on the input tensor.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. Defaults to 0.
- */
- void configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** function to perform nearest interpolation on the given window */
- void remap_nearest(const Window &window);
- /** function to perform bilinear interpolation on the given window */
- void remap_bilinear(const Window &window);
- /** Remap function to use for the particular interpolation type passed to configure() */
- void (NERemapKernel::*_func)(const Window &window);
-
- const ITensor *_input; /**< Input image */
- ITensor *_output; /**< Output image */
- const ITensor *_map_x; /**< Input remap x coordinates */
- const ITensor *_map_y; /**< Input remap y coordinates */
- BorderMode _border_mode; /**< Border mode */
- uint8_t _constant_border_value; /**< Border value to use */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEREMAPKERNEL_H */ \ No newline at end of file
diff --git a/src/core/NEON/kernels/NESelectKernel.cpp b/src/core/NEON/kernels/NESelectKernel.cpp
index 7c988e9fa..b8c9b244e 100644
--- a/src/core/NEON/kernels/NESelectKernel.cpp
+++ b/src/core/NEON/kernels/NESelectKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,6 +34,10 @@
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/common/Registrars.h"
+
+#include "src/cpu/kernels/select/list.h"
+
#include <arm_neon.h>
#include <map>
#include <string>
@@ -42,125 +46,123 @@ namespace arm_compute
{
namespace
{
-template <typename ScalarType, typename VectorType>
-void select_op(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- const int window_step_x, const int window_start_x, const int window_end_x, const int limit, VectorType (*condition_conversion)(const uint8_t *))
-{
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator condition(cond, win);
- Iterator input1(in1, win);
- Iterator input2(in2, win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
- const auto condition_ptr = reinterpret_cast<const uint8_t *>(condition.ptr());
- const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
- int x = window_start_x;
- for(; x <= limit; x += window_step_x)
- {
- const auto c = (*condition_conversion)(condition_ptr + x);
- const auto a = wrapper::vloadq(input1_ptr + x);
- const auto b = wrapper::vloadq(input2_ptr + x);
- wrapper::vstore(output_ptr + x, wrapper::vbsl(c, a, b));
- }
- for(; x < window_end_x; ++x)
- {
- const auto c = *(condition_ptr + x);
- const auto a = *(input1_ptr + x);
- const auto b = *(input2_ptr + x);
- *(output_ptr + x) = static_cast<bool>(c) ? a : b;
- }
- },
- condition, input1, input2, output);
-}
-
-template <typename ScalarType, typename VectorType>
-void select_op_8(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+struct SelectKernelSelectorData
{
- const auto window_step_x = 16 / sizeof(ScalarType);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
+ DataType dt;
+ bool is_same_rank;
+};
- select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType
- {
- static const auto zero = wrapper::vdup_n(static_cast<uint8_t>(0), arm_compute::wrapper::traits::vector_128_tag());
- return wrapper::vcgt(wrapper::vloadq(condition_ptr), zero);
- });
-}
+using SelectorPtr = std::add_pointer<bool(const SelectKernelSelectorData &data)>::type;
+using KernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, const ITensor *, ITensor *, const Window &)>::type;
-template <typename ScalarType, typename VectorType>
-void select_op_16(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+struct SelectKernelSelector
{
- const auto window_step_x = 16 / sizeof(ScalarType);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
+ const char *name;
+ const SelectorPtr is_selected;
+ KernelPtr ukernel;
+};
- select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType
- {
- static const auto zero = wrapper::vdup_n(static_cast<uint16_t>(0), arm_compute::wrapper::traits::vector_128_tag());
- return wrapper::vcgt(wrapper::vmovl(wrapper::vload(condition_ptr)), zero);
- });
-}
-
-template <typename ScalarType, typename VectorType>
-void select_op_32(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+static const SelectKernelSelector available_kernels[] =
{
- const auto window_step_x = 16 / sizeof(ScalarType);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType
{
- static const auto zero = wrapper::vdup_n(static_cast<uint32_t>(0), arm_compute::wrapper::traits::vector_128_tag());
- return wrapper::vcgt(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vload(condition_ptr)))), zero);
- });
-}
+ "neon_s8_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::S8 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_same_rank)
+ },
+ {
+ "neon_s16_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::S16 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_same_rank)
+ },
+ {
+ "neon_s32_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::S32 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_same_rank)
+ },
+ {
+ "neon_u8_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::U8 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_same_rank)
+ },
+ {
+ "neon_u16_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::U16 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_same_rank)
+ },
+ {
+ "neon_u32_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::U32 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_same_rank)
+ },
+ {
+ "neon_s8_not_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::S8 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_not_same_rank)
+ },
+ {
+ "neon_s16_not_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::S16 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_not_same_rank)
+ },
+ {
+ "neon_s32_not_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::S32 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_not_same_rank)
+ },
+ {
+ "neon_u8_not_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::U8 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_not_same_rank)
+ },
+ {
+ "neon_u16_not_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::U16 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_not_same_rank)
+ },
+ {
+ "neon_u32_not_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::U32 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_not_same_rank)
+ },
+ {
+ "neon_f16_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::F16 && data.is_same_rank == true; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_same_rank)
+ },
+ {
+ "neon_f16_not_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::F16 && data.is_same_rank == false; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_not_same_rank)
+ },
+ {
+ "neon_f32_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::F32 && data.is_same_rank == true; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_same_rank)
+ },
+ {
+ "neon_f32_not_same_rank",
+ [](const SelectKernelSelectorData & data) { return data.dt == DataType::F32 && data.is_same_rank == false; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_not_same_rank)
+ },
+};
-template <typename ScalarType>
-void select_op_not_same_rank(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+const SelectKernelSelector *get_implementation(const SelectKernelSelectorData &data)
{
- ARM_COMPUTE_UNUSED(window);
-
- auto output_ptr = reinterpret_cast<ScalarType *>(out->buffer());
- const auto condition_ptr = reinterpret_cast<const uint8_t *>(cond->buffer());
- const auto input1_ptr = reinterpret_cast<const ScalarType *>(in1->buffer());
- const auto input2_ptr = reinterpret_cast<const ScalarType *>(in2->buffer());
-
- const int outer_size = cond->info()->total_size() / cond->info()->element_size();
- const int inner_size = (in1->info()->total_size() / in1->info()->element_size()) / outer_size;
- int offset = 0;
- const int step = 16 / in1->info()->element_size();
-
- for(int i = 0; i < outer_size; ++i)
+ for(const auto &uk : available_kernels)
{
- int x = offset;
- const auto input_ptr = static_cast<bool>(*(condition_ptr + i)) ? input1_ptr : input2_ptr;
- for(; x <= offset + inner_size - step; x += step)
- {
- wrapper::vstore(output_ptr + x, wrapper::vloadq(input_ptr + x));
- }
- if(x <= offset + inner_size - (step / 2))
- {
- wrapper::vstore(output_ptr + x, wrapper::vload(input_ptr + x));
- x += step / 2;
- }
- for(; x < offset + inner_size; ++x)
+ if(uk.is_selected(data))
{
- *(output_ptr + x) = *(input_ptr + x);
+ return &uk;
}
- offset += inner_size;
}
+ return nullptr;
}
+
} // namespace
NESelectKernel::NESelectKernel()
- : _function(nullptr), _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false)
+ : /*_function(nullptr), */ _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false)
{
}
@@ -178,51 +180,6 @@ void NESelectKernel::configure(const ITensor *c, const ITensor *x, const ITensor
_output = output;
_has_same_rank = (c->info()->tensor_shape().num_dimensions() == x->info()->tensor_shape().num_dimensions());
- std::string function_to_call("op_");
- function_to_call += string_from_data_type(x->info()->data_type());
-
- static std::map<std::string, SelectFunction *> map_function;
-
- if(_has_same_rank)
- {
- map_function =
- {
- { "op_S8", &select_op_8<int8_t, uint8x16_t> },
- { "op_S16", &select_op_16<int16_t, uint16x8_t> },
- { "op_S32", &select_op_32<int32_t, uint32x4_t> },
- { "op_U8", &select_op_8<uint8_t, uint8x16_t> },
- { "op_U16", &select_op_16<uint16_t, uint16x8_t> },
- { "op_U32", &select_op_32<uint32_t, uint32x4_t> },
- { "op_F32", &select_op_32<float, uint32x4_t> }
- };
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- map_function["op_F16"] = &select_op_16<float16_t, uint16x8_t>;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- }
- else
- {
- map_function =
- {
- { "op_S8", &select_op_not_same_rank<int8_t> },
- { "op_S16", &select_op_not_same_rank<int16_t> },
- { "op_S32", &select_op_not_same_rank<int32_t> },
- { "op_U8", &select_op_not_same_rank<uint8_t> },
- { "op_U16", &select_op_not_same_rank<uint16_t> },
- { "op_U32", &select_op_not_same_rank<uint32_t> },
- { "op_F32", &select_op_not_same_rank<float> }
- };
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- map_function["op_F16"] = &select_op_not_same_rank<float16_t>;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- }
-
- auto it = map_function.find(function_to_call);
-
- if(it != map_function.end())
- {
- _function = it->second;
- }
-
Window win = calculate_max_window(*x->info());
INEKernel::configure(win);
}
@@ -254,7 +211,12 @@ void NESelectKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_function == nullptr);
- _function(_c, _x, _y, _output, window);
+ ARM_COMPUTE_ERROR_ON(_output == nullptr);
+ ARM_COMPUTE_ERROR_ON(_output->info() == nullptr);
+
+ const auto *uk = get_implementation(SelectKernelSelectorData{ _output->info()->data_type(), _has_same_rank });
+ ARM_COMPUTE_ERROR_ON(uk == nullptr);
+ ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
+ uk->ukernel(_c, _x, _y, _output, window);
}
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NESelectKernel.h b/src/core/NEON/kernels/NESelectKernel.h
index f7142feff..e82105a68 100644
--- a/src/core/NEON/kernels/NESelectKernel.h
+++ b/src/core/NEON/kernels/NESelectKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -82,22 +82,12 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
private:
- /** Common signature for all the specialised select functions
- *
- * @param[in] c Condition input tensor. Data types supported: U8.
- * @param[in] x First input tensor. Data types supported: All.
- * @param[in] y Second input tensor. Data types supported: Same as @p x
- * @param[in] output Output tensor. Data types supported: Same as @p x.
- */
- using SelectFunction = void(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window);
- /** Select function to use for the particular tensor types passed to configure() */
- SelectFunction *_function;
- const ITensor *_c; /**< Condition tensor */
- const ITensor *_x; /**< Source tensor 1 */
- const ITensor *_y; /**< Source tensor 2 */
- ITensor *_output; /**< Destination tensor */
- bool _has_same_rank; /**< Flag that indicates if condition tensor and other inputs have the same rank */
+ const ITensor *_c; /**< Condition tensor */
+ const ITensor *_x; /**< Source tensor 1 */
+ const ITensor *_y; /**< Source tensor 2 */
+ ITensor *_output; /**< Destination tensor */
+ bool _has_same_rank; /**< Flag that indicates if condition tensor and other inputs have the same rank */
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NESELECTKERNEL_H */
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
index 1d52b56d3..ea41529d8 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
@@ -136,7 +136,14 @@ UniqueDepthwiseCommon<TInput, TWeight, TOutput> depthwise(const DepthwiseArgs &a
{
const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> *impl = nullptr;
const bool success = find_implementation<TInput, TWeight, TOutput, OutputStage>(args, os, impl);
- return UniqueDepthwiseCommon<TInput, TWeight, TOutput>(success ? impl->get_instance(args, os) : nullptr);
+
+ if(success)
+ {
+ auto i = impl->get_instance(args, os);
+ i->set_name(impl->name);
+ return UniqueDepthwiseCommon<TInput, TWeight, TOutput>(i);
+ }
+ return nullptr;
}
} // namespace depthwise
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
index 20c823014..79fc65e56 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
@@ -23,7 +23,9 @@
*/
#pragma once
+#if !defined(__OpenBSD__)
#include <alloca.h>
+#endif /* !defined(__OpenBSD__) */
#include <algorithm>
#include <cassert>
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
index d3857a50e..809946f10 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
@@ -144,7 +144,7 @@ struct GemmImplementation<Top, Tret, Nothing> {
/* "Master" function implemented for each valid combination of types.
* Returns a list of GEMM implementation descriptors for processing by the
- * other functions, terminated by an implementation with
+ * other functions, ended by an implementation with
* method==GemmMethod::DEFAULT. */
template<typename Top, typename Tret, class OutputStage = Nothing>
const GemmImplementation<Top, Tret, OutputStage> *gemm_implementation_list();
diff --git a/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp b/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp
index d5003e4a1..91988e8c3 100644
--- a/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp
+++ b/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp
@@ -28,7 +28,9 @@
#include "interleave_indirect.hpp"
#include "bfloat.hpp"
+#if !defined(__OpenBSD__)
#include <alloca.h>
+#endif /* !defined(__OpenBSD__) */
#include <algorithm>
#include <cstddef>
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
index 758f2b1f8..9af1b4df1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
@@ -72,7 +72,7 @@ public:
return { 15.361, 0.9341, 0.1636 };
case CPUModel::V1:
- return { 62.40, 4.71, 0.67 };
+ return { 51.14, 7.38, 0.65 };
default:
return { 29.0698, 3.9793, 0.4003 };
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
index 21c9f5966..6d333f344 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
@@ -80,7 +80,7 @@ public:
return { 15.361, 0.9341, 0.1636 };
case CPUModel::V1:
- return { 62.40, 4.71, 0.67 };
+ return { 51.14, 7.38, 0.65 };
default:
return { 29.0698, 3.9793, 0.4003 };
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp
index 090dd5855..e6e795097 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp
@@ -82,7 +82,7 @@ public:
case CPUModel::A510:
return { 6.81 };
case CPUModel::V1:
- return { 28.40 };
+ return { 22.33 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp
index f5e9009f6..39ffcbef1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp
@@ -82,7 +82,7 @@ public:
case CPUModel::A510:
return { 6.70 };
case CPUModel::V1:
- return { 26.64 };
+ return { 21.28 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
index 94f578368..905a60265 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
@@ -84,7 +84,7 @@ public:
case CPUModel::A510:
return { 14.81 };
case CPUModel::V1:
- return { 48.34 };
+ return { 44.54 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp
index bc933afd9..69ea87bc9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp
@@ -81,7 +81,7 @@ public:
case CPUModel::A510:
return { 27.99 };
case CPUModel::V1:
- return { 68.76 };
+ return { 62.26 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
index c5105a6d4..ce96c1b28 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
@@ -75,29 +75,29 @@ public:
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
- if (std::is_same<T, uint8_t>::value) {
+ if (std::is_same<T, uint32_t>::value) {
switch (ci->get_cpu_model()) {
- case CPUModel::A55r1:
- return { 9.5238, 2.0799, 0.2279 };
default:
- return { 29.6736, 11.4025, 0.5591 };
+ return { 31.63 };
case CPUModel::A510:
- return { 16.65, 3.92, 0.48 };
+ return { 15.89 };
case CPUModel::V1:
- return { 55.42, 19.29, 0.92 };
+ return { 53.87 };
+ case CPUModel::A55r1:
+ return { 9.217 };
}
}
- if (std::is_same<T, uint32_t>::value) {
+ if (std::is_same<T, uint8_t>::value) {
switch (ci->get_cpu_model()) {
- default:
- return { 31.63 };
case CPUModel::A55r1:
- return { 9.217 };
+ return { 9.5238, 2.0799, 0.2279 };
+ default:
+ return { 29.6736, 11.4025, 0.5591 };
case CPUModel::A510:
- return { 15.89 };
+ return { 16.65, 3.92, 0.48 };
case CPUModel::V1:
- return { 53.87 };
+ return { 42.62, 16.32, 0.83 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp
index 24bad3c63..b5cedc7e9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp
@@ -92,7 +92,7 @@ public:
case CPUModel::A510:
return { 33.64, 3.92, 0.48 };
case CPUModel::V1:
- return { 86.71, 19.00, 0.93 };
+ return { 63.94, 16.18, 0.83 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
index 9b3517a80..6ec6bd2ed 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
@@ -36,6 +36,7 @@ namespace arm_gemm
{
// Actual kernel implementations
void a64_interleaved_bf16fp32_mmla_8x12( ARGLIST );
+void a64_interleaved_bf16fp32_mmla_8x12_a510( ARGLIST );
class cls_a64_interleaved_bf16fp32_mmla_8x12
{
@@ -78,7 +79,7 @@ public:
default:
return { 31.54, 4.30, 7.33 };
case CPUModel::V1:
- return { 59.94, 5.08, 9.83 };
+ return { 41.44, 5.01, 5.64 };
case CPUModel::A510:
return { 7.82, 4.05, 3.07 };
}
@@ -101,8 +102,15 @@ public:
// Default to the generic kernel
kern_type kernel=a64_interleaved_bf16fp32_mmla_8x12;
- cls_a64_interleaved_bf16fp32_mmla_8x12(const CPUInfo *)
+ cls_a64_interleaved_bf16fp32_mmla_8x12(const CPUInfo *ci)
{
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A510:
+ kernel=a64_interleaved_bf16fp32_mmla_8x12_a510;
+ break;
+ }
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp
new file mode 100644
index 000000000..0235e91bf
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+#include "../../bfloat.hpp"
+
+namespace arm_gemm {
+
+void a64_interleaved_bf16fp32_mmla_8x12_a510(
+ const bfloat16 *Apanel, const bfloat16 *Bpanel,
+ float *Cpanel, int ablocks, int bblocks, int K) {
+
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const bfloat16 *Bpanel = {};
+ } ka;
+
+ ka.bblocks = bblocks;
+ ka.K = (K/4) - 1;
+ ka.Bpanel = Bpanel;
+
+ __asm__ __volatile__(
+
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "mov %x[Apanel], x21\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "ld1 { v0.8h }, [%x[Apanel]], #0x10\n"
+ "ld1 { v4.8h }, [x20], #0x10\n"
+ "ld1 { v1.8h }, [%x[Apanel]], #0x10\n"
+ "cmp x19, #0x2\n"
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "ld1 { v5.8h }, [x20], #0x10\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "ld1 { v2.8h }, [%x[Apanel]], #0x10\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "ld1 { v3.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x6e45ec0b // bfmmla v11.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec31 // bfmmla v17.4s, v1.8h, v5.8h\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+ "sub x19, x19, #0x2\n"
+ ".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
+ "cmp x19, #0x2\n"
+ ".inst 0x6e45ec7d // bfmmla v29.4s, v3.8h, v5.8h\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
+ "ld1 { v0.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec7b // bfmmla v27.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e47ec32 // bfmmla v18.4s, v1.8h, v7.8h\n"
+ "ld1 { v6.8h }, [x20], #0x10\n"
+ ".inst 0x6e44ec30 // bfmmla v16.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e45ec33 // bfmmla v19.4s, v1.8h, v5.8h\n"
+ "ld1 { v1.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e47ec58 // bfmmla v24.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec7e // bfmmla v30.4s, v3.8h, v7.8h\n"
+ "ld1 { v7.8h }, [x20], #0x10\n"
+ ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n"
+ "ld1 { v2.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e44ec7c // bfmmla v28.4s, v3.8h, v4.8h\n"
+ ".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n"
+ "ld1 { v3.8h }, [%x[Apanel]], #0x10\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec31 // bfmmla v17.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e46ec7a // bfmmla v26.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e47ec7d // bfmmla v29.4s, v3.8h, v7.8h\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x6e44ec09 // bfmmla v9.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ "ld1 { v0.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e44ec2f // bfmmla v15.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e44ec55 // bfmmla v21.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e44ec7b // bfmmla v27.4s, v3.8h, v4.8h\n"
+ ".inst 0x6e45ec32 // bfmmla v18.4s, v1.8h, v5.8h\n"
+ "ld1 { v4.8h }, [x20], #0x10\n"
+ ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e47ec33 // bfmmla v19.4s, v1.8h, v7.8h\n"
+ "ld1 { v1.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e45ec7e // bfmmla v30.4s, v3.8h, v5.8h\n"
+ "ld1 { v5.8h }, [x20], #0x10\n"
+ ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n"
+ "ld1 { v2.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "ld1 { v3.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x6e45ec0b // bfmmla v11.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec31 // bfmmla v17.4s, v1.8h, v5.8h\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
+ ".inst 0x6e45ec7d // bfmmla v29.4s, v3.8h, v5.8h\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec7b // bfmmla v27.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e47ec32 // bfmmla v18.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e44ec30 // bfmmla v16.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e45ec33 // bfmmla v19.4s, v1.8h, v5.8h\n"
+ ".inst 0x6e47ec58 // bfmmla v24.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec7e // bfmmla v30.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec7c // bfmmla v28.4s, v3.8h, v4.8h\n"
+ ".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n"
+ "cbz x19, 5f\n"
+ "ld1 { v0.8h }, [%x[Apanel]], #0x10\n"
+ "ld1 { v6.8h }, [x20], #0x10\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "ld1 { v1.8h }, [%x[Apanel]], #0x10\n"
+ "ld1 { v7.8h }, [x20], #0x10\n"
+ ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
+ "ld1 { v2.8h }, [%x[Apanel]], #0x10\n"
+ "ld1 { v3.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x6e47ec31 // bfmmla v17.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e46ec7a // bfmmla v26.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e47ec7d // bfmmla v29.4s, v3.8h, v7.8h\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x6e44ec09 // bfmmla v9.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec2f // bfmmla v15.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e44ec55 // bfmmla v21.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e44ec7b // bfmmla v27.4s, v3.8h, v4.8h\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e45ec32 // bfmmla v18.4s, v1.8h, v5.8h\n"
+ ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e47ec33 // bfmmla v19.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e45ec7e // bfmmla v30.4s, v3.8h, v5.8h\n"
+ ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n"
+ "5:" // multiply loop done
+ "subs x22, x22, #0x1\n"
+ "uzp1 v4.2d, v8.2d, v11.2d\n"
+ "uzp2 v8.2d, v8.2d, v11.2d\n"
+ "uzp1 v11.2d, v9.2d, v12.2d\n"
+ "uzp2 v9.2d, v9.2d, v12.2d\n"
+ "str q4, [%x[Cpanel], #0x0]\n"
+ "uzp1 v12.2d, v10.2d, v13.2d\n"
+ "uzp2 v10.2d, v10.2d, v13.2d\n"
+ "str q11, [%x[Cpanel], #0x10]\n"
+ "str q12, [%x[Cpanel], #0x20]\n"
+ "uzp1 v13.2d, v14.2d, v17.2d\n"
+ "uzp2 v14.2d, v14.2d, v17.2d\n"
+ "str q8, [%x[Cpanel], #0x30]\n"
+ "uzp1 v17.2d, v15.2d, v18.2d\n"
+ "uzp2 v15.2d, v15.2d, v18.2d\n"
+ "str q9, [%x[Cpanel], #0x40]\n"
+ "uzp1 v18.2d, v16.2d, v19.2d\n"
+ "uzp2 v16.2d, v16.2d, v19.2d\n"
+ "str q10, [%x[Cpanel], #0x50]\n"
+ "uzp1 v19.2d, v20.2d, v23.2d\n"
+ "uzp2 v20.2d, v20.2d, v23.2d\n"
+ "str q13, [%x[Cpanel], #0x60]\n"
+ "uzp1 v23.2d, v21.2d, v24.2d\n"
+ "uzp2 v21.2d, v21.2d, v24.2d\n"
+ "str q17, [%x[Cpanel], #0x70]\n"
+ "uzp1 v24.2d, v22.2d, v25.2d\n"
+ "uzp2 v22.2d, v22.2d, v25.2d\n"
+ "str q18, [%x[Cpanel], #0x80]\n"
+ "uzp1 v25.2d, v26.2d, v29.2d\n"
+ "uzp2 v26.2d, v26.2d, v29.2d\n"
+ "str q14, [%x[Cpanel], #0x90]\n"
+ "uzp1 v29.2d, v27.2d, v30.2d\n"
+ "uzp2 v27.2d, v27.2d, v30.2d\n"
+ "str q15, [%x[Cpanel], #0xa0]\n"
+ "uzp1 v30.2d, v28.2d, v31.2d\n"
+ "uzp2 v28.2d, v28.2d, v31.2d\n"
+ "str q16, [%x[Cpanel], #0xb0]\n"
+ "str q19, [%x[Cpanel], #0xc0]\n"
+ "str q23, [%x[Cpanel], #0xd0]\n"
+ "str q24, [%x[Cpanel], #0xe0]\n"
+ "str q20, [%x[Cpanel], #0xf0]\n"
+ "str q21, [%x[Cpanel], #0x100]\n"
+ "str q22, [%x[Cpanel], #0x110]\n"
+ "str q25, [%x[Cpanel], #0x120]\n"
+ "str q29, [%x[Cpanel], #0x130]\n"
+ "str q30, [%x[Cpanel], #0x140]\n"
+ "str q26, [%x[Cpanel], #0x150]\n"
+ "str q27, [%x[Cpanel], #0x160]\n"
+ "str q28, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
index ff69bc8f5..4cc3ed040 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
@@ -35,6 +35,7 @@ namespace arm_gemm
{
// Actual kernel implementations
void a64_interleaved_s8s32_mmla_8x12( ARGLIST );
+void a64_interleaved_s8s32_mmla_8x12_a510( ARGLIST );
class cls_a64_interleaved_s8s32_mmla_8x12
{
@@ -91,7 +92,7 @@ public:
case CPUModel::A510:
return { 48.22, 2.49, 0.29 };
case CPUModel::V1:
- return { 116.76, 4.67, 0.60 };
+ return { 75.54, 8.06, 0.63 };
}
}
@@ -100,9 +101,17 @@ public:
// Default to the generic kernel
kern_type kernel=a64_interleaved_s8s32_mmla_8x12;
- cls_a64_interleaved_s8s32_mmla_8x12(const CPUInfo *)
+ cls_a64_interleaved_s8s32_mmla_8x12(const CPUInfo *ci)
{
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A510:
+ kernel=a64_interleaved_s8s32_mmla_8x12_a510;
+ break;
+ }
}
+
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp
new file mode 100644
index 000000000..a4d8c0ace
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_interleaved_s8s32_mmla_8x12_a510(
+ const int8_t *Apanel, const int8_t *Bpanel,
+ int32_t *Cpanel, int ablocks, int bblocks, int K) {
+
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const int8_t *Bpanel = {};
+ } ka;
+
+ ka.bblocks = bblocks;
+ ka.K = (K/8) - 1;
+ ka.Bpanel = Bpanel;
+
+ __asm__ __volatile__(
+
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "mov %x[Apanel], x21\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v4.16b }, [x20], #0x10\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ "cmp x19, #0x2\n"
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "ld1 { v5.16b }, [x20], #0x10\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x4e85a40b // smmla v11.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e85a431 // smmla v17.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n"
+ "sub x19, x19, #0x2\n"
+ ".inst 0x4e85a457 // smmla v23.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n"
+ "cmp x19, #0x2\n"
+ ".inst 0x4e85a47d // smmla v29.4s, v3.16b, v5.16b\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e84a40a // smmla v10.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e85a40d // smmla v13.4s, v0.16b, v5.16b\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a47b // smmla v27.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e87a432 // smmla v18.4s, v1.16b, v7.16b\n"
+ "ld1 { v6.16b }, [x20], #0x10\n"
+ ".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e87a458 // smmla v24.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a47e // smmla v30.4s, v3.16b, v7.16b\n"
+ "ld1 { v7.16b }, [x20], #0x10\n"
+ ".inst 0x4e84a456 // smmla v22.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e84a47c // smmla v28.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e85a47f // smmla v31.4s, v3.16b, v5.16b\n"
+ "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a431 // smmla v17.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e86a47a // smmla v26.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e87a47d // smmla v29.4s, v3.16b, v7.16b\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x4e84a409 // smmla v9.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e85a40c // smmla v12.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e84a42f // smmla v15.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e84a455 // smmla v21.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e84a47b // smmla v27.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e85a432 // smmla v18.4s, v1.16b, v5.16b\n"
+ "ld1 { v4.16b }, [x20], #0x10\n"
+ ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e87a433 // smmla v19.4s, v1.16b, v7.16b\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e85a47e // smmla v30.4s, v3.16b, v5.16b\n"
+ "ld1 { v5.16b }, [x20], #0x10\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e86a47c // smmla v28.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e87a47f // smmla v31.4s, v3.16b, v7.16b\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x4e85a40b // smmla v11.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e85a431 // smmla v17.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e85a457 // smmla v23.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e85a47d // smmla v29.4s, v3.16b, v5.16b\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a47b // smmla v27.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e84a40a // smmla v10.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e85a40d // smmla v13.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e87a432 // smmla v18.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e87a458 // smmla v24.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a47e // smmla v30.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e84a456 // smmla v22.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e84a47c // smmla v28.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e85a47f // smmla v31.4s, v3.16b, v5.16b\n"
+ "cbz x19, 5f\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v6.16b }, [x20], #0x10\n"
+ ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v7.16b }, [x20], #0x10\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x4e87a431 // smmla v17.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e86a47a // smmla v26.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e87a47d // smmla v29.4s, v3.16b, v7.16b\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x4e84a409 // smmla v9.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e84a42f // smmla v15.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e84a455 // smmla v21.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e84a47b // smmla v27.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e85a40c // smmla v12.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e85a432 // smmla v18.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e87a433 // smmla v19.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e85a47e // smmla v30.4s, v3.16b, v5.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e86a47c // smmla v28.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e87a47f // smmla v31.4s, v3.16b, v7.16b\n"
+ "5:" // multiply loop done
+ "subs x22, x22, #0x1\n"
+ "uzp1 v4.2d, v8.2d, v11.2d\n"
+ "uzp2 v8.2d, v8.2d, v11.2d\n"
+ "uzp1 v11.2d, v9.2d, v12.2d\n"
+ "uzp2 v9.2d, v9.2d, v12.2d\n"
+ "str q4, [%x[Cpanel], #0x0]\n"
+ "uzp1 v12.2d, v10.2d, v13.2d\n"
+ "uzp2 v10.2d, v10.2d, v13.2d\n"
+ "str q11, [%x[Cpanel], #0x10]\n"
+ "str q12, [%x[Cpanel], #0x20]\n"
+ "uzp1 v13.2d, v14.2d, v17.2d\n"
+ "uzp2 v14.2d, v14.2d, v17.2d\n"
+ "str q8, [%x[Cpanel], #0x30]\n"
+ "uzp1 v17.2d, v15.2d, v18.2d\n"
+ "uzp2 v15.2d, v15.2d, v18.2d\n"
+ "str q9, [%x[Cpanel], #0x40]\n"
+ "uzp1 v18.2d, v16.2d, v19.2d\n"
+ "uzp2 v16.2d, v16.2d, v19.2d\n"
+ "str q10, [%x[Cpanel], #0x50]\n"
+ "uzp1 v19.2d, v20.2d, v23.2d\n"
+ "uzp2 v20.2d, v20.2d, v23.2d\n"
+ "str q13, [%x[Cpanel], #0x60]\n"
+ "uzp1 v23.2d, v21.2d, v24.2d\n"
+ "uzp2 v21.2d, v21.2d, v24.2d\n"
+ "str q17, [%x[Cpanel], #0x70]\n"
+ "uzp1 v24.2d, v22.2d, v25.2d\n"
+ "uzp2 v22.2d, v22.2d, v25.2d\n"
+ "str q18, [%x[Cpanel], #0x80]\n"
+ "uzp1 v25.2d, v26.2d, v29.2d\n"
+ "uzp2 v26.2d, v26.2d, v29.2d\n"
+ "str q14, [%x[Cpanel], #0x90]\n"
+ "uzp1 v29.2d, v27.2d, v30.2d\n"
+ "uzp2 v27.2d, v27.2d, v30.2d\n"
+ "str q15, [%x[Cpanel], #0xa0]\n"
+ "uzp1 v30.2d, v28.2d, v31.2d\n"
+ "uzp2 v28.2d, v28.2d, v31.2d\n"
+ "str q16, [%x[Cpanel], #0xb0]\n"
+ "str q19, [%x[Cpanel], #0xc0]\n"
+ "str q23, [%x[Cpanel], #0xd0]\n"
+ "str q24, [%x[Cpanel], #0xe0]\n"
+ "str q20, [%x[Cpanel], #0xf0]\n"
+ "str q21, [%x[Cpanel], #0x100]\n"
+ "str q22, [%x[Cpanel], #0x110]\n"
+ "str q25, [%x[Cpanel], #0x120]\n"
+ "str q29, [%x[Cpanel], #0x130]\n"
+ "str q30, [%x[Cpanel], #0x140]\n"
+ "str q26, [%x[Cpanel], #0x150]\n"
+ "str q27, [%x[Cpanel], #0x160]\n"
+ "str q28, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
index f492a474a..fa93c1d90 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
@@ -35,6 +35,7 @@ namespace arm_gemm
{
// Actual kernel implementations
void a64_interleaved_u8u32_mmla_8x12( ARGLIST );
+void a64_interleaved_u8u32_mmla_8x12_a510( ARGLIST );
class cls_a64_interleaved_u8u32_mmla_8x12
{
@@ -91,7 +92,7 @@ public:
case CPUModel::A510:
return { 47.66, 2.47, 0.29 };
case CPUModel::V1:
- return { 111.60, 4.95, 0.66 };
+ return { 75.54, 8.06, 0.63 };
}
}
@@ -100,8 +101,15 @@ public:
// Default to the generic kernel
kern_type kernel=a64_interleaved_u8u32_mmla_8x12;
- cls_a64_interleaved_u8u32_mmla_8x12(const CPUInfo *)
+ cls_a64_interleaved_u8u32_mmla_8x12(const CPUInfo *ci)
{
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A510:
+ kernel=a64_interleaved_u8u32_mmla_8x12_a510;
+ break;
+ }
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp
new file mode 100644
index 000000000..3fe1a9bd0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_interleaved_u8u32_mmla_8x12_a510(
+ const uint8_t *Apanel, const uint8_t *Bpanel,
+ uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const uint8_t *Bpanel = {};
+ } ka;
+
+ ka.bblocks = bblocks;
+ ka.K = (K/8) - 1;
+ ka.Bpanel = Bpanel;
+
+ __asm__ __volatile__(
+
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "mov %x[Apanel], x21\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v4.16b }, [x20], #0x10\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ "cmp x19, #0x2\n"
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "ld1 { v5.16b }, [x20], #0x10\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x6e85a40b // ummla v11.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e85a431 // ummla v17.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
+ "sub x19, x19, #0x2\n"
+ ".inst 0x6e85a457 // ummla v23.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
+ "cmp x19, #0x2\n"
+ ".inst 0x6e85a47d // ummla v29.4s, v3.16b, v5.16b\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e84a40a // ummla v10.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e85a40d // ummla v13.4s, v0.16b, v5.16b\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a47b // ummla v27.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e87a432 // ummla v18.4s, v1.16b, v7.16b\n"
+ "ld1 { v6.16b }, [x20], #0x10\n"
+ ".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e87a458 // ummla v24.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a47e // ummla v30.4s, v3.16b, v7.16b\n"
+ "ld1 { v7.16b }, [x20], #0x10\n"
+ ".inst 0x6e84a456 // ummla v22.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e84a47c // ummla v28.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e85a47f // ummla v31.4s, v3.16b, v5.16b\n"
+ "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a431 // ummla v17.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e86a47a // ummla v26.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e87a47d // ummla v29.4s, v3.16b, v7.16b\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x6e84a409 // ummla v9.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e85a40c // ummla v12.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e84a42f // ummla v15.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e84a455 // ummla v21.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e84a47b // ummla v27.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e85a432 // ummla v18.4s, v1.16b, v5.16b\n"
+ "ld1 { v4.16b }, [x20], #0x10\n"
+ ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e87a433 // ummla v19.4s, v1.16b, v7.16b\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e85a47e // ummla v30.4s, v3.16b, v5.16b\n"
+ "ld1 { v5.16b }, [x20], #0x10\n"
+ ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e86a47c // ummla v28.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e87a47f // ummla v31.4s, v3.16b, v7.16b\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x6e85a40b // ummla v11.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e85a431 // ummla v17.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e85a457 // ummla v23.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e85a47d // ummla v29.4s, v3.16b, v5.16b\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a47b // ummla v27.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e84a40a // ummla v10.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e85a40d // ummla v13.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e87a432 // ummla v18.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e87a458 // ummla v24.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a47e // ummla v30.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e84a456 // ummla v22.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e84a47c // ummla v28.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e85a47f // ummla v31.4s, v3.16b, v5.16b\n"
+ "cbz x19, 5f\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v6.16b }, [x20], #0x10\n"
+ ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v7.16b }, [x20], #0x10\n"
+ ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x6e87a431 // ummla v17.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e86a47a // ummla v26.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e87a47d // ummla v29.4s, v3.16b, v7.16b\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x6e84a409 // ummla v9.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e84a42f // ummla v15.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e84a455 // ummla v21.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e84a47b // ummla v27.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e85a40c // ummla v12.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e85a432 // ummla v18.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e87a433 // ummla v19.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e85a47e // ummla v30.4s, v3.16b, v5.16b\n"
+ ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e86a47c // ummla v28.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e87a47f // ummla v31.4s, v3.16b, v7.16b\n"
+ "5:" // multiply loop done
+ "subs x22, x22, #0x1\n"
+ "uzp1 v4.2d, v8.2d, v11.2d\n"
+ "uzp2 v8.2d, v8.2d, v11.2d\n"
+ "uzp1 v11.2d, v9.2d, v12.2d\n"
+ "uzp2 v9.2d, v9.2d, v12.2d\n"
+ "str q4, [%x[Cpanel], #0x0]\n"
+ "uzp1 v12.2d, v10.2d, v13.2d\n"
+ "uzp2 v10.2d, v10.2d, v13.2d\n"
+ "str q11, [%x[Cpanel], #0x10]\n"
+ "str q12, [%x[Cpanel], #0x20]\n"
+ "uzp1 v13.2d, v14.2d, v17.2d\n"
+ "uzp2 v14.2d, v14.2d, v17.2d\n"
+ "str q8, [%x[Cpanel], #0x30]\n"
+ "uzp1 v17.2d, v15.2d, v18.2d\n"
+ "uzp2 v15.2d, v15.2d, v18.2d\n"
+ "str q9, [%x[Cpanel], #0x40]\n"
+ "uzp1 v18.2d, v16.2d, v19.2d\n"
+ "uzp2 v16.2d, v16.2d, v19.2d\n"
+ "str q10, [%x[Cpanel], #0x50]\n"
+ "uzp1 v19.2d, v20.2d, v23.2d\n"
+ "uzp2 v20.2d, v20.2d, v23.2d\n"
+ "str q13, [%x[Cpanel], #0x60]\n"
+ "uzp1 v23.2d, v21.2d, v24.2d\n"
+ "uzp2 v21.2d, v21.2d, v24.2d\n"
+ "str q17, [%x[Cpanel], #0x70]\n"
+ "uzp1 v24.2d, v22.2d, v25.2d\n"
+ "uzp2 v22.2d, v22.2d, v25.2d\n"
+ "str q18, [%x[Cpanel], #0x80]\n"
+ "uzp1 v25.2d, v26.2d, v29.2d\n"
+ "uzp2 v26.2d, v26.2d, v29.2d\n"
+ "str q14, [%x[Cpanel], #0x90]\n"
+ "uzp1 v29.2d, v27.2d, v30.2d\n"
+ "uzp2 v27.2d, v27.2d, v30.2d\n"
+ "str q15, [%x[Cpanel], #0xa0]\n"
+ "uzp1 v30.2d, v28.2d, v31.2d\n"
+ "uzp2 v28.2d, v28.2d, v31.2d\n"
+ "str q16, [%x[Cpanel], #0xb0]\n"
+ "str q19, [%x[Cpanel], #0xc0]\n"
+ "str q23, [%x[Cpanel], #0xd0]\n"
+ "str q24, [%x[Cpanel], #0xe0]\n"
+ "str q20, [%x[Cpanel], #0xf0]\n"
+ "str q21, [%x[Cpanel], #0x100]\n"
+ "str q22, [%x[Cpanel], #0x110]\n"
+ "str q25, [%x[Cpanel], #0x120]\n"
+ "str q29, [%x[Cpanel], #0x130]\n"
+ "str q30, [%x[Cpanel], #0x140]\n"
+ "str q26, [%x[Cpanel], #0x150]\n"
+ "str q27, [%x[Cpanel], #0x160]\n"
+ "str q28, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp
index 30e265fbc..ab175a375 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp
@@ -83,7 +83,7 @@ public:
case CPUModel::A510:
return { 5.42 };
case CPUModel::V1:
- return { 34.56 };
+ return { 20.83 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp
index 61c7ad17e..b7c9aca9d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp
@@ -83,7 +83,7 @@ public:
case CPUModel::A510:
return { 5.31 };
case CPUModel::V1:
- return { 28.93 };
+ return { 17.32 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
index b8ca7c545..28057aa96 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
@@ -94,7 +94,7 @@ public:
case CPUModel::A510:
return { 22.77, 3.90, 0.47 };
case CPUModel::V1:
- return { 62.97, 19.14, 0.92 };
+ return { 48.09, 16.24, 0.83 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp
index b88ef14f2..c08977570 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp
@@ -94,7 +94,7 @@ public:
case CPUModel::A510:
return { 23.87, 3.89, 0.37 };
case CPUModel::V1:
- return { 107.63, 19.24, 0.92 };
+ return { 75.14, 15.87, 0.83 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
index d870711c6..901cc6d63 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
@@ -94,7 +94,7 @@ public:
case CPUModel::A510:
return { 22.75, 3.90, 0.47 };
case CPUModel::V1:
- return { 62.97, 19.27, 0.92 };
+ return { 48.09, 16.24, 0.83 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp
index 7f8eadc52..c0d089278 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp
@@ -94,7 +94,7 @@ public:
case CPUModel::A510:
return { 26.80, 3.89, 0.47 };
case CPUModel::V1:
- return { 108.33, 18.66, 0.92 };
+ return { 75.14, 15.87, 0.83 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
index fa44bdbd3..fc91dd71a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
@@ -80,7 +80,7 @@ public:
case CPUModel::A510:
return { 7.78, 4.01, 2.43 };
case CPUModel::V1:
- return { 62.50, 5.09, 11.32 };
+ return { 47.63, 5.11, 6.80 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
index 1924b2cf7..0d707b039 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
@@ -89,7 +89,7 @@ public:
default:
return { 31.67, 3.57, 0.50 };
case CPUModel::V1:
- return { 63.35, 4.76, 0.77 };
+ return { 52.24, 7.49, 0.80 };
case CPUModel::A510:
return { 27.47, 1.70, 0.28 };
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
index bd1764bb7..4e65296f8 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
@@ -89,7 +89,7 @@ public:
default:
return { 61.97, 3.64, 0.50 };
case CPUModel::V1:
- return { 123.84, 4.93, 0.76 };
+ return { 95.28, 7.99, 0.79 };
case CPUModel::A510:
return { 43.36, 1.86, 0.28 };
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
index f66a9bf51..0afcdd2ce 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
@@ -91,7 +91,7 @@ public:
case CPUModel::A510:
return { 27.45, 1.65, 0.28 };
case CPUModel::V1:
- return { 63.35, 4.96, 0.77 };
+ return { 52.24, 7.49, 0.80 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
index b530202bd..58d21d6c4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
@@ -91,7 +91,7 @@ public:
case CPUModel::A510:
return { 38.02, 1.85, 0.28 };
case CPUModel::V1:
- return { 123.84, 4.98, 0.76 };
+ return { 95.28, 7.99, 0.79 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/transform.cpp b/src/core/NEON/kernels/arm_gemm/transform.cpp
index 60376ab80..c6a3bc0ed 100644
--- a/src/core/NEON/kernels/arm_gemm/transform.cpp
+++ b/src/core/NEON/kernels/arm_gemm/transform.cpp
@@ -25,7 +25,9 @@
#include "bfloat.hpp"
+#if !defined(__OpenBSD__)
#include <alloca.h>
+#endif /* !defined(__OpenBSD__) */
namespace arm_gemm {
diff --git a/src/core/NEON/kernels/assembly/depthwise.hpp b/src/core/NEON/kernels/assembly/depthwise.hpp
index eadf48d00..9262ea05a 100644
--- a/src/core/NEON/kernels/assembly/depthwise.hpp
+++ b/src/core/NEON/kernels/assembly/depthwise.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -78,10 +78,20 @@ struct DepthwiseArgs
template <typename TInput, typename TWeight, typename TOutput>
class DepthwiseCommon : public IDepthwiseCommon
{
+private:
+ std::string _name{};
+
protected:
const DepthwiseArgs m_args; // Copy of arguments
-
public:
+ std::string name() const
+ {
+ return _name;
+ }
+ void set_name(const std::string &n)
+ {
+ _name = n;
+ }
DepthwiseCommon(const DepthwiseArgs &args)
: m_args(args) {};
DepthwiseCommon(DepthwiseCommon &) = delete;