diff options
Diffstat (limited to 'src/core/NEON')
53 files changed, 1570 insertions, 1699 deletions
diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h index af301c8d1..cd01659c0 100644 --- a/src/core/NEON/NEKernels.h +++ b/src/core/NEON/NEKernels.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2021 Arm Limited. + * Copyright (c) 2016-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -55,7 +55,6 @@ #include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h" #include "src/core/NEON/kernels/NERangeKernel.h" #include "src/core/NEON/kernels/NEReductionOperationKernel.h" -#include "src/core/NEON/kernels/NERemapKernel.h" #include "src/core/NEON/kernels/NEReorgLayerKernel.h" #include "src/core/NEON/kernels/NEReverseKernel.h" #include "src/core/NEON/kernels/NESelectKernel.h" diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp index 1e0a1742f..69bfd56ce 100644 --- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp +++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -28,8 +28,10 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" #include "src/core/CPP/Validate.h" +#include "src/core/common/Registrars.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/boundingboxtransform/list.h" #include <arm_neon.h> @@ -37,6 +39,62 @@ namespace arm_compute { namespace { +struct BoundingBoxTransformSelectorData +{ + DataType dt; +}; + +using BoundingBoxTransformSelctorPtr = std::add_pointer<bool(const BoundingBoxTransformSelectorData &data)>::type; +using BoundingBoxTransformUKernelPtr = std::add_pointer<void(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)>::type; + +struct BoundingBoxTransformKernel +{ + const char *name; + const BoundingBoxTransformSelctorPtr is_selected; + BoundingBoxTransformUKernelPtr ukernel; +}; + +static const BoundingBoxTransformKernel available_kernels[] = +{ + { + "fp32_neon_boundingboxtransform", + [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_boundingboxtransform) + }, +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + { + "fp16_neon_boundingboxtransform", + [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_boundingboxtransform) + }, +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if defined(ARM_COMPUTE_ENABLE_NEON) + { + "qu16_neon_boundingboxtransform", + [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::QASYMM16; }, + REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_boundingboxtransform) + }, +#endif //defined(ARM_COMPUTE_ENABLE_NEON) +}; + +/** Micro-kernel selector + * + * @param[in] data Selection data passed to help pick the appropriate micro-kernel + * + * @return A matching micro-kernel else nullptr + */ +const BoundingBoxTransformKernel *get_implementation(const BoundingBoxTransformSelectorData &data) +{ + for(const auto &uk : available_kernels) + { + if(uk.is_selected(data)) + { + return &uk; + } + } + return nullptr; +} + Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas); @@ -112,145 +170,15 @@ Status NEBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, const IT return Status{}; } -template <> -void NEBoundingBoxTransformKernel::internal_run<uint16_t>(const Window &window) -{ - const size_t num_classes = _deltas->info()->tensor_shape()[0] >> 2; - const size_t deltas_width = _deltas->info()->tensor_shape()[0]; - const int img_h = std::floor(_bbinfo.img_height() / _bbinfo.scale() + 0.5f); - const int img_w = std::floor(_bbinfo.img_width() / _bbinfo.scale() + 0.5f); - - const auto scale_after = (_bbinfo.apply_scale() ? _bbinfo.scale() : 1.f); - const auto scale_before = _bbinfo.scale(); - const auto offset = (_bbinfo.correct_transform_coords() ? 1.f : 0.f); - - auto pred_ptr = reinterpret_cast<uint16_t *>(_pred_boxes->buffer() + _pred_boxes->info()->offset_first_element_in_bytes()); - auto delta_ptr = reinterpret_cast<uint8_t *>(_deltas->buffer() + _deltas->info()->offset_first_element_in_bytes()); - - const auto boxes_qinfo = _boxes->info()->quantization_info().uniform(); - const auto deltas_qinfo = _deltas->info()->quantization_info().uniform(); - const auto pred_qinfo = _pred_boxes->info()->quantization_info().uniform(); - - Iterator box_it(_boxes, window); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto ptr = reinterpret_cast<uint16_t *>(box_it.ptr()); - const auto b0 = dequantize_qasymm16(*ptr, boxes_qinfo); - const auto b1 = dequantize_qasymm16(*(ptr + 1), boxes_qinfo); - const auto b2 = dequantize_qasymm16(*(ptr + 2), boxes_qinfo); - const auto b3 = dequantize_qasymm16(*(ptr + 3), boxes_qinfo); - const float width = (b2 / scale_before) - (b0 / scale_before) + 1.f; - const float height = (b3 / scale_before) - (b1 / scale_before) + 1.f; - const float ctr_x = (b0 / scale_before) + 0.5f * width; - const float ctr_y = (b1 / scale_before) + 0.5f * height; - for(size_t j = 0; j < num_classes; ++j) - { - // Extract deltas - const size_t delta_id = id.y() * deltas_width + 4u * j; - const float dx = dequantize_qasymm8(delta_ptr[delta_id], deltas_qinfo) / _bbinfo.weights()[0]; - const float dy = dequantize_qasymm8(delta_ptr[delta_id + 1], deltas_qinfo) / _bbinfo.weights()[1]; - float dw = dequantize_qasymm8(delta_ptr[delta_id + 2], deltas_qinfo) / _bbinfo.weights()[2]; - float dh = dequantize_qasymm8(delta_ptr[delta_id + 3], deltas_qinfo) / _bbinfo.weights()[3]; - // Clip dw and dh - dw = std::min(dw, _bbinfo.bbox_xform_clip()); - dh = std::min(dh, _bbinfo.bbox_xform_clip()); - // Determine the predictions - const float pred_ctr_x = dx * width + ctr_x; - const float pred_ctr_y = dy * height + ctr_y; - const float pred_w = std::exp(dw) * width; - const float pred_h = std::exp(dh) * height; - // Store the prediction into the output tensor - pred_ptr[delta_id] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_x - 0.5f * pred_w, 0.f, img_w - 1.f), pred_qinfo); - pred_ptr[delta_id + 1] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_y - 0.5f * pred_h, 0.f, img_h - 1.f), pred_qinfo); - pred_ptr[delta_id + 2] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_x + 0.5f * pred_w - offset, 0.f, img_w - 1.f), pred_qinfo); - pred_ptr[delta_id + 3] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_y + 0.5f * pred_h - offset, 0.f, img_h - 1.f), pred_qinfo); - } - }, - box_it); -} - -template <typename T> -void NEBoundingBoxTransformKernel::internal_run(const Window &window) -{ - const size_t num_classes = _deltas->info()->tensor_shape()[0] >> 2; - const size_t deltas_width = _deltas->info()->tensor_shape()[0]; - const int img_h = std::floor(_bbinfo.img_height() / _bbinfo.scale() + 0.5f); - const int img_w = std::floor(_bbinfo.img_width() / _bbinfo.scale() + 0.5f); - - const auto scale_after = (_bbinfo.apply_scale() ? T(_bbinfo.scale()) : T(1)); - const auto scale_before = T(_bbinfo.scale()); - ARM_COMPUTE_ERROR_ON(scale_before <= 0); - const auto offset = (_bbinfo.correct_transform_coords() ? T(1.f) : T(0.f)); - - auto pred_ptr = reinterpret_cast<T *>(_pred_boxes->buffer() + _pred_boxes->info()->offset_first_element_in_bytes()); - auto delta_ptr = reinterpret_cast<T *>(_deltas->buffer() + _deltas->info()->offset_first_element_in_bytes()); - - Iterator box_it(_boxes, window); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto ptr = reinterpret_cast<T *>(box_it.ptr()); - const auto b0 = *ptr; - const auto b1 = *(ptr + 1); - const auto b2 = *(ptr + 2); - const auto b3 = *(ptr + 3); - const T width = (b2 / scale_before) - (b0 / scale_before) + T(1.f); - const T height = (b3 / scale_before) - (b1 / scale_before) + T(1.f); - const T ctr_x = (b0 / scale_before) + T(0.5f) * width; - const T ctr_y = (b1 / scale_before) + T(0.5f) * height; - for(size_t j = 0; j < num_classes; ++j) - { - // Extract deltas - const size_t delta_id = id.y() * deltas_width + 4u * j; - const T dx = delta_ptr[delta_id] / T(_bbinfo.weights()[0]); - const T dy = delta_ptr[delta_id + 1] / T(_bbinfo.weights()[1]); - T dw = delta_ptr[delta_id + 2] / T(_bbinfo.weights()[2]); - T dh = delta_ptr[delta_id + 3] / T(_bbinfo.weights()[3]); - // Clip dw and dh - dw = std::min(dw, T(_bbinfo.bbox_xform_clip())); - dh = std::min(dh, T(_bbinfo.bbox_xform_clip())); - // Determine the predictions - const T pred_ctr_x = dx * width + ctr_x; - const T pred_ctr_y = dy * height + ctr_y; - const T pred_w = std::exp(dw) * width; - const T pred_h = std::exp(dh) * height; - // Store the prediction into the output tensor - pred_ptr[delta_id] = scale_after * utility::clamp<T>(pred_ctr_x - T(0.5f) * pred_w, T(0), T(img_w - 1)); - pred_ptr[delta_id + 1] = scale_after * utility::clamp<T>(pred_ctr_y - T(0.5f) * pred_h, T(0), T(img_h - 1)); - pred_ptr[delta_id + 2] = scale_after * utility::clamp<T>(pred_ctr_x + T(0.5f) * pred_w - offset, T(0), T(img_w - 1)); - pred_ptr[delta_id + 3] = scale_after * utility::clamp<T>(pred_ctr_y + T(0.5f) * pred_h - offset, T(0), T(img_h - 1)); - } - }, - box_it); -} - void NEBoundingBoxTransformKernel::run(const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - switch(_boxes->info()->data_type()) - { - case DataType::F32: - { - internal_run<float>(window); - break; - } - case DataType::QASYMM16: - { - internal_run<uint16_t>(window); - break; - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - internal_run<float16_t>(window); - break; - } -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - default: - { - ARM_COMPUTE_ERROR("Data type not supported"); - } - } + + const auto *uk = get_implementation(BoundingBoxTransformSelectorData{ _boxes->info()->data_type() }); + ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + + uk->ukernel(_boxes, _pred_boxes, _deltas, _bbinfo, window); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h index c080ce6a5..def827836 100644 --- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h +++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -83,9 +83,6 @@ public: void run(const Window &window, const ThreadInfo &info) override; private: - template <typename T> - void internal_run(const Window &window); - const ITensor *_boxes; ITensor *_pred_boxes; const ITensor *_deltas; diff --git a/src/core/NEON/kernels/NECropKernel.cpp b/src/core/NEON/kernels/NECropKernel.cpp index fabbd6430..729402116 100644 --- a/src/core/NEON/kernels/NECropKernel.cpp +++ b/src/core/NEON/kernels/NECropKernel.cpp @@ -31,136 +31,92 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "src/core/CPP/Validate.h" #include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/common/Registrars.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/helpers/bit_ops.h" +#include "src/cpu/kernels/crop/generic/neon/list.h" namespace arm_compute { namespace { -template <typename T> -inline float32x4_t load_as_f32(T *ptr) +struct CropSelectorData { - ARM_COMPUTE_UNUSED(ptr); - ARM_COMPUTE_ERROR("Type not supported."); -} - -template <> -inline float32x4_t load_as_f32(float *ptr) -{ - return wrapper::vloadq(ptr); -} - -template <> -inline float32x4_t load_as_f32(int32_t *ptr) -{ - return vcvtq_f32_s32(wrapper::vloadq(ptr)); -} + DataType dt; +}; -template <> -inline float32x4_t load_as_f32(uint32_t *ptr) -{ - return vcvtq_f32_u32(wrapper::vloadq(ptr)); -} +using CropSelectorPtr = std::add_pointer<bool(const CropSelectorData &data)>::type; +using CropUKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool)>::type; -template <> -inline float32x4_t load_as_f32(int16_t *ptr) +struct CropUKernel { - return vcvtq_f32_s32(vmovl_s16(wrapper::vload(ptr))); -} + const char *name; + const CropSelectorPtr is_selected; + CropUKernelPtr ukernel; +}; -template <> -inline float32x4_t load_as_f32(uint16_t *ptr) +static const CropUKernel available_kernels[] = { - return vcvtq_f32_u32(vmovl_u16(wrapper::vload(ptr))); -} - -template <> -inline float32x4_t load_as_f32(uint8_t *ptr) -{ - return vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(wrapper::vload(ptr))))); -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template <> -inline float32x4_t load_as_f32(float16_t *ptr) -{ - return vcvt_f32_f16(wrapper::vload(ptr)); -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -template <typename T> -inline void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped) -{ - // Reverse elements if width flipped. - if(is_width_flipped) { - // Collapse first dimension if possible. - if(input_has_single_channel) - { - int32_t x = output_width_start; - Coordinates negative_offset(input_offset); - negative_offset.set(1, negative_offset[1] - window_step_x + 1); - for(; x <= output_width_limit - window_step_x; x += window_step_x, negative_offset[1] -= window_step_x) - { - auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(negative_offset))); - - in = wrapper::vrev64(in); - in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in)); + "fp16_neon_crop", + [](const CropSelectorData & data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::fp16_in_bounds_crop_window) + }, + { + "f32_neon_crop", + [](const CropSelectorData & data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::fp32_in_bounds_crop_window) + }, + { + "u8_neon_crop", + [](const CropSelectorData & data) { return data.dt == DataType::U8; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::u8_in_bounds_crop_window) + }, + { + "u16_neon_crop", + [](const CropSelectorData & data) { return data.dt == DataType::U16; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::u16_in_bounds_crop_window) + }, + { + "u32_neon_crop", + [](const CropSelectorData & data) { return data.dt == DataType::U32; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::u32_in_bounds_crop_window) + }, + { + "s8_neon_crop", + [](const CropSelectorData & data) { return data.dt == DataType::S8; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s8_in_bounds_crop_window) + }, + { + "s16_neon_crop", + [](const CropSelectorData & data) { return data.dt == DataType::S16; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s16_in_bounds_crop_window) + }, + { + "s32_neon_crop", + [](const CropSelectorData & data) { return data.dt == DataType::S32; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s32_in_bounds_crop_window) + }, +}; - wrapper::vstore(output_ptr + x, in); - } - input_offset[1] = negative_offset[1] + window_step_x - 1; - for(; x < output_width_limit; ++x, --input_offset[1]) - { - *(output_ptr + x) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset))); - } - } - else - { - for(int32_t x = output_width_start; x < output_width_limit; ++x, --input_offset[1]) - { - input_offset.set(0, 0); - int32_t c = 0; - for(; c <= static_cast<int32_t>(input->info()->dimension(0)) - window_step_x; c += window_step_x, input_offset[0] += window_step_x) - { - auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset))); - wrapper::vstore(output_ptr + x * output->info()->dimension(0) + c, in); - } - for(; c < static_cast<int32_t>(input->info()->dimension(0)); ++c, ++input_offset[0]) - { - *(output_ptr + x * output->info()->dimension(0) + c) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset))); - } - } - } - } - else +/** Micro-kernel selector + * + * @param[in] data Selection data passed to help pick the appropriate micro-kernel + * + * @return A matching micro-kernel else nullptr + */ +const CropUKernel *get_implementation(const CropSelectorData &data) +{ + for(const auto &uk : available_kernels) { - // Use memcpy if the elements don't need converting to float. - if(std::is_same<T, float>::value) + if(uk.is_selected(data)) { - memcpy(static_cast<void *>(output_ptr + output_width_start * output->info()->dimension(0)), - reinterpret_cast<const void *>(input->ptr_to_element(input_offset)), - (output_width_limit - output_width_start) * output->info()->dimension(0) * output->info()->element_size()); - } - else - { - int32_t x = 0; - int32_t limit = (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0)); - float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0); - for(; x <= limit - window_step_x; x += window_step_x, input_offset[0] += window_step_x) - { - auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset))); - wrapper::vstore(output_start_ptr + x, in); - } - for(; x < limit; ++x, ++input_offset[0]) - { - *(output_start_ptr + x) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset))); - } + return &uk; } } + + return nullptr; } inline void out_of_bounds_crop_window(const ITensor *output, float *output_ptr, float extrapolation_value, @@ -234,8 +190,7 @@ inline void execute_window(const ITensor *input, const ITensor *output, Coordina } // namespace NECropKernel::NECropKernel() - : _input(nullptr), _crop_boxes(nullptr), _box_ind(nullptr), _output(nullptr), _start(), _end(), _crop_box_ind(0), _extrapolation_value(0), _rows_out_of_bounds(), _cols_out_of_bounds(), - _in_bounds_crop_function(nullptr) + : _input(nullptr), _crop_boxes(nullptr), _box_ind(nullptr), _output(nullptr), _start(), _end(), _crop_box_ind(0), _extrapolation_value(0), _rows_out_of_bounds(), _cols_out_of_bounds() { } @@ -250,40 +205,14 @@ void NECropKernel::configure(const ITensor *input, const ITensor *crop_boxes, co _output = output; _crop_box_ind = crop_box_ind; _extrapolation_value = extrapolation_value; - - switch(input->info()->data_type()) - { - case DataType::F32: - _in_bounds_crop_function = &in_bounds_crop_window<float>; - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - _in_bounds_crop_function = &in_bounds_crop_window<float16_t>; - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::U32: - _in_bounds_crop_function = &in_bounds_crop_window<uint32_t>; - break; - case DataType::S32: - _in_bounds_crop_function = &in_bounds_crop_window<int32_t>; - break; - case DataType::U16: - _in_bounds_crop_function = &in_bounds_crop_window<uint16_t>; - break; - case DataType::S16: - _in_bounds_crop_function = &in_bounds_crop_window<int16_t>; - break; - case DataType::U8: - _in_bounds_crop_function = &in_bounds_crop_window<uint8_t>; - break; - default: - ARM_COMPUTE_ERROR("Datatype not supported"); - } } Status NECropKernel::validate(const ITensorInfo *input, const ITensorInfo *crop_boxes, const ITensorInfo *box_ind, const ITensorInfo *output, uint32_t crop_box_ind, float extrapolation_value) { ARM_COMPUTE_UNUSED(extrapolation_value); + const auto *uk = get_implementation(CropSelectorData{ input->data_type() }); + ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::U16, DataType::S16, DataType::F16, DataType::U32, DataType::S32, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC); @@ -369,10 +298,12 @@ void NECropKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON(_input->info()->has_padding()); ARM_COMPUTE_ERROR_ON(_output->info()->has_padding()); + const auto *uk = get_implementation(CropSelectorData{ _input->info()->data_type() }); + uint32_t batch_index = *(reinterpret_cast<int32_t *>(_box_ind->ptr_to_element(Coordinates(_crop_box_ind)))); Coordinates input_offset(0, _end[0] < _start[0] ? _start[0] - _cols_out_of_bounds[0] : _start[0] + _cols_out_of_bounds[0], _end[1] < _start[1] ? _start[1] - _rows_out_of_bounds[0] : _start[1] + _rows_out_of_bounds[0], batch_index); - execute_window(_input, _output, input_offset, _extrapolation_value, _rows_out_of_bounds, _cols_out_of_bounds, _in_bounds_crop_function, _end[1] < _start[1], + execute_window(_input, _output, input_offset, _extrapolation_value, _rows_out_of_bounds, _cols_out_of_bounds, uk->ukernel, _end[1] < _start[1], _cols_out_of_bounds[0] + _cols_out_of_bounds[1] < _output->info()->dimension(1), _cols_out_of_bounds[0] > 0, _cols_out_of_bounds[1] > 0, _start[0] <= _end[0], _end[0] < _start[0]); } diff --git a/src/core/NEON/kernels/NECropKernel.h b/src/core/NEON/kernels/NECropKernel.h index 742215e22..6c989c1d2 100644 --- a/src/core/NEON/kernels/NECropKernel.h +++ b/src/core/NEON/kernels/NECropKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -107,8 +107,6 @@ private: std::array<uint32_t, 2> _rows_out_of_bounds; /** The number of columns out of bounds at the start and end of output. */ std::array<uint32_t, 2> _cols_out_of_bounds; - - NECropKernel::InBoundsCropFunction *_in_bounds_crop_function; }; } // namespace arm_compute #endif /*ARM_COMPUTE_NEON_CROP_KERNEL_H */ diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp index 56aed0ca2..7bba136e8 100644 --- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp +++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -28,15 +28,72 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" #include "src/core/CPP/Validate.h" +#include "src/core/common/Registrars.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - +#include "src/cpu/kernels/genproposals/list.h" #include <arm_neon.h> namespace arm_compute { namespace { +struct ComputeAllAnchorsData +{ + DataType dt; +}; + +using ComputeAllAnchorsSelectorPtr = std::add_pointer<bool(const ComputeAllAnchorsData &data)>::type; +using ComputeAllAnchorsUKernelPtr = std::add_pointer<void(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)>::type; + +struct ComputeAllAnchorsKernel +{ + const char *name; + const ComputeAllAnchorsSelectorPtr is_selected; + ComputeAllAnchorsUKernelPtr ukernel; +}; + +static const ComputeAllAnchorsKernel available_kernels[] = +{ +#if defined(ARM_COMPUTE_ENABLE_NEON) + { + "neon_qu16_computeallanchors", + [](const ComputeAllAnchorsData & data) { return data.dt == DataType::QSYMM16; }, + REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_computeallanchors) + }, +#endif //defined(ARM_COMPUTE_ENABLE_NEON) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + { + "neon_fp16_computeallanchors", + [](const ComputeAllAnchorsData & data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_computeallanchors) + }, +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + { + "neon_fp32_computeallanchors", + [](const ComputeAllAnchorsData & data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_computeallanchors) + }, +}; + +/** Micro-kernel selector + * + * @param[in] data Selection data passed to help pick the appropriate micro-kernel + * + * @return A matching micro-kernel else nullptr + */ +const ComputeAllAnchorsKernel *get_implementation(const ComputeAllAnchorsData &data) +{ + for(const auto &uk : available_kernels) + { + if(uk.is_selected(data)) + { + return &uk; + } + } + return nullptr; +} + Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(anchors, all_anchors); @@ -100,100 +157,15 @@ Status NEComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, const ITe return Status{}; } -template <> -void NEComputeAllAnchorsKernel::internal_run<int16_t>(const Window &window) -{ - Iterator all_anchors_it(_all_anchors, window); - Iterator anchors_it(_all_anchors, window); - - const size_t num_anchors = _anchors->info()->dimension(1); - const float stride = 1.f / _anchors_info.spatial_scale(); - const size_t feat_width = _anchors_info.feat_width(); - - const UniformQuantizationInfo qinfo = _anchors->info()->quantization_info().uniform(); - - execute_window_loop(window, [&](const Coordinates & id) - { - const size_t anchor_offset = id.y() % num_anchors; - - const auto out_anchor_ptr = reinterpret_cast<int16_t *>(all_anchors_it.ptr()); - const auto anchor_ptr = reinterpret_cast<int16_t *>(_anchors->ptr_to_element(Coordinates(0, anchor_offset))); - - const size_t shift_idy = id.y() / num_anchors; - const float shiftx = (shift_idy % feat_width) * stride; - const float shifty = (shift_idy / feat_width) * stride; - - const float new_anchor_x1 = dequantize_qsymm16(*anchor_ptr, qinfo.scale) + shiftx; - const float new_anchor_y1 = dequantize_qsymm16(*(1 + anchor_ptr), qinfo.scale) + shifty; - const float new_anchor_x2 = dequantize_qsymm16(*(2 + anchor_ptr), qinfo.scale) + shiftx; - const float new_anchor_y2 = dequantize_qsymm16(*(3 + anchor_ptr), qinfo.scale) + shifty; - - *out_anchor_ptr = quantize_qsymm16(new_anchor_x1, qinfo.scale); - *(out_anchor_ptr + 1) = quantize_qsymm16(new_anchor_y1, qinfo.scale); - *(out_anchor_ptr + 2) = quantize_qsymm16(new_anchor_x2, qinfo.scale); - *(out_anchor_ptr + 3) = quantize_qsymm16(new_anchor_y2, qinfo.scale); - }, - all_anchors_it); -} - -template <typename T> -void NEComputeAllAnchorsKernel::internal_run(const Window &window) -{ - Iterator all_anchors_it(_all_anchors, window); - Iterator anchors_it(_all_anchors, window); - - const size_t num_anchors = _anchors->info()->dimension(1); - const T stride = 1.f / _anchors_info.spatial_scale(); - const size_t feat_width = _anchors_info.feat_width(); - - execute_window_loop(window, [&](const Coordinates & id) - { - const size_t anchor_offset = id.y() % num_anchors; - - const auto out_anchor_ptr = reinterpret_cast<T *>(all_anchors_it.ptr()); - const auto anchor_ptr = reinterpret_cast<T *>(_anchors->ptr_to_element(Coordinates(0, anchor_offset))); - - const size_t shift_idy = id.y() / num_anchors; - const T shiftx = (shift_idy % feat_width) * stride; - const T shifty = (shift_idy / feat_width) * stride; - - *out_anchor_ptr = *anchor_ptr + shiftx; - *(out_anchor_ptr + 1) = *(1 + anchor_ptr) + shifty; - *(out_anchor_ptr + 2) = *(2 + anchor_ptr) + shiftx; - *(out_anchor_ptr + 3) = *(3 + anchor_ptr) + shifty; - }, - all_anchors_it); -} - void NEComputeAllAnchorsKernel::run(const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - switch(_anchors->info()->data_type()) - { - case DataType::QSYMM16: - { - internal_run<int16_t>(window); - break; - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - internal_run<float16_t>(window); - break; - } -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - { - internal_run<float>(window); - break; - } - default: - { - ARM_COMPUTE_ERROR("Data type not supported"); - } - } + const auto *uk = get_implementation(ComputeAllAnchorsData{ _anchors->info()->data_type() }); + ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + + uk->ukernel(_anchors, _all_anchors, _anchors_info, window); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h index f6d39e50a..297d6d4ab 100644 --- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h +++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -74,9 +74,6 @@ public: void run(const Window &window, const ThreadInfo &info) override; private: - template <typename T> - void internal_run(const Window &window); - const ITensor *_anchors; ITensor *_all_anchors; ComputeAnchorsInfo _anchors_info; diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp index d33431a8d..71641404b 100644 --- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -34,8 +34,10 @@ #include "src/core/CPP/Validate.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/common/Registrars.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/instancenorm/list.h" #include <arm_neon.h> @@ -43,137 +45,53 @@ namespace arm_compute { namespace { -template <typename InputType, typename AccType = InputType> -void vector_float_sum(AccType &result, AccType &result_square, const InputType &inputs) +struct InstanceNormSelectorData { - result = wrapper::vadd(result, inputs); - result_square = wrapper::vadd(result_square, wrapper::vmul(inputs, inputs)); -} + DataType dt; +}; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template <> -inline void vector_float_sum(float32x4_t &result, float32x4_t &result_square, const float16x8_t &inputs) -{ - vector_float_sum(result, result_square, wrapper::vcvt<float>(wrapper::vgetlow(inputs))); - vector_float_sum(result, result_square, wrapper::vcvt<float>(wrapper::vgethigh(inputs))); -} -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +using InstanceNormSelctorPtr = std::add_pointer<bool(const InstanceNormSelectorData &data)>::type; +using InstanceNormUKernelPtr = std::add_pointer<void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, const Window &window)>::type; -template <typename InputType, typename AccType = InputType> -InputType vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta) +struct InstanceNormKernel { - return wrapper::vadd(wrapper::vmul(wrapper::vsub(inputs, vec_mean), vec_multip), vec_beta); -} + const char *name; + const InstanceNormSelctorPtr is_selected; + InstanceNormUKernelPtr ukernel; +}; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template <> -inline float16x8_t vector_float_norm(const float16x8_t &inputs, const float32x4_t &vec_mean, const float32x4_t &vec_multip, const float32x4_t &vec_beta) +static const InstanceNormKernel available_kernels[] = { - const auto input_low = wrapper::vcvt<float>(wrapper::vgetlow(inputs)); - const auto input_high = wrapper::vcvt<float>(wrapper::vgethigh(inputs)); - const auto result_low = wrapper::vcvt<float16_t>(vector_float_norm(input_low, vec_mean, vec_multip, vec_beta)); - const auto result_high = wrapper::vcvt<float16_t>(vector_float_norm(input_high, vec_mean, vec_multip, vec_beta)); - float16x8_t result = wrapper::vcombine(result_low, result_high); - - return result; -} + { + "fp32_neon_instancenorm", + [](const InstanceNormSelectorData & data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_instancenorm) + }, +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + { + "fp16_neon_instancenorm", + [](const InstanceNormSelectorData & data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_instancenorm) + }, #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +}; -template <typename T, typename AccType = T> -void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window) +/** Micro-kernel selector + * + * @param[in] data Selection data passed to help pick the appropriate micro-kernel + * + * @return A matching micro-kernel else nullptr + */ +const InstanceNormKernel *get_implementation(const InstanceNormSelectorData &data) { - /** SIMD vector tag type. */ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; - - // Clear X/Y dimensions on execution window as we handle the planes manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - win.set(Window::DimY, Window::Dimension(0, 1, 1)); - - constexpr int window_step_x = 16 / sizeof(T); - const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1); - - Iterator input_it(input, win); - execute_window_loop(win, [&](const Coordinates & id) + for(const auto &uk : available_kernels) { - Window win_plane = window; - win_plane.set(Window::DimX, Window::Dimension(0, 1, 1)); - win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1)); - win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1)); - - Iterator input_plane_it(input, win_plane); - Iterator output_plane_it(output, win_plane); - - auto sum_h_w = static_cast<AccType>(0.f); - auto sum_squares_h_w = static_cast<AccType>(0.f); - - execute_window_loop(win_plane, [&](const Coordinates &) + if(uk.is_selected(data)) { - const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr()); - - auto vec_sum_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{}); - auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{}); - - // Compute S elements per iteration - int x = window.x().start(); - for(; x <= (window.x().end() - window_step_x); x += window_step_x) - { - auto vec_input_val = wrapper::vloadq(input_ptr + x); - vector_float_sum(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val); - } - - auto vec2_sum_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w)); - auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w)); - - vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w); - vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w); - - sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0); - sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0); - - // Compute left-over elements - for(; x < window.x().end(); ++x) - { - const auto value = static_cast<AccType>(*(input_ptr + x)); - sum_h_w += value; - sum_squares_h_w += value * value; - } - }, - input_plane_it, output_plane_it); - - const auto mean_h_w = sum_h_w / elements_plane; - const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w; - - const auto multip_h_w = gamma / std::sqrt(var_h_w + epsilon); - const auto vec_mean_h_w = wrapper::vdup_n(static_cast<AccType>(mean_h_w), ExactTagType{}); - const auto vec_multip_h_w = wrapper::vdup_n(static_cast<AccType>(multip_h_w), ExactTagType{}); - const auto vec_beta = wrapper::vdup_n(static_cast<AccType>(beta), ExactTagType{}); - - execute_window_loop(win_plane, [&](const Coordinates &) - { - auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr()); - auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr()); - - // Compute S elements per iteration - int x = window.x().start(); - //auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{}); - for(; x <= (window.x().end() - window_step_x); x += window_step_x) - { - const auto vec_val = wrapper::vloadq(input_ptr + x); - const auto normalized_vec = vector_float_norm(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta); - wrapper::vstore(output_ptr + x, normalized_vec); - } - - // Compute left-over elements - for(; x < window.x().end(); ++x) - { - const auto val = static_cast<AccType>(*(input_ptr + x)); - *(output_ptr + x) = static_cast<T>((val - mean_h_w) * multip_h_w + beta); - } - }, - input_plane_it, output_plane_it); - }, - input_it); + return &uk; + } + } + return nullptr; } Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon) @@ -210,7 +128,7 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe } // namespace NEInstanceNormalizationLayerKernel::NEInstanceNormalizationLayerKernel() - : _func(nullptr), _input(nullptr), _output(nullptr), _gamma(1), _beta(0), _epsilon(1e-12) + : _input(nullptr), _output(nullptr), _gamma(1), _beta(0), _epsilon(1e-12) { } @@ -227,28 +145,6 @@ void NEInstanceNormalizationLayerKernel::configure(ITensor *input, ITensor *outp ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _output->info(), _gamma, _beta, _epsilon)); - if(_input->info()->data_type() == DataType::F32) - { - _func = &instance_normalization_nchw<float>; - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - else if(_input->info()->data_type() == DataType::F16) - { - if(_use_mixed_precision) - { - _func = &instance_normalization_nchw<float16_t, float>; - } - else - { - _func = &instance_normalization_nchw<float16_t>; - } - } -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - else - { - ARM_COMPUTE_ERROR("Unsupported data type"); - } - // Configure kernel window auto win_config = validate_and_configure_window(_input->info(), _output->info()); ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); @@ -268,6 +164,10 @@ void NEInstanceNormalizationLayerKernel::run(const Window &window, const ThreadI ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - (*_func)(_input, _output, _gamma, _beta, _epsilon, window); + + const auto *uk = get_implementation(InstanceNormSelectorData{ _input->info()->data_type() }); + ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + + uk->ukernel(_input, _output, _gamma, _beta, _epsilon, _use_mixed_precision, window); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h index 96c011971..f166ce205 100644 --- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h +++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -84,13 +84,12 @@ private: */ using NormalizationFunction = void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window); - NormalizationFunction *_func; - ITensor *_input; - ITensor *_output; - float _gamma; - float _beta; - float _epsilon; - bool _use_mixed_precision{ true }; + ITensor *_input; + ITensor *_output; + float _gamma; + float _beta; + float _epsilon; + bool _use_mixed_precision{ true }; }; } // namespace arm_compute #endif /*ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNEL_H */ diff --git a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp index 761fa1523..93da8a24c 100644 --- a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp +++ b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Arm Limited. + * Copyright (c) 2020-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -28,9 +28,10 @@ #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "src/core/CPP/Validate.h" +#include "src/core/common/Registrars.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - +#include "src/cpu/kernels/maxunpool/list.h" #include "support/ToolchainSupport.h" namespace arm_compute @@ -39,6 +40,67 @@ using namespace misc::shape_calculator; namespace { +struct MaxUnpoolingSelectorData +{ + DataType dt; +}; + +using MaxUnpoolingSelctorPtr = std::add_pointer<bool(const MaxUnpoolingSelectorData &data)>::type; +using MaxUnpoolingUKernelPtr = std::add_pointer<void(const ITensor *input, ITensor *output, const ITensor *indices, const Window &window)>::type; + +struct MaxUnpoolingKernel +{ + const char *name; + const MaxUnpoolingSelctorPtr is_selected; + MaxUnpoolingUKernelPtr ukernel; +}; + +static const MaxUnpoolingKernel available_kernels[] = +{ + { + "fp32_neon_maxunpooling", + [](const MaxUnpoolingSelectorData & data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_maxunpooling) + }, +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + { + "fp16_neon_maxunpooling", + [](const MaxUnpoolingSelectorData & data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_maxunpooling) + }, +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if defined(ARM_COMPUTE_ENABLE_NEON) + { + "qs8_neon_maxunpooling", + [](const MaxUnpoolingSelectorData & data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qs8_maxunpooling) + }, + { + "qu8_neon_maxunpooling", + [](const MaxUnpoolingSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qu8_maxunpooling) + }, +#endif //defined(ARM_COMPUTE_ENABLE_NEON) +}; + +/** Micro-kernel selector + * + * @param[in] data Selection data passed to help pick the appropriate micro-kernel + * + * @return A matching micro-kernel else nullptr + */ +const MaxUnpoolingKernel *get_implementation(const MaxUnpoolingSelectorData &data) +{ + for(const auto &uk : available_kernels) + { + if(uk.is_selected(data)) + { + return &uk; + } + } + return nullptr; +} + Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, indices); @@ -69,7 +131,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c } // namespace NEMaxUnpoolingLayerKernel::NEMaxUnpoolingLayerKernel() - : _func(nullptr), _input(nullptr), _output(nullptr), _indices(nullptr) + : _input(nullptr), _output(nullptr), _indices(nullptr) { } @@ -82,46 +144,12 @@ void NEMaxUnpoolingLayerKernel::configure(const ITensor *input, const ITensor *i _output = output; _indices = indices; - switch(input->info()->data_type()) - { - case DataType::F32: - _func = &NEMaxUnpoolingLayerKernel::unpooling2<float>; - break; - case DataType::QASYMM8: - _func = &NEMaxUnpoolingLayerKernel::unpooling2<uint8_t>; - break; - case DataType::QASYMM8_SIGNED: - _func = &NEMaxUnpoolingLayerKernel::unpooling2<int8_t>; - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - _func = &NEMaxUnpoolingLayerKernel::unpooling2<float16_t>; - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - break; - } const TensorShape output_shape = compute_unpool_shape(*input->info(), pool_info); auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); auto window = calculate_max_window(*input->info(), Steps()); INEKernel::configure(window); } -template <typename T> -void NEMaxUnpoolingLayerKernel::unpooling2(const Window &window) -{ - Iterator input(_input, window); - Iterator indices(_indices, window); - auto out_ptr = reinterpret_cast<T *>(_output->buffer()); - const int out_stride_w = static_cast<int>(_output->info()->strides_in_bytes()[3]); - execute_window_loop(window, [&](const Coordinates & id) - { - auto vindices = reinterpret_cast<uint32_t *>(indices.ptr()); - auto vinput = reinterpret_cast<T *>(input.ptr()); - out_ptr[id[3] * out_stride_w / sizeof(T) + *vindices] = *vinput; - }, - input, indices); -} Status NEMaxUnpoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info) { @@ -135,8 +163,9 @@ void NEMaxUnpoolingLayerKernel::run(const Window &window, const ThreadInfo &info ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr); - // Run function - (this->*_func)(window); + const auto *uk = get_implementation(MaxUnpoolingSelectorData{ _input->info()->data_type() }); + ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + + uk->ukernel(_input, _output, _indices, window); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h index ecc116e58..f7f9a31f6 100644 --- a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h +++ b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021 Arm Limited. + * Copyright (c) 2020-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -88,10 +88,9 @@ private: using UnpoolingFunction = void (NEMaxUnpoolingLayerKernel::*)(const Window &window); private: - UnpoolingFunction _func; - const ITensor *_input; - ITensor *_output; - const ITensor *_indices; + const ITensor *_input; + ITensor *_output; + const ITensor *_indices; }; } // namespace arm_compute #endif /*ARM_COMPUTE_NEMAXUNPOOLINGLAYERKERNEL_H */ diff --git a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp index d1c7d4eb9..7d8fc7ec7 100644 --- a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp +++ b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -31,13 +31,64 @@ #include "src/core/CPP/Validate.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/common/Registrars.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/meanstddevnorm/list.h" namespace arm_compute { namespace { +struct MeanStdDevNormSelectorData +{ + DataType dt; +}; + +using MeanStdDevNormSelctorPtr = std::add_pointer<bool(const MeanStdDevNormSelectorData &data)>::type; +using MeanStdDevNormUKernelPtr = std::add_pointer<void(ITensor *input, ITensor *output, float epsilon, const Window &window)>::type; + +struct MeanStdDevNormKernel +{ + const char *name; + const MeanStdDevNormSelctorPtr is_selected; + MeanStdDevNormUKernelPtr ukernel; +}; + +static const MeanStdDevNormKernel available_kernels[] = +{ + { + "fp32_neon_meanstddevnorm", + [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_meanstddevnorm) + }, +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + { + "fp16_neon_meanstddevnorm", + [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_meanstddevnorm) + }, +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +}; + +/** Micro-kernel selector + * + * @param[in] data Selection data passed to help pick the appropriate micro-kernel + * + * @return A matching micro-kernel else nullptr + */ +const MeanStdDevNormKernel *get_implementation(const MeanStdDevNormSelectorData &data) +{ + for(const auto &uk : available_kernels) + { + if(uk.is_selected(data)) + { + return &uk; + } + } + return nullptr; +} + Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, float epsilon) { ARM_COMPUTE_UNUSED(epsilon); @@ -72,80 +123,8 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen } } // namespace -template <typename ScalarType, int size> -void NEMeanStdDevNormalizationKernel::mean_stddev_normalization(const Window &window) -{ - using ExactTagType = typename wrapper::traits::neon_vector<ScalarType, size>::tag_type; - - // Set build options - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = size; - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); - - Iterator input(_input, win); - Iterator output(_output, win); - - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - auto in_ptr = reinterpret_cast<const ScalarType *>(input.ptr()); - auto out_ptr = reinterpret_cast<ScalarType *>(output.ptr()); - - auto sum_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{}); - auto sum_sq_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{}); - - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - auto data = wrapper::vloadq(in_ptr + x); - sum_vec = wrapper::vadd(sum_vec, data); - sum_sq_vec = wrapper::vadd(sum_sq_vec, wrapper::vmul(data, data)); - } - - auto sum_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_vec), wrapper::vgetlow(sum_vec)); - auto sum_sq_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_sq_vec), wrapper::vgetlow(sum_sq_vec)); - for(int i = 0; i < size / 4; ++i) - { - sum_carry_res = wrapper::vpadd(sum_carry_res, sum_carry_res); - sum_sq_carry_res = wrapper::vpadd(sum_sq_carry_res, sum_sq_carry_res); - } - - auto sum = wrapper::vgetlane(sum_carry_res, 0); - auto sum_sq = wrapper::vgetlane(sum_sq_carry_res, 0); - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - ScalarType data = *(in_ptr + x); - sum += data; - sum_sq += data * data; - } - - ScalarType mean = sum / _input->info()->dimension(0); - ScalarType var = (sum_sq / _input->info()->dimension(0)) - (mean * mean); - ScalarType stddev_inv = 1.f / sqrt(var + _epsilon); - - auto mean_vec = wrapper::vdup_n(mean, ExactTagType{}); - auto stddev_inv_vec = wrapper::vdup_n(stddev_inv, ExactTagType{}); - for(x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x) - { - auto data = wrapper::vloadq(in_ptr + x); - auto res = wrapper::vmul(wrapper::vsub(data, mean_vec), stddev_inv_vec); - // Store results - wrapper::vstore(out_ptr + x, res); - } - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv; - } - }, - input, output); -} - NEMeanStdDevNormalizationKernel::NEMeanStdDevNormalizationKernel() - : _input(nullptr), _output(nullptr), _epsilon(1e-8f), _func(nullptr) + : _input(nullptr), _output(nullptr), _epsilon(1e-8f) { } @@ -163,23 +142,6 @@ void NEMeanStdDevNormalizationKernel::configure(ITensor *input, ITensor *output, auto win_config = validate_and_configure_window(input->info(), (output == nullptr) ? nullptr : output->info()); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICPPKernel::configure(win_config.second); - - // Configure function to run based on different data types - const DataType data_type = input->info()->data_type(); - switch(data_type) - { - case DataType::F32: - _func = &NEMeanStdDevNormalizationKernel::mean_stddev_normalization<float, 4>; - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - _func = &NEMeanStdDevNormalizationKernel::mean_stddev_normalization<float16_t, 8>; - break; -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - default: - ARM_COMPUTE_ERROR("Not Supported"); - break; - } } Status NEMeanStdDevNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, float epsilon) @@ -194,8 +156,10 @@ void NEMeanStdDevNormalizationKernel::run(const Window &window, const ThreadInfo ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr); - (this->*_func)(window); + const auto *uk = get_implementation(MeanStdDevNormSelectorData{ _output->info()->data_type() }); + ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + + uk->ukernel(_input, _output, _epsilon, window); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h index 59d073ada..844f0efdc 100644 --- a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h +++ b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -91,8 +91,6 @@ private: float _epsilon; using MeanStdDevNormFunction = void (NEMeanStdDevNormalizationKernel::*)(const Window &window); - - MeanStdDevNormFunction _func; }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEMEANSTDDEVNORMALIZATIONKERNEL_H */ diff --git a/src/core/NEON/kernels/NEPadLayerKernel.cpp b/src/core/NEON/kernels/NEPadLayerKernel.cpp index 60986812b..734510b63 100644 --- a/src/core/NEON/kernels/NEPadLayerKernel.cpp +++ b/src/core/NEON/kernels/NEPadLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -261,9 +261,10 @@ void NEPadLayerKernel::run(const Window &window, const ThreadInfo &info) size_t NEPadLayerKernel::get_mws(const CPUInfo &platform, size_t thread_count) const { - ARM_COMPUTE_UNUSED(platform, thread_count); - - return ICPPKernel::small_network_mws; + ARM_COMPUTE_UNUSED(thread_count); + ARM_COMPUTE_UNUSED(platform); + + return ICPPKernel::default_mws; } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp index ece7e40e3..802aebb52 100644 --- a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp +++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -30,8 +30,10 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/misc/Utility.h" #include "src/core/CPP/Validate.h" +#include "src/core/common/Registrars.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/roialign/list.h" #include <arm_neon.h> @@ -41,6 +43,67 @@ namespace arm_compute { namespace { +struct ROIAlignSelectorData +{ + DataType dt; +}; + +using ROIAlignSelctorPtr = std::add_pointer<bool(const ROIAlignSelectorData &data)>::type; +using ROIAlignUKernelPtr = std::add_pointer<void(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)>::type; + +struct ROIAlignKernel +{ + const char *name; + const ROIAlignSelctorPtr is_selected; + ROIAlignUKernelPtr ukernel; +}; + +static const ROIAlignKernel available_kernels[] = +{ + { + "fp32_neon_roialign", + [](const ROIAlignSelectorData & data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_roialign) + }, +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + { + "fp16_neon_roialign", + [](const ROIAlignSelectorData & data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_roialign) + }, +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if defined(ARM_COMPUTE_ENABLE_NEON) + { + "qu8_neon_roialign", + [](const ROIAlignSelectorData & data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qu8_roialign) + }, + { + "qs8_neon_roialign", + [](const ROIAlignSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qs8_roialign) + }, +#endif //defined(ARM_COMPUTE_ENABLE_NEON) +}; + +/** Micro-kernel selector + * + * @param[in] data Selection data passed to help pick the appropriate micro-kernel + * + * @return A matching micro-kernel else nullptr + */ +const ROIAlignKernel *get_implementation(const ROIAlignSelectorData &data) +{ + for(const auto &uk : available_kernels) + { + if(uk.is_selected(data)) + { + return &uk; + } + } + return nullptr; +} + Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output); @@ -110,328 +173,19 @@ Status NEROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorIn return Status{}; } -/** Average pooling over an aligned window */ -template <typename input_data_type> -inline input_data_type roi_align_1x1(const ITensor *input, - unsigned int roi_batch, - float region_start_x, - float bin_size_x, - int grid_size_x, - float region_end_x, - float region_start_y, - float bin_size_y, - int grid_size_y, - float region_end_y, - int pz) -{ - if((region_end_x <= region_start_x) || (region_end_y <= region_start_y)) - { - return input_data_type(0); - } - else - { - const DataLayout data_layout = input->info()->data_layout(); - float avg = 0; - // Iterate through the aligned pooling region - for(int iy = 0; iy < grid_size_y; ++iy) - { - for(int ix = 0; ix < grid_size_x; ++ix) - { - // Align the window in the middle of every bin - float y = region_start_y + (iy + 0.5) * bin_size_y / float(grid_size_y); - float x = region_start_x + (ix + 0.5) * bin_size_x / float(grid_size_x); - - // Interpolation in the [0,0] [0,1] [1,0] [1,1] square - const int y_low = y; - const int x_low = x; - const int y_high = y_low + 1; - const int x_high = x_low + 1; - - const float ly = y - y_low; - const float lx = x - x_low; - const float hy = 1. - ly; - const float hx = 1. - lx; - - const float w1 = hy * hx; - const float w2 = hy * lx; - const float w3 = ly * hx; - const float w4 = ly * lx; - if(data_layout == DataLayout::NCHW) - { - const auto data1 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))); - const auto data2 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))); - const auto data3 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))); - const auto data4 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))); - avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; - } - else - { - const auto data1 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))); - const auto data2 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))); - const auto data3 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))); - const auto data4 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))); - avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; - } - } - } - - avg /= grid_size_x * grid_size_y; - return input_data_type(avg); - } -} - -/** Average pooling over an aligned window */ -template <typename input_data_type> -inline input_data_type roi_align_1x1_qasymm8(const ITensor *input, - unsigned int roi_batch, - float region_start_x, - float bin_size_x, - int grid_size_x, - float region_end_x, - float region_start_y, - float bin_size_y, - int grid_size_y, - float region_end_y, - int pz, - const QuantizationInfo &out_qinfo) -{ - if((region_end_x <= region_start_x) || (region_end_y <= region_start_y)) - { - return input_data_type(out_qinfo.uniform().offset); - } - else - { - float avg = 0; - const UniformQuantizationInfo input_qinfo = input->info()->quantization_info().uniform(); - const bool is_qasymm_signed = is_data_type_quantized_asymmetric_signed(input->info()->data_type()); - const DataLayout data_layout = input->info()->data_layout(); - - // Iterate through the aligned pooling region - for(int iy = 0; iy < grid_size_y; ++iy) - { - for(int ix = 0; ix < grid_size_x; ++ix) - { - // Align the window in the middle of every bin - float y = region_start_y + (iy + 0.5) * bin_size_y / float(grid_size_y); - float x = region_start_x + (ix + 0.5) * bin_size_x / float(grid_size_x); - - // Interpolation in the [0,0] [0,1] [1,0] [1,1] square - const int y_low = y; - const int x_low = x; - const int y_high = y_low + 1; - const int x_high = x_low + 1; - - const float ly = y - y_low; - const float lx = x - x_low; - const float hy = 1. - ly; - const float hx = 1. - lx; - - const float w1 = hy * hx; - const float w2 = hy * lx; - const float w3 = ly * hx; - const float w4 = ly * lx; - - if(data_layout == DataLayout::NCHW) - { - if(is_qasymm_signed) - { - float data1 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))), input_qinfo); - float data2 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))), input_qinfo); - float data3 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))), input_qinfo); - float data4 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))), input_qinfo); - avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; - } - else - { - float data1 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))), input_qinfo); - float data2 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))), input_qinfo); - float data3 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))), input_qinfo); - float data4 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))), input_qinfo); - avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; - } - } - else - { - if(is_qasymm_signed) - { - const auto data1 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))), input_qinfo); - const auto data2 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))), input_qinfo); - const auto data3 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))), input_qinfo); - const auto data4 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))), input_qinfo); - avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; - } - else - { - const auto data1 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))), input_qinfo); - const auto data2 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))), input_qinfo); - const auto data3 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))), input_qinfo); - const auto data4 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))), input_qinfo); - avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; - } - } - } - } - - avg /= grid_size_x * grid_size_y; - - input_data_type res = 0; - if(is_qasymm_signed) - { - res = quantize_qasymm8_signed(avg, out_qinfo); - } - else - { - res = quantize_qasymm8(avg, out_qinfo); - } - return res; - } -} - -inline float compute_region_coordinate(int p, float bin_size, float roi_anchor, float max_value) -{ - const float region_start = p * bin_size + roi_anchor; - return utility::clamp(region_start, 0.0f, max_value); -} - void NEROIAlignLayerKernel::run(const Window &window, const ThreadInfo &info) { const DataLayout data_layout = _input->info()->data_layout(); if(data_layout == DataLayout::NCHW || data_layout == DataLayout::NHWC) { - switch(_input->info()->data_type()) - { - case DataType::QASYMM8: - { - NEROIAlignLayerKernel::internal_run<uint8_t, uint16_t>(window, info); - break; - } - case DataType::QASYMM8_SIGNED: - { - NEROIAlignLayerKernel::internal_run<int8_t, uint16_t>(window, info); - break; - } - case DataType::F32: - { - NEROIAlignLayerKernel::internal_run<float>(window, info); - break; - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - NEROIAlignLayerKernel::internal_run<float16_t>(window, info); - break; - } -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - default: - { - ARM_COMPUTE_ERROR("DataType not supported"); - break; - } - } + const auto *uk = get_implementation(ROIAlignSelectorData{ _input->info()->data_type() }); + ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + + uk->ukernel(_input, _output, _rois, _pool_info, window, info); } else { ARM_COMPUTE_ERROR("Invalid layout"); } } - -template <typename input_data_type, typename roi_data_type> -void NEROIAlignLayerKernel::internal_run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - - const DataLayout data_layout = _input->info()->data_layout(); - const size_t values_per_roi = _rois->info()->dimension(0); - - const int roi_list_start = window.x().start(); - const int roi_list_end = window.x().end(); - - const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const unsigned int idx_depth = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - - const int input_width = _input->info()->dimension(idx_width); - const int input_height = _input->info()->dimension(idx_height); - const int input_chanels = _input->info()->dimension(idx_depth); - const int pooled_w = _pool_info.pooled_width(); - const int pooled_h = _pool_info.pooled_height(); - - const DataType data_type = _input->info()->data_type(); - const bool is_qasymm = is_data_type_quantized_asymmetric(data_type); - - const auto *rois_ptr = reinterpret_cast<const roi_data_type *>(_rois->buffer()); - const QuantizationInfo &rois_qinfo = _rois->info()->quantization_info(); - for(int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx) - { - const unsigned int roi_batch = rois_ptr[values_per_roi * roi_indx]; - - roi_data_type qx1 = rois_ptr[values_per_roi * roi_indx + 1]; - roi_data_type qy1 = rois_ptr[values_per_roi * roi_indx + 2]; - roi_data_type qx2 = rois_ptr[values_per_roi * roi_indx + 3]; - roi_data_type qy2 = rois_ptr[values_per_roi * roi_indx + 4]; - float x1(qx1); - float x2(qx2); - float y1(qy1); - float y2(qy2); - if(is_qasymm) - { - x1 = dequantize_qasymm16(qx1, rois_qinfo); - x2 = dequantize_qasymm16(qx2, rois_qinfo); - y1 = dequantize_qasymm16(qy1, rois_qinfo); - y2 = dequantize_qasymm16(qy2, rois_qinfo); - } - const float roi_anchor_x = x1 * _pool_info.spatial_scale(); - const float roi_anchor_y = y1 * _pool_info.spatial_scale(); - const float roi_dims_x = std::max((x2 - x1) * _pool_info.spatial_scale(), 1.0f); - const float roi_dims_y = std::max((y2 - y1) * _pool_info.spatial_scale(), 1.0f); - float bin_size_x = roi_dims_x / _pool_info.pooled_width(); - float bin_size_y = roi_dims_y / _pool_info.pooled_height(); - - // Iterate through all feature maps - for(int ch = 0; ch < input_chanels; ++ch) - { - // Iterate through all output pixels - for(int py = 0; py < pooled_h; ++py) - { - for(int px = 0; px < pooled_w; ++px) - { - const float region_start_x = compute_region_coordinate(px, bin_size_x, roi_anchor_x, input_width); - const float region_start_y = compute_region_coordinate(py, bin_size_y, roi_anchor_y, input_height); - const float region_end_x = compute_region_coordinate(px + 1, bin_size_x, roi_anchor_x, input_width); - const float region_end_y = compute_region_coordinate(py + 1, bin_size_y, roi_anchor_y, input_height); - const int roi_bin_grid_x = (_pool_info.sampling_ratio() > 0) ? _pool_info.sampling_ratio() : int(ceil(bin_size_x)); - const int roi_bin_grid_y = (_pool_info.sampling_ratio() > 0) ? _pool_info.sampling_ratio() : int(ceil(bin_size_y)); - input_data_type out_val(0); - if(is_qasymm) - { - out_val = roi_align_1x1_qasymm8<input_data_type>( - _input, roi_batch, region_start_x, bin_size_x, - roi_bin_grid_x, region_end_x, region_start_y, bin_size_y, - roi_bin_grid_y, region_end_y, ch, _output->info()->quantization_info()); - } - else - { - out_val = roi_align_1x1<input_data_type>( - _input, roi_batch, region_start_x, bin_size_x, - roi_bin_grid_x, region_end_x, region_start_y, bin_size_y, - roi_bin_grid_y, region_end_y, ch); - } - - if(data_layout == DataLayout::NCHW) - { - auto out_ptr = reinterpret_cast<input_data_type *>(_output->ptr_to_element(Coordinates(px, py, ch, roi_indx))); - *out_ptr = out_val; - } - else - { - auto out_ptr = reinterpret_cast<input_data_type *>(_output->ptr_to_element(Coordinates(ch, px, py, roi_indx))); - *out_ptr = out_val; - } - } - } - } - } -} } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEROIAlignLayerKernel.h b/src/core/NEON/kernels/NEROIAlignLayerKernel.h index fa31a879b..48a3de728 100644 --- a/src/core/NEON/kernels/NEROIAlignLayerKernel.h +++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -89,9 +89,6 @@ public: void run(const Window &window, const ThreadInfo &info) override; private: - template <typename input_data_type, typename roi_data_type = input_data_type> - void internal_run(const Window &window, const ThreadInfo &info); - const ITensor *_input; ITensor *_output; const ITensor *_rois; diff --git a/src/core/NEON/kernels/NERangeKernel.cpp b/src/core/NEON/kernels/NERangeKernel.cpp index 0395e0bd3..82d1403c5 100644 --- a/src/core/NEON/kernels/NERangeKernel.cpp +++ b/src/core/NEON/kernels/NERangeKernel.cpp @@ -30,68 +30,96 @@ #include "arm_compute/core/Validate.h" #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/common/Registrars.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - -#include "arm_compute/core/Utils.h" +#include "src/cpu/kernels/range/list.h" namespace arm_compute { namespace { -template <typename T> -void range_function(ITensor *output, float start, float step, const Window &window) +struct RangeSelectorData { - /** SIMD vector tag type. */ - using ExactTagType = typename wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>::tag_type; - - const auto step_vec = wrapper::vdup_n(static_cast<T>(step), ExactTagType{}); - const auto start_vec = wrapper::vdup_n(static_cast<T>(start), ExactTagType{}); - auto id_vec = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + DataType dt; +}; - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); - const int window_step_x = 16 / sizeof(T); +using RangeSelectorPtr = std::add_pointer<bool(const RangeSelectorData &data)>::type; +using RangeUKernelPtr = std::add_pointer<void(ITensor *, float, float, const Window &)>::type; - Window win{ window }; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator output_it(output, win); +struct RangeUKernel +{ + const char *name; + const RangeSelectorPtr is_selected; + RangeUKernelPtr ukernel; +}; - execute_window_loop(win, [&](const Coordinates &) +static const RangeUKernel available_kernels[] = +{ { - int x = window_start_x; - const auto out_ptr = reinterpret_cast<T *>(output_it.ptr()); - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - for(int count = 0; count < window_step_x; ++count) - { - id_vec = wrapper::vsetlane(static_cast<T>(x + count), id_vec, count); - } - - // start + step * id - const auto res_vec = wrapper::vmla(start_vec, id_vec, step_vec); - wrapper::vstore(out_ptr + x, res_vec); - } + "fp16_neon_range", + [](const RangeSelectorData & data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_range_function) + }, + { + "f32_neon_range", + [](const RangeSelectorData & data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_range_function) + }, + { + "u8_neon_range", + [](const RangeSelectorData & data) { return data.dt == DataType::U8; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_range_function) + }, + { + "u16_neon_range", + [](const RangeSelectorData & data) { return data.dt == DataType::U16; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::u16_neon_range_function) + }, + { + "u32_neon_range", + [](const RangeSelectorData & data) { return data.dt == DataType::U32; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::u32_neon_range_function) + }, + { + "s8_neon_range", + [](const RangeSelectorData & data) { return data.dt == DataType::S8; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_range_function) + }, + { + "s16_neon_range", + [](const RangeSelectorData & data) { return data.dt == DataType::S16; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_range_function) + }, + { + "s32_neon_range", + [](const RangeSelectorData & data) { return data.dt == DataType::S32; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s32_neon_range_function) + }, +}; - // Compute left-over elements - for(; x < window_end_x; ++x) +/** Micro-kernel selector + * + * @param[in] data Selection data passed to help pick the appropriate micro-kernel + * + * @return A matching micro-kernel else nullptr + */ +const RangeUKernel *get_implementation(const RangeSelectorData &data) +{ + for(const auto &uk : available_kernels) + { + if(uk.is_selected(data)) { - const auto res = start + x * step; - *(out_ptr + x) = res; + return &uk; } - - }, - output_it); + } + return nullptr; } Status validate_arguments(const ITensorInfo &output, const float start, const float end, const float step) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, - 1, - DataType::U8, DataType::S8, - DataType::U16, DataType::S16, - DataType::U32, DataType::S32, - DataType::F16, DataType::F32); + const auto *uk = get_implementation(RangeSelectorData{ output.data_type() }); + ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start < end) && (step <= 0)), "step must be greater than 0 when start < end"); @@ -111,7 +139,7 @@ Status validate_arguments(const ITensorInfo &output, const float start, const fl } // namespace NERangeKernel::NERangeKernel() - : _func(nullptr), _start(0), _end(1), _step(1), _output(nullptr) + : _start(0), _end(1), _step(1), _output(nullptr) { } @@ -131,38 +159,6 @@ void NERangeKernel::configure(ITensor *output, float start, float end, float ste _end = end; _step = step; _output = output; - switch(_output->info()->data_type()) - { - case DataType::U8: - _func = &range_function<uint8_t>; - break; - case DataType::U16: - _func = &range_function<uint16_t>; - break; - case DataType::U32: - _func = &range_function<uint32_t>; - break; - case DataType::S8: - _func = &range_function<int8_t>; - break; - case DataType::S16: - _func = &range_function<int16_t>; - break; - case DataType::S32: - _func = &range_function<int32_t>; - break; - case DataType::F32: - _func = &range_function<float>; - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - _func = &range_function<float16_t>; - break; -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - default: - ARM_COMPUTE_ERROR("Unsupported data type."); - break; - } INEKernel::configure(win); } @@ -181,8 +177,8 @@ void NERangeKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr); + const auto *uk = get_implementation(RangeSelectorData{ _output->info()->data_type() }); - (*_func)(_output, _start, _step, window); + uk->ukernel(_output, _start, _step, window); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NERangeKernel.h b/src/core/NEON/kernels/NERangeKernel.h index 7c42ef11d..90560995e 100644 --- a/src/core/NEON/kernels/NERangeKernel.h +++ b/src/core/NEON/kernels/NERangeKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -80,11 +80,10 @@ public: private: using RangeFunction = void(ITensor *output, float start, float step, const Window &window); - RangeFunction *_func; /**< Range function to be called */ - float _start; /**< Start of sequence */ - float _end; /**< End of sequence */ - float _step; /**< Increment/step value */ - ITensor *_output; /**< Destination tensor */ + float _start; /**< Start of sequence */ + float _end; /**< End of sequence */ + float _step; /**< Increment/step value */ + ITensor *_output; /**< Destination tensor */ }; } // namespace arm_compute #endif /* ARM_COMPUTE_NERANGEKERNEL_H */ diff --git a/src/core/NEON/kernels/NERemapKernel.cpp b/src/core/NEON/kernels/NERemapKernel.cpp deleted file mode 100644 index a1ba29e4c..000000000 --- a/src/core/NEON/kernels/NERemapKernel.cpp +++ /dev/null @@ -1,326 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/NEON/kernels/NERemapKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/ScaleHelpers.h" -#include "src/core/helpers/WindowHelpers.h" - -#include <arm_neon.h> -#include <cstddef> -#include <cstdint> - -using namespace arm_compute::scale_helpers; - -namespace arm_compute -{ -class Coordinates; - -namespace -{ -inline int32_t num_out_of_tensor(const float *mapx_ptr, const float *mapy_ptr, const int32x4_t &width_1, const int32x4_t &height_1) -{ - const int32x4_t mapx_s32 = vcvtq_s32_f32(vld1q_f32(mapx_ptr)); - const int32x4_t mapy_s32 = vcvtq_s32_f32(vld1q_f32(mapy_ptr)); - - const int32x4_t outbx_s32 = vminq_s32(vmaxq_s32(vminq_s32(vsubq_s32(width_1, mapx_s32), mapx_s32), vdupq_n_s32(-1)), vdupq_n_s32(0)); // Contains -1 if out of border in x, 0 otherwise - const int32x4_t outby_s32 = vminq_s32(vmaxq_s32(vminq_s32(vsubq_s32(height_1, mapy_s32), mapy_s32), vdupq_n_s32(-1)), vdupq_n_s32(0)); // Contains -1 if out of border in y, 0 otherwise - - const int32x4_t out_of_tensor_v = vminq_s32(outbx_s32, outby_s32); -#if defined(__aarch64__) - // only AArch64 supports vaddv - return vaddvq_s32(out_of_tensor_v); -#else // __aarch64__ - return vgetq_lane_s32(out_of_tensor_v, 0) + vgetq_lane_s32(out_of_tensor_v, 1) + vgetq_lane_s32(out_of_tensor_v, 2) + vgetq_lane_s32(out_of_tensor_v, 3); -#endif // __aarch64__ -} - -inline void serial_remap_nearest_interpolation(const uint8_t *in_ptr, const float *mapx_ptr, const float *mapy_ptr, uint8_t *out_ptr, - int32_t width_val, int32_t height_val, int32_t in_stride_val, uint8_t constant_border_value) -{ - const auto x_s32 = static_cast<int32_t>(*mapx_ptr); - const auto y_s32 = static_cast<int32_t>(*mapy_ptr); - if(x_s32 < 0 || y_s32 < 0 || x_s32 >= width_val || y_s32 >= height_val) - { - *(out_ptr) = constant_border_value; - } - else - { - *(out_ptr) = in_ptr[x_s32 + y_s32 * in_stride_val]; - } -} - -inline int32x4_t offset_nearest_interpolation(const float *mapx_ptr, const float *mapy_ptr, const int32x4_t &stride) -{ - const int32x4_t mapx_s32 = vcvtq_s32_f32(vld1q_f32(mapx_ptr)); - const int32x4_t mapy_s32 = vcvtq_s32_f32(vld1q_f32(mapy_ptr)); - return vmlaq_s32(mapx_s32, mapy_s32, stride); -} - -inline uint8_t pixel_bilinear_c1_clamp(const uint8_t *pixel_ptr, int32_t stride, int32_t width, int32_t height, float x, float y, uint8_t constant_border_value) -{ - x = std::max(-1.f, std::min(x, static_cast<float>(width))); - y = std::max(-1.f, std::min(y, static_cast<float>(height))); - - const int32_t xi = static_cast<int32_t>(std::floor(x)); - const int32_t yi = static_cast<int32_t>(std::floor(y)); - - const float dx = x - static_cast<float>(xi); - const float dy = y - static_cast<float>(yi); - - // Calculating the address won't trigger a segfault in case the value is outside the tensor - // The ternary operator resolves the values in both conditions - const uint8_t *a00 = (xi < 0 || xi >= width || yi < 0 || yi >= height) ? &constant_border_value : (pixel_ptr + xi + yi * stride); - const uint8_t *a01 = (xi + 1 >= width || yi < 0 || yi >= height) ? &constant_border_value : (pixel_ptr + xi + 1 + yi * stride); - const uint8_t *a10 = (xi < 0 || xi >= width || yi + 1 >= height) ? &constant_border_value : (pixel_ptr + xi + yi * stride + stride); - const uint8_t *a11 = (xi + 1 >= width || yi + 1 >= height) ? &constant_border_value : (pixel_ptr + xi + 1 + yi * stride + stride); - - const float dx1 = 1.0f - dx; - const float dy1 = 1.0f - dy; - const float w1 = dx1 * dy1; - const float w2 = dx * dy1; - const float w3 = dx1 * dy; - const float w4 = dx * dy; - - return static_cast<uint8_t>((*a00) * w1 + (*a01) * w2 + (*a10) * w3 + (*a11) * w4); -} -} // namespace - -NERemapKernel::NERemapKernel() - : _func(nullptr), _input(nullptr), _output(nullptr), _map_x(nullptr), _map_y(nullptr), _border_mode(BorderMode::UNDEFINED), _constant_border_value(0) -{ -} - -void NERemapKernel::configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32); - - _input = input; - _output = output; - _map_x = map_x; - _map_y = map_y; - _border_mode = border_mode; - _constant_border_value = constant_border_value; - - switch(policy) - { - case InterpolationPolicy::NEAREST_NEIGHBOR: - { - _func = &NERemapKernel::remap_nearest; - break; - } - case InterpolationPolicy::BILINEAR: - { - _func = &NERemapKernel::remap_bilinear; - break; - } - default: - ARM_COMPUTE_ERROR("Unsupported interpolation mode"); - break; - } - - // Configure kernel window - Window win = calculate_max_window(*output->info(), Steps()); - INEKernel::configure(win); -} - -void NERemapKernel::remap_nearest(const Window &window) -{ - // Don't increment in X and Y direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - - const auto window_start_x = static_cast<int32_t>(window.x().start()); - const auto window_end_x = static_cast<int32_t>(window.x().end()); - const int32_t window_step_x = 8; - - // Don't increment in X direction for the output, mapx, mapy tensors - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(_input, win_in); - Iterator out(_output, win); - Iterator mapx(_map_x, win); - Iterator mapy(_map_y, win); - - const int32_t width_val = static_cast<int32_t>(_input->info()->dimension(0)); - const int32_t height_val = static_cast<int32_t>(_input->info()->dimension(1)); - const int32_t in_stride_val = static_cast<int32_t>(_input->info()->strides_in_bytes()[1]); - const int32x4_t width_1 = vdupq_n_s32(width_val - 1); - const int32x4_t height_1 = vdupq_n_s32(height_val - 1); - const int32x4_t in_stride = vdupq_n_s32(in_stride_val); - - execute_window_loop(win, [&](const Coordinates &) - { - auto mapx_ptr = reinterpret_cast<const float *>(mapx.ptr()); - auto mapy_ptr = reinterpret_cast<const float *>(mapy.ptr()); - const uint8_t *in_ptr = in.ptr(); - uint8_t *out_ptr = out.ptr(); - int32_t x = window_start_x; - for(; x < window_end_x - window_step_x; x += window_step_x, mapx_ptr += window_step_x, mapy_ptr += window_step_x, out_ptr += window_step_x) - { - const int32_t out_of_tensor0 = num_out_of_tensor(mapx_ptr, mapy_ptr + 0, width_1, height_1); - const int32_t out_of_tensor1 = num_out_of_tensor(mapx_ptr + 4, mapy_ptr + 4, width_1, height_1); - const int32_t out_of_tensor = out_of_tensor0 + out_of_tensor1; - - if(out_of_tensor == -8) - { - // All elements are out of xy plane - uint8x8_t tmp = vdup_n_u8(_constant_border_value); - vst1_u8(out_ptr, tmp); - } - else if(out_of_tensor < 0) - { - // Some elements are out of xy plane - serial_remap_nearest_interpolation(in_ptr, mapx_ptr, mapy_ptr, out_ptr, width_val, height_val, in_stride_val, _constant_border_value); - serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 1, mapy_ptr + 1, out_ptr + 1, width_val, height_val, in_stride_val, _constant_border_value); - serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 2, mapy_ptr + 2, out_ptr + 2, width_val, height_val, in_stride_val, _constant_border_value); - serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 3, mapy_ptr + 3, out_ptr + 3, width_val, height_val, in_stride_val, _constant_border_value); - serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 4, mapy_ptr + 4, out_ptr + 4, width_val, height_val, in_stride_val, _constant_border_value); - serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 5, mapy_ptr + 5, out_ptr + 5, width_val, height_val, in_stride_val, _constant_border_value); - serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 6, mapy_ptr + 6, out_ptr + 6, width_val, height_val, in_stride_val, _constant_border_value); - serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 7, mapy_ptr + 7, out_ptr + 7, width_val, height_val, in_stride_val, _constant_border_value); - } - else - { - // All elements are in xy plane - uint8x8_t tmp = vdup_n_u8(0); - const int32x4_t offset0 = offset_nearest_interpolation(mapx_ptr, mapy_ptr, in_stride); - const int32x4_t offset1 = offset_nearest_interpolation(mapx_ptr + 4, mapy_ptr + 4, in_stride); - tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 0)], tmp, 0); - tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 1)], tmp, 1); - tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 2)], tmp, 2); - tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 3)], tmp, 3); - tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 0)], tmp, 4); - tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 1)], tmp, 5); - tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 2)], tmp, 6); - tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 3)], tmp, 7); - vst1_u8(out_ptr, tmp); - } - } - for(; x < window_end_x; ++x, ++mapx_ptr, ++mapy_ptr, ++out_ptr) - { - serial_remap_nearest_interpolation(in_ptr, mapx_ptr, mapy_ptr, out_ptr, width_val, height_val, in_stride_val, _constant_border_value); - } - }, - in, out, mapx, mapy); -} - -void NERemapKernel::remap_bilinear(const Window &window) -{ - // Don't increment in X and Y direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - - const auto window_start_x = static_cast<int32_t>(window.x().start()); - const auto window_end_x = static_cast<int32_t>(window.x().end()); - const int32_t window_step_x = 8; - - // Don't increment in X direction for the output, mapx, mapy tensors - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(_input, win_in); - Iterator out(_output, win); - Iterator mapx(_map_x, win); - Iterator mapy(_map_y, win); - - const int32_t width_val = static_cast<int32_t>(_input->info()->dimension(0)); - const int32_t height_val = static_cast<int32_t>(_input->info()->dimension(1)); - const int32x4_t width_2 = vdupq_n_s32(width_val - 2); - const int32x4_t height_2 = vdupq_n_s32(height_val - 2); - const int32_t in_stride_val = static_cast<int32_t>(_input->info()->strides_in_bytes()[1]); - - execute_window_loop(win, [&](const Coordinates &) - { - auto mapx_ptr = reinterpret_cast<const float *>(mapx.ptr()); - auto mapy_ptr = reinterpret_cast<const float *>(mapy.ptr()); - const uint8_t *in_ptr = in.ptr(); - uint8_t *out_ptr = out.ptr(); - int32_t x = window_start_x; - for(; x < window_end_x - window_step_x; x += window_step_x, mapx_ptr += window_step_x, mapy_ptr += window_step_x, out_ptr += window_step_x) - { - const int32_t out_of_tensor0 = num_out_of_tensor(mapx_ptr, mapy_ptr + 0, width_2, height_2); - const int32_t out_of_tensor1 = num_out_of_tensor(mapx_ptr + 4, mapy_ptr + 4, width_2, height_2); - const int32_t out_of_tensor = out_of_tensor0 + out_of_tensor1; - - if(out_of_tensor < 0) - { - // Elements are out of xy plane - *(out_ptr) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[0], mapy_ptr[0], _constant_border_value); - *(out_ptr + 1) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[1], mapy_ptr[1], _constant_border_value); - *(out_ptr + 2) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[2], mapy_ptr[2], _constant_border_value); - *(out_ptr + 3) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[3], mapy_ptr[3], _constant_border_value); - *(out_ptr + 4) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[4], mapy_ptr[4], _constant_border_value); - *(out_ptr + 5) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[5], mapy_ptr[5], _constant_border_value); - *(out_ptr + 6) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[6], mapy_ptr[6], _constant_border_value); - *(out_ptr + 7) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[7], mapy_ptr[7], _constant_border_value); - } - else - { - // All elements are in xy plane - uint8x8_t tmp = vdup_n_u8(0); - tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[0], mapy_ptr[0], _constant_border_value), tmp, 0); - tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[1], mapy_ptr[1], _constant_border_value), tmp, 1); - tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[2], mapy_ptr[2], _constant_border_value), tmp, 2); - tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[3], mapy_ptr[3], _constant_border_value), tmp, 3); - tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[4], mapy_ptr[4], _constant_border_value), tmp, 4); - tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[5], mapy_ptr[5], _constant_border_value), tmp, 5); - tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[6], mapy_ptr[6], _constant_border_value), tmp, 6); - tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[7], mapy_ptr[7], _constant_border_value), tmp, 7); - vst1_u8(out_ptr, tmp); - } - } - for(; x < window_end_x; ++x, ++mapx_ptr, ++mapy_ptr, ++out_ptr) - { - *(out_ptr) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[0], mapy_ptr[0], _constant_border_value); - } - }, - in, out, mapx, mapy); -} - -void NERemapKernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr); - - (this->*_func)(window); -} -} // namespace arm_compute
\ No newline at end of file diff --git a/src/core/NEON/kernels/NERemapKernel.h b/src/core/NEON/kernels/NERemapKernel.h deleted file mode 100644 index 33e929805..000000000 --- a/src/core/NEON/kernels/NERemapKernel.h +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_NEREMAPKERNEL_H -#define ARM_COMPUTE_NEREMAPKERNEL_H - -#include "arm_compute/core/Types.h" -#include "src/core/NEON/INEKernel.h" - -namespace arm_compute -{ -class ITensor; - -/** Kernel to perform a remap on a tensor */ -class NERemapKernel : public INEKernel -{ -public: - const char *name() const override - { - return "NERemapKernel"; - } - /** Default constructor */ - NERemapKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NERemapKernel(const NERemapKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NERemapKernel &operator=(const NERemapKernel &) = delete; - /** Allow instances of this class to be moved */ - NERemapKernel(NERemapKernel &&) = default; - /** Allow instances of this class to be moved */ - NERemapKernel &operator=(NERemapKernel &&) = default; - /** Default destructor */ - ~NERemapKernel() = default; - - /** Initialize the kernel's input, output and border mode. - * - * @param[in] input Source tensor. Data type supported: U8. - * @param[in] map_x Map for X coordinates. Data type supported: F32. - * @param[in] map_y Map for Y coordinates. Data type supported: F32. - * @param[out] output Destination tensor. Data types supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane. - * @param[in] policy The interpolation type. - * @param[in] border_mode Border mode to use on the input tensor. - * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. Defaults to 0. - */ - void configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -private: - /** function to perform nearest interpolation on the given window */ - void remap_nearest(const Window &window); - /** function to perform bilinear interpolation on the given window */ - void remap_bilinear(const Window &window); - /** Remap function to use for the particular interpolation type passed to configure() */ - void (NERemapKernel::*_func)(const Window &window); - - const ITensor *_input; /**< Input image */ - ITensor *_output; /**< Output image */ - const ITensor *_map_x; /**< Input remap x coordinates */ - const ITensor *_map_y; /**< Input remap y coordinates */ - BorderMode _border_mode; /**< Border mode */ - uint8_t _constant_border_value; /**< Border value to use */ -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_NEREMAPKERNEL_H */
\ No newline at end of file diff --git a/src/core/NEON/kernels/NESelectKernel.cpp b/src/core/NEON/kernels/NESelectKernel.cpp index 7c988e9fa..b8c9b244e 100644 --- a/src/core/NEON/kernels/NESelectKernel.cpp +++ b/src/core/NEON/kernels/NESelectKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -34,6 +34,10 @@ #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/common/Registrars.h" + +#include "src/cpu/kernels/select/list.h" + #include <arm_neon.h> #include <map> #include <string> @@ -42,125 +46,123 @@ namespace arm_compute { namespace { -template <typename ScalarType, typename VectorType> -void select_op(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - const int window_step_x, const int window_start_x, const int window_end_x, const int limit, VectorType (*condition_conversion)(const uint8_t *)) -{ - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator condition(cond, win); - Iterator input1(in1, win); - Iterator input2(in2, win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); - const auto condition_ptr = reinterpret_cast<const uint8_t *>(condition.ptr()); - const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr()); - int x = window_start_x; - for(; x <= limit; x += window_step_x) - { - const auto c = (*condition_conversion)(condition_ptr + x); - const auto a = wrapper::vloadq(input1_ptr + x); - const auto b = wrapper::vloadq(input2_ptr + x); - wrapper::vstore(output_ptr + x, wrapper::vbsl(c, a, b)); - } - for(; x < window_end_x; ++x) - { - const auto c = *(condition_ptr + x); - const auto a = *(input1_ptr + x); - const auto b = *(input2_ptr + x); - *(output_ptr + x) = static_cast<bool>(c) ? a : b; - } - }, - condition, input1, input2, output); -} - -template <typename ScalarType, typename VectorType> -void select_op_8(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +struct SelectKernelSelectorData { - const auto window_step_x = 16 / sizeof(ScalarType); - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); + DataType dt; + bool is_same_rank; +}; - select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType - { - static const auto zero = wrapper::vdup_n(static_cast<uint8_t>(0), arm_compute::wrapper::traits::vector_128_tag()); - return wrapper::vcgt(wrapper::vloadq(condition_ptr), zero); - }); -} +using SelectorPtr = std::add_pointer<bool(const SelectKernelSelectorData &data)>::type; +using KernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, const ITensor *, ITensor *, const Window &)>::type; -template <typename ScalarType, typename VectorType> -void select_op_16(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +struct SelectKernelSelector { - const auto window_step_x = 16 / sizeof(ScalarType); - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); + const char *name; + const SelectorPtr is_selected; + KernelPtr ukernel; +}; - select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType - { - static const auto zero = wrapper::vdup_n(static_cast<uint16_t>(0), arm_compute::wrapper::traits::vector_128_tag()); - return wrapper::vcgt(wrapper::vmovl(wrapper::vload(condition_ptr)), zero); - }); -} - -template <typename ScalarType, typename VectorType> -void select_op_32(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +static const SelectKernelSelector available_kernels[] = { - const auto window_step_x = 16 / sizeof(ScalarType); - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); - - select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType { - static const auto zero = wrapper::vdup_n(static_cast<uint32_t>(0), arm_compute::wrapper::traits::vector_128_tag()); - return wrapper::vcgt(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vload(condition_ptr)))), zero); - }); -} + "neon_s8_same_rank", + [](const SelectKernelSelectorData & data) { return data.dt == DataType::S8 && data.is_same_rank == true; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_same_rank) + }, + { + "neon_s16_same_rank", + [](const SelectKernelSelectorData & data) { return data.dt == DataType::S16 && data.is_same_rank == true; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_same_rank) + }, + { + "neon_s32_same_rank", + [](const SelectKernelSelectorData & data) { return data.dt == DataType::S32 && data.is_same_rank == true; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_same_rank) + }, + { + "neon_u8_same_rank", + [](const SelectKernelSelectorData & data) { return data.dt == DataType::U8 && data.is_same_rank == true; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_same_rank) + }, + { + "neon_u16_same_rank", + [](const SelectKernelSelectorData & data) { return data.dt == DataType::U16 && data.is_same_rank == true; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_same_rank) + }, + { + "neon_u32_same_rank", + [](const SelectKernelSelectorData & data) { return data.dt == DataType::U32 && data.is_same_rank == true; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_same_rank) + }, + { + "neon_s8_not_same_rank", + [](const SelectKernelSelectorData & data) { return data.dt == DataType::S8 && data.is_same_rank == false; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_not_same_rank) + }, + { + "neon_s16_not_same_rank", + [](const SelectKernelSelectorData & data) { return data.dt == DataType::S16 && data.is_same_rank == false; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_not_same_rank) + }, + { + "neon_s32_not_same_rank", + [](const SelectKernelSelectorData & data) { return data.dt == DataType::S32 && data.is_same_rank == false; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_not_same_rank) + }, + { + "neon_u8_not_same_rank", + [](const SelectKernelSelectorData & data) { return data.dt == DataType::U8 && data.is_same_rank == false; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_not_same_rank) + }, + { + "neon_u16_not_same_rank", + [](const SelectKernelSelectorData & data) { return data.dt == DataType::U16 && data.is_same_rank == false; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_not_same_rank) + }, + { + "neon_u32_not_same_rank", + [](const SelectKernelSelectorData & data) { return data.dt == DataType::U32 && data.is_same_rank == false; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_not_same_rank) + }, + { + "neon_f16_same_rank", + [](const SelectKernelSelectorData & data) { return data.dt == DataType::F16 && data.is_same_rank == true; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_same_rank) + }, + { + "neon_f16_not_same_rank", + [](const SelectKernelSelectorData & data) { return data.dt == DataType::F16 && data.is_same_rank == false; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_not_same_rank) + }, + { + "neon_f32_same_rank", + [](const SelectKernelSelectorData & data) { return data.dt == DataType::F32 && data.is_same_rank == true; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_same_rank) + }, + { + "neon_f32_not_same_rank", + [](const SelectKernelSelectorData & data) { return data.dt == DataType::F32 && data.is_same_rank == false; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_not_same_rank) + }, +}; -template <typename ScalarType> -void select_op_not_same_rank(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +const SelectKernelSelector *get_implementation(const SelectKernelSelectorData &data) { - ARM_COMPUTE_UNUSED(window); - - auto output_ptr = reinterpret_cast<ScalarType *>(out->buffer()); - const auto condition_ptr = reinterpret_cast<const uint8_t *>(cond->buffer()); - const auto input1_ptr = reinterpret_cast<const ScalarType *>(in1->buffer()); - const auto input2_ptr = reinterpret_cast<const ScalarType *>(in2->buffer()); - - const int outer_size = cond->info()->total_size() / cond->info()->element_size(); - const int inner_size = (in1->info()->total_size() / in1->info()->element_size()) / outer_size; - int offset = 0; - const int step = 16 / in1->info()->element_size(); - - for(int i = 0; i < outer_size; ++i) + for(const auto &uk : available_kernels) { - int x = offset; - const auto input_ptr = static_cast<bool>(*(condition_ptr + i)) ? input1_ptr : input2_ptr; - for(; x <= offset + inner_size - step; x += step) - { - wrapper::vstore(output_ptr + x, wrapper::vloadq(input_ptr + x)); - } - if(x <= offset + inner_size - (step / 2)) - { - wrapper::vstore(output_ptr + x, wrapper::vload(input_ptr + x)); - x += step / 2; - } - for(; x < offset + inner_size; ++x) + if(uk.is_selected(data)) { - *(output_ptr + x) = *(input_ptr + x); + return &uk; } - offset += inner_size; } + return nullptr; } + } // namespace NESelectKernel::NESelectKernel() - : _function(nullptr), _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false) + : /*_function(nullptr), */ _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false) { } @@ -178,51 +180,6 @@ void NESelectKernel::configure(const ITensor *c, const ITensor *x, const ITensor _output = output; _has_same_rank = (c->info()->tensor_shape().num_dimensions() == x->info()->tensor_shape().num_dimensions()); - std::string function_to_call("op_"); - function_to_call += string_from_data_type(x->info()->data_type()); - - static std::map<std::string, SelectFunction *> map_function; - - if(_has_same_rank) - { - map_function = - { - { "op_S8", &select_op_8<int8_t, uint8x16_t> }, - { "op_S16", &select_op_16<int16_t, uint16x8_t> }, - { "op_S32", &select_op_32<int32_t, uint32x4_t> }, - { "op_U8", &select_op_8<uint8_t, uint8x16_t> }, - { "op_U16", &select_op_16<uint16_t, uint16x8_t> }, - { "op_U32", &select_op_32<uint32_t, uint32x4_t> }, - { "op_F32", &select_op_32<float, uint32x4_t> } - }; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - map_function["op_F16"] = &select_op_16<float16_t, uint16x8_t>; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - } - else - { - map_function = - { - { "op_S8", &select_op_not_same_rank<int8_t> }, - { "op_S16", &select_op_not_same_rank<int16_t> }, - { "op_S32", &select_op_not_same_rank<int32_t> }, - { "op_U8", &select_op_not_same_rank<uint8_t> }, - { "op_U16", &select_op_not_same_rank<uint16_t> }, - { "op_U32", &select_op_not_same_rank<uint32_t> }, - { "op_F32", &select_op_not_same_rank<float> } - }; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - map_function["op_F16"] = &select_op_not_same_rank<float16_t>; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - } - - auto it = map_function.find(function_to_call); - - if(it != map_function.end()) - { - _function = it->second; - } - Window win = calculate_max_window(*x->info()); INEKernel::configure(win); } @@ -254,7 +211,12 @@ void NESelectKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_function == nullptr); - _function(_c, _x, _y, _output, window); + ARM_COMPUTE_ERROR_ON(_output == nullptr); + ARM_COMPUTE_ERROR_ON(_output->info() == nullptr); + + const auto *uk = get_implementation(SelectKernelSelectorData{ _output->info()->data_type(), _has_same_rank }); + ARM_COMPUTE_ERROR_ON(uk == nullptr); + ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr); + uk->ukernel(_c, _x, _y, _output, window); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NESelectKernel.h b/src/core/NEON/kernels/NESelectKernel.h index f7142feff..e82105a68 100644 --- a/src/core/NEON/kernels/NESelectKernel.h +++ b/src/core/NEON/kernels/NESelectKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2020, 2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -82,22 +82,12 @@ public: void run(const Window &window, const ThreadInfo &info) override; private: - /** Common signature for all the specialised select functions - * - * @param[in] c Condition input tensor. Data types supported: U8. - * @param[in] x First input tensor. Data types supported: All. - * @param[in] y Second input tensor. Data types supported: Same as @p x - * @param[in] output Output tensor. Data types supported: Same as @p x. - */ - using SelectFunction = void(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window); - /** Select function to use for the particular tensor types passed to configure() */ - SelectFunction *_function; - const ITensor *_c; /**< Condition tensor */ - const ITensor *_x; /**< Source tensor 1 */ - const ITensor *_y; /**< Source tensor 2 */ - ITensor *_output; /**< Destination tensor */ - bool _has_same_rank; /**< Flag that indicates if condition tensor and other inputs have the same rank */ + const ITensor *_c; /**< Condition tensor */ + const ITensor *_x; /**< Source tensor 1 */ + const ITensor *_y; /**< Source tensor 2 */ + ITensor *_output; /**< Destination tensor */ + bool _has_same_rank; /**< Flag that indicates if condition tensor and other inputs have the same rank */ }; } // namespace arm_compute #endif /* ARM_COMPUTE_NESELECTKERNEL_H */ diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp index 1d52b56d3..ea41529d8 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp @@ -136,7 +136,14 @@ UniqueDepthwiseCommon<TInput, TWeight, TOutput> depthwise(const DepthwiseArgs &a { const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> *impl = nullptr; const bool success = find_implementation<TInput, TWeight, TOutput, OutputStage>(args, os, impl); - return UniqueDepthwiseCommon<TInput, TWeight, TOutput>(success ? impl->get_instance(args, os) : nullptr); + + if(success) + { + auto i = impl->get_instance(args, os); + i->set_name(impl->name); + return UniqueDepthwiseCommon<TInput, TWeight, TOutput>(i); + } + return nullptr; } } // namespace depthwise diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp index 20c823014..79fc65e56 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp @@ -23,7 +23,9 @@ */ #pragma once +#if !defined(__OpenBSD__) #include <alloca.h> +#endif /* !defined(__OpenBSD__) */ #include <algorithm> #include <cassert> diff --git a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp index d3857a50e..809946f10 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp @@ -144,7 +144,7 @@ struct GemmImplementation<Top, Tret, Nothing> { /* "Master" function implemented for each valid combination of types. * Returns a list of GEMM implementation descriptors for processing by the - * other functions, terminated by an implementation with + * other functions, ended by an implementation with * method==GemmMethod::DEFAULT. */ template<typename Top, typename Tret, class OutputStage = Nothing> const GemmImplementation<Top, Tret, OutputStage> *gemm_implementation_list(); diff --git a/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp b/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp index d5003e4a1..91988e8c3 100644 --- a/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp +++ b/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp @@ -28,7 +28,9 @@ #include "interleave_indirect.hpp" #include "bfloat.hpp" +#if !defined(__OpenBSD__) #include <alloca.h> +#endif /* !defined(__OpenBSD__) */ #include <algorithm> #include <cstddef> diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp index 758f2b1f8..9af1b4df1 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp @@ -72,7 +72,7 @@ public: return { 15.361, 0.9341, 0.1636 }; case CPUModel::V1: - return { 62.40, 4.71, 0.67 }; + return { 51.14, 7.38, 0.65 }; default: return { 29.0698, 3.9793, 0.4003 }; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp index 21c9f5966..6d333f344 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp @@ -80,7 +80,7 @@ public: return { 15.361, 0.9341, 0.1636 }; case CPUModel::V1: - return { 62.40, 4.71, 0.67 }; + return { 51.14, 7.38, 0.65 }; default: return { 29.0698, 3.9793, 0.4003 }; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp index 090dd5855..e6e795097 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp @@ -82,7 +82,7 @@ public: case CPUModel::A510: return { 6.81 }; case CPUModel::V1: - return { 28.40 }; + return { 22.33 }; } } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp index f5e9009f6..39ffcbef1 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp @@ -82,7 +82,7 @@ public: case CPUModel::A510: return { 6.70 }; case CPUModel::V1: - return { 26.64 }; + return { 21.28 }; } } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp index 94f578368..905a60265 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp @@ -84,7 +84,7 @@ public: case CPUModel::A510: return { 14.81 }; case CPUModel::V1: - return { 48.34 }; + return { 44.54 }; } } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp index bc933afd9..69ea87bc9 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp @@ -81,7 +81,7 @@ public: case CPUModel::A510: return { 27.99 }; case CPUModel::V1: - return { 68.76 }; + return { 62.26 }; } } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp index c5105a6d4..ce96c1b28 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp @@ -75,29 +75,29 @@ public: template<typename T> static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) { - if (std::is_same<T, uint8_t>::value) { + if (std::is_same<T, uint32_t>::value) { switch (ci->get_cpu_model()) { - case CPUModel::A55r1: - return { 9.5238, 2.0799, 0.2279 }; default: - return { 29.6736, 11.4025, 0.5591 }; + return { 31.63 }; case CPUModel::A510: - return { 16.65, 3.92, 0.48 }; + return { 15.89 }; case CPUModel::V1: - return { 55.42, 19.29, 0.92 }; + return { 53.87 }; + case CPUModel::A55r1: + return { 9.217 }; } } - if (std::is_same<T, uint32_t>::value) { + if (std::is_same<T, uint8_t>::value) { switch (ci->get_cpu_model()) { - default: - return { 31.63 }; case CPUModel::A55r1: - return { 9.217 }; + return { 9.5238, 2.0799, 0.2279 }; + default: + return { 29.6736, 11.4025, 0.5591 }; case CPUModel::A510: - return { 15.89 }; + return { 16.65, 3.92, 0.48 }; case CPUModel::V1: - return { 53.87 }; + return { 42.62, 16.32, 0.83 }; } } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp index 24bad3c63..b5cedc7e9 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp @@ -92,7 +92,7 @@ public: case CPUModel::A510: return { 33.64, 3.92, 0.48 }; case CPUModel::V1: - return { 86.71, 19.00, 0.93 }; + return { 63.94, 16.18, 0.83 }; } } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp index 9b3517a80..6ec6bd2ed 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp @@ -36,6 +36,7 @@ namespace arm_gemm { // Actual kernel implementations void a64_interleaved_bf16fp32_mmla_8x12( ARGLIST ); +void a64_interleaved_bf16fp32_mmla_8x12_a510( ARGLIST ); class cls_a64_interleaved_bf16fp32_mmla_8x12 { @@ -78,7 +79,7 @@ public: default: return { 31.54, 4.30, 7.33 }; case CPUModel::V1: - return { 59.94, 5.08, 9.83 }; + return { 41.44, 5.01, 5.64 }; case CPUModel::A510: return { 7.82, 4.05, 3.07 }; } @@ -101,8 +102,15 @@ public: // Default to the generic kernel kern_type kernel=a64_interleaved_bf16fp32_mmla_8x12; - cls_a64_interleaved_bf16fp32_mmla_8x12(const CPUInfo *) + cls_a64_interleaved_bf16fp32_mmla_8x12(const CPUInfo *ci) { + switch(ci->get_cpu_model()) { + default: + break; + case CPUModel::A510: + kernel=a64_interleaved_bf16fp32_mmla_8x12_a510; + break; + } } }; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp new file mode 100644 index 000000000..0235e91bf --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifdef __aarch64__ + +#include <cstddef> +#include "../../bfloat.hpp" + +namespace arm_gemm { + +void a64_interleaved_bf16fp32_mmla_8x12_a510( + const bfloat16 *Apanel, const bfloat16 *Bpanel, + float *Cpanel, int ablocks, int bblocks, int K) { + + struct KernelArgs { + size_t bblocks = {}; + size_t K = {}; + const bfloat16 *Bpanel = {}; + } ka; + + ka.bblocks = bblocks; + ka.K = (K/4) - 1; + ka.Bpanel = Bpanel; + + __asm__ __volatile__( + + "1:" // Height loop + "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n" + "mov x21, %x[Apanel]\n" + "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" + "2:" // Width loop + "mov %x[Apanel], x21\n" + "ldr x19, [%x[args_ptr], %[offsetof_K]]\n" + "ld1 { v0.8h }, [%x[Apanel]], #0x10\n" + "ld1 { v4.8h }, [x20], #0x10\n" + "ld1 { v1.8h }, [%x[Apanel]], #0x10\n" + "cmp x19, #0x2\n" + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "ld1 { v5.8h }, [x20], #0x10\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "ld1 { v2.8h }, [%x[Apanel]], #0x10\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v31.16b, #0x0\n" + "blt 4f\n" + "3:" // main loop head + "ld1 { v3.8h }, [%x[Apanel]], #0x10\n" + ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" + ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n" + "ldp q6, q7, [x20], #0x20\n" + ".inst 0x6e45ec0b // bfmmla v11.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec31 // bfmmla v17.4s, v1.8h, v5.8h\n" + ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n" + "sub x19, x19, #0x2\n" + ".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n" + ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n" + "cmp x19, #0x2\n" + ".inst 0x6e45ec7d // bfmmla v29.4s, v3.8h, v5.8h\n" + "ldp q4, q5, [x20], #0x20\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" + ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n" + "ld1 { v0.8h }, [%x[Apanel]], #0x10\n" + ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec7b // bfmmla v27.4s, v3.8h, v6.8h\n" + ".inst 0x6e47ec32 // bfmmla v18.4s, v1.8h, v7.8h\n" + "ld1 { v6.8h }, [x20], #0x10\n" + ".inst 0x6e44ec30 // bfmmla v16.4s, v1.8h, v4.8h\n" + ".inst 0x6e45ec33 // bfmmla v19.4s, v1.8h, v5.8h\n" + "ld1 { v1.8h }, [%x[Apanel]], #0x10\n" + ".inst 0x6e47ec58 // bfmmla v24.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec7e // bfmmla v30.4s, v3.8h, v7.8h\n" + "ld1 { v7.8h }, [x20], #0x10\n" + ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n" + ".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n" + "ld1 { v2.8h }, [%x[Apanel]], #0x10\n" + ".inst 0x6e44ec7c // bfmmla v28.4s, v3.8h, v4.8h\n" + ".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n" + "ld1 { v3.8h }, [%x[Apanel]], #0x10\n" + "ldp q4, q5, [x20], #0x20\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec31 // bfmmla v17.4s, v1.8h, v7.8h\n" + ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + ".inst 0x6e46ec7a // bfmmla v26.4s, v3.8h, v6.8h\n" + ".inst 0x6e47ec7d // bfmmla v29.4s, v3.8h, v7.8h\n" + "ldp q6, q7, [x20], #0x20\n" + ".inst 0x6e44ec09 // bfmmla v9.4s, v0.8h, v4.8h\n" + ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n" + ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + "ld1 { v0.8h }, [%x[Apanel]], #0x10\n" + ".inst 0x6e44ec2f // bfmmla v15.4s, v1.8h, v4.8h\n" + ".inst 0x6e44ec55 // bfmmla v21.4s, v2.8h, v4.8h\n" + ".inst 0x6e44ec7b // bfmmla v27.4s, v3.8h, v4.8h\n" + ".inst 0x6e45ec32 // bfmmla v18.4s, v1.8h, v5.8h\n" + "ld1 { v4.8h }, [x20], #0x10\n" + ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n" + ".inst 0x6e47ec33 // bfmmla v19.4s, v1.8h, v7.8h\n" + "ld1 { v1.8h }, [%x[Apanel]], #0x10\n" + ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n" + ".inst 0x6e45ec7e // bfmmla v30.4s, v3.8h, v5.8h\n" + "ld1 { v5.8h }, [x20], #0x10\n" + ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" + ".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n" + "ld1 { v2.8h }, [%x[Apanel]], #0x10\n" + ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n" + ".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n" + "bge 3b\n" + "4:" // main loop skip + "ld1 { v3.8h }, [%x[Apanel]], #0x10\n" + ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" + ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n" + "ldp q6, q7, [x20], #0x20\n" + ".inst 0x6e45ec0b // bfmmla v11.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec31 // bfmmla v17.4s, v1.8h, v5.8h\n" + ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n" + ".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n" + ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n" + ".inst 0x6e45ec7d // bfmmla v29.4s, v3.8h, v5.8h\n" + "ldp q4, q5, [x20], #0x20\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec7b // bfmmla v27.4s, v3.8h, v6.8h\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" + ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n" + ".inst 0x6e47ec32 // bfmmla v18.4s, v1.8h, v7.8h\n" + ".inst 0x6e44ec30 // bfmmla v16.4s, v1.8h, v4.8h\n" + ".inst 0x6e45ec33 // bfmmla v19.4s, v1.8h, v5.8h\n" + ".inst 0x6e47ec58 // bfmmla v24.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec7e // bfmmla v30.4s, v3.8h, v7.8h\n" + ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n" + ".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n" + ".inst 0x6e44ec7c // bfmmla v28.4s, v3.8h, v4.8h\n" + ".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n" + "cbz x19, 5f\n" + "ld1 { v0.8h }, [%x[Apanel]], #0x10\n" + "ld1 { v6.8h }, [x20], #0x10\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + "ld1 { v1.8h }, [%x[Apanel]], #0x10\n" + "ld1 { v7.8h }, [x20], #0x10\n" + ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" + "ld1 { v2.8h }, [%x[Apanel]], #0x10\n" + "ld1 { v3.8h }, [%x[Apanel]], #0x10\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + "ldp q4, q5, [x20], #0x20\n" + ".inst 0x6e47ec31 // bfmmla v17.4s, v1.8h, v7.8h\n" + ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + ".inst 0x6e46ec7a // bfmmla v26.4s, v3.8h, v6.8h\n" + ".inst 0x6e47ec7d // bfmmla v29.4s, v3.8h, v7.8h\n" + "ldp q6, q7, [x20], #0x20\n" + ".inst 0x6e44ec09 // bfmmla v9.4s, v0.8h, v4.8h\n" + ".inst 0x6e44ec2f // bfmmla v15.4s, v1.8h, v4.8h\n" + ".inst 0x6e44ec55 // bfmmla v21.4s, v2.8h, v4.8h\n" + ".inst 0x6e44ec7b // bfmmla v27.4s, v3.8h, v4.8h\n" + ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n" + ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + ".inst 0x6e45ec32 // bfmmla v18.4s, v1.8h, v5.8h\n" + ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n" + ".inst 0x6e47ec33 // bfmmla v19.4s, v1.8h, v7.8h\n" + ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n" + ".inst 0x6e45ec7e // bfmmla v30.4s, v3.8h, v5.8h\n" + ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" + ".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n" + ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n" + ".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n" + "5:" // multiply loop done + "subs x22, x22, #0x1\n" + "uzp1 v4.2d, v8.2d, v11.2d\n" + "uzp2 v8.2d, v8.2d, v11.2d\n" + "uzp1 v11.2d, v9.2d, v12.2d\n" + "uzp2 v9.2d, v9.2d, v12.2d\n" + "str q4, [%x[Cpanel], #0x0]\n" + "uzp1 v12.2d, v10.2d, v13.2d\n" + "uzp2 v10.2d, v10.2d, v13.2d\n" + "str q11, [%x[Cpanel], #0x10]\n" + "str q12, [%x[Cpanel], #0x20]\n" + "uzp1 v13.2d, v14.2d, v17.2d\n" + "uzp2 v14.2d, v14.2d, v17.2d\n" + "str q8, [%x[Cpanel], #0x30]\n" + "uzp1 v17.2d, v15.2d, v18.2d\n" + "uzp2 v15.2d, v15.2d, v18.2d\n" + "str q9, [%x[Cpanel], #0x40]\n" + "uzp1 v18.2d, v16.2d, v19.2d\n" + "uzp2 v16.2d, v16.2d, v19.2d\n" + "str q10, [%x[Cpanel], #0x50]\n" + "uzp1 v19.2d, v20.2d, v23.2d\n" + "uzp2 v20.2d, v20.2d, v23.2d\n" + "str q13, [%x[Cpanel], #0x60]\n" + "uzp1 v23.2d, v21.2d, v24.2d\n" + "uzp2 v21.2d, v21.2d, v24.2d\n" + "str q17, [%x[Cpanel], #0x70]\n" + "uzp1 v24.2d, v22.2d, v25.2d\n" + "uzp2 v22.2d, v22.2d, v25.2d\n" + "str q18, [%x[Cpanel], #0x80]\n" + "uzp1 v25.2d, v26.2d, v29.2d\n" + "uzp2 v26.2d, v26.2d, v29.2d\n" + "str q14, [%x[Cpanel], #0x90]\n" + "uzp1 v29.2d, v27.2d, v30.2d\n" + "uzp2 v27.2d, v27.2d, v30.2d\n" + "str q15, [%x[Cpanel], #0xa0]\n" + "uzp1 v30.2d, v28.2d, v31.2d\n" + "uzp2 v28.2d, v28.2d, v31.2d\n" + "str q16, [%x[Cpanel], #0xb0]\n" + "str q19, [%x[Cpanel], #0xc0]\n" + "str q23, [%x[Cpanel], #0xd0]\n" + "str q24, [%x[Cpanel], #0xe0]\n" + "str q20, [%x[Cpanel], #0xf0]\n" + "str q21, [%x[Cpanel], #0x100]\n" + "str q22, [%x[Cpanel], #0x110]\n" + "str q25, [%x[Cpanel], #0x120]\n" + "str q29, [%x[Cpanel], #0x130]\n" + "str q30, [%x[Cpanel], #0x140]\n" + "str q26, [%x[Cpanel], #0x150]\n" + "str q27, [%x[Cpanel], #0x160]\n" + "str q28, [%x[Cpanel], #0x170]\n" + "add %x[Cpanel], %x[Cpanel], #0x180\n" + "bgt 2b\n" + "subs %x[ablocks], %x[ablocks], #0x1\n" + "bne 1b\n" + : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) + : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp index ff69bc8f5..4cc3ed040 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp @@ -35,6 +35,7 @@ namespace arm_gemm { // Actual kernel implementations void a64_interleaved_s8s32_mmla_8x12( ARGLIST ); +void a64_interleaved_s8s32_mmla_8x12_a510( ARGLIST ); class cls_a64_interleaved_s8s32_mmla_8x12 { @@ -91,7 +92,7 @@ public: case CPUModel::A510: return { 48.22, 2.49, 0.29 }; case CPUModel::V1: - return { 116.76, 4.67, 0.60 }; + return { 75.54, 8.06, 0.63 }; } } @@ -100,9 +101,17 @@ public: // Default to the generic kernel kern_type kernel=a64_interleaved_s8s32_mmla_8x12; - cls_a64_interleaved_s8s32_mmla_8x12(const CPUInfo *) + cls_a64_interleaved_s8s32_mmla_8x12(const CPUInfo *ci) { + switch(ci->get_cpu_model()) { + default: + break; + case CPUModel::A510: + kernel=a64_interleaved_s8s32_mmla_8x12_a510; + break; + } } + }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp new file mode 100644 index 000000000..a4d8c0ace --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifdef __aarch64__ + +#include <cstddef> +#include <cstdint> + +namespace arm_gemm { + +void a64_interleaved_s8s32_mmla_8x12_a510( + const int8_t *Apanel, const int8_t *Bpanel, + int32_t *Cpanel, int ablocks, int bblocks, int K) { + + struct KernelArgs { + size_t bblocks = {}; + size_t K = {}; + const int8_t *Bpanel = {}; + } ka; + + ka.bblocks = bblocks; + ka.K = (K/8) - 1; + ka.Bpanel = Bpanel; + + __asm__ __volatile__( + + "1:" // Height loop + "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n" + "mov x21, %x[Apanel]\n" + "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" + "2:" // Width loop + "mov %x[Apanel], x21\n" + "ldr x19, [%x[args_ptr], %[offsetof_K]]\n" + "ld1 { v0.16b }, [%x[Apanel]], #0x10\n" + "ld1 { v4.16b }, [x20], #0x10\n" + "ld1 { v1.16b }, [%x[Apanel]], #0x10\n" + "cmp x19, #0x2\n" + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "ld1 { v5.16b }, [x20], #0x10\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "ld1 { v2.16b }, [%x[Apanel]], #0x10\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "blt 4f\n" + "3:" // main loop head + "ld1 { v3.16b }, [%x[Apanel]], #0x10\n" + ".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n" + ".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n" + "ldp q6, q7, [x20], #0x20\n" + ".inst 0x4e85a40b // smmla v11.4s, v0.16b, v5.16b\n" + ".inst 0x4e85a431 // smmla v17.4s, v1.16b, v5.16b\n" + ".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n" + "sub x19, x19, #0x2\n" + ".inst 0x4e85a457 // smmla v23.4s, v2.16b, v5.16b\n" + ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n" + "cmp x19, #0x2\n" + ".inst 0x4e85a47d // smmla v29.4s, v3.16b, v5.16b\n" + "ldp q4, q5, [x20], #0x20\n" + ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" + ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" + ".inst 0x4e84a40a // smmla v10.4s, v0.16b, v4.16b\n" + ".inst 0x4e85a40d // smmla v13.4s, v0.16b, v5.16b\n" + "ld1 { v0.16b }, [%x[Apanel]], #0x10\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a47b // smmla v27.4s, v3.16b, v6.16b\n" + ".inst 0x4e87a432 // smmla v18.4s, v1.16b, v7.16b\n" + "ld1 { v6.16b }, [x20], #0x10\n" + ".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n" + ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n" + "ld1 { v1.16b }, [%x[Apanel]], #0x10\n" + ".inst 0x4e87a458 // smmla v24.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a47e // smmla v30.4s, v3.16b, v7.16b\n" + "ld1 { v7.16b }, [x20], #0x10\n" + ".inst 0x4e84a456 // smmla v22.4s, v2.16b, v4.16b\n" + ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n" + "ld1 { v2.16b }, [%x[Apanel]], #0x10\n" + ".inst 0x4e84a47c // smmla v28.4s, v3.16b, v4.16b\n" + ".inst 0x4e85a47f // smmla v31.4s, v3.16b, v5.16b\n" + "ld1 { v3.16b }, [%x[Apanel]], #0x10\n" + "ldp q4, q5, [x20], #0x20\n" + ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a431 // smmla v17.4s, v1.16b, v7.16b\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n" + ".inst 0x4e86a47a // smmla v26.4s, v3.16b, v6.16b\n" + ".inst 0x4e87a47d // smmla v29.4s, v3.16b, v7.16b\n" + "ldp q6, q7, [x20], #0x20\n" + ".inst 0x4e84a409 // smmla v9.4s, v0.16b, v4.16b\n" + ".inst 0x4e85a40c // smmla v12.4s, v0.16b, v5.16b\n" + ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" + ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" + "ld1 { v0.16b }, [%x[Apanel]], #0x10\n" + ".inst 0x4e84a42f // smmla v15.4s, v1.16b, v4.16b\n" + ".inst 0x4e84a455 // smmla v21.4s, v2.16b, v4.16b\n" + ".inst 0x4e84a47b // smmla v27.4s, v3.16b, v4.16b\n" + ".inst 0x4e85a432 // smmla v18.4s, v1.16b, v5.16b\n" + "ld1 { v4.16b }, [x20], #0x10\n" + ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n" + ".inst 0x4e87a433 // smmla v19.4s, v1.16b, v7.16b\n" + "ld1 { v1.16b }, [%x[Apanel]], #0x10\n" + ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n" + ".inst 0x4e85a47e // smmla v30.4s, v3.16b, v5.16b\n" + "ld1 { v5.16b }, [x20], #0x10\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n" + "ld1 { v2.16b }, [%x[Apanel]], #0x10\n" + ".inst 0x4e86a47c // smmla v28.4s, v3.16b, v6.16b\n" + ".inst 0x4e87a47f // smmla v31.4s, v3.16b, v7.16b\n" + "bge 3b\n" + "4:" // main loop skip + "ld1 { v3.16b }, [%x[Apanel]], #0x10\n" + ".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n" + ".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n" + "ldp q6, q7, [x20], #0x20\n" + ".inst 0x4e85a40b // smmla v11.4s, v0.16b, v5.16b\n" + ".inst 0x4e85a431 // smmla v17.4s, v1.16b, v5.16b\n" + ".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n" + ".inst 0x4e85a457 // smmla v23.4s, v2.16b, v5.16b\n" + ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n" + ".inst 0x4e85a47d // smmla v29.4s, v3.16b, v5.16b\n" + "ldp q4, q5, [x20], #0x20\n" + ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a47b // smmla v27.4s, v3.16b, v6.16b\n" + ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" + ".inst 0x4e84a40a // smmla v10.4s, v0.16b, v4.16b\n" + ".inst 0x4e85a40d // smmla v13.4s, v0.16b, v5.16b\n" + ".inst 0x4e87a432 // smmla v18.4s, v1.16b, v7.16b\n" + ".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n" + ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n" + ".inst 0x4e87a458 // smmla v24.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a47e // smmla v30.4s, v3.16b, v7.16b\n" + ".inst 0x4e84a456 // smmla v22.4s, v2.16b, v4.16b\n" + ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n" + ".inst 0x4e84a47c // smmla v28.4s, v3.16b, v4.16b\n" + ".inst 0x4e85a47f // smmla v31.4s, v3.16b, v5.16b\n" + "cbz x19, 5f\n" + "ld1 { v0.16b }, [%x[Apanel]], #0x10\n" + "ld1 { v6.16b }, [x20], #0x10\n" + ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" + "ld1 { v1.16b }, [%x[Apanel]], #0x10\n" + "ld1 { v7.16b }, [x20], #0x10\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + "ld1 { v2.16b }, [%x[Apanel]], #0x10\n" + "ld1 { v3.16b }, [%x[Apanel]], #0x10\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + "ldp q4, q5, [x20], #0x20\n" + ".inst 0x4e87a431 // smmla v17.4s, v1.16b, v7.16b\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n" + ".inst 0x4e86a47a // smmla v26.4s, v3.16b, v6.16b\n" + ".inst 0x4e87a47d // smmla v29.4s, v3.16b, v7.16b\n" + "ldp q6, q7, [x20], #0x20\n" + ".inst 0x4e84a409 // smmla v9.4s, v0.16b, v4.16b\n" + ".inst 0x4e84a42f // smmla v15.4s, v1.16b, v4.16b\n" + ".inst 0x4e84a455 // smmla v21.4s, v2.16b, v4.16b\n" + ".inst 0x4e84a47b // smmla v27.4s, v3.16b, v4.16b\n" + ".inst 0x4e85a40c // smmla v12.4s, v0.16b, v5.16b\n" + ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" + ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" + ".inst 0x4e85a432 // smmla v18.4s, v1.16b, v5.16b\n" + ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n" + ".inst 0x4e87a433 // smmla v19.4s, v1.16b, v7.16b\n" + ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n" + ".inst 0x4e85a47e // smmla v30.4s, v3.16b, v5.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n" + ".inst 0x4e86a47c // smmla v28.4s, v3.16b, v6.16b\n" + ".inst 0x4e87a47f // smmla v31.4s, v3.16b, v7.16b\n" + "5:" // multiply loop done + "subs x22, x22, #0x1\n" + "uzp1 v4.2d, v8.2d, v11.2d\n" + "uzp2 v8.2d, v8.2d, v11.2d\n" + "uzp1 v11.2d, v9.2d, v12.2d\n" + "uzp2 v9.2d, v9.2d, v12.2d\n" + "str q4, [%x[Cpanel], #0x0]\n" + "uzp1 v12.2d, v10.2d, v13.2d\n" + "uzp2 v10.2d, v10.2d, v13.2d\n" + "str q11, [%x[Cpanel], #0x10]\n" + "str q12, [%x[Cpanel], #0x20]\n" + "uzp1 v13.2d, v14.2d, v17.2d\n" + "uzp2 v14.2d, v14.2d, v17.2d\n" + "str q8, [%x[Cpanel], #0x30]\n" + "uzp1 v17.2d, v15.2d, v18.2d\n" + "uzp2 v15.2d, v15.2d, v18.2d\n" + "str q9, [%x[Cpanel], #0x40]\n" + "uzp1 v18.2d, v16.2d, v19.2d\n" + "uzp2 v16.2d, v16.2d, v19.2d\n" + "str q10, [%x[Cpanel], #0x50]\n" + "uzp1 v19.2d, v20.2d, v23.2d\n" + "uzp2 v20.2d, v20.2d, v23.2d\n" + "str q13, [%x[Cpanel], #0x60]\n" + "uzp1 v23.2d, v21.2d, v24.2d\n" + "uzp2 v21.2d, v21.2d, v24.2d\n" + "str q17, [%x[Cpanel], #0x70]\n" + "uzp1 v24.2d, v22.2d, v25.2d\n" + "uzp2 v22.2d, v22.2d, v25.2d\n" + "str q18, [%x[Cpanel], #0x80]\n" + "uzp1 v25.2d, v26.2d, v29.2d\n" + "uzp2 v26.2d, v26.2d, v29.2d\n" + "str q14, [%x[Cpanel], #0x90]\n" + "uzp1 v29.2d, v27.2d, v30.2d\n" + "uzp2 v27.2d, v27.2d, v30.2d\n" + "str q15, [%x[Cpanel], #0xa0]\n" + "uzp1 v30.2d, v28.2d, v31.2d\n" + "uzp2 v28.2d, v28.2d, v31.2d\n" + "str q16, [%x[Cpanel], #0xb0]\n" + "str q19, [%x[Cpanel], #0xc0]\n" + "str q23, [%x[Cpanel], #0xd0]\n" + "str q24, [%x[Cpanel], #0xe0]\n" + "str q20, [%x[Cpanel], #0xf0]\n" + "str q21, [%x[Cpanel], #0x100]\n" + "str q22, [%x[Cpanel], #0x110]\n" + "str q25, [%x[Cpanel], #0x120]\n" + "str q29, [%x[Cpanel], #0x130]\n" + "str q30, [%x[Cpanel], #0x140]\n" + "str q26, [%x[Cpanel], #0x150]\n" + "str q27, [%x[Cpanel], #0x160]\n" + "str q28, [%x[Cpanel], #0x170]\n" + "add %x[Cpanel], %x[Cpanel], #0x180\n" + "bgt 2b\n" + "subs %x[ablocks], %x[ablocks], #0x1\n" + "bne 1b\n" + : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) + : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp index f492a474a..fa93c1d90 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp @@ -35,6 +35,7 @@ namespace arm_gemm { // Actual kernel implementations void a64_interleaved_u8u32_mmla_8x12( ARGLIST ); +void a64_interleaved_u8u32_mmla_8x12_a510( ARGLIST ); class cls_a64_interleaved_u8u32_mmla_8x12 { @@ -91,7 +92,7 @@ public: case CPUModel::A510: return { 47.66, 2.47, 0.29 }; case CPUModel::V1: - return { 111.60, 4.95, 0.66 }; + return { 75.54, 8.06, 0.63 }; } } @@ -100,8 +101,15 @@ public: // Default to the generic kernel kern_type kernel=a64_interleaved_u8u32_mmla_8x12; - cls_a64_interleaved_u8u32_mmla_8x12(const CPUInfo *) + cls_a64_interleaved_u8u32_mmla_8x12(const CPUInfo *ci) { + switch(ci->get_cpu_model()) { + default: + break; + case CPUModel::A510: + kernel=a64_interleaved_u8u32_mmla_8x12_a510; + break; + } } }; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp new file mode 100644 index 000000000..3fe1a9bd0 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifdef __aarch64__ + +#include <cstddef> +#include <cstdint> + +namespace arm_gemm { + +void a64_interleaved_u8u32_mmla_8x12_a510( + const uint8_t *Apanel, const uint8_t *Bpanel, + uint32_t *Cpanel, int ablocks, int bblocks, int K) { + + struct KernelArgs { + size_t bblocks = {}; + size_t K = {}; + const uint8_t *Bpanel = {}; + } ka; + + ka.bblocks = bblocks; + ka.K = (K/8) - 1; + ka.Bpanel = Bpanel; + + __asm__ __volatile__( + + "1:" // Height loop + "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n" + "mov x21, %x[Apanel]\n" + "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" + "2:" // Width loop + "mov %x[Apanel], x21\n" + "ldr x19, [%x[args_ptr], %[offsetof_K]]\n" + "ld1 { v0.16b }, [%x[Apanel]], #0x10\n" + "ld1 { v4.16b }, [x20], #0x10\n" + "ld1 { v1.16b }, [%x[Apanel]], #0x10\n" + "cmp x19, #0x2\n" + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "ld1 { v5.16b }, [x20], #0x10\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "ld1 { v2.16b }, [%x[Apanel]], #0x10\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "blt 4f\n" + "3:" // main loop head + "ld1 { v3.16b }, [%x[Apanel]], #0x10\n" + ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n" + ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n" + "ldp q6, q7, [x20], #0x20\n" + ".inst 0x6e85a40b // ummla v11.4s, v0.16b, v5.16b\n" + ".inst 0x6e85a431 // ummla v17.4s, v1.16b, v5.16b\n" + ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n" + "sub x19, x19, #0x2\n" + ".inst 0x6e85a457 // ummla v23.4s, v2.16b, v5.16b\n" + ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n" + "cmp x19, #0x2\n" + ".inst 0x6e85a47d // ummla v29.4s, v3.16b, v5.16b\n" + "ldp q4, q5, [x20], #0x20\n" + ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n" + ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n" + ".inst 0x6e84a40a // ummla v10.4s, v0.16b, v4.16b\n" + ".inst 0x6e85a40d // ummla v13.4s, v0.16b, v5.16b\n" + "ld1 { v0.16b }, [%x[Apanel]], #0x10\n" + ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n" + ".inst 0x6e86a47b // ummla v27.4s, v3.16b, v6.16b\n" + ".inst 0x6e87a432 // ummla v18.4s, v1.16b, v7.16b\n" + "ld1 { v6.16b }, [x20], #0x10\n" + ".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n" + ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n" + "ld1 { v1.16b }, [%x[Apanel]], #0x10\n" + ".inst 0x6e87a458 // ummla v24.4s, v2.16b, v7.16b\n" + ".inst 0x6e87a47e // ummla v30.4s, v3.16b, v7.16b\n" + "ld1 { v7.16b }, [x20], #0x10\n" + ".inst 0x6e84a456 // ummla v22.4s, v2.16b, v4.16b\n" + ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n" + "ld1 { v2.16b }, [%x[Apanel]], #0x10\n" + ".inst 0x6e84a47c // ummla v28.4s, v3.16b, v4.16b\n" + ".inst 0x6e85a47f // ummla v31.4s, v3.16b, v5.16b\n" + "ld1 { v3.16b }, [%x[Apanel]], #0x10\n" + "ldp q4, q5, [x20], #0x20\n" + ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n" + ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a431 // ummla v17.4s, v1.16b, v7.16b\n" + ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n" + ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n" + ".inst 0x6e86a47a // ummla v26.4s, v3.16b, v6.16b\n" + ".inst 0x6e87a47d // ummla v29.4s, v3.16b, v7.16b\n" + "ldp q6, q7, [x20], #0x20\n" + ".inst 0x6e84a409 // ummla v9.4s, v0.16b, v4.16b\n" + ".inst 0x6e85a40c // ummla v12.4s, v0.16b, v5.16b\n" + ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n" + ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n" + "ld1 { v0.16b }, [%x[Apanel]], #0x10\n" + ".inst 0x6e84a42f // ummla v15.4s, v1.16b, v4.16b\n" + ".inst 0x6e84a455 // ummla v21.4s, v2.16b, v4.16b\n" + ".inst 0x6e84a47b // ummla v27.4s, v3.16b, v4.16b\n" + ".inst 0x6e85a432 // ummla v18.4s, v1.16b, v5.16b\n" + "ld1 { v4.16b }, [x20], #0x10\n" + ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n" + ".inst 0x6e87a433 // ummla v19.4s, v1.16b, v7.16b\n" + "ld1 { v1.16b }, [%x[Apanel]], #0x10\n" + ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n" + ".inst 0x6e85a47e // ummla v30.4s, v3.16b, v5.16b\n" + "ld1 { v5.16b }, [x20], #0x10\n" + ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n" + ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n" + "ld1 { v2.16b }, [%x[Apanel]], #0x10\n" + ".inst 0x6e86a47c // ummla v28.4s, v3.16b, v6.16b\n" + ".inst 0x6e87a47f // ummla v31.4s, v3.16b, v7.16b\n" + "bge 3b\n" + "4:" // main loop skip + "ld1 { v3.16b }, [%x[Apanel]], #0x10\n" + ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n" + ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n" + "ldp q6, q7, [x20], #0x20\n" + ".inst 0x6e85a40b // ummla v11.4s, v0.16b, v5.16b\n" + ".inst 0x6e85a431 // ummla v17.4s, v1.16b, v5.16b\n" + ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n" + ".inst 0x6e85a457 // ummla v23.4s, v2.16b, v5.16b\n" + ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n" + ".inst 0x6e85a47d // ummla v29.4s, v3.16b, v5.16b\n" + "ldp q4, q5, [x20], #0x20\n" + ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n" + ".inst 0x6e86a47b // ummla v27.4s, v3.16b, v6.16b\n" + ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n" + ".inst 0x6e84a40a // ummla v10.4s, v0.16b, v4.16b\n" + ".inst 0x6e85a40d // ummla v13.4s, v0.16b, v5.16b\n" + ".inst 0x6e87a432 // ummla v18.4s, v1.16b, v7.16b\n" + ".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n" + ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n" + ".inst 0x6e87a458 // ummla v24.4s, v2.16b, v7.16b\n" + ".inst 0x6e87a47e // ummla v30.4s, v3.16b, v7.16b\n" + ".inst 0x6e84a456 // ummla v22.4s, v2.16b, v4.16b\n" + ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n" + ".inst 0x6e84a47c // ummla v28.4s, v3.16b, v4.16b\n" + ".inst 0x6e85a47f // ummla v31.4s, v3.16b, v5.16b\n" + "cbz x19, 5f\n" + "ld1 { v0.16b }, [%x[Apanel]], #0x10\n" + "ld1 { v6.16b }, [x20], #0x10\n" + ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n" + "ld1 { v1.16b }, [%x[Apanel]], #0x10\n" + "ld1 { v7.16b }, [x20], #0x10\n" + ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n" + "ld1 { v2.16b }, [%x[Apanel]], #0x10\n" + "ld1 { v3.16b }, [%x[Apanel]], #0x10\n" + ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" + "ldp q4, q5, [x20], #0x20\n" + ".inst 0x6e87a431 // ummla v17.4s, v1.16b, v7.16b\n" + ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n" + ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n" + ".inst 0x6e86a47a // ummla v26.4s, v3.16b, v6.16b\n" + ".inst 0x6e87a47d // ummla v29.4s, v3.16b, v7.16b\n" + "ldp q6, q7, [x20], #0x20\n" + ".inst 0x6e84a409 // ummla v9.4s, v0.16b, v4.16b\n" + ".inst 0x6e84a42f // ummla v15.4s, v1.16b, v4.16b\n" + ".inst 0x6e84a455 // ummla v21.4s, v2.16b, v4.16b\n" + ".inst 0x6e84a47b // ummla v27.4s, v3.16b, v4.16b\n" + ".inst 0x6e85a40c // ummla v12.4s, v0.16b, v5.16b\n" + ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n" + ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n" + ".inst 0x6e85a432 // ummla v18.4s, v1.16b, v5.16b\n" + ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n" + ".inst 0x6e87a433 // ummla v19.4s, v1.16b, v7.16b\n" + ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n" + ".inst 0x6e85a47e // ummla v30.4s, v3.16b, v5.16b\n" + ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n" + ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n" + ".inst 0x6e86a47c // ummla v28.4s, v3.16b, v6.16b\n" + ".inst 0x6e87a47f // ummla v31.4s, v3.16b, v7.16b\n" + "5:" // multiply loop done + "subs x22, x22, #0x1\n" + "uzp1 v4.2d, v8.2d, v11.2d\n" + "uzp2 v8.2d, v8.2d, v11.2d\n" + "uzp1 v11.2d, v9.2d, v12.2d\n" + "uzp2 v9.2d, v9.2d, v12.2d\n" + "str q4, [%x[Cpanel], #0x0]\n" + "uzp1 v12.2d, v10.2d, v13.2d\n" + "uzp2 v10.2d, v10.2d, v13.2d\n" + "str q11, [%x[Cpanel], #0x10]\n" + "str q12, [%x[Cpanel], #0x20]\n" + "uzp1 v13.2d, v14.2d, v17.2d\n" + "uzp2 v14.2d, v14.2d, v17.2d\n" + "str q8, [%x[Cpanel], #0x30]\n" + "uzp1 v17.2d, v15.2d, v18.2d\n" + "uzp2 v15.2d, v15.2d, v18.2d\n" + "str q9, [%x[Cpanel], #0x40]\n" + "uzp1 v18.2d, v16.2d, v19.2d\n" + "uzp2 v16.2d, v16.2d, v19.2d\n" + "str q10, [%x[Cpanel], #0x50]\n" + "uzp1 v19.2d, v20.2d, v23.2d\n" + "uzp2 v20.2d, v20.2d, v23.2d\n" + "str q13, [%x[Cpanel], #0x60]\n" + "uzp1 v23.2d, v21.2d, v24.2d\n" + "uzp2 v21.2d, v21.2d, v24.2d\n" + "str q17, [%x[Cpanel], #0x70]\n" + "uzp1 v24.2d, v22.2d, v25.2d\n" + "uzp2 v22.2d, v22.2d, v25.2d\n" + "str q18, [%x[Cpanel], #0x80]\n" + "uzp1 v25.2d, v26.2d, v29.2d\n" + "uzp2 v26.2d, v26.2d, v29.2d\n" + "str q14, [%x[Cpanel], #0x90]\n" + "uzp1 v29.2d, v27.2d, v30.2d\n" + "uzp2 v27.2d, v27.2d, v30.2d\n" + "str q15, [%x[Cpanel], #0xa0]\n" + "uzp1 v30.2d, v28.2d, v31.2d\n" + "uzp2 v28.2d, v28.2d, v31.2d\n" + "str q16, [%x[Cpanel], #0xb0]\n" + "str q19, [%x[Cpanel], #0xc0]\n" + "str q23, [%x[Cpanel], #0xd0]\n" + "str q24, [%x[Cpanel], #0xe0]\n" + "str q20, [%x[Cpanel], #0xf0]\n" + "str q21, [%x[Cpanel], #0x100]\n" + "str q22, [%x[Cpanel], #0x110]\n" + "str q25, [%x[Cpanel], #0x120]\n" + "str q29, [%x[Cpanel], #0x130]\n" + "str q30, [%x[Cpanel], #0x140]\n" + "str q26, [%x[Cpanel], #0x150]\n" + "str q27, [%x[Cpanel], #0x160]\n" + "str q28, [%x[Cpanel], #0x170]\n" + "add %x[Cpanel], %x[Cpanel], #0x180\n" + "bgt 2b\n" + "subs %x[ablocks], %x[ablocks], #0x1\n" + "bne 1b\n" + : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) + : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp index 30e265fbc..ab175a375 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp @@ -83,7 +83,7 @@ public: case CPUModel::A510: return { 5.42 }; case CPUModel::V1: - return { 34.56 }; + return { 20.83 }; } } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp index 61c7ad17e..b7c9aca9d 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp @@ -83,7 +83,7 @@ public: case CPUModel::A510: return { 5.31 }; case CPUModel::V1: - return { 28.93 }; + return { 17.32 }; } } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp index b8ca7c545..28057aa96 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp @@ -94,7 +94,7 @@ public: case CPUModel::A510: return { 22.77, 3.90, 0.47 }; case CPUModel::V1: - return { 62.97, 19.14, 0.92 }; + return { 48.09, 16.24, 0.83 }; } } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp index b88ef14f2..c08977570 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp @@ -94,7 +94,7 @@ public: case CPUModel::A510: return { 23.87, 3.89, 0.37 }; case CPUModel::V1: - return { 107.63, 19.24, 0.92 }; + return { 75.14, 15.87, 0.83 }; } } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp index d870711c6..901cc6d63 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp @@ -94,7 +94,7 @@ public: case CPUModel::A510: return { 22.75, 3.90, 0.47 }; case CPUModel::V1: - return { 62.97, 19.27, 0.92 }; + return { 48.09, 16.24, 0.83 }; } } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp index 7f8eadc52..c0d089278 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp @@ -94,7 +94,7 @@ public: case CPUModel::A510: return { 26.80, 3.89, 0.47 }; case CPUModel::V1: - return { 108.33, 18.66, 0.92 }; + return { 75.14, 15.87, 0.83 }; } } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp index fa44bdbd3..fc91dd71a 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp @@ -80,7 +80,7 @@ public: case CPUModel::A510: return { 7.78, 4.01, 2.43 }; case CPUModel::V1: - return { 62.50, 5.09, 11.32 }; + return { 47.63, 5.11, 6.80 }; } } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp index 1924b2cf7..0d707b039 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp @@ -89,7 +89,7 @@ public: default: return { 31.67, 3.57, 0.50 }; case CPUModel::V1: - return { 63.35, 4.76, 0.77 }; + return { 52.24, 7.49, 0.80 }; case CPUModel::A510: return { 27.47, 1.70, 0.28 }; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp index bd1764bb7..4e65296f8 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp @@ -89,7 +89,7 @@ public: default: return { 61.97, 3.64, 0.50 }; case CPUModel::V1: - return { 123.84, 4.93, 0.76 }; + return { 95.28, 7.99, 0.79 }; case CPUModel::A510: return { 43.36, 1.86, 0.28 }; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp index f66a9bf51..0afcdd2ce 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp @@ -91,7 +91,7 @@ public: case CPUModel::A510: return { 27.45, 1.65, 0.28 }; case CPUModel::V1: - return { 63.35, 4.96, 0.77 }; + return { 52.24, 7.49, 0.80 }; } } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp index b530202bd..58d21d6c4 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp @@ -91,7 +91,7 @@ public: case CPUModel::A510: return { 38.02, 1.85, 0.28 }; case CPUModel::V1: - return { 123.84, 4.98, 0.76 }; + return { 95.28, 7.99, 0.79 }; } } diff --git a/src/core/NEON/kernels/arm_gemm/transform.cpp b/src/core/NEON/kernels/arm_gemm/transform.cpp index 60376ab80..c6a3bc0ed 100644 --- a/src/core/NEON/kernels/arm_gemm/transform.cpp +++ b/src/core/NEON/kernels/arm_gemm/transform.cpp @@ -25,7 +25,9 @@ #include "bfloat.hpp" +#if !defined(__OpenBSD__) #include <alloca.h> +#endif /* !defined(__OpenBSD__) */ namespace arm_gemm { diff --git a/src/core/NEON/kernels/assembly/depthwise.hpp b/src/core/NEON/kernels/assembly/depthwise.hpp index eadf48d00..9262ea05a 100644 --- a/src/core/NEON/kernels/assembly/depthwise.hpp +++ b/src/core/NEON/kernels/assembly/depthwise.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -78,10 +78,20 @@ struct DepthwiseArgs template <typename TInput, typename TWeight, typename TOutput> class DepthwiseCommon : public IDepthwiseCommon { +private: + std::string _name{}; + protected: const DepthwiseArgs m_args; // Copy of arguments - public: + std::string name() const + { + return _name; + } + void set_name(const std::string &n) + { + _name = n; + } DepthwiseCommon(const DepthwiseArgs &args) : m_args(args) {}; DepthwiseCommon(DepthwiseCommon &) = delete; |