aboutsummaryrefslogtreecommitdiff
path: root/vpx_dsp/arm/sum_neon.h
diff options
context:
space:
mode:
Diffstat (limited to 'vpx_dsp/arm/sum_neon.h')
-rw-r--r--vpx_dsp/arm/sum_neon.h51
1 files changed, 51 insertions, 0 deletions
diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h
index 75c170df6..11821dc10 100644
--- a/vpx_dsp/arm/sum_neon.h
+++ b/vpx_dsp/arm/sum_neon.h
@@ -221,4 +221,55 @@ static INLINE uint64_t horizontal_add_uint64x2(const uint64x2_t a) {
#endif
}
+static INLINE uint64_t horizontal_long_add_uint32x4_x2(const uint32x4_t a[2]) {
+ return horizontal_long_add_uint32x4(a[0]) +
+ horizontal_long_add_uint32x4(a[1]);
+}
+
+static INLINE uint64_t horizontal_long_add_uint32x4_x4(const uint32x4_t a[4]) {
+ uint64x2_t sum = vpaddlq_u32(a[0]);
+ sum = vpadalq_u32(sum, a[1]);
+ sum = vpadalq_u32(sum, a[2]);
+ sum = vpadalq_u32(sum, a[3]);
+
+ return horizontal_add_uint64x2(sum);
+}
+
+static INLINE uint64_t horizontal_long_add_uint32x4_x8(const uint32x4_t a[8]) {
+ uint64x2_t sum[2];
+ sum[0] = vpaddlq_u32(a[0]);
+ sum[1] = vpaddlq_u32(a[1]);
+ sum[0] = vpadalq_u32(sum[0], a[2]);
+ sum[1] = vpadalq_u32(sum[1], a[3]);
+ sum[0] = vpadalq_u32(sum[0], a[4]);
+ sum[1] = vpadalq_u32(sum[1], a[5]);
+ sum[0] = vpadalq_u32(sum[0], a[6]);
+ sum[1] = vpadalq_u32(sum[1], a[7]);
+
+ return horizontal_add_uint64x2(vaddq_u64(sum[0], sum[1]));
+}
+
+static INLINE uint64_t
+horizontal_long_add_uint32x4_x16(const uint32x4_t a[16]) {
+ uint64x2_t sum[2];
+ sum[0] = vpaddlq_u32(a[0]);
+ sum[1] = vpaddlq_u32(a[1]);
+ sum[0] = vpadalq_u32(sum[0], a[2]);
+ sum[1] = vpadalq_u32(sum[1], a[3]);
+ sum[0] = vpadalq_u32(sum[0], a[4]);
+ sum[1] = vpadalq_u32(sum[1], a[5]);
+ sum[0] = vpadalq_u32(sum[0], a[6]);
+ sum[1] = vpadalq_u32(sum[1], a[7]);
+ sum[0] = vpadalq_u32(sum[0], a[8]);
+ sum[1] = vpadalq_u32(sum[1], a[9]);
+ sum[0] = vpadalq_u32(sum[0], a[10]);
+ sum[1] = vpadalq_u32(sum[1], a[11]);
+ sum[0] = vpadalq_u32(sum[0], a[12]);
+ sum[1] = vpadalq_u32(sum[1], a[13]);
+ sum[0] = vpadalq_u32(sum[0], a[14]);
+ sum[1] = vpadalq_u32(sum[1], a[15]);
+
+ return horizontal_add_uint64x2(vaddq_u64(sum[0], sum[1]));
+}
+
#endif // VPX_VPX_DSP_ARM_SUM_NEON_H_