about | summary | refs | log | tree | commit | diff
path: root/libvpx/vpx_dsp/arm/transpose_neon.h
diff options
context:
space:
mode:
Diffstat (limited to 'libvpx/vpx_dsp/arm/transpose_neon.h')
-rw-r--r-- libvpx/vpx_dsp/arm/transpose_neon.h | 109
1 files changed, 109 insertions, 0 deletions
diff --git a/libvpx/vpx_dsp/arm/transpose_neon.h b/libvpx/vpx_dsp/arm/transpose_neon.h
index 8366ce50b..d85cbcee4 100644
--- a/libvpx/vpx_dsp/arm/transpose_neon.h
+++ b/libvpx/vpx_dsp/arm/transpose_neon.h
@@ -710,6 +710,83 @@ static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1,
*a7 = d3.val[1];
}
+static INLINE void transpose_s32_8x8(int32x4x2_t *a0, int32x4x2_t *a1,
+                                     int32x4x2_t *a2, int32x4x2_t *a3,
+                                     int32x4x2_t *a4, int32x4x2_t *a5,
+                                     int32x4x2_t *a6, int32x4x2_t *a7) {
+  // Transposes an 8x8 block of 32 bit values in place. Each aN is one row,
+  // split into val[0] (columns 0-3) and val[1] (columns 4-7). In the diagrams
+  // below every element is labelled <row><column> of the input, e.g.
+  //   a0: 00 01 02 03 | 04 05 06 07
+  //   ...
+  //   a7: 70 71 72 73 | 74 75 76 77
+
+  // Step 1: swap 32 bit elements within each pair of adjacent rows, for the
+  // low (val[0]) and high (val[1]) column halves separately:
+  //   lo01: 00 10 02 12 | 01 11 03 13
+  //   lo23: 20 30 22 32 | 21 31 23 33
+  //   lo45: 40 50 42 52 | 41 51 43 53
+  //   lo67: 60 70 62 72 | 61 71 63 73
+  //   hi01: 04 14 06 16 | 05 15 07 17
+  //   hi23: 24 34 26 36 | 25 35 27 37
+  //   hi45: 44 54 46 56 | 45 55 47 57
+  //   hi67: 64 74 66 76 | 65 75 67 77
+  const int32x4x2_t lo01 = vtrnq_s32(a0->val[0], a1->val[0]);
+  const int32x4x2_t lo23 = vtrnq_s32(a2->val[0], a3->val[0]);
+  const int32x4x2_t lo45 = vtrnq_s32(a4->val[0], a5->val[0]);
+  const int32x4x2_t lo67 = vtrnq_s32(a6->val[0], a7->val[0]);
+  const int32x4x2_t hi01 = vtrnq_s32(a0->val[1], a1->val[1]);
+  const int32x4x2_t hi23 = vtrnq_s32(a2->val[1], a3->val[1]);
+  const int32x4x2_t hi45 = vtrnq_s32(a4->val[1], a5->val[1]);
+  const int32x4x2_t hi67 = vtrnq_s32(a6->val[1], a7->val[1]);
+
+  // Step 2: swap 64 bit elements between neighbouring pairs. "top" vectors
+  // come from input rows 0-3 and "bot" vectors from input rows 4-7:
+  //   top_c02: 00 10 20 30 | 02 12 22 32
+  //   top_c13: 01 11 21 31 | 03 13 23 33
+  //   bot_c02: 40 50 60 70 | 42 52 62 72
+  //   bot_c13: 41 51 61 71 | 43 53 63 73
+  //   top_c46: 04 14 24 34 | 06 16 26 36
+  //   top_c57: 05 15 25 35 | 07 17 27 37
+  //   bot_c46: 44 54 64 74 | 46 56 66 76
+  //   bot_c57: 45 55 65 75 | 47 57 67 77
+  const int32x4x2_t top_c02 = vpx_vtrnq_s64_to_s32(lo01.val[0], lo23.val[0]);
+  const int32x4x2_t top_c13 = vpx_vtrnq_s64_to_s32(lo01.val[1], lo23.val[1]);
+  const int32x4x2_t bot_c02 = vpx_vtrnq_s64_to_s32(lo45.val[0], lo67.val[0]);
+  const int32x4x2_t bot_c13 = vpx_vtrnq_s64_to_s32(lo45.val[1], lo67.val[1]);
+  const int32x4x2_t top_c46 = vpx_vtrnq_s64_to_s32(hi01.val[0], hi23.val[0]);
+  const int32x4x2_t top_c57 = vpx_vtrnq_s64_to_s32(hi01.val[1], hi23.val[1]);
+  const int32x4x2_t bot_c46 = vpx_vtrnq_s64_to_s32(hi45.val[0], hi67.val[0]);
+  const int32x4x2_t bot_c57 = vpx_vtrnq_s64_to_s32(hi45.val[1], hi67.val[1]);
+
+  // Step 3: swap 128 bit elements, i.e. join the matching top and bot
+  // vectors, to form the transposed rows:
+  //   a0: 00 10 20 30 | 40 50 60 70
+  //   a1: 01 11 21 31 | 41 51 61 71
+  //   a2: 02 12 22 32 | 42 52 62 72
+  //   a3: 03 13 23 33 | 43 53 63 73
+  //   a4: 04 14 24 34 | 44 54 64 74
+  //   a5: 05 15 25 35 | 45 55 65 75
+  //   a6: 06 16 26 36 | 46 56 66 76
+  //   a7: 07 17 27 37 | 47 57 67 77
+  a0->val[0] = top_c02.val[0];
+  a0->val[1] = bot_c02.val[0];
+  a1->val[0] = top_c13.val[0];
+  a1->val[1] = bot_c13.val[0];
+  a2->val[0] = top_c02.val[1];
+  a2->val[1] = bot_c02.val[1];
+  a3->val[0] = top_c13.val[1];
+  a3->val[1] = bot_c13.val[1];
+  a4->val[0] = top_c46.val[0];
+  a4->val[1] = bot_c46.val[0];
+  a5->val[0] = top_c57.val[0];
+  a5->val[1] = bot_c57.val[0];
+  a6->val[0] = top_c46.val[1];
+  a6->val[1] = bot_c46.val[1];
+  a7->val[0] = top_c57.val[1];
+  a7->val[1] = bot_c57.val[1];
+}
+
static INLINE void transpose_u8_16x8(
const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,
@@ -1204,4 +1281,36 @@ static INLINE void load_and_transpose_s16_8x8(const int16_t *a,
transpose_s16_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
}
+
+static INLINE void load_and_transpose_s32_8x8(
+    const int32_t *a, const int a_stride, int32x4x2_t *const a0,
+    int32x4x2_t *const a1, int32x4x2_t *const a2, int32x4x2_t *const a3,
+    int32x4x2_t *const a4, int32x4x2_t *const a5, int32x4x2_t *const a6,
+    int32x4x2_t *const a7) {
+  // Start of each of the eight rows; consecutive rows are a_stride int32_t
+  // elements apart.
+  const int32_t *const row0 = a;
+  const int32_t *const row1 = row0 + a_stride;
+  const int32_t *const row2 = row1 + a_stride;
+  const int32_t *const row3 = row2 + a_stride;
+  const int32_t *const row4 = row3 + a_stride;
+  const int32_t *const row5 = row4 + a_stride;
+  const int32_t *const row6 = row5 + a_stride;
+  const int32_t *const row7 = row6 + a_stride;
+
+  // Fill aN with row N: val[0] gets elements 0-3 and val[1] elements 4-7.
+  a0->val[0] = vld1q_s32(row0);
+  a0->val[1] = vld1q_s32(row0 + 4);
+  a1->val[0] = vld1q_s32(row1);
+  a1->val[1] = vld1q_s32(row1 + 4);
+  a2->val[0] = vld1q_s32(row2);
+  a2->val[1] = vld1q_s32(row2 + 4);
+  a3->val[0] = vld1q_s32(row3);
+  a3->val[1] = vld1q_s32(row3 + 4);
+  a4->val[0] = vld1q_s32(row4);
+  a4->val[1] = vld1q_s32(row4 + 4);
+  a5->val[0] = vld1q_s32(row5);
+  a5->val[1] = vld1q_s32(row5 + 4);
+  a6->val[0] = vld1q_s32(row6);
+  a6->val[1] = vld1q_s32(row6 + 4);
+  a7->val[0] = vld1q_s32(row7);
+  a7->val[1] = vld1q_s32(row7 + 4);
+
+  // Transpose the loaded 8x8 block in place.
+  transpose_s32_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
+}
#endif // VPX_DSP_ARM_TRANSPOSE_NEON_H_