summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Marco Nelissen <marcone@google.com> 2015-10-08 22:55:37 +0000
committer Marco Nelissen <marcone@google.com> 2015-10-08 22:55:37 +0000
commit 6f1a83d66c7093abfb3504f45273513ef17393b1 (patch)
tree 27c52b97fc3c91be948aeac8ac7b5bc63b52e4e4
parent a84bad80dee40c0c0535983cf4e4fd4ec8bf7b82 (diff)
download libmpeg2-6f1a83d66c7093abfb3504f45273513ef17393b1.tar.gz
Revert "armv8: Removed redundant NEON element size declarations"
This reverts commit a84bad80dee40c0c0535983cf4e4fd4ec8bf7b82.

Change-Id: I15a98b97908bd29fadf4cc5a8de17803adc90b61
-rw-r--r-- common/armv8/impeg2_idct.s 242
-rw-r--r-- decoder.arm.mk 1
-rw-r--r-- decoder.arm64.mk 1
3 files changed, 123 insertions, 121 deletions
diff --git a/common/armv8/impeg2_idct.s b/common/armv8/impeg2_idct.s
index 82ff0ef..4956e54 100644
--- a/common/armv8/impeg2_idct.s
+++ b/common/armv8/impeg2_idct.s
@@ -384,30 +384,30 @@ impeg2_idct_recon_av8:
ld1 {v2.4h}, [x0], #8
ld1 {v3.4h}, [x9], #8
ld1 {v4.4h}, [x0], x5
- smull v20.4s, v2.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
+ smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
ld1 {v5.4h}, [x9], x5
- smull v18.4s, v3.4h, v1.h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
+ smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
ld1 {v6.4h}, [x0], #8
ld1 {v7.4h}, [x9], #8
- smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0)
+ smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
ld1 {v8.4h}, [x0], x10
- smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
+ smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
ld1 {v9.4h}, [x9], x10
- smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2)
+ smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
ld1 {v10.4h}, [x0], #8
- smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)
+ smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
ld1 {v11.4h}, [x9], #8
- smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
ld1 {v12.4h}, [x0], x5
- smlsl v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
ld1 {v13.4h}, [x9], x5
- smlsl v28.4s, v7.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
ld1 {v14.4h}, [x0], #8
- smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
ld1 {v15.4h}, [x9], #8
- smull v22.4s, v10.4h, v0.h[0] //// y4 * cos4(part of c0 and c1)
+ smull v22.4s, v10.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1)
ld1 {v16.4h}, [x0], x10
- smull v6.4s, v3.4h, v0.h[2] //// y2 * cos2(part of d0)
+ smull v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0)
ld1 {v17.4h}, [x9], x10
///* this following was activated when alignment is not there */
@@ -431,21 +431,21 @@ impeg2_idct_recon_av8:
- smlal v24.4s, v14.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
- smlsl v26.4s, v14.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
- smlal v28.4s, v14.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
- smlal v30.4s, v14.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+ smlal v24.4s, v14.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+ smlsl v26.4s, v14.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+ smlal v28.4s, v14.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+ smlal v30.4s, v14.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
- smlsl v18.4s, v11.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
- smlal v6.4s, v11.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+ smlsl v18.4s, v11.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+ smlal v6.4s, v11.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
add v10.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
- smlal v24.4s, v15.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
- smlsl v26.4s, v15.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
- smlal v28.4s, v15.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
- smlsl v30.4s, v15.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
+ smlal v24.4s, v15.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
+ smlsl v26.4s, v15.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
+ smlal v28.4s, v15.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
+ smlsl v30.4s, v15.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
add v14.4s, v10.4s , v6.4s //// a0 = c0 + d0(part of x0,x7)
sub v10.4s, v10.4s , v6.4s //// a3 = c0 - d0(part of x3,x4)
@@ -502,20 +502,20 @@ skip_last4_rows:
- smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v7.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v18.4s, v3.4h, v1.h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
- smull v6.4s, v3.4h, v0.h[2] //// y2 * cos2(part of d0)
+ smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
+ smull v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0)
- smull v20.4s, v2.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
+ smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
add v14.4s, v20.4s , v6.4s //// a0 = c0 + d0(part of x0,x7)
@@ -554,37 +554,37 @@ last4_cols:
cmp x12, #0xf0
bge skip_last4cols
- smull v24.4s, v8.4h, v0.h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v8.4h, v0.h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v8.4h, v0.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v8.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v9.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v9.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v9.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v18.4s, v5.4h, v1.h[2] //// y2 * sin2 (q4 is freed by this time)(part of d1)
- smull v8.4s, v5.4h, v0.h[2] //// y2 * cos2(part of d0)
+ smull v18.4s, v5.4h, v1.4h[2] //// y2 * sin2 (q4 is freed by this time)(part of d1)
+ smull v8.4s, v5.4h, v0.4h[2] //// y2 * cos2(part of d0)
- smull v20.4s, v4.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
- smull v22.4s, v12.4h, v0.h[0] //// y4 * cos4(part of c0 and c1)
+ smull v20.4s, v4.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
+ smull v22.4s, v12.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1)
- smlal v24.4s, v16.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
- smlsl v26.4s, v16.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
- smlal v28.4s, v16.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
- smlal v30.4s, v16.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+ smlal v24.4s, v16.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+ smlsl v26.4s, v16.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+ smlal v28.4s, v16.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+ smlal v30.4s, v16.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
- smlsl v18.4s, v13.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
- smlal v8.4s, v13.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+ smlsl v18.4s, v13.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+ smlal v8.4s, v13.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
add v12.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
- smlal v24.4s, v17.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7)
- smlsl v26.4s, v17.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6)
- smlal v28.4s, v17.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5)
- smlsl v30.4s, v17.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4)
+ smlal v24.4s, v17.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7)
+ smlsl v26.4s, v17.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6)
+ smlal v28.4s, v17.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5)
+ smlsl v30.4s, v17.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4)
add v16.4s, v12.4s , v8.4s //// a0 = c0 + d0(part of e0,e7)
sub v12.4s, v12.4s , v8.4s //// a3 = c0 - d0(part of e3,e4)
@@ -647,21 +647,21 @@ skip_last4cols:
mov v25.d[0], x15
- smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v7.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v20.4s, v2.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
+ smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
// vmull.s16 q11,d4,d0[0] @// y4 * cos4(part of c0 and c1)
- smull v18.4s, v3.4h, v1.h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
- smull v6.4s, v3.4h, v0.h[2] //// y2 * cos2(part of d0)
+ smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
+ smull v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0)
@@ -727,19 +727,19 @@ skip_last4cols:
mov v25.d[0], x19
mov v25.d[1], x20
- smull v24.4s, v14.4h, v0.h[1] //// y1 * cos1(part of b0)
+ smull v24.4s, v14.4h, v0.4h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v14.4h, v0.h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v14.4h, v1.h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v14.4h, v1.h[3] //// y1 * sin1(part of b3)
+ smull v26.4s, v14.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v14.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v14.4h, v1.4h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v15.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v15.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v15.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v15.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v20.4s, v10.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
- smull v18.4s, v11.4h, v1.h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1)
- smull v14.4s, v11.4h, v0.h[2] //// y2 * cos2(part of d0)
+ smlal v24.4s, v15.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v15.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v15.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v15.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smull v20.4s, v10.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
+ smull v18.4s, v11.4h, v1.4h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1)
+ smull v14.4s, v11.4h, v0.4h[2] //// y2 * cos2(part of d0)
add x4, x2, x8, lsl #1 // x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data
@@ -908,38 +908,38 @@ end_skip_last4cols:
//// q5 -> q2
//// q7 -> q4
- smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v7.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v20.4s, v2.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
- smull v22.4s, v4.4h, v0.h[0] //// y4 * cos4(part of c0 and c1)
+ smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
+ smull v22.4s, v4.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1)
- smull v18.4s, v3.4h, v1.h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
- smull v6.4s, v3.4h, v0.h[2] //// y2 * cos2(part of d0)
+ smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
+ smull v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0)
- smlal v24.4s, v8.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
- smlsl v26.4s, v8.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
- smlal v28.4s, v8.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
- smlal v30.4s, v8.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+ smlal v24.4s, v8.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+ smlsl v26.4s, v8.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+ smlal v28.4s, v8.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+ smlal v30.4s, v8.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
- smlsl v18.4s, v5.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
- smlal v6.4s, v5.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+ smlsl v18.4s, v5.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+ smlal v6.4s, v5.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
add v2.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
- smlal v24.4s, v9.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
- smlsl v26.4s, v9.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
- smlal v28.4s, v9.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
- smlsl v30.4s, v9.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
+ smlal v24.4s, v9.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
+ smlsl v26.4s, v9.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
+ smlal v28.4s, v9.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
+ smlsl v30.4s, v9.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
sub v22.4s, v2.4s , v6.4s //// a3 = c0 - d0(part of x3,x4)
add v4.4s, v2.4s , v6.4s //// a0 = c0 + d0(part of x0,x7)
@@ -1004,53 +1004,53 @@ end_skip_last4cols:
- smull v24.4s, v14.4h, v0.h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v14.4h, v0.h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v14.4h, v1.h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v14.4h, v1.h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v15.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v15.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v15.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v15.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v20.4s, v10.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
- smull v22.4s, v12.4h, v0.h[0] //// y4 * cos4(part of c0 and c1)
- smull v18.4s, v11.4h, v1.h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1)
- smull v14.4s, v11.4h, v0.h[2] //// y2 * cos2(part of d0)
- smlal v24.4s, v16.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+ smull v24.4s, v14.4h, v0.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v14.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v14.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v14.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smlal v24.4s, v15.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v15.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v15.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v15.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smull v20.4s, v10.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
+ smull v22.4s, v12.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1)
+ smull v18.4s, v11.4h, v1.4h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1)
+ smull v14.4s, v11.4h, v0.4h[2] //// y2 * cos2(part of d0)
+ smlal v24.4s, v16.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
add x4, x2, x8, lsl #1 // x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data
- smlsl v26.4s, v16.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+ smlsl v26.4s, v16.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
add x5, x8, x8, lsl #1 //
- smlal v28.4s, v16.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+ smlal v28.4s, v16.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
add x0, x3, x7, lsl #1 // x0 points to 3rd row of dest data
- smlal v30.4s, v16.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+ smlal v30.4s, v16.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
add x10, x7, x7, lsl #1 //
- smlsl v18.4s, v13.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+ smlsl v18.4s, v13.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
- smlal v14.4s, v13.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+ smlal v14.4s, v13.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
add v12.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
- smlal v24.4s, v17.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
+ smlal v24.4s, v17.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
// swapping v3 and v6
mov v31.d[0], v3.d[0]
mov v3.d[0], v6.d[0]
mov v6.d[0], v31.d[0]
- smlsl v26.4s, v17.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
+ smlsl v26.4s, v17.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
// swapping v5 and v8
mov v31.d[0], v5.d[0]
mov v5.d[0], v8.d[0]
mov v8.d[0], v31.d[0]
- smlal v28.4s, v17.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
- smlsl v30.4s, v17.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
+ smlal v28.4s, v17.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
+ smlsl v30.4s, v17.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
sub v22.4s, v12.4s , v14.4s //// a3 = c0 - d0(part of x3,x4)
add v12.4s, v12.4s , v14.4s //// a0 = c0 + d0(part of x0,x7)
diff --git a/decoder.arm.mk b/decoder.arm.mk
index c3af911..fb94969 100644
--- a/decoder.arm.mk
+++ b/decoder.arm.mk
@@ -28,4 +28,5 @@ LOCAL_C_INCLUDES_arm += $(libmpeg2d_inc_dir_arm)
LOCAL_CFLAGS_arm += $(libmpeg2d_cflags_arm)
# CLANG WORKAROUNDS
+LOCAL_CLANG_ASFLAGS_arm += -no-integrated-as
LOCAL_CLANG_ASFLAGS_arm += $(addprefix -Wa$(comma)-I,$(libmpeg2d_inc_dir_arm))
diff --git a/decoder.arm64.mk b/decoder.arm64.mk
index 5b0414e..a195111 100644
--- a/decoder.arm64.mk
+++ b/decoder.arm64.mk
@@ -33,4 +33,5 @@ LOCAL_C_INCLUDES_arm64 += $(libmpeg2d_inc_dir_arm64)
LOCAL_CFLAGS_arm64 += $(libmpeg2d_cflags_arm64)
# CLANG WORKAROUNDS
+LOCAL_CLANG_ASFLAGS_arm64 += -no-integrated-as
LOCAL_CLANG_ASFLAGS_arm64 += $(addprefix -Wa$(comma)-I,$(libmpeg2d_inc_dir_arm64))