From 8936bab7bac2ed8f16821d700414a398dd25dc7b Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Wed, 4 Oct 2023 11:27:10 +0200 Subject: x86: Consolidate some pb_0to31 and pb_0to63 constants --- src/x86/filmgrain16_avx512.asm | 10 ++++------ src/x86/looprestoration16_avx2.asm | 16 ++++++++-------- src/x86/looprestoration_avx2.asm | 11 ++++++----- src/x86/pal.asm | 2 +- 4 files changed, 19 insertions(+), 20 deletions(-) diff --git a/src/x86/filmgrain16_avx512.asm b/src/x86/filmgrain16_avx512.asm index 00dd6af..5cbebce 100644 --- a/src/x86/filmgrain16_avx512.asm +++ b/src/x86/filmgrain16_avx512.asm @@ -29,11 +29,7 @@ %if ARCH_X86_64 -SECTION_RODATA 64 -pb_0to63: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 - db 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47 - db 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 +SECTION_RODATA 16 scale_mask: db -1, -1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1 scale_shift: dw 7, 7, 6, 6, 5, 5, 4, 4 pw_27_17_17_27: dw 108, 68, 68, 108, 27, 17, 17, 27 @@ -53,6 +49,8 @@ uv_offset_mul: dd 256 dd 1024 pb_8_9_0_1: db 8, 9, 0, 1 +cextern pb_0to63 + SECTION .text INIT_ZMM avx512icl @@ -382,7 +380,7 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 22, dst, src, stride, fg_data, w, scaling packssdw m4, m5, m5 vpbroadcastd m21, [base+scale_shift+r9*8+4] %if %2 - mova m12, [base+pb_0to63] ; pw_even + mova m12, [pb_0to63] ; pw_even mov r13d, 0x0101 vpbroadcastq m10, [base+pw_23_22+r9*8] kmovw k3, r13d diff --git a/src/x86/looprestoration16_avx2.asm b/src/x86/looprestoration16_avx2.asm index ef25c28..4cf8b90 100644 --- a/src/x86/looprestoration16_avx2.asm +++ b/src/x86/looprestoration16_avx2.asm @@ -32,15 +32,15 @@ SECTION_RODATA 32 sgr_lshuf3: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 sgr_lshuf5: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 +wiener_lshuf5: db 4, 5, 4, 5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +wiener_lshuf7: db 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10, 11, 12, 13, 14, 15 + db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11 wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11 wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15 wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 -wiener_lshuf5: db 4, 5, 4, 5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 -wiener_lshuf7: db 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10, 11, 12, 13, 14, 15 -pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 wiener_hshift: dw 4, 4, 1, 1 wiener_vshift: dw 1024, 1024, 4096, 4096 @@ -62,6 +62,7 @@ pd_0xf00801c7: dd 0xf00801c7 %define pw_256 sgr_lshuf5 +cextern pb_0to63 cextern sgr_x_by_x_avx2 SECTION .text @@ -182,7 +183,7 @@ cglobal wiener_filter7_16bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \ .extend_right: movd xm1, r10d vpbroadcastd m0, [pb_6_7] - movu m2, [pb_0to31] + mova m2, [pb_0to63] vpbroadcastb m1, xm1 psubb m0, m1 pminub m0, m2 @@ -406,9 +407,8 @@ cglobal wiener_filter5_16bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \ vpbroadcastd m0, [base+wiener_hshift+t3*4] vpbroadcastd m9, [base+wiener_round+t3*4] vpbroadcastd m10, [base+wiener_vshift+t3*4] - movu xm15, [wiener_lshuf5] + mova m15, [wiener_lshuf5] pmullw m11, m0 - 
vinserti128 m15, [pb_0to31], 1 pmullw m12, m0 test edgeb, 4 ; LR_HAVE_TOP jz .no_top @@ -486,7 +486,7 @@ cglobal wiener_filter5_16bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \ vpbroadcastb m2, xm2 psubb m0, m2 psubb m1, m2 - movu m2, [pb_0to31] + mova m2, [pb_0to63] pminub m0, m2 pminub m1, m2 pshufb m3, m0 diff --git a/src/x86/looprestoration_avx2.asm b/src/x86/looprestoration_avx2.asm index a73cb21..7787997 100644 --- a/src/x86/looprestoration_avx2.asm +++ b/src/x86/looprestoration_avx2.asm @@ -31,11 +31,11 @@ SECTION_RODATA 32 wiener_l_shuf: db 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 -pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 wiener_shufA: db 1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12, 7, 13, 8, 14 wiener_shufB: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 wiener_shufC: db 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12 +sgr_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 sgr_r_ext: times 16 db 1 times 16 db 9 @@ -64,7 +64,6 @@ pb_m5: times 4 db -5 pb_3: times 4 db 3 pw_5_6: dw 5, 6 -sgr_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 sgr_shuf: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1 db 9, -1, 10, -1, 11, -1, 12, -1 @@ -77,6 +76,8 @@ pd_m4096: dd -4096 pd_0xf00801c7: dd 0xf00801c7 pd_0xf00800a4: dd 0xf00800a4 +cextern pb_0to63 + SECTION .text DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers @@ -192,7 +193,7 @@ cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \ vpbroadcastd m0, [pb_3] vpbroadcastd m1, [pb_m5] vpbroadcastb m2, xm2 - movu m3, [pb_0to31] + mova m3, [pb_0to63] psubb m0, m2 psubb m1, m2 pminub m0, m3 @@ -826,7 +827,7 @@ cglobal sgr_filter_5x5_8bpc, 4, 13, 16, 400*24+16, dst, stride, left, lpf, \ mova m0, [sgr_r_ext] vpbroadcastb m2, xm2 psubb m0, m2 - pminub m0, [pb_0to31] + pminub m0, [pb_0to63] pshufb m5, m0 ret .h: ; horizontal boxsum diff --git a/src/x86/pal.asm b/src/x86/pal.asm index 27187d1..92075b9 100644 --- a/src/x86/pal.asm +++ b/src/x86/pal.asm @@ -28,7 +28,7 @@ SECTION_RODATA 64 -pb_0to63: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +const pb_0to63, db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 %if ARCH_X86_64 db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 db 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47 -- cgit v1.2.3 From 4c012978fbefc100faa1cc060f1279529a6c200f Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Thu, 14 Sep 2023 20:17:46 +0200 Subject: x86: Add 8-bit ipred z1 AVX-512 (Ice Lake) asm --- src/x86/ipred.h | 1 + src/x86/ipred_avx512.asm | 591 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 588 insertions(+), 4 deletions(-) diff --git a/src/x86/ipred.h b/src/x86/ipred.h index 29e1d96..4aff752 100644 --- a/src/x86/ipred.h +++ b/src/x86/ipred.h @@ -137,6 +137,7 @@ static ALWAYS_INLINE void intra_pred_dsp_init_x86(Dav1dIntraPredDSPContext *cons init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, avx512icl); init_angular_ipred_fn(HOR_PRED, ipred_h, avx512icl); init_angular_ipred_fn(VERT_PRED, ipred_v, avx512icl); + init_angular_ipred_fn(Z1_PRED, ipred_z1, avx512icl); #endif init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx512icl); init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx512icl); diff --git a/src/x86/ipred_avx512.asm b/src/x86/ipred_avx512.asm index 4aeb14e..75dfa66 100644 --- a/src/x86/ipred_avx512.asm +++ 
b/src/x86/ipred_avx512.asm @@ -97,16 +97,60 @@ ipred_h_shuf: db 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4 db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0 pal_unpack: db 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 pal_perm: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 +z_frac_table: db 64, 0, 62, 2, 60, 4, 58, 6, 56, 8, 54, 10, 52, 12, 50, 14 + db 48, 16, 46, 18, 44, 20, 42, 22, 40, 24, 38, 26, 36, 28, 34, 30 + db 32, 32, 30, 34, 28, 36, 26, 38, 24, 40, 22, 42, 20, 44, 18, 46 + db 16, 48, 14, 50, 12, 52, 10, 54, 8, 56, 6, 58, 4, 60, 2, 62 +z_filter_s1: db -1, -1, -1, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6 + db 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22 + db 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38 + db 46, 47, 47, 48, 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54 +z_filter_s5: db 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15, 17, 16 + db 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31, 33, 32 + db 42, 41, 43, 42, 44, 43, 45, 44, 46, 45, 47, 46, 48, 47, 49, 48 + db 58, 57, 59, 58, 60, 59, 61, 60, 62, 61, 63, 62, 64, 63, 65, 64 +z_filter_s3: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 +z_filter_s2: db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +z_filter_s4: db 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8 +z_xpos_bc: db 17, 17, 17, 17, 33, 33, 33, 33, 9, 9, 9, 9, 9, 9, 9, 9 +z_filter4_s1: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7 + db 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 +z_xpos_off1a: db 64, 65, 65, 66, 66, 67, 67, 68, 68, 69, 69, 70, 70, 71, 71, 72 +z_xpos_off1b: db 72, 73, 73, 74, 74, 75, 75, 76, 76, 77, 77, 78, 78, 79, 79, 80 +z_xpos_off2a: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 + db 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24 + db 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40 + db 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54, 54, 55, 55, 56 +z_xpos_off2b: db 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16 + db 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32 + db 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48 + db 56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63, 64 +z_xpos_mul: dw 4, 4, 4, 4, 8, 8, 4, 4, 12, 12, 8, 8, 16, 16, 8, 8 + dw 20, 20, 12, 12, 24, 24, 12, 12, 28, 28, 16, 16, 32, 32, 16, 16 +z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0 +z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0 +z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39 + db 39, 39, 47, 47, 47, 79, 79, 79 +z_filter_k: db 0, 16, 0, 16, 0, 20, 0, 20, 8, 16, 8, 16 + db 32, 0, 32, 0, 24, 0, 24, 0, 16, 0, 16, 0 + db 0, 32, 0, 32, 0, 24, 0, 24, 0, 16, 0, 16 +pb_8_56_0_0: db 8, 56, 0, 0 +pb_m4_36: times 2 db -4, 36 pb_127_m127: times 2 db 127, -127 pb_128: times 4 db 128 pw_128: times 2 dw 128 pw_255: times 2 dw 255 +pw_512: times 2 dw 512 -%define pb_1 (ipred_h_shuf+24) -%define pb_2 (ipred_h_shuf+20) -%define pb_3 (ipred_h_shuf+16) -%define pd_8 (filter_taps+128) +%define pb_1 (ipred_h_shuf+24) +%define pb_2 (ipred_h_shuf+20) +%define pb_3 (ipred_h_shuf+16) +%define pb_7 (ipred_h_shuf+ 0) +%define pb_9 (z_xpos_bc + 8) +%define pb_17 (z_xpos_bc + 0) +%define pb_33 (z_xpos_bc + 4) +%define pd_8 (filter_taps+128) %macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) @@ -125,10 +169,14 @@ JMP_TABLE ipred_paeth_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_8bpc, avx512icl, w4, w8, w16, w32, w64 
JMP_TABLE ipred_smooth_v_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h_8bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z1_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_dc_8bpc, avx512icl, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 JMP_TABLE ipred_dc_left_8bpc, avx512icl, h4, h8, h16, h32, h64 +cextern dr_intra_derivative +cextern pb_0to63 + SECTION .text INIT_ZMM avx512icl @@ -1200,6 +1248,541 @@ cglobal pal_pred_8bpc, 4, 7, 6, dst, stride, pal, idx, w, h, stride3 jg .w64 RET +%if WIN64 + DECLARE_REG_TMP 4 +%else + DECLARE_REG_TMP 8 +%endif + +cglobal ipred_z1_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx +%define base r7-z_filter_t0 + lea r7, [z_filter_t0] + tzcnt wd, wm + movifnidn angled, anglem + lea t0, [dr_intra_derivative] + movsxd wq, [base+ipred_z1_8bpc_avx512icl_table+wq*4] + inc tlq + mov dxd, angled + and dxd, 0x7e + add angled, 165 ; ~90 + movzx dxd, word [t0+dxq] + lea wq, [base+ipred_z1_8bpc_avx512icl_table+wq] + movifnidn hd, hm + xor angled, 0x4ff ; d = 90 - angle + mova m14, [base+z_frac_table] + vpbroadcastd m15, [base+pw_512] + jmp wq +.w4: + mova m9, [pb_0to63] + pminud m8, m9, [base+pb_7] {1to16} + vpbroadcastq m7, [tlq] + pshufb m7, m8 + cmp angleb, 40 + jae .w4_filter + lea r3d, [angleq-1024] + sar r3d, 7 + add r3d, hd + jg .w4_filter ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm) + pshufb xmm0, xm7, [base+z_filter_s4] + mova xmm1, [tlq-1] + pshufb xmm1, [base+z_xpos_off2a] + vpbroadcastd xmm2, [base+pb_m4_36] + vpbroadcastq m4, [pb_0to63] + pmaddubsw xmm0, xmm2 + pmaddubsw xmm1, xmm2 + add dxd, dxd + kxnorw k1, k1, k1 + paddw xmm0, xmm1 + pmulhrsw xm0, xmm0, xm15 + packuswb xm0, xm0 + punpcklbw ym7{k1}, ym0 + jmp .w4_main2 +.w4_filter: + test angled, 0x400 + jnz .w4_main ; !enable_intra_edge_filter + lea r3d, [hq+3] + vpbroadcastb xm0, r3d + vpbroadcastb xm1, angled + shr angled, 8 ; is_sm << 1 + vpcmpeqb k1, xm0, [base+z_filter_wh] + vpcmpgtb k1{k1}, xm1, [base+z_filter_t0+angleq*8] + kmovw r5d, k1 + test r5d, r5d + jz .w4_main + vbroadcasti32x4 ym0, [tlq-1] + pshufb ym0, [base+z_filter4_s1] + popcnt r5d, r5d ; filter_strength + pshufb ym1, ym7, [z_filter_s4] + pshufb ym7, [base+z_filter_s3] + vpbroadcastd ym11, [base+z_filter_k+(r5-1)*4+12*0] + vpbroadcastd ym12, [base+z_filter_k+(r5-1)*4+12*1] + pmaddubsw ym0, ym11 + pmaddubsw ym1, ym11 + pmaddubsw ym7, ym12 + paddw ym0, ym1 + paddw ym7, ym0 + pmulhrsw ym7, ym15 + cmp hd, 4 + je .w4_filter_end + vpbroadcastd m8, [base+pb_9] + pminub m8, m9 +.w4_filter_end: + paddb m8, m8 + vpermb m7, m8, m7 +.w4_main: + vpbroadcastq m4, [base+z_xpos_off1a] +.w4_main2: + movsldup m2, [base+z_xpos_mul] + vpbroadcastw m5, dxd + vbroadcasti32x4 m3, [base+z_xpos_bc] + lea r2, [strideq*3] + pmullw m2, m5 ; xpos + psllw m5, 5 ; dx*8 +.w4_loop: + psrlw m1, m2, 3 + pshufb m0, m2, m3 + vpermw m1, m1, m14 ; 64-frac, frac + paddsb m0, m4 ; base, base+1 + vpermb m0, m0, m7 ; top[base], top[base+1] + paddsw m2, m5 ; xpos += dx + pmaddubsw m0, m1 ; v + pmulhrsw m0, m15 + packuswb m0, m0 + vextracti32x4 xm1, ym0, 1 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+r2 ], xm1, 1 + sub hd, 8 + jl .w4_end + vextracti32x4 xm1, m0, 2 ; top[max_base_x] + lea dstq, [dstq+strideq*4] + vextracti32x4 xm0, m0, 3 + movd [dstq+strideq*0], xm1 + pextrd [dstq+strideq*1], xm1, 1 + movd [dstq+strideq*2], xm0 + pextrd [dstq+r2 ], xm0, 1 + lea dstq, [dstq+strideq*4] + jg .w4_loop +.w4_end: + RET +.w8: + lea 
r3d, [angleq+216] + mov r3b, hb + cmp r3d, 8 + ja .w8_filter ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 + lea r3d, [hq-1] + mova xm1, [base+z_filter_s4] + vpbroadcastb xm2, r3d + mova xm7, [tlq-1] + vinserti32x4 ym7, [tlq+7], 1 + vbroadcasti32x4 ym0, [base+z_xpos_off1a] + vpbroadcastd ym3, [base+pb_m4_36] + pminub xm2, xm1 + pshufb ym0, ym7, ym0 + vinserti32x4 ym1, xm2, 1 + psrldq ym7, 1 + pshufb ym1, ym7, ym1 + pmaddubsw ym0, ym3 + pmaddubsw ym1, ym3 + vbroadcasti32x4 m8, [pb_0to63] + add dxd, dxd + paddw ym0, ym1 + pmulhrsw ym0, ym15 + packuswb ym0, ym0 + punpcklbw ym7, ym0 + jmp .w8_main2 +.w8_filter: + lea r3d, [hq+7] + mova m9, [pb_0to63] + vpbroadcastb ym0, r3d + and r3d, 7 + vbroadcasti32x4 m7, [tlq] + or r3d, 8 ; imin(h+7, 15) + vpbroadcastb m8, r3d + pminub m8, m9 + pshufb m7, m8 + test angled, 0x400 + jnz .w8_main + vpbroadcastb ym1, angled + shr angled, 8 + vpcmpeqb k1, ym0, [base+z_filter_wh] + mova xm0, [base+z_filter_t0+angleq*8] + vpcmpgtb k1{k1}, ym1, ym0 + kmovd r5d, k1 + test r5d, r5d + jz .w8_main + mova ym0, [base+z_filter_s1] + vpbroadcastd ym2, [tlq-4] + popcnt r5d, r5d + vbroadcasti32x4 ym1, [base+z_filter_s2] + vbroadcasti32x4 ym3, [base+z_filter_s3] + vbroadcasti32x4 ym4, [base+z_filter_s4] + vpermi2b ym0, ym7, ym2 ; al bl + mova ym5, [base+z_filter_s5] + pshufb ym1, ym7, ym1 ; ah bh + vpbroadcastd ym11, [base+z_filter_k+(r5-1)*4+12*0] + pshufb ym3, ym7, ym3 ; cl ch + vpbroadcastd ym12, [base+z_filter_k+(r5-1)*4+12*1] + pshufb ym4, ym7, ym4 ; el dl + vpbroadcastd ym13, [base+z_filter_k+(r5-1)*4+12*2] + vpermb ym5, ym5, ym7 ; eh dh + pmaddubsw ym0, ym11 + pmaddubsw ym1, ym11 + pmaddubsw ym2, ym3, ym12 + pmaddubsw ym3, ym13 + pmaddubsw ym4, ym11 + pmaddubsw ym5, ym11 + paddw ym0, ym2 + paddw ym1, ym3 + paddw ym0, ym4 + paddw ym1, ym5 + pmulhrsw ym0, ym15 + pmulhrsw ym1, ym15 + packuswb ym0, ym1 + cmp hd, 8 + jle .w8_filter_end + vpbroadcastd m8, [base+pb_17] + add r3d, 2 + pminub m8, m9 +.w8_filter_end: + vpermb m7, m8, m0 +.w8_main: + vbroadcasti32x4 m8, [base+z_xpos_off1a] +.w8_main2: + movsldup m4, [base+z_xpos_mul] + vpbroadcastw m9, dxd + shl r3d, 6 + vpbroadcastd m5, [base+z_xpos_bc+8*0] + pmullw m4, m9 ; xpos + vpbroadcastd m6, [base+z_xpos_bc+8*1] + sub r3d, dxd + shl dxd, 3 + psllw m9, 5 ; dx*8 + lea r2, [strideq*3] +.w8_loop: + psrlw m3, m4, 3 + pshufb m0, m4, m5 + pshufb m1, m4, m6 + vpermw m3, m3, m14 + paddsb m0, m8 + paddsb m1, m8 + vpermb m0, m0, m7 + vpermb m1, m1, m7 + paddsw m4, m9 + punpcklqdq m2, m3, m3 + pmaddubsw m0, m2 + punpckhqdq m3, m3 + pmaddubsw m1, m3 + pmulhrsw m0, m15 + pmulhrsw m1, m15 + packuswb m0, m1 + vextracti32x4 xm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r2 ], xm1 + sub hd, 8 + jl .w8_end + vextracti32x8 ym0, m0, 1 + lea dstq, [dstq+strideq*4] + vextracti32x4 xm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r2 ], xm1 + jz .w8_end + lea dstq, [dstq+strideq*4] + sub r3d, dxd + jg .w8_loop + vextracti32x4 xm7, m7, 3 +.w8_end_loop: + movq [dstq+strideq*0], xm7 + movq [dstq+strideq*1], xm7 + movq [dstq+strideq*2], xm7 + movq [dstq+r2 ], xm7 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_end_loop +.w8_end: + RET +.w16: + lea r3d, [hq+15] + mova m9, [pb_0to63] + vpbroadcastb ym0, r3d + and r3d, 15 + movu ym7, [tlq] + or r3d, 16 ; imin(h+15, 31) + vpbroadcastb m8, r3d + pminub m8, m9 + vpermb m7, m8, m7 + test angled, 0x400 + jnz .w16_main + vpbroadcastb ym1, angled + shr angled, 8 
+ vpcmpeqb k1, ym0, [base+z_filter_wh] + mova xm0, [base+z_filter_t0+angleq*8] + vpcmpgtb k1{k1}, ym1, ym0 + kmovd r5d, k1 + test r5d, r5d + jz .w16_main + mova m0, [base+z_filter_s1] + vpbroadcastd m2, [tlq-4] + popcnt r5d, r5d + vbroadcasti32x4 m1, [base+z_filter_s2] + vbroadcasti32x4 m3, [base+z_filter_s3] + vbroadcasti32x4 m4, [base+z_filter_s4] + vpermi2b m0, m7, m2 ; al bl + mova m5, [base+z_filter_s5] + pshufb m1, m7, m1 ; ah bh + vpbroadcastd m11, [base+z_filter_k+(r5-1)*4+12*0] + pshufb m3, m7, m3 ; cl ch + vpbroadcastd m12, [base+z_filter_k+(r5-1)*4+12*1] + pshufb m4, m7, m4 ; el dl + vpbroadcastd m13, [base+z_filter_k+(r5-1)*4+12*2] + vpermb m5, m5, m7 ; eh dh + pmaddubsw m0, m11 + pmaddubsw m1, m11 + pmaddubsw m2, m3, m12 + pmaddubsw m3, m13 + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m0, m2 + paddw m1, m3 + paddw m0, m4 + paddw m1, m5 + pmulhrsw m0, m15 + pmulhrsw m1, m15 + packuswb m0, m1 + cmp hd, 16 + jle .w16_filter_end + vpbroadcastd m8, [base+pb_33] + add r3d, 2 + pminub m8, m9 +.w16_filter_end: + vpermb m7, m8, m0 +.w16_main: + movshdup m3, [base+z_xpos_mul] + vpbroadcastw m8, dxd + shl r3d, 6 + vpbroadcastd m4, [base+z_xpos_bc] + pmullw m3, m8 ; xpos + vbroadcasti32x4 m5, [base+z_xpos_off1a] + sub r3d, dxd + shl dxd, 2 + vbroadcasti32x4 m6, [base+z_xpos_off1b] + psllw m8, 4 ; dx*4 + lea r2, [strideq*3] +.w16_loop: + pshufb m1, m3, m4 + psrlw m2, m3, 3 + paddsb m0, m1, m5 + vpermw m2, m2, m14 + paddsb m1, m6 + vpermb m0, m0, m7 + vpermb m1, m1, m7 + paddsw m3, m8 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pmulhrsw m0, m15 + pmulhrsw m1, m15 + packuswb m0, m1 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+r2 ], m0, 3 + sub hd, 4 + jz .w16_end + lea dstq, [dstq+strideq*4] + sub r3d, dxd + jg .w16_loop + vextracti32x4 xm7, m7, 3 +.w16_end_loop: + mova [dstq+strideq*0], xm7 + mova [dstq+strideq*1], xm7 + mova [dstq+strideq*2], xm7 + mova [dstq+r2 ], xm7 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16_end_loop +.w16_end: + RET +.w32: + lea r3d, [hq+31] + vpbroadcastb m9, r3d + and r3d, 31 + pminub m10, m9, [pb_0to63] + or r3d, 32 ; imin(h+31, 63) + vpermb m7, m10, [tlq] + vpbroadcastb m8, [tlq+r3] + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w32_main + vpbroadcastd m2, [tlq-4] + mova m0, [base+z_filter_s1] + vbroadcasti32x4 m1, [base+z_filter_s2] + vbroadcasti32x4 m3, [base+z_filter_s3] + vbroadcasti32x4 m4, [base+z_filter_s4] + vpermi2b m0, m7, m2 ; al bl + mova m5, [base+z_filter_s5] + pshufb m1, m7, m1 ; ah bh + vpbroadcastd m11, [base+z_filter_k+4*2+12*0] + pshufb m3, m7, m3 ; cl ch + vpbroadcastd m12, [base+z_filter_k+4*2+12*1] + pshufb m4, m7, m4 ; el dl + vpbroadcastd m13, [base+z_filter_k+4*2+12*2] + vpermi2b m5, m7, m8 ; eh dh + pmaddubsw m0, m11 + pmaddubsw m1, m11 + pmaddubsw m2, m3, m12 + pmaddubsw m3, m13 + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m0, m2 + paddw m1, m3 + paddw m0, m4 + paddw m1, m5 + pmulhrsw m0, m15 + pmulhrsw m1, m15 + packuswb m7, m0, m1 + cmp hd, 64 + je .w32_h64_filter_end + vpermb m8, m9, m7 + vpermb m7, m10, m7 + jmp .w32_main +.w32_h64_filter_end: ; edge case for 32x64 + movd xmm0, [tlq+r3-1] + movd xmm1, [base+pb_8_56_0_0] + add r3d, 2 + pmaddubsw xmm0, xmm1 + vptestmw k1, xmm1, xmm1 ; 0x01 + pmulhrsw xm0, xmm0, xm15 + vmovdqu8 m8{k1}, m0 +.w32_main: + rorx r2d, dxd, 30 + vpbroadcastd m4, [base+z_xpos_bc] + vpbroadcastw m3, r2d + vbroadcasti32x8 m5, [base+z_xpos_off2a] + shl r3d, 6 + vbroadcasti32x8 m6, [base+z_xpos_off2b] + sub r3d, 
dxd + paddw m9, m3, m3 + add dxd, dxd + vinserti32x8 m3, ym9, 1 +.w32_loop: + pshufb m1, m3, m4 + psrlw m2, m3, 3 + paddsb m0, m1, m5 + vpermw m2, m2, m14 + paddsb m1, m6 + vpermi2b m0, m7, m8 + vpermi2b m1, m7, m8 + paddsw m3, m9 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pmulhrsw m0, m15 + pmulhrsw m1, m15 + packuswb m0, m1 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + sub hd, 2 + jz .w32_end + lea dstq, [dstq+strideq*2] + sub r3d, dxd + jg .w32_loop + punpckhqdq ym8, ym8 +.w32_end_loop: + mova [dstq+strideq*0], ym8 + mova [dstq+strideq*1], ym8 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_end_loop +.w32_end: + RET +.w64: + lea r3d, [hq-1] + movu m7, [tlq+64*0] + vpbroadcastb m13, r3d + pminub m12, m13, [pb_0to63] + or r3d, 64 + vpermb m8, m12, [tlq+64*1] + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w64_main + movu m0, [tlq+56] + vbroadcasti32x4 m3, [base+z_filter_s2] + vpbroadcastd m2, [tlq-4] + mova m1, [base+z_filter_s1] + movu m11, [tlq+8] + pshufb m0, m3 ; al bl + vpermi2b m1, m7, m2 + vbroadcasti32x4 m4, [base+z_filter_s4] + pshufb m6, m8, m4 ; el dl + pshufb m9, m7, m4 + pminub m10, m13, [base+z_filter_s5] + pshufb m2, m8, m3 ; ah bh + pshufb m3, m7, m3 + vbroadcasti32x4 m5, [base+z_filter_s3] + vpermb m10, m10, m8 ; eh dh + pshufb m11, m4 + vpbroadcastd m4, [base+z_filter_k+4*2+12*0] + pshufb m8, m5 ; cl ch + pshufb m7, m5 + vpbroadcastd m5, [base+z_filter_k+4*2+12*1] + REPX {pmaddubsw x, m4}, m0, m1, m6, m9, m2, m3, m10, m11 + pmaddubsw m4, m8, m5 + pmaddubsw m5, m7, m5 + paddw m0, m6 + vpbroadcastd m6, [base+z_filter_k+4*2+12*2] + paddw m1, m9 + pmaddubsw m7, m6 + pmaddubsw m8, m6 + paddw m2, m10 + paddw m3, m11 + paddw m0, m4 + paddw m1, m5 + paddw m2, m8 + paddw m3, m7 + REPX {pmulhrsw x, m15}, m0, m2, m1, m3 + packuswb m0, m2 + packuswb m7, m1, m3 + vpermb m8, m12, m0 +.w64_main: + rorx r2d, dxd, 30 + vpbroadcastd m4, [base+z_xpos_bc] + vpbroadcastw m3, r2d + mova m5, [base+z_xpos_off2a] + shl r3d, 6 + mova m6, [base+z_xpos_off2b] + sub r3d, dxd + mova m9, m3 +.w64_loop: + pshufb m1, m3, m4 + psrlw m2, m3, 3 + paddsb m0, m1, m5 + vpermw m2, m2, m14 + paddsb m1, m6 + vpermi2b m0, m7, m8 + vpermi2b m1, m7, m8 + paddsw m3, m9 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pmulhrsw m0, m15 + pmulhrsw m1, m15 + packuswb m0, m1 + mova [dstq], m0 + dec hd + jz .w64_end + add dstq, strideq + sub r3d, dxd + jg .w64_loop + vpermb m8, m13, m8 +.w64_end_loop: + mova [dstq], m8 + add dstq, strideq + dec hd + jg .w64_end_loop +.w64_end: + RET + ; The ipred_filter code processes 4x2 blocks in the following order ; which increases parallelism compared to doing things row by row. ; Some redundant blocks are calculated for w > 4. -- cgit v1.2.3 From 47107e384bd1dc25674acf04d000a8cdc6195234 Mon Sep 17 00:00:00 2001 From: "Ronald S. 
Bultje" Date: Sun, 24 Sep 2023 09:42:17 +0100 Subject: deblock_avx512: convert byte-shifts to gf2p8affineqb --- src/x86/loopfilter_avx512.asm | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/src/x86/loopfilter_avx512.asm b/src/x86/loopfilter_avx512.asm index 0218b62..202a612 100644 --- a/src/x86/loopfilter_avx512.asm +++ b/src/x86/loopfilter_avx512.asm @@ -41,6 +41,10 @@ hmulC: dd 0, 1, 2, 3, 16, 17, 18, 19, 32, 33, 34, 35, 48, 49, 50, 51 hmulD: dd 0, 1, 16, 17, 32, 33, 48, 49 hshuf4:db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 +shift1: dq 0x0204081020408000 +shift3: dq 0x0810204080000000 +shift4: dq 0x1020408000000000 + pb_1: times 4 db 1 pb_2: times 4 db 2 pb_3: times 4 db 3 @@ -49,9 +53,6 @@ pb_16: times 4 db 16 pb_63: times 4 db 63 pb_64: times 4 db 64 pb_128: times 4 db 0x80 -pb_240: times 4 db 0xf0 -pb_248: times 4 db 0xf8 -pb_254: times 4 db 0xfe pb_2_1: times 2 db 2, 1 pb_3_1: times 2 db 3, 1 pb_7_1: times 2 db 7, 1 @@ -482,8 +483,7 @@ SECTION .text vpbroadcastb m1, [lutq+136] pminub m2, m1 pmaxub m2, m15 ; I - pand m1, m0, [pb_240]{bcstd} - psrlq m1, 4 ; H + gf2p8affineqb m1, m0, [shift4]{bcstq}, 0 ; H paddd m0, [pb_2]{bcstd} paddb m0, m0 paddb m0, m2 ; E @@ -534,8 +534,7 @@ SECTION .text ABSSUB m10, m3, m6, m11 ; abs(p1-q1) ABSSUB m11, m4, m5, m2 ; abs(p0-q0) paddusb m11, m11 - pand m10, [pb_254]{bcstd} - psrlq m10, 1 + gf2p8affineqb m10, m10, [shift1]{bcstq}, 0 paddusb m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1) vpcmpub k3{k3}, m10, m0, 2 ; abs(p0-q0)*2+(abs(p1-q1)>>1) <= E @@ -608,12 +607,8 @@ SECTION .text paddsb m10{k3}{z}, m10, m11 ; f=iclip_diff(3*(q0-p0)+f)&fm paddsb m8, m10, m15 paddsb m10, m0 - pand m8, [pb_248]{bcstd} - pand m10, [pb_248]{bcstd} - psrlq m8, 3 - psrlq m10, 3 - pxor m8, m12 - pxor m10, m12 + gf2p8affineqb m8, m8, [shift3]{bcstq}, 16 + gf2p8affineqb m10, m10, [shift3]{bcstq}, 16 psubb m8, m12 ; f2 psubb m10, m12 ; f1 paddsb m4, m8 -- cgit v1.2.3 From fd4ecc2fd870fa267e1995600dddf212c6e49300 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Thu, 19 Oct 2023 11:12:29 +0200 Subject: x86: Add 8-bit ipred z3 AVX-512 (Ice Lake) asm --- src/x86/ipred.h | 1 + src/x86/ipred_avx512.asm | 536 ++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 461 insertions(+), 76 deletions(-) diff --git a/src/x86/ipred.h b/src/x86/ipred.h index 4aff752..e290c87 100644 --- a/src/x86/ipred.h +++ b/src/x86/ipred.h @@ -138,6 +138,7 @@ static ALWAYS_INLINE void intra_pred_dsp_init_x86(Dav1dIntraPredDSPContext *cons init_angular_ipred_fn(HOR_PRED, ipred_h, avx512icl); init_angular_ipred_fn(VERT_PRED, ipred_v, avx512icl); init_angular_ipred_fn(Z1_PRED, ipred_z1, avx512icl); + init_angular_ipred_fn(Z3_PRED, ipred_z3, avx512icl); #endif init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx512icl); init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx512icl); diff --git a/src/x86/ipred_avx512.asm b/src/x86/ipred_avx512.asm index 75dfa66..7666e62 100644 --- a/src/x86/ipred_avx512.asm +++ b/src/x86/ipred_avx512.asm @@ -97,6 +97,10 @@ ipred_h_shuf: db 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4 db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0 pal_unpack: db 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 pal_perm: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 +pb_63to0: db 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48 + db 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32 + db 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16 + db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 
4, 3, 2, 1, 0 z_frac_table: db 64, 0, 62, 2, 60, 4, 58, 6, 56, 8, 54, 10, 52, 12, 50, 14 db 48, 16, 46, 18, 44, 20, 42, 22, 40, 24, 38, 26, 36, 28, 34, 30 db 32, 32, 30, 34, 28, 36, 26, 38, 24, 40, 22, 42, 20, 44, 18, 46 @@ -127,8 +131,26 @@ z_xpos_off2b: db 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16 db 56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63, 64 z_xpos_mul: dw 4, 4, 4, 4, 8, 8, 4, 4, 12, 12, 8, 8, 16, 16, 8, 8 dw 20, 20, 12, 12, 24, 24, 12, 12, 28, 28, 16, 16, 32, 32, 16, 16 +z_ypos_off1: db 64, 65, 64, 65, 64, 65, 64, 65, 65, 66, 65, 66, 66, 67, 66, 67 + db 66, 67, 66, 67, 68, 69, 68, 69, 67, 68, 67, 68, 70, 71, 70, 71 + db 68, 69, 68, 69, 72, 73, 72, 73, 69, 70, 69, 70, 74, 75, 74, 75 + db 70, 71, 70, 71, 76, 77, 76, 77, 71, 72, 71, 72, 78, 79, 78, 79 +z_ypos_off2: db 64, 65, 64, 65, 0, 0, 0, 0, 64, 65, 64, 65, 0, 0, 0, 0 + db 65, 66, 65, 66, 1, 1, 1, 1, 65, 66, 65, 66, 1, 1, 1, 1 + db 66, 67, 66, 67, 2, 2, 2, 2, 66, 67, 66, 67, 2, 2, 2, 2 + db 67, 68, 67, 68, 3, 3, 3, 3, 67, 68, 67, 68, 3, 3, 3, 3 +z_ypos_mul1: dw 1*512, 2*512, 3*512, 4*512, 5*512, 6*512, 7*512, 8*512 + dw 17*512, 18*512, 19*512, 20*512, 21*512, 22*512, 23*512, 24*512 + dw 33*512, 34*512, 35*512, 36*512, 37*512, 38*512, 39*512, 40*512 + dw 49*512, 50*512, 51*512, 52*512, 53*512, 54*512, 55*512, 56*512 +z_ypos_mul2: dw 9*512, 10*512, 11*512, 12*512, 13*512, 14*512, 15*512, 16*512 + dw 25*512, 26*512, 27*512, 28*512, 29*512, 30*512, 31*512, 32*512 + dw 41*512, 42*512, 43*512, 44*512, 45*512, 46*512, 47*512, 48*512 + dw 57*512, 58*512, 59*512, 60*512, 61*512, 62*512, 63*512, 64*512 z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0 z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0 +z3_upsample: db 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16 + db 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8 z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39 db 39, 39, 47, 47, 47, 79, 79, 79 z_filter_k: db 0, 16, 0, 16, 0, 20, 0, 20, 8, 16, 8, 16 @@ -138,6 +160,11 @@ z_filter_k: db 0, 16, 0, 16, 0, 20, 0, 20, 8, 16, 8, 16 pb_8_56_0_0: db 8, 56, 0, 0 pb_m4_36: times 2 db -4, 36 pb_127_m127: times 2 db 127, -127 +pb_8: times 4 db 8 +pb_15: times 4 db 15 +pb_16: times 4 db 16 +pb_31: times 4 db 31 +pb_63: times 4 db 63 pb_128: times 4 db 128 pw_128: times 2 dw 128 pw_255: times 2 dw 255 @@ -146,6 +173,7 @@ pw_512: times 2 dw 512 %define pb_1 (ipred_h_shuf+24) %define pb_2 (ipred_h_shuf+20) %define pb_3 (ipred_h_shuf+16) +%define pb_4 (smooth_shuf +48) %define pb_7 (ipred_h_shuf+ 0) %define pb_9 (z_xpos_bc + 8) %define pb_17 (z_xpos_bc + 0) @@ -170,6 +198,7 @@ JMP_TABLE ipred_smooth_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_v_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_z1_8bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z3_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_dc_8bpc, avx512icl, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 JMP_TABLE ipred_dc_left_8bpc, avx512icl, h4, h8, h16, h32, h64 @@ -1278,11 +1307,11 @@ cglobal ipred_z1_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx vpbroadcastq m7, [tlq] pshufb m7, m8 cmp angleb, 40 - jae .w4_filter + jae .w4_no_upsample lea r3d, [angleq-1024] sar r3d, 7 add r3d, hd - jg .w4_filter ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm) + jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || 
(h == 8 && is_sm) pshufb xmm0, xm7, [base+z_filter_s4] mova xmm1, [tlq-1] pshufb xmm1, [base+z_xpos_off2a] @@ -1297,7 +1326,7 @@ cglobal ipred_z1_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx packuswb xm0, xm0 punpcklbw ym7{k1}, ym0 jmp .w4_main2 -.w4_filter: +.w4_no_upsample: test angled, 0x400 jnz .w4_main ; !enable_intra_edge_filter lea r3d, [hq+3] @@ -1366,11 +1395,40 @@ cglobal ipred_z1_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx jg .w4_loop .w4_end: RET +.w8_filter: + mova ym0, [base+z_filter_s1] + popcnt r5d, r5d + vbroadcasti32x4 ym1, [base+z_filter_s2] + vbroadcasti32x4 ym3, [base+z_filter_s3] + vbroadcasti32x4 ym4, [base+z_filter_s4] + vpermi2b ym0, ym7, ym2 ; al bl + mova ym5, [base+z_filter_s5] + pshufb ym1, ym7, ym1 ; ah bh + vpbroadcastd ym11, [base+z_filter_k+(r5-1)*4+12*0] + pshufb ym3, ym7, ym3 ; cl ch + vpbroadcastd ym12, [base+z_filter_k+(r5-1)*4+12*1] + pshufb ym4, ym7, ym4 ; el dl + vpbroadcastd ym13, [base+z_filter_k+(r5-1)*4+12*2] + vpermb ym5, ym5, ym7 ; eh dh + pmaddubsw ym0, ym11 + pmaddubsw ym1, ym11 + pmaddubsw ym2, ym3, ym12 + pmaddubsw ym3, ym13 + pmaddubsw ym4, ym11 + pmaddubsw ym5, ym11 + paddw ym0, ym2 + paddw ym1, ym3 + paddw ym0, ym4 + paddw ym1, ym5 + pmulhrsw ym0, ym15 + pmulhrsw ym1, ym15 + packuswb ym0, ym1 + ret .w8: lea r3d, [angleq+216] mov r3b, hb cmp r3d, 8 - ja .w8_filter ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 + ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 lea r3d, [hq-1] mova xm1, [base+z_filter_s4] vpbroadcastb xm2, r3d @@ -1392,7 +1450,7 @@ cglobal ipred_z1_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx packuswb ym0, ym0 punpcklbw ym7, ym0 jmp .w8_main2 -.w8_filter: +.w8_no_upsample: lea r3d, [hq+7] mova m9, [pb_0to63] vpbroadcastb ym0, r3d @@ -1412,34 +1470,8 @@ cglobal ipred_z1_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx kmovd r5d, k1 test r5d, r5d jz .w8_main - mova ym0, [base+z_filter_s1] vpbroadcastd ym2, [tlq-4] - popcnt r5d, r5d - vbroadcasti32x4 ym1, [base+z_filter_s2] - vbroadcasti32x4 ym3, [base+z_filter_s3] - vbroadcasti32x4 ym4, [base+z_filter_s4] - vpermi2b ym0, ym7, ym2 ; al bl - mova ym5, [base+z_filter_s5] - pshufb ym1, ym7, ym1 ; ah bh - vpbroadcastd ym11, [base+z_filter_k+(r5-1)*4+12*0] - pshufb ym3, ym7, ym3 ; cl ch - vpbroadcastd ym12, [base+z_filter_k+(r5-1)*4+12*1] - pshufb ym4, ym7, ym4 ; el dl - vpbroadcastd ym13, [base+z_filter_k+(r5-1)*4+12*2] - vpermb ym5, ym5, ym7 ; eh dh - pmaddubsw ym0, ym11 - pmaddubsw ym1, ym11 - pmaddubsw ym2, ym3, ym12 - pmaddubsw ym3, ym13 - pmaddubsw ym4, ym11 - pmaddubsw ym5, ym11 - paddw ym0, ym2 - paddw ym1, ym3 - paddw ym0, ym4 - paddw ym1, ym5 - pmulhrsw ym0, ym15 - pmulhrsw ym1, ym15 - packuswb ym0, ym1 + call .w8_filter cmp hd, 8 jle .w8_filter_end vpbroadcastd m8, [base+pb_17] @@ -1506,28 +1538,8 @@ cglobal ipred_z1_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx jg .w8_end_loop .w8_end: RET -.w16: - lea r3d, [hq+15] - mova m9, [pb_0to63] - vpbroadcastb ym0, r3d - and r3d, 15 - movu ym7, [tlq] - or r3d, 16 ; imin(h+15, 31) - vpbroadcastb m8, r3d - pminub m8, m9 - vpermb m7, m8, m7 - test angled, 0x400 - jnz .w16_main - vpbroadcastb ym1, angled - shr angled, 8 - vpcmpeqb k1, ym0, [base+z_filter_wh] - mova xm0, [base+z_filter_t0+angleq*8] - vpcmpgtb k1{k1}, ym1, ym0 - kmovd r5d, k1 - test r5d, r5d - jz .w16_main +.w16_filter: mova m0, [base+z_filter_s1] - vpbroadcastd m2, [tlq-4] popcnt r5d, r5d vbroadcasti32x4 m1, [base+z_filter_s2] vbroadcasti32x4 m3, [base+z_filter_s3] @@ -1554,6 +1566,29 @@ cglobal ipred_z1_8bpc, 3, 
8, 16, dst, stride, tl, w, h, angle, dx pmulhrsw m0, m15 pmulhrsw m1, m15 packuswb m0, m1 + ret +.w16: + lea r3d, [hq+15] + mova m9, [pb_0to63] + vpbroadcastb ym0, r3d + and r3d, 15 + movu ym7, [tlq] + or r3d, 16 ; imin(h+15, 31) + vpbroadcastb m8, r3d + pminub m8, m9 + vpermb m7, m8, m7 + test angled, 0x400 + jnz .w16_main + vpbroadcastb ym1, angled + shr angled, 8 + vpcmpeqb k1, ym0, [base+z_filter_wh] + mova xm0, [base+z_filter_t0+angleq*8] + vpcmpgtb k1{k1}, ym1, ym0 + kmovd r5d, k1 + test r5d, r5d + jz .w16_main + vpbroadcastd m2, [tlq-4] + call .w16_filter cmp hd, 16 jle .w16_filter_end vpbroadcastd m8, [base+pb_33] @@ -1607,17 +1642,7 @@ cglobal ipred_z1_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx jg .w16_end_loop .w16_end: RET -.w32: - lea r3d, [hq+31] - vpbroadcastb m9, r3d - and r3d, 31 - pminub m10, m9, [pb_0to63] - or r3d, 32 ; imin(h+31, 63) - vpermb m7, m10, [tlq] - vpbroadcastb m8, [tlq+r3] - test angled, 0x400 ; !enable_intra_edge_filter - jnz .w32_main - vpbroadcastd m2, [tlq-4] +.w32_filter: mova m0, [base+z_filter_s1] vbroadcasti32x4 m1, [base+z_filter_s2] vbroadcasti32x4 m3, [base+z_filter_s3] @@ -1644,6 +1669,19 @@ cglobal ipred_z1_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx pmulhrsw m0, m15 pmulhrsw m1, m15 packuswb m7, m0, m1 + ret +.w32: + lea r3d, [hq+31] + vpbroadcastb m9, r3d + and r3d, 31 + pminub m10, m9, [pb_0to63] + or r3d, 32 ; imin(h+31, 63) + vpermb m7, m10, [tlq] + vpbroadcastb m8, [tlq+r3] + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w32_main + vpbroadcastd m2, [tlq-4] + call .w32_filter cmp hd, 64 je .w32_h64_filter_end vpermb m8, m9, m7 @@ -1698,20 +1736,9 @@ cglobal ipred_z1_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx jg .w32_end_loop .w32_end: RET -.w64: - lea r3d, [hq-1] - movu m7, [tlq+64*0] - vpbroadcastb m13, r3d - pminub m12, m13, [pb_0to63] - or r3d, 64 - vpermb m8, m12, [tlq+64*1] - test angled, 0x400 ; !enable_intra_edge_filter - jnz .w64_main - movu m0, [tlq+56] +.w64_filter: vbroadcasti32x4 m3, [base+z_filter_s2] - vpbroadcastd m2, [tlq-4] mova m1, [base+z_filter_s1] - movu m11, [tlq+8] pshufb m0, m3 ; al bl vpermi2b m1, m7, m2 vbroadcasti32x4 m4, [base+z_filter_s4] @@ -1745,6 +1772,20 @@ cglobal ipred_z1_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx packuswb m0, m2 packuswb m7, m1, m3 vpermb m8, m12, m0 + ret +.w64: + lea r3d, [hq-1] + movu m7, [tlq+64*0] + vpbroadcastb m13, r3d + pminub m12, m13, [pb_0to63] + or r3d, 64 + vpermb m8, m12, [tlq+64*1] + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w64_main + movu m0, [tlq+56] + vpbroadcastd m2, [tlq-4] + movu m11, [tlq+8] + call .w64_filter .w64_main: rorx r2d, dxd, 30 vpbroadcastd m4, [base+z_xpos_bc] @@ -1783,6 +1824,349 @@ cglobal ipred_z1_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx .w64_end: RET +cglobal ipred_z3_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy + lea r7, [z_filter_t0] + tzcnt wd, wm + movifnidn angled, anglem + lea t0, [dr_intra_derivative+45*2-1] + movsxd wq, [base+ipred_z3_8bpc_avx512icl_table+wq*4] + sub angled, 180 + mov dyd, angled + neg dyd + xor angled, 0x400 + or dyq, ~0x7e + mova m0, [base+pb_63to0] + movzx dyd, word [t0+dyq] + lea wq, [base+ipred_z3_8bpc_avx512icl_table+wq] + movifnidn hd, hm + mova m14, [base+z_frac_table] + shl dyd, 6 + vpbroadcastd m15, [base+pw_512] + jmp wq +.w4: + cmp angleb, 40 + jae .w4_no_upsample + lea r3d, [angleq-1024] + sar r3d, 7 + add r3d, hd + jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm) + lea r3d, [hq+4] + call .upsample + movshdup m1, [base+z_ypos_off1] 
+ vpbroadcastd m6, [base+pb_16] + jmp .w4_main2 +.w4_no_upsample: + lea r3d, [hq+3] + vpbroadcastb m9, r3d + vpxord m1, m9, [base+pb_63] {1to16} ; 63 - (h + 4) + pmaxub m1, m0 + vpermb m7, m1, [tlq-64*1] + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w4_main + vpbroadcastb xm1, angled + shr angled, 8 + vpcmpeqb k1, xm9, [base+z_filter_wh] + vpbroadcastd m2, [tlq-3] + vpcmpgtb k1{k1}, xm1, [base+z_filter_t0+angleq*8] + kmovw r5d, k1 + test r5d, r5d + jz .w4_main + pminub m9, [pb_0to63] + call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w8_filter + vpermb m7, m9, m0 +.w4_main: + movsldup m1, [base+z_ypos_off1] + vpbroadcastd m6, [base+pb_8] +.w4_main2: + vpbroadcastw m0, dyd + vpbroadcastq m2, [base+z_ypos_mul1] ; 1..4 + pmulhuw m2, m0 ; ypos >> 1 + lea r2, [strideq*3] + vpermw m3, m2, m14 ; 64-frac, frac + psrlw m2, 5 + packsswb m2, m2 + punpcklbw m2, m2 + paddsb m2, m1 ; base, base+1 +.w4_loop: + vpermb m0, m2, m7 + pmaddubsw m0, m3 + paddsb m2, m6 + pmulhrsw m0, m15 + vpmovwb ym0, m0 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+r2 ], xm0, 3 + sub hd, 8 + jl .w4_end + vextracti32x4 xm0, ym0, 1 + lea dstq, [dstq+strideq*4] + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+r2 ], xm0, 3 + lea dstq, [dstq+strideq*4] + jg .w4_loop +.w4_end: + RET +.upsample: + xor r3d, 31 ; 31 - (h + imin(w, h)) + vbroadcasti32x4 ym0, [base+z_xpos_off2a] + vpbroadcastb ym7, r3d + pmaxub ym7, [base+z3_upsample] + vbroadcasti32x4 ym1, [base+z_filter_s4] + vpermb ym7, ym7, [tlq-31] + vpbroadcastd ym2, [base+pb_m4_36] + pshufb ym0, ym7, ym0 + psrldq ym7, 1 + pshufb ym1, ym7, ym1 + pmaddubsw ym0, ym2 + pmaddubsw ym1, ym2 + add dyd, dyd + paddw ym0, ym1 + pmulhrsw ym0, ym15 + packuswb ym0, ym0 + punpcklbw ym7, ym0 + ret +.w8: + lea r3d, [angleq+216] + mov r3b, hb + cmp r3d, 8 + ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 + lea r3d, [hq*2] + call .upsample + pshufd m1, [base+z_ypos_off1], q0000 + vpbroadcastd m6, [base+pb_8] + jmp .w8_main2 +.w8_no_upsample: + mov r3d, 8 + cmp hd, 4 + cmove r3d, hd + lea r3d, [r3+hq-1] + xor r3d, 63 ; 63 - (h + imin(w, h)) + vpbroadcastb m1, wd + pmaxub m1, m0 + vpermb m7, m1, [tlq-64*1] + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w8_main + lea r3d, [hq+7] + call .filter_strength + test r5d, r5d + jz .w8_main + call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w16_filter + vpermb m7, m10, m0 +.w8_main: + movsldup m1, [base+z_ypos_off2] + vpbroadcastd m6, [base+pb_4] +.w8_main2: + vpbroadcastw m0, dyd + vbroadcasti32x4 m2, [base+z_ypos_mul1] ; 1..8 + pmulhuw m2, m0 ; ypos >> 1 + lea r2, [strideq*3] + vpermw m3, m2, m14 ; 64-frac, frac + psrlw m2, 5 + packsswb m2, m2 + punpcklbw m2, m2 + paddsb m2, m1 ; base, base+1 +.w8_loop: + vpermb m0, m2, m7 + pmaddubsw m0, m3 + paddsb m2, m6 + pmulhrsw m0, m15 + vpmovwb ym0, m0 + vextracti32x4 xm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r2 ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_loop + RET +.filter_strength: + vpbroadcastd m2, [tlq-3] +.filter_strength2: + vpbroadcastb m9, r3d + vpbroadcastb ym1, angled + shr angled, 8 + vpcmpeqb k1, ym9, [base+z_filter_wh] + mova xm0, [base+z_filter_t0+angleq*8] + vpcmpgtb k1{k1}, ym1, ym0 + pminub m10, m9, [pb_0to63] + kmovd r5d, k1 + ret +.w16_load: + cmp r3d, hd + cmovae r3d, hd + add r3d, hd + mova m7, [tlq-64*1] + neg r3d ; -(h + 
imin(w, h)) + and r3d, 63 + vpbroadcastb m1, r3d + pmaxub m2, m0, m1 + cmp hd, 64 + je .w16_load_h64 + vpermb m8, m1, m7 + vpermb m7, m2, m7 + ret +.w16_load_h64: + vpermb m7, m0, m7 + vpermb m8, m2, [tlq-64*2] + ret +.w16: + mov r3d, 16 + call .w16_load + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w16_main + vpbroadcastd m2, [tlq-3] + cmp hd, 64 + je .w16_filter64 + lea r3d, [hq+15] + call .filter_strength2 + test r5d, r5d + jz .w16_main + call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w16_filter + pminub m10, m9, [pb_0to63] + vpermb m8, m9, m0 + vpermb m7, m10, m0 + jmp .w16_main +.w16_filter64: + vpbroadcastd m13, [base+pb_15] + valignq m0, m8, m7, 7 + pminub m12, m13, [pb_0to63] + valignq m11, m8, m7, 1 + call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter +.w16_main: + vpbroadcastd m6, [base+pb_4] + vpbroadcastw m0, dyd + vbroadcasti32x4 m3, [base+z_ypos_mul1] ; 1.. 8 + vbroadcasti32x4 m2, [base+z_ypos_mul2] ; 9..15 + pmulhuw m3, m0 ; ypos >> 1 + pmulhuw m2, m0 + movshdup m0, [base+z_ypos_off2] + lea r2, [strideq*3] + vpbroadcastd m1, [base+pb_1] + vpermw m4, m3, m14 ; 64-frac, frac + psrlw m3, 5 + vpermw m5, m2, m14 + psrlw m2, 5 + packsswb m3, m2 + paddsb m3, m0 + paddsb m1, m3 + punpcklbw m2, m3, m1 ; base, base+1 + punpckhbw m3, m1 +.w16_loop: +%macro Z3_PERM2 0 + mova m0, m7 + vpermt2b m0, m2, m8 + mova m1, m7 + vpermt2b m1, m3, m8 + pmaddubsw m0, m4 + pmaddubsw m1, m5 + paddsb m2, m6 + paddsb m3, m6 + pmulhrsw m0, m15 + pmulhrsw m1, m15 + packuswb m0, m1 +%endmacro + Z3_PERM2 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+r2 ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16_loop + RET +.w32: + mov r3d, 32 + call .w16_load + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w32_main + vpbroadcastd m2, [tlq-3] + cmp hd, 64 + je .w32_filter64 + lea r3d, [hq+31] + vpbroadcastb m9, r3d + call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w32_filter + vpermb m8, m9, m7 + jmp .w32_main +.w32_filter64: + vpbroadcastd m13, [base+pb_31] + valignq m0, m8, m7, 7 + pminub m12, m13, [pb_0to63] + valignq m11, m8, m7, 1 + call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter +.w32_main: + vbroadcasti32x8 m3, [base+z_ypos_mul1] ; 1.. 
8 + vpbroadcastw m0, dyd + vbroadcasti32x8 m2, [base+z_ypos_mul2] ; 9..15 + vpbroadcastd m1, [base+pb_1] + pmulhuw m3, m0 ; ypos >> 1 + pmulhuw m2, m0 + vpbroadcastd m6, [base+pb_2] + mova ym0, ym1 + vpermw m4, m3, m14 ; 64-frac, frac + psrlw m3, 5 + vpermw m5, m2, m14 + psrlw m2, 5 + packsswb m3, m2 + paddsb m3, m0 + paddsb m1, m3 + punpcklbw m2, m3, m1 ; base, base+1 + punpckhbw m3, m1 +.w32_loop: + Z3_PERM2 + vextracti32x8 [dstq+strideq*0], m0, 1 + mova [dstq+strideq*1], ym0 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_loop + RET +.w64: + mova m7, [tlq-64*1] + cmp hd, 64 + je .w64_h64 + lea r3d, [hq*2-1] + xor r3d, 63 ; -(h + imin(w, h)) & 63 + vpbroadcastb m1, r3d + pmaxub m0, m1 + vpermb m8, m1, m7 + jmp .w64_filter +.w64_h64: + vpermb m8, m0, [tlq-64*2] +.w64_filter: + vpermb m7, m0, m7 + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w64_main + lea r3d, [hq-1] + vpbroadcastd m2, [tlq-3] + vpbroadcastb m13, r3d + valignq m0, m8, m7, 7 + pminub m12, m13, [pb_0to63] + valignq m11, m8, m7, 1 + call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter +.w64_main: + vpbroadcastw m2, dyd + pmulhuw m3, m2, [base+z_ypos_mul1] + pmulhuw m2, [base+z_ypos_mul2] + vpbroadcastd m6, [base+pb_1] + vpermw m4, m3, m14 ; 64-frac, frac + psrlw m3, 5 + vpermw m5, m2, m14 + psrlw m2, 5 + packsswb m3, m2 + paddsb m1, m3, m6 + punpcklbw m2, m3, m1 ; base, base+1 + punpckhbw m3, m1 +.w64_loop: + Z3_PERM2 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET + ; The ipred_filter code processes 4x2 blocks in the following order ; which increases parallelism compared to doing things row by row. ; Some redundant blocks are calculated for w > 4. -- cgit v1.2.3 From 48ef39592010b6aeb4c4c318c412714d283e1215 Mon Sep 17 00:00:00 2001 From: Matthias Dressel Date: Tue, 24 Oct 2023 20:27:33 +0200 Subject: CI: Update images --- .gitlab-ci.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6b80a35..2bca058 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -11,49 +11,49 @@ stages: - amd64 .debian-amd64-minimum: - image: registry.videolan.org/dav1d-debian-minimum:20230211045249 + image: registry.videolan.org/dav1d-debian-minimum:20231024033032 stage: build tags: - docker - amd64 .debian-llvm-mingw-common: - image: registry.videolan.org/vlc-debian-llvm-msvcrt:20230212072216 + image: registry.videolan.org/vlc-debian-llvm-msvcrt:20231024033032 stage: build tags: - docker - amd64 .debian-aarch64-common: - image: registry.videolan.org/dav1d-debian-bullseye-aarch64:20230512061045 + image: registry.videolan.org/dav1d-debian-bookworm-aarch64:20231018041418 stage: build tags: - docker - aarch64 .debian-armv7-common: - image: registry.videolan.org/dav1d-debian-bullseye-armv7:20230513182209 + image: registry.videolan.org/dav1d-debian-bookworm-armv7:20231018042237 stage: build tags: - docker - armv7 .debian-ppc64le-common: - image: registry.videolan.org/dav1d-debian-unstable-ppc64le:20230211050439 + image: registry.videolan.org/dav1d-debian-unstable-ppc64le:20231020040221 stage: build tags: - docker - ppc64le .android-common: - image: registry.videolan.org/vlc-debian-android:20230212071537 + image: registry.videolan.org/vlc-debian-android:20231013040434 stage: build tags: - docker - amd64 .debian-wasm-emscripten-common: - image: registry.videolan.org/vlc-debian-wasm-emscripten:20221213104631 + image: registry.videolan.org/vlc-debian-wasm-emscripten:20231024033032 stage: build tags: - docker -- cgit v1.2.3 From 
9dbf46285d9c3fdbb920d1824205ae686a6a0d22 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Wed, 1 Nov 2023 13:55:57 +0100 Subject: ci: Fix test-debian-asan running checkasm with non-existing arguments --- .gitlab-ci.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2bca058..b59cf8c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -570,8 +570,9 @@ test-debian-asan: - ninja -C build - cd build - exit_code=0 - - time meson test -v --setup=sanitizer --test-args "--cpumask 0" || exit_code=$((exit_code + $?)) - - time meson test -v --setup=sanitizer --test-args "--cpumask 0xff" || exit_code=$((exit_code + $?)) + - time meson test -v --setup=sanitizer --suite checkasm || exit_code=$((exit_code + $?)) + - time meson test -v --setup=sanitizer --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask 0" || exit_code=$((exit_code + $?)) + - time meson test -v --setup=sanitizer --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask 0xff" || exit_code=$((exit_code + $?)) - if [ $exit_code -ne 0 ]; then exit $exit_code; fi test-debian-msan: -- cgit v1.2.3 From 0f2a877e7eb0801d8b0866ca0bc4a51600c1257e Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Mon, 30 Oct 2023 13:53:52 +0100 Subject: checkasm: Check for errors in command line parsing --- tests/checkasm/checkasm.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 682cc43..4f66b1d 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -26,6 +26,7 @@ */ #include "tests/checkasm/checkasm.h" +#include #include #include #include @@ -567,6 +568,13 @@ static unsigned get_seed(void) { #endif } +static int checkasm_strtoul(unsigned long *const dst, const char *const str, const int base) { + char *end; + errno = 0; + *dst = strtoul(str, &end, base); + return errno || end == str || *end; +} + int main(int argc, char *argv[]) { state.seed = get_seed(); @@ -612,7 +620,12 @@ int main(int argc, char *argv[]) { } else if (!strcmp(argv[1], "--verbose") || !strcmp(argv[1], "-v")) { state.verbose = 1; } else if (!strncmp(argv[1], "--affinity=", 11)) { - unsigned long affinity = strtoul(argv[1] + 11, NULL, 16); + const char *const s = argv[1] + 11; + unsigned long affinity; + if (checkasm_strtoul(&affinity, s, 16)) { + fprintf(stderr, "checkasm: invalid cpu affinity (%s)\n", s); + return 1; + } #ifdef _WIN32 BOOL (WINAPI *spdcs)(HANDLE, const ULONG*, ULONG) = (void*)GetProcAddress(GetModuleHandleW(L"kernel32.dll"), "SetProcessDefaultCpuSets"); @@ -649,7 +662,12 @@ int main(int argc, char *argv[]) { return 1; #endif } else { - state.seed = (unsigned) strtoul(argv[1], NULL, 10); + unsigned long seed; + if (checkasm_strtoul(&seed, argv[1], 10)) { + fprintf(stderr, "checkasm: unknown option (%s)\n", argv[1]); + return 1; + } + state.seed = (unsigned)seed; } argc--; -- cgit v1.2.3 From 6bc552eb285ebfabf47f97bd51d91e72d02db564 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Wed, 1 Nov 2023 13:13:59 +0100 Subject: checkasm: Enable virtual terminal processing on Windows This allows for the use of standard VT100 escape codes for text coloring, which simplifies things by eliminating a bunch of Windows-specific code. This is only supported since Windows 10. Things will still run on older systems, just without colored text output. 
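The gist, as a minimal standalone sketch rather than the actual checkasm
code (error handling and the POSIX/isatty path omitted): enable the flag
once on the console handle, after which ordinary VT100 escape sequences
work on Windows 10+ and color is silently disabled on older consoles.

    #include <stdio.h>
    #ifdef _WIN32
    #include <windows.h>
    #ifndef ENABLE_VIRTUAL_TERMINAL_PROCESSING
    #define ENABLE_VIRTUAL_TERMINAL_PROCESSING 0x04
    #endif
    #endif

    int main(void) {
        int use_color = 0;
    #ifdef _WIN32
        HANDLE con = GetStdHandle(STD_ERROR_HANDLE);
        DWORD mode = 0;
        /* SetConsoleMode() fails on pre-Windows 10 consoles, so the
         * fallback is plain uncolored output. */
        use_color = con && con != INVALID_HANDLE_VALUE &&
                    GetConsoleMode(con, &mode) &&
                    SetConsoleMode(con, mode | ENABLE_VIRTUAL_TERMINAL_PROCESSING);
    #else
        use_color = 1; /* assume a capable terminal for this sketch */
    #endif
        if (use_color)
            fprintf(stderr, "\x1b[0;32mok\x1b[0m\n"); /* green "ok", then reset */
        else
            fprintf(stderr, "ok\n");
        return 0;
    }
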
--- tests/checkasm/checkasm.c | 59 ++++++++++++++++------------------------------- 1 file changed, 20 insertions(+), 39 deletions(-) diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 4f66b1d..b1e09c2 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -36,9 +36,9 @@ #ifdef _WIN32 #include -#define COLOR_RED FOREGROUND_RED -#define COLOR_GREEN FOREGROUND_GREEN -#define COLOR_YELLOW (FOREGROUND_RED|FOREGROUND_GREEN) +#ifndef ENABLE_VIRTUAL_TERMINAL_PROCESSING +#define ENABLE_VIRTUAL_TERMINAL_PROCESSING 0x04 +#endif #else #include #include @@ -50,11 +50,12 @@ #ifdef __APPLE__ #include #endif -#define COLOR_RED 1 -#define COLOR_GREEN 2 -#define COLOR_YELLOW 3 #endif +#define COLOR_RED 31 +#define COLOR_GREEN 32 +#define COLOR_YELLOW 33 + /* List of tests to invoke */ static const struct { const char *name; @@ -242,48 +243,19 @@ int float_near_abs_eps_array_ulp(const float *const a, const float *const b, } /* Print colored text to stderr if the terminal supports it */ +static int use_printf_color; static void color_printf(const int color, const char *const fmt, ...) { - static int8_t use_color = -1; va_list arg; -#ifdef _WIN32 - static HANDLE con; - static WORD org_attributes; - - if (use_color < 0) { - CONSOLE_SCREEN_BUFFER_INFO con_info; - con = GetStdHandle(STD_ERROR_HANDLE); - if (con && con != INVALID_HANDLE_VALUE && - GetConsoleScreenBufferInfo(con, &con_info)) - { - org_attributes = con_info.wAttributes; - use_color = 1; - } else - use_color = 0; - } - if (use_color) - SetConsoleTextAttribute(con, (org_attributes & 0xfff0) | - (color & 0x0f)); -#else - if (use_color < 0) { - const char *const term = getenv("TERM"); - use_color = term && strcmp(term, "dumb") && isatty(2); - } - if (use_color) - fprintf(stderr, "\x1b[%d;3%dm", (color & 0x08) >> 3, color & 0x07); -#endif + if (use_printf_color) + fprintf(stderr, "\x1b[0;%dm", color); va_start(arg, fmt); vfprintf(stderr, fmt, arg); va_end(arg); - if (use_color) { -#ifdef _WIN32 - SetConsoleTextAttribute(con, org_attributes); -#else + if (use_printf_color) fprintf(stderr, "\x1b[0m"); -#endif - } } /* Deallocate a tree */ @@ -684,6 +656,12 @@ int main(int argc, char *argv[]) { #ifdef _WIN32 #if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) AddVectoredExceptionHandler(0, signal_handler); + + HANDLE con = GetStdHandle(STD_ERROR_HANDLE); + DWORD con_mode = 0; + use_printf_color = con && con != INVALID_HANDLE_VALUE && + GetConsoleMode(con, &con_mode) && + SetConsoleMode(con, con_mode | ENABLE_VIRTUAL_TERMINAL_PROCESSING); #endif #else const struct sigaction sa = { @@ -694,6 +672,9 @@ int main(int argc, char *argv[]) { sigaction(SIGFPE, &sa, NULL); sigaction(SIGILL, &sa, NULL); sigaction(SIGSEGV, &sa, NULL); + + const char *const term = getenv("TERM"); + use_printf_color = term && strcmp(term, "dumb") && isatty(2); #endif #ifdef readtime -- cgit v1.2.3 From 611abc20db1c131cf48afdbac9b6b8723d8a9095 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Mon, 30 Oct 2023 13:33:47 +0100 Subject: checkasm: Add missing WINAPI_PARTITION checks on Windows Some functionality is only available on WINAPI_PARTITION_DESKTOP systems. 
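As a minimal sketch of the guard idiom (assuming a Windows toolchain; this
is an illustrative example, not code taken from checkasm), the partition is
probed at compile time and desktop-only calls are compiled out elsewhere:

    /* Hypothetical example: compile-time partition check. */
    #include <stdio.h>
    #include <winapifamily.h>

    int main(void) {
    #if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
        /* Desktop-only APIs such as AddVectoredExceptionHandler() or
         * SetProcessAffinityMask() may be referenced in this branch. */
        puts("desktop partition");
    #else
        puts("non-desktop partition: desktop-only calls are stubbed out");
    #endif
        return 0;
    }
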
--- tests/checkasm/checkasm.c | 13 +++++++++---- tests/checkasm/checkasm.h | 6 ++++++ 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index b1e09c2..24db86e 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -435,6 +435,7 @@ checkasm_context checkasm_context_buf; /* Crash handling: attempt to catch crashes and handle them * gracefully instead of just aborting abruptly. */ #ifdef _WIN32 +#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) static LONG NTAPI signal_handler(EXCEPTION_POINTERS *const e) { if (!state.catch_signals) return EXCEPTION_CONTINUE_SEARCH; @@ -464,6 +465,7 @@ static LONG NTAPI signal_handler(EXCEPTION_POINTERS *const e) { checkasm_load_context(); return EXCEPTION_CONTINUE_EXECUTION; /* never reached, but shuts up gcc */ } +#endif #else static void signal_handler(const int s) { if (state.catch_signals) { @@ -599,13 +601,16 @@ int main(int argc, char *argv[]) { return 1; } #ifdef _WIN32 + int affinity_err; + HANDLE process = GetCurrentProcess(); +#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) BOOL (WINAPI *spdcs)(HANDLE, const ULONG*, ULONG) = (void*)GetProcAddress(GetModuleHandleW(L"kernel32.dll"), "SetProcessDefaultCpuSets"); - HANDLE process = GetCurrentProcess(); - int affinity_err; - if (spdcs) { + if (spdcs) affinity_err = !spdcs(process, (ULONG[]){ affinity + 256 }, 1); - } else { + else +#endif + { if (affinity < sizeof(DWORD_PTR) * 8) affinity_err = !SetProcessAffinityMask(process, (DWORD_PTR)1 << affinity); else diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index 67a2e42..269ce59 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -37,10 +37,16 @@ /* setjmp/longjmp on 64-bit Windows will try to use SEH to unwind the stack, * which doesn't work for assembly functions without unwind information. */ #include +#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) #define checkasm_context CONTEXT #define checkasm_save_context() RtlCaptureContext(&checkasm_context_buf) #define checkasm_load_context() RtlRestoreContext(&checkasm_context_buf, NULL) #else +#define checkasm_context void* +#define checkasm_save_context() do {} while (0) +#define checkasm_load_context() do {} while (0) +#endif +#else #include #define checkasm_context jmp_buf #define checkasm_save_context() setjmp(checkasm_context_buf) -- cgit v1.2.3 From d2ee43892b5ee667088fc1768cf3a7f48b6dc896 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Wed, 1 Nov 2023 12:34:33 +0100 Subject: checkasm: Improve DSP trimming error message --- tests/checkasm/checkasm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 24db86e..71a9334 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -652,7 +652,7 @@ int main(int argc, char *argv[]) { } #if TRIM_DSP_FUNCTIONS - fprintf(stderr, "checkasm: reference functions unavailable\n"); + fprintf(stderr, "checkasm: reference functions unavailable, reconfigure using '-Dtrim_dsp=false'\n"); return 0; #endif -- cgit v1.2.3 From 2179b30c84571ae5a4ecfe60821b2dd0050f355f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Wed, 1 Nov 2023 19:28:07 +0200 Subject: checkasm: Fix catching crashes on Windows on ARM longjmp on Windows uses SEH to unwind on ARM/ARM64 too, just like on x86_64, thus use RtlCaptureContext/RtlRestoreContext instead of setjmp/longjmp on those architectures as well. 
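For illustration only, a simplified sketch of the capture/restore pattern
(not the checkasm macros themselves; x86_32 keeps using setjmp/longjmp as
before): capture a CONTEXT up front, and have the exception handler restore
it so execution resumes past the capture point without trying to unwind
through assembly frames that lack unwind information.

    #include <windows.h>
    #include <stdio.h>

    static CONTEXT saved_ctx;
    static volatile int armed, crashed; /* volatile: survives the register rollback */

    static LONG NTAPI handler(EXCEPTION_POINTERS *e) {
        (void)e;
        if (!armed)
            return EXCEPTION_CONTINUE_SEARCH;
        armed = 0;
        crashed = 1;
        RtlRestoreContext(&saved_ctx, NULL);  /* resumes after RtlCaptureContext() */
        return EXCEPTION_CONTINUE_EXECUTION;  /* never reached */
    }

    int main(void) {
        AddVectoredExceptionHandler(0, handler);
        RtlCaptureContext(&saved_ctx); /* execution also resumes here after a crash */
        if (crashed) {
            puts("recovered from crash");
            return 0;
        }
        armed = 1;
        *(volatile int *)0 = 0; /* deliberate access violation */
        return 1;
    }
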
--- tests/checkasm/checkasm.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index 269ce59..562960a 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -33,9 +33,10 @@ #include #include -#if ARCH_X86_64 && defined(_WIN32) -/* setjmp/longjmp on 64-bit Windows will try to use SEH to unwind the stack, - * which doesn't work for assembly functions without unwind information. */ +#if !ARCH_X86_32 && defined(_WIN32) +/* setjmp/longjmp on Windows on architectures using SEH (all except x86_32) + * will try to use SEH to unwind the stack, which doesn't work for assembly + * functions without unwind information. */ #include #if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) #define checkasm_context CONTEXT -- cgit v1.2.3 From e47a39ca951b523824424cd20719f93da2b7da9f Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Sun, 12 Nov 2023 12:24:07 +0100 Subject: x86: Fix 8bpc AVX2 ipred_z2 filtering with extremely large frame sizes The max_width/max_height values can exceed 16-bit range. --- src/x86/ipred_avx2.asm | 56 +++++++++++++++++++++++++------------------------- tests/checkasm/ipred.c | 15 +++++++++++--- 2 files changed, 40 insertions(+), 31 deletions(-) diff --git a/src/x86/ipred_avx2.asm b/src/x86/ipred_avx2.asm index 95802c7..58e4093 100644 --- a/src/x86/ipred_avx2.asm +++ b/src/x86/ipred_avx2.asm @@ -2275,14 +2275,14 @@ ALIGN function_align vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2] punpckhqdq xm3, xm3 ; 34 44 44 44 pmaddubsw xm3, xm4 - movd xm4, r6m ; max_width - pminsw xm4, xm15 - vpbroadcastb xm4, xm4 + vpbroadcastd xm4, r6m ; max_width + packssdw xm4, xm4 paddw xm0, xm2 paddw xm0, xm3 pmulhrsw xm0, xm13 - psubb xm4, [base+pb_1to32] + packsswb xm4, xm4 psrlq xm1, 8 + psubb xm4, [base+pb_1to32] packuswb xm0, xm0 vpblendvb xm0, xm1, xm4 movd [rsp+65], xm0 @@ -2324,14 +2324,14 @@ ALIGN function_align vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*2] pshufb m2, m4 pmaddubsw m2, m3 - movd xm4, r7m ; max_height - pminsw xm4, xm15 - vpbroadcastb xm4, xm4 - psubb xm4, [base+pb_16to1] + vpbroadcastd xm4, r7m ; max_height + packssdw xm4, xm4 paddw m1, m0 paddw m1, m2 pmulhrsw m1, m13 + packsswb xm4, xm4 vextracti128 xm0, m1, 1 + psubb xm4, [base+pb_16to1] packuswb xm0, xm1 vpblendvb xm0, [rsp+48], xm4 mova [rsp+48], xm0 @@ -2465,14 +2465,14 @@ ALIGN function_align pmaddubsw xm2, xm4 vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2] pmaddubsw xm3, xm4 - movd xm4, r6m ; max_width - pminuw xm4, xm15 - vpbroadcastb xm4, xm4 + vpbroadcastd xm4, r6m ; max_width + packssdw xm4, xm4 paddw xm0, xm2 paddw xm0, xm3 pmulhrsw xm0, xm13 - psubb xm4, [base+pb_1to32] + packsswb xm4, xm4 psrldq xm1, 1 + psubb xm4, [base+pb_1to32] packuswb xm0, xm0 vpblendvb xm0, xm1, xm4 movq [rsp+65], xm0 @@ -2530,14 +2530,14 @@ ALIGN function_align vinserti128 m2, [rsp+43], 1 pshufb m0, m2, m0 pmaddubsw m0, m7 - movd xm7, r7m ; max_height + vpbroadcastd m7, r7m ; max_height pshufb m1, m2, m1 pmaddubsw m1, m8 pshufb m2, m4 pmaddubsw m2, m9 - pminsw xm7, xm15 + packssdw m7, m7 paddw m1, m0 - vpbroadcastb m7, xm7 + packsswb m7, m7 paddw m1, m2 pmulhrsw m1, m13 psubb m7, [base+pb_32to1] @@ -2679,14 +2679,14 @@ ALIGN function_align shufps m2, m1, q2121 ; 12 23 34 45 56 67 78 89 89 9a ab bc cd de ef ff pmaddubsw m2, m4 pmaddubsw m1, m5 - movd xm4, r6m ; max_width - pminsw xm4, xm15 - vpbroadcastb xm4, xm4 + vpbroadcastd xm4, r6m ; max_width + packssdw xm4, xm4 paddw m0, m2 paddw m0, m1 pmulhrsw m0, m13 - psubb xm4, [base+pb_1to32] + 
packsswb xm4, xm4 vextracti128 xm2, m0, 1 + psubb xm4, [base+pb_1to32] packuswb xm0, xm2 vpblendvb xm0, xm6, xm4 movu [rsp+65], xm0 @@ -2703,9 +2703,9 @@ ALIGN function_align vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1] vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2] .w16_filter_left: - movd xm6, r7m ; max_height - pminsw xm6, xm15 - vpbroadcastb m6, xm6 + vpbroadcastd m6, r7m ; max_height + packssdw m6, m6 + packsswb m6, m6 cmp hd, 32 jl .w16_filter_left_h16 vpbroadcastd xm0, [base+pb_5] @@ -2916,9 +2916,9 @@ ALIGN function_align vinserti128 m6, [base+z_filter_s+22], 1 ; 56 67 78 89 9a ab bc cd ab bc cd de ef ff ff ff movu xm3, [tlq+ 6] vinserti128 m3, [tlq+17], 1 - movd xm0, r6m ; max_width - pminsw xm0, xm15 - vpbroadcastb m10, xm0 + vpbroadcastd m10, r6m ; max_width + packssdw m10, m10 + packsswb m10, m10 .w32_filter_above: pshufb m0, m1, m5 shufps m4, m5, m6, q1021 ; 12 23 34 45 56 67 78 89 67 78 89 9a ab bc cd de @@ -2974,20 +2974,20 @@ ALIGN function_align paddw m0, m3 movu xm2, [tlq+36] vinserti128 m2, [tlq+49], 1 + vpbroadcastd m10, r6m ; max_width pshufb m4, m2, m4 pmaddubsw m4, m7 pshufb m3, m2, m6 pmaddubsw m3, m8 pshufb m2, m5 pmaddubsw m2, m9 - movd xm5, r6m ; max_width - pminsw xm5, xm15 - vpbroadcastb m10, xm5 + packssdw m10, m10 paddw m3, m4 paddw m2, m3 vpbroadcastd m3, [base+pb_32] pmulhrsw m0, m13 pmulhrsw m2, m13 + packsswb m10, m10 mova xm5, [base+z_filter_s] vinserti128 m5, [base+z_filter_s+6], 1 psubb m3, m10, m3 diff --git a/tests/checkasm/ipred.c b/tests/checkasm/ipred.c index 946ce73..ad54f1b 100644 --- a/tests/checkasm/ipred.c +++ b/tests/checkasm/ipred.c @@ -65,6 +65,16 @@ static const uint8_t z_angles[27] = { 81, 84, 87 }; +/* Generate max_width/max_height values that covers all edge cases */ +static int gen_z2_max_wh(const int sz) { + const int n = rnd(); + if (n & (1 << 17)) /* edge block */ + return (n & (sz - 1)) + 1; + if (n & (1 << 16)) /* max size, exceeds uint16_t */ + return 65536; + return (n & 65535) + 1; +} + static void check_intra_pred(Dav1dIntraPredDSPContext *const c) { PIXEL_RECT(c_dst, 64, 64); PIXEL_RECT(a_dst, 64, 64); @@ -98,9 +108,8 @@ static void check_intra_pred(Dav1dIntraPredDSPContext *const c) { a = (90 * (mode - Z1_PRED) + z_angles[rnd() % 27]) | (rnd() & 0x600); if (mode == Z2_PRED) { - maxw = rnd(), maxh = rnd(); - maxw = 1 + (maxw & (maxw & 4096 ? 4095 : w - 1)); - maxh = 1 + (maxh & (maxh & 4096 ? 
4095 : h - 1)); + maxw = gen_z2_max_wh(w); + maxh = gen_z2_max_wh(h); } } else if (mode == FILTER_PRED) /* filter_idx */ a = (rnd() % 5) | (rnd() & ~511); -- cgit v1.2.3 From 3c41fa88ce0fee1fcd1cdfdf53ad8db9bcf3ad29 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Mon, 13 Nov 2023 13:05:58 +0100 Subject: x86: Add 8-bit ipred z2 AVX-512 (Ice Lake) asm --- src/x86/ipred.h | 1 + src/x86/ipred_avx512.asm | 748 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 738 insertions(+), 11 deletions(-) diff --git a/src/x86/ipred.h b/src/x86/ipred.h index e290c87..1997588 100644 --- a/src/x86/ipred.h +++ b/src/x86/ipred.h @@ -138,6 +138,7 @@ static ALWAYS_INLINE void intra_pred_dsp_init_x86(Dav1dIntraPredDSPContext *cons init_angular_ipred_fn(HOR_PRED, ipred_h, avx512icl); init_angular_ipred_fn(VERT_PRED, ipred_v, avx512icl); init_angular_ipred_fn(Z1_PRED, ipred_z1, avx512icl); + init_angular_ipred_fn(Z2_PRED, ipred_z2, avx512icl); init_angular_ipred_fn(Z3_PRED, ipred_z3, avx512icl); #endif init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx512icl); diff --git a/src/x86/ipred_avx512.asm b/src/x86/ipred_avx512.asm index 7666e62..de953de 100644 --- a/src/x86/ipred_avx512.asm +++ b/src/x86/ipred_avx512.asm @@ -139,11 +139,19 @@ z_ypos_off2: db 64, 65, 64, 65, 0, 0, 0, 0, 64, 65, 64, 65, 0, 0, 0, 0 db 65, 66, 65, 66, 1, 1, 1, 1, 65, 66, 65, 66, 1, 1, 1, 1 db 66, 67, 66, 67, 2, 2, 2, 2, 66, 67, 66, 67, 2, 2, 2, 2 db 67, 68, 67, 68, 3, 3, 3, 3, 67, 68, 67, 68, 3, 3, 3, 3 -z_ypos_mul1: dw 1*512, 2*512, 3*512, 4*512, 5*512, 6*512, 7*512, 8*512 +z_ypos_off3: db 1, 2, 1, 2, 1, 1, 1, 1, 3, 4, 3, 4, 1, 1, 1, 1 + db 5, 6, 5, 6, 3, 3, 3, 3, 7, 8, 7, 8, 3, 3, 3, 3 + db 9, 10, 9, 10, 5, 5, 5, 5, 11, 12, 11, 12, 5, 5, 5, 5 + db 13, 14, 13, 14, 7, 7, 7, 7, 15, 16, 15, 16, 7, 7, 7, 7 +z_ypos_mul1a: dw 1, 2, 3, 4, 5, 6, 7, 8, 17, 18, 19, 20, 21, 22, 23, 24 + dw 33, 34, 35, 36, 37, 38, 39, 40, 49, 50, 51, 52, 53, 54, 55, 56 +z_ypos_mul1b: dw 9, 10, 11, 12, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32 + dw 41, 42, 43, 44, 45, 46, 47, 48, 57, 58, 59, 60, 61, 62, 63, 64 +z_ypos_mul2a: dw 1*512, 2*512, 3*512, 4*512, 5*512, 6*512, 7*512, 8*512 dw 17*512, 18*512, 19*512, 20*512, 21*512, 22*512, 23*512, 24*512 dw 33*512, 34*512, 35*512, 36*512, 37*512, 38*512, 39*512, 40*512 dw 49*512, 50*512, 51*512, 52*512, 53*512, 54*512, 55*512, 56*512 -z_ypos_mul2: dw 9*512, 10*512, 11*512, 12*512, 13*512, 14*512, 15*512, 16*512 +z_ypos_mul2b: dw 9*512, 10*512, 11*512, 12*512, 13*512, 14*512, 15*512, 16*512 dw 25*512, 26*512, 27*512, 28*512, 29*512, 30*512, 31*512, 32*512 dw 41*512, 42*512, 43*512, 44*512, 45*512, 46*512, 47*512, 48*512 dw 57*512, 58*512, 59*512, 60*512, 61*512, 62*512, 63*512, 64*512 @@ -165,6 +173,7 @@ pb_15: times 4 db 15 pb_16: times 4 db 16 pb_31: times 4 db 31 pb_63: times 4 db 63 +pb_90: times 4 db 90 pb_128: times 4 db 128 pw_128: times 2 dw 128 pw_255: times 2 dw 255 @@ -198,6 +207,7 @@ JMP_TABLE ipred_smooth_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_v_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_z1_8bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z2_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_z3_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_dc_8bpc, avx512icl, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 @@ -1824,6 +1834,722 @@ cglobal ipred_z1_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx .w64_end: RET +cglobal ipred_z2_8bpc, 
3, 9, 18, dst, stride, tl, w, h, angle, dx, _, dy + tzcnt wd, wm + movifnidn angled, anglem + lea dxq, [dr_intra_derivative-90] + movzx dyd, angleb + xor angled, 0x400 + mov r7, dxq + sub dxq, dyq + movifnidn hd, hm + and dyd, ~1 + and dxq, ~1 + movzx dyd, word [r7+dyq] ; angle - 90 + lea r7, [z_filter_t0] + movzx dxd, word [dxq+270] ; 180 - angle + movsxd wq, [base+ipred_z2_8bpc_avx512icl_table+wq*4] + mova m8, [base+pb_63to0] + neg dyd + vpermb m8, m8, [tlq-64] ; left + lea wq, [base+ipred_z2_8bpc_avx512icl_table+wq] + mova m14, [base+z_frac_table] + inc tlq + vpbroadcastd m15, [base+pw_512] + neg dxd + jmp wq +.w4: + movd xm7, [tlq] + vpbroadcastq m10, [base+z_xpos_off2a] + test angled, 0x400 + jnz .w4_main ; !enable_intra_edge_filter + lea r3d, [hq+2] + add angled, 1022 + shl r3d, 6 + test r3d, angled + jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) + vpbroadcastd xm2, [base+pb_4] + sub angled, 1075 ; angle - 53 + call .upsample_above + lea r3d, [hq+3] + vpbroadcastq m10, [pb_0to63+1] + punpcklbw xm7, xm0, xm7 + call .filter_strength + jmp .w4_filter_left +.w4_upsample_left: + call .upsample_left + movsldup m16, [base+z_ypos_off3] + vpbroadcastd m9, [base+pb_16] + punpcklbw xm8, xm0, xm8 + jmp .w4_main2 +.w4_no_upsample_above: + lea r3d, [hq+3] + sub angled, 1112 ; angle - 90 + call .filter_strength + test r3d, r3d + jz .w4_no_filter_above + vpbroadcastd xm5, [base+pb_3] + call .filter_top_w16 +.w4_no_filter_above: + lea r3d, [hq+2] + add angled, 973 ; angle + 883 + shl r3d, 6 + test r3d, angled + jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8) + vpbroadcastd ym0, [base+pb_90] + psubb ym0, ym17 + vpcmpgtb k2{k2}, ym0, ym16 + kmovd r3d, k2 +.w4_filter_left: + test r3d, r3d + jz .w4_main + popcnt r3d, r3d + call .filter_left_h16 +.w4_main: + movsldup m16, [base+z_ypos_off1] + vpbroadcastd m9, [base+pb_8] +.w4_main2: + vpbroadcastq m3, [base+z_ypos_mul1a] + vpbroadcastw m0, dyd + movsldup m1, [base+z_xpos_mul] + vpbroadcastw m5, dxd + vinserti32x4 m7, [tlq-16], 3 + vinserti32x4 m8, [tlq-16], 3 + pmullw m3, m0 + vbroadcasti32x4 m2, [base+z_xpos_bc] + pmullw m1, m5 ; xpos0..3 + psllw m5, 5 ; dx*8 + psraw m4, m3, 6 + psrlw m3, 1 + packsswb m4, m4 + vpermw m3, m3, m14 ; 64-frac, frac + punpcklbw m4, m4 + lea r2, [strideq*3] + paddb m4, m16 ; base, base+1 +.w4_loop: + pshufb m16, m1, m2 + psrlw m0, m1, 3 + paddb m16, m10 + vpermw m0, m0, m14 + vpmovw2m k1, m16 ; base_x < 0 + vpermb m16, m16, m7 + pmaddubsw m16, m0 + vpermb m0, m4, m8 + pmaddubsw m16{k1}, m0, m3 + pmulhrsw m16, m15 + vpmovwb ym16, m16 + movd [dstq+strideq*0], xm16 + pextrd [dstq+strideq*1], xm16, 1 + pextrd [dstq+strideq*2], xm16, 2 + pextrd [dstq+r2 ], xm16, 3 + sub hd, 8 + jl .w4_end + paddsw m1, m5 + vextracti128 xm16, ym16, 1 + lea dstq, [dstq+strideq*4] + paddb m4, m9 + movd [dstq+strideq*0], xm16 + pextrd [dstq+strideq*1], xm16, 1 + pextrd [dstq+strideq*2], xm16, 2 + pextrd [dstq+r2 ], xm16, 3 + lea dstq, [dstq+strideq*4] + jg .w4_loop +.w4_end: + RET +.upsample_above: ; w4/w8 + mova xm0, [tlq-1] + xor angled, 0x7f ; 180 - angle + add dxd, dxd + jmp .upsample +.upsample_left: ; h4/h8 + palignr xm0, xm8, [tlq-16], 15 + vpbroadcastb xm2, hd + add dyd, dyd +.upsample: + pshufb xm1, xm0, [base+z_filter4_s1] + pminub xm2, [base+z_filter_s4] + vpbroadcastd xm3, [base+pb_m4_36] + pshufb xm0, xm2 + pmaddubsw xm1, xm3 + pmaddubsw xm0, xm3 + paddw xm0, xm1 + pmulhrsw xm0, xm15 + packuswb xm0, xm0 + ret +.filter_strength: + vpbroadcastb ym16, r3d + mov r3d, angled + vpbroadcastd m2, [tlq-4] 
+ vpbroadcastb ym17, angled + shr r3d, 8 + vpcmpeqb k2, ym16, [base+z_filter_wh] + mova xm16, [base+z_filter_t0+r3*8] + vpcmpgtb k1{k2}, ym17, ym16 + mova m9, [pb_0to63] + kmovd r3d, k1 + ret +.w8: + movq xm7, [tlq] + vbroadcasti32x4 m10, [base+z_xpos_off2a] + test angled, 0x400 + jnz .w8_main + lea r3d, [angleq+126] + mov r3b, hb + cmp r3d, 8 + ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm + vpbroadcastd xm2, [base+pb_8] + sub angled, 53 ; angle - 53 + call .upsample_above + lea r3d, [hq+7] + vbroadcasti32x4 m10, [pb_0to63+1] + punpcklbw xm7, xm0, xm7 + call .filter_strength + jmp .w8_filter_left +.w8_upsample_left: + call .upsample_left + movshdup m16, [base+z_ypos_off3] + vpbroadcastd m9, [base+pb_8] + punpcklbw xm8, xm0, xm8 + jmp .w8_main2 +.w8_no_upsample_above: + lea r3d, [hq+7] + sub angled, 90 ; angle - 90 + call .filter_strength + test r3d, r3d + jz .w8_no_filter_above + vpbroadcastd xm5, [base+pb_7] + call .filter_top_w16 +.w8_no_filter_above: + lea r3d, [angleq-51] + mov r3b, hb + cmp r3d, 8 + jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm + vpbroadcastd ym0, [base+pb_90] + psubb ym0, ym17 + vpcmpgtb k2{k2}, ym0, ym16 + kmovd r3d, k2 +.w8_filter_left: + test r3d, r3d + jz .w8_main + cmp hd, 32 + je .w8_filter_left_h32 + popcnt r3d, r3d + call .filter_left_h16 + jmp .w8_main +.w8_filter_left_h32: + call .filter_left_h64 +.w8_main: + movshdup m16, [base+z_ypos_off2] + vpbroadcastd m9, [base+pb_4] +.w8_main2: + vbroadcasti32x4 m3, [base+z_ypos_mul1a] + vpbroadcastw m0, dyd + movshdup m1, [base+z_xpos_mul] + vpbroadcastw m5, dxd + vinserti32x4 m7, [tlq-16], 3 + vinserti32x4 m8, [tlq-16], 3 + pmullw m3, m0 + vpbroadcastd m2, [base+pb_1] + pmullw m1, m5 ; xpos0..3 + psllw m5, 4 ; dx*4 + psraw m4, m3, 6 + psrlw m3, 1 + packsswb m4, m4 + vpermw m3, m3, m14 ; 64-frac, frac + lea r3d, [dxq+(8<<6)] + paddsb m4, m16 + shl dxd, 2 + paddsb m0, m4, m2 + lea r2, [strideq*3] + punpcklbw m4, m0 ; base, base+1 +.w8_loop: + pshufb m16, m1, m2 + psrlw m0, m1, 3 + paddb m16, m10 + vpermw m0, m0, m14 + vpmovw2m k1, m16 ; base_x < 0 + vpermb m16, m16, m7 + pmaddubsw m16, m0 + vpermb m0, m4, m8 + pmaddubsw m16{k1}, m0, m3 + pmulhrsw m16, m15 + vpmovwb ym16, m16 + vextracti128 xm17, ym16, 1 + movq [dstq+strideq*0], xm16 + movhps [dstq+strideq*1], xm16 + movq [dstq+strideq*2], xm17 + movhps [dstq+r2 ], xm17 + sub hd, 4 + jz .w8_end + paddw m1, m5 + lea dstq, [dstq+strideq*4] + paddb m4, m9 + add r3d, dxd + jge .w8_loop +.w8_leftonly_loop: + vpermb m16, m4, m8 + pmaddubsw m16, m3 + paddb m4, m9 + pmulhrsw m16, m15 + vpmovwb ym16, m16 + vextracti128 xm17, ym16, 1 + movq [dstq+strideq*0], xm16 + movhps [dstq+strideq*1], xm16 + movq [dstq+strideq*2], xm17 + movhps [dstq+r2 ], xm17 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_leftonly_loop +.w8_end: + RET +.filter_top_w16: + mova xm0, [base+z_filter_s1] + popcnt r3d, r3d + pminub xm4, xm5, [base+z_filter_s4] + vpermi2b xm0, xm7, xm2 + pminub xm5, [base+z_filter_s5] + pshufb xm1, xm7, [base+z_filter_s2] + vpbroadcastd xm11, [base+z_filter_k+(r3-1)*4+12*0] + pshufb xm3, xm7, [base+z_filter_s3] + vpbroadcastd xm12, [base+z_filter_k+(r3-1)*4+12*1] + pshufb xm4, xm7, xm4 + vpbroadcastd xm13, [base+z_filter_k+(r3-1)*4+12*2] + pshufb xm5, xm7, xm5 + pmaddubsw xm0, xm11 + pmaddubsw xm1, xm11 + pmaddubsw xm6, xm3, xm12 + vpbroadcastd xm12, r7m ; max_width + pmaddubsw xm3, xm13 + pmaddubsw xm4, xm11 + pmaddubsw xm5, xm11 + packssdw xm12, xm12 + paddw xm0, xm6 + paddw xm1, xm3 + paddw xm0, xm4 + paddw xm1, xm5 + packsswb xm12, xm12 + pmulhrsw 
xm0, xm15 + pmulhrsw xm1, xm15 + vpcmpgtb k1, xm12, xm9 ; x < max_width + packuswb xm7{k1}, xm0, xm1 + ret +.filter_left_h16: + lea r5d, [hq-1] + mova xm0, [base+z_filter_s1] + vpbroadcastb xm5, r5d + vpermi2b xm0, xm8, xm2 + pminub xm4, xm5, [base+z_filter_s4] + pshufb xm1, xm8, [base+z_filter_s2] + pminub xm5, [base+z_filter_s5] + pshufb xm3, xm8, [base+z_filter_s3] + vpbroadcastd xm11, [base+z_filter_k+(r3-1)*4+12*0] + pshufb xm4, xm8, xm4 + vpbroadcastd xm12, [base+z_filter_k+(r3-1)*4+12*1] + pshufb xm5, xm8, xm5 + vpbroadcastd xm13, [base+z_filter_k+(r3-1)*4+12*2] + pmaddubsw xm0, xm11 + pmaddubsw xm1, xm11 + pmaddubsw xm6, xm3, xm12 + vpbroadcastd xm12, r8m ; max_height + pmaddubsw xm3, xm13 + pmaddubsw xm4, xm11 + pmaddubsw xm5, xm11 + packssdw xm12, xm12 + paddw xm0, xm6 + paddw xm1, xm3 + paddw xm0, xm4 + paddw xm1, xm5 + packsswb xm12, xm12 + pmulhrsw xm0, xm15 + pmulhrsw xm1, xm15 + vpcmpgtb k1, xm12, xm9 ; y < max_height + packuswb xm8{k1}, xm0, xm1 + ret +.w16: + movu xm7, [tlq] ; top + test angled, 0x400 + jnz .w16_main + lea r3d, [hq+15] + sub angled, 90 + call .filter_strength + test r3d, r3d + jz .w16_no_filter_above + vpbroadcastd xm5, [base+pb_15] + call .filter_top_w16 +.w16_no_filter_above: + cmp hd, 16 + jg .w16_filter_left_h64 + vpbroadcastd ym0, [base+pb_90] + psubb ym0, ym17 + vpcmpgtb k2{k2}, ym0, ym16 + kmovd r3d, k2 + test r3d, r3d + jz .w16_main + popcnt r3d, r3d + call .filter_left_h16 + jmp .w16_main +.w16_filter_left_h64: + call .filter_left_h64 +.w16_main: + vbroadcasti32x4 m6, [base+z_ypos_mul1a] ; 1.. 8 + vbroadcasti32x4 m5, [base+z_ypos_mul1b] ; 9..15 + vpbroadcastw m0, dyd + vinserti32x4 m7, [tlq-16], 3 + vpbroadcastd m2, [base+pb_1] + vpbroadcastw m12, dxd + movshdup m1, [base+z_xpos_mul] + pmullw m6, m0 + vbroadcasti32x4 m3, [base+z_xpos_off2a] + pmullw m5, m0 + vbroadcasti32x4 m4, [base+z_xpos_off2b] + pmullw m1, m12 ; xpos0 xpos1 xpos2 xpos3 + vpbroadcastd m9, [base+pb_4] + psllw m12, 4 ; dx*4 + movshdup m16, [base+z_ypos_off2] + psrlw m10, m6, 1 + psrlw m11, m5, 1 + vpermw m10, m10, m14 ; 64-frac, frac + psraw m6, 6 + vpermw m11, m11, m14 + psraw m5, 6 + mov r5d, -(16<<6) ; 15 to avoid top, +1 to avoid topleft + packsswb m6, m5 + mov r3d, 1<<6 + paddsb m6, m16 + sub r5d, dxd ; left-only threshold + paddsb m0, m6, m2 + shl dxd, 2 + punpcklbw m5, m6, m0 ; base, base+1 + lea r2, [strideq*3] + punpckhbw m6, m0 +.w16_loop: + pshufb m17, m1, m2 + psrlw m0, m1, 3 + paddb m16, m3, m17 + vpermw m0, m0, m14 + paddb m17, m4 + vpmovw2m k1, m16 + vpermb m16, m16, m7 + vpmovw2m k2, m17 + vpermb m17, m17, m7 + pmaddubsw m16, m0 + pmaddubsw m17, m0 + add r3d, dxd + jge .w16_toponly + mova m0, m8 + vpermt2b m0, m5, m7 + pmaddubsw m16{k1}, m0, m10 + mova m0, m8 + vpermt2b m0, m6, m7 + pmaddubsw m17{k2}, m0, m11 +.w16_toponly: + pmulhrsw m16, m15 + pmulhrsw m17, m15 + packuswb m16, m17 + mova [dstq+strideq*0], xm16 + vextracti128 [dstq+strideq*1], ym16, 1 + vextracti32x4 [dstq+strideq*2], m16, 2 + vextracti32x4 [dstq+r2 ], m16, 3 + sub hd, 4 + jz .w16_end + paddw m1, m12 + lea dstq, [dstq+strideq*4] + paddb m5, m9 + paddb m6, m9 + cmp r3d, r5d + jge .w16_loop +.w16_leftonly_loop: + vpermb m16, m5, m8 + vpermb m17, m6, m8 + pmaddubsw m16, m10 + pmaddubsw m17, m11 + paddb m5, m9 + paddb m6, m9 + pmulhrsw m16, m15 + pmulhrsw m17, m15 + packuswb m16, m17 + mova [dstq+strideq*0], xm16 + vextracti128 [dstq+strideq*1], ym16, 1 + vextracti32x4 [dstq+strideq*2], m16, 2 + vextracti32x4 [dstq+r2 ], m16, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16_leftonly_loop 
+.w16_end: + RET +.w32: + movu ym7, [tlq] + test angled, 0x400 + jnz .w32_main + vpbroadcastd m2, [tlq-4] + mova ym0, [base+z_filter_s1] + vbroadcasti32x4 ym1, [base+z_filter_s2] + vbroadcasti32x4 ym3, [base+z_filter_s3] + vbroadcasti32x4 ym4, [base+z_filter_s4] + vpermi2b ym0, ym7, ym2 ; al bl + vpbroadcastd ym5, [base+pb_31] + pminub ym5, [base+z_filter_s5] + pshufb ym1, ym7, ym1 ; ah bh + vpbroadcastd ym11, [base+z_filter_k+4*2+12*0] + pshufb ym3, ym7, ym3 ; cl ch + vpbroadcastd ym12, [base+z_filter_k+4*2+12*1] + pshufb ym4, ym7, ym4 ; el dl + vpbroadcastd ym13, [base+z_filter_k+4*2+12*2] + vpermb ym5, ym5, ym7 ; eh dh + pmaddubsw ym0, ym11 + pmaddubsw ym1, ym11 + pmaddubsw ym6, ym3, ym12 + vpbroadcastd ym12, r6m + pmaddubsw ym3, ym13 + pmaddubsw ym4, ym11 + pmaddubsw ym5, ym11 + mova m9, [pb_0to63] + packssdw ym12, ym12 + paddw ym0, ym6 + paddw ym1, ym3 + paddw ym0, ym4 + paddw ym1, ym5 + packsswb ym12, ym12 + pmulhrsw ym0, ym15 + pmulhrsw ym1, ym15 + vpcmpgtb k1, ym12, ym9 ; x < max_width + packuswb ym7{k1}, ym0, ym1 + cmp hd, 16 + jg .w32_filter_h64 + mov r3d, 3 + call .filter_left_h16 + jmp .w32_main +.w32_filter_h64: + call .filter_left_h64 +.w32_main: + vbroadcasti32x8 m6, [base+z_ypos_mul1a] ; 1.. 8 + vbroadcasti32x8 m5, [base+z_ypos_mul1b] ; 9..15 + vpbroadcastw m0, dyd + vinserti32x4 m7, [tlq-16], 3 + rorx r2q, dxq, 62 ; dx << 2 + vpbroadcastd m2, [base+pb_1] + vpbroadcastw m1, r2d + pmullw m6, m0 + vbroadcasti32x8 m3, [base+z_xpos_off2a] + pmullw m5, m0 + vbroadcasti32x8 m4, [base+z_xpos_off2b] + mova ym0, ym1 + paddw m12, m1, m1 + vpbroadcastd m9, [base+pb_2] + paddw m1, m0 ; xpos1 xpos0 + mova ym0, ym2 + psrlw m10, m6, 1 + psrlw m11, m5, 1 + vpermw m10, m10, m14 ; 64-frac, frac + psraw m6, 6 + vpermw m11, m11, m14 + psraw m5, 6 + mov r5d, -(32<<6) ; 31 to avoid top, +1 to avoid topleft + packsswb m6, m5 + mov r3d, 1<<6 + paddsb m6, m0 + sub r5d, dxd ; left-only threshold + paddsb m0, m6, m2 + add dxd, dxd + punpcklbw m5, m6, m0 ; base, base+1 + punpckhbw m6, m0 +.w32_loop: + pshufb m17, m1, m2 + psrlw m0, m1, 3 + paddb m16, m3, m17 + vpermw m0, m0, m14 + paddb m17, m4 + vpmovw2m k1, m16 + vpermb m16, m16, m7 + vpmovw2m k2, m17 + vpermb m17, m17, m7 + pmaddubsw m16, m0 + pmaddubsw m17, m0 + add r3d, dxd + jge .w32_toponly + mova m0, m8 + vpermt2b m0, m5, m7 + pmaddubsw m16{k1}, m0, m10 + mova m0, m8 + vpermt2b m0, m6, m7 + pmaddubsw m17{k2}, m0, m11 +.w32_toponly: + pmulhrsw m16, m15 + pmulhrsw m17, m15 + packuswb m16, m17 + vextracti32x8 [dstq+strideq*0], m16, 1 + mova [dstq+strideq*1], ym16 + sub hd, 2 + jz .w32_end + paddw m1, m12 + lea dstq, [dstq+strideq*2] + paddb m5, m9 + paddb m6, m9 + cmp r3d, r5d + jge .w32_loop +.w32_leftonly_loop: + vpermb m16, m5, m8 + vpermb m17, m6, m8 + pmaddubsw m16, m10 + pmaddubsw m17, m11 + paddb m5, m9 + paddb m6, m9 + pmulhrsw m16, m15 + pmulhrsw m17, m15 + packuswb m16, m17 + vextracti32x8 [dstq+strideq*0], m16, 1 + mova [dstq+strideq*1], ym16 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_leftonly_loop +.w32_end: + RET +.filter_left_h64: + mova m0, [base+z_filter_s1] + lea r3d, [hq-1] + vbroadcasti32x4 m4, [base+z_filter_s4] + vpbroadcastb m5, r3d + vbroadcasti32x4 m1, [base+z_filter_s2] + vbroadcasti32x4 m3, [base+z_filter_s3] + vpermi2b m0, m8, m2 ; al bl + pminub m5, [base+z_filter_s5] + pshufb m1, m8, m1 ; ah bh + vpbroadcastd m11, [base+z_filter_k+4*2+12*0] + pshufb m3, m8, m3 ; cl ch + vpbroadcastd m12, [base+z_filter_k+4*2+12*1] + pshufb m4, m8, m4 ; el dl + vpbroadcastd m13, [base+z_filter_k+4*2+12*2] + vpermb m5, m5, m8 ; 
eh dh + pmaddubsw m0, m11 + pmaddubsw m1, m11 + pmaddubsw m6, m3, m12 + vpbroadcastd m12, r8m ; max_height + pmaddubsw m3, m13 + pmaddubsw m4, m11 + pmaddubsw m5, m11 + packssdw m12, m12 + paddw m0, m6 + paddw m1, m3 + paddw m0, m4 + paddw m1, m5 + packsswb m12, m12 + pmulhrsw m0, m15 + pmulhrsw m1, m15 + vpcmpgtb k1, m12, m9 ; y < max_height + packuswb m8{k1}, m0, m1 + ret +.w64: + movu m7, [tlq] + test angled, 0x400 + jnz .w64_main + vpbroadcastd m2, [tlq-4] + mova m0, [base+z_filter_s1] + vbroadcasti32x4 m1, [base+z_filter_s2] + vbroadcasti32x4 m3, [base+z_filter_s3] + vbroadcasti32x4 m4, [base+z_filter_s4] + vpermi2b m0, m7, m2 ; al bl + vpbroadcastd m5, [base+pb_63] + pminub m5, [base+z_filter_s5] + pshufb m1, m7, m1 ; ah bh + vpbroadcastd m11, [base+z_filter_k+4*2+12*0] + pshufb m3, m7, m3 ; cl ch + vpbroadcastd m12, [base+z_filter_k+4*2+12*1] + pshufb m4, m7, m4 ; el dl + vpbroadcastd m13, [base+z_filter_k+4*2+12*2] + vpermb m5, m5, m7 ; eh dh + pmaddubsw m0, m11 + pmaddubsw m1, m11 + pmaddubsw m6, m3, m12 + vpbroadcastd m12, r6m + pmaddubsw m3, m13 + pmaddubsw m4, m11 + pmaddubsw m5, m11 + mova m9, [pb_0to63] + packssdw m12, m12 + paddw m0, m6 + paddw m1, m3 + paddw m0, m4 + paddw m1, m5 + packsswb m12, m12 + pmulhrsw m0, m15 + pmulhrsw m1, m15 + vpcmpgtb k1, m12, m9 ; x < max_width + packuswb m7{k1}, m0, m1 + call .filter_left_h64 ; always filter the full 64 pixels for simplicity +.w64_main: + vpbroadcastw m5, dyd + vpbroadcastd m9, [tlq-4] + rorx r2q, dxq, 62 ; dx << 2 + pmullw m6, m5, [base+z_ypos_mul1a] ; can overflow, but it doesn't matter as such + pmullw m5, [base+z_ypos_mul1b] ; pixels aren't selected from the left edge + vpbroadcastw m1, r2d ; xpos + mova m3, [base+z_xpos_off2a] + mova m4, [base+z_xpos_off2b] + mova m12, m1 + vpbroadcastd m2, [base+pb_1] + psrlw m10, m6, 1 + psrlw m11, m5, 1 + vpermw m10, m10, m14 ; 64-frac, frac + psraw m6, 6 + vpermw m11, m11, m14 + psraw m5, 6 + mov r5d, -(64<<6) ; 63 to avoid top, +1 to avoid topleft + packsswb m6, m5 + mov r3d, 1<<6 + paddsb m0, m6, m2 + sub r5d, dxd ; left-only threshold + punpcklbw m5, m6, m0 ; base, base+1 + punpckhbw m6, m0 +.w64_loop: + pshufb m17, m1, m2 + psrlw m0, m1, 3 + paddb m16, m3, m17 + vpermw m0, m0, m14 + paddb m17, m4 + vpmovw2m k1, m16 ; base_x < 0 + vpermi2b m16, m7, m9 + vpmovw2m k2, m17 + vpermi2b m17, m7, m9 + pmaddubsw m16, m0 + pmaddubsw m17, m0 + add r3d, dxd + jge .w64_toponly + mova m0, m8 + vpermt2b m0, m5, m9 + pmaddubsw m16{k1}, m0, m10 + mova m0, m8 + vpermt2b m0, m6, m9 + pmaddubsw m17{k2}, m0, m11 +.w64_toponly: + pmulhrsw m16, m15 + pmulhrsw m17, m15 + packuswb m16, m17 + mova [dstq], m16 + dec hd + jz .w64_end + paddw m1, m12 + add dstq, strideq + paddb m5, m2 + paddb m6, m2 + cmp r3d, r5d + jge .w64_loop +.w64_leftonly_loop: + vpermb m16, m5, m8 + vpermb m17, m6, m8 + pmaddubsw m16, m10 + pmaddubsw m17, m11 + paddb m5, m2 + paddb m6, m2 + pmulhrsw m16, m15 + pmulhrsw m17, m15 + packuswb m16, m17 + mova [dstq], m16 + add dstq, strideq + dec hd + jg .w64_leftonly_loop +.w64_end: + RET + cglobal ipred_z3_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy lea r7, [z_filter_t0] tzcnt wd, wm @@ -1879,7 +2605,7 @@ cglobal ipred_z3_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy vpbroadcastd m6, [base+pb_8] .w4_main2: vpbroadcastw m0, dyd - vpbroadcastq m2, [base+z_ypos_mul1] ; 1..4 + vpbroadcastq m2, [base+z_ypos_mul2a] ; 1..4 pmulhuw m2, m0 ; ypos >> 1 lea r2, [strideq*3] vpermw m3, m2, m14 ; 64-frac, frac @@ -1960,7 +2686,7 @@ cglobal ipred_z3_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, 
dy vpbroadcastd m6, [base+pb_4] .w8_main2: vpbroadcastw m0, dyd - vbroadcasti32x4 m2, [base+z_ypos_mul1] ; 1..8 + vbroadcasti32x4 m2, [base+z_ypos_mul2a] ; 1..8 pmulhuw m2, m0 ; ypos >> 1 lea r2, [strideq*3] vpermw m3, m2, m14 ; 64-frac, frac @@ -2037,10 +2763,10 @@ cglobal ipred_z3_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy valignq m11, m8, m7, 1 call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter .w16_main: - vpbroadcastd m6, [base+pb_4] + vbroadcasti32x4 m3, [base+z_ypos_mul2a] ; 1.. 8 + vbroadcasti32x4 m2, [base+z_ypos_mul2b] ; 9..15 vpbroadcastw m0, dyd - vbroadcasti32x4 m3, [base+z_ypos_mul1] ; 1.. 8 - vbroadcasti32x4 m2, [base+z_ypos_mul2] ; 9..15 + vpbroadcastd m6, [base+pb_4] pmulhuw m3, m0 ; ypos >> 1 pmulhuw m2, m0 movshdup m0, [base+z_ypos_off2] @@ -2098,9 +2824,9 @@ cglobal ipred_z3_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy valignq m11, m8, m7, 1 call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter .w32_main: - vbroadcasti32x8 m3, [base+z_ypos_mul1] ; 1.. 8 + vbroadcasti32x8 m3, [base+z_ypos_mul2a] ; 1.. 8 + vbroadcasti32x8 m2, [base+z_ypos_mul2b] ; 9..15 vpbroadcastw m0, dyd - vbroadcasti32x8 m2, [base+z_ypos_mul2] ; 9..15 vpbroadcastd m1, [base+pb_1] pmulhuw m3, m0 ; ypos >> 1 pmulhuw m2, m0 @@ -2148,8 +2874,8 @@ cglobal ipred_z3_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter .w64_main: vpbroadcastw m2, dyd - pmulhuw m3, m2, [base+z_ypos_mul1] - pmulhuw m2, [base+z_ypos_mul2] + pmulhuw m3, m2, [base+z_ypos_mul2a] + pmulhuw m2, [base+z_ypos_mul2b] vpbroadcastd m6, [base+pb_1] vpermw m4, m3, m14 ; 64-frac, frac psrlw m3, 5 -- cgit v1.2.3 From ec05e9b9784f0c5b77e089e3a1256788ee036e1e Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Fri, 8 Dec 2023 15:34:16 +0100 Subject: x86: Flag Zen 4 as having slow gathers --- src/x86/cpu.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/x86/cpu.c b/src/x86/cpu.c index 764d8be..f570fd7 100644 --- a/src/x86/cpu.c +++ b/src/x86/cpu.c @@ -57,7 +57,6 @@ COLD unsigned dav1d_get_cpu_flags_x86(void) { if (cpu.max_leaf >= 1) { CpuidRegisters r; dav1d_cpu_cpuid(&r, 1, 0); - const unsigned model = ((r.eax >> 4) & 0x0f) + ((r.eax >> 12) & 0xf0); const unsigned family = ((r.eax >> 8) & 0x0f) + ((r.eax >> 20) & 0xff); if (X(r.edx, 0x06008000)) /* CMOV/SSE/SSE2 */ { @@ -87,10 +86,8 @@ COLD unsigned dav1d_get_cpu_flags_x86(void) { } #endif if (!memcmp(cpu.vendor, "AuthenticAMD", sizeof(cpu.vendor))) { - if ((flags & DAV1D_X86_CPU_FLAG_AVX2) && (family < 0x19 || - (family == 0x19 && (model < 0x10 || (model >= 0x20 && model < 0x60))))) - { - /* Excavator, Zen, Zen+, Zen 2, Zen 3, Zen 3+ */ + if ((flags & DAV1D_X86_CPU_FLAG_AVX2) && family <= 0x19) { + /* Excavator, Zen, Zen+, Zen 2, Zen 3, Zen 3+, Zen 4 */ flags |= DAV1D_X86_CPU_FLAG_SLOW_GATHER; } } -- cgit v1.2.3 From 0e438e70face3bbfd660ee996cf1b6324d5a1ae6 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Fri, 8 Dec 2023 15:34:17 +0100 Subject: x86: Require fast gathers for AVX-512 mc resize and warp Prefer using the AVX2 implementations (which doesn't use gathers) on Zen 4. 
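For context, the "gathers" in question are the vector gather loads (the vpgather* family) that the AVX-512 resize/warp code relies on to fetch elements from computed, non-contiguous offsets in a single instruction. On current AMD parts these are considerably slower than on recent Intel cores, which is what DAV1D_X86_CPU_FLAG_SLOW_GATHER captures, so keeping the previously assigned AVX2 function pointers is the faster choice there. A generic illustration of the two styles of indexed load, unrelated to the actual dav1d kernels (plain C intrinsics, hypothetical function names, compile with AVX-512F enabled):

    #include <immintrin.h>
    #include <stdint.h>

    /* One gather instruction: load 16 int32 elements from base[idx[0..15]]. */
    static __m512i load_indexed_gather(const int32_t *const base, const __m512i vidx) {
        return _mm512_i32gather_epi32(vidx, base, 4 /* sizeof(int32_t) */);
    }

    /* The non-gather alternative: equivalent scalar loads into a buffer.
     * Gather-free kernels typically rearrange their data so that even this
     * is unnecessary and plain contiguous loads suffice. */
    static __m512i load_indexed_scalar(const int32_t *const base, const int32_t idx[16]) {
        int32_t tmp[16];
        for (int i = 0; i < 16; i++)
            tmp[i] = base[idx[i]];
        return _mm512_loadu_si512(tmp);
    }
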
--- src/x86/mc.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/x86/mc.h b/src/x86/mc.h index 65c607e..b142361 100644 --- a/src/x86/mc.h +++ b/src/x86/mc.h @@ -292,8 +292,11 @@ static ALWAYS_INLINE void mc_dsp_init_x86(Dav1dMCDSPContext *const c) { c->blend = BF(dav1d_blend, avx512icl); c->blend_v = BF(dav1d_blend_v, avx512icl); c->blend_h = BF(dav1d_blend_h, avx512icl); - c->warp8x8 = BF(dav1d_warp_affine_8x8, avx512icl); - c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx512icl); - c->resize = BF(dav1d_resize, avx512icl); + + if (!(flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER)) { + c->resize = BF(dav1d_resize, avx512icl); + c->warp8x8 = BF(dav1d_warp_affine_8x8, avx512icl); + c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx512icl); + } #endif } -- cgit v1.2.3 From a04a72471922e3763e324e2fdf835bb400bd0170 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Fri, 8 Dec 2023 15:34:19 +0100 Subject: x86: Require fast gathers for high bit-depth AVX-512 film grain Prefer using the SSSE3 implementations on Zen 4. --- src/x86/filmgrain.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/x86/filmgrain.h b/src/x86/filmgrain.h index eeaa328..8f6ac8f 100644 --- a/src/x86/filmgrain.h +++ b/src/x86/filmgrain.h @@ -73,9 +73,11 @@ static ALWAYS_INLINE void film_grain_dsp_init_x86(Dav1dFilmGrainDSPContext *cons if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; - c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx512icl); - c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx512icl); - c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx512icl); - c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx512icl); + if (BITDEPTH == 8 || !(flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER)) { + c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx512icl); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx512icl); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx512icl); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx512icl); + } #endif } -- cgit v1.2.3 From 0a8d66402e1b10ce16335d9497f42170e14e427b Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Fri, 8 Dec 2023 15:34:20 +0100 Subject: x86: Require fast gathers for AVX-512 horizontal loopfilters Prefer using the AVX2 implementations (which doesn't use gathers) on Zen 4. 
--- src/x86/loopfilter.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/x86/loopfilter.h b/src/x86/loopfilter.h index 33c842a..9535c75 100644 --- a/src/x86/loopfilter.h +++ b/src/x86/loopfilter.h @@ -58,9 +58,12 @@ static ALWAYS_INLINE void loop_filter_dsp_init_x86(Dav1dLoopFilterDSPContext *co if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; - c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, avx512icl); c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, avx512icl); - c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, avx512icl); c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, avx512icl); + + if (!(flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER)) { + c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, avx512icl); + c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, avx512icl); + } #endif } -- cgit v1.2.3 From b3779b89c0c8ec51655e757f3a5e287189a18408 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Mon, 4 Dec 2023 11:32:23 +0100 Subject: x86: Add high bit-depth ipred z1 AVX-512 (Ice Lake) asm --- src/x86/ipred.h | 2 +- src/x86/ipred16_avx512.asm | 549 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 548 insertions(+), 3 deletions(-) diff --git a/src/x86/ipred.h b/src/x86/ipred.h index 1997588..1815e37 100644 --- a/src/x86/ipred.h +++ b/src/x86/ipred.h @@ -137,7 +137,6 @@ static ALWAYS_INLINE void intra_pred_dsp_init_x86(Dav1dIntraPredDSPContext *cons init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, avx512icl); init_angular_ipred_fn(HOR_PRED, ipred_h, avx512icl); init_angular_ipred_fn(VERT_PRED, ipred_v, avx512icl); - init_angular_ipred_fn(Z1_PRED, ipred_z1, avx512icl); init_angular_ipred_fn(Z2_PRED, ipred_z2, avx512icl); init_angular_ipred_fn(Z3_PRED, ipred_z3, avx512icl); #endif @@ -145,6 +144,7 @@ static ALWAYS_INLINE void intra_pred_dsp_init_x86(Dav1dIntraPredDSPContext *cons init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx512icl); init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx512icl); init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx512icl); + init_angular_ipred_fn(Z1_PRED, ipred_z1, avx512icl); init_angular_ipred_fn(FILTER_PRED, ipred_filter, avx512icl); c->pal_pred = BF(dav1d_pal_pred, avx512icl); diff --git a/src/x86/ipred16_avx512.asm b/src/x86/ipred16_avx512.asm index 60f08d7..94eaa3f 100644 --- a/src/x86/ipred16_avx512.asm +++ b/src/x86/ipred16_avx512.asm @@ -42,6 +42,16 @@ pal_pred_perm: db 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51 db 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55 db 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59 db 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63 +pw_0to31: dw 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + dw 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 +z_upsample: dw 0, -1, 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6 + dw 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14 +z_xpos_mul: dw 1, 1, 1, 1, 2, 2, 1, 1, 3, 3, 2, 2, 4, 4, 2, 2 + dw 5, 5, 3, 3, 6, 6, 3, 3, 7, 7, 4, 4, 8, 8, 4, 4 +z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0 +z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0 +z_xpos_off1a: dw 30720, 30784, 30848, 30912, 30976, 31040, 31104, 31168 +z_xpos_off1b: dw 30720, 30848, 30976, 31104, 31232, 31360, 31488, 31616 filter_permA: times 4 db 6, 7, 8, 9, 14, 15, 4, 5 times 4 db 10, 11, 12, 13, 2, 3, -1, -1 filter_permB: times 4 db 22, 23, 24, 25, 30, 31, 6, 7 @@ -57,8 +67,21 @@ filter_shift: times 2 dw 6 dd 0 times 2 dw 4 dd 9 -pal_unpack: db 0, 8, 4, 12, 
32, 40, 36, 44 - db 16, 24, 20, 28, 48, 56, 52, 60 +pd_65536: dd 65536 +pal_unpack: db 0, 8, 4, 12, 32, 40, 36, 44 + db 16, 24, 20, 28, 48, 56, 52, 60 +z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39 + db 39, 39, 47, 47, 47, 79, 79, 79 +z_filter_k: dw 8, 8, 6, 6, 4, 4 + dw 4, 4, 5, 5, 4, 4 + dw 0, 0, 0, 0, 2, 2 +pw_17: times 2 dw 17 +pw_63: times 2 dw 63 +pw_512: times 2 dw 512 +pw_31806: times 2 dw 31806 + +%define pw_3 (z_xpos_mul+4* 4) +%define pw_7 (z_xpos_mul+4*12) %macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) @@ -74,10 +97,12 @@ JMP_TABLE ipred_paeth_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_v_16bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z1_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE pal_pred_16bpc, avx512icl, w4, w8, w16, w32, w64 cextern smooth_weights_1d_16bpc cextern smooth_weights_2d_16bpc +cextern dr_intra_derivative cextern filter_intra_taps SECTION .text @@ -612,6 +637,526 @@ cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, w, h, v_weights, stride3 jg .w64_loop RET +%if WIN64 + DECLARE_REG_TMP 4 +%else + DECLARE_REG_TMP 8 +%endif + +cglobal ipred_z1_16bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx +%define base r7-z_filter_t0 + lea r7, [z_filter_t0] + tzcnt wd, wm + movifnidn angled, anglem + lea t0, [dr_intra_derivative] + movsxd wq, [base+ipred_z1_16bpc_avx512icl_table+wq*4] + add tlq, 2 + mov dxd, angled + and dxd, 0x7e + add angled, 165 ; ~90 + movzx dxd, word [t0+dxq] + lea wq, [base+ipred_z1_16bpc_avx512icl_table+wq] + movifnidn hd, hm + xor angled, 0x4ff ; d = 90 - angle + vpbroadcastd m15, [base+pw_31806] + jmp wq +.w4: + vpbroadcastw m5, [tlq+14] + vinserti32x4 m5, [tlq], 0 + cmp angleb, 40 + jae .w4_no_upsample + lea r3d, [angleq-1024] + sar r3d, 7 + add r3d, hd + jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm) + call .upsample_top + vpbroadcastq m0, [base+z_xpos_off1b] + jmp .w4_main2 +.w4_no_upsample: + test angled, 0x400 + jnz .w4_main ; !enable_intra_edge_filter + lea r3d, [hq+3] + vpbroadcastb xm0, r3d + vpbroadcastb xm1, angled + shr angled, 8 ; is_sm << 1 + vpcmpeqb k1, xm0, [base+z_filter_wh] + vpcmpgtb k1{k1}, xm1, [base+z_filter_t0+angleq*8] + kmovw r5d, k1 + test r5d, r5d + jz .w4_main + call .w16_filter + mov r2d, 9 + cmp hd, 4 + cmovne r3d, r2d + vpbroadcastw m6, r3d + pminuw m6, [base+pw_0to31] + vpermw m5, m6, m5 +.w4_main: + vpbroadcastq m0, [base+z_xpos_off1a] +.w4_main2: + movsldup m3, [base+z_xpos_mul] + vpbroadcastw m4, dxd + lea r2, [strideq*3] + pmullw m3, m4 + vshufi32x4 m6, m5, m5, q3321 + psllw m4, 3 ; dx*8 + paddsw m3, m0 ; xpos + palignr m6, m5, 2 ; top+1 +.w4_loop: + psrlw m1, m3, 6 ; base_x + pand m2, m15, m3 ; frac + vpermw m0, m1, m5 ; top[base_x] + vpermw m1, m1, m6 ; top[base_x+1] + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + vextracti32x4 xm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r2 ], xm1 + sub hd, 8 + jl .w4_end + vextracti32x4 xm1, m0, 2 + paddsw m3, m4 ; xpos += dx + lea dstq, [dstq+strideq*4] + vextracti32x4 xm0, m0, 3 + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm0 + movhps [dstq+r2 ], xm0 + lea dstq, [dstq+strideq*4] + jg .w4_loop +.w4_end: + RET +.upsample_top: + vinserti32x4 m5, [tlq-16], 3 + mova m3, [base+z_upsample] + vpbroadcastd m4, [base+pd_65536] 
+ add dxd, dxd + vpermw m0, m3, m5 + paddw m3, m4 + vpermw m1, m3, m5 + paddw m3, m4 + vpermw m2, m3, m5 + paddw m3, m4 + vpermw m3, m3, m5 + vpbroadcastw m5, r9m ; pixel_max + paddw m1, m2 ; b+c + paddw m0, m3 ; a+d + psubw m0, m1, m0 + psraw m0, 3 + pxor m2, m2 + paddw m0, m1 + pmaxsw m0, m2 + pavgw m0, m2 + pminsw m5, m0 + ret +.w8: + lea r3d, [angleq+216] + movu ym5, [tlq] + mov r3b, hb + mova m10, [base+pw_0to31] + cmp r3d, 8 + ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 + lea r3d, [hq+7] + vpbroadcastw m6, r3d + add r3d, r3d + pminuw m6, m10 + vpermw m5, m6, m5 + call .upsample_top + vbroadcasti32x4 m0, [base+z_xpos_off1b] + jmp .w8_main2 +.w8_no_upsample: + lea r3d, [hq+7] + vpbroadcastb ym0, r3d + and r3d, 7 + or r3d, 8 ; imin(h+7, 15) + vpbroadcastw m6, r3d + pminuw m6, m10 + vpermw m5, m6, m5 + test angled, 0x400 + jnz .w8_main + vpbroadcastb ym1, angled + shr angled, 8 + vpcmpeqb k1, ym0, [base+z_filter_wh] + mova xm0, [base+z_filter_t0+angleq*8] + vpcmpgtb k1{k1}, ym1, ym0 + kmovd r5d, k1 + test r5d, r5d + jz .w8_main + call .w16_filter + cmp hd, r3d + jl .w8_filter_end + pminud m6, m10, [base+pw_17] {1to16} + add r3d, 2 +.w8_filter_end: + vpermw m5, m6, m5 +.w8_main: + vbroadcasti32x4 m0, [base+z_xpos_off1a] +.w8_main2: + movshdup m3, [base+z_xpos_mul] + vpbroadcastw m4, dxd + shl r3d, 6 + lea r2, [strideq*3] + pmullw m3, m4 + vshufi32x4 m6, m5, m5, q3321 + sub r3d, dxd + psllw m4, 2 ; dx*4 + shl dxd, 2 + paddsw m3, m0 ; xpos + palignr m6, m5, 2 ; top+1 +.w8_loop: + psrlw m1, m3, 6 ; base_x + pand m2, m15, m3 ; frac + vpermw m0, m1, m5 ; top[base_x] + vpermw m1, m1, m6 ; top[base_x+1] + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+r2 ], m0, 3 + sub hd, 4 + jz .w8_end + paddsw m3, m4 ; xpos += dx + lea dstq, [dstq+strideq*4] + sub r3d, dxd + jg .w8_loop + vextracti32x4 xm5, m5, 3 +.w8_end_loop: + mova [dstq+strideq*0], xm5 + mova [dstq+strideq*1], xm5 + mova [dstq+strideq*2], xm5 + mova [dstq+r2 ], xm5 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_end_loop +.w8_end: + RET +.w16_filter: + vpbroadcastw m1, [tlq-2] + popcnt r5d, r5d + valignq m3, m6, m5, 2 + vpbroadcastd m7, [base+z_filter_k+(r5-1)*4+12*0] + valignq m1, m5, m1, 6 + vpbroadcastd m8, [base+z_filter_k+(r5-1)*4+12*1] + palignr m2, m3, m5, 2 + vpbroadcastd m9, [base+z_filter_k+(r5-1)*4+12*2] + palignr m0, m5, m1, 14 + pmullw m7, m5 + palignr m3, m5, 4 + paddw m0, m2 + palignr m5, m1, 12 + pmullw m0, m8 + paddw m5, m3 + pmullw m5, m9 + pxor m1, m1 + paddw m0, m7 + paddw m5, m0 + psrlw m5, 3 + pavgw m5, m1 + ret +.w16: + lea r3d, [hq+15] + vpbroadcastb ym0, r3d + and r3d, 15 + or r3d, 16 ; imin(h+15, 31) + vpbroadcastw m11, r3d + pminuw m10, m11, [base+pw_0to31] + vpbroadcastw m6, [tlq+r3*2] + vpermw m5, m10, [tlq] + test angled, 0x400 + jnz .w16_main + vpbroadcastb ym1, angled + shr angled, 8 + vpcmpeqb k1, ym0, [base+z_filter_wh] + mova xm0, [base+z_filter_t0+angleq*8] + vpcmpgtb k1{k1}, ym1, ym0 + kmovd r5d, k1 + test r5d, r5d + jz .w16_main + call .w16_filter + cmp hd, 16 + jg .w16_filter_h32 + vpermw m6, m11, m5 + vpermw m5, m10, m5 + jmp .w16_main +.w16_filter_h32: + movzx r3d, word [tlq+62] + movzx r2d, word [tlq+60] + lea r2d, [r2+r3*8+4] + sub r2d, r3d + mov r3d, 1 + shr r2d, 3 + kmovb k1, r3d + movd xm0, r2d + or r3d, 32 + vmovdqu16 m6{k1}, m0 +.w16_main: + rorx r2d, dxd, 23 + mov r7, rsp + and rsp, ~63 + vpbroadcastw m3, r2d + sub 
rsp, 64*2 + mov r2d, dxd + paddw m4, m3, m3 + mova [rsp+64*0], m5 + vinserti32x8 m3, ym4, 1 + mova [rsp+64*1], m6 + shl r3d, 6 +.w16_loop: + lea r5d, [r2+dxq] + shr r2d, 6 + movu ym0, [rsp+r2*2] + movu ym1, [rsp+r2*2+2] + lea r2d, [r5+dxq] + shr r5d, 6 + vinserti32x8 m0, [rsp+r5*2], 1 + vinserti32x8 m1, [rsp+r5*2+2], 1 + pand m2, m15, m3 ; frac << 9 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + sub hd, 2 + jz .w16_end + paddw m3, m4 + lea dstq, [dstq+strideq*2] + cmp r2d, r3d + jl .w16_loop + punpckhqdq ym6, ym6 +.w16_end_loop: + mova [dstq+strideq*0], ym6 + mova [dstq+strideq*1], ym6 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_end_loop +.w16_end: + mov rsp, r7 + RET +.w32: + lea r3d, [hq+31] + movu m7, [tlq+64*0] + and r3d, 31 + vpbroadcastw m11, r3d + or r3d, 32 ; imin(h+31, 63) + pminuw m10, m11, [base+pw_0to31] + vpbroadcastw m9, [tlq+r3*2] + vpermw m8, m10, [tlq+64*1] + test angled, 0x400 + jnz .w32_main + vpbroadcastd m5, [base+pw_3] + mov r5d, ~1 + movu m3, [tlq-2] + kmovd k1, r5d + valignq m2, m8, m7, 6 + paddw m7, m3 + vmovdqu16 m3{k1}, [tlq-4] + valignq m4, m9, m8, 2 + paddw m3, m5 + paddw m7, [tlq+2] + palignr m1, m8, m2, 14 + pavgw m3, [tlq+4] + palignr m2, m8, m2, 12 + paddw m7, m3 + palignr m3, m4, m8, 2 + psrlw m7, 2 + palignr m4, m8, 4 + paddw m8, m1 + paddw m2, m5 + paddw m8, m3 + pavgw m2, m4 + paddw m8, m2 + psrlw m8, 2 + cmp hd, 64 + je .w32_filter_h64 + vpermw m9, m11, m8 + vpermw m8, m10, m8 + jmp .w32_main +.w32_filter_h64: + movzx r3d, word [tlq+126] + movzx r2d, word [tlq+124] + lea r2d, [r2+r3*8+4] + sub r2d, r3d + mov r3d, 65 + shr r2d, 3 + movd xm0, r2d + vpblendmw m9{k1}, m0, m9 +.w32_main: + rorx r2d, dxd, 23 + mov r7, rsp + and rsp, ~63 + vpbroadcastw m5, r2d + sub rsp, 64*4 + mov r2d, dxd + mova [rsp+64*0], m7 + shl r3d, 6 + mova [rsp+64*1], m8 + mova m6, m5 + mova [rsp+64*2], m9 + punpckhqdq m9, m9 + mova [rsp+64*3], ym9 +.w32_loop: + lea r5d, [r2+dxq] + shr r2d, 6 + movu m0, [rsp+r2*2] + movu m2, [rsp+r2*2+2] + lea r2d, [r5+dxq] + shr r5d, 6 + movu m1, [rsp+r5*2] + movu m3, [rsp+r5*2+2] + pand m4, m15, m5 + paddw m5, m6 + psubw m2, m0 + pmulhrsw m2, m4 + pand m4, m15, m5 + psubw m3, m1 + pmulhrsw m3, m4 + paddw m0, m2 + paddw m1, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + sub hd, 2 + jz .w32_end + paddw m5, m6 + lea dstq, [dstq+strideq*2] + cmp r2d, r3d + jl .w32_loop +.w32_end_loop: + mova [dstq+strideq*0], m9 + mova [dstq+strideq*1], m9 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_end_loop +.w32_end: + mov rsp, r7 + RET +.w64_filter96: + vpbroadcastd m4, [base+pw_3] + mov r5d, ~1 + movu m0, [tlq-2] + kmovd k1, r5d + paddw m7, m0 + vmovdqu16 m0{k1}, [tlq-4] + paddw m0, m4 + paddw m7, [tlq+2] + pavgw m0, [tlq+4] + valignq m1, m9, m8, 6 + paddw m8, [tlq+62] + paddw m2, m4, [tlq+60] + valignq m3, m10, m9, 2 + paddw m8, [tlq+66] + pavgw m2, [tlq+68] + paddw m7, m0 + palignr m0, m9, m1, 14 + paddw m8, m2 + palignr m1, m9, m1, 12 + psrlw m7, 2 + palignr m2, m3, m9, 2 + psrlw m8, 2 + palignr m3, m9, 4 + paddw m0, m9 + paddw m1, m4 + paddw m0, m2 + pavgw m1, m3 + paddw m0, m1 + ret +.w64: + movu m7, [tlq+64*0] + lea r3d, [hq-1] + movu m8, [tlq+64*1] + vpbroadcastw m11, [tlq+r3*2+128] + movu m9, [tlq+64*2] + cmp hd, 64 + je .w64_h64 + vpbroadcastw m13, r3d + or r3d, 64 + pminuw m12, m13, [base+pw_0to31] + mova m10, m11 + vpermw m9, m12, m9 + test angled, 0x400 + jnz .w64_main + call .w64_filter96 + psrlw m0, 2 + vpermw m9, m12, m0 + vpermw m10, m13, m0 + mova 
m11, m10 + jmp .w64_main +.w64_h64: + movu m10, [tlq+64*3] + or r3d, 64 + test angled, 0x400 + jnz .w64_main + call .w64_filter96 + valignq m1, m10, m9, 6 + valignq m3, m11, m10, 2 + vpbroadcastd m11, [base+pw_63] + psrlw m9, m0, 2 + palignr m0, m10, m1, 14 + palignr m1, m10, m1, 12 + palignr m2, m3, m10, 2 + palignr m3, m10, 4 + paddw m10, m0 + paddw m1, m4 + paddw m10, m2 + pavgw m1, m3 + paddw m10, m1 + psrlw m10, 2 + vpermw m11, m11, m10 +.w64_main: + rorx r2d, dxd, 23 + mov r7, rsp + and rsp, ~63 + vpbroadcastw m5, r2d + sub rsp, 64*6 + mova [rsp+64*0], m7 + mov r2d, dxd + mova [rsp+64*1], m8 + lea r5, [rsp+r3*2] + mova [rsp+64*2], m9 + shl r3d, 6 + mova [rsp+64*3], m10 + sub r2, r3 + mova [rsp+64*4], m11 + mova m6, m5 + mova [rsp+64*5], m11 +.w64_loop: + mov r3, r2 + sar r3, 6 + movu m0, [r5+r3*2+64*0] + movu m2, [r5+r3*2+64*0+2] + movu m1, [r5+r3*2+64*1] + movu m3, [r5+r3*2+64*1+2] + pand m4, m15, m5 + psubw m2, m0 + pmulhrsw m2, m4 + psubw m3, m1 + pmulhrsw m3, m4 + paddw m0, m2 + paddw m1, m3 + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + dec hd + jz .w64_end + paddw m5, m6 + add dstq, strideq + add r2, dxq + jl .w64_loop +.w64_end_loop: + mova [dstq+64*0], m11 + mova [dstq+64*1], m11 + add dstq, strideq + dec hd + jg .w64_end_loop +.w64_end: + mov rsp, r7 + RET + cglobal pal_pred_16bpc, 4, 7, 7, dst, stride, pal, idx, w, h, stride3 lea r6, [pal_pred_16bpc_avx512icl_table] tzcnt wd, wm -- cgit v1.2.3 From 5149b274472340e9641d3f30e00ebd8bd57fd515 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 14 Dec 2023 15:40:47 +0200 Subject: checkasm: Map SIGBUS to the right error text This was missed in 2ef970a885990ff462c30b6573bea5044bb4b0f5. Also print this text for EXCEPTION_IN_PAGE_ERROR on Windows. --- tests/checkasm/checkasm.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 71a9334..5e26432 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -453,10 +453,12 @@ static LONG NTAPI signal_handler(EXCEPTION_POINTERS *const e) { case EXCEPTION_ACCESS_VIOLATION: case EXCEPTION_ARRAY_BOUNDS_EXCEEDED: case EXCEPTION_DATATYPE_MISALIGNMENT: - case EXCEPTION_IN_PAGE_ERROR: case EXCEPTION_STACK_OVERFLOW: err = "segmentation fault"; break; + case EXCEPTION_IN_PAGE_ERROR: + err = "bus error"; + break; default: return EXCEPTION_CONTINUE_SEARCH; } @@ -472,6 +474,7 @@ static void signal_handler(const int s) { state.catch_signals = 0; checkasm_fail_func(s == SIGFPE ? "fatal arithmetic error" : s == SIGILL ? "illegal instruction" : + s == SIGBUS ? 
"bus error" : "segmentation fault"); checkasm_load_context(); } else { -- cgit v1.2.3 From 8ba0df84921bfe1cd5a2a6b20a294d606d6754fe Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 19 Dec 2023 12:11:46 +0100 Subject: checkasm: Fix cdef_dir function prototype --- tests/checkasm/cdef.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/checkasm/cdef.c b/tests/checkasm/cdef.c index 9a90e31..b96339a 100644 --- a/tests/checkasm/cdef.c +++ b/tests/checkasm/cdef.c @@ -106,7 +106,7 @@ static void check_cdef_filter(const cdef_fn fn, const int w, const int h) { static void check_cdef_direction(const cdef_dir_fn fn) { ALIGN_STK_64(pixel, src, 8 * 8,); - declare_func(int, pixel *src, ptrdiff_t dst_stride, unsigned *var + declare_func(int, const pixel *src, ptrdiff_t dst_stride, unsigned *var HIGHBD_DECL_SUFFIX); if (check_func(fn, "cdef_dir_%dbpc", BITDEPTH)) { -- cgit v1.2.3 From b3f5e8cef56886dbbe70b1485433ff45a522493d Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 19 Dec 2023 12:11:54 +0100 Subject: thread_task: Replace goto's with a regular while-loop --- src/thread_task.c | 57 ++++++++++++++++++++++++++----------------------------- 1 file changed, 27 insertions(+), 30 deletions(-) diff --git a/src/thread_task.c b/src/thread_task.c index 1698ab0..152685c 100644 --- a/src/thread_task.c +++ b/src/thread_task.c @@ -501,45 +501,42 @@ static inline void delayed_fg_task(const Dav1dContext *const c, int row = atomic_fetch_add(&ttd->delayed_fg.progress[0], 1); pthread_mutex_unlock(&ttd->lock); int progmax = (out->p.h + FG_BLOCK_SIZE - 1) / FG_BLOCK_SIZE; - fg_apply_loop: - if (row + 1 < progmax) - pthread_cond_signal(&ttd->cond); - else if (row + 1 >= progmax) { - pthread_mutex_lock(&ttd->lock); - ttd->delayed_fg.exec = 0; - if (row >= progmax) goto end_add; - pthread_mutex_unlock(&ttd->lock); - } - switch (out->p.bpc) { + while (row < progmax) { + if (row + 1 < progmax) + pthread_cond_signal(&ttd->cond); + else { + pthread_mutex_lock(&ttd->lock); + ttd->delayed_fg.exec = 0; + pthread_mutex_unlock(&ttd->lock); + } + switch (out->p.bpc) { #if CONFIG_8BPC - case 8: - dav1d_apply_grain_row_8bpc(&c->dsp[0].fg, out, in, - ttd->delayed_fg.scaling_8bpc, - ttd->delayed_fg.grain_lut_8bpc, row); - break; + case 8: + dav1d_apply_grain_row_8bpc(&c->dsp[0].fg, out, in, + ttd->delayed_fg.scaling_8bpc, + ttd->delayed_fg.grain_lut_8bpc, row); + break; #endif #if CONFIG_16BPC - case 10: - case 12: - dav1d_apply_grain_row_16bpc(&c->dsp[off].fg, out, in, - ttd->delayed_fg.scaling_16bpc, - ttd->delayed_fg.grain_lut_16bpc, row); - break; + case 10: + case 12: + dav1d_apply_grain_row_16bpc(&c->dsp[off].fg, out, in, + ttd->delayed_fg.scaling_16bpc, + ttd->delayed_fg.grain_lut_16bpc, row); + break; #endif - default: abort(); + default: abort(); + } + row = atomic_fetch_add(&ttd->delayed_fg.progress[0], 1); + atomic_fetch_add(&ttd->delayed_fg.progress[1], 1); } - row = atomic_fetch_add(&ttd->delayed_fg.progress[0], 1); - int done = atomic_fetch_add(&ttd->delayed_fg.progress[1], 1) + 1; - if (row < progmax) goto fg_apply_loop; pthread_mutex_lock(&ttd->lock); ttd->delayed_fg.exec = 0; - end_add: - done = atomic_fetch_add(&ttd->delayed_fg.progress[1], 1) + 1; + int done = atomic_fetch_add(&ttd->delayed_fg.progress[1], 1) + 1; progmax = atomic_load(&ttd->delayed_fg.progress[0]); // signal for completion only once the last runner reaches this - if (done < progmax) - break; - pthread_cond_signal(&ttd->delayed_fg.cond); + if (done >= progmax) + pthread_cond_signal(&ttd->delayed_fg.cond); break; 
default: abort(); } -- cgit v1.2.3 From 746ab8b4f3021d7263c64d6b5d6f1e2c281c7acc Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 19 Dec 2023 12:12:00 +0100 Subject: thread_task: Properly handle spurious wakeups in delayed_fg POSIX explicitly states that spurious wakeups from pthread_cond_wake() may occur, even without any corresponding call to pthread_cond_signal(). --- src/internal.h | 2 +- src/thread_task.c | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/internal.h b/src/internal.h index b1a94f8..631c5a8 100644 --- a/src/internal.h +++ b/src/internal.h @@ -142,7 +142,7 @@ struct Dav1dContext { atomic_uint reset_task_cur; atomic_int cond_signaled; struct { - int exec; + int exec, finished; pthread_cond_t cond; const Dav1dPicture *in; Dav1dPicture *out; diff --git a/src/thread_task.c b/src/thread_task.c index 152685c..1ededde 100644 --- a/src/thread_task.c +++ b/src/thread_task.c @@ -357,8 +357,11 @@ void dav1d_task_delayed_fg(Dav1dContext *const c, Dav1dPicture *const out, atomic_init(&ttd->delayed_fg.progress[1], 0); pthread_mutex_lock(&ttd->lock); ttd->delayed_fg.exec = 1; + ttd->delayed_fg.finished = 0; pthread_cond_signal(&ttd->cond); - pthread_cond_wait(&ttd->delayed_fg.cond, &ttd->lock); + do { + pthread_cond_wait(&ttd->delayed_fg.cond, &ttd->lock); + } while (!ttd->delayed_fg.finished); pthread_mutex_unlock(&ttd->lock); } @@ -535,8 +538,10 @@ static inline void delayed_fg_task(const Dav1dContext *const c, int done = atomic_fetch_add(&ttd->delayed_fg.progress[1], 1) + 1; progmax = atomic_load(&ttd->delayed_fg.progress[0]); // signal for completion only once the last runner reaches this - if (done >= progmax) + if (done >= progmax) { + ttd->delayed_fg.finished = 1; pthread_cond_signal(&ttd->delayed_fg.cond); + } break; default: abort(); } -- cgit v1.2.3 From ceeb535d9436b1398448ca27892803db774c52b4 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Wed, 3 Jan 2024 08:36:07 -0500 Subject: qm: derive more tables at runtime This reduces binary size from ~50kb to ~35kb. Ideas provided by Yu-Chen (Eric) Sun and Ryan Lei from Meta. 
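The square tables in this file carry a _t suffix because they are stored as triangles: the matrices are symmetric, so only the first y+1 entries of row y are kept (10 bytes for 4x4, 36 for 8x8) and the full matrix is rebuilt at load time. With this change, many of the per-size tables below are dropped entirely and derived at runtime from the data that remains; the derivation itself lives in the new qm.c code and is not reproduced in this excerpt. As a minimal sketch of the triangular expansion step only (hypothetical helper name, not the function used in qm.c):

    #include <stdint.h>

    /* Expand a triangularly stored symmetric n x n matrix (row y contributes
     * y+1 bytes, as in the *_t tables) into its full square form. */
    static void expand_triangular(uint8_t *const dst, const uint8_t *tri, const int n) {
        for (int y = 0; y < n; y++)
            for (int x = 0; x <= y; x++) {
                const uint8_t v = *tri++;
                dst[y * n + x] = v; /* stored lower-triangle entry */
                dst[x * n + y] = v; /* mirrored across the diagonal */
            }
    }
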
--- src/qm.c | 1499 +------------------------------------------------------------- 1 file changed, 22 insertions(+), 1477 deletions(-) diff --git a/src/qm.c b/src/qm.c index e2e0d61..a523da5 100644 --- a/src/qm.c +++ b/src/qm.c @@ -33,1470 +33,6 @@ #include "src/qm.h" -static const uint8_t qm_tbl_4x4_t[][2][10] = { - { - { - 32, - 43, 67, - 73, 94, 137, - 97, 110, 150, 200, - }, { - 35, - 46, 60, - 57, 69, 90, - 66, 71, 90, 109, - }, - }, { - { - 32, - 41, 63, - 69, 88, 127, - 92, 103, 140, 184, - }, { - 33, - 45, 58, - 56, 66, 86, - 64, 69, 87, 105, - }, - }, { - { - 32, - 38, 56, - 63, 78, 113, - 86, 97, 130, 169, - }, { - 32, - 45, 55, - 53, 62, 80, - 63, 67, 84, 101, - }, - }, { - { - 32, - 37, 54, - 58, 72, 102, - 81, 91, 121, 156, - }, { - 32, - 45, 54, - 51, 59, 75, - 61, 65, 81, 97, - }, - }, { - { - 32, - 34, 49, - 53, 64, 91, - 75, 81, 112, 140, - }, { - 32, - 46, 53, - 49, 55, 70, - 58, 62, 78, 91, - }, - }, { - { - 32, - 34, 48, - 49, 60, 82, - 72, 79, 104, 134, - }, { - 32, - 46, 53, - 47, 54, 66, - 57, 60, 75, 89, - }, - }, { - { - 32, - 33, 39, - 45, 51, 71, - 62, 64, 87, 108, - }, { - 31, - 42, 48, - 47, 50, 61, - 53, 54, 67, 78, - }, - }, { - { - 32, - 33, 38, - 42, 46, 63, - 55, 57, 75, 92, - }, { - 31, - 41, 48, - 46, 48, 58, - 51, 51, 62, 71, - }, - }, { - { - 32, - 32, 35, - 38, 40, 54, - 51, 49, 64, 81, - }, { - 31, - 38, 47, - 47, 46, 54, - 49, 46, 57, 66, - }, - }, { - { - 32, - 32, 34, - 35, 37, 48, - 43, 43, 54, 65, - }, { - 31, - 37, 44, - 47, 47, 53, - 47, 45, 53, 59, - }, - }, { - { - 32, - 32, 33, - 34, 35, 39, - 38, 39, 45, 54, - }, { - 31, - 34, 39, - 42, 45, 48, - 47, 46, 49, 54, - }, - }, { - { - 32, - 32, 32, - 32, 33, 35, - 35, 35, 38, 46, - }, { - 31, - 32, 34, - 38, 41, 47, - 46, 46, 47, 52, - }, - }, { - { - 31, - 32, 32, - 32, 32, 33, - 32, 33, 34, 35, - }, { - 31, - 31, 32, - 34, 35, 39, - 38, 40, 43, 47, - }, - }, { - { - 31, - 31, 32, - 31, 32, 32, - 32, 32, 32, 33, - }, { - 31, - 31, 31, - 31, 31, 32, - 34, 35, 35, 39, - }, - }, { - { - 31, - 31, 32, - 31, 32, 32, - 31, 32, 32, 32, - }, { - 31, - 31, 31, - 31, 31, 31, - 31, 31, 31, 31, - }, - }, -}; - -static const uint8_t qm_tbl_8x4[][2][32] = { - { - { - 32, 33, 37, 49, 65, 80, 91, 104, - 42, 42, 58, 71, 84, 97, 100, 112, - 75, 69, 84, 103, 125, 142, 145, 146, - 91, 86, 91, 110, 128, 152, 178, 190, - }, { - 31, 40, 46, 48, 54, 61, 64, 68, - 47, 45, 56, 61, 65, 69, 68, 71, - 60, 54, 64, 75, 85, 92, 90, 87, - 66, 61, 64, 73, 82, 92, 102, 105, - }, - }, { - { - 32, 33, 36, 46, 60, 75, 86, 98, - 42, 42, 56, 67, 79, 92, 95, 105, - 69, 64, 77, 93, 112, 130, 136, 136, - 88, 83, 88, 105, 122, 144, 167, 177, - }, { - 31, 40, 46, 47, 52, 59, 63, 66, - 47, 45, 55, 60, 64, 68, 66, 69, - 57, 52, 61, 70, 79, 87, 88, 85, - 65, 61, 63, 72, 81, 90, 99, 102, - }, - }, { - { - 32, 32, 34, 44, 54, 72, 82, 92, - 38, 40, 51, 61, 69, 84, 89, 98, - 62, 58, 68, 85, 98, 118, 129, 127, - 86, 80, 85, 101, 117, 136, 157, 165, - }, { - 31, 38, 46, 46, 50, 57, 61, 65, - 47, 46, 53, 56, 59, 64, 65, 67, - 54, 50, 57, 66, 74, 82, 85, 82, - 64, 60, 62, 71, 79, 88, 97, 99, - }, - }, { - { - 32, 32, 34, 41, 51, 65, 75, 86, - 35, 36, 47, 53, 61, 73, 81, 92, - 59, 57, 65, 78, 92, 108, 117, 119, - 83, 78, 82, 97, 111, 129, 148, 154, - }, { - 31, 36, 46, 45, 49, 54, 59, 63, - 47, 47, 52, 53, 55, 58, 61, 65, - 53, 50, 55, 63, 71, 77, 81, 80, - 63, 59, 61, 70, 77, 86, 94, 95, - }, - }, { - { - 32, 32, 34, 38, 48, 60, 72, 81, - 35, 36, 42, 51, 59, 68, 79, 86, - 51, 50, 54, 67, 80, 92, 104, 112, - 77, 72, 75, 87, 103, 119, 135, 144, - }, 
{ - 31, 36, 43, 45, 47, 52, 57, 61, - 47, 47, 50, 53, 54, 56, 60, 63, - 50, 47, 50, 58, 66, 70, 75, 77, - 61, 57, 58, 65, 74, 82, 90, 93, - }, - }, { - { - 32, 32, 34, 37, 45, 54, 65, 75, - 35, 36, 42, 50, 56, 63, 73, 81, - 51, 50, 54, 65, 76, 87, 97, 106, - 75, 71, 73, 84, 96, 110, 125, 136, - }, { - 31, 36, 43, 46, 46, 50, 54, 59, - 47, 47, 50, 53, 54, 55, 58, 61, - 50, 47, 50, 57, 64, 68, 72, 75, - 60, 56, 57, 64, 71, 78, 85, 90, - }, - }, { - { - 32, 32, 33, 35, 41, 49, 57, 66, - 34, 34, 37, 43, 48, 54, 60, 68, - 43, 42, 44, 54, 64, 71, 78, 86, - 62, 59, 58, 68, 79, 91, 101, 111, - }, { - 31, 33, 40, 47, 45, 48, 51, 55, - 42, 44, 47, 50, 49, 50, 52, 55, - 47, 45, 46, 54, 59, 61, 63, 66, - 54, 51, 50, 57, 64, 70, 75, 79, - }, - }, { - { - 32, 32, 32, 34, 38, 44, 50, 61, - 32, 33, 35, 37, 40, 45, 50, 58, - 42, 41, 42, 50, 58, 66, 71, 79, - 56, 53, 52, 59, 68, 78, 86, 97, - }, { - 31, 32, 39, 44, 46, 47, 48, 53, - 38, 40, 47, 47, 47, 46, 47, 50, - 47, 45, 45, 51, 56, 59, 61, 64, - 52, 49, 48, 53, 58, 64, 68, 73, - }, - }, { - { - 32, 32, 32, 34, 35, 40, 46, 52, - 32, 33, 34, 37, 38, 42, 46, 51, - 37, 36, 38, 44, 49, 55, 59, 64, - 52, 49, 49, 54, 60, 69, 76, 83, - }, { - 31, 31, 36, 42, 47, 46, 48, 50, - 38, 40, 44, 47, 48, 46, 46, 48, - 47, 46, 47, 50, 53, 54, 55, 56, - 50, 48, 47, 50, 54, 60, 64, 67, - }, - }, { - { - 31, 32, 32, 32, 34, 37, 42, 46, - 32, 33, 34, 35, 37, 40, 43, 46, - 35, 34, 36, 38, 43, 49, 53, 56, - 43, 41, 42, 42, 49, 56, 63, 67, - }, { - 31, 31, 35, 39, 43, 47, 46, 48, - 38, 40, 43, 47, 47, 47, 46, 46, - 47, 46, 47, 47, 50, 53, 53, 54, - 48, 45, 46, 45, 50, 55, 58, 59, - }, - }, { - { - 31, 32, 32, 32, 33, 34, 37, 40, - 32, 32, 33, 33, 34, 36, 38, 40, - 34, 34, 34, 36, 38, 41, 44, 46, - 39, 38, 38, 40, 42, 47, 52, 56, - }, { - 31, 31, 33, 36, 40, 45, 47, 47, - 34, 35, 37, 41, 44, 46, 47, 46, - 42, 42, 44, 46, 48, 49, 50, 49, - 48, 46, 46, 46, 48, 51, 54, 55, - }, - }, { - { - 31, 32, 32, 32, 32, 33, 34, 35, - 31, 32, 32, 32, 33, 33, 34, 34, - 32, 32, 33, 34, 35, 36, 37, 38, - 35, 35, 34, 36, 38, 40, 42, 48, - }, { - 31, 31, 31, 34, 37, 39, 42, 48, - 31, 31, 32, 36, 39, 41, 43, 46, - 37, 38, 40, 43, 46, 47, 47, 48, - 48, 47, 46, 47, 47, 48, 50, 53, - }, - }, { - { - 31, 31, 32, 32, 32, 32, 32, 33, - 31, 32, 32, 32, 32, 32, 33, 33, - 32, 32, 32, 32, 33, 34, 34, 35, - 32, 32, 32, 33, 34, 34, 35, 36, - }, { - 31, 31, 31, 31, 34, 35, 38, 41, - 31, 31, 32, 32, 36, 37, 40, 42, - 35, 36, 37, 37, 40, 42, 45, 45, - 37, 38, 39, 40, 43, 44, 47, 47, - }, - }, { - { - 31, 31, 31, 31, 31, 31, 32, 32, - 31, 32, 32, 32, 32, 32, 32, 32, - 31, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 33, 33, 33, - }, { - 31, 31, 31, 31, 31, 31, 34, 34, - 31, 31, 31, 32, 32, 33, 36, 36, - 31, 31, 31, 32, 32, 33, 36, 36, - 34, 35, 35, 36, 36, 37, 40, 40, - }, - }, { - { - 31, 31, 31, 31, 31, 31, 31, 31, - 31, 31, 32, 32, 32, 32, 32, 32, - 31, 31, 32, 32, 32, 32, 32, 32, - 31, 31, 32, 32, 32, 32, 32, 32, - }, { - 31, 31, 31, 31, 31, 31, 31, 30, - 31, 31, 31, 31, 31, 31, 31, 31, - 31, 31, 31, 31, 31, 31, 32, 32, - 31, 31, 31, 31, 31, 31, 32, 32, - }, - }, -}; - -static const uint8_t qm_tbl_8x8_t[][2][36] = { - { - { - 32, - 32, 35, - 38, 40, 54, - 51, 49, 65, 82, - 68, 63, 78, 97, 117, - 84, 76, 91, 111, 134, 152, - 95, 89, 98, 113, 138, 159, 183, - 109, 102, 106, 121, 142, 168, 199, 220, - }, { - 31, - 38, 47, - 47, 46, 54, - 50, 47, 57, 66, - 57, 52, 61, 72, 82, - 63, 57, 66, 77, 88, 96, - 67, 62, 67, 75, 86, 95, 104, - 71, 67, 68, 75, 84, 95, 107, 113, - }, - }, { - { - 32, - 32, 35, - 37, 39, 
51, - 47, 46, 60, 73, - 62, 58, 71, 87, 105, - 78, 72, 84, 100, 121, 140, - 90, 84, 93, 106, 129, 148, 169, - 102, 96, 100, 113, 132, 155, 183, 201, - }, { - 31, - 38, 47, - 47, 47, 53, - 48, 46, 55, 62, - 54, 50, 58, 67, 76, - 61, 55, 63, 72, 83, 91, - 66, 61, 65, 73, 84, 92, 101, - 69, 65, 66, 73, 82, 92, 103, 109, - }, - }, { - { - 32, - 32, 34, - 35, 37, 48, - 46, 45, 56, 70, - 57, 54, 64, 80, 93, - 76, 70, 79, 96, 111, 134, - 85, 79, 87, 100, 121, 138, 156, - 96, 90, 93, 105, 122, 144, 168, 184, - }, { - 31, - 36, 43, - 47, 47, 53, - 48, 46, 54, 61, - 52, 49, 55, 65, 71, - 60, 55, 60, 70, 78, 89, - 64, 59, 63, 71, 81, 89, 97, - 67, 63, 64, 71, 79, 89, 99, 104, - }, - }, { - { - 32, - 32, 33, - 35, 36, 46, - 42, 42, 52, 63, - 53, 51, 60, 73, 86, - 68, 64, 72, 84, 100, 117, - 78, 74, 80, 92, 109, 128, 140, - 90, 84, 87, 98, 114, 133, 155, 168, - }, { - 31, - 34, 39, - 46, 47, 52, - 47, 45, 52, 58, - 50, 48, 54, 62, 68, - 57, 53, 58, 65, 73, 82, - 61, 57, 61, 68, 77, 86, 91, - 65, 61, 62, 68, 76, 86, 95, 100, - }, - }, { - { - 32, - 32, 33, - 34, 35, 39, - 39, 40, 46, 56, - 50, 48, 53, 65, 78, - 62, 59, 63, 75, 90, 105, - 76, 71, 74, 86, 101, 118, 134, - 84, 79, 81, 92, 106, 123, 142, 153, - }, { - 31, - 34, 39, - 42, 45, 48, - 47, 46, 49, 55, - 49, 47, 50, 58, 65, - 54, 51, 53, 61, 69, 76, - 60, 56, 57, 65, 73, 82, 89, - 64, 59, 60, 66, 74, 83, 92, 96, - }, - }, { - { - 32, - 32, 33, - 34, 35, 39, - 38, 39, 45, 54, - 46, 45, 51, 61, 71, - 56, 54, 58, 69, 80, 92, - 68, 64, 68, 78, 90, 103, 117, - 78, 74, 76, 86, 99, 113, 128, 140, - }, { - 31, - 34, 39, - 42, 45, 48, - 47, 46, 49, 54, - 48, 46, 50, 56, 61, - 52, 49, 52, 58, 65, 71, - 57, 53, 55, 61, 68, 75, 82, - 61, 57, 58, 64, 71, 79, 86, 91, - }, - }, { - { - 31, - 32, 32, - 32, 33, 35, - 35, 35, 38, 48, - 42, 41, 43, 54, 63, - 51, 49, 49, 59, 71, 81, - 59, 56, 56, 66, 77, 89, 98, - 69, 65, 64, 73, 85, 97, 108, 119, - }, { - 31, - 32, 35, - 38, 42, 47, - 48, 47, 48, 53, - 47, 45, 45, 53, 58, - 50, 47, 47, 54, 61, 66, - 53, 50, 49, 56, 63, 69, 73, - 57, 54, 52, 58, 65, 72, 77, 82, - }, - }, { - { - 31, - 32, 32, - 32, 32, 35, - 34, 34, 37, 42, - 38, 37, 40, 47, 54, - 46, 44, 45, 52, 60, 69, - 52, 49, 49, 56, 65, 75, 82, - 63, 59, 58, 65, 73, 84, 92, 105, - }, { - 31, - 31, 32, - 38, 40, 47, - 44, 44, 47, 50, - 47, 45, 46, 51, 54, - 48, 46, 46, 51, 56, 61, - 50, 47, 47, 52, 57, 63, 66, - 55, 52, 50, 54, 60, 66, 70, 76, - }, - }, { - { - 31, - 32, 32, - 32, 32, 34, - 34, 33, 35, 39, - 35, 34, 37, 42, 48, - 41, 40, 41, 47, 53, 60, - 47, 44, 45, 51, 57, 65, 71, - 53, 50, 51, 55, 61, 70, 77, 85, - }, { - 31, - 31, 32, - 35, 36, 41, - 42, 42, 45, 48, - 48, 46, 47, 50, 53, - 47, 45, 45, 49, 53, 57, - 49, 46, 46, 50, 54, 59, 61, - 51, 48, 48, 51, 54, 60, 64, 68, - }, - }, { - { - 31, - 31, 32, - 32, 32, 33, - 32, 32, 34, 35, - 34, 34, 35, 37, 41, - 37, 36, 38, 39, 45, 51, - 43, 41, 42, 42, 49, 56, 63, - 47, 44, 45, 46, 52, 59, 67, 71, - }, { - 31, - 31, 32, - 34, 35, 39, - 37, 40, 43, 47, - 43, 43, 45, 47, 49, - 48, 46, 46, 47, 50, 53, - 47, 45, 45, 45, 50, 55, 58, - 49, 46, 46, 46, 50, 55, 60, 61, - }, - }, { - { - 31, - 31, 32, - 32, 32, 32, - 32, 32, 33, 34, - 33, 33, 34, 35, 37, - 34, 34, 35, 36, 39, 43, - 37, 36, 37, 38, 41, 46, 51, - 41, 39, 40, 41, 44, 49, 54, 58, - }, { - 31, - 31, 31, - 32, 33, 35, - 35, 37, 39, 43, - 39, 41, 42, 45, 47, - 45, 44, 45, 47, 48, 50, - 48, 46, 46, 47, 48, 51, 53, - 48, 46, 45, 46, 47, 51, 54, 56, - }, - }, { - { - 31, - 31, 32, - 31, 32, 32, - 32, 32, 32, 33, - 32, 32, 32, 34, 35, - 32, 33, 33, 34, 35, 
36, - 34, 34, 33, 35, 36, 38, 39, - 35, 35, 34, 36, 38, 40, 42, 48, - }, { - 31, - 31, 31, - 30, 31, 32, - 34, 34, 35, 39, - 36, 37, 39, 42, 46, - 39, 40, 41, 44, 47, 47, - 42, 42, 42, 45, 47, 48, 48, - 48, 47, 46, 47, 47, 49, 50, 53, - }, - }, { - { - 31, - 31, 32, - 31, 32, 32, - 31, 32, 32, 32, - 32, 32, 32, 32, 33, - 32, 32, 32, 32, 33, 34, - 32, 32, 32, 32, 34, 34, 35, - 33, 33, 33, 33, 35, 35, 36, 38, - }, { - 31, - 31, 31, - 31, 31, 31, - 30, 31, 31, 32, - 34, 34, 35, 35, 39, - 35, 35, 36, 36, 40, 41, - 37, 38, 39, 40, 43, 44, 47, - 40, 41, 41, 42, 44, 45, 47, 48, - }, - }, { - { - 31, - 31, 32, - 31, 32, 32, - 31, 32, 32, 32, - 31, 32, 32, 32, 32, - 31, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 33, - 32, 32, 32, 32, 32, 32, 33, 33, - }, { - 31, - 31, 31, - 31, 31, 31, - 31, 31, 31, 31, - 30, 31, 31, 31, 32, - 31, 32, 32, 32, 32, 33, - 33, 34, 34, 35, 35, 36, 39, - 33, 34, 34, 35, 35, 36, 39, 39, - }, - }, { - { - 31, - 31, 31, - 31, 31, 31, - 31, 31, 32, 32, - 31, 31, 32, 32, 32, - 31, 31, 32, 32, 32, 32, - 31, 31, 32, 32, 32, 32, 32, - 31, 31, 32, 32, 32, 32, 32, 32, - }, { - 31, - 31, 31, - 31, 31, 31, - 31, 31, 31, 31, - 31, 31, 31, 31, 31, - 31, 31, 31, 31, 31, 31, - 31, 31, 31, 31, 31, 31, 31, - 30, 31, 31, 31, 31, 31, 31, 31, - }, - }, -}; - -static const uint8_t qm_tbl_16x4[][2][64] = { - { - { - 31, 32, 32, 34, 34, 41, 45, 54, 60, 72, 75, 83, 88, 94, 101, 108, - 44, 41, 42, 48, 54, 63, 67, 75, 79, 90, 92, 100, 100, 101, 108, 115, - 79, 72, 71, 73, 78, 90, 96, 110, 118, 133, 136, 142, 140, 144, 141, 151, - 96, 90, 86, 83, 89, 95, 102, 111, 123, 135, 149, 160, 173, 180, 188, 197, - }, { - 31, 32, 36, 43, 46, 45, 46, 50, 52, 57, 59, 62, 63, 65, 67, 69, - 49, 45, 46, 49, 53, 58, 59, 62, 64, 67, 68, 71, 69, 68, 70, 72, - 63, 57, 56, 57, 60, 67, 71, 78, 82, 89, 90, 91, 89, 89, 86, 88, - 69, 65, 62, 60, 63, 66, 70, 74, 80, 85, 91, 96, 101, 103, 105, 107, - }, - }, { - { - 31, 32, 32, 33, 34, 37, 44, 49, 56, 65, 72, 78, 84, 89, 95, 101, - 44, 41, 42, 44, 54, 58, 66, 71, 77, 84, 90, 95, 95, 95, 101, 108, - 73, 67, 65, 66, 74, 79, 90, 99, 107, 119, 127, 133, 132, 136, 132, 141, - 93, 87, 83, 81, 86, 92, 98, 107, 117, 129, 141, 151, 163, 169, 175, 183, - }, { - 31, 32, 36, 41, 46, 46, 46, 48, 51, 54, 57, 60, 62, 64, 65, 67, - 49, 45, 46, 47, 53, 56, 59, 61, 63, 65, 67, 69, 67, 66, 68, 70, - 61, 55, 54, 54, 59, 62, 68, 73, 77, 82, 86, 88, 86, 87, 83, 86, - 69, 64, 61, 59, 62, 65, 68, 73, 78, 84, 89, 93, 98, 100, 102, 103, - }, - }, { - { - 31, 32, 32, 33, 34, 37, 41, 46, 53, 60, 65, 74, 79, 84, 89, 94, - 39, 38, 39, 40, 47, 54, 58, 62, 68, 73, 78, 85, 90, 90, 95, 101, - 65, 60, 59, 58, 65, 73, 79, 86, 97, 105, 111, 120, 125, 128, 124, 131, - 90, 84, 81, 78, 83, 89, 94, 102, 112, 123, 134, 143, 154, 158, 164, 170, - }, { - 31, 32, 36, 40, 44, 46, 45, 47, 49, 52, 54, 58, 60, 62, 64, 65, - 48, 46, 46, 46, 51, 54, 56, 57, 58, 60, 62, 64, 66, 64, 66, 68, - 57, 53, 51, 50, 54, 60, 64, 68, 73, 76, 79, 82, 84, 84, 81, 83, - 68, 63, 60, 58, 61, 64, 67, 71, 77, 82, 87, 91, 95, 97, 99, 100, - }, - }, { - { - 31, 32, 32, 33, 34, 34, 39, 44, 49, 54, 60, 68, 75, 79, 84, 88, - 36, 35, 36, 38, 42, 48, 51, 56, 60, 63, 68, 75, 81, 85, 89, 94, - 62, 58, 57, 56, 61, 66, 74, 82, 90, 95, 102, 110, 117, 120, 116, 123, - 88, 82, 79, 76, 81, 85, 91, 98, 107, 117, 127, 135, 145, 148, 153, 159, - }, { - 31, 32, 35, 40, 43, 46, 45, 46, 48, 50, 52, 56, 58, 60, 62, 63, - 48, 46, 47, 47, 50, 53, 53, 54, 54, 55, 56, 59, 61, 63, 64, 66, - 56, 52, 50, 49, 53, 56, 61, 65, 70, 72, 75, 79, 81, 82, 79, 81, - 67, 62, 
60, 57, 60, 63, 66, 70, 75, 80, 85, 89, 93, 94, 96, 97, - }, - }, { - { - 31, 32, 32, 32, 33, 34, 37, 41, 45, 49, 54, 61, 68, 74, 78, 83, - 36, 35, 35, 37, 41, 48, 50, 53, 56, 60, 63, 69, 75, 80, 84, 88, - 53, 51, 49, 49, 53, 60, 65, 71, 76, 82, 87, 93, 100, 105, 109, 114, - 81, 76, 73, 71, 74, 80, 85, 91, 98, 105, 112, 121, 130, 137, 142, 148, - }, { - 31, 31, 33, 38, 42, 46, 46, 45, 46, 48, 50, 52, 56, 58, 60, 62, - 48, 47, 46, 47, 49, 53, 53, 53, 54, 54, 55, 57, 59, 61, 62, 64, - 52, 49, 48, 47, 50, 54, 57, 61, 64, 66, 68, 71, 73, 75, 76, 78, - 64, 60, 57, 56, 57, 61, 64, 68, 71, 75, 78, 83, 87, 90, 92, 94, - }, - }, { - { - 31, 32, 32, 32, 33, 34, 34, 37, 41, 45, 49, 54, 60, 65, 72, 75, - 36, 35, 34, 36, 38, 42, 48, 50, 53, 56, 60, 63, 68, 73, 79, 81, - 53, 51, 49, 50, 49, 54, 60, 65, 71, 76, 82, 87, 92, 97, 104, 106, - 79, 75, 72, 71, 69, 73, 78, 84, 90, 96, 103, 110, 118, 125, 133, 136, - }, { - 31, 31, 32, 36, 40, 43, 46, 46, 45, 46, 48, 50, 52, 54, 57, 59, - 48, 47, 46, 47, 47, 50, 53, 53, 53, 54, 54, 55, 56, 58, 60, 61, - 52, 50, 48, 47, 47, 50, 54, 57, 61, 64, 66, 68, 70, 72, 75, 75, - 63, 60, 57, 56, 54, 57, 60, 64, 67, 71, 75, 78, 82, 85, 89, 90, - }, - }, { - { - 31, 32, 32, 32, 32, 34, 34, 36, 39, 42, 45, 50, 54, 60, 66, 73, - 34, 34, 33, 35, 37, 39, 42, 44, 46, 48, 51, 54, 58, 63, 68, 74, - 44, 43, 41, 43, 43, 48, 53, 57, 60, 64, 67, 72, 76, 80, 85, 91, - 65, 62, 59, 59, 58, 63, 67, 71, 76, 81, 85, 92, 98, 105, 111, 118, - }, { - 31, 31, 32, 35, 40, 43, 46, 46, 46, 46, 47, 48, 50, 52, 55, 58, - 42, 42, 42, 45, 47, 48, 50, 50, 49, 49, 50, 50, 52, 53, 55, 58, - 49, 47, 45, 46, 46, 49, 53, 55, 57, 59, 60, 61, 63, 64, 66, 68, - 57, 54, 52, 51, 50, 53, 56, 58, 61, 64, 67, 71, 73, 76, 79, 82, - }, - }, { - { - 31, 32, 32, 32, 32, 32, 34, 35, 37, 39, 41, 45, 50, 54, 57, 61, - 32, 32, 33, 34, 34, 35, 37, 38, 40, 41, 43, 46, 50, 53, 56, 58, - 44, 42, 41, 42, 42, 42, 48, 54, 57, 60, 63, 67, 71, 74, 77, 79, - 58, 55, 53, 53, 53, 52, 57, 63, 67, 70, 74, 79, 86, 90, 93, 97, - }, { - 31, 31, 32, 34, 37, 39, 42, 47, 46, 46, 46, 47, 48, 50, 51, 53, - 37, 38, 40, 42, 44, 47, 47, 48, 47, 46, 46, 46, 47, 48, 49, 50, - 49, 47, 45, 45, 45, 45, 49, 53, 55, 57, 58, 59, 61, 62, 63, 64, - 54, 51, 49, 49, 48, 48, 51, 55, 58, 60, 62, 65, 68, 70, 71, 73, - }, - }, { - { - 31, 32, 32, 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 55, - 32, 32, 32, 33, 34, 35, 36, 37, 38, 40, 40, 43, 45, 47, 50, 54, - 38, 37, 36, 36, 38, 39, 41, 44, 49, 51, 52, 56, 58, 60, 63, 67, - 53, 51, 49, 49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, - }, { - 31, 31, 31, 32, 35, 39, 40, 42, 47, 47, 46, 46, 47, 48, 49, 51, - 37, 38, 39, 40, 43, 47, 47, 47, 48, 47, 47, 46, 46, 47, 47, 49, - 48, 47, 46, 46, 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57, - 52, 50, 48, 48, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66, 68, - }, - }, { - { - 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, - 32, 32, 32, 33, 34, 34, 35, 36, 37, 38, 38, 40, 41, 43, 45, 46, - 36, 35, 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56, - 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58, 60, 63, 66, 67, - }, { - 31, 31, 31, 31, 34, 35, 39, 40, 42, 46, 47, 47, 47, 46, 48, 48, - 37, 38, 39, 40, 42, 43, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, - 48, 47, 46, 46, 47, 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54, - 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56, 57, 58, 59, 59, - }, - }, { - { - 31, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, - 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, - 34, 34, 34, 33, 33, 
35, 35, 37, 37, 39, 39, 43, 43, 45, 45, 48, - 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, 45, 51, 51, 54, 54, 58, - }, { - 31, 31, 31, 31, 31, 34, 34, 38, 38, 42, 42, 48, 48, 47, 47, 47, - 33, 34, 34, 35, 35, 39, 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, - 42, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 50, 50, 49, - 48, 47, 47, 45, 45, 46, 46, 46, 46, 50, 50, 53, 53, 54, 54, 56, - }, - }, { - { - 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, - 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, - 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, - 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 41, 42, 44, 48, 48, - }, { - 31, 31, 31, 31, 31, 31, 34, 34, 36, 38, 38, 41, 42, 44, 48, 48, - 31, 31, 31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, - 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, - 48, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 50, 51, 53, 53, - }, - }, { - { - 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, - 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, - 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 36, - 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, - }, { - 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 36, 38, 38, 39, 42, - 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, - 35, 35, 36, 36, 36, 37, 37, 38, 40, 40, 40, 43, 45, 45, 45, 46, - 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, - }, - }, { - { - 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, - 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, - }, { - 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, - 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, - 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, - 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 39, 40, 40, 40, - }, - }, { - { - 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, - 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - }, { - 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, - 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, - 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, - 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, - }, - }, -}; - -static const uint8_t qm_tbl_16x8[][2][128] = { - { - { - 32, 31, 32, 34, 36, 44, 48, 58, 65, 79, 82, 91, 97, 103, 110, 118, - 32, 33, 34, 37, 38, 43, 46, 54, 58, 70, 72, 80, 86, 93, 100, 107, - 36, 34, 36, 42, 48, 53, 56, 63, 68, 79, 81, 88, 94, 98, 101, 105, - 53, 49, 50, 54, 60, 71, 76, 87, 92, 104, 106, 106, 107, 114, 117, 118, - 65, 59, 59, 63, 68, 79, 85, 98, 105, 118, 121, 130, 128, 131, 138, 136, - 87, 78, 77, 79, 84, 95, 102, 116, 124, 141, 144, 148, 157, 150, 161, 157, - 93, 86, 82, 80, 86, 94, 105, 112, 122, 135, 149, 162, 167, 174, 183, 182, - 99, 93, 89, 88, 90, 97, 105, 115, 124, 135, 146, 159, 171, 186, 193, 203, - }, { - 32, 30, 33, 42, 49, 49, 50, 54, 57, 63, 64, 68, 70, 72, 74, 76, - 37, 40, 43, 47, 48, 46, 46, 49, 50, 55, 56, 59, 62, 64, 67, 69, - 48, 46, 47, 50, 53, 53, 54, 55, 56, 60, 61, 64, 66, 66, 66, 67, - 52, 48, 47, 50, 54, 61, 64, 68, 70, 75, 75, 74, 73, 75, 74, 73, - 57, 52, 51, 53, 57, 64, 67, 73, 76, 82, 83, 
86, 83, 83, 84, 82, - 66, 60, 59, 60, 62, 69, 73, 80, 84, 92, 93, 94, 96, 92, 94, 91, - 68, 63, 60, 59, 62, 66, 72, 76, 80, 87, 93, 98, 99, 101, 103, 101, - 71, 66, 63, 62, 62, 66, 70, 75, 79, 84, 89, 94, 98, 104, 106, 109, - }, - }, { - { - 32, 31, 32, 32, 36, 39, 47, 53, 61, 71, 79, 86, 92, 98, 104, 110, - 32, 32, 34, 35, 37, 40, 45, 50, 56, 64, 70, 76, 82, 88, 94, 100, - 36, 35, 36, 40, 48, 50, 56, 60, 65, 73, 79, 84, 89, 93, 95, 98, - 47, 44, 45, 47, 56, 60, 69, 75, 81, 89, 95, 100, 101, 108, 110, 111, - 65, 60, 59, 60, 68, 73, 84, 92, 100, 111, 118, 124, 121, 124, 129, 127, - 79, 72, 71, 71, 78, 84, 95, 103, 113, 125, 133, 140, 148, 141, 151, 147, - 90, 84, 80, 78, 83, 91, 101, 108, 116, 129, 142, 153, 157, 163, 171, 169, - 96, 90, 87, 85, 87, 94, 101, 110, 118, 129, 138, 150, 161, 174, 181, 188, - }, { - 32, 30, 33, 39, 49, 48, 50, 52, 55, 60, 63, 66, 68, 70, 72, 74, - 35, 38, 41, 46, 48, 46, 46, 47, 49, 53, 55, 58, 60, 62, 65, 67, - 48, 46, 47, 48, 53, 53, 54, 54, 56, 58, 60, 62, 64, 65, 65, 65, - 50, 46, 46, 47, 54, 56, 61, 63, 65, 68, 70, 72, 71, 73, 72, 71, - 57, 52, 51, 51, 57, 60, 66, 71, 74, 79, 82, 84, 81, 81, 82, 79, - 63, 58, 56, 55, 60, 64, 70, 75, 79, 85, 89, 91, 94, 89, 92, 89, - 68, 63, 60, 58, 61, 65, 71, 75, 79, 85, 91, 95, 97, 98, 100, 98, - 70, 65, 63, 61, 61, 65, 69, 74, 78, 82, 87, 91, 96, 101, 103, 105, - }, - }, { - { - 32, 31, 32, 32, 34, 39, 44, 49, 57, 65, 71, 81, 87, 92, 98, 103, - 32, 32, 33, 34, 36, 39, 42, 46, 53, 59, 64, 72, 77, 83, 88, 94, - 36, 35, 36, 38, 44, 50, 53, 57, 63, 68, 73, 80, 85, 88, 89, 92, - 44, 41, 42, 42, 50, 58, 63, 67, 74, 79, 84, 91, 96, 102, 103, 103, - 58, 54, 53, 52, 59, 68, 74, 81, 90, 97, 102, 110, 114, 117, 121, 119, - 79, 73, 71, 69, 75, 84, 90, 97, 108, 118, 125, 135, 140, 133, 141, 137, - 88, 81, 78, 76, 81, 88, 97, 104, 111, 123, 135, 145, 148, 153, 160, 158, - 93, 88, 84, 82, 84, 90, 97, 105, 113, 122, 131, 141, 151, 163, 169, 175, - }, { - 32, 31, 33, 37, 44, 48, 49, 51, 54, 57, 60, 64, 66, 68, 70, 72, - 34, 36, 40, 44, 46, 46, 45, 47, 49, 51, 53, 57, 59, 61, 63, 65, - 48, 46, 47, 47, 51, 53, 53, 54, 55, 56, 58, 61, 63, 63, 63, 63, - 49, 46, 46, 45, 51, 56, 58, 60, 62, 64, 65, 68, 69, 71, 70, 69, - 54, 50, 49, 48, 53, 58, 62, 65, 70, 73, 75, 78, 79, 79, 80, 77, - 63, 58, 56, 54, 59, 64, 67, 71, 77, 82, 85, 89, 91, 87, 89, 86, - 67, 62, 59, 57, 60, 64, 70, 73, 77, 83, 89, 93, 94, 96, 97, 95, - 69, 65, 62, 60, 61, 64, 68, 72, 76, 81, 85, 89, 93, 98, 100, 102, - }, - }, { - { - 32, 31, 31, 32, 34, 36, 41, 47, 53, 58, 65, 74, 82, 87, 92, 97, - 31, 32, 33, 34, 35, 36, 40, 44, 50, 54, 59, 67, 73, 78, 83, 88, - 35, 34, 35, 37, 41, 46, 49, 53, 57, 61, 66, 73, 79, 83, 84, 86, - 44, 41, 42, 42, 48, 54, 60, 66, 71, 75, 79, 86, 92, 96, 97, 97, - 53, 50, 49, 49, 54, 60, 67, 75, 82, 87, 92, 100, 105, 110, 114, 111, - 65, 61, 59, 58, 63, 68, 76, 84, 92, 98, 105, 113, 120, 125, 132, 128, - 82, 76, 73, 71, 76, 80, 88, 97, 106, 112, 120, 131, 139, 144, 150, 147, - 90, 85, 81, 79, 81, 87, 93, 101, 108, 116, 124, 134, 142, 153, 157, 163, - }, { - 32, 31, 33, 37, 42, 49, 48, 50, 52, 54, 57, 61, 64, 66, 68, 70, - 33, 34, 37, 43, 44, 47, 46, 46, 47, 49, 51, 55, 57, 59, 61, 63, - 45, 45, 46, 47, 49, 52, 51, 52, 53, 54, 55, 58, 60, 61, 61, 61, - 49, 46, 45, 45, 49, 53, 57, 59, 61, 62, 64, 66, 68, 69, 68, 67, - 52, 49, 47, 47, 50, 54, 59, 63, 66, 68, 70, 73, 75, 77, 77, 75, - 57, 53, 51, 50, 53, 57, 61, 66, 71, 73, 76, 80, 83, 84, 86, 83, - 64, 60, 57, 55, 58, 61, 66, 71, 75, 79, 83, 87, 91, 93, 94, 92, - 68, 64, 61, 59, 60, 63, 67, 71, 74, 79, 
83, 87, 91, 95, 97, 98, - }, - }, { - { - 32, 31, 31, 32, 33, 36, 39, 44, 48, 53, 58, 66, 74, 81, 86, 91, - 31, 32, 32, 33, 34, 35, 38, 41, 45, 49, 54, 60, 67, 73, 78, 82, - 33, 33, 34, 36, 38, 42, 44, 46, 50, 53, 57, 63, 69, 75, 78, 80, - 40, 39, 38, 40, 44, 51, 54, 59, 62, 66, 70, 75, 81, 86, 90, 90, - 51, 49, 47, 48, 52, 58, 63, 69, 74, 79, 84, 90, 97, 102, 106, 103, - 65, 61, 59, 58, 62, 68, 73, 79, 85, 92, 98, 106, 113, 120, 124, 119, - 79, 74, 71, 69, 72, 78, 84, 90, 96, 103, 110, 119, 128, 135, 140, 137, - 87, 82, 79, 77, 78, 84, 89, 96, 103, 111, 118, 126, 134, 143, 147, 151, - }, { - 32, 31, 31, 35, 41, 49, 48, 49, 50, 52, 54, 57, 61, 64, 66, 68, - 32, 33, 35, 39, 43, 47, 46, 45, 46, 48, 50, 52, 55, 58, 59, 61, - 40, 41, 43, 46, 48, 50, 49, 48, 49, 50, 51, 53, 56, 58, 59, 59, - 49, 47, 46, 46, 49, 53, 54, 56, 57, 58, 59, 61, 63, 65, 66, 65, - 51, 49, 47, 47, 49, 54, 57, 61, 63, 65, 67, 69, 72, 73, 75, 72, - 57, 54, 51, 50, 52, 57, 60, 64, 67, 71, 73, 77, 80, 82, 84, 81, - 63, 59, 57, 55, 57, 60, 64, 67, 71, 75, 78, 82, 86, 89, 91, 89, - 67, 63, 60, 58, 59, 62, 65, 69, 73, 77, 81, 85, 88, 92, 94, 95, - }, - }, { - { - 32, 31, 31, 32, 32, 34, 36, 39, 44, 48, 53, 58, 65, 71, 79, 82, - 31, 32, 32, 32, 33, 34, 34, 37, 41, 45, 49, 54, 60, 65, 72, 75, - 32, 32, 33, 34, 35, 37, 38, 40, 43, 46, 50, 54, 58, 63, 70, 72, - 36, 35, 34, 36, 38, 42, 48, 50, 53, 56, 60, 63, 68, 73, 79, 81, - 44, 42, 41, 42, 42, 48, 54, 58, 63, 67, 71, 75, 79, 84, 90, 92, - 53, 51, 49, 50, 49, 54, 60, 65, 71, 76, 82, 87, 92, 97, 104, 106, - 65, 62, 59, 59, 58, 63, 68, 73, 79, 85, 92, 98, 105, 111, 118, 121, - 79, 75, 72, 71, 69, 73, 78, 84, 90, 96, 103, 110, 118, 125, 133, 136, - }, { - 32, 31, 30, 33, 37, 42, 49, 48, 49, 50, 52, 54, 57, 60, 63, 64, - 31, 31, 32, 36, 40, 43, 46, 46, 45, 46, 48, 50, 52, 54, 57, 59, - 37, 38, 40, 43, 47, 47, 48, 47, 46, 46, 47, 49, 50, 52, 55, 56, - 48, 47, 46, 47, 47, 50, 53, 53, 53, 54, 54, 55, 56, 58, 60, 61, - 49, 47, 45, 46, 45, 49, 53, 56, 58, 59, 61, 62, 64, 65, 67, 68, - 52, 50, 48, 47, 47, 50, 54, 57, 61, 64, 66, 68, 70, 72, 75, 75, - 57, 54, 52, 51, 50, 53, 57, 60, 64, 67, 71, 73, 76, 79, 82, 83, - 63, 60, 57, 56, 54, 57, 60, 64, 67, 71, 75, 78, 82, 85, 89, 90, - }, - }, { - { - 32, 31, 31, 32, 32, 34, 35, 38, 41, 44, 48, 53, 58, 65, 71, 79, - 31, 32, 32, 32, 33, 34, 34, 36, 39, 42, 45, 49, 54, 60, 65, 72, - 32, 32, 33, 34, 35, 37, 38, 40, 41, 43, 46, 50, 54, 58, 63, 70, - 36, 35, 34, 36, 38, 42, 47, 49, 51, 54, 56, 60, 63, 68, 73, 79, - 44, 42, 41, 42, 42, 48, 52, 56, 60, 64, 67, 71, 75, 79, 84, 90, - 53, 51, 49, 50, 49, 54, 59, 63, 67, 72, 76, 82, 87, 92, 97, 104, - 62, 59, 57, 57, 56, 61, 65, 69, 74, 79, 83, 90, 95, 102, 108, 115, - 73, 69, 66, 65, 64, 69, 73, 77, 81, 86, 91, 99, 105, 112, 119, 127, - }, { - 32, 31, 30, 33, 37, 42, 47, 48, 48, 49, 50, 52, 54, 57, 60, 63, - 31, 31, 32, 36, 40, 43, 46, 46, 45, 45, 46, 48, 50, 52, 54, 57, - 37, 38, 40, 43, 47, 47, 48, 47, 46, 46, 46, 47, 49, 50, 52, 55, - 48, 47, 46, 47, 47, 50, 52, 53, 53, 53, 54, 54, 55, 56, 58, 60, - 49, 47, 45, 46, 45, 49, 53, 55, 57, 58, 59, 61, 62, 64, 65, 67, - 52, 50, 48, 47, 47, 50, 53, 56, 59, 62, 64, 66, 68, 70, 72, 75, - 56, 53, 51, 50, 49, 53, 55, 58, 61, 64, 66, 70, 72, 75, 77, 80, - 61, 57, 55, 54, 52, 56, 58, 61, 63, 66, 69, 73, 76, 79, 82, 86, - }, - }, { - { - 32, 31, 31, 31, 32, 32, 34, 36, 38, 41, 44, 48, 53, 57, 61, 65, - 31, 32, 32, 32, 32, 33, 34, 34, 37, 39, 41, 45, 49, 53, 56, 60, - 32, 32, 33, 34, 34, 35, 37, 38, 40, 41, 43, 46, 50, 53, 56, 58, - 35, 35, 34, 35, 36, 37, 41, 46, 
47, 49, 51, 54, 57, 60, 63, 66, - 39, 38, 37, 38, 39, 40, 44, 50, 52, 54, 57, 60, 64, 67, 69, 72, - 44, 42, 41, 42, 42, 42, 48, 54, 57, 60, 63, 67, 71, 74, 77, 79, - 53, 51, 49, 49, 49, 49, 54, 60, 64, 67, 71, 76, 82, 86, 89, 92, - 65, 62, 59, 59, 58, 58, 63, 68, 72, 76, 79, 85, 92, 97, 100, 105, - }, { - 32, 31, 30, 33, 35, 37, 42, 49, 48, 48, 49, 50, 52, 54, 55, 57, - 31, 31, 32, 35, 37, 40, 43, 46, 46, 45, 45, 46, 48, 49, 51, 52, - 37, 38, 40, 42, 44, 47, 47, 48, 47, 46, 46, 46, 47, 48, 49, 50, - 45, 45, 44, 46, 46, 47, 49, 52, 51, 51, 51, 52, 53, 54, 54, 55, - 48, 47, 45, 46, 46, 47, 50, 53, 54, 54, 55, 56, 57, 58, 58, 59, - 49, 47, 45, 45, 45, 45, 49, 53, 55, 57, 58, 59, 61, 62, 63, 64, - 52, 50, 48, 47, 47, 47, 50, 54, 57, 59, 61, 64, 66, 68, 69, 70, - 57, 54, 52, 51, 51, 50, 53, 57, 59, 61, 64, 67, 71, 73, 74, 76, - }, - }, { - { - 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, - 31, 32, 32, 32, 32, 33, 33, 34, 34, 36, 37, 41, 44, 46, 49, 54, - 32, 32, 32, 33, 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, - 32, 33, 33, 33, 34, 36, 36, 38, 40, 41, 42, 45, 47, 48, 51, 55, - 36, 35, 35, 35, 36, 38, 40, 42, 48, 49, 50, 53, 56, 57, 60, 63, - 44, 42, 41, 41, 42, 42, 44, 48, 54, 56, 58, 63, 66, 67, 71, 75, - 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, - 53, 51, 49, 49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, - }, { - 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, - 31, 31, 32, 33, 36, 40, 41, 43, 46, 46, 46, 45, 46, 47, 48, 50, - 35, 37, 38, 38, 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, - 38, 40, 40, 41, 44, 47, 47, 48, 49, 48, 48, 47, 48, 48, 48, 50, - 48, 47, 46, 46, 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 54, 55, - 49, 47, 45, 45, 46, 45, 47, 49, 53, 55, 56, 58, 59, 60, 61, 62, - 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, 61, 61, 63, 65, - 52, 50, 48, 48, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66, 68, - }, - }, { - { - 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, - 31, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 37, 39, 41, 44, 45, - 31, 32, 32, 32, 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, - 32, 32, 32, 33, 34, 34, 35, 36, 37, 38, 38, 40, 41, 43, 45, 46, - 35, 35, 34, 34, 35, 36, 37, 39, 41, 45, 46, 48, 49, 51, 53, 54, - 36, 35, 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56, - 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58, 60, 63, 66, 67, - 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66, 69, 70, - }, { - 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, - 31, 31, 32, 32, 35, 36, 40, 41, 43, 46, 46, 46, 45, 45, 46, 46, - 33, 34, 34, 35, 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, - 37, 38, 39, 40, 42, 43, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, - 45, 45, 45, 44, 46, 46, 47, 48, 49, 51, 52, 51, 51, 51, 52, 52, - 48, 47, 46, 46, 47, 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54, - 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56, 57, 58, 59, 59, - 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, 61, 61, - }, - }, { - { - 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, - 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, - 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, - 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, - 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, - 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, - 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, - 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, 58, 63, - }, { - 32, 31, 
31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, - 31, 31, 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, - 31, 31, 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, - 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, - 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, - 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, - 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, - 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, 56, 58, - }, - }, { - { - 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, - 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, - 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, - 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 35, 36, 36, - 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, - 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, - 35, 35, 35, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 43, 46, 46, - 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 41, 42, 44, 48, 48, - }, { - 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, - 31, 31, 31, 31, 32, 32, 34, 35, 37, 39, 39, 42, 42, 44, 47, 47, - 31, 31, 31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, - 33, 34, 34, 34, 35, 35, 37, 38, 40, 43, 43, 44, 44, 45, 47, 47, - 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, - 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, - 45, 45, 45, 45, 44, 44, 46, 46, 46, 47, 47, 49, 49, 50, 52, 52, - 48, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 50, 51, 53, 53, - }, - }, { - { - 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, - 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, - 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, - 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, - 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, - 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, - 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, - 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36, 36, 38, - }, { - 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, - 31, 31, 31, 31, 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, - 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, - 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, - 33, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 41, 43, 43, 43, 44, - 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, - 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, - 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47, 47, 48, - }, - }, { - { - 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, - 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, - }, { - 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, - 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, - 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, - 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, - 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 
36, - 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, - 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 38, 38, 38, - 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 40, 41, 41, 41, - }, - }, { - { - 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, - 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, - 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - }, { - 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, - 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, - 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, - 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, - 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, - 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, - 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, - 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, - }, - }, -}; - -static const uint8_t qm_tbl_32x8[][2][256] = { - { - { - 32, 31, 31, 31, 32, 32, 34, 35, 36, 39, 44, 46, 48, 53, 58, 61, 65, 71, 79, 81, 82, 88, 91, 94, 97, 100, 103, 107, 110, 114, 118, 122, - 32, 32, 33, 33, 34, 35, 37, 37, 38, 40, 43, 44, 46, 50, 54, 56, 58, 63, 70, 71, 72, 77, 80, 83, 86, 89, 93, 96, 100, 104, 107, 111, - 36, 35, 34, 35, 36, 38, 42, 45, 48, 50, 53, 55, 56, 60, 63, 66, 68, 73, 79, 80, 81, 85, 88, 91, 94, 97, 98, 100, 101, 103, 105, 107, - 53, 51, 49, 49, 50, 49, 54, 57, 60, 65, 71, 73, 76, 82, 87, 89, 92, 97, 104, 105, 106, 108, 106, 105, 107, 111, 114, 117, 117, 117, 118, 119, - 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85, 92, 98, 101, 105, 111, 118, 119, 121, 126, 130, 131, 128, 127, 131, 136, 138, 137, 136, 136, - 87, 82, 78, 78, 77, 75, 79, 82, 84, 89, 95, 98, 102, 109, 116, 120, 124, 132, 141, 142, 144, 149, 148, 153, 157, 152, 150, 155, 161, 159, 157, 156, - 93, 88, 86, 84, 82, 82, 80, 84, 86, 91, 94, 98, 105, 107, 112, 119, 122, 130, 135, 140, 149, 153, 162, 165, 167, 173, 174, 177, 183, 185, 182, 179, - 99, 94, 93, 90, 89, 89, 88, 87, 90, 93, 97, 99, 105, 107, 115, 116, 124, 127, 135, 139, 146, 152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204, - }, { - 32, 31, 30, 32, 33, 37, 42, 45, 49, 48, 49, 49, 50, 52, 54, 55, 57, 60, 63, 64, 64, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, - 37, 38, 40, 41, 43, 47, 47, 47, 48, 47, 46, 46, 46, 47, 49, 49, 50, 52, 55, 55, 56, 58, 59, 60, 62, 63, 64, 65, 67, 68, 69, 70, - 48, 47, 46, 46, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 55, 56, 56, 58, 60, 61, 61, 63, 64, 65, 66, 67, 66, 66, 66, 66, 67, 67, - 52, 50, 48, 48, 47, 47, 50, 52, 54, 57, 61, 62, 64, 66, 68, 69, 70, 72, 75, 75, 75, 76, 74, 72, 73, 74, 75, 75, 74, 74, 73, 73, - 57, 54, 52, 51, 51, 50, 53, 55, 57, 60, 64, 65, 67, 71, 73, 75, 76, 79, 82, 82, 83, 85, 86, 85, 83, 82, 83, 84, 84, 83, 82, 81, - 66, 63, 60, 59, 59, 57, 60, 61, 62, 66, 69, 71, 73, 77, 80, 82, 84, 88, 92, 92, 93, 95, 94, 95, 96, 93, 92, 93, 94, 93, 91, 90, - 68, 65, 63, 62, 60, 60, 59, 61, 62, 65, 66, 68, 72, 73, 76, 79, 80, 84, 87, 89, 93, 94, 98, 99, 99, 102, 101, 102, 103, 103, 101, 99, - 71, 67, 66, 64, 63, 62, 62, 61, 62, 64, 66, 67, 70, 71, 75, 76, 79, 81, 84, 86, 89, 91, 94, 97, 98, 102, 104, 106, 106, 109, 109, 
108, - }, - }, { - { - 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 61, 65, 71, 76, 79, 82, 86, 89, 92, 95, 98, 101, 104, 107, 110, 114, - 32, 32, 32, 33, 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 56, 59, 64, 68, 70, 73, 76, 79, 82, 85, 88, 91, 94, 97, 100, 104, - 36, 35, 35, 35, 36, 38, 40, 42, 48, 49, 50, 53, 56, 57, 60, 63, 65, 68, 73, 76, 79, 81, 84, 87, 89, 92, 93, 94, 95, 96, 98, 100, - 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 81, 84, 89, 92, 95, 97, 100, 99, 101, 105, 108, 110, 110, 110, 111, 111, - 65, 62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79, 84, 86, 92, 98, 100, 105, 111, 115, 118, 121, 124, 124, 121, 120, 124, 128, 129, 128, 127, 127, - 79, 75, 72, 71, 71, 69, 71, 73, 78, 81, 84, 90, 95, 97, 103, 110, 113, 118, 125, 130, 133, 136, 140, 145, 148, 143, 141, 146, 151, 149, 147, 145, - 90, 86, 84, 82, 80, 80, 78, 82, 83, 88, 91, 94, 101, 103, 108, 114, 116, 124, 129, 134, 142, 145, 153, 156, 157, 163, 163, 166, 171, 173, 169, 166, - 96, 91, 90, 87, 87, 86, 85, 84, 87, 90, 94, 96, 101, 102, 110, 111, 118, 121, 129, 132, 138, 144, 150, 156, 161, 171, 174, 179, 181, 188, 188, 190, - }, { - 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 55, 57, 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, - 35, 37, 38, 38, 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, 49, 50, 53, 54, 55, 56, 58, 59, 60, 61, 62, 64, 65, 66, 67, 68, - 48, 47, 46, 46, 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 54, 55, 56, 56, 58, 60, 60, 61, 62, 63, 64, 65, 65, 65, 65, 65, 65, 65, - 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, 61, 61, 63, 65, 65, 66, 68, 69, 70, 71, 72, 71, 71, 72, 73, 73, 72, 72, 71, 71, - 57, 54, 52, 52, 51, 50, 51, 53, 57, 58, 60, 64, 66, 68, 71, 73, 74, 76, 79, 81, 82, 83, 84, 83, 81, 80, 81, 82, 82, 81, 79, 78, - 63, 60, 58, 57, 56, 54, 55, 57, 60, 62, 64, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89, 90, 91, 93, 94, 91, 89, 90, 92, 90, 89, 87, - 68, 64, 63, 61, 60, 59, 58, 60, 61, 64, 65, 67, 71, 72, 75, 78, 79, 83, 85, 87, 91, 92, 95, 96, 97, 99, 98, 99, 100, 100, 98, 96, - 70, 66, 65, 63, 63, 62, 61, 60, 61, 63, 65, 66, 69, 70, 74, 74, 78, 79, 82, 84, 87, 89, 91, 94, 96, 100, 101, 103, 103, 105, 105, 105, - }, - }, { - { - 32, 31, 31, 31, 32, 32, 32, 34, 34, 36, 39, 40, 44, 47, 49, 53, 57, 59, 65, 69, 71, 79, 81, 82, 87, 90, 92, 95, 98, 100, 103, 106, - 32, 32, 32, 32, 33, 34, 34, 35, 36, 37, 39, 40, 42, 45, 46, 50, 53, 54, 59, 62, 64, 71, 72, 73, 77, 80, 83, 85, 88, 91, 94, 97, - 36, 35, 35, 34, 36, 37, 38, 42, 44, 48, 50, 51, 53, 56, 57, 60, 63, 64, 68, 71, 73, 79, 80, 81, 85, 87, 88, 88, 89, 90, 92, 93, - 44, 42, 41, 41, 42, 42, 42, 48, 50, 54, 58, 59, 63, 66, 67, 71, 74, 75, 79, 83, 84, 90, 91, 92, 96, 99, 102, 103, 103, 103, 103, 104, - 58, 55, 54, 53, 53, 53, 52, 57, 59, 63, 68, 70, 74, 79, 81, 86, 90, 91, 97, 100, 102, 109, 110, 111, 114, 113, 117, 120, 121, 120, 119, 118, - 79, 75, 73, 72, 71, 70, 69, 73, 75, 78, 84, 85, 90, 95, 97, 103, 108, 111, 118, 122, 125, 133, 135, 136, 140, 135, 133, 137, 141, 139, 137, 135, - 88, 83, 81, 79, 78, 77, 76, 79, 81, 85, 88, 91, 97, 99, 104, 109, 111, 119, 123, 127, 135, 137, 145, 147, 148, 153, 153, 155, 160, 161, 158, 155, - 93, 88, 88, 84, 84, 83, 82, 81, 84, 86, 90, 92, 97, 98, 105, 106, 113, 115, 122, 125, 131, 136, 141, 147, 151, 160, 163, 168, 169, 175, 175, 176, - }, { - 32, 31, 31, 30, 33, 35, 37, 42, 44, 49, 48, 48, 49, 50, 51, 52, 54, 54, 57, 59, 60, 63, 64, 64, 66, 67, 68, 69, 70, 71, 72, 73, - 34, 35, 36, 36, 40, 42, 44, 45, 46, 47, 46, 46, 45, 46, 47, 47, 49, 49, 51, 
52, 53, 56, 57, 57, 59, 60, 61, 62, 63, 64, 65, 66, - 48, 47, 46, 46, 47, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 58, 58, 60, 61, 61, 63, 63, 63, 63, 63, 63, 63, 63, - 49, 47, 46, 45, 46, 45, 45, 49, 51, 53, 56, 56, 58, 59, 60, 61, 62, 62, 64, 65, 65, 67, 68, 68, 69, 70, 71, 71, 70, 70, 69, 69, - 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 65, 65, 68, 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 80, 80, 78, 77, 76, - 63, 60, 58, 57, 56, 55, 54, 57, 59, 60, 64, 65, 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, 89, 90, 91, 88, 87, 88, 89, 88, 86, 84, - 67, 63, 62, 60, 59, 58, 57, 59, 60, 63, 64, 66, 70, 70, 73, 76, 77, 81, 83, 85, 89, 90, 93, 94, 94, 96, 96, 96, 97, 97, 95, 93, - 69, 65, 65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68, 72, 73, 76, 77, 81, 82, 85, 87, 89, 92, 93, 97, 98, 100, 100, 102, 102, 101, - }, - }, { - { - 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 53, 55, 58, 63, 65, 71, 74, 79, 82, 82, 87, 89, 92, 94, 97, 99, - 31, 32, 32, 32, 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, 50, 51, 54, 58, 59, 64, 67, 71, 73, 74, 78, 81, 83, 85, 88, 91, - 35, 35, 34, 34, 35, 36, 37, 39, 41, 45, 46, 48, 49, 51, 53, 54, 57, 59, 61, 65, 66, 71, 73, 77, 79, 79, 83, 83, 84, 85, 86, 87, - 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58, 60, 63, 66, 67, 71, 72, 75, 78, 79, 84, 86, 90, 92, 92, 96, 97, 97, 97, 97, 97, - 53, 51, 50, 49, 49, 50, 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, 82, 84, 87, 91, 92, 97, 100, 104, 105, 106, 110, 113, 114, 112, 111, 110, - 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 76, 79, 84, 85, 92, 94, 98, 103, 105, 111, 113, 118, 120, 121, 125, 128, 132, 130, 128, 126, - 82, 78, 76, 74, 73, 73, 71, 73, 76, 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118, 120, 127, 131, 136, 139, 139, 144, 145, 150, 151, 147, 144, - 90, 86, 85, 82, 81, 80, 79, 78, 81, 83, 87, 88, 93, 94, 101, 101, 108, 110, 116, 119, 124, 129, 134, 139, 142, 150, 153, 157, 157, 163, 163, 163, - }, { - 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 52, 53, 54, 56, 57, 60, 61, 63, 64, 64, 66, 67, 68, 69, 70, 70, - 33, 34, 34, 35, 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, - 45, 45, 45, 44, 46, 46, 47, 48, 49, 51, 52, 51, 51, 51, 52, 52, 53, 53, 54, 55, 55, 57, 58, 59, 60, 60, 61, 61, 61, 61, 61, 61, - 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56, 57, 58, 59, 59, 61, 61, 62, 63, 64, 65, 66, 67, 68, 68, 69, 69, 68, 68, 67, 67, - 52, 50, 49, 48, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61, 63, 64, 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 78, 77, 76, 75, 74, - 57, 54, 53, 52, 51, 51, 50, 51, 53, 56, 57, 60, 61, 64, 66, 67, 71, 72, 73, 76, 76, 79, 80, 82, 83, 83, 84, 85, 86, 85, 83, 82, - 64, 61, 60, 58, 57, 57, 55, 56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 87, 90, 91, 91, 93, 93, 94, 94, 92, 90, - 68, 64, 64, 61, 61, 60, 59, 58, 60, 61, 63, 64, 67, 67, 71, 71, 74, 75, 79, 80, 83, 85, 87, 89, 91, 94, 95, 97, 97, 99, 98, 98, - }, - }, { - { - 32, 31, 31, 31, 31, 32, 32, 32, 33, 34, 36, 36, 39, 40, 44, 46, 48, 52, 53, 58, 58, 65, 66, 71, 74, 79, 81, 82, 86, 88, 91, 93, - 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 38, 39, 41, 43, 45, 48, 49, 53, 54, 59, 60, 65, 67, 72, 73, 74, 78, 80, 82, 85, - 33, 33, 33, 33, 34, 35, 36, 36, 38, 39, 42, 42, 44, 45, 46, 48, 50, 52, 53, 57, 57, 62, 63, 67, 69, 73, 75, 75, 78, 80, 80, 81, - 40, 39, 39, 38, 38, 39, 40, 41, 44, 45, 51, 51, 54, 56, 59, 60, 62, 65, 66, 69, 70, 74, 75, 79, 81, 85, 86, 87, 90, 90, 90, 90, - 51, 49, 49, 47, 47, 
48, 48, 48, 52, 53, 58, 59, 63, 65, 69, 72, 74, 78, 79, 83, 84, 89, 90, 94, 97, 101, 102, 103, 106, 105, 103, 103, - 65, 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90, 92, 97, 98, 105, 106, 111, 113, 118, 120, 121, 124, 122, 119, 117, - 79, 75, 74, 72, 71, 71, 69, 69, 72, 73, 78, 79, 84, 85, 90, 93, 96, 101, 103, 109, 110, 118, 119, 125, 128, 133, 135, 136, 140, 140, 137, 134, - 87, 83, 82, 79, 79, 78, 77, 75, 78, 80, 84, 85, 89, 90, 96, 97, 103, 105, 111, 113, 118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151, 152, - }, { - 32, 31, 31, 30, 31, 33, 35, 37, 41, 42, 49, 49, 48, 48, 49, 49, 50, 51, 52, 54, 54, 57, 57, 60, 61, 63, 64, 64, 66, 67, 68, 68, - 32, 33, 33, 33, 35, 37, 39, 41, 43, 43, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52, 52, 54, 55, 57, 58, 58, 59, 60, 61, 62, - 40, 41, 41, 42, 43, 44, 46, 47, 48, 48, 50, 50, 49, 49, 48, 49, 49, 49, 50, 51, 51, 52, 53, 55, 56, 57, 58, 58, 59, 59, 59, 59, - 49, 47, 47, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 55, 56, 57, 57, 58, 58, 59, 59, 60, 61, 62, 63, 64, 65, 65, 66, 66, 65, 65, - 51, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62, 63, 64, 65, 67, 67, 69, 69, 71, 72, 73, 73, 74, 75, 74, 72, 71, - 57, 54, 54, 52, 51, 51, 50, 50, 52, 53, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 73, 76, 77, 79, 80, 82, 82, 83, 84, 82, 81, 79, - 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 75, 77, 78, 82, 82, 85, 86, 89, 89, 90, 91, 91, 89, 87, - 67, 63, 63, 60, 60, 59, 58, 57, 59, 60, 62, 63, 65, 66, 69, 70, 73, 74, 77, 78, 81, 83, 85, 87, 88, 92, 92, 94, 94, 96, 95, 95, - }, - }, { - { - 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48, 48, 53, 53, 58, 58, 65, 65, 71, 71, 79, 79, 82, 82, 87, - 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54, 54, 60, 60, 65, 65, 72, 72, 75, 75, 79, - 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54, 58, 58, 63, 63, 70, 70, 72, 72, 76, - 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68, 68, 73, 73, 79, 79, 81, 81, 84, - 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96, - 53, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, - 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121, 124, - 79, 75, 75, 72, 72, 71, 71, 69, 69, 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118, 125, 125, 133, 133, 136, 136, 141, - }, { - 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 49, 50, 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 64, 64, 66, - 31, 31, 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50, 50, 52, 52, 54, 54, 57, 57, 59, 59, 60, - 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, 52, 55, 55, 56, 56, 57, - 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, 61, 63, - 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68, 69, - 52, 50, 50, 48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, 64, 66, 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, - 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 
73, 76, 76, 79, 79, 82, 82, 83, 83, 84, - 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, - }, - }, { - { - 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44, 44, 48, 48, 53, 53, 57, 58, 61, 65, 67, 71, 72, 79, 79, - 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 36, 37, 39, 41, 42, 45, 45, 49, 49, 52, 54, 57, 60, 61, 65, 66, 72, 72, - 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 41, 43, 43, 46, 46, 49, 50, 52, 54, 56, 58, 60, 63, 64, 70, 70, - 36, 35, 35, 35, 34, 35, 36, 37, 38, 39, 42, 42, 47, 48, 49, 50, 51, 53, 54, 56, 56, 59, 60, 62, 63, 66, 68, 69, 73, 73, 79, 79, - 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 48, 48, 52, 54, 56, 58, 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90, - 53, 51, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, - 62, 60, 59, 58, 57, 57, 57, 56, 56, 56, 61, 61, 65, 66, 69, 71, 74, 78, 79, 83, 83, 89, 90, 94, 95, 98, 102, 103, 108, 108, 115, 115, - 73, 70, 69, 67, 66, 66, 65, 65, 64, 64, 69, 69, 73, 74, 77, 79, 81, 85, 86, 91, 91, 98, 99, 103, 105, 108, 112, 114, 119, 119, 127, 127, - }, { - 32, 31, 31, 30, 30, 32, 33, 34, 37, 37, 42, 42, 47, 49, 48, 48, 48, 49, 49, 50, 50, 52, 52, 53, 54, 55, 57, 58, 60, 60, 63, 63, - 31, 31, 31, 32, 32, 34, 36, 37, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 45, 46, 46, 48, 48, 49, 50, 51, 52, 53, 54, 55, 57, 57, - 37, 38, 38, 39, 40, 41, 43, 44, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 52, 53, 55, 55, - 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 50, 50, 52, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 57, 58, 59, 60, 60, - 49, 47, 47, 46, 45, 45, 46, 45, 45, 46, 49, 49, 53, 53, 55, 56, 57, 58, 58, 59, 59, 61, 61, 62, 62, 63, 64, 64, 65, 65, 67, 67, - 52, 50, 50, 48, 48, 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, - 56, 54, 53, 52, 51, 51, 50, 50, 49, 49, 53, 53, 55, 56, 58, 59, 61, 63, 64, 66, 66, 69, 70, 71, 72, 74, 75, 76, 77, 78, 80, 80, - 61, 58, 57, 56, 55, 54, 54, 53, 52, 53, 56, 56, 58, 59, 61, 62, 63, 66, 66, 69, 69, 72, 73, 75, 76, 78, 79, 80, 82, 83, 86, 86, - }, - }, { - { - 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38, 39, 41, 44, 44, 47, 48, 50, 53, 53, 57, 58, 61, 65, 65, 70, - 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 37, 37, 39, 41, 41, 44, 45, 46, 49, 49, 53, 54, 56, 60, 60, 64, - 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43, 43, 45, 46, 47, 50, 50, 53, 54, 56, 58, 58, 62, - 35, 35, 35, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 43, 46, 46, 47, 48, 49, 51, 51, 53, 54, 55, 57, 57, 60, 61, 63, 66, 66, 70, - 39, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, 69, 72, 72, 76, - 44, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, 60, 63, 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, - 53, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 86, 87, 89, 92, 92, 96, - 65, 63, 62, 61, 59, 59, 59, 59, 58, 58, 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, 88, 92, 92, 97, 98, 100, 105, 105, 109, - }, { - 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 48, 48, 48, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, - 31, 31, 31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 
46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 49, 50, 51, 52, 52, 54, - 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, - 45, 45, 45, 45, 44, 44, 46, 46, 46, 47, 47, 49, 49, 50, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 53, 53, 54, 54, 54, 55, 55, 57, - 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 58, 59, 59, 61, - 49, 47, 47, 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, 57, 58, 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, - 52, 50, 50, 49, 48, 48, 47, 47, 47, 47, 47, 50, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 64, 65, 66, 66, 68, 68, 69, 70, 70, 72, - 57, 55, 54, 53, 52, 52, 51, 51, 51, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68, 71, 71, 73, 73, 74, 76, 76, 78, - }, - }, { - { - 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 35, 36, 36, 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 53, 55, 58, 58, - 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 36, 37, 37, 40, 41, 41, 44, 45, 46, 49, 49, 51, 54, 54, - 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 36, 37, 37, 37, 39, 40, 40, 42, 42, 43, 45, 46, 47, 49, 50, 51, 54, 54, - 32, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36, 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 45, 45, 47, 48, 48, 51, 51, 53, 55, 55, - 36, 35, 35, 35, 35, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48, 48, 49, 50, 50, 52, 53, 54, 56, 56, 57, 59, 60, 61, 63, 63, - 44, 43, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61, 63, 63, 66, 67, 67, 71, 71, 72, 75, 75, - 47, 46, 45, 45, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, 69, 70, 71, 74, 75, 76, 79, 79, - 53, 52, 51, 51, 49, 49, 49, 49, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, - }, { - 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 37, 37, 39, 42, 42, 45, 49, 49, 48, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, - 31, 31, 31, 31, 32, 32, 33, 35, 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 48, 50, 50, - 35, 36, 37, 37, 38, 38, 38, 41, 41, 42, 45, 45, 46, 46, 46, 47, 48, 48, 47, 46, 46, 46, 45, 46, 46, 46, 47, 47, 47, 48, 49, 49, - 38, 39, 40, 40, 40, 41, 41, 43, 44, 45, 47, 47, 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 48, 48, 48, 48, 48, 49, 50, 50, - 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55, - 49, 48, 47, 47, 45, 45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57, 58, 58, 59, 59, 60, 61, 61, 61, 62, 62, - 50, 49, 48, 48, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 60, 61, 61, 61, 63, 63, 63, 65, 65, - 52, 50, 50, 50, 48, 48, 48, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66, 66, 67, 68, 68, - }, - }, { - { - 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 34, 35, 36, 36, 38, 39, 39, 41, 44, 44, 44, 47, 48, 48, 51, - 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, 37, 37, 39, 41, 41, 42, 44, 45, 45, 47, - 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 39, 39, 40, 42, 42, 42, 44, 45, 45, 48, - 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40, 41, 43, 43, 43, 45, 46, 46, 48, - 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 37, 37, 37, 39, 41, 41, 
42, 45, 46, 46, 47, 48, 48, 49, 51, 51, 51, 53, 54, 54, 56, - 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, 47, 48, 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, - 44, 43, 42, 42, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69, - 47, 46, 45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58, 60, 60, 62, 66, 66, 67, 69, 70, 70, 73, - }, { - 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 42, 43, 47, 49, 49, 48, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51, - 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 46, 46, 45, 45, 45, 45, 46, 46, 46, 47, - 33, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 41, 43, 43, 43, 44, 44, 45, 46, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, - 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, - 45, 45, 45, 45, 45, 44, 44, 45, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 51, 52, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, - 48, 47, 47, 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, - 49, 48, 47, 47, 46, 45, 45, 45, 45, 46, 46, 45, 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56, 57, 58, 58, 58, 59, 59, 59, 60, - 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 57, 59, 59, 60, 61, 61, 61, 62, - }, - }, { - { - 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, - 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, - 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, - 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, - 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, - 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, - 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, - 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63, - }, { - 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 37, 37, 37, 39, 42, 42, 42, 45, 49, 49, 49, 48, 48, 48, 48, 48, 49, 49, - 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, - 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, - 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46, - 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46, - 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, - 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, - 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, 
49, 51, 53, 53, 53, 54, 56, 56, 56, 57, 58, 58, - }, - }, { - { - 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, - 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 36, - 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, - 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, 36, 36, 36, 37, - 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, - 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, - 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 39, 40, 41, 41, 41, 43, 45, 46, 46, 46, 46, - 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49, - }, { - 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 35, 36, 37, 37, 37, 39, 41, 42, 42, 42, 44, 47, 49, 49, 49, 49, - 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 42, 42, 42, 44, 46, 47, 47, 47, 47, - 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46, - 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 38, 38, 38, 40, 42, 43, 43, 43, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, - 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, - 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, - 45, 45, 45, 45, 45, 45, 45, 44, 44, 44, 44, 45, 46, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 49, 50, 51, 52, 52, 52, 52, - 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53, - }, - }, { - { - 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, - 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, - 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, - 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, - 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, - 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 38, - }, { - 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33, 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 39, 40, 42, 42, - 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 42, 42, - 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 
36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, - 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, - 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 43, 43, 43, 43, 43, 44, 44, 44, - 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, - 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, - 38, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 47, 47, 47, 48, 48, - }, - }, { - { - 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, - 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, - }, { - 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, - 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, - 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, - 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, - 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, - 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, - 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 38, 38, 38, 38, 38, 39, - 35, 35, 36, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 39, 40, 40, 41, 41, 41, 41, 41, 41, 42, - }, - }, { - { - 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, - 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, - 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - }, { - 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, - 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, - 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, - 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, - 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, - 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, - }, - }, -}; - static const uint8_t qm_tbl_32x16[][2][512] = { { { @@ -3069,19 +1605,23 @@ const uint8_t *dav1d_qm_tbl[16][2][N_RECT_TX_SIZES]; static uint8_t qm_tbl_4x4[15][2][16]; static uint8_t qm_tbl_4x8[15][2][32]; static uint8_t qm_tbl_4x16[15][2][64]; +static uint8_t qm_tbl_8x4[15][2][32]; static uint8_t qm_tbl_8x8[15][2][64]; static uint8_t qm_tbl_8x16[15][2][128]; static uint8_t qm_tbl_8x32[15][2][256]; +static uint8_t qm_tbl_16x4[15][2][64]; +static uint8_t qm_tbl_16x8[15][2][128]; static uint8_t qm_tbl_16x16[15][2][256]; static uint8_t qm_tbl_16x32[15][2][512]; +static uint8_t qm_tbl_32x8[15][2][256]; static uint8_t qm_tbl_32x32[15][2][1024]; -static void subsample(uint8_t *const dst, const uint8_t *const src, - const int sz, const int step) +static void subsample(uint8_t *dst, const uint8_t *const src, + const int h, const int hstep, const int vstep) { - for (int y = 0; y < sz; y++) - for (int x = 0; x < sz; x++) - dst[y * sz + x] = src[y * sz * step * step + x * step]; + for (int y = 0; y < h; y += vstep) + for (int x = 0; x < 32; x += hstep) + *dst++ = src[y * 32 + x]; } static void transpose(uint8_t *const dst, const uint8_t *const src, @@ -3114,28 +1654,33 @@ COLD void dav1d_init_qm_tables(void) { // because we store coefficients transposed dav1d_qm_tbl[i][j][RTX_4X8 ] = qm_tbl_8x4[i][j]; dav1d_qm_tbl[i][j][RTX_8X4 ] = qm_tbl_4x8[i][j]; - transpose(qm_tbl_4x8[i][j], qm_tbl_8x4[i][j], 8, 4); dav1d_qm_tbl[i][j][RTX_4X16 ] = qm_tbl_16x4[i][j]; dav1d_qm_tbl[i][j][RTX_16X4 ] = qm_tbl_4x16[i][j]; - transpose(qm_tbl_4x16[i][j], qm_tbl_16x4[i][j], 16, 4); dav1d_qm_tbl[i][j][RTX_8X16 ] = qm_tbl_16x8[i][j]; dav1d_qm_tbl[i][j][RTX_16X8 ] = qm_tbl_8x16[i][j]; - transpose(qm_tbl_8x16[i][j], qm_tbl_16x8[i][j], 16, 8); dav1d_qm_tbl[i][j][RTX_8X32 ] = qm_tbl_32x8[i][j]; dav1d_qm_tbl[i][j][RTX_32X8 ] = qm_tbl_8x32[i][j]; - transpose(qm_tbl_8x32[i][j], qm_tbl_32x8[i][j], 32, 8); dav1d_qm_tbl[i][j][RTX_16X32] = qm_tbl_32x16[i][j]; dav1d_qm_tbl[i][j][RTX_32X16] = qm_tbl_16x32[i][j]; - transpose(qm_tbl_16x32[i][j], qm_tbl_32x16[i][j], 32, 16); dav1d_qm_tbl[i][j][ TX_4X4 ] = qm_tbl_4x4[i][j]; dav1d_qm_tbl[i][j][ TX_8X8 ] = qm_tbl_8x8[i][j]; dav1d_qm_tbl[i][j][ 
TX_16X16] = qm_tbl_16x16[i][j]; dav1d_qm_tbl[i][j][ TX_32X32] = qm_tbl_32x32[i][j]; - untriangle(qm_tbl_4x4[i][j], qm_tbl_4x4_t[i][j], 4); - untriangle(qm_tbl_8x8[i][j], qm_tbl_8x8_t[i][j], 8); + untriangle(qm_tbl_32x32[i][j], qm_tbl_32x32_t[i][j], 32); - subsample(qm_tbl_16x16[i][j], qm_tbl_32x32[i][j], 16, 2); + subsample(qm_tbl_4x4[i][j], &qm_tbl_32x32[i][j][32*3+3], 32, 8, 8); + subsample(qm_tbl_8x4[i][j], &qm_tbl_32x16[i][j][32*1+1], 16, 4, 4); + subsample(qm_tbl_8x8[i][j], &qm_tbl_32x32[i][j][32*1+1], 32, 4, 4); + subsample(qm_tbl_16x4[i][j], &qm_tbl_32x16[i][j][32*1+0], 16, 2, 4); + subsample(qm_tbl_16x8[i][j], &qm_tbl_32x16[i][j][32*0+0], 16, 2, 2); + subsample(qm_tbl_16x16[i][j], &qm_tbl_32x32[i][j][32*0+0], 32, 2, 2); + subsample(qm_tbl_32x8[i][j], &qm_tbl_32x16[i][j][32*0+0], 16, 1, 2); + transpose(qm_tbl_4x8[i][j], qm_tbl_8x4[i][j], 8, 4); + transpose(qm_tbl_4x16[i][j], qm_tbl_16x4[i][j], 16, 4); + transpose(qm_tbl_8x16[i][j], qm_tbl_16x8[i][j], 16, 8); + transpose(qm_tbl_8x32[i][j], qm_tbl_32x8[i][j], 32, 8); + transpose(qm_tbl_16x32[i][j], qm_tbl_32x16[i][j], 32, 16); dav1d_qm_tbl[i][j][ TX_64X64] = dav1d_qm_tbl[i][j][ TX_32X32]; dav1d_qm_tbl[i][j][RTX_64X32] = dav1d_qm_tbl[i][j][ TX_32X32]; -- cgit v1.2.3 From 8501a4b20135f93a4c3b426468e2240e872949c5 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Thu, 21 Dec 2023 16:22:57 +0100 Subject: checkasm: Make signal handling async-signal-safe --- tests/checkasm/checkasm.c | 42 ++++++++++++++++++++++++------------------ tests/checkasm/checkasm.h | 22 ++++++++++++++-------- 2 files changed, 38 insertions(+), 26 deletions(-) diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 5e26432..2e19f78 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -28,6 +28,7 @@ #include #include +#include #include #include #include @@ -36,12 +37,15 @@ #ifdef _WIN32 #include +#ifndef SIGBUS +/* non-standard, use the same value as mingw-w64 */ +#define SIGBUS 10 +#endif #ifndef ENABLE_VIRTUAL_TERMINAL_PROCESSING #define ENABLE_VIRTUAL_TERMINAL_PROCESSING 0x04 #endif #else #include -#include #include #include #ifdef HAVE_PTHREAD_NP_H @@ -139,7 +143,7 @@ static struct { int bench; int verbose; int function_listing; - int catch_signals; + volatile sig_atomic_t catch_signals; int suffix_length; int max_function_name_length; #if ARCH_X86_64 @@ -440,31 +444,30 @@ static LONG NTAPI signal_handler(EXCEPTION_POINTERS *const e) { if (!state.catch_signals) return EXCEPTION_CONTINUE_SEARCH; - const char *err; + int s; switch (e->ExceptionRecord->ExceptionCode) { case EXCEPTION_FLT_DIVIDE_BY_ZERO: case EXCEPTION_INT_DIVIDE_BY_ZERO: - err = "fatal arithmetic error"; + s = SIGFPE; break; case EXCEPTION_ILLEGAL_INSTRUCTION: case EXCEPTION_PRIV_INSTRUCTION: - err = "illegal instruction"; + s = SIGILL; break; case EXCEPTION_ACCESS_VIOLATION: case EXCEPTION_ARRAY_BOUNDS_EXCEEDED: case EXCEPTION_DATATYPE_MISALIGNMENT: case EXCEPTION_STACK_OVERFLOW: - err = "segmentation fault"; + s = SIGSEGV; break; case EXCEPTION_IN_PAGE_ERROR: - err = "bus error"; + s = SIGBUS; break; default: return EXCEPTION_CONTINUE_SEARCH; } state.catch_signals = 0; - checkasm_fail_func(err); - checkasm_load_context(); + checkasm_load_context(s); return EXCEPTION_CONTINUE_EXECUTION; /* never reached, but shuts up gcc */ } #endif @@ -472,11 +475,7 @@ static LONG NTAPI signal_handler(EXCEPTION_POINTERS *const e) { static void signal_handler(const int s) { if (state.catch_signals) { state.catch_signals = 0; - checkasm_fail_func(s == SIGFPE ? 
"fatal arithmetic error" : - s == SIGILL ? "illegal instruction" : - s == SIGBUS ? "bus error" : - "segmentation fault"); - checkasm_load_context(); + checkasm_load_context(s); } else { /* fall back to the default signal handler */ static const struct sigaction default_sa = { .sa_handler = SIG_DFL }; @@ -687,11 +686,8 @@ int main(int argc, char *argv[]) { #ifdef readtime if (state.bench) { - static int testing = 0; - checkasm_save_context(); - if (!testing) { + if (!checkasm_save_context()) { checkasm_set_signal_handler_state(1); - testing = 1; readtime(); checkasm_set_signal_handler_state(0); } else { @@ -890,6 +886,16 @@ void checkasm_set_signal_handler_state(const int enabled) { state.catch_signals = enabled; } +int checkasm_handle_signal(const int s) { + if (s) { + checkasm_fail_func(s == SIGFPE ? "fatal arithmetic error" : + s == SIGILL ? "illegal instruction" : + s == SIGBUS ? "bus error" : + "segmentation fault"); + } + return s; +} + static int check_err(const char *const file, const int line, const char *const name, const int w, const int h, int *const err) diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index 562960a..375c65f 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -39,19 +39,24 @@ * functions without unwind information. */ #include #if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) -#define checkasm_context CONTEXT -#define checkasm_save_context() RtlCaptureContext(&checkasm_context_buf) -#define checkasm_load_context() RtlRestoreContext(&checkasm_context_buf, NULL) +typedef struct { CONTEXT c; int status; } checkasm_context; +#define checkasm_save_context() \ + (checkasm_context_buf.status = 0, \ + RtlCaptureContext(&checkasm_context_buf.c), \ + checkasm_handle_signal(checkasm_context_buf.status)) +#define checkasm_load_context(s) \ + (checkasm_context_buf.status = s, \ + RtlRestoreContext(&checkasm_context_buf.c, NULL)) #else -#define checkasm_context void* -#define checkasm_save_context() do {} while (0) +typedef void* checkasm_context; +#define checkasm_save_context() 0 #define checkasm_load_context() do {} while (0) #endif #else #include -#define checkasm_context jmp_buf -#define checkasm_save_context() setjmp(checkasm_context_buf) -#define checkasm_load_context() longjmp(checkasm_context_buf, 1) +typedef jmp_buf checkasm_context; +#define checkasm_save_context() checkasm_handle_signal(setjmp(checkasm_context_buf)) +#define checkasm_load_context(s) longjmp(checkasm_context_buf, s) #endif #include "include/common/attributes.h" @@ -82,6 +87,7 @@ int checkasm_fail_func(const char *msg, ...); void checkasm_update_bench(int iterations, uint64_t cycles); void checkasm_report(const char *name, ...); void checkasm_set_signal_handler_state(int enabled); +int checkasm_handle_signal(int s); extern checkasm_context checkasm_context_buf; /* float compare utilities */ -- cgit v1.2.3 From d23e87f7aee26ddcf5f7a2e185112031477599a7 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Thu, 21 Dec 2023 21:47:09 +0100 Subject: checkasm: Prefer sigsetjmp()/siglongjmp() over SA_NODEFER Also prefer re-setting the signal handler upon intercept in combination with SA_RESETHAND over re-raising exceptions with the SIG_DFL handler. 
--- tests/checkasm/checkasm.c | 26 ++++++++++++-------------- tests/checkasm/checkasm.h | 17 +++++++++++------ 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 2e19f78..26a8560 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -36,7 +36,6 @@ #include "src/cpu.h" #ifdef _WIN32 -#include #ifndef SIGBUS /* non-standard, use the same value as mingw-w64 */ #define SIGBUS 10 @@ -472,15 +471,18 @@ static LONG NTAPI signal_handler(EXCEPTION_POINTERS *const e) { } #endif #else +static void signal_handler(int s); + +static const struct sigaction signal_handler_act = { + .sa_handler = signal_handler, + .sa_flags = SA_RESETHAND, +}; + static void signal_handler(const int s) { if (state.catch_signals) { state.catch_signals = 0; + sigaction(s, &signal_handler_act, NULL); checkasm_load_context(s); - } else { - /* fall back to the default signal handler */ - static const struct sigaction default_sa = { .sa_handler = SIG_DFL }; - sigaction(s, &default_sa, NULL); - raise(s); } } #endif @@ -671,14 +673,10 @@ int main(int argc, char *argv[]) { SetConsoleMode(con, con_mode | ENABLE_VIRTUAL_TERMINAL_PROCESSING); #endif #else - const struct sigaction sa = { - .sa_handler = signal_handler, - .sa_flags = SA_NODEFER, - }; - sigaction(SIGBUS, &sa, NULL); - sigaction(SIGFPE, &sa, NULL); - sigaction(SIGILL, &sa, NULL); - sigaction(SIGSEGV, &sa, NULL); + sigaction(SIGBUS, &signal_handler_act, NULL); + sigaction(SIGFPE, &signal_handler_act, NULL); + sigaction(SIGILL, &signal_handler_act, NULL); + sigaction(SIGSEGV, &signal_handler_act, NULL); const char *const term = getenv("TERM"); use_printf_color = term && strcmp(term, "dumb") && isatty(2); diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index 375c65f..bf69bbb 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -33,12 +33,17 @@ #include #include -#if !ARCH_X86_32 && defined(_WIN32) +#ifdef _WIN32 +#include +#if ARCH_X86_32 +#include +typedef jmp_buf checkasm_context; +#define checkasm_save_context() checkasm_handle_signal(setjmp(checkasm_context_buf)) +#define checkasm_load_context(s) longjmp(checkasm_context_buf, s) +#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) /* setjmp/longjmp on Windows on architectures using SEH (all except x86_32) * will try to use SEH to unwind the stack, which doesn't work for assembly * functions without unwind information. 
*/ -#include -#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) typedef struct { CONTEXT c; int status; } checkasm_context; #define checkasm_save_context() \ (checkasm_context_buf.status = 0, \ @@ -54,9 +59,9 @@ typedef void* checkasm_context; #endif #else #include -typedef jmp_buf checkasm_context; -#define checkasm_save_context() checkasm_handle_signal(setjmp(checkasm_context_buf)) -#define checkasm_load_context(s) longjmp(checkasm_context_buf, s) +typedef sigjmp_buf checkasm_context; +#define checkasm_save_context() checkasm_handle_signal(sigsetjmp(checkasm_context_buf, 1)) +#define checkasm_load_context(s) siglongjmp(checkasm_context_buf, s) #endif #include "include/common/attributes.h" -- cgit v1.2.3 From 655d7ec07dfefd4af3b6eb7d0d9b5adaee7be61c Mon Sep 17 00:00:00 2001 From: Matthias Dressel Date: Sun, 14 Jan 2024 06:49:31 +0100 Subject: CI: Add loongarch64 toolchain --- .gitlab-ci.yml | 2 +- package/crossfiles/loongarch64-linux.meson | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) create mode 100644 package/crossfiles/loongarch64-linux.meson diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b59cf8c..882127e 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -4,7 +4,7 @@ stages: - test .debian-amd64-common: - image: registry.videolan.org/dav1d-debian-unstable:20230512061205 + image: registry.videolan.org/dav1d-debian-unstable:20240113214804 stage: build tags: - docker diff --git a/package/crossfiles/loongarch64-linux.meson b/package/crossfiles/loongarch64-linux.meson new file mode 100644 index 0000000..04c29ac --- /dev/null +++ b/package/crossfiles/loongarch64-linux.meson @@ -0,0 +1,13 @@ +[binaries] +c = 'loongarch64-unknown-linux-gnu-gcc' +cpp = 'loongarch64-unknown-linux-gnu-c++' +ar = 'loongarch64-unknown-linux-gnu-ar' +strip = 'loongarch64-unknown-linux-gnu-strip' +pkgconfig = 'pkg-config' +exe_wrapper = 'qemu-loongarch64' + +[host_machine] +system = 'linux' +cpu_family = 'loongarch64' +cpu = 'loongarch64' +endian = 'little' -- cgit v1.2.3 From 7d225bec62cd923397c2294db0776b526b0e0a2e Mon Sep 17 00:00:00 2001 From: Matthias Dressel Date: Mon, 15 Jan 2024 14:54:46 +0100 Subject: CI: Add loongarch64 tests --- .gitlab-ci.yml | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 882127e..cfafb61 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -418,6 +418,19 @@ build-debian-wasm: matrix: - CROSSFILE: [wasm32, wasm64] +build-debian-loongarch64: + extends: .debian-amd64-common + variables: + QEMU_CPU: max-loongarch-cpu + QEMU_LD_PREFIX: /opt/cross-tools/target/ + script: + - meson setup build --buildtype release + -Dtrim_dsp=false + --werror + --cross-file package/crossfiles/loongarch64-linux.meson + - ninja -C build + - cd build && meson test -v + .test-common: stage: test @@ -697,6 +710,23 @@ test-debian-armv7-clang-5: - ninja -C build - cd build && time meson test -v +test-debian-loongarch64: + extends: + - .debian-amd64-common + - .test-common + needs: ["build-debian-loongarch64"] + variables: + QEMU_CPU: max-loongarch-cpu + QEMU_LD_PREFIX: /opt/cross-tools/target/ + script: + - meson setup build --buildtype release + -Dtestdata_tests=true + -Dlogging=false + -Dtrim_dsp=false + --cross-file package/crossfiles/loongarch64-linux.meson + - ninja -C build + - cd build && time meson test -v --timeout-multiplier 2 + .test-argon-script: &test-argon-script - meson setup build --buildtype release -Dlogging=false -- cgit v1.2.3 From 2e952f300fa29fbbfbdb9986c826bfec708834be Mon Sep 17 00:00:00 2001 
From: yuanhecai Date: Wed, 28 Jun 2023 10:52:35 +0800 Subject: Add loongarch support --- include/common/attributes.h | 2 +- meson.build | 10 ++++++++-- src/cpu.c | 2 ++ src/cpu.h | 2 ++ src/loongarch/cpu.c | 47 +++++++++++++++++++++++++++++++++++++++++++++ src/loongarch/cpu.h | 37 +++++++++++++++++++++++++++++++++++ src/meson.build | 4 ++++ tests/checkasm/checkasm.c | 3 +++ tests/checkasm/checkasm.h | 17 ++++++++++++++++ tools/dav1d_cli_parse.c | 5 +++++ 10 files changed, 126 insertions(+), 3 deletions(-) create mode 100644 src/loongarch/cpu.c create mode 100644 src/loongarch/cpu.h diff --git a/include/common/attributes.h b/include/common/attributes.h index 71c34f2..d8dac04 100644 --- a/include/common/attributes.h +++ b/include/common/attributes.h @@ -60,7 +60,7 @@ #define ALIGN_64_VAL 64 #define ALIGN_32_VAL 32 #define ALIGN_16_VAL 16 -#elif ARCH_X86_32 || ARCH_ARM || ARCH_AARCH64 || ARCH_PPC64LE +#elif ARCH_X86_32 || ARCH_ARM || ARCH_AARCH64 || ARCH_PPC64LE || ARCH_LOONGARCH /* ARM doesn't benefit from anything more than 16-byte alignment. */ #define ALIGN_64_VAL 16 #define ALIGN_32_VAL 16 diff --git a/meson.build b/meson.build index 2b88f3c..0892a4f 100644 --- a/meson.build +++ b/meson.build @@ -66,7 +66,8 @@ is_asm_enabled = (get_option('enable_asm') == true and (host_machine.cpu_family() == 'x86_64' and cc.get_define('__ILP32__').strip() == '') or host_machine.cpu_family() == 'aarch64' or host_machine.cpu_family().startswith('arm') or - host_machine.cpu() == 'ppc64le')) + host_machine.cpu() == 'ppc64le' or + host_machine.cpu_family().startswith('loongarch'))) cdata.set10('HAVE_ASM', is_asm_enabled) if is_asm_enabled and get_option('b_sanitize') == 'memory' @@ -232,7 +233,8 @@ endif if (host_machine.cpu_family() == 'aarch64' or host_machine.cpu_family().startswith('arm') or - host_machine.cpu() == 'ppc64le') + host_machine.cpu() == 'ppc64le' or + host_machine.cpu_family().startswith('loongarch')) if cc.has_function('getauxval', prefix : '#include ', args : test_args) cdata.set('HAVE_GETAUXVAL', 1) endif @@ -379,6 +381,10 @@ endif cdata.set10('ARCH_PPC64LE', host_machine.cpu() == 'ppc64le') +cdata.set10('ARCH_LOONGARCH', host_machine.cpu_family().startswith('loongarch')) +cdata.set10('ARCH_LOONGARCH32', host_machine.cpu_family() == 'loongarch32') +cdata.set10('ARCH_LOONGARCH64', host_machine.cpu_family() == 'loongarch64') + # meson's cc.symbols_have_underscore_prefix() is unfortunately unrelieably # when additional flags like '-fprofile-instr-generate' are passed via CFLAGS # see following meson issue https://github.com/mesonbuild/meson/issues/5482 diff --git a/src/cpu.c b/src/cpu.c index d24148c..5d6fc49 100644 --- a/src/cpu.c +++ b/src/cpu.c @@ -56,6 +56,8 @@ COLD void dav1d_init_cpu(void) { // memory sanitizer is inherently incompatible with asm #if ARCH_AARCH64 || ARCH_ARM dav1d_cpu_flags = dav1d_get_cpu_flags_arm(); +#elif ARCH_LOONGARCH + dav1d_cpu_flags = dav1d_get_cpu_flags_loongarch(); #elif ARCH_PPC64LE dav1d_cpu_flags = dav1d_get_cpu_flags_ppc(); #elif ARCH_X86 diff --git a/src/cpu.h b/src/cpu.h index 8f70fef..d42530e 100644 --- a/src/cpu.h +++ b/src/cpu.h @@ -37,6 +37,8 @@ #if ARCH_AARCH64 || ARCH_ARM #include "src/arm/cpu.h" +#elif ARCH_LOONGARCH +#include "src/loongarch/cpu.h" #elif ARCH_PPC64LE #include "src/ppc/cpu.h" #elif ARCH_X86 diff --git a/src/loongarch/cpu.c b/src/loongarch/cpu.c new file mode 100644 index 0000000..a79ade5 --- /dev/null +++ b/src/loongarch/cpu.c @@ -0,0 +1,47 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "common/attributes.h" +#include "src/loongarch/cpu.h" + +#if defined(HAVE_GETAUXVAL) +#include + +#define LA_HWCAP_LSX ( 1 << 4 ) +#define LA_HWCAP_LASX ( 1 << 5 ) +#endif + +COLD unsigned dav1d_get_cpu_flags_loongarch(void) { + unsigned flags = 0; +#if defined(HAVE_GETAUXVAL) + unsigned long hw_cap = getauxval(AT_HWCAP); + flags |= (hw_cap & LA_HWCAP_LSX) ? DAV1D_LOONGARCH_CPU_FLAG_LSX : 0; + flags |= (hw_cap & LA_HWCAP_LASX) ? DAV1D_LOONGARCH_CPU_FLAG_LASX : 0; +#endif + + return flags; +} diff --git a/src/loongarch/cpu.h b/src/loongarch/cpu.h new file mode 100644 index 0000000..d00ff67 --- /dev/null +++ b/src/loongarch/cpu.h @@ -0,0 +1,37 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef DAV1D_SRC_LOONGARCH_CPU_H +#define DAV1D_SRC_LOONGARCH_CPU_H + +enum CpuFlags { + DAV1D_LOONGARCH_CPU_FLAG_LSX = 1 << 0, + DAV1D_LOONGARCH_CPU_FLAG_LASX = 1 << 1, +}; + +unsigned dav1d_get_cpu_flags_loongarch(void); + +#endif /* DAV1D_SRC_LOONGARCH_CPU_H */ diff --git a/src/meson.build b/src/meson.build index 3a34e76..ac2a25b 100644 --- a/src/meson.build +++ b/src/meson.build @@ -235,6 +235,10 @@ if is_asm_enabled 'ppc/cdef_tmpl.c', 'ppc/looprestoration_tmpl.c', ) + elif host_machine.cpu_family().startswith('loongarch') + libdav1d_sources += files( + 'loongarch/cpu.c', + ) endif endif diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 26a8560..d4d51bb 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -104,6 +104,9 @@ static const struct { { "NEON", "neon", DAV1D_ARM_CPU_FLAG_NEON }, #elif ARCH_PPC64LE { "VSX", "vsx", DAV1D_PPC_CPU_FLAG_VSX }, +#elif ARCH_LOONGARCH + { "LSX", "lsx", DAV1D_LOONGARCH_CPU_FLAG_LSX }, + { "LASX", "lasx", DAV1D_LOONGARCH_CPU_FLAG_LASX }, #endif { 0 } }; diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index bf69bbb..e323319 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -197,6 +197,23 @@ static inline uint64_t readtime(void) { return (((uint64_t)tbu) << 32) | (uint64_t)tbl; } #define readtime readtime +#elif ARCH_LOONGARCH +static inline uint64_t readtime(void) { +#if ARCH_LOONGARCH64 + uint64_t a, id; + __asm__ __volatile__("rdtime.d %0, %1" + : "=r"(a), "=r"(id) + :: ); + return a; +#else + uint32_t a, id; + __asm__ __volatile__("rdtimel.w %0, %1" + : "=r"(a), "=r"(id) + :: ); + return (uint64_t)a; +#endif +} +#define readtime readtime #endif /* Verifies that clobbered callee-saved registers diff --git a/tools/dav1d_cli_parse.c b/tools/dav1d_cli_parse.c index 4d747c0..5d22e26 100644 --- a/tools/dav1d_cli_parse.c +++ b/tools/dav1d_cli_parse.c @@ -101,6 +101,8 @@ static const struct option long_opts[] = { #if ARCH_AARCH64 || ARCH_ARM #define ALLOWED_CPU_MASKS " or 'neon'" +#elif ARCH_LOONGARCH +#define ALLOWED_CPU_MASKS ", 'lsx' or 'lasx'" #elif ARCH_PPC64LE #define ALLOWED_CPU_MASKS " or 'vsx'" #elif ARCH_X86 @@ -216,6 +218,9 @@ enum CpuMask { static const EnumParseTable cpu_mask_tbl[] = { #if ARCH_AARCH64 || ARCH_ARM { "neon", DAV1D_ARM_CPU_FLAG_NEON }, +#elif ARCH_LOONGARCH + { "lsx", DAV1D_LOONGARCH_CPU_FLAG_LSX }, + { "lasx", DAV1D_LOONGARCH_CPU_FLAG_LASX }, #elif ARCH_PPC64LE { "vsx", DAV1D_PPC_CPU_FLAG_VSX }, #elif ARCH_X86 -- cgit v1.2.3 From 4fb71a1a014f825ccf6f46ac4c003278db82db2b Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Thu, 30 Nov 2023 20:30:59 +0800 Subject: loongarch: add loongson_asm.S --- src/loongarch/loongson_asm.S | 776 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 776 insertions(+) create mode 100644 src/loongarch/loongson_asm.S diff --git a/src/loongarch/loongson_asm.S b/src/loongarch/loongson_asm.S new file mode 100644 index 0000000..a22072b --- /dev/null +++ b/src/loongarch/loongson_asm.S @@ -0,0 +1,776 @@ +/********************************************************************* + * Copyright (c) 2022 Loongson Technology Corporation Limited + * Contributed by Gu Xiwei(guxiwei-hf@loongson.cn) + * Shiyou Yin(yinshiyou-hf@loongson.cn) + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + *********************************************************************/ + +/* + * This file is a LoongArch assembly helper file and available under ISC + * license. It provides a large number of macros and alias to simplify + * writing assembly code, especially for LSX and LASX optimizations. + * + * Any one can modify it or add new features for his/her own purposes. + * Contributing a patch will be appreciated as it might be useful for + * others as well. Send patches to loongson contributor mentioned above. + * + * MAJOR version: Usage changes, incompatible with previous version. + * MINOR version: Add new macros/functions, or bug fixes. + * MICRO version: Comment changes or implementation changes. + */ + +#define LML_VERSION_MAJOR 0 +#define LML_VERSION_MINOR 4 +#define LML_VERSION_MICRO 0 + +#define DEFAULT_ALIGN 5 + +/* Set prefix as needed. */ +#ifndef PRIVATE_PREFIX +#define PRIVATE_PREFIX dav1d_ +#endif + +#define PASTE(a,b) a ## b +#define CONCAT(a,b) PASTE(a,b) + +#ifdef PREFIX +#define ASM_PREF CONCAT(_,PRIVATE_PREFIX) +#else +#define ASM_PREF PRIVATE_PREFIX +#endif + +.macro function name, align=DEFAULT_ALIGN +.macro endfunc + jirl $r0, $r1, 0x0 + .size ASM_PREF\name, . - ASM_PREF\name + .purgem endfunc +.endm +.text ; +.align \align ; +.globl ASM_PREF\name ; +.type ASM_PREF\name, @function ; +ASM_PREF\name: ; +.endm + +.macro const name, align=DEFAULT_ALIGN + .macro endconst + .size \name, . 
- \name + .purgem endconst + .endm +.section .rodata +.align \align +\name: +.endm + +/* + *============================================================================ + * LoongArch register alias + *============================================================================ + */ + +#define a0 $a0 +#define a1 $a1 +#define a2 $a2 +#define a3 $a3 +#define a4 $a4 +#define a5 $a5 +#define a6 $a6 +#define a7 $a7 + +#define t0 $t0 +#define t1 $t1 +#define t2 $t2 +#define t3 $t3 +#define t4 $t4 +#define t5 $t5 +#define t6 $t6 +#define t7 $t7 +#define t8 $t8 + +#define s0 $s0 +#define s1 $s1 +#define s2 $s2 +#define s3 $s3 +#define s4 $s4 +#define s5 $s5 +#define s6 $s6 +#define s7 $s7 +#define s8 $s8 + +#define zero $zero +#define sp $sp +#define ra $ra + +#define fa0 $fa0 +#define fa1 $fa1 +#define fa2 $fa2 +#define fa3 $fa3 +#define fa4 $fa4 +#define fa5 $fa5 +#define fa6 $fa6 +#define fa7 $fa7 +#define ft0 $ft0 +#define ft1 $ft1 +#define ft2 $ft2 +#define ft3 $ft3 +#define ft4 $ft4 +#define ft5 $ft5 +#define ft6 $ft6 +#define ft7 $ft7 +#define ft8 $ft8 +#define ft9 $ft9 +#define ft10 $ft10 +#define ft11 $ft11 +#define ft12 $ft12 +#define ft13 $ft13 +#define ft14 $ft14 +#define ft15 $ft15 +#define fs0 $fs0 +#define fs1 $fs1 +#define fs2 $fs2 +#define fs3 $fs3 +#define fs4 $fs4 +#define fs5 $fs5 +#define fs6 $fs6 +#define fs7 $fs7 + +#define f0 $f0 +#define f1 $f1 +#define f2 $f2 +#define f3 $f3 +#define f4 $f4 +#define f5 $f5 +#define f6 $f6 +#define f7 $f7 +#define f8 $f8 +#define f9 $f9 +#define f10 $f10 +#define f11 $f11 +#define f12 $f12 +#define f13 $f13 +#define f14 $f14 +#define f15 $f15 +#define f16 $f16 +#define f17 $f17 +#define f18 $f18 +#define f19 $f19 +#define f20 $f20 +#define f21 $f21 +#define f22 $f22 +#define f23 $f23 +#define f24 $f24 +#define f25 $f25 +#define f26 $f26 +#define f27 $f27 +#define f28 $f28 +#define f29 $f29 +#define f30 $f30 +#define f31 $f31 + +#define vr0 $vr0 +#define vr1 $vr1 +#define vr2 $vr2 +#define vr3 $vr3 +#define vr4 $vr4 +#define vr5 $vr5 +#define vr6 $vr6 +#define vr7 $vr7 +#define vr8 $vr8 +#define vr9 $vr9 +#define vr10 $vr10 +#define vr11 $vr11 +#define vr12 $vr12 +#define vr13 $vr13 +#define vr14 $vr14 +#define vr15 $vr15 +#define vr16 $vr16 +#define vr17 $vr17 +#define vr18 $vr18 +#define vr19 $vr19 +#define vr20 $vr20 +#define vr21 $vr21 +#define vr22 $vr22 +#define vr23 $vr23 +#define vr24 $vr24 +#define vr25 $vr25 +#define vr26 $vr26 +#define vr27 $vr27 +#define vr28 $vr28 +#define vr29 $vr29 +#define vr30 $vr30 +#define vr31 $vr31 + +#define xr0 $xr0 +#define xr1 $xr1 +#define xr2 $xr2 +#define xr3 $xr3 +#define xr4 $xr4 +#define xr5 $xr5 +#define xr6 $xr6 +#define xr7 $xr7 +#define xr8 $xr8 +#define xr9 $xr9 +#define xr10 $xr10 +#define xr11 $xr11 +#define xr12 $xr12 +#define xr13 $xr13 +#define xr14 $xr14 +#define xr15 $xr15 +#define xr16 $xr16 +#define xr17 $xr17 +#define xr18 $xr18 +#define xr19 $xr19 +#define xr20 $xr20 +#define xr21 $xr21 +#define xr22 $xr22 +#define xr23 $xr23 +#define xr24 $xr24 +#define xr25 $xr25 +#define xr26 $xr26 +#define xr27 $xr27 +#define xr28 $xr28 +#define xr29 $xr29 +#define xr30 $xr30 +#define xr31 $xr31 + +/* + *============================================================================ + * LSX/LASX synthesize instructions + *============================================================================ + */ + +/* + * Description : Dot product of byte vector elements + * Arguments : Inputs - vj, vk + * Outputs - vd + * Return Type - halfword + */ +.macro vdp2.h.bu vd, vj, vk + vmulwev.h.bu \vd, \vj, 
\vk + vmaddwod.h.bu \vd, \vj, \vk +.endm + +.macro vdp2.h.bu.b vd, vj, vk + vmulwev.h.bu.b \vd, \vj, \vk + vmaddwod.h.bu.b \vd, \vj, \vk +.endm + +.macro vdp2.w.h vd, vj, vk + vmulwev.w.h \vd, \vj, \vk + vmaddwod.w.h \vd, \vj, \vk +.endm + +.macro xvdp2.h.bu xd, xj, xk + xvmulwev.h.bu \xd, \xj, \xk + xvmaddwod.h.bu \xd, \xj, \xk +.endm + +.macro xvdp2.h.bu.b xd, xj, xk + xvmulwev.h.bu.b \xd, \xj, \xk + xvmaddwod.h.bu.b \xd, \xj, \xk +.endm + +.macro xvdp2.w.h xd, xj, xk + xvmulwev.w.h \xd, \xj, \xk + xvmaddwod.w.h \xd, \xj, \xk +.endm + +/* + * Description : Dot product & addition of halfword vector elements + * Arguments : Inputs - vj, vk + * Outputs - vd + * Return Type - twice size of input + */ +.macro vdp2add.h.bu vd, vj, vk + vmaddwev.h.bu \vd, \vj, \vk + vmaddwod.h.bu \vd, \vj, \vk +.endm + +.macro vdp2add.h.bu.b vd, vj, vk + vmaddwev.h.bu.b \vd, \vj, \vk + vmaddwod.h.bu.b \vd, \vj, \vk +.endm + +.macro vdp2add.w.h vd, vj, vk + vmaddwev.w.h \vd, \vj, \vk + vmaddwod.w.h \vd, \vj, \vk +.endm + +.macro xvdp2add.h.bu.b xd, xj, xk + xvmaddwev.h.bu.b \xd, \xj, \xk + xvmaddwod.h.bu.b \xd, \xj, \xk +.endm + +.macro xvdp2add.w.h xd, xj, xk + xvmaddwev.w.h \xd, \xj, \xk + xvmaddwod.w.h \xd, \xj, \xk +.endm + +/* + * Description : Range element vj[i] to vk[i] ~ vj[i] + * clip: vj > vk ? vj : vk && vj < va ? vj : va + */ +.macro vclip.h vd, vj, vk, va + vmax.h \vd, \vj, \vk + vmin.h \vd, \vd, \va +.endm + +.macro vclip.w vd, vj, vk, va + vmax.w \vd, \vj, \vk + vmin.w \vd, \vd, \va +.endm + +.macro xvclip.h xd, xj, xk, xa + xvmax.h \xd, \xj, \xk + xvmin.h \xd, \xd, \xa +.endm + +.macro xvclip.w xd, xj, xk, xa + xvmax.w \xd, \xj, \xk + xvmin.w \xd, \xd, \xa +.endm + +/* + * Description : Range element vj[i] to 0 ~ 255 + * clip255: vj < 255 ? vj : 255 && vj > 0 ? vj : 0 + */ +.macro vclip255.h vd, vj + vmaxi.h \vd, \vj, 0 + vsat.hu \vd, \vd, 7 +.endm + +.macro vclip255.w vd, vj + vmaxi.w \vd, \vj, 0 + vsat.wu \vd, \vd, 7 +.endm + +.macro xvclip255.h xd, xj + xvmaxi.h \xd, \xj, 0 + xvsat.hu \xd, \xd, 7 +.endm + +.macro xvclip255.w xd, xj + xvmaxi.w \xd, \xj, 0 + xvsat.wu \xd, \xd, 7 +.endm + +/* + * Description : Store elements of vector + * vd : Data vector to be stroed + * rk : Address of data storage + * ra : Offset of address + * si : Index of data in vd + */ +.macro vstelmx.b vd, rk, ra, si + add.d \rk, \rk, \ra + vstelm.b \vd, \rk, 0, \si +.endm + +.macro vstelmx.h vd, rk, ra, si + add.d \rk, \rk, \ra + vstelm.h \vd, \rk, 0, \si +.endm + +.macro vstelmx.w vd, rk, ra, si + add.d \rk, \rk, \ra + vstelm.w \vd, \rk, 0, \si +.endm + +.macro vstelmx.d vd, rk, ra, si + add.d \rk, \rk, \ra + vstelm.d \vd, \rk, 0, \si +.endm + +.macro vmov xd, xj + vor.v \xd, \xj, \xj +.endm + +.macro xmov xd, xj + xvor.v \xd, \xj, \xj +.endm + +.macro xvstelmx.d xd, rk, ra, si + add.d \rk, \rk, \ra + xvstelm.d \xd, \rk, 0, \si +.endm + +/* + *============================================================================ + * LSX/LASX custom macros + *============================================================================ + */ + +/* + * Load 4 float, double, V128, v256 elements with stride. 
+ */ +.macro FLDS_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3 + fld.s \out0, \src, 0 + fldx.s \out1, \src, \stride + fldx.s \out2, \src, \stride2 + fldx.s \out3, \src, \stride3 +.endm + +.macro FLDD_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3 + fld.d \out0, \src, 0 + fldx.d \out1, \src, \stride + fldx.d \out2, \src, \stride2 + fldx.d \out3, \src, \stride3 +.endm + +.macro LSX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3 + vld \out0, \src, 0 + vldx \out1, \src, \stride + vldx \out2, \src, \stride2 + vldx \out3, \src, \stride3 +.endm + +.macro LASX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3 + xvld \out0, \src, 0 + xvldx \out1, \src, \stride + xvldx \out2, \src, \stride2 + xvldx \out3, \src, \stride3 +.endm + +/* + * Description : Transpose 4x4 block with half-word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + */ +.macro LSX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \ + tmp0, tmp1 + vilvl.h \tmp0, \in1, \in0 + vilvl.h \tmp1, \in3, \in2 + vilvl.w \out0, \tmp1, \tmp0 + vilvh.w \out2, \tmp1, \tmp0 + vilvh.d \out1, \out0, \out0 + vilvh.d \out3, \out0, \out2 +.endm + +/* + * Description : Transpose 4x4 block with word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + * Details : + * Example : + * 1, 2, 3, 4 1, 5, 9,13 + * 5, 6, 7, 8 to 2, 6,10,14 + * 9,10,11,12 =====> 3, 7,11,15 + * 13,14,15,16 4, 8,12,16 + */ +.macro LSX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \ + tmp0, tmp1 + + vilvl.w \tmp0, \in1, \in0 + vilvh.w \out1, \in1, \in0 + vilvl.w \tmp1, \in3, \in2 + vilvh.w \out3, \in3, \in2 + + vilvl.d \out0, \tmp1, \tmp0 + vilvl.d \out2, \out3, \out1 + vilvh.d \out3, \out3, \out1 + vilvh.d \out1, \tmp1, \tmp0 +.endm + +/* + * Description : Transpose 8x8 block with half-word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + */ +.macro LSX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7, tmp0, tmp1, tmp2, \ + tmp3, tmp4, tmp5, tmp6, tmp7 + vilvl.h \tmp0, \in6, \in4 + vilvl.h \tmp1, \in7, \in5 + vilvl.h \tmp2, \in2, \in0 + vilvl.h \tmp3, \in3, \in1 + + vilvl.h \tmp4, \tmp1, \tmp0 + vilvh.h \tmp5, \tmp1, \tmp0 + vilvl.h \tmp6, \tmp3, \tmp2 + vilvh.h \tmp7, \tmp3, \tmp2 + + vilvh.h \tmp0, \in6, \in4 + vilvh.h \tmp1, \in7, \in5 + vilvh.h \tmp2, \in2, \in0 + vilvh.h \tmp3, \in3, \in1 + + vpickev.d \out0, \tmp4, \tmp6 + vpickod.d \out1, \tmp4, \tmp6 + vpickev.d \out2, \tmp5, \tmp7 + vpickod.d \out3, \tmp5, \tmp7 + + vilvl.h \tmp4, \tmp1, \tmp0 + vilvh.h \tmp5, \tmp1, \tmp0 + vilvl.h \tmp6, \tmp3, \tmp2 + vilvh.h \tmp7, \tmp3, \tmp2 + + vpickev.d \out4, \tmp4, \tmp6 + vpickod.d \out5, \tmp4, \tmp6 + vpickev.d \out6, \tmp5, \tmp7 + vpickod.d \out7, \tmp5, \tmp7 +.endm + +/* + * Description : Transpose 16x8 block with byte elements in vectors + * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + */ +.macro LASX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7, \ + in8, in9, in10, in11, in12, in13, in14, in15, \ + out0, out1, out2, out3, out4, out5, out6, out7,\ + tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7 + xvilvl.b \tmp0, \in2, \in0 + xvilvl.b \tmp1, \in3, \in1 + xvilvl.b \tmp2, \in6, \in4 + xvilvl.b \tmp3, \in7, \in5 + xvilvl.b \tmp4, \in10, \in8 + xvilvl.b 
\tmp5, \in11, \in9 + xvilvl.b \tmp6, \in14, \in12 + xvilvl.b \tmp7, \in15, \in13 + xvilvl.b \out0, \tmp1, \tmp0 + xvilvh.b \out1, \tmp1, \tmp0 + xvilvl.b \out2, \tmp3, \tmp2 + xvilvh.b \out3, \tmp3, \tmp2 + xvilvl.b \out4, \tmp5, \tmp4 + xvilvh.b \out5, \tmp5, \tmp4 + xvilvl.b \out6, \tmp7, \tmp6 + xvilvh.b \out7, \tmp7, \tmp6 + xvilvl.w \tmp0, \out2, \out0 + xvilvh.w \tmp2, \out2, \out0 + xvilvl.w \tmp4, \out3, \out1 + xvilvh.w \tmp6, \out3, \out1 + xvilvl.w \tmp1, \out6, \out4 + xvilvh.w \tmp3, \out6, \out4 + xvilvl.w \tmp5, \out7, \out5 + xvilvh.w \tmp7, \out7, \out5 + xvilvl.d \out0, \tmp1, \tmp0 + xvilvh.d \out1, \tmp1, \tmp0 + xvilvl.d \out2, \tmp3, \tmp2 + xvilvh.d \out3, \tmp3, \tmp2 + xvilvl.d \out4, \tmp5, \tmp4 + xvilvh.d \out5, \tmp5, \tmp4 + xvilvl.d \out6, \tmp7, \tmp6 + xvilvh.d \out7, \tmp7, \tmp6 +.endm + +/* + * Description : Transpose 4x4 block with half-word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + */ +.macro LASX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \ + tmp0, tmp1 + xvilvl.h \tmp0, \in1, \in0 + xvilvl.h \tmp1, \in3, \in2 + xvilvl.w \out0, \tmp1, \tmp0 + xvilvh.w \out2, \tmp1, \tmp0 + xvilvh.d \out1, \out0, \out0 + xvilvh.d \out3, \out0, \out2 +.endm + +/* + * Description : Transpose 4x8 block with half-word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + */ +.macro LASX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, \ + tmp0, tmp1 + xvilvl.h \tmp0, \in2, \in0 + xvilvl.h \tmp1, \in3, \in1 + xvilvl.h \out2, \tmp1, \tmp0 + xvilvh.h \out3, \tmp1, \tmp0 + + xvilvl.d \out0, \out2, \out2 + xvilvh.d \out1, \out2, \out2 + xvilvl.d \out2, \out3, \out3 + xvilvh.d \out3, \out3, \out3 +.endm + +/* + * Description : Transpose 8x8 block with half-word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + */ +.macro LASX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7, \ + tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7 + xvilvl.h \tmp0, \in6, \in4 + xvilvl.h \tmp1, \in7, \in5 + xvilvl.h \tmp2, \in2, \in0 + xvilvl.h \tmp3, \in3, \in1 + + xvilvl.h \tmp4, \tmp1, \tmp0 + xvilvh.h \tmp5, \tmp1, \tmp0 + xvilvl.h \tmp6, \tmp3, \tmp2 + xvilvh.h \tmp7, \tmp3, \tmp2 + + xvilvh.h \tmp0, \in6, \in4 + xvilvh.h \tmp1, \in7, \in5 + xvilvh.h \tmp2, \in2, \in0 + xvilvh.h \tmp3, \in3, \in1 + + xvpickev.d \out0, \tmp4, \tmp6 + xvpickod.d \out1, \tmp4, \tmp6 + xvpickev.d \out2, \tmp5, \tmp7 + xvpickod.d \out3, \tmp5, \tmp7 + + xvilvl.h \tmp4, \tmp1, \tmp0 + xvilvh.h \tmp5, \tmp1, \tmp0 + xvilvl.h \tmp6, \tmp3, \tmp2 + xvilvh.h \tmp7, \tmp3, \tmp2 + + xvpickev.d \out4, \tmp4, \tmp6 + xvpickod.d \out5, \tmp4, \tmp6 + xvpickev.d \out6, \tmp5, \tmp7 + xvpickod.d \out7, \tmp5, \tmp7 +.endm + +/* + * Description : Transpose 2x4x4 block with half-word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + */ +.macro LASX_TRANSPOSE2x4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \ + tmp0, tmp1, tmp2 + xvilvh.h \tmp1, \in0, \in1 + xvilvl.h \out1, \in0, \in1 + xvilvh.h \tmp0, \in2, \in3 + xvilvl.h \out3, \in2, \in3 + + xvilvh.w \tmp2, \out3, \out1 + xvilvl.w \out3, \out3, \out1 + + xvilvl.w \out2, \tmp0, \tmp1 + xvilvh.w \tmp1, \tmp0, \tmp1 + + xvilvh.d \out0, \out2, \out3 + xvilvl.d \out2, \out2, \out3 + xvilvh.d \out1, \tmp1, \tmp2 + xvilvl.d \out3, \tmp1, \tmp2 
+.endm + +/* + * Description : Transpose 4x4 block with word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + * Details : + * Example : + * 1, 2, 3, 4, 1, 2, 3, 4 1,5, 9,13, 1,5, 9,13 + * 5, 6, 7, 8, 5, 6, 7, 8 to 2,6,10,14, 2,6,10,14 + * 9,10,11,12, 9,10,11,12 =====> 3,7,11,15, 3,7,11,15 + * 13,14,15,16, 13,14,15,16 4,8,12,16, 4,8,12,16 + */ +.macro LASX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \ + tmp0, tmp1 + + xvilvl.w \tmp0, \in1, \in0 + xvilvh.w \out1, \in1, \in0 + xvilvl.w \tmp1, \in3, \in2 + xvilvh.w \out3, \in3, \in2 + + xvilvl.d \out0, \tmp1, \tmp0 + xvilvl.d \out2, \out3, \out1 + xvilvh.d \out3, \out3, \out1 + xvilvh.d \out1, \tmp1, \tmp0 +.endm + +/* + * Description : Transpose 8x8 block with word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + * Outputs - out0, out1, out2, out3, out4, out5, out6, + * _out7 + * Example : LASX_TRANSPOSE8x8_W + * in0 : 1,2,3,4,5,6,7,8 + * in1 : 2,2,3,4,5,6,7,8 + * in2 : 3,2,3,4,5,6,7,8 + * in3 : 4,2,3,4,5,6,7,8 + * in4 : 5,2,3,4,5,6,7,8 + * in5 : 6,2,3,4,5,6,7,8 + * in6 : 7,2,3,4,5,6,7,8 + * in7 : 8,2,3,4,5,6,7,8 + * + * out0 : 1,2,3,4,5,6,7,8 + * out1 : 2,2,2,2,2,2,2,2 + * out2 : 3,3,3,3,3,3,3,3 + * out3 : 4,4,4,4,4,4,4,4 + * out4 : 5,5,5,5,5,5,5,5 + * out5 : 6,6,6,6,6,6,6,6 + * out6 : 7,7,7,7,7,7,7,7 + * out7 : 8,8,8,8,8,8,8,8 + */ +.macro LASX_TRANSPOSE8x8_W in0, in1, in2, in3, in4, in5, in6, in7,\ + out0, out1, out2, out3, out4, out5, out6, out7,\ + tmp0, tmp1, tmp2, tmp3 + xvilvl.w \tmp0, \in2, \in0 + xvilvl.w \tmp1, \in3, \in1 + xvilvh.w \tmp2, \in2, \in0 + xvilvh.w \tmp3, \in3, \in1 + xvilvl.w \out0, \tmp1, \tmp0 + xvilvh.w \out1, \tmp1, \tmp0 + xvilvl.w \out2, \tmp3, \tmp2 + xvilvh.w \out3, \tmp3, \tmp2 + + xvilvl.w \tmp0, \in6, \in4 + xvilvl.w \tmp1, \in7, \in5 + xvilvh.w \tmp2, \in6, \in4 + xvilvh.w \tmp3, \in7, \in5 + xvilvl.w \out4, \tmp1, \tmp0 + xvilvh.w \out5, \tmp1, \tmp0 + xvilvl.w \out6, \tmp3, \tmp2 + xvilvh.w \out7, \tmp3, \tmp2 + + xmov \tmp0, \out0 + xmov \tmp1, \out1 + xmov \tmp2, \out2 + xmov \tmp3, \out3 + xvpermi.q \out0, \out4, 0x02 + xvpermi.q \out1, \out5, 0x02 + xvpermi.q \out2, \out6, 0x02 + xvpermi.q \out3, \out7, 0x02 + xvpermi.q \out4, \tmp0, 0x31 + xvpermi.q \out5, \tmp1, 0x31 + xvpermi.q \out6, \tmp2, 0x31 + xvpermi.q \out7, \tmp3, 0x31 +.endm + +/* + * Description : Transpose 4x4 block with double-word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + * Example : LASX_TRANSPOSE4x4_D + * in0 : 1,2,3,4 + * in1 : 1,2,3,4 + * in2 : 1,2,3,4 + * in3 : 1,2,3,4 + * + * out0 : 1,1,1,1 + * out1 : 2,2,2,2 + * out2 : 3,3,3,3 + * out3 : 4,4,4,4 + */ +.macro LASX_TRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \ + tmp0, tmp1 + xvilvl.d \tmp0, \in1, \in0 + xvilvh.d \out1, \in1, \in0 + xvilvh.d \tmp1, \in3, \in2 + xvilvl.d \out2, \in3, \in2 + + xvor.v \out0, \tmp0, \tmp0 + xvor.v \out3, \tmp1, \tmp1 + + xvpermi.q \out0, \out2, 0x02 + xvpermi.q \out2, \tmp0, 0x31 + xvpermi.q \out3, \out1, 0x31 + xvpermi.q \out1, \tmp1, 0x02 +.endm -- cgit v1.2.3 From a23a1e7f814212a809601b55dc22d5edb428e3a5 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Thu, 30 Nov 2023 20:40:38 +0800 Subject: loongarch: Improve the performance of warp8x8, warp8x8t functions Relative speedup over C code: warp_8x8_8bpc_c: 81.3 ( 1.00x) warp_8x8_8bpc_lsx: 27.1 ( 3.00x) warp_8x8_8bpc_lasx: 17.9 ( 4.54x) warp_8x8t_8bpc_c: 71.7 ( 1.00x) warp_8x8t_8bpc_lsx: 26.6 ( 2.69x) warp_8x8t_8bpc_lasx: 
17.7 ( 4.04x) --- src/loongarch/mc.S | 931 +++++++++++++++++++++++++++++++++++++++++++++++++++++ src/loongarch/mc.h | 58 ++++ src/mc_tmpl.c | 4 + src/meson.build | 5 + 4 files changed, 998 insertions(+) create mode 100644 src/loongarch/mc.S create mode 100644 src/loongarch/mc.h diff --git a/src/loongarch/mc.S b/src/loongarch/mc.S new file mode 100644 index 0000000..6b51599 --- /dev/null +++ b/src/loongarch/mc.S @@ -0,0 +1,931 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * Copyright © 2023, Loongson Technology Corporation Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/loongarch/loongson_asm.S" + +/* +static void warp_affine_8x8_c(pixel *dst, const ptrdiff_t dst_stride, + const pixel *src, const ptrdiff_t src_stride, + const int16_t *const abcd, int mx, int my + HIGHBD_DECL_SUFFIX) +*/ +.macro FILTER_WARP_RND_P_LSX in0, in1, in2, in3, out0, out1, out2, out3 + vbsrl.v vr2, \in0, \in1 + vbsrl.v vr20, \in0, \in2 + addi.w t4, \in3, 512 + srai.w t4, t4, 10 + addi.w t4, t4, 64 + slli.w t4, t4, 3 + vldx vr1, t5, t4 + add.w t3, t3, t0 // tmx += abcd[0] + + addi.w t4, t3, 512 + srai.w t4, t4, 10 + addi.w t4, t4, 64 + slli.w t4, t4, 3 + vldx vr29, t5, t4 + add.w t3, t3, t0 // tmx += abcd[0] + + vilvl.d vr2, vr20, vr2 + vilvl.d vr1, vr29, vr1 + vmulwev.h.bu.b vr3, vr2, vr1 + vmulwod.h.bu.b vr20, vr2, vr1 + vilvl.d vr2, vr20, vr3 + vhaddw.w.h vr2, vr2, vr2 + vhaddw.d.w vr2, vr2, vr2 + vhaddw.q.d vr2, vr2, vr2 + vilvh.d vr3, vr20, vr3 + vhaddw.w.h vr3, vr3, vr3 + vhaddw.d.w vr3, vr3, vr3 + vhaddw.q.d vr3, vr3, vr3 + vextrins.w \out0, vr2, \out1 + vextrins.w \out2, vr3, \out3 +.endm + +.macro FILTER_WARP_CLIP_LSX in0, in1, in2, out0, out1 + add.w \in0, \in0, \in1 + addi.w t6, \in0, 512 + srai.w t6, t6, 10 + addi.w t6, t6, 64 + slli.w t6, t6, 3 + fldx.d f1, t5, t6 + vsllwil.h.b vr1, vr1, 0 + vmulwev.w.h vr3, \in2, vr1 + vmaddwod.w.h vr3, \in2, vr1 + vhaddw.d.w vr3, vr3, vr3 + vhaddw.q.d vr3, vr3, vr3 + vextrins.w \out0, vr3, \out1 +.endm + +const warp_sh +.rept 2 +.byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 +.endr +.rept 2 +.byte 18, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +.endr +endconst + +.macro warp_lsx t, shift +function warp_affine_8x8\t\()_8bpc_lsx + addi.d sp, sp, -64 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + fst.d f31, sp, 56 + + la.local t4, warp_sh + ld.h t0, a4, 0 // abcd[0] + ld.h t1, a4, 2 // abcd[1] + + alsl.w t2, a3, a3, 1 + addi.w t3, a5, 0 + la.local t5, dav1d_mc_warp_filter + sub.d a2, a2, t2 + addi.d a2, a2, -3 + vld vr0, a2, 0 + vld vr30, t4, 0 + vld vr31, t4, 32 + + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x00, vr5, 0x00 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x00, vr7, 0x00 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x00, vr9, 0x00 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x00, vr11, 0x00 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x10, vr5, 0x10 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x10, vr7, 0x10 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x10, vr9, 0x10 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x10, vr11, 0x10 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x20, vr5, 0x20 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x20, vr7, 0x20 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x20, vr9, 0x20 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x20, vr11, 0x20 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x30, vr5, 0x30 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x30, vr7, 0x30 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x30, vr9, 0x30 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x30, vr11, 0x30 + + add.w a5, t1, a5 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x00, vr13, 0x00 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x00, vr15, 0x00 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x00, vr17, 0x00 + FILTER_WARP_RND_P_LSX 
vr0, 6, 7, t3, vr18, 0x00, vr19, 0x00 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x10, vr13, 0x10 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x10, vr15, 0x10 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x10, vr17, 0x10 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr18, 0x10, vr19, 0x10 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x20, vr13, 0x20 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x20, vr15, 0x20 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x20, vr17, 0x20 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr18, 0x20, vr19, 0x20 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x30, vr13, 0x30 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x30, vr15, 0x30 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x30, vr17, 0x30 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr18, 0x30, vr19, 0x30 + + vsrarni.h.w vr12, vr4, 3 + vsrarni.h.w vr13, vr5, 3 + vsrarni.h.w vr14, vr6, 3 + vsrarni.h.w vr15, vr7, 3 + vsrarni.h.w vr16, vr8, 3 + vsrarni.h.w vr17, vr9, 3 + vsrarni.h.w vr18, vr10, 3 + vsrarni.h.w vr19, vr11, 3 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x00, vr5, 0x00 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x00, vr7, 0x00 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x00, vr9, 0x00 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x00, vr11, 0x00 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x10, vr5, 0x10 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x10, vr7, 0x10 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x10, vr9, 0x10 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x10, vr11, 0x10 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x20, vr5, 0x20 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x20, vr7, 0x20 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x20, vr9, 0x20 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x20, vr11, 0x20 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x30, vr5, 0x30 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x30, vr7, 0x30 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x30, vr9, 0x30 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x30, vr11, 0x30 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr21, 0x00, vr22, 0x00 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr23, 0x00, vr24, 0x00 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr25, 0x00, vr26, 0x00 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr27, 0x00, vr28, 0x00 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr21, 0x10, vr22, 0x10 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr23, 0x10, vr24, 0x10 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr25, 0x10, vr26, 0x10 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr27, 0x10, vr28, 0x10 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr21, 0x20, vr22, 0x20 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr23, 0x20, vr24, 0x20 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr25, 0x20, vr26, 0x20 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr27, 0x20, vr28, 0x20 + + vsrarni.h.w vr21, vr4, 3 + vsrarni.h.w vr22, vr5, 3 + vsrarni.h.w vr23, vr6, 3 + vsrarni.h.w vr24, vr7, 3 
+ vsrarni.h.w vr25, vr8, 3 + vsrarni.h.w vr26, vr9, 3 + vsrarni.h.w vr27, vr10, 3 + vsrarni.h.w vr28, vr11, 3 + + addi.w t2, a6, 0 // my + ld.h t7, a4, 4 // abcd[2] + ld.h t8, a4, 6 // abcd[3] + +.ifnb \t + slli.d a1, a1, 1 +.endif + + FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 + FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 +.ifnb \t + vssrarni.h.w vr5, vr4, \shift + vst vr5, a0, 0 +.else + vssrarni.hu.w vr5, vr4, \shift + vssrlni.bu.h vr5, vr5, 0 + fst.d f5, a0, 0 +.endif + + vshuf.b vr12, vr21, vr12, vr30 + vshuf.b vr13, vr22, vr13, vr30 + vshuf.b vr14, vr23, vr14, vr30 + vshuf.b vr15, vr24, vr15, vr30 + vshuf.b vr16, vr25, vr16, vr30 + vshuf.b vr17, vr26, vr17, vr30 + vshuf.b vr18, vr27, vr18, vr30 + vshuf.b vr19, vr28, vr19, vr30 + vextrins.h vr30, vr31, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 + FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 +.ifnb \t + vssrarni.h.w vr5, vr4, \shift + vstx vr5, a0, a1 +.else + vssrarni.hu.w vr5, vr4, \shift + vssrlni.bu.h vr5, vr5, 0 + fstx.d f5, a0, a1 +.endif + + vaddi.bu vr31, vr31, 2 + vshuf.b vr12, vr21, vr12, vr30 + vshuf.b vr13, vr22, vr13, vr30 + vshuf.b vr14, vr23, vr14, vr30 + vshuf.b vr15, vr24, vr15, vr30 + vshuf.b vr16, vr25, vr16, vr30 + vshuf.b vr17, vr26, vr17, vr30 + vshuf.b vr18, vr27, vr18, vr30 + vshuf.b vr19, vr28, vr19, vr30 + vextrins.h vr30, vr31, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 + FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 + alsl.d a0, a1, a0, 1 +.ifnb \t + vssrarni.h.w vr5, vr4, \shift + vst vr5, a0, 0 +.else + vssrarni.hu.w vr5, vr4, \shift + vssrlni.bu.h vr5, vr5, 0 + fst.d f5, a0, 0 +.endif + + vaddi.bu vr31, vr31, 2 + vshuf.b vr12, vr21, vr12, vr30 + vshuf.b vr13, vr22, vr13, vr30 + vshuf.b vr14, vr23, vr14, vr30 + vshuf.b vr15, vr24, vr15, vr30 + vshuf.b vr16, vr25, vr16, vr30 + vshuf.b vr17, vr26, vr17, vr30 + vshuf.b vr18, vr27, vr18, vr30 + vshuf.b vr19, vr28, vr19, vr30 + vextrins.h vr30, vr31, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 + FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 +.ifnb \t + vssrarni.h.w vr5, vr4, \shift + vstx vr5, a0, a1 +.else + vssrarni.hu.w vr5, vr4, \shift + vssrlni.bu.h vr5, vr5, 0 + fstx.d f5, a0, a1 +.endif + + vaddi.bu vr31, vr31, 2 + vshuf.b vr12, vr21, vr12, vr30 + vshuf.b vr13, vr22, vr13, vr30 + vshuf.b vr14, 
vr23, vr14, vr30 + vshuf.b vr15, vr24, vr15, vr30 + vshuf.b vr16, vr25, vr16, vr30 + vshuf.b vr17, vr26, vr17, vr30 + vshuf.b vr18, vr27, vr18, vr30 + vshuf.b vr19, vr28, vr19, vr30 + vextrins.h vr30, vr31, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 + FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 + alsl.d a0, a1, a0, 1 +.ifnb \t + vssrarni.h.w vr5, vr4, \shift + vst vr5, a0, 0 +.else + vssrarni.hu.w vr5, vr4, \shift + vssrlni.bu.h vr5, vr5, 0 + fst.d f5, a0, 0 +.endif + + vaddi.bu vr31, vr31, 2 + vshuf.b vr12, vr21, vr12, vr30 + vshuf.b vr13, vr22, vr13, vr30 + vshuf.b vr14, vr23, vr14, vr30 + vshuf.b vr15, vr24, vr15, vr30 + vshuf.b vr16, vr25, vr16, vr30 + vshuf.b vr17, vr26, vr17, vr30 + vshuf.b vr18, vr27, vr18, vr30 + vshuf.b vr19, vr28, vr19, vr30 + vextrins.h vr30, vr31, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 + FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 +.ifnb \t + vssrarni.h.w vr5, vr4, \shift + vstx vr5, a0, a1 +.else + vssrarni.hu.w vr5, vr4, \shift + vssrlni.bu.h vr5, vr5, 0 + fstx.d f5, a0, a1 +.endif + + vaddi.bu vr31, vr31, 2 + vshuf.b vr12, vr21, vr12, vr30 + vshuf.b vr13, vr22, vr13, vr30 + vshuf.b vr14, vr23, vr14, vr30 + vshuf.b vr15, vr24, vr15, vr30 + vshuf.b vr16, vr25, vr16, vr30 + vshuf.b vr17, vr26, vr17, vr30 + vshuf.b vr18, vr27, vr18, vr30 + vshuf.b vr19, vr28, vr19, vr30 + vextrins.h vr30, vr31, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 + FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 + alsl.d a0, a1, a0, 1 +.ifnb \t + vssrarni.h.w vr5, vr4, \shift + vst vr5, a0, 0 +.else + vssrarni.hu.w vr5, vr4, \shift + vssrlni.bu.h vr5, vr5, 0 + fst.d f5, a0, 0 +.endif + + vshuf.b vr12, vr21, vr12, vr30 + vshuf.b vr13, vr22, vr13, vr30 + vshuf.b vr14, vr23, vr14, vr30 + vshuf.b vr15, vr24, vr15, vr30 + vshuf.b vr16, vr25, vr16, vr30 + vshuf.b vr17, vr26, vr17, vr30 + vshuf.b vr18, vr27, vr18, vr30 + vshuf.b vr19, vr28, vr19, vr30 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 + FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 +.ifnb \t + vssrarni.h.w vr5, vr4, \shift + vstx vr5, a0, a1 +.else + vssrarni.hu.w vr5, vr4, \shift + vssrlni.bu.h vr5, vr5, 0 + fstx.d f5, a0, a1 +.endif + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 
+ fld.d f29, sp, 40 + fld.d f30, sp, 48 + fld.d f31, sp, 56 + addi.d sp, sp, 64 +endfunc +.endm + +warp_lsx , 11 +warp_lsx t, 7 + +.macro FILTER_WARP_RND_P_LASX in0, in1, in2, out0, out1, out2, out3 + xvshuf.b xr2, \in0, \in0, \in2 + + addi.w t4, \in1, 512 + srai.w t4, t4, 10 + addi.w t4, t4, 64 + slli.w t4, t4, 3 + vldx vr3, t5, t4 + add.w t3, t3, t0 // tmx += abcd[0] + + addi.w t4, t3, 512 + srai.w t4, t4, 10 + addi.w t4, t4, 64 + slli.w t4, t4, 3 + vldx vr4, t5, t4 + add.w t3, t3, t0 // tmx += abcd[0] + + addi.w t4, t3, 512 + srai.w t4, t4, 10 + addi.w t4, t4, 64 + slli.w t4, t4, 3 + vldx vr5, t5, t4 + add.w t3, t3, t0 // tmx += abcd[0] + + addi.w t4, t3, 512 + srai.w t4, t4, 10 + addi.w t4, t4, 64 + slli.w t4, t4, 3 + vldx vr6, t5, t4 + add.w t3, t3, t0 // tmx += abcd[0] + + xvinsve0.d xr3, xr5, 1 + xvinsve0.d xr3, xr4, 2 + xvinsve0.d xr3, xr6, 3 + + xvmulwev.h.bu.b xr4, xr2, xr3 + xvmulwod.h.bu.b xr5, xr2, xr3 + xvilvl.d xr2, xr5, xr4 + xvilvh.d xr3, xr5, xr4 + xvhaddw.w.h xr2, xr2, xr2 + xvhaddw.w.h xr3, xr3, xr3 + xvhaddw.d.w xr2, xr2, xr2 + xvhaddw.d.w xr3, xr3, xr3 + xvhaddw.q.d xr2, xr2, xr2 + xvhaddw.q.d xr3, xr3, xr3 + + xvextrins.w \out0, xr2, \out1 + xvextrins.w \out2, xr3, \out3 +.endm + +.macro FILTER_WARP_CLIP_LASX in0, in1, in2, out0, out1 + add.w \in0, \in0, \in1 + addi.w t6, \in0, 512 + srai.w t6, t6, 10 + addi.w t6, t6, 64 + slli.w t6, t6, 3 + fldx.d f1, t5, t6 + + add.w t2, t2, t7 + addi.w t6, t2, 512 + srai.w t6, t6, 10 + addi.w t6, t6, 64 + slli.w t6, t6, 3 + fldx.d f2, t5, t6 + + vilvl.d vr0, vr2, vr1 + vext2xv.h.b xr0, xr0 + xvmulwev.w.h xr3, \in2, xr0 + xvmaddwod.w.h xr3, \in2, xr0 + xvhaddw.d.w xr3, xr3, xr3 + xvhaddw.q.d xr3, xr3, xr3 + xvextrins.w \out0, xr3, \out1 +.endm + +const shuf0 +.byte 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 +.byte 1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9, 10 +endconst + +.macro warp_lasx t, shift +function warp_affine_8x8\t\()_8bpc_lasx + addi.d sp, sp, -16 + ld.h t0, a4, 0 // abcd[0] + ld.h t1, a4, 2 // abcd[1] + fst.d f24, sp, 0 + fst.d f25, sp, 8 + + alsl.w t2, a3, a3, 1 + addi.w t3, a5, 0 + la.local t4, warp_sh + la.local t5, dav1d_mc_warp_filter + sub.d a2, a2, t2 + addi.d a2, a2, -3 + vld vr0, a2, 0 + xvld xr24, t4, 0 + xvld xr25, t4, 32 + la.local t2, shuf0 + xvld xr1, t2, 0 + xvpermi.q xr0, xr0, 0x00 + xvaddi.bu xr9, xr1, 4 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x00, xr8, 0x00 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x00, xr11, 0x00 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x10, xr8, 0x10 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x10, xr11, 0x10 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x20, xr8, 0x20 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x20, xr11, 0x20 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x30, xr8, 0x30 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x30, xr11, 0x30 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x00, xr13, 0x00 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x00, xr15, 0x00 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x10, xr13, 0x10 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, 
xr14, 0x10, xr15, 0x10 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x20, xr13, 0x20 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x20, xr15, 0x20 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x30, xr13, 0x30 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x30, xr15, 0x30 + + xvsrarni.h.w xr12, xr7, 3 + xvsrarni.h.w xr13, xr8, 3 + xvsrarni.h.w xr14, xr10, 3 + xvsrarni.h.w xr15, xr11, 3 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x00, xr8, 0x00 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x00, xr11, 0x00 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x10, xr8, 0x10 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x10, xr11, 0x10 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x20, xr8, 0x20 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x20, xr11, 0x20 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x30, xr8, 0x30 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x30, xr11, 0x30 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x00, xr17, 0x00 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x00, xr19, 0x00 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x10, xr17, 0x10 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x10, xr19, 0x10 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x20, xr17, 0x20 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x20, xr19, 0x20 + + xvsrarni.h.w xr16, xr7, 3 + xvsrarni.h.w xr17, xr8, 3 + xvsrarni.h.w xr18, xr10, 3 + xvsrarni.h.w xr19, xr11, 3 + + addi.w t2, a6, 0 // my + ld.h t7, a4, 4 // abcd[2] + ld.h t8, a4, 6 // abcd[3] + +.ifnb \t + slli.d a1, a1, 1 +.endif + + // y = 0 + FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00 + FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10 + FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20 + FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30 + + xvshuf.b xr12, xr16, xr12, xr24 + xvshuf.b xr13, xr17, xr13, xr24 + xvshuf.b xr14, xr18, xr14, xr24 + xvshuf.b xr15, xr19, xr15, xr24 + xvextrins.h xr24, xr25, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00 + FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10 + FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20 + FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30 + +.ifnb \t + xvssrarni.h.w xr21, xr20, \shift + xvpermi.q xr22, xr21, 0x01 + vilvl.h vr23, vr22, vr21 + vilvh.h vr21, vr22, vr21 + vst vr23, a0, 0 + vstx vr21, a0, a1 +.else + xvssrarni.hu.w xr21, xr20, \shift + xvssrlni.bu.h xr22, xr21, 0 + xvpermi.q xr23, xr22, 0x01 + vilvl.b vr21, vr23, vr22 + fst.d f21, a0, 0 + add.d a0, a0, a1 + vstelm.d vr21, a0, 0, 1 +.endif + + xvaddi.bu xr25, xr25, 2 + xvshuf.b xr12, xr16, xr12, xr24 + xvshuf.b xr13, xr17, xr13, xr24 + xvshuf.b xr14, xr18, xr14, xr24 + xvshuf.b xr15, xr19, xr15, xr24 + xvextrins.h 
xr24, xr25, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00 + FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10 + FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20 + FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30 + + xvaddi.bu xr25, xr25, 2 + xvshuf.b xr12, xr16, xr12, xr24 + xvshuf.b xr13, xr17, xr13, xr24 + xvshuf.b xr14, xr18, xr14, xr24 + xvshuf.b xr15, xr19, xr15, xr24 + xvextrins.h xr24, xr25, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00 + FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10 + FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20 + FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30 + +.ifnb \t + xvssrarni.h.w xr21, xr20, \shift + alsl.d a0, a1, a0, 1 + xvpermi.q xr22, xr21, 0x01 + vilvl.h vr23, vr22, vr21 + vilvh.h vr21, vr22, vr21 + vst vr23, a0, 0 + vstx vr21, a0, a1 +.else + xvssrarni.hu.w xr21, xr20, 11 + xvssrlni.bu.h xr22, xr21, 0 + xvpermi.q xr23, xr22, 0x01 + vilvl.b vr21, vr23, vr22 + add.d a0, a0, a1 + fst.d f21, a0, 0 + add.d a0, a0, a1 + vstelm.d vr21, a0, 0, 1 +.endif + + xvaddi.bu xr25, xr25, 2 + xvshuf.b xr12, xr16, xr12, xr24 + xvshuf.b xr13, xr17, xr13, xr24 + xvshuf.b xr14, xr18, xr14, xr24 + xvshuf.b xr15, xr19, xr15, xr24 + xvextrins.h xr24, xr25, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00 + FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10 + FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20 + FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30 + + xvaddi.bu xr25, xr25, 2 + xvshuf.b xr12, xr16, xr12, xr24 + xvshuf.b xr13, xr17, xr13, xr24 + xvshuf.b xr14, xr18, xr14, xr24 + xvshuf.b xr15, xr19, xr15, xr24 + xvextrins.h xr24, xr25, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00 + FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10 + FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20 + FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30 + +.ifnb \t + xvssrarni.h.w xr21, xr20, \shift + alsl.d a0, a1, a0, 1 + xvpermi.q xr22, xr21, 0x01 + vilvl.h vr23, vr22, vr21 + vilvh.h vr21, vr22, vr21 + vst vr23, a0, 0 + vstx vr21, a0, a1 +.else + xvssrarni.hu.w xr21, xr20, 11 + xvssrlni.bu.h xr22, xr21, 0 + xvpermi.q xr23, xr22, 0x01 + vilvl.b vr21, vr23, vr22 + add.d a0, a0, a1 + fst.d f21, a0, 0 + add.d a0, a0, a1 + vstelm.d vr21, a0, 0, 1 +.endif + + xvaddi.bu xr25, xr25, 2 + xvshuf.b xr12, xr16, xr12, xr24 + xvshuf.b xr13, xr17, xr13, xr24 + xvshuf.b xr14, xr18, xr14, xr24 + xvshuf.b xr15, xr19, xr15, xr24 + xvextrins.h xr24, xr25, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00 + FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10 + FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20 + FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30 + + xvshuf.b xr12, xr16, xr12, xr24 + xvshuf.b xr13, xr17, xr13, xr24 + xvshuf.b xr14, xr18, xr14, xr24 + xvshuf.b xr15, xr19, xr15, xr24 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00 + FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10 + FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20 + FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30 + +.ifnb \t + xvssrarni.h.w xr21, xr20, \shift + alsl.d a0, a1, a0, 1 + xvpermi.q xr22, xr21, 0x01 + vilvl.h vr23, vr22, vr21 + vilvh.h vr21, vr22, vr21 + vst vr23, a0, 0 + vstx vr21, a0, a1 +.else + xvssrarni.hu.w xr21, xr20, 11 + xvssrlni.bu.h xr22, xr21, 0 + xvpermi.q xr23, xr22, 0x01 + vilvl.b vr21, vr23, vr22 + add.d a0, a0, a1 + fst.d f21, a0, 0 + add.d a0, a0, a1 + 
vstelm.d vr21, a0, 0, 1 +.endif + fld.d f24, sp, 0 + fld.d f25, sp, 8 + addi.d sp, sp, 16 +endfunc +.endm + +warp_lasx , 11 +warp_lasx t, 7 diff --git a/src/loongarch/mc.h b/src/loongarch/mc.h new file mode 100644 index 0000000..92b438f --- /dev/null +++ b/src/loongarch/mc.h @@ -0,0 +1,58 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * Copyright © 2023, Loongson Technology Corporation Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_LOONGARCH_MC_H +#define DAV1D_SRC_LOONGARCH_MC_H + +#include "config.h" +#include "src/mc.h" +#include "src/cpu.h" + +decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, lsx)); +decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, lsx)); + +decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, lasx)); +decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, lasx)); + +static ALWAYS_INLINE void mc_dsp_init_loongarch(Dav1dMCDSPContext *const c) { +#if BITDEPTH == 8 + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return; + + c->warp8x8 = BF(dav1d_warp_affine_8x8, lsx); + c->warp8x8t = BF(dav1d_warp_affine_8x8t, lsx); + + if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LASX)) return; + + c->warp8x8 = BF(dav1d_warp_affine_8x8, lasx); + c->warp8x8t = BF(dav1d_warp_affine_8x8t, lasx); + +#endif +} + +#endif /* DAV1D_SRC_LOONGARCH_MC_H */ diff --git a/src/mc_tmpl.c b/src/mc_tmpl.c index 20226d8..469fc5f 100644 --- a/src/mc_tmpl.c +++ b/src/mc_tmpl.c @@ -905,6 +905,8 @@ static void resize_c(pixel *dst, const ptrdiff_t dst_stride, #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM #include "src/arm/mc.h" +#elif ARCH_LOONGARCH64 +#include "src/loongarch/mc.h" #elif ARCH_X86 #include "src/x86/mc.h" #endif @@ -946,6 +948,8 @@ COLD void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) { #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM mc_dsp_init_arm(c); +#elif ARCH_LOONGARCH64 + mc_dsp_init_loongarch(c); #elif ARCH_X86 mc_dsp_init_x86(c); #endif diff --git a/src/meson.build b/src/meson.build index ac2a25b..d12667c 100644 --- a/src/meson.build +++ b/src/meson.build @@ -239,6 +239,11 @@ if is_asm_enabled libdav1d_sources += files( 'loongarch/cpu.c', ) + + libdav1d_sources_asm = files( + 'loongarch/mc.S', + ) + libdav1d_asm_objs += libdav1d_sources_asm endif endif -- cgit v1.2.3 
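
[Reviewer note on the warp patch above] To make the shift constants in the two macro instantiations easier to follow ("warp_lsx , 11" / "warp_lasx , 11" for the pixel output and the "t, 7" variants for the 16-bit intermediate output), here is a minimal scalar C sketch of the 8 bpc semantics the LSX/LASX routines implement: an 8-tap horizontal pass into a 15x8 int16 buffer with a rounding shift of 3, followed by an 8-tap vertical pass with a rounding shift of 11 and a clip to [0, 255]. The filter index 64 + ((t + 512) >> 10) and the abcd[] stepping mirror what the assembly computes; the function name, the simplified pointer handling, and the clip helper are illustrative and not the exact mc_tmpl.c reference code.

#include <stddef.h>
#include <stdint.h>

/* Filter table shared with the assembly above; 8 int8 taps per entry. */
extern const int8_t dav1d_mc_warp_filter[][8];

static inline int clip_u8(const int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

/* Scalar model of warp_affine_8x8 for 8 bpc (intermediate_bits = 4). */
static void warp_affine_8x8_sketch(uint8_t *dst, const ptrdiff_t dst_stride,
                                   const uint8_t *src, const ptrdiff_t src_stride,
                                   const int16_t *const abcd, int mx, int my)
{
    int16_t mid[15 * 8], *mid_ptr = mid;

    /* Horizontal pass: 15 rows of 8 outputs, rounding shift by 3. */
    src -= 3 * src_stride + 3;
    for (int y = 0; y < 15; y++, mx += abcd[1]) {
        for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {
            const int8_t *const f = dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];
            int sum = 0;
            for (int k = 0; k < 8; k++)
                sum += f[k] * src[x + k];
            mid_ptr[x] = (int16_t)((sum + 4) >> 3);
        }
        src += src_stride;
        mid_ptr += 8;
    }

    /* Vertical pass: 8 rows of 8 outputs, rounding shift by 11, clipped.
     * The 8x8t variant instead stores (sum + 64) >> 7 as int16_t. */
    mid_ptr = &mid[3 * 8];
    for (int y = 0; y < 8; y++, my += abcd[3]) {
        for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {
            const int8_t *const f = dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];
            int sum = 0;
            for (int k = 0; k < 8; k++)
                sum += f[k] * mid_ptr[(k - 3) * 8 + x];
            dst[x] = (uint8_t)clip_u8((sum + 1024) >> 11);
        }
        mid_ptr += 8;
        dst += dst_stride;
    }
}
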
From bde69a94bf9caef8e104124fcbc351e6bb70d4d9 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Fri, 1 Dec 2023 10:21:11 +0800 Subject: loongarch: Improve the performance of w_avg functions Relative speedup over C code: w_avg_w4_8bpc_c: 8.6 ( 1.00x) w_avg_w4_8bpc_lsx: 1.0 ( 8.53x) w_avg_w4_8bpc_lasx: 1.0 ( 8.79x) w_avg_w8_8bpc_c: 24.4 ( 1.00x) w_avg_w8_8bpc_lsx: 2.7 ( 8.90x) w_avg_w8_8bpc_lasx: 1.6 (15.33x) w_avg_w16_8bpc_c: 77.4 ( 1.00x) w_avg_w16_8bpc_lsx: 6.9 (11.29x) w_avg_w16_8bpc_lasx: 5.2 (14.88x) w_avg_w32_8bpc_c: 303.7 ( 1.00x) w_avg_w32_8bpc_lsx: 27.2 (11.16x) w_avg_w32_8bpc_lasx: 14.2 (21.43x) w_avg_w64_8bpc_c: 725.8 ( 1.00x) w_avg_w64_8bpc_lsx: 66.1 (10.98x) w_avg_w64_8bpc_lasx: 35.4 (20.48x) w_avg_w128_8bpc_c: 1812.6 ( 1.00x) w_avg_w128_8bpc_lsx: 169.9 (10.67x) w_avg_w128_8bpc_lasx: 111.7 (16.23x) --- src/loongarch/mc.S | 373 +++++++++++++++++++++++++++++++++++++++++++++++++++++ src/loongarch/mc.h | 4 + 2 files changed, 377 insertions(+) diff --git a/src/loongarch/mc.S b/src/loongarch/mc.S index 6b51599..bd670b8 100644 --- a/src/loongarch/mc.S +++ b/src/loongarch/mc.S @@ -929,3 +929,376 @@ endfunc warp_lasx , 11 warp_lasx t, 7 + +/* +static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride, + const int16_t *tmp1, const int16_t *tmp2, + const int w, int h, + const int weight HIGHBD_DECL_SUFFIX) +*/ + +#define bpc8_sh 5 // sh = intermediate_bits + 1 +#define bpcw8_sh 8 // sh = intermediate_bits + 4 + +#define bpc_sh bpc8_sh +#define bpcw_sh bpcw8_sh + +function w_avg_8bpc_lsx + addi.d t8, a0, 0 + li.w t2, 16 + sub.w t2, t2, a6 // 16 - weight + vreplgr2vr.h vr21, a6 + vreplgr2vr.h vr22, t2 + + clz.w t0, a4 + li.w t1, 24 + sub.w t0, t0, t1 + la.local t1, .W_AVG_LSX_JRTABLE + alsl.d t0, t0, t1, 1 + ld.h t2, t0, 0 + add.d t1, t1, t2 + jirl $r0, t1, 0 + + .align 3 +.W_AVG_LSX_JRTABLE: + .hword .W_AVG_W128_LSX - .W_AVG_LSX_JRTABLE + .hword .W_AVG_W64_LSX - .W_AVG_LSX_JRTABLE + .hword .W_AVG_W32_LSX - .W_AVG_LSX_JRTABLE + .hword .W_AVG_W16_LSX - .W_AVG_LSX_JRTABLE + .hword .W_AVG_W8_LSX - .W_AVG_LSX_JRTABLE + .hword .W_AVG_W4_LSX - .W_AVG_LSX_JRTABLE + +.W_AVG_W4_LSX: + vld vr0, a2, 0 + vld vr1, a3, 0 + vmulwev.w.h vr2, vr0, vr21 + vmulwod.w.h vr3, vr0, vr21 + vmaddwev.w.h vr2, vr1, vr22 + vmaddwod.w.h vr3, vr1, vr22 + vssrarni.hu.w vr3, vr2, bpcw_sh + vssrlni.bu.h vr1, vr3, 0 + vpickod.w vr4, vr2, vr1 + vilvl.b vr0, vr4, vr1 + fst.s f0, a0, 0 + add.d a0, a0, a1 + vstelm.w vr0, a0, 0, 1 + addi.w a5, a5, -2 + addi.d a2, a2, 16 + addi.d a3, a3, 16 + add.d a0, a1, a0 + blt zero, a5, .W_AVG_W4_LSX + b .W_AVG_END_LSX +.W_AVG_W8_LSX: + vld vr0, a2, 0 + vld vr1, a3, 0 + vmulwev.w.h vr2, vr0, vr21 + vmulwod.w.h vr3, vr0, vr21 + vmaddwev.w.h vr2, vr1, vr22 + vmaddwod.w.h vr3, vr1, vr22 + vssrarni.hu.w vr3, vr2, bpcw_sh + vssrlni.bu.h vr1, vr3, 0 + vpickod.w vr4, vr2, vr1 + vilvl.b vr0, vr4, vr1 + fst.d f0, a0, 0 + addi.w a5, a5, -1 + addi.d a2, a2, 16 + addi.d a3, a3, 16 + add.d a0, a0, a1 + blt zero, a5, .W_AVG_W8_LSX + b .W_AVG_END_LSX +.W_AVG_W16_LSX: + vld vr0, a2, 0 + vld vr2, a2, 16 + vld vr1, a3, 0 + vld vr3, a3, 16 + vmulwev.w.h vr4, vr0, vr21 + vmulwod.w.h vr5, vr0, vr21 + vmulwev.w.h vr6, vr2, vr21 + vmulwod.w.h vr7, vr2, vr21 + vmaddwev.w.h vr4, vr1, vr22 + vmaddwod.w.h vr5, vr1, vr22 + vmaddwev.w.h vr6, vr3, vr22 + vmaddwod.w.h vr7, vr3, vr22 + vssrarni.hu.w vr6, vr4, bpcw_sh + vssrarni.hu.w vr7, vr5, bpcw_sh + vssrlrni.bu.h vr7, vr6, 0 + vshuf4i.w vr8, vr7, 0x4E + vilvl.b vr0, vr8, vr7 + vst vr0, a0, 0 + addi.w a5, a5, -1 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + add.d a0, a0, a1 + 
blt zero, a5, .W_AVG_W16_LSX + b .W_AVG_END_LSX +.W_AVG_W32_LSX: +.rept 2 + vld vr0, a2, 0 + vld vr2, a2, 16 + vld vr1, a3, 0 + vld vr3, a3, 16 + vmulwev.w.h vr4, vr0, vr21 + vmulwod.w.h vr5, vr0, vr21 + vmulwev.w.h vr6, vr2, vr21 + vmulwod.w.h vr7, vr2, vr21 + vmaddwev.w.h vr4, vr1, vr22 + vmaddwod.w.h vr5, vr1, vr22 + vmaddwev.w.h vr6, vr3, vr22 + vmaddwod.w.h vr7, vr3, vr22 + vssrarni.hu.w vr6, vr4, bpcw_sh + vssrarni.hu.w vr7, vr5, bpcw_sh + vssrlrni.bu.h vr7, vr6, 0 + vshuf4i.w vr8, vr7, 0x4E + vilvl.b vr0, vr8, vr7 + vst vr0, a0, 0 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a0, a0, 16 +.endr + addi.w a5, a5, -1 + add.d t8, t8, a1 + add.d a0, t8, zero + blt zero, a5, .W_AVG_W32_LSX + b .W_AVG_END_LSX + +.W_AVG_W64_LSX: +.rept 4 + vld vr0, a2, 0 + vld vr2, a2, 16 + vld vr1, a3, 0 + vld vr3, a3, 16 + vmulwev.w.h vr4, vr0, vr21 + vmulwod.w.h vr5, vr0, vr21 + vmulwev.w.h vr6, vr2, vr21 + vmulwod.w.h vr7, vr2, vr21 + vmaddwev.w.h vr4, vr1, vr22 + vmaddwod.w.h vr5, vr1, vr22 + vmaddwev.w.h vr6, vr3, vr22 + vmaddwod.w.h vr7, vr3, vr22 + vssrarni.hu.w vr6, vr4, bpcw_sh + vssrarni.hu.w vr7, vr5, bpcw_sh + vssrlrni.bu.h vr7, vr6, 0 + vshuf4i.w vr8, vr7, 0x4E + vilvl.b vr0, vr8, vr7 + vst vr0, a0, 0 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a0, a0, 16 +.endr + addi.w a5, a5, -1 + add.d t8, t8, a1 + add.d a0, t8, zero + blt zero, a5, .W_AVG_W64_LSX + b .W_AVG_END_LSX + +.W_AVG_W128_LSX: +.rept 8 + vld vr0, a2, 0 + vld vr2, a2, 16 + vld vr1, a3, 0 + vld vr3, a3, 16 + vmulwev.w.h vr4, vr0, vr21 + vmulwod.w.h vr5, vr0, vr21 + vmulwev.w.h vr6, vr2, vr21 + vmulwod.w.h vr7, vr2, vr21 + vmaddwev.w.h vr4, vr1, vr22 + vmaddwod.w.h vr5, vr1, vr22 + vmaddwev.w.h vr6, vr3, vr22 + vmaddwod.w.h vr7, vr3, vr22 + vssrarni.hu.w vr6, vr4, bpcw_sh + vssrarni.hu.w vr7, vr5, bpcw_sh + vssrlrni.bu.h vr7, vr6, 0 + vshuf4i.w vr8, vr7, 0x4E + vilvl.b vr0, vr8, vr7 + vst vr0, a0, 0 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a0, a0, 16 +.endr + addi.w a5, a5, -1 + add.d t8, t8, a1 + add.d a0, t8, zero + blt zero, a5, .W_AVG_W128_LSX +.W_AVG_END_LSX: +endfunc + +function w_avg_8bpc_lasx + addi.d t8, a0, 0 + li.w t2, 16 + sub.w t2, t2, a6 // 16 - weight + xvreplgr2vr.h xr21, a6 + xvreplgr2vr.h xr22, t2 + + clz.w t0, a4 + li.w t1, 24 + sub.w t0, t0, t1 + la.local t1, .W_AVG_LASX_JRTABLE + alsl.d t0, t0, t1, 1 + ld.h t2, t0, 0 + add.d t1, t1, t2 + jirl $r0, t1, 0 + + .align 3 +.W_AVG_LASX_JRTABLE: + .hword .W_AVG_W128_LASX - .W_AVG_LASX_JRTABLE + .hword .W_AVG_W64_LASX - .W_AVG_LASX_JRTABLE + .hword .W_AVG_W32_LASX - .W_AVG_LASX_JRTABLE + .hword .W_AVG_W16_LASX - .W_AVG_LASX_JRTABLE + .hword .W_AVG_W8_LASX - .W_AVG_LASX_JRTABLE + .hword .W_AVG_W4_LASX - .W_AVG_LASX_JRTABLE + +.W_AVG_W4_LASX: + vld vr0, a2, 0 + vld vr1, a3, 0 + xvpermi.d xr2, xr0, 0xD8 + xvpermi.d xr3, xr1, 0xD8 + xvilvl.h xr4, xr3, xr2 + xvmulwev.w.h xr0, xr4, xr21 + xvmaddwod.w.h xr0, xr4, xr22 + xvssrarni.hu.w xr1, xr0, bpcw_sh + xvssrlni.bu.h xr0, xr1, 0 + fst.s f0, a0, 0 + add.d a0, a0, a1 + xvstelm.w xr0, a0, 0, 4 + addi.w a5, a5, -2 + addi.d a2, a2, 16 + addi.d a3, a3, 16 + add.d a0, a1, a0 + blt zero, a5, .W_AVG_W4_LASX + b .W_AVG_END_LASX + +.W_AVG_W8_LASX: + xvld xr0, a2, 0 + xvld xr1, a3, 0 + xvmulwev.w.h xr2, xr0, xr21 + xvmulwod.w.h xr3, xr0, xr21 + xvmaddwev.w.h xr2, xr1, xr22 + xvmaddwod.w.h xr3, xr1, xr22 + xvssrarni.hu.w xr3, xr2, bpcw_sh + xvssrlni.bu.h xr1, xr3, 0 + xvpickod.w xr4, xr2, xr1 + xvilvl.b xr0, xr4, xr1 + xvstelm.d xr0, a0, 0, 0 + add.d a0, a0, a1 + xvstelm.d xr0, a0, 0, 2 + addi.w a5, a5, -2 + addi.d a2, a2, 32 
+ addi.d a3, a3, 32 + add.d a0, a0, a1 + blt zero, a5, .W_AVG_W8_LASX + b .W_AVG_END_LASX + +.W_AVG_W16_LASX: + xvld xr0, a2, 0 + xvld xr1, a3, 0 + xvmulwev.w.h xr2, xr0, xr21 + xvmulwod.w.h xr3, xr0, xr21 + xvmaddwev.w.h xr2, xr1, xr22 + xvmaddwod.w.h xr3, xr1, xr22 + xvssrarni.hu.w xr3, xr2, bpcw_sh + xvssrlni.bu.h xr1, xr3, 0 + xvpickod.w xr4, xr2, xr1 + xvilvl.b xr0, xr4, xr1 + xvpermi.d xr1, xr0, 0xD8 + vst vr1, a0, 0 + addi.w a5, a5, -1 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + add.d a0, a0, a1 + blt zero, a5, .W_AVG_W16_LASX + b .W_AVG_END_LSX + +.W_AVG_W32_LASX: + xvld xr0, a2, 0 + xvld xr2, a2, 32 + xvld xr1, a3, 0 + xvld xr3, a3, 32 + xvmulwev.w.h xr4, xr0, xr21 + xvmulwod.w.h xr5, xr0, xr21 + xvmulwev.w.h xr6, xr2, xr21 + xvmulwod.w.h xr7, xr2, xr21 + xvmaddwev.w.h xr4, xr1, xr22 + xvmaddwod.w.h xr5, xr1, xr22 + xvmaddwev.w.h xr6, xr3, xr22 + xvmaddwod.w.h xr7, xr3, xr22 + xvssrarni.hu.w xr6, xr4, bpcw_sh + xvssrarni.hu.w xr7, xr5, bpcw_sh + xvssrlni.bu.h xr7, xr6, 0 + xvshuf4i.w xr8, xr7, 0x4E + xvilvl.b xr9, xr8, xr7 + xvpermi.d xr0, xr9, 0xD8 + xvst xr0, a0, 0 + addi.w a5, a5, -1 + addi.d a2, a2, 64 + addi.d a3, a3, 64 + add.d a0, a0, a1 + blt zero, a5, .W_AVG_W32_LASX + b .W_AVG_END_LASX + +.W_AVG_W64_LASX: +.rept 2 + xvld xr0, a2, 0 + xvld xr2, a2, 32 + xvld xr1, a3, 0 + xvld xr3, a3, 32 + xvmulwev.w.h xr4, xr0, xr21 + xvmulwod.w.h xr5, xr0, xr21 + xvmulwev.w.h xr6, xr2, xr21 + xvmulwod.w.h xr7, xr2, xr21 + xvmaddwev.w.h xr4, xr1, xr22 + xvmaddwod.w.h xr5, xr1, xr22 + xvmaddwev.w.h xr6, xr3, xr22 + xvmaddwod.w.h xr7, xr3, xr22 + xvssrarni.hu.w xr6, xr4, bpcw_sh + xvssrarni.hu.w xr7, xr5, bpcw_sh + xvssrlni.bu.h xr7, xr6, 0 + xvshuf4i.w xr8, xr7, 0x4E + xvilvl.b xr9, xr8, xr7 + xvpermi.d xr0, xr9, 0xD8 + xvst xr0, a0, 0 + addi.d a2, a2, 64 + addi.d a3, a3, 64 + addi.d a0, a0, 32 +.endr + addi.w a5, a5, -1 + add.d t8, t8, a1 + add.d a0, t8, zero + blt zero, a5, .W_AVG_W64_LASX + b .W_AVG_END_LASX + +.W_AVG_W128_LASX: +.rept 4 + xvld xr0, a2, 0 + xvld xr2, a2, 32 + xvld xr1, a3, 0 + xvld xr3, a3, 32 + xvmulwev.w.h xr4, xr0, xr21 + xvmulwod.w.h xr5, xr0, xr21 + xvmulwev.w.h xr6, xr2, xr21 + xvmulwod.w.h xr7, xr2, xr21 + xvmaddwev.w.h xr4, xr1, xr22 + xvmaddwod.w.h xr5, xr1, xr22 + xvmaddwev.w.h xr6, xr3, xr22 + xvmaddwod.w.h xr7, xr3, xr22 + xvssrarni.hu.w xr6, xr4, bpcw_sh + xvssrarni.hu.w xr7, xr5, bpcw_sh + xvssrlni.bu.h xr7, xr6, 0 + xvshuf4i.w xr8, xr7, 0x4E + xvilvl.b xr9, xr8, xr7 + xvpermi.d xr0, xr9, 0xD8 + xvst xr0, a0, 0 + addi.d a2, a2, 64 + addi.d a3, a3, 64 + addi.d a0, a0, 32 +.endr + + addi.w a5, a5, -1 + add.d t8, t8, a1 + add.d a0, t8, zero + blt zero, a5, .W_AVG_W128_LASX +.W_AVG_END_LASX: +endfunc + +#undef bpc_sh +#undef bpcw_sh + diff --git a/src/loongarch/mc.h b/src/loongarch/mc.h index 92b438f..f7ad65f 100644 --- a/src/loongarch/mc.h +++ b/src/loongarch/mc.h @@ -32,9 +32,11 @@ #include "src/mc.h" #include "src/cpu.h" +decl_w_avg_fn(BF(dav1d_w_avg, lsx)); decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, lsx)); decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, lsx)); +decl_w_avg_fn(BF(dav1d_w_avg, lasx)); decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, lasx)); decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, lasx)); @@ -44,11 +46,13 @@ static ALWAYS_INLINE void mc_dsp_init_loongarch(Dav1dMCDSPContext *const c) { if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return; + c->w_avg = BF(dav1d_w_avg, lsx); c->warp8x8 = BF(dav1d_warp_affine_8x8, lsx); c->warp8x8t = BF(dav1d_warp_affine_8x8t, lsx); if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LASX)) return; + c->w_avg = BF(dav1d_w_avg, lasx); 
c->warp8x8 = BF(dav1d_warp_affine_8x8, lasx); c->warp8x8t = BF(dav1d_warp_affine_8x8t, lasx); -- cgit v1.2.3 From 4080673c17fe4ee248ba94eb0299b951b0d72fb4 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Fri, 1 Dec 2023 10:29:51 +0800 Subject: loongarch: Improve the performance of mask_c, w_mask_420 functions Relative speedup over C code: mask_w4_8bpc_c: 9.2 ( 1.00x) mask_w4_8bpc_lsx: 1.1 ( 8.31x) mask_w4_8bpc_lasx: 1.2 ( 7.42x) mask_w8_8bpc_c: 27.4 ( 1.00x) mask_w8_8bpc_lsx: 2.6 (10.54x) mask_w8_8bpc_lasx: 1.9 (14.65x) mask_w16_8bpc_c: 87.2 ( 1.00x) mask_w16_8bpc_lsx: 8.0 (10.92x) mask_w16_8bpc_lasx: 6.5 (13.46x) mask_w32_8bpc_c: 343.4 ( 1.00x) mask_w32_8bpc_lsx: 31.7 (10.84x) mask_w32_8bpc_lasx: 22.1 (15.51x) mask_w64_8bpc_c: 824.9 ( 1.00x) mask_w64_8bpc_lsx: 78.0 (10.57x) mask_w64_8bpc_lasx: 54.1 (15.25x) mask_w128_8bpc_c: 2042.9 ( 1.00x) mask_w128_8bpc_lsx: 200.7 (10.18x) mask_w128_8bpc_lasx: 157.1 (13.00x) w_mask_420_w4_8bpc_c: 19.0 ( 1.00x) w_mask_420_w4_8bpc_lsx: 1.7 (11.11x) w_mask_420_w4_8bpc_lasx: 1.2 (15.87x) w_mask_420_w8_8bpc_c: 58.2 ( 1.00x) w_mask_420_w8_8bpc_lsx: 4.6 (12.58x) w_mask_420_w8_8bpc_lasx: 2.5 (23.74x) w_mask_420_w16_8bpc_c: 188.0 ( 1.00x) w_mask_420_w16_8bpc_lsx: 11.8 (15.88x) w_mask_420_w16_8bpc_lasx: 8.3 (22.66x) w_mask_420_w32_8bpc_c: 742.2 ( 1.00x) w_mask_420_w32_8bpc_lsx: 47.3 (15.68x) w_mask_420_w32_8bpc_lasx: 32.7 (22.68x) w_mask_420_w64_8bpc_c: 1786.3 ( 1.00x) w_mask_420_w64_8bpc_lsx: 112.4 (15.89x) w_mask_420_w64_8bpc_lasx: 78.4 (22.78x) w_mask_420_w128_8bpc_c: 4442.2 ( 1.00x) w_mask_420_w128_8bpc_lsx: 298.9 (14.86x) w_mask_420_w128_8bpc_lasx: 220.5 (20.15x) --- src/loongarch/mc.S | 1033 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/loongarch/mc.h | 8 + 2 files changed, 1041 insertions(+) diff --git a/src/loongarch/mc.S b/src/loongarch/mc.S index bd670b8..ba58b22 100644 --- a/src/loongarch/mc.S +++ b/src/loongarch/mc.S @@ -1302,3 +1302,1036 @@ endfunc #undef bpc_sh #undef bpcw_sh +#define mask_sh 10 +/* +static void mask_c(pixel *dst, const ptrdiff_t dst_stride, + const int16_t *tmp1, const int16_t *tmp2, const int w, int h, + const uint8_t *mask HIGHBD_DECL_SUFFIX) +*/ +function mask_8bpc_lsx + vldi vr21, 0x440 // 64 + vxor.v vr19, vr19, vr19 + addi.d t8, a0, 0 + clz.w t0, a4 + li.w t1, 24 + sub.w t0, t0, t1 + la.local t1, .MASK_LSX_JRTABLE + alsl.d t0, t0, t1, 1 + ld.h t2, t0, 0 + add.d t1, t1, t2 + jirl $r0, t1, 0 + + .align 3 +.MASK_LSX_JRTABLE: + .hword .MASK_W128_LSX - .MASK_LSX_JRTABLE + .hword .MASK_W64_LSX - .MASK_LSX_JRTABLE + .hword .MASK_W32_LSX - .MASK_LSX_JRTABLE + .hword .MASK_W16_LSX - .MASK_LSX_JRTABLE + .hword .MASK_W8_LSX - .MASK_LSX_JRTABLE + .hword .MASK_W4_LSX - .MASK_LSX_JRTABLE + +.MASK_W4_LSX: + vld vr0, a2, 0 + vld vr1, a3, 0 + fld.d f22, a6, 0 + + vilvl.b vr2, vr19, vr22 + vsub.h vr3, vr21, vr2 + + vmulwev.w.h vr4, vr0, vr2 + vmulwod.w.h vr5, vr0, vr2 + vmaddwev.w.h vr4, vr1, vr3 + vmaddwod.w.h vr5, vr1, vr3 + vssrarni.hu.w vr5, vr4, mask_sh + vssrlrni.bu.h vr1, vr5, 0 + vpickod.w vr4, vr2, vr1 + vilvl.b vr0, vr4, vr1 + fst.s f0, a0, 0 + add.d a0, a0, a1 + vstelm.w vr0, a0, 0, 1 + addi.d a2, a2, 16 + addi.d a3, a3, 16 + addi.d a6, a6, 8 + add.d a0, a0, a1 + addi.w a5, a5, -2 + blt zero, a5, .MASK_W4_LSX + b .MASK_END_LSX +.MASK_W8_LSX: + vld vr0, a2, 0 + vld vr10, a2, 16 + vld vr1, a3, 0 + vld vr11, a3, 16 + vld vr22, a6, 0 + + vilvl.b vr2, vr19, vr22 + vilvh.b vr12, vr19, vr22 + vsub.h vr3, vr21, vr2 + vsub.h vr13, vr21, vr12 + + vmulwev.w.h vr4, vr0, vr2 + vmulwod.w.h vr5, vr0, vr2 + vmulwev.w.h vr14, vr10, vr12 + 
vmulwod.w.h vr15, vr10, vr12 + vmaddwev.w.h vr4, vr1, vr3 + vmaddwod.w.h vr5, vr1, vr3 + vmaddwev.w.h vr14, vr11, vr13 + vmaddwod.w.h vr15, vr11, vr13 + vssrarni.hu.w vr14, vr4, mask_sh + vssrarni.hu.w vr15, vr5, mask_sh + vssrlrni.bu.h vr15, vr14, 0 + vshuf4i.w vr6, vr15, 0x4E + vilvl.b vr0, vr6, vr15 + fst.d f0, a0, 0 + add.d a0, a0, a1 + vstelm.d vr0, a0, 0, 1 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a6, a6, 16 + add.d a0, a0, a1 + addi.w a5, a5, -2 + blt zero, a5, .MASK_W8_LSX + b .MASK_END_LSX + +.MASK_W16_LSX: + vld vr0, a2, 0 + vld vr10, a2, 16 + vld vr1, a3, 0 + vld vr11, a3, 16 + vld vr22, a6, 0 + + vilvl.b vr2, vr19, vr22 + vilvh.b vr12, vr19, vr22 + vsub.h vr3, vr21, vr2 + vsub.h vr13, vr21, vr12 + + vmulwev.w.h vr4, vr0, vr2 + vmulwod.w.h vr5, vr0, vr2 + vmulwev.w.h vr14, vr10, vr12 + vmulwod.w.h vr15, vr10, vr12 + vmaddwev.w.h vr4, vr1, vr3 + vmaddwod.w.h vr5, vr1, vr3 + vmaddwev.w.h vr14, vr11, vr13 + vmaddwod.w.h vr15, vr11, vr13 + vssrarni.hu.w vr14, vr4, mask_sh + vssrarni.hu.w vr15, vr5, mask_sh + vssrlrni.bu.h vr15, vr14, 0 + vshuf4i.w vr6, vr15, 0x4E + vilvl.b vr0, vr6, vr15 + vst vr0, a0, 0 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a6, a6, 16 + add.d a0, a0, a1 + addi.w a5, a5, -1 + blt zero, a5, .MASK_W16_LSX + b .MASK_END_LSX +.MASK_W32_LSX: +.rept 2 + vld vr0, a2, 0 + vld vr10, a2, 16 + vld vr1, a3, 0 + vld vr11, a3, 16 + vld vr22, a6, 0 + vilvl.b vr2, vr19, vr22 + vilvh.b vr12, vr19, vr22 + vsub.h vr3, vr21, vr2 + vsub.h vr13, vr21, vr12 + vmulwev.w.h vr4, vr0, vr2 + vmulwod.w.h vr5, vr0, vr2 + vmulwev.w.h vr14, vr10, vr12 + vmulwod.w.h vr15, vr10, vr12 + vmaddwev.w.h vr4, vr1, vr3 + vmaddwod.w.h vr5, vr1, vr3 + vmaddwev.w.h vr14, vr11, vr13 + vmaddwod.w.h vr15, vr11, vr13 + vssrarni.hu.w vr14, vr4, mask_sh + vssrarni.hu.w vr15, vr5, mask_sh + vssrlrni.bu.h vr15, vr14, 0 + vshuf4i.w vr6, vr15, 0x4E + vilvl.b vr0, vr6, vr15 + vst vr0, a0, 0 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a6, a6, 16 + addi.d a0, a0, 16 +.endr + add.d t8, t8, a1 + add.d a0, t8, zero + addi.w a5, a5, -1 + blt zero, a5, .MASK_W32_LSX + b .MASK_END_LSX +.MASK_W64_LSX: +.rept 4 + vld vr0, a2, 0 + vld vr10, a2, 16 + vld vr1, a3, 0 + vld vr11, a3, 16 + vld vr22, a6, 0 + vilvl.b vr2, vr19, vr22 + vilvh.b vr12, vr19, vr22 + vsub.h vr3, vr21, vr2 + vsub.h vr13, vr21, vr12 + vmulwev.w.h vr4, vr0, vr2 + vmulwod.w.h vr5, vr0, vr2 + vmulwev.w.h vr14, vr10, vr12 + vmulwod.w.h vr15, vr10, vr12 + vmaddwev.w.h vr4, vr1, vr3 + vmaddwod.w.h vr5, vr1, vr3 + vmaddwev.w.h vr14, vr11, vr13 + vmaddwod.w.h vr15, vr11, vr13 + vssrarni.hu.w vr14, vr4, mask_sh + vssrarni.hu.w vr15, vr5, mask_sh + vssrlrni.bu.h vr15, vr14, 0 + vshuf4i.w vr6, vr15, 0x4E + vilvl.b vr0, vr6, vr15 + vst vr0, a0, 0 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a6, a6, 16 + addi.d a0, a0, 16 +.endr + add.d t8, t8, a1 + add.d a0, t8, zero + addi.w a5, a5, -1 + blt zero, a5, .MASK_W64_LSX + b .MASK_END_LSX +.MASK_W128_LSX: +.rept 8 + vld vr0, a2, 0 + vld vr10, a2, 16 + vld vr1, a3, 0 + vld vr11, a3, 16 + vld vr22, a6, 0 + vilvl.b vr2, vr19, vr22 + vilvh.b vr12, vr19, vr22 + vsub.h vr3, vr21, vr2 + vsub.h vr13, vr21, vr12 + vmulwev.w.h vr4, vr0, vr2 + vmulwod.w.h vr5, vr0, vr2 + vmulwev.w.h vr14, vr10, vr12 + vmulwod.w.h vr15, vr10, vr12 + vmaddwev.w.h vr4, vr1, vr3 + vmaddwod.w.h vr5, vr1, vr3 + vmaddwev.w.h vr14, vr11, vr13 + vmaddwod.w.h vr15, vr11, vr13 + vssrarni.hu.w vr14, vr4, mask_sh + vssrarni.hu.w vr15, vr5, mask_sh + vssrlrni.bu.h vr15, vr14, 0 + vshuf4i.w vr6, vr15, 0x4E + vilvl.b vr0, vr6, vr15 + vst vr0, 
a0, 0 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a6, a6, 16 + addi.d a0, a0, 16 +.endr + add.d t8, t8, a1 + add.d a0, t8, zero + addi.w a5, a5, -1 + blt zero, a5, .MASK_W128_LSX +.MASK_END_LSX: +endfunc + +function mask_8bpc_lasx + xvldi xr21, 0x440 // 64 + xvxor.v xr19, xr19, xr19 + addi.d t8, a0, 0 + clz.w t0, a4 + li.w t1, 24 + sub.w t0, t0, t1 + la.local t1, .MASK_LASX_JRTABLE + alsl.d t0, t0, t1, 1 + ld.h t2, t0, 0 + add.d t1, t1, t2 + jirl $r0, t1, 0 + + .align 3 +.MASK_LASX_JRTABLE: + .hword .MASK_W128_LASX - .MASK_LASX_JRTABLE + .hword .MASK_W64_LASX - .MASK_LASX_JRTABLE + .hword .MASK_W32_LASX - .MASK_LASX_JRTABLE + .hword .MASK_W16_LASX - .MASK_LASX_JRTABLE + .hword .MASK_W8_LASX - .MASK_LASX_JRTABLE + .hword .MASK_W4_LASX - .MASK_LASX_JRTABLE + +.MASK_W4_LASX: + vld vr0, a2, 0 + vld vr1, a3, 0 + fld.d f22, a6, 0 + + vilvl.h vr4, vr1, vr0 + vilvh.h vr14, vr1, vr0 + vilvl.b vr2, vr19, vr22 + vsub.h vr3, vr21, vr2 + xvpermi.q xr14, xr4, 0x20 + vilvl.h vr5, vr3, vr2 + vilvh.h vr15, vr3, vr2 + xvpermi.q xr15, xr5, 0x20 + xvmulwev.w.h xr0, xr14, xr15 + xvmaddwod.w.h xr0, xr14, xr15 + xvssrarni.hu.w xr1, xr0, mask_sh + xvssrlni.bu.h xr2, xr1, 0 + fst.s f2, a0, 0 + add.d a0, a0, a1 + xvstelm.w xr2, a0, 0, 4 + + addi.d a2, a2, 16 + addi.d a3, a3, 16 + addi.d a6, a6, 8 + add.d a0, a0, a1 + addi.w a5, a5, -2 + blt zero, a5, .MASK_W4_LASX + b .MASK_END_LASX + +.MASK_W8_LASX: + xvld xr0, a2, 0 + xvld xr1, a3, 0 + vld vr22, a6, 0 + + vext2xv.hu.bu xr2, xr22 + xvsub.h xr3, xr21, xr2 + xvmulwev.w.h xr4, xr0, xr2 + xvmulwod.w.h xr5, xr0, xr2 + xvmaddwev.w.h xr4, xr1, xr3 + xvmaddwod.w.h xr5, xr1, xr3 + xvssrarni.hu.w xr5, xr4, mask_sh + xvssrlni.bu.h xr1, xr5, 0 + xvpickod.w xr4, xr2, xr1 + xvilvl.b xr0, xr4, xr1 + fst.d f0, a0, 0 + add.d a0, a0, a1 + xvstelm.d xr0, a0, 0, 2 + + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a6, a6, 16 + add.d a0, a0, a1 + addi.w a5, a5, -2 + blt zero, a5, .MASK_W8_LASX + b .MASK_END_LASX + +.MASK_W16_LASX: + xvld xr0, a2, 0 + xvld xr1, a3, 0 + vld vr22, a6, 0 + + vext2xv.hu.bu xr2, xr22 + xvsub.h xr3, xr21, xr2 + xvmulwev.w.h xr4, xr0, xr2 + xvmulwod.w.h xr5, xr0, xr2 + xvmaddwev.w.h xr4, xr1, xr3 + xvmaddwod.w.h xr5, xr1, xr3 + xvssrarni.hu.w xr5, xr4, mask_sh + xvssrlni.bu.h xr1, xr5, 0 + xvpickod.w xr4, xr2, xr1 + xvilvl.b xr0, xr4, xr1 + xvpermi.d xr1, xr0, 0xD8 + vst vr1, a0, 0 + + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a6, a6, 16 + add.d a0, a0, a1 + addi.w a5, a5, -1 + blt zero, a5, .MASK_W16_LASX + b .MASK_END_LASX +.MASK_W32_LASX: + xvld xr0, a2, 0 + xvld xr10, a2, 32 + xvld xr1, a3, 0 + xvld xr11, a3, 32 + xvld xr22, a6, 0 + vext2xv.hu.bu xr2, xr22 + xvpermi.q xr4, xr22, 0x01 + vext2xv.hu.bu xr12, xr4 + xvsub.h xr3, xr21, xr2 + xvsub.h xr13, xr21, xr12 + + xvmulwev.w.h xr4, xr0, xr2 + xvmulwod.w.h xr5, xr0, xr2 + xvmulwev.w.h xr14, xr10, xr12 + xvmulwod.w.h xr15, xr10, xr12 + xvmaddwev.w.h xr4, xr1, xr3 + xvmaddwod.w.h xr5, xr1, xr3 + xvmaddwev.w.h xr14, xr11, xr13 + xvmaddwod.w.h xr15, xr11, xr13 + xvssrarni.hu.w xr14, xr4, mask_sh + xvssrarni.hu.w xr15, xr5, mask_sh + xvssrlni.bu.h xr15, xr14, 0 + xvshuf4i.w xr6, xr15, 0x4E + xvilvl.b xr1, xr6, xr15 + xvpermi.d xr0, xr1, 0xD8 + xvst xr0, a0, 0 + + addi.d a2, a2, 64 + addi.d a3, a3, 64 + addi.d a6, a6, 32 + add.d a0, a0, a1 + addi.w a5, a5, -1 + blt zero, a5, .MASK_W32_LASX + b .MASK_END_LASX + +.MASK_W64_LASX: +.rept 2 + xvld xr0, a2, 0 + xvld xr10, a2, 32 + xvld xr1, a3, 0 + xvld xr11, a3, 32 + xvld xr22, a6, 0 + vext2xv.hu.bu xr2, xr22 + xvpermi.q xr4, xr22, 0x01 + vext2xv.hu.bu 
xr12, xr4 + xvsub.h xr3, xr21, xr2 + xvsub.h xr13, xr21, xr12 + + xvmulwev.w.h xr4, xr0, xr2 + xvmulwod.w.h xr5, xr0, xr2 + xvmulwev.w.h xr14, xr10, xr12 + xvmulwod.w.h xr15, xr10, xr12 + xvmaddwev.w.h xr4, xr1, xr3 + xvmaddwod.w.h xr5, xr1, xr3 + xvmaddwev.w.h xr14, xr11, xr13 + xvmaddwod.w.h xr15, xr11, xr13 + xvssrarni.hu.w xr14, xr4, mask_sh + xvssrarni.hu.w xr15, xr5, mask_sh + xvssrlni.bu.h xr15, xr14, 0 + xvshuf4i.w xr6, xr15, 0x4E + xvilvl.b xr1, xr6, xr15 + xvpermi.d xr0, xr1, 0xD8 + xvst xr0, a0, 0 + addi.d a2, a2, 64 + addi.d a3, a3, 64 + addi.d a6, a6, 32 + addi.d a0, a0, 32 +.endr + add.d t8, t8, a1 + add.d a0, t8, zero + addi.w a5, a5, -1 + blt zero, a5, .MASK_W64_LASX + b .MASK_END_LASX + +.MASK_W128_LASX: +.rept 4 + xvld xr0, a2, 0 + xvld xr10, a2, 32 + xvld xr1, a3, 0 + xvld xr11, a3, 32 + xvld xr22, a6, 0 + vext2xv.hu.bu xr2, xr22 + xvpermi.q xr4, xr22, 0x01 + vext2xv.hu.bu xr12, xr4 + xvsub.h xr3, xr21, xr2 + xvsub.h xr13, xr21, xr12 + + xvmulwev.w.h xr4, xr0, xr2 + xvmulwod.w.h xr5, xr0, xr2 + xvmulwev.w.h xr14, xr10, xr12 + xvmulwod.w.h xr15, xr10, xr12 + xvmaddwev.w.h xr4, xr1, xr3 + xvmaddwod.w.h xr5, xr1, xr3 + xvmaddwev.w.h xr14, xr11, xr13 + xvmaddwod.w.h xr15, xr11, xr13 + xvssrarni.hu.w xr14, xr4, mask_sh + xvssrarni.hu.w xr15, xr5, mask_sh + xvssrlni.bu.h xr15, xr14, 0 + xvshuf4i.w xr6, xr15, 0x4E + xvilvl.b xr1, xr6, xr15 + xvpermi.d xr0, xr1, 0xD8 + xvst xr0, a0, 0 + + addi.d a2, a2, 64 + addi.d a3, a3, 64 + addi.d a6, a6, 32 + addi.d a0, a0, 32 +.endr + add.d t8, t8, a1 + add.d a0, t8, zero + addi.w a5, a5, -1 + blt zero, a5, .MASK_W128_LASX +.MASK_END_LASX: +endfunc + +/* +static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride, + const int16_t *tmp1, const int16_t *tmp2, const int w, int h, + uint8_t *mask, const int sign, + const int ss_hor, const int ss_ver HIGHBD_DECL_SUFFIX) +*/ +function w_mask_420_8bpc_lsx + addi.d sp, sp, -24 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + vldi vr20, 0x440 + vreplgr2vr.h vr21, a7 + vldi vr22, 0x426 + + clz.w t0, a4 + li.w t1, 24 + sub.w t0, t0, t1 + la.local t1, .WMASK420_LSX_JRTABLE + alsl.d t0, t0, t1, 1 + ld.h t8, t0, 0 + add.d t1, t1, t8 + jirl $r0, t1, 0 + + .align 3 +.WMASK420_LSX_JRTABLE: + .hword .WMASK420_W128_LSX - .WMASK420_LSX_JRTABLE + .hword .WMASK420_W64_LSX - .WMASK420_LSX_JRTABLE + .hword .WMASK420_W32_LSX - .WMASK420_LSX_JRTABLE + .hword .WMASK420_W16_LSX - .WMASK420_LSX_JRTABLE + .hword .WMASK420_W8_LSX - .WMASK420_LSX_JRTABLE + .hword .WMASK420_W4_LSX - .WMASK420_LSX_JRTABLE + +.WMASK420_W4_LSX: + vld vr0, a2, 0 + vld vr1, a2, 16 + vld vr2, a3, 0 + vld vr3, a3, 16 + addi.w a5, a5, -4 + + vabsd.h vr4, vr0, vr2 + vabsd.h vr5, vr1, vr3 + vaddi.hu vr4, vr4, 8 + vaddi.hu vr5, vr5, 8 + vsrli.h vr4, vr4, 8 + vsrli.h vr5, vr5, 8 + vadd.h vr4, vr4, vr22 + vadd.h vr5, vr5, vr22 + vmin.hu vr6, vr4, vr20 + vmin.hu vr7, vr5, vr20 + vsub.h vr8, vr20, vr6 + vsub.h vr9, vr20, vr7 + vmulwev.w.h vr4, vr6, vr0 + vmulwod.w.h vr5, vr6, vr0 + vmulwev.w.h vr10, vr7, vr1 + vmulwod.w.h vr11, vr7, vr1 + vmaddwev.w.h vr4, vr8, vr2 + vmaddwod.w.h vr5, vr8, vr2 + vmaddwev.w.h vr10, vr9, vr3 + vmaddwod.w.h vr11, vr9, vr3 + vilvl.w vr0, vr5, vr4 + vilvh.w vr1, vr5, vr4 + vilvl.w vr2, vr11, vr10 + vilvh.w vr3, vr11, vr10 + vssrarni.hu.w vr1, vr0, 10 + vssrarni.hu.w vr3, vr2, 10 + vssrlni.bu.h vr3, vr1, 0 + vstelm.w vr3, a0, 0, 0 + add.d a0, a0, a1 + vstelm.w vr3, a0, 0, 1 + add.d a0, a0, a1 + vstelm.w vr3, a0, 0, 2 + add.d a0, a0, a1 + vstelm.w vr3, a0, 0, 3 + add.d a0, a0, a1 + vpickev.h vr0, vr7, vr6 + 
vpickod.h vr1, vr7, vr6 + vadd.h vr0, vr0, vr1 + vshuf4i.h vr0, vr0, 0xd8 + vhaddw.w.h vr2, vr0, vr0 + vpickev.h vr2, vr2, vr2 + vsub.h vr2, vr2, vr21 + vaddi.hu vr2, vr2, 2 + vssrani.bu.h vr2, vr2, 2 + vstelm.w vr2, a6, 0, 0 + + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a6, a6, 4 + blt zero, a5, .WMASK420_W4_LSX + b .END_W420 + +.WMASK420_W8_LSX: + vld vr0, a2, 0 + vld vr1, a2, 16 + vld vr2, a3, 0 + vld vr3, a3, 16 + addi.w a5, a5, -2 + + vabsd.h vr4, vr0, vr2 + vabsd.h vr5, vr1, vr3 + vaddi.hu vr4, vr4, 8 + vaddi.hu vr5, vr5, 8 + vsrli.h vr4, vr4, 8 + vsrli.h vr5, vr5, 8 + vadd.h vr4, vr4, vr22 + vadd.h vr5, vr5, vr22 + vmin.hu vr6, vr4, vr20 + vmin.hu vr7, vr5, vr20 + vsub.h vr8, vr20, vr6 + vsub.h vr9, vr20, vr7 + vmulwev.w.h vr4, vr6, vr0 + vmulwod.w.h vr5, vr6, vr0 + vmulwev.w.h vr10, vr7, vr1 + vmulwod.w.h vr11, vr7, vr1 + vmaddwev.w.h vr4, vr8, vr2 + vmaddwod.w.h vr5, vr8, vr2 + vmaddwev.w.h vr10, vr9, vr3 + vmaddwod.w.h vr11, vr9, vr3 + vssrarni.hu.w vr10, vr4, 10 + vssrarni.hu.w vr11, vr5, 10 + vssrlni.bu.h vr11, vr10, 0 + vshuf4i.w vr0, vr11, 0x4E + vilvl.b vr3, vr0, vr11 + vstelm.d vr3, a0, 0, 0 + add.d a0, a0, a1 + vstelm.d vr3, a0, 0, 1 + add.d a0, a0, a1 + vpickev.h vr0, vr7, vr6 + vpickod.h vr1, vr7, vr6 + vadd.h vr0, vr0, vr1 + vilvh.d vr2, vr0, vr0 + vadd.h vr2, vr2, vr0 + vsub.h vr2, vr2, vr21 + vaddi.hu vr2, vr2, 2 + vssrani.bu.h vr2, vr2, 2 + vstelm.w vr2, a6, 0, 0 + + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a6, a6, 4 + blt zero, a5, .WMASK420_W8_LSX + b .END_W420 + +.WMASK420_W16_LSX: + vld vr0, a2, 0 + vld vr1, a2, 16 + alsl.d a2, a4, a2, 1 + vld vr2, a2, 0 + vld vr3, a2, 16 + vld vr4, a3, 0 + vld vr5, a3, 16 + alsl.d a3, a4, a3, 1 + vld vr6, a3, 0 + vld vr7, a3, 16 + + vabsd.h vr8, vr0, vr4 + vabsd.h vr9, vr1, vr5 + vabsd.h vr10, vr2, vr6 + vabsd.h vr11, vr3, vr7 + vaddi.hu vr8, vr8, 8 + vaddi.hu vr9, vr9, 8 + vaddi.hu vr10, vr10, 8 + vaddi.hu vr11, vr11, 8 + vsrli.h vr8, vr8, 8 + vsrli.h vr9, vr9, 8 + vsrli.h vr10, vr10, 8 + vsrli.h vr11, vr11, 8 + vadd.h vr8, vr8, vr22 + vadd.h vr9, vr9, vr22 + vadd.h vr10, vr10, vr22 + vadd.h vr11, vr11, vr22 + vmin.hu vr12, vr8, vr20 + vmin.hu vr13, vr9, vr20 + vmin.hu vr14, vr10, vr20 + vmin.hu vr15, vr11, vr20 + vsub.h vr16, vr20, vr12 + vsub.h vr17, vr20, vr13 + vsub.h vr18, vr20, vr14 + vsub.h vr19, vr20, vr15 + vmulwev.w.h vr8, vr12, vr0 + vmulwod.w.h vr9, vr12, vr0 + vmulwev.w.h vr10, vr13, vr1 + vmulwod.w.h vr11, vr13, vr1 + vmulwev.w.h vr23, vr14, vr2 + vmulwod.w.h vr24, vr14, vr2 + vmulwev.w.h vr25, vr15, vr3 + vmulwod.w.h vr26, vr15, vr3 + vmaddwev.w.h vr8, vr16, vr4 + vmaddwod.w.h vr9, vr16, vr4 + vmaddwev.w.h vr10, vr17, vr5 + vmaddwod.w.h vr11, vr17, vr5 + vmaddwev.w.h vr23, vr18, vr6 + vmaddwod.w.h vr24, vr18, vr6 + vmaddwev.w.h vr25, vr19, vr7 + vmaddwod.w.h vr26, vr19, vr7 + vssrarni.hu.w vr10, vr8, 10 + vssrarni.hu.w vr11, vr9, 10 + vssrarni.hu.w vr25, vr23, 10 + vssrarni.hu.w vr26, vr24, 10 + vssrlni.bu.h vr11, vr10, 0 + vssrlni.bu.h vr26, vr25, 0 + vshuf4i.w vr0, vr11, 0x4E + vshuf4i.w vr1, vr26, 0x4E + vilvl.b vr3, vr0, vr11 + vilvl.b vr7, vr1, vr26 + vst vr3, a0, 0 + vstx vr7, a0, a1 + vpickev.h vr0, vr13, vr12 + vpickod.h vr1, vr13, vr12 + vpickev.h vr2, vr15, vr14 + vpickod.h vr3, vr15, vr14 + vadd.h vr4, vr0, vr1 + vadd.h vr5, vr2, vr3 + vadd.h vr4, vr4, vr5 + vsub.h vr4, vr4, vr21 + vssrarni.bu.h vr4, vr4, 2 + vstelm.d vr4, a6, 0, 0 + + alsl.d a2, a4, a2, 1 + alsl.d a3, a4, a3, 1 + alsl.d a0, a1, a0, 1 + addi.d a6, a6, 8 + addi.w a5, a5, -2 + blt zero, a5, .WMASK420_W16_LSX + b .END_W420 + 
+.WMASK420_W32_LSX: +.WMASK420_W64_LSX: +.WMASK420_W128_LSX: + +.LOOP_W32_420_LSX: + add.d t1, a2, zero + add.d t2, a3, zero + add.d t3, a0, zero + add.d t4, a6, zero + alsl.d t5, a4, t1, 1 + alsl.d t6, a4, t2, 1 + or t7, a4, a4 + +.W32_420_LSX: + vld vr0, t1, 0 + vld vr1, t1, 16 + vld vr2, t2, 0 + vld vr3, t2, 16 + vld vr4, t5, 0 + vld vr5, t5, 16 + vld vr6, t6, 0 + vld vr7, t6, 16 + addi.d t1, t1, 32 + addi.d t2, t2, 32 + addi.d t5, t5, 32 + addi.d t6, t6, 32 + addi.w t7, t7, -16 + vabsd.h vr8, vr0, vr2 + vabsd.h vr9, vr1, vr3 + vabsd.h vr10, vr4, vr6 + vabsd.h vr11, vr5, vr7 + vaddi.hu vr8, vr8, 8 + vaddi.hu vr9, vr9, 8 + vaddi.hu vr10, vr10, 8 + vaddi.hu vr11, vr11, 8 + vsrli.h vr8, vr8, 8 + vsrli.h vr9, vr9, 8 + vsrli.h vr10, vr10, 8 + vsrli.h vr11, vr11, 8 + vadd.h vr8, vr8, vr22 + vadd.h vr9, vr9, vr22 + vadd.h vr10, vr10, vr22 + vadd.h vr11, vr11, vr22 + vmin.hu vr12, vr8, vr20 + vmin.hu vr13, vr9, vr20 + vmin.hu vr14, vr10, vr20 + vmin.hu vr15, vr11, vr20 + vsub.h vr16, vr20, vr12 + vsub.h vr17, vr20, vr13 + vsub.h vr18, vr20, vr14 + vsub.h vr19, vr20, vr15 + vmulwev.w.h vr8, vr12, vr0 + vmulwod.w.h vr9, vr12, vr0 + vmulwev.w.h vr10, vr13, vr1 + vmulwod.w.h vr11, vr13, vr1 + vmulwev.w.h vr23, vr14, vr4 + vmulwod.w.h vr24, vr14, vr4 + vmulwev.w.h vr25, vr15, vr5 + vmulwod.w.h vr26, vr15, vr5 + vmaddwev.w.h vr8, vr16, vr2 + vmaddwod.w.h vr9, vr16, vr2 + vmaddwev.w.h vr10, vr17, vr3 + vmaddwod.w.h vr11, vr17, vr3 + vmaddwev.w.h vr23, vr18, vr6 + vmaddwod.w.h vr24, vr18, vr6 + vmaddwev.w.h vr25, vr19, vr7 + vmaddwod.w.h vr26, vr19, vr7 + vssrarni.hu.w vr10, vr8, 10 + vssrarni.hu.w vr11, vr9, 10 + vssrarni.hu.w vr25, vr23, 10 + vssrarni.hu.w vr26, vr24, 10 + vssrlni.bu.h vr11, vr10, 0 + vssrlni.bu.h vr26, vr25, 0 + vshuf4i.w vr8, vr11, 0x4E + vshuf4i.w vr9, vr26, 0x4E + vilvl.b vr3, vr8, vr11 + vilvl.b vr7, vr9, vr26 + vst vr3, t3, 0 + vstx vr7, a1, t3 + addi.d t3, t3, 16 + vpickev.h vr8, vr13, vr12 + vpickod.h vr9, vr13, vr12 + vpickev.h vr10, vr15, vr14 + vpickod.h vr11, vr15, vr14 + vadd.h vr8, vr8, vr9 + vadd.h vr10, vr10, vr11 + vadd.h vr12, vr8, vr10 + vsub.h vr12, vr12, vr21 + vssrarni.bu.h vr12, vr12, 2 + vstelm.d vr12, t4, 0, 0 + addi.d t4, t4, 8 + bne t7, zero, .W32_420_LSX + + alsl.d a2, a4, a2, 2 + alsl.d a3, a4, a3, 2 + alsl.d a0, a1, a0, 1 + srai.w t8, a4, 1 + add.d a6, a6, t8 + addi.w a5, a5, -2 + blt zero, a5, .LOOP_W32_420_LSX + +.END_W420: + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + addi.d sp, sp, 24 +endfunc + +function w_mask_420_8bpc_lasx + xvldi xr20, 0x440 + xvreplgr2vr.h xr21, a7 + xvldi xr22, 0x426 + + clz.w t0, a4 + li.w t1, 24 + sub.w t0, t0, t1 + la.local t1, .WMASK420_LASX_JRTABLE + alsl.d t0, t0, t1, 1 + ld.h t8, t0, 0 + add.d t1, t1, t8 + jirl $r0, t1, 0 + + .align 3 +.WMASK420_LASX_JRTABLE: + .hword .WMASK420_W128_LASX - .WMASK420_LASX_JRTABLE + .hword .WMASK420_W64_LASX - .WMASK420_LASX_JRTABLE + .hword .WMASK420_W32_LASX - .WMASK420_LASX_JRTABLE + .hword .WMASK420_W16_LASX - .WMASK420_LASX_JRTABLE + .hword .WMASK420_W8_LASX - .WMASK420_LASX_JRTABLE + .hword .WMASK420_W4_LASX - .WMASK420_LASX_JRTABLE + +.WMASK420_W4_LASX: + xvld xr0, a2, 0 + xvld xr1, a3, 0 + addi.w a5, a5, -4 + + xvabsd.h xr2, xr0, xr1 + xvaddi.hu xr2, xr2, 8 + xvsrli.h xr2, xr2, 8 + xvadd.h xr2, xr2, xr22 + xvmin.hu xr3, xr2, xr20 + xvsub.h xr4, xr20, xr3 + xvmulwev.w.h xr5, xr3, xr0 + xvmulwod.w.h xr6, xr3, xr0 + xvmaddwev.w.h xr5, xr4, xr1 + xvmaddwod.w.h xr6, xr4, xr1 + xvilvl.w xr7, xr6, xr5 + xvilvh.w xr8, xr6, xr5 + xvssrarni.hu.w xr8, xr7, 10 + xvssrlni.bu.h 
xr9, xr8, 0 + vstelm.w vr9, a0, 0, 0 + add.d a0, a0, a1 + vstelm.w vr9, a0, 0, 1 + add.d a0, a0, a1 + xvstelm.w xr9, a0, 0, 4 + add.d a0, a0, a1 + xvstelm.w xr9, a0, 0, 5 + add.d a0, a0, a1 + + xvhaddw.w.h xr3, xr3, xr3 + xvpermi.d xr4, xr3, 0xb1 + xvadd.h xr3, xr3, xr4 + xvpickev.h xr3, xr3, xr3 + xvsub.h xr3, xr3, xr21 + xvssrarni.bu.h xr3, xr3, 2 + vstelm.h vr3, a6, 0, 0 + xvstelm.h xr3, a6, 2, 8 + + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a6, a6, 4 + blt zero, a5, .WMASK420_W4_LASX + b .END_W420_LASX + +.WMASK420_W8_LASX: + xvld xr0, a2, 0 + xvld xr1, a2, 32 + xvld xr2, a3, 0 + xvld xr3, a3, 32 + addi.w a5, a5, -4 + + xvabsd.h xr4, xr0, xr2 + xvabsd.h xr5, xr1, xr3 + xvaddi.hu xr4, xr4, 8 + xvaddi.hu xr5, xr5, 8 + xvsrli.h xr4, xr4, 8 + xvsrli.h xr5, xr5, 8 + xvadd.h xr4, xr4, xr22 + xvadd.h xr5, xr5, xr22 + xvmin.hu xr6, xr4, xr20 + xvmin.hu xr7, xr5, xr20 + xvsub.h xr8, xr20, xr6 + xvsub.h xr9, xr20, xr7 + xvmulwev.w.h xr10, xr6, xr0 + xvmulwod.w.h xr11, xr6, xr0 + xvmulwev.w.h xr12, xr7, xr1 + xvmulwod.w.h xr13, xr7, xr1 + xvmaddwev.w.h xr10, xr8, xr2 + xvmaddwod.w.h xr11, xr8, xr2 + xvmaddwev.w.h xr12, xr9, xr3 + xvmaddwod.w.h xr13, xr9, xr3 + xvssrarni.hu.w xr12, xr10, 10 + xvssrarni.hu.w xr13, xr11, 10 + xvssrlni.bu.h xr13, xr12, 0 + xvshuf4i.w xr1, xr13, 0x4E + xvilvl.b xr17, xr1, xr13 + vstelm.d vr17, a0, 0, 0 + add.d a0, a0, a1 + xvstelm.d xr17, a0, 0, 2 + add.d a0, a0, a1 + xvstelm.d xr17, a0, 0, 1 + add.d a0, a0, a1 + xvstelm.d xr17, a0, 0, 3 + add.d a0, a0, a1 + + xvhaddw.w.h xr6, xr6, xr6 + xvhaddw.w.h xr7, xr7, xr7 + xvpickev.h xr8, xr7, xr6 + xvpermi.q xr9, xr8, 0x01 + vadd.h vr8, vr8, vr9 + vsub.h vr8, vr8, vr21 + vssrarni.bu.h vr8, vr8, 2 + vstelm.d vr8, a6, 0, 0 + addi.d a2, a2, 64 + addi.d a3, a3, 64 + addi.d a6, a6, 8 + blt zero, a5, .WMASK420_W8_LASX + b .END_W420_LASX + +.WMASK420_W16_LASX: + xvld xr0, a2, 0 + xvld xr1, a2, 32 + xvld xr2, a3, 0 + xvld xr3, a3, 32 + addi.w a5, a5, -2 + + xvabsd.h xr4, xr0, xr2 + xvabsd.h xr5, xr1, xr3 + xvaddi.hu xr4, xr4, 8 + xvaddi.hu xr5, xr5, 8 + xvsrli.h xr4, xr4, 8 + xvsrli.h xr5, xr5, 8 + xvadd.h xr4, xr4, xr22 + xvadd.h xr5, xr5, xr22 + xvmin.hu xr4, xr4, xr20 + xvmin.hu xr5, xr5, xr20 + xvsub.h xr6, xr20, xr4 + xvsub.h xr7, xr20, xr5 + xvmulwev.w.h xr8, xr4, xr0 + xvmulwod.w.h xr9, xr4, xr0 + xvmulwev.w.h xr10, xr5, xr1 + xvmulwod.w.h xr11, xr5, xr1 + xvmaddwev.w.h xr8, xr6, xr2 + xvmaddwod.w.h xr9, xr6, xr2 + xvmaddwev.w.h xr10, xr7, xr3 + xvmaddwod.w.h xr11, xr7, xr3 + xvssrarni.hu.w xr10, xr8, 10 + xvssrarni.hu.w xr11, xr9, 10 + xvssrlni.bu.h xr11, xr10, 0 + xvshuf4i.w xr8, xr11, 0x4E + xvilvl.b xr15, xr8, xr11 + xvpermi.d xr16, xr15, 0xd8 + vst vr16, a0, 0 + add.d a0, a0, a1 + xvpermi.q xr16, xr16, 0x01 + vst vr16, a0, 0 + add.d a0, a0, a1 + + xvhaddw.w.h xr4, xr4, xr4 + xvhaddw.w.h xr5, xr5, xr5 + xvadd.h xr4, xr5, xr4 + xvpickev.h xr6, xr4, xr4 + xvpermi.d xr7, xr6, 0x08 + vsub.h vr7, vr7, vr21 + vssrarni.bu.h vr7, vr7, 2 + vstelm.d vr7, a6, 0, 0 + + addi.d a2, a2, 64 + addi.d a3, a3, 64 + addi.d a6, a6, 8 + blt zero, a5, .WMASK420_W16_LASX + b .END_W420_LASX + +.WMASK420_W32_LASX: +.WMASK420_W64_LASX: +.WMASK420_W128_LASX: + +.LOOP_W32_420_LASX: + add.d t1, a2, zero + add.d t2, a3, zero + add.d t3, a0, zero + add.d t4, a6, zero + alsl.d t5, a4, t1, 1 + alsl.d t6, a4, t2, 1 + or t7, a4, a4 +.W32_420_LASX: + xvld xr0, t1, 0 + xvld xr1, t2, 0 + xvld xr2, t5, 0 + xvld xr3, t6, 0 + addi.d t1, t1, 32 + addi.d t2, t2, 32 + addi.d t5, t5, 32 + addi.d t6, t6, 32 + addi.w t7, t7, -16 + xvabsd.h xr4, xr0, xr1 + xvabsd.h 
xr5, xr2, xr3 + xvaddi.hu xr4, xr4, 8 + xvaddi.hu xr5, xr5, 8 + xvsrli.h xr4, xr4, 8 + xvsrli.h xr5, xr5, 8 + xvadd.h xr4, xr4, xr22 + xvadd.h xr5, xr5, xr22 + xvmin.hu xr6, xr4, xr20 + xvmin.hu xr7, xr5, xr20 + xvsub.h xr8, xr20, xr6 + xvsub.h xr9, xr20, xr7 + xvmulwev.w.h xr10, xr6, xr0 + xvmulwod.w.h xr11, xr6, xr0 + xvmulwev.w.h xr12, xr7, xr2 + xvmulwod.w.h xr13, xr7, xr2 + xvmaddwev.w.h xr10, xr8, xr1 + xvmaddwod.w.h xr11, xr8, xr1 + xvmaddwev.w.h xr12, xr9, xr3 + xvmaddwod.w.h xr13, xr9, xr3 + xvssrarni.hu.w xr12, xr10, 10 + xvssrarni.hu.w xr13, xr11, 10 + xvssrlni.bu.h xr13, xr12, 0 + xvshuf4i.w xr10, xr13, 0x4E + xvilvl.b xr17, xr10, xr13 + xvpermi.d xr18, xr17, 0x08 + xvpermi.d xr19, xr17, 0x0d + vst vr18, t3, 0 + vstx vr19, t3, a1 + addi.d t3, t3, 16 + + xvhaddw.w.h xr6, xr6, xr6 + xvhaddw.w.h xr7, xr7, xr7 + xvadd.h xr6, xr7, xr6 + xvpickev.h xr7, xr6, xr6 + xvpermi.d xr8, xr7, 0x08 + vsub.h vr9, vr8, vr21 + vssrarni.bu.h vr9, vr9, 2 + vstelm.d vr9, t4, 0, 0 + addi.d t4, t4, 8 + bne t7, zero, .W32_420_LASX + + alsl.d a2, a4, a2, 2 + alsl.d a3, a4, a3, 2 + alsl.d a0, a1, a0, 1 + srai.w t8, a4, 1 + add.d a6, a6, t8 + addi.w a5, a5, -2 + blt zero, a5, .LOOP_W32_420_LASX + +.END_W420_LASX: +endfunc diff --git a/src/loongarch/mc.h b/src/loongarch/mc.h index f7ad65f..7c010db 100644 --- a/src/loongarch/mc.h +++ b/src/loongarch/mc.h @@ -33,12 +33,16 @@ #include "src/cpu.h" decl_w_avg_fn(BF(dav1d_w_avg, lsx)); +decl_mask_fn(BF(dav1d_mask, lsx)); decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, lsx)); decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, lsx)); +decl_w_mask_fn(BF(dav1d_w_mask_420, lsx)); decl_w_avg_fn(BF(dav1d_w_avg, lasx)); +decl_mask_fn(BF(dav1d_mask, lasx)); decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, lasx)); decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, lasx)); +decl_w_mask_fn(BF(dav1d_w_mask_420, lasx)); static ALWAYS_INLINE void mc_dsp_init_loongarch(Dav1dMCDSPContext *const c) { #if BITDEPTH == 8 @@ -47,14 +51,18 @@ static ALWAYS_INLINE void mc_dsp_init_loongarch(Dav1dMCDSPContext *const c) { if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return; c->w_avg = BF(dav1d_w_avg, lsx); + c->mask = BF(dav1d_mask, lsx); c->warp8x8 = BF(dav1d_warp_affine_8x8, lsx); c->warp8x8t = BF(dav1d_warp_affine_8x8t, lsx); + c->w_mask[2] = BF(dav1d_w_mask_420, lsx); if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LASX)) return; c->w_avg = BF(dav1d_w_avg, lasx); + c->mask = BF(dav1d_mask, lasx); c->warp8x8 = BF(dav1d_warp_affine_8x8, lasx); c->warp8x8t = BF(dav1d_warp_affine_8x8t, lasx); + c->w_mask[2] = BF(dav1d_w_mask_420, lasx); #endif } -- cgit v1.2.3 From d61886753328cb11227580ed411ac5d889486d63 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Fri, 1 Dec 2023 11:08:09 +0800 Subject: loongarch: Improve the performance of avg functions Relative speedup over C code: avg_w4_8bpc_c: 7.0 ( 1.00x) avg_w4_8bpc_lsx: 0.8 ( 8.69x) avg_w4_8bpc_lasx: 0.8 ( 8.94x) avg_w8_8bpc_c: 20.4 ( 1.00x) avg_w8_8bpc_lsx: 1.1 (18.25x) avg_w8_8bpc_lasx: 0.9 (23.16x) avg_w16_8bpc_c: 65.1 ( 1.00x) avg_w16_8bpc_lsx: 2.5 (26.43x) avg_w16_8bpc_lasx: 2.0 (32.05x) avg_w32_8bpc_c: 255.1 ( 1.00x) avg_w32_8bpc_lsx: 8.6 (29.74x) avg_w32_8bpc_lasx: 6.0 (42.80x) avg_w64_8bpc_c: 611.0 ( 1.00x) avg_w64_8bpc_lsx: 21.0 (29.10x) avg_w64_8bpc_lasx: 12.1 (50.36x) avg_w128_8bpc_c: 1519.3 ( 1.00x) avg_w128_8bpc_lsx: 88.7 (17.13x) avg_w128_8bpc_lasx: 60.3 (25.20x) --- src/loongarch/mc.S | 289 +++++++++++++++++++++++++++++++++++++++++++++++++++++ src/loongarch/mc.h | 4 + 2 files changed, 293 insertions(+) diff --git a/src/loongarch/mc.S b/src/loongarch/mc.S index 
ba58b22..0d335b5 100644 --- a/src/loongarch/mc.S +++ b/src/loongarch/mc.S @@ -943,6 +943,292 @@ static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride, #define bpc_sh bpc8_sh #define bpcw_sh bpcw8_sh +function avg_8bpc_lsx + addi.d t8, a0, 0 + + clz.w t0, a4 + li.w t1, 24 + sub.w t0, t0, t1 + la.local t1, .AVG_LSX_JRTABLE + alsl.d t0, t0, t1, 1 + ld.h t2, t0, 0 // The jump addresses are relative to AVG_LSX_JRTABLE + add.d t1, t1, t2 // Get absolute address + jirl $r0, t1, 0 + + .align 3 +.AVG_LSX_JRTABLE: + .hword .AVG_W128_LSX - .AVG_LSX_JRTABLE + .hword .AVG_W64_LSX - .AVG_LSX_JRTABLE + .hword .AVG_W32_LSX - .AVG_LSX_JRTABLE + .hword .AVG_W16_LSX - .AVG_LSX_JRTABLE + .hword .AVG_W8_LSX - .AVG_LSX_JRTABLE + .hword .AVG_W4_LSX - .AVG_LSX_JRTABLE + +.AVG_W4_LSX: + vld vr0, a2, 0 + vld vr1, a3, 0 + vadd.h vr2, vr0, vr1 + vssrarni.bu.h vr3, vr2, bpc_sh + vstelm.w vr3, a0, 0, 0 + add.d a0, a0, a1 + vstelm.w vr3, a0, 0, 1 + addi.w a5, a5, -2 + addi.d a2, a2, 16 + addi.d a3, a3, 16 + add.d a0, a0, a1 + blt zero, a5, .AVG_W4_LSX + b .AVG_END_LSX + +.AVG_W8_LSX: + vld vr0, a2, 0 + vld vr2, a2, 16 + vld vr1, a3, 0 + vld vr3, a3, 16 + vadd.h vr4, vr0, vr1 + vadd.h vr5, vr2, vr3 + vssrarni.bu.h vr5, vr4, bpc_sh + addi.w a5, a5, -2 + addi.d a2, a2, 32 + vstelm.d vr5, a0, 0, 0 + add.d a0, a0, a1 + vstelm.d vr5, a0, 0, 1 + addi.d a3, a3, 32 + add.d a0, a0, a1 + blt zero, a5, .AVG_W8_LSX + b .AVG_END_LSX + +.AVG_W16_LSX: + vld vr0, a2, 0 + vld vr2, a2, 16 + vld vr1, a3, 0 + vld vr3, a3, 16 + vadd.h vr4, vr0, vr1 + vadd.h vr5, vr2, vr3 + vssrarni.bu.h vr5, vr4, bpc_sh + addi.w a5, a5, -1 + addi.d a2, a2, 32 + vst vr5, a0, 0 + addi.d a3, a3, 32 + add.d a0, a0, a1 + blt zero, a5, .AVG_W16_LSX + b .AVG_END_LSX + +.AVG_W32_LSX: + vld vr0, a2, 0 + vld vr2, a2, 16 + vld vr4, a2, 32 + vld vr6, a2, 48 + vld vr1, a3, 0 + vld vr3, a3, 16 + vld vr5, a3, 32 + vld vr7, a3, 48 + vadd.h vr0, vr0, vr1 + vadd.h vr2, vr2, vr3 + vadd.h vr4, vr4, vr5 + vadd.h vr6, vr6, vr7 + vssrarni.bu.h vr2, vr0, bpc_sh + vssrarni.bu.h vr6, vr4, bpc_sh + addi.w a5, a5, -1 + addi.d a2, a2, 64 + vst vr2, a0, 0 + vst vr6, a0, 16 + addi.d a3, a3, 64 + add.d a0, a0, a1 + blt zero, a5, .AVG_W32_LSX + b .AVG_END_LSX + +.AVG_W64_LSX: +.rept 4 + vld vr0, a2, 0 + vld vr2, a2, 16 + vld vr1, a3, 0 + vld vr3, a3, 16 + vadd.h vr0, vr0, vr1 + vadd.h vr2, vr2, vr3 + vssrarni.bu.h vr2, vr0, bpc_sh + addi.d a2, a2, 32 + addi.d a3, a3, 32 + vst vr2, a0, 0 + addi.d a0, a0, 16 +.endr + addi.w a5, a5, -1 + add.d t8, t8, a1 + add.d a0, t8, zero + blt zero, a5, .AVG_W64_LSX + b .AVG_END_LSX + +.AVG_W128_LSX: +.rept 8 + vld vr0, a2, 0 + vld vr2, a2, 16 + vld vr1, a3, 0 + vld vr3, a3, 16 + vadd.h vr0, vr0, vr1 + vadd.h vr2, vr2, vr3 + vssrarni.bu.h vr2, vr0, bpc_sh + addi.d a2, a2, 32 + addi.d a3, a3, 32 + vst vr2, a0, 0 + addi.d a0, a0, 16 +.endr + addi.w a5, a5, -1 + add.d t8, t8, a1 + add.d a0, t8, zero + blt zero, a5, .AVG_W128_LSX +.AVG_END_LSX: +endfunc + +function avg_8bpc_lasx + clz.w t0, a4 + li.w t1, 24 + sub.w t0, t0, t1 + la.local t1, .AVG_LASX_JRTABLE + alsl.d t0, t0, t1, 1 + ld.h t2, t0, 0 + add.d t1, t1, t2 + jirl $r0, t1, 0 + + .align 3 +.AVG_LASX_JRTABLE: + .hword .AVG_W128_LASX - .AVG_LASX_JRTABLE + .hword .AVG_W64_LASX - .AVG_LASX_JRTABLE + .hword .AVG_W32_LASX - .AVG_LASX_JRTABLE + .hword .AVG_W16_LASX - .AVG_LASX_JRTABLE + .hword .AVG_W8_LASX - .AVG_LASX_JRTABLE + .hword .AVG_W4_LASX - .AVG_LASX_JRTABLE + +.AVG_W4_LASX: + vld vr0, a2, 0 + vld vr1, a3, 0 + vadd.h vr0, vr0, vr1 + vssrarni.bu.h vr1, vr0, bpc_sh + vstelm.w vr1, a0, 0, 0 + add.d 
a0, a0, a1 + vstelm.w vr1, a0, 0, 1 + addi.w a5, a5, -2 + addi.d a2, a2, 16 + addi.d a3, a3, 16 + add.d a0, a0, a1 + blt zero, a5, .AVG_W4_LASX + b .AVG_END_LASX +.AVG_W8_LASX: + xvld xr0, a2, 0 + xvld xr1, a3, 0 + xvadd.h xr2, xr0, xr1 + xvssrarni.bu.h xr1, xr2, bpc_sh + xvstelm.d xr1, a0, 0, 0 + add.d a0, a0, a1 + xvstelm.d xr1, a0, 0, 2 + addi.w a5, a5, -2 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + add.d a0, a1, a0 + blt zero, a5, .AVG_W8_LASX + b .AVG_END_LASX +.AVG_W16_LASX: + xvld xr0, a2, 0 + xvld xr2, a2, 32 + xvld xr1, a3, 0 + xvld xr3, a3, 32 + xvadd.h xr4, xr0, xr1 + xvadd.h xr5, xr2, xr3 + xvssrarni.bu.h xr5, xr4, bpc_sh + xvpermi.d xr2, xr5, 0xd8 + xvpermi.d xr3, xr5, 0x8d + vst vr2, a0, 0 + vstx vr3, a0, a1 + addi.w a5, a5, -2 + addi.d a2, a2, 64 + addi.d a3, a3, 64 + alsl.d a0, a1, a0, 1 + blt zero, a5, .AVG_W16_LASX + b .AVG_END_LASX +.AVG_W32_LASX: + xvld xr0, a2, 0 + xvld xr2, a2, 32 + xvld xr1, a3, 0 + xvld xr3, a3, 32 + xvadd.h xr4, xr0, xr1 + xvadd.h xr5, xr2, xr3 + xvssrarni.bu.h xr5, xr4, bpc_sh + xvpermi.d xr6, xr5, 0xd8 + xvst xr6, a0, 0 + addi.w a5, a5, -1 + addi.d a2, a2, 64 + addi.d a3, a3, 64 + add.d a0, a0, a1 + blt zero, a5, .AVG_W32_LASX + b .AVG_END_LASX +.AVG_W64_LASX: + xvld xr0, a2, 0 + xvld xr2, a2, 32 + xvld xr4, a2, 64 + xvld xr6, a2, 96 + xvld xr1, a3, 0 + xvld xr3, a3, 32 + xvld xr5, a3, 64 + xvld xr7, a3, 96 + xvadd.h xr0, xr0, xr1 + xvadd.h xr2, xr2, xr3 + xvadd.h xr4, xr4, xr5 + xvadd.h xr6, xr6, xr7 + xvssrarni.bu.h xr2, xr0, bpc_sh + xvssrarni.bu.h xr6, xr4, bpc_sh + xvpermi.d xr1, xr2, 0xd8 + xvpermi.d xr3, xr6, 0xd8 + xvst xr1, a0, 0 + xvst xr3, a0, 32 + addi.w a5, a5, -1 + addi.d a2, a2, 128 + addi.d a3, a3, 128 + add.d a0, a0, a1 + blt zero, a5, .AVG_W64_LASX + b .AVG_END_LASX +.AVG_W128_LASX: + xvld xr0, a2, 0 + xvld xr2, a2, 32 + xvld xr4, a2, 64 + xvld xr6, a2, 96 + xvld xr8, a2, 128 + xvld xr10, a2, 160 + xvld xr12, a2, 192 + xvld xr14, a2, 224 + xvld xr1, a3, 0 + xvld xr3, a3, 32 + xvld xr5, a3, 64 + xvld xr7, a3, 96 + xvld xr9, a3, 128 + xvld xr11, a3, 160 + xvld xr13, a3, 192 + xvld xr15, a3, 224 + xvadd.h xr0, xr0, xr1 + xvadd.h xr2, xr2, xr3 + xvadd.h xr4, xr4, xr5 + xvadd.h xr6, xr6, xr7 + xvadd.h xr8, xr8, xr9 + xvadd.h xr10, xr10, xr11 + xvadd.h xr12, xr12, xr13 + xvadd.h xr14, xr14, xr15 + xvssrarni.bu.h xr2, xr0, bpc_sh + xvssrarni.bu.h xr6, xr4, bpc_sh + xvssrarni.bu.h xr10, xr8, bpc_sh + xvssrarni.bu.h xr14, xr12, bpc_sh + xvpermi.d xr1, xr2, 0xd8 + xvpermi.d xr3, xr6, 0xd8 + xvpermi.d xr5, xr10, 0xd8 + xvpermi.d xr7, xr14, 0xd8 + xvst xr1, a0, 0 + xvst xr3, a0, 32 + xvst xr5, a0, 64 + xvst xr7, a0, 96 + addi.w a5, a5, -1 + addi.d a2, a2, 256 + addi.d a3, a3, 256 + add.d a0, a0, a1 + blt zero, a5, .AVG_W128_LASX +.AVG_END_LASX: +endfunc + function w_avg_8bpc_lsx addi.d t8, a0, 0 li.w t2, 16 @@ -2335,3 +2621,6 @@ function w_mask_420_8bpc_lasx .END_W420_LASX: endfunc + +#undef bpc_sh +#undef bpcw_sh diff --git a/src/loongarch/mc.h b/src/loongarch/mc.h index 7c010db..56168e5 100644 --- a/src/loongarch/mc.h +++ b/src/loongarch/mc.h @@ -32,12 +32,14 @@ #include "src/mc.h" #include "src/cpu.h" +decl_avg_fn(BF(dav1d_avg, lsx)); decl_w_avg_fn(BF(dav1d_w_avg, lsx)); decl_mask_fn(BF(dav1d_mask, lsx)); decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, lsx)); decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, lsx)); decl_w_mask_fn(BF(dav1d_w_mask_420, lsx)); +decl_avg_fn(BF(dav1d_avg, lasx)); decl_w_avg_fn(BF(dav1d_w_avg, lasx)); decl_mask_fn(BF(dav1d_mask, lasx)); decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, lasx)); @@ -50,6 +52,7 @@ static ALWAYS_INLINE 
void mc_dsp_init_loongarch(Dav1dMCDSPContext *const c) { if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return; + c->avg = BF(dav1d_avg, lsx); c->w_avg = BF(dav1d_w_avg, lsx); c->mask = BF(dav1d_mask, lsx); c->warp8x8 = BF(dav1d_warp_affine_8x8, lsx); @@ -58,6 +61,7 @@ static ALWAYS_INLINE void mc_dsp_init_loongarch(Dav1dMCDSPContext *const c) { if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LASX)) return; + c->avg = BF(dav1d_avg, lasx); c->w_avg = BF(dav1d_w_avg, lasx); c->mask = BF(dav1d_mask, lasx); c->warp8x8 = BF(dav1d_warp_affine_8x8, lasx); -- cgit v1.2.3 From b34ecaf3105f4bafd6b52dc6e4d7e32b80b0542f Mon Sep 17 00:00:00 2001 From: jinbo Date: Fri, 1 Dec 2023 11:13:42 +0800 Subject: loongarch: Improve the performance of mc_8bpc.mc functions Relative speedup over C code: mc_8tap_regular_w2_0_8bpc_c: 5.3 ( 1.00x) mc_8tap_regular_w2_0_8bpc_lsx: 0.8 ( 6.62x) mc_8tap_regular_w2_h_8bpc_c: 11.0 ( 1.00x) mc_8tap_regular_w2_h_8bpc_lsx: 2.5 ( 4.40x) mc_8tap_regular_w2_hv_8bpc_c: 24.4 ( 1.00x) mc_8tap_regular_w2_hv_8bpc_lsx: 9.1 ( 2.70x) mc_8tap_regular_w2_v_8bpc_c: 12.9 ( 1.00x) mc_8tap_regular_w2_v_8bpc_lsx: 3.2 ( 4.08x) mc_8tap_regular_w4_0_8bpc_c: 4.8 ( 1.00x) mc_8tap_regular_w4_0_8bpc_lsx: 0.8 ( 5.97x) mc_8tap_regular_w4_h_8bpc_c: 20.0 ( 1.00x) mc_8tap_regular_w4_h_8bpc_lsx: 3.9 ( 5.06x) mc_8tap_regular_w4_hv_8bpc_c: 44.3 ( 1.00x) mc_8tap_regular_w4_hv_8bpc_lsx: 15.0 ( 2.96x) mc_8tap_regular_w4_v_8bpc_c: 23.5 ( 1.00x) mc_8tap_regular_w4_v_8bpc_lsx: 4.2 ( 5.54x) mc_8tap_regular_w8_0_8bpc_c: 4.8 ( 1.00x) mc_8tap_regular_w8_0_8bpc_lsx: 0.8 ( 6.03x) mc_8tap_regular_w8_h_8bpc_c: 37.5 ( 1.00x) mc_8tap_regular_w8_h_8bpc_lsx: 7.6 ( 4.96x) mc_8tap_regular_w8_hv_8bpc_c: 84.0 ( 1.00x) mc_8tap_regular_w8_hv_8bpc_lsx: 23.9 ( 3.51x) mc_8tap_regular_w8_v_8bpc_c: 44.8 ( 1.00x) mc_8tap_regular_w8_v_8bpc_lsx: 7.2 ( 6.23x) mc_8tap_regular_w16_0_8bpc_c: 5.8 ( 1.00x) mc_8tap_regular_w16_0_8bpc_lsx: 1.1 ( 5.12x) mc_8tap_regular_w16_h_8bpc_c: 103.8 ( 1.00x) mc_8tap_regular_w16_h_8bpc_lsx: 21.6 ( 4.80x) mc_8tap_regular_w16_hv_8bpc_c: 220.2 ( 1.00x) mc_8tap_regular_w16_hv_8bpc_lsx: 65.1 ( 3.38x) mc_8tap_regular_w16_v_8bpc_c: 124.8 ( 1.00x) mc_8tap_regular_w16_v_8bpc_lsx: 19.9 ( 6.28x) mc_8tap_regular_w32_0_8bpc_c: 8.9 ( 1.00x) mc_8tap_regular_w32_0_8bpc_lsx: 2.9 ( 3.06x) mc_8tap_regular_w32_h_8bpc_c: 323.6 ( 1.00x) mc_8tap_regular_w32_h_8bpc_lsx: 69.1 ( 4.68x) mc_8tap_regular_w32_hv_8bpc_c: 649.5 ( 1.00x) mc_8tap_regular_w32_hv_8bpc_lsx: 197.7 ( 3.29x) mc_8tap_regular_w32_v_8bpc_c: 390.5 ( 1.00x) mc_8tap_regular_w32_v_8bpc_lsx: 61.9 ( 6.31x) mc_8tap_regular_w64_0_8bpc_c: 13.3 ( 1.00x) mc_8tap_regular_w64_0_8bpc_lsx: 9.7 ( 1.37x) mc_8tap_regular_w64_h_8bpc_c: 1145.3 ( 1.00x) mc_8tap_regular_w64_h_8bpc_lsx: 248.2 ( 4.61x) mc_8tap_regular_w64_hv_8bpc_c: 2204.4 ( 1.00x) mc_8tap_regular_w64_hv_8bpc_lsx: 682.1 ( 3.23x) mc_8tap_regular_w64_v_8bpc_c: 1384.9 ( 1.00x) mc_8tap_regular_w64_v_8bpc_lsx: 218.9 ( 6.33x) mc_8tap_regular_w128_0_8bpc_c: 33.6 ( 1.00x) mc_8tap_regular_w128_0_8bpc_lsx: 27.7 ( 1.21x) mc_8tap_regular_w128_h_8bpc_c: 3228.1 ( 1.00x) mc_8tap_regular_w128_h_8bpc_lsx: 701.7 ( 4.60x) mc_8tap_regular_w128_hv_8bpc_c: 6108.2 ( 1.00x) mc_8tap_regular_w128_hv_8bpc_lsx: 1905.3 ( 3.21x) mc_8tap_regular_w128_v_8bpc_c: 3906.8 ( 1.00x) mc_8tap_regular_w128_v_8bpc_lsx: 617.4 ( 6.33x) --- src/loongarch/mc.S | 1026 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/loongarch/mc.h | 25 ++ 2 files changed, 1051 insertions(+) diff --git a/src/loongarch/mc.S b/src/loongarch/mc.S index 0d335b5..9e0dbff 100644 --- 
a/src/loongarch/mc.S +++ b/src/loongarch/mc.S @@ -2624,3 +2624,1029 @@ endfunc #undef bpc_sh #undef bpcw_sh + +.macro vhaddw.d.h in0 + vhaddw.w.h \in0, \in0, \in0 + vhaddw.d.w \in0, \in0, \in0 +.endm +.macro vhaddw.q.w in0 + vhaddw.d.w \in0, \in0, \in0 + vhaddw.q.d \in0, \in0, \in0 +.endm +.macro PUT_H_8W in0 + vbsrl.v vr2, \in0, 1 + vbsrl.v vr3, \in0, 2 + vbsrl.v vr4, \in0, 3 + vbsrl.v vr5, \in0, 4 + vbsrl.v vr6, \in0, 5 + vbsrl.v vr7, \in0, 6 + vbsrl.v vr10, \in0, 7 + vilvl.d vr2, vr2, \in0 + vilvl.d vr3, vr4, vr3 + vilvl.d vr4, vr6, vr5 + vilvl.d vr5, vr10, vr7 + vdp2.h.bu.b \in0, vr2, vr8 + vdp2.h.bu.b vr2, vr3, vr8 + vdp2.h.bu.b vr3, vr4, vr8 + vdp2.h.bu.b vr4, vr5, vr8 + vhaddw.d.h \in0 + vhaddw.d.h vr2 + vhaddw.d.h vr3 + vhaddw.d.h vr4 + vpickev.w \in0, vr2, \in0 + vpickev.w vr2, vr4, vr3 + vpickev.h \in0, vr2, \in0 + vadd.h \in0, \in0, vr9 +.endm +.macro FILTER_8TAP_4W in0 + vbsrl.v vr10, \in0, 1 + vbsrl.v vr11, \in0, 2 + vbsrl.v vr12, \in0, 3 + vilvl.d vr10, vr10, \in0 + vilvl.d vr11, vr12, vr11 + vdp2.h.bu.b vr7, vr10, vr8 + vdp2.h.bu.b vr10, vr11, vr8 + vhaddw.d.h vr7 + vhaddw.d.h vr10 + vpickev.w \in0, vr10, vr7 +.endm +.macro FILTER_8TAP_8W in0 + vbsrl.v vr10, \in0, 1 + vbsrl.v vr11, \in0, 2 + vbsrl.v vr12, \in0, 3 + vbsrl.v vr13, \in0, 4 + vbsrl.v vr14, \in0, 5 + vbsrl.v vr15, \in0, 6 + vbsrl.v vr16, \in0, 7 + vilvl.d vr10, vr10, \in0 + vilvl.d vr11, vr12, vr11 + vilvl.d vr12, vr14, vr13 + vilvl.d vr13, vr16, vr15 + vdp2.h.bu.b vr14, vr10, vr8 + vdp2.h.bu.b vr15, vr11, vr8 + vdp2.h.bu.b vr16, vr12, vr8 + vdp2.h.bu.b vr17, vr13, vr8 + vhaddw.d.h vr14 + vhaddw.d.h vr15 + vhaddw.d.h vr16 + vhaddw.d.h vr17 + vpickev.w vr13, vr15, vr14 + vpickev.w vr14, vr17, vr16 + vpickev.h \in0, vr14, vr13 //x0 ... x7 + vsrari.h \in0, \in0, 2 +.endm +.macro FILTER_8TAP_8W_CLIP_STORE + vdp2.w.h vr12, vr0, vr9 + vdp2.w.h vr13, vr1, vr9 + vdp2.w.h vr14, vr2, vr9 + vdp2.w.h vr15, vr3, vr9 + vdp2.w.h vr16, vr4, vr9 + vdp2.w.h vr17, vr5, vr9 + vdp2.w.h vr18, vr6, vr9 + vdp2.w.h vr19, vr7, vr9 + vhaddw.q.w vr12 + vhaddw.q.w vr13 + vhaddw.q.w vr14 + vhaddw.q.w vr15 + vhaddw.q.w vr16 + vhaddw.q.w vr17 + vhaddw.q.w vr18 + vhaddw.q.w vr19 + vpackev.w vr12, vr13, vr12 + vpackev.w vr13, vr15, vr14 + vpackev.d vr12, vr13, vr12 + vpackev.w vr14, vr17, vr16 + vpackev.w vr15, vr19, vr18 + vpackev.d vr13, vr15, vr14 + vssrarni.hu.w vr13, vr12, 10 + vssrani.bu.h vr13, vr13, 0 + vstelm.d vr13, a0, 0, 0 + add.d a0, a0, a1 +.endm +.macro VEXTRINS_Hx8 in0 + vextrins.h vr0, \in0, 0x70 + vextrins.h vr1, \in0, 0x71 + vextrins.h vr2, \in0, 0x72 + vextrins.h vr3, \in0, 0x73 + vextrins.h vr4, \in0, 0x74 + vextrins.h vr5, \in0, 0x75 + vextrins.h vr6, \in0, 0x76 + vextrins.h vr7, \in0, 0x77 +.endm +.macro VBSRL_Vx8 + vbsrl.v vr0, vr0, 2 + vbsrl.v vr1, vr1, 2 + vbsrl.v vr2, vr2, 2 + vbsrl.v vr3, vr3, 2 + vbsrl.v vr4, vr4, 2 + vbsrl.v vr5, vr5, 2 + vbsrl.v vr6, vr6, 2 + vbsrl.v vr7, vr7, 2 +.endm + +.macro PUT_8TAP_8BPC_LSX lable + li.w t0, 4 + la.local t6, dav1d_mc_subpel_filters + slli.d t2, a3, 1 //src_stride*2 + add.d t3, t2, a3 //src_stride*3 + slli.d t4, t2, 1 //src_stride*4 + + bnez a6, .l_\lable\()put_h //mx + bnez a7, .l_\lable\()put_v //my + + clz.w t1, a4 + li.w t5, 24 + sub.w t1, t1, t5 + la.local t5, .l_\lable\()put_hv0_jtable + alsl.d t1, t1, t5, 3 + ld.d t6, t1, 0 + add.d t5, t5, t6 + jirl $r0, t5, 0 + + .align 3 +.l_\lable\()put_hv0_jtable: + .dword .l_\lable\()put_hv0_128w - .l_\lable\()put_hv0_jtable + .dword .l_\lable\()put_hv0_64w - .l_\lable\()put_hv0_jtable + .dword .l_\lable\()put_hv0_32w - 
.l_\lable\()put_hv0_jtable + .dword .l_\lable\()put_hv0_16w - .l_\lable\()put_hv0_jtable + .dword .l_\lable\()put_hv0_8w - .l_\lable\()put_hv0_jtable + .dword .l_\lable\()put_hv0_4w - .l_\lable\()put_hv0_jtable + .dword .l_\lable\()put_hv0_2w - .l_\lable\()put_hv0_jtable + +.l_\lable\()put_hv0_2w: + vldrepl.h vr0, a2, 0 + add.d a2, a2, a3 + vldrepl.h vr1, a2, 0 + vstelm.h vr0, a0, 0, 0 + add.d a0, a0, a1 + vstelm.h vr1, a0, 0, 0 + add.d a2, a2, a3 + add.d a0, a0, a1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_hv0_2w + b .l_\lable\()end_put_8tap +.l_\lable\()put_hv0_4w: + fld.s f0, a2, 0 + fldx.s f1, a2, a3 + fst.s f0, a0, 0 + fstx.s f1, a0, a1 + alsl.d a2, a3, a2, 1 + alsl.d a0, a1, a0, 1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_hv0_4w + b .l_\lable\()end_put_8tap +.l_\lable\()put_hv0_8w: + fld.d f0, a2, 0 + fldx.d f1, a2, a3 + fst.d f0, a0, 0 + fstx.d f1, a0, a1 + alsl.d a2, a3, a2, 1 + alsl.d a0, a1, a0, 1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_hv0_8w + b .l_\lable\()end_put_8tap +.l_\lable\()put_hv0_16w: + vld vr0, a2, 0 + vldx vr1, a2, a3 + vst vr0, a0, 0 + vstx vr1, a0, a1 + alsl.d a2, a3, a2, 1 + alsl.d a0, a1, a0, 1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_hv0_16w + b .l_\lable\()end_put_8tap +.l_\lable\()put_hv0_32w: + vld vr0, a2, 0 + vld vr1, a2, 16 + add.d a2, a2, a3 + vld vr2, a2, 0 + vld vr3, a2, 16 + vst vr0, a0, 0 + vst vr1, a0, 16 + add.d a0, a0, a1 + vst vr2, a0, 0 + vst vr3, a0, 16 + add.d a2, a2, a3 + add.d a0, a0, a1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_hv0_32w + b .l_\lable\()end_put_8tap +.l_\lable\()put_hv0_64w: + vld vr0, a2, 0 + vld vr1, a2, 16 + vld vr2, a2, 32 + vld vr3, a2, 48 + add.d a2, a2, a3 + vld vr4, a2, 0 + vld vr5, a2, 16 + vld vr6, a2, 32 + vld vr7, a2, 48 + add.d a2, a2, a3 + vst vr0, a0, 0 + vst vr1, a0, 16 + vst vr2, a0, 32 + vst vr3, a0, 48 + add.d a0, a0, a1 + vst vr4, a0, 0 + vst vr5, a0, 16 + vst vr6, a0, 32 + vst vr7, a0, 48 + add.d a0, a0, a1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_hv0_64w + b .l_\lable\()end_put_8tap +.l_\lable\()put_hv0_128w: + vld vr0, a2, 0 + vld vr1, a2, 16 + vld vr2, a2, 32 + vld vr3, a2, 48 + vld vr4, a2, 64 + vld vr5, a2, 80 + vld vr6, a2, 96 + vld vr7, a2, 112 + add.d a2, a2, a3 + vld vr8, a2, 0 + vld vr9, a2, 16 + vld vr10, a2, 32 + vld vr11, a2, 48 + vld vr12, a2, 64 + vld vr13, a2, 80 + vld vr14, a2, 96 + vld vr15, a2, 112 + add.d a2, a2, a3 + vst vr0, a0, 0 + vst vr1, a0, 16 + vst vr2, a0, 32 + vst vr3, a0, 48 + vst vr4, a0, 64 + vst vr5, a0, 80 + vst vr6, a0, 96 + vst vr7, a0, 112 + add.d a0, a0, a1 + vst vr8, a0, 0 + vst vr9, a0, 16 + vst vr10, a0, 32 + vst vr11, a0, 48 + vst vr12, a0, 64 + vst vr13, a0, 80 + vst vr14, a0, 96 + vst vr15, a0, 112 + add.d a0, a0, a1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_hv0_128w + b .l_\lable\()end_put_8tap + +.l_\lable\()put_h: + bnez a7, .l_\lable\()put_hv //if(fh) && if (fv) + ld.d t5, sp, 0 //filter_type + andi t1, t5, 3 + blt t0, a4, .l_\lable\()put_h_idx_fh + andi t1, t5, 1 + addi.w t1, t1, 3 + +.l_\lable\()put_h_idx_fh: + addi.w t5, zero, 120 + mul.w t1, t1, t5 + addi.w t5, a6, -1 + slli.w t5, t5, 3 + add.w t1, t1, t5 + add.d t1, t6, t1 //fh's offset + vldrepl.d vr8, t1, 0 + addi.d a2, a2, -3 + li.w t1, 34 + vreplgr2vr.h vr9, t1 + + clz.w t1, a4 + li.w t5, 24 + sub.w t1, t1, t5 + la.local t5, .l_\lable\()put_h_jtable + alsl.d t1, t1, t5, 3 + ld.d t6, t1, 0 + add.d t5, t5, t6 + jirl $r0, t5, 0 + + .align 3 +.l_\lable\()put_h_jtable: + .dword .l_\lable\()put_h_128w - .l_\lable\()put_h_jtable + .dword .l_\lable\()put_h_64w - 
.l_\lable\()put_h_jtable + .dword .l_\lable\()put_h_32w - .l_\lable\()put_h_jtable + .dword .l_\lable\()put_h_16w - .l_\lable\()put_h_jtable + .dword .l_\lable\()put_h_8w - .l_\lable\()put_h_jtable + .dword .l_\lable\()put_h_4w - .l_\lable\()put_h_jtable + .dword .l_\lable\()put_h_2w - .l_\lable\()put_h_jtable + +.l_\lable\()put_h_2w: + vld vr0, a2, 0 + vldx vr1, a2, a3 + add.d a2, a2, t2 + + vbsrl.v vr2, vr0, 1 + vilvl.d vr0, vr2, vr0 + vdp2.h.bu.b vr2, vr0, vr8 + vhaddw.w.h vr0, vr2, vr2 + vhaddw.d.w vr0, vr0, vr0 + vbsrl.v vr2, vr1, 1 + vilvl.d vr1, vr2, vr1 + vdp2.h.bu.b vr2, vr1, vr8 + vhaddw.w.h vr1, vr2, vr2 + vhaddw.d.w vr1, vr1, vr1 + vpickev.w vr0, vr1, vr0 + vpickev.h vr0, vr0, vr0 + vadd.h vr0, vr0, vr9 + vssrani.bu.h vr0, vr0, 6 + + vstelm.h vr0, a0, 0, 0 + add.d a0, a0, a1 + vstelm.h vr0, a0, 0, 1 + add.d a0, a0, a1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_h_2w + b .l_\lable\()end_put_8tap + +.l_\lable\()put_h_4w: + vld vr0, a2, 0 + vldx vr1, a2, a3 + add.d a2, a2, t2 + + vbsrl.v vr2, vr0, 1 + vbsrl.v vr3, vr0, 2 + vbsrl.v vr4, vr0, 3 + vilvl.d vr0, vr2, vr0 //x0 x1 + vilvl.d vr2, vr4, vr3 //x2 x3 + vdp2.h.bu.b vr3, vr0, vr8 + vdp2.h.bu.b vr4, vr2, vr8 + vhaddw.w.h vr0, vr3, vr3 + vhaddw.d.w vr0, vr0, vr0 + vhaddw.w.h vr2, vr4, vr4 + vhaddw.d.w vr2, vr2, vr2 + vpickev.w vr5, vr2, vr0 + vbsrl.v vr2, vr1, 1 + vbsrl.v vr3, vr1, 2 + vbsrl.v vr4, vr1, 3 + vilvl.d vr0, vr2, vr1 //x0 x1 + vilvl.d vr2, vr4, vr3 //x2 x3 + vdp2.h.bu.b vr3, vr0, vr8 + vdp2.h.bu.b vr4, vr2, vr8 + vhaddw.w.h vr0, vr3, vr3 + vhaddw.d.w vr0, vr0, vr0 + vhaddw.w.h vr2, vr4, vr4 + vhaddw.d.w vr2, vr2, vr2 + vpickev.w vr6, vr2, vr0 + vpickev.h vr0, vr6, vr5 + vadd.h vr0, vr0, vr9 + vssrani.bu.h vr0, vr0, 6 + + vstelm.w vr0, a0, 0, 0 + add.d a0, a0, a1 + vstelm.w vr0, a0, 0, 1 + add.d a0, a0, a1 + addi.d a5, a5, -2 + bnez a5, .l_\lable\()put_h_4w + b .l_\lable\()end_put_8tap + +.l_\lable\()put_h_8w: + vld vr0, a2, 0 + vldx vr1, a2, a3 + add.d a2, a2, t2 + PUT_H_8W vr0 + PUT_H_8W vr1 + vssrani.bu.h vr1, vr0, 6 + vstelm.d vr1, a0, 0, 0 + add.d a0, a0, a1 + vstelm.d vr1, a0, 0, 1 + add.d a0, a0, a1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_h_8w + b .l_\lable\()end_put_8tap + +.l_\lable\()put_h_16w: +.l_\lable\()put_h_32w: +.l_\lable\()put_h_64w: +.l_\lable\()put_h_128w: + addi.d t0, a2, 0 //src + addi.w t5, a5, 0 //h + addi.d t8, a0, 0 //dst +.l_\lable\()put_h_16w_loop: + vld vr0, a2, 0 + vldx vr1, a2, a3 + add.d a2, a2, t2 + PUT_H_8W vr0 + PUT_H_8W vr1 + vssrani.bu.h vr1, vr0, 6 + vstelm.d vr1, a0, 0, 0 + add.d a0, a0, a1 + vstelm.d vr1, a0, 0, 1 + add.d a0, a0, a1 + addi.d a5, a5, -2 + bnez a5, .l_\lable\()put_h_16w_loop + addi.d a2, t0, 8 + addi.d t0, t0, 8 + addi.d a0, t8, 8 + addi.d t8, t8, 8 + addi.w a5, t5, 0 + addi.w a4, a4, -8 + bnez a4, .l_\lable\()put_h_16w_loop + b .l_\lable\()end_put_8tap + +.l_\lable\()put_v: + ld.d t1, sp, 0 //filter_type + srli.w t1, t1, 2 + blt t0, a5, .l_\lable\()put_v_idx_fv + andi t1, t1, 1 + addi.w t1, t1, 3 + +.l_\lable\()put_v_idx_fv: + addi.w t5, zero, 120 + mul.w t1, t1, t5 + addi.w t5, a7, -1 + slli.w t5, t5, 3 + add.w t1, t1, t5 + add.d t1, t6, t1 //fv's offset + vldrepl.d vr8, t1, 0 + sub.d a2, a2, t3 + + clz.w t1, a4 + li.w t5, 24 + sub.w t1, t1, t5 + la.local t5, .l_\lable\()put_v_jtable + alsl.d t1, t1, t5, 3 + ld.d t6, t1, 0 + add.d t5, t5, t6 + jirl $r0, t5, 0 + + .align 3 +.l_\lable\()put_v_jtable: + .dword .l_\lable\()put_v_128w - .l_\lable\()put_v_jtable + .dword .l_\lable\()put_v_64w - .l_\lable\()put_v_jtable + .dword .l_\lable\()put_v_32w - 
.l_\lable\()put_v_jtable + .dword .l_\lable\()put_v_16w - .l_\lable\()put_v_jtable + .dword .l_\lable\()put_v_8w - .l_\lable\()put_v_jtable + .dword .l_\lable\()put_v_4w - .l_\lable\()put_v_jtable + .dword .l_\lable\()put_v_2w - .l_\lable\()put_v_jtable + +.l_\lable\()put_v_2w: + fld.s f0, a2, 0 + fldx.s f1, a2, a3 + fldx.s f2, a2, t2 + add.d a2, a2, t3 + fld.s f3, a2, 0 + fldx.s f4, a2, a3 + fldx.s f5, a2, t2 + fldx.s f6, a2, t3 + add.d a2, a2, t4 + vilvl.b vr0, vr1, vr0 + vilvl.b vr1, vr3, vr2 + vilvl.b vr2, vr5, vr4 + vilvl.b vr3, vr7, vr6 + vilvl.h vr0, vr1, vr0 + vilvl.h vr1, vr3, vr2 + vilvl.w vr0, vr1, vr0 + +.l_\lable\()put_v_2w_loop: + fld.s f7, a2, 0 //h0 + fldx.s f10, a2, a3 //h1 + add.d a2, a2, t2 + + vextrins.b vr0, vr7, 0x70 + vextrins.b vr0, vr7, 0xf1 + vbsrl.v vr1, vr0, 1 + vextrins.b vr1, vr10, 0x70 + vextrins.b vr1, vr10, 0xf1 + vdp2.h.bu.b vr10, vr0, vr8 + vdp2.h.bu.b vr11, vr1, vr8 + vbsrl.v vr0, vr1, 1 + vhaddw.d.h vr10 + vhaddw.d.h vr11 + vpickev.w vr10, vr11, vr10 + vssrarni.hu.w vr10, vr10, 6 + vssrani.bu.h vr10, vr10, 0 + + vstelm.h vr10, a0, 0, 0 + add.d a0, a0, a1 + vstelm.h vr10, a0, 0, 1 + add.d a0, a0, a1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_v_2w_loop + b .l_\lable\()end_put_8tap + +.l_\lable\()put_v_4w: + fld.s f0, a2, 0 + fldx.s f1, a2, a3 + fldx.s f2, a2, t2 + add.d a2, a2, t3 + fld.s f3, a2, 0 + fldx.s f4, a2, a3 + fldx.s f5, a2, t2 + fldx.s f6, a2, t3 + add.d a2, a2, t4 + + vilvl.b vr0, vr1, vr0 + vilvl.b vr1, vr3, vr2 + vilvl.b vr2, vr5, vr4 + vilvl.b vr3, vr7, vr6 + vilvl.h vr0, vr1, vr0 + vilvl.h vr1, vr3, vr2 + vilvl.w vr2, vr1, vr0 + vilvh.w vr3, vr1, vr0 + +.l_\lable\()put_v_4w_loop: + fld.s f7, a2, 0 + fldx.s f10, a2, a3 + add.d a2, a2, t2 + + vextrins.b vr2, vr7, 0x70 + vextrins.b vr2, vr7, 0xf1 //x0x1(h0) + vbsrl.v vr4, vr2, 1 + vextrins.b vr4, vr10, 0x70 + vextrins.b vr4, vr10, 0xf1 //x0x1(h1) + vdp2.h.bu.b vr11, vr2, vr8 + vdp2.h.bu.b vr12, vr4, vr8 + vbsrl.v vr2, vr4, 1 + + vextrins.b vr3, vr7, 0x72 + vextrins.b vr3, vr7, 0xf3 //x2x3(h0) + vbsrl.v vr4, vr3, 1 + vextrins.b vr4, vr10, 0x72 + vextrins.b vr4, vr10, 0xf3 //x2x3(h1) + vdp2.h.bu.b vr13, vr3, vr8 + vdp2.h.bu.b vr14, vr4, vr8 + vbsrl.v vr3, vr4, 1 + + vhaddw.d.h vr11 + vhaddw.d.h vr12 + vhaddw.d.h vr13 + vhaddw.d.h vr14 + + vpickev.w vr11, vr13, vr11 + vpickev.w vr12, vr14, vr12 + vpickev.h vr11, vr12, vr11 + vssrarni.bu.h vr11, vr11, 6 + vstelm.w vr11, a0, 0, 0 + add.d a0, a0, a1 + vstelm.w vr11, a0, 0, 1 + add.d a0, a0, a1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_v_4w_loop + b .l_\lable\()end_put_8tap + +.l_\lable\()put_v_8w: +.l_\lable\()put_v_16w: +.l_\lable\()put_v_32w: +.l_\lable\()put_v_64w: +.l_\lable\()put_v_128w: + addi.d t0, a2, 0 //src + addi.d t5, a5, 0 //h + addi.d t8, a0, 0 //dst +.l_\lable\()put_v_8w_loop0: + fld.d f0, a2, 0 + fldx.d f1, a2, a3 + fldx.d f2, a2, t2 + add.d a2, a2, t3 + fld.d f3, a2, 0 + fldx.d f4, a2, a3 + fldx.d f5, a2, t2 + fldx.d f6, a2, t3 + add.d a2, a2, t4 + + vilvl.b vr0, vr1, vr0 + vilvl.b vr1, vr3, vr2 + vilvl.b vr2, vr5, vr4 + vilvl.b vr3, vr7, vr6 + vilvl.h vr4, vr1, vr0 + vilvh.h vr5, vr1, vr0 + vilvl.h vr6, vr3, vr2 + vilvh.h vr7, vr3, vr2 + vilvl.w vr0, vr6, vr4 // x0x1 + vilvh.w vr1, vr6, vr4 // x2x3 + vilvl.w vr2, vr7, vr5 // x4x5 + vilvh.w vr3, vr7, vr5 // x6x7 +.l_\lable\()put_v_8w_loop: + fld.d f7, a2, 0 + fldx.d f10, a2, a3 + add.d a2, a2, t2 + //h0 + vextrins.b vr0, vr7, 0x70 + vextrins.b vr0, vr7, 0xf1 + vextrins.b vr1, vr7, 0x72 + vextrins.b vr1, vr7, 0xf3 + vextrins.b vr2, vr7, 0x74 + vextrins.b vr2, vr7, 0xf5 + 
vextrins.b vr3, vr7, 0x76 + vextrins.b vr3, vr7, 0xf7 + vdp2.h.bu.b vr11, vr0, vr8 + vdp2.h.bu.b vr12, vr1, vr8 + vdp2.h.bu.b vr13, vr2, vr8 + vdp2.h.bu.b vr14, vr3, vr8 + vhaddw.d.h vr11 + vhaddw.d.h vr12 + vhaddw.d.h vr13 + vhaddw.d.h vr14 + vpickev.w vr11, vr12, vr11 + vpickev.w vr12, vr14, vr13 + vpickev.h vr11, vr12, vr11 + vssrarni.bu.h vr11, vr11, 6 + fst.d f11, a0, 0 + add.d a0, a0, a1 + //h1 + vbsrl.v vr0, vr0, 1 + vbsrl.v vr1, vr1, 1 + vbsrl.v vr2, vr2, 1 + vbsrl.v vr3, vr3, 1 + vextrins.b vr0, vr10, 0x70 + vextrins.b vr0, vr10, 0xf1 + vextrins.b vr1, vr10, 0x72 + vextrins.b vr1, vr10, 0xf3 + vextrins.b vr2, vr10, 0x74 + vextrins.b vr2, vr10, 0xf5 + vextrins.b vr3, vr10, 0x76 + vextrins.b vr3, vr10, 0xf7 + vdp2.h.bu.b vr11, vr0, vr8 + vdp2.h.bu.b vr12, vr1, vr8 + vdp2.h.bu.b vr13, vr2, vr8 + vdp2.h.bu.b vr14, vr3, vr8 + vhaddw.d.h vr11 + vhaddw.d.h vr12 + vhaddw.d.h vr13 + vhaddw.d.h vr14 + vpickev.w vr11, vr12, vr11 + vpickev.w vr12, vr14, vr13 + vpickev.h vr11, vr12, vr11 + vssrarni.bu.h vr11, vr11, 6 + fst.d f11, a0, 0 + add.d a0, a0, a1 + vbsrl.v vr0, vr0, 1 + vbsrl.v vr1, vr1, 1 + vbsrl.v vr2, vr2, 1 + vbsrl.v vr3, vr3, 1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_v_8w_loop + addi.d a2, t0, 8 + addi.d t0, t0, 8 + addi.d a0, t8, 8 + addi.d t8, t8, 8 + addi.d a5, t5, 0 + addi.w a4, a4, -8 + bnez a4, .l_\lable\()put_v_8w_loop0 + b .l_\lable\()end_put_8tap + +.l_\lable\()put_hv: + ld.d t5, sp, 0 //filter_type + andi t1, t5, 3 + blt t0, a4, .l_\lable\()put_hv_idx_fh + andi t1, t5, 1 + addi.w t1, t1, 3 +.l_\lable\()put_hv_idx_fh: + addi.w t5, zero, 120 + mul.w t1, t1, t5 + addi.w t5, a6, -1 + slli.w t5, t5, 3 + add.w t1, t1, t5 + add.d t1, t6, t1 //fh's offset + vldrepl.d vr8, t1, 0 + ld.d t1, sp, 0 //filter_type + srli.w t1, t1, 2 + blt t0, a5, .l_\lable\()put_hv_idx_fv + andi t1, t1, 1 + addi.w t1, t1, 3 +.l_\lable\()put_hv_idx_fv: + addi.w t5, zero, 120 + mul.w t1, t1, t5 + addi.w t5, a7, -1 + slli.w t5, t5, 3 + add.w t1, t1, t5 + add.d t1, t6, t1 //fv's offset + vldrepl.d vr9, t1, 0 + vexth.h.b vr9, vr9 + + sub.d a2, a2, t3 + addi.d a2, a2, -3 + + clz.w t1, a4 + li.w t5, 24 + sub.w t1, t1, t5 + la.local t5, .l_\lable\()put_hv_jtable + alsl.d t1, t1, t5, 3 + ld.d t6, t1, 0 + add.d t5, t5, t6 + jirl $r0, t5, 0 + + .align 3 +.l_\lable\()put_hv_jtable: + .dword .l_\lable\()put_hv_128w - .l_\lable\()put_hv_jtable + .dword .l_\lable\()put_hv_64w - .l_\lable\()put_hv_jtable + .dword .l_\lable\()put_hv_32w - .l_\lable\()put_hv_jtable + .dword .l_\lable\()put_hv_16w - .l_\lable\()put_hv_jtable + .dword .l_\lable\()put_hv_8w - .l_\lable\()put_hv_jtable + .dword .l_\lable\()put_hv_4w - .l_\lable\()put_hv_jtable + .dword .l_\lable\()put_hv_2w - .l_\lable\()put_hv_jtable + +.l_\lable\()put_hv_2w: + vld vr0, a2, 0 + vldx vr1, a2, a3 + vldx vr2, a2, t2 + add.d a2, a2, t3 + vld vr3, a2, 0 + vldx vr4, a2, a3 + vldx vr5, a2, t2 + vldx vr6, a2, t3 + add.d a2, a2, t4 + + vbsrl.v vr10, vr0, 1 + vbsrl.v vr11, vr1, 1 + vbsrl.v vr12, vr2, 1 + vbsrl.v vr13, vr3, 1 + vbsrl.v vr14, vr4, 1 + vbsrl.v vr15, vr5, 1 + vbsrl.v vr16, vr6, 1 + vilvl.d vr0, vr10, vr0 + vilvl.d vr1, vr11, vr1 + vilvl.d vr2, vr12, vr2 + vilvl.d vr3, vr13, vr3 + vilvl.d vr4, vr14, vr4 + vilvl.d vr5, vr15, vr5 + vilvl.d vr6, vr16, vr6 + vdp2.h.bu.b vr10, vr0, vr8 + vdp2.h.bu.b vr11, vr1, vr8 + vdp2.h.bu.b vr12, vr2, vr8 + vdp2.h.bu.b vr13, vr3, vr8 + vdp2.h.bu.b vr14, vr4, vr8 + vdp2.h.bu.b vr15, vr5, vr8 + vdp2.h.bu.b vr16, vr6, vr8 + vhaddw.d.h vr10 + vhaddw.d.h vr11 + vhaddw.d.h vr12 + vhaddw.d.h vr13 + vhaddw.d.h vr14 + 
vhaddw.d.h vr15 + vhaddw.d.h vr16 + + vpackev.w vr10, vr11, vr10 + vpackev.w vr12, vr13, vr12 + vpackod.d vr11, vr12, vr10 + vpackev.d vr10, vr12, vr10 + + vpackev.w vr12, vr15, vr14 + vpackev.w vr16, vr17, vr16 + vpackod.d vr13, vr16, vr12 + vpackev.d vr12, vr16, vr12 + + vpickev.h vr10, vr12, vr10 //0 1 2 3 4 5 6 * (h0) + vpickev.h vr11, vr13, vr11 //8 9 10 11 12 13 14 * (h1) + vsrari.h vr10, vr10, 2 + vsrari.h vr11, vr11, 2 +.l_\lable\()put_hv_2w_loop: + vld vr7, a2, 0 + vldx vr12, a2, a3 + add.d a2, a2, t2 + + vbsrl.v vr1, vr7, 1 + vbsrl.v vr2, vr12, 1 + vilvl.d vr0, vr1, vr7 + vilvl.d vr1, vr2, vr12 + vdp2.h.bu.b vr2, vr0, vr8 + vdp2.h.bu.b vr3, vr1, vr8 + vhaddw.d.h vr2 + vhaddw.d.h vr3 + vpickev.w vr2, vr3, vr2 + vpickev.h vr2, vr2, vr2 + vsrari.h vr2, vr2, 2 + vextrins.h vr10, vr2, 0x70 //0 1 2 3 4 5 6 7 + vextrins.h vr11, vr2, 0x71 + vbsrl.v vr12, vr10, 2 + vbsrl.v vr13, vr11, 2 + vextrins.h vr12, vr2, 0x72 //1 2 3 4 5 6 7 8 + vextrins.h vr13, vr2, 0x73 + vdp2.w.h vr0, vr10, vr9 + vdp2.w.h vr1, vr11, vr9 + vdp2.w.h vr2, vr12, vr9 + vdp2.w.h vr3, vr13, vr9 + vhaddw.q.w vr0 + vhaddw.q.w vr1 + vhaddw.q.w vr2 + vhaddw.q.w vr3 + vpackev.w vr0, vr1, vr0 + vpackev.w vr1, vr3, vr2 + vpackev.d vr0, vr1, vr0 + vssrarni.hu.w vr0, vr0, 10 + vssrani.bu.h vr0, vr0, 0 + vbsrl.v vr10, vr12, 2 + vbsrl.v vr11, vr13, 2 + vstelm.h vr0, a0, 0, 0 + add.d a0, a0, a1 + vstelm.h vr0, a0, 0, 1 + add.d a0, a0, a1 + addi.d a5, a5, -2 + bnez a5, .l_\lable\()put_hv_2w_loop + b .l_\lable\()end_put_8tap + +.l_\lable\()put_hv_4w: + vld vr0, a2, 0 + vldx vr1, a2, a3 + vldx vr2, a2, t2 + add.d a2, a2, t3 + vld vr3, a2, 0 + vldx vr4, a2, a3 + vldx vr5, a2, t2 + vldx vr6, a2, t3 + add.d a2, a2, t4 + FILTER_8TAP_4W vr0 //x0 x1 x2 x3 + FILTER_8TAP_4W vr1 + FILTER_8TAP_4W vr2 + FILTER_8TAP_4W vr3 + FILTER_8TAP_4W vr4 + FILTER_8TAP_4W vr5 + FILTER_8TAP_4W vr6 + vpackev.h vr0, vr1, vr0 + vpackev.h vr1, vr3, vr2 + vpackev.h vr2, vr5, vr4 + vpackev.h vr3, vr7, vr6 + vilvl.w vr4, vr1, vr0 + vilvh.w vr5, vr1, vr0 + vilvl.w vr6, vr3, vr2 + vilvh.w vr7, vr3, vr2 + vilvl.d vr0, vr6, vr4 //0 1 2 3 4 5 6 * + vilvh.d vr1, vr6, vr4 + vilvl.d vr2, vr7, vr5 + vilvh.d vr3, vr7, vr5 + vsrari.h vr0, vr0, 2 + vsrari.h vr1, vr1, 2 + vsrari.h vr2, vr2, 2 + vsrari.h vr3, vr3, 2 +.l_\lable\()put_hv_4w_loop: + vld vr4, a2, 0 + vldx vr5, a2, a3 + add.d a2, a2, t2 + FILTER_8TAP_4W vr4 + FILTER_8TAP_4W vr5 + vpickev.h vr4, vr5, vr4 + vsrari.h vr4, vr4, 2 + vextrins.h vr0, vr4, 0x70 + vextrins.h vr1, vr4, 0x71 + vextrins.h vr2, vr4, 0x72 + vextrins.h vr3, vr4, 0x73 + vbsrl.v vr5, vr0, 2 + vbsrl.v vr6, vr1, 2 + vbsrl.v vr7, vr2, 2 + vbsrl.v vr10, vr3, 2 + vextrins.h vr5, vr4, 0x74 + vextrins.h vr6, vr4, 0x75 + vextrins.h vr7, vr4, 0x76 + vextrins.h vr10, vr4, 0x77 + vdp2.w.h vr11, vr0, vr9 + vdp2.w.h vr12, vr1, vr9 + vdp2.w.h vr13, vr2, vr9 + vdp2.w.h vr14, vr3, vr9 + vhaddw.q.w vr11 + vhaddw.q.w vr12 + vhaddw.q.w vr13 + vhaddw.q.w vr14 + vpackev.w vr0, vr12, vr11 + vpackev.w vr1, vr14, vr13 + vpackev.d vr0, vr1, vr0 + vdp2.w.h vr11, vr5, vr9 + vdp2.w.h vr12, vr6, vr9 + vdp2.w.h vr13, vr7, vr9 + vdp2.w.h vr14, vr10, vr9 + vhaddw.q.w vr11 + vhaddw.q.w vr12 + vhaddw.q.w vr13 + vhaddw.q.w vr14 + vpackev.w vr1, vr12, vr11 + vpackev.w vr2, vr14, vr13 + vpackev.d vr1, vr2, vr1 + vssrarni.hu.w vr1, vr0, 10 + vssrani.bu.h vr1, vr1, 0 + vstelm.w vr1, a0, 0, 0 + add.d a0, a0, a1 + vstelm.w vr1, a0, 0, 1 + add.d a0, a0, a1 + vbsrl.v vr0, vr5, 2 + vbsrl.v vr1, vr6, 2 + vbsrl.v vr2, vr7, 2 + vbsrl.v vr3, vr10, 2 + addi.w a5, a5, -2 + bnez a5, 
.l_\lable\()put_hv_4w_loop + b .l_\lable\()end_put_8tap + +.l_\lable\()put_hv_8w: +.l_\lable\()put_hv_16w: +.l_\lable\()put_hv_32w: +.l_\lable\()put_hv_64w: +.l_\lable\()put_hv_128w: + addi.d t0, a2, 0 //src + addi.d t5, a5, 0 //h + addi.d t8, a0, 0 //dst +.l_\lable\()put_hv_8w_loop0: + vld vr0, a2, 0 + vldx vr1, a2, a3 + vldx vr2, a2, t2 + add.d a2, a2, t3 + vld vr3, a2, 0 + vldx vr4, a2, a3 + vldx vr5, a2, t2 + vldx vr6, a2, t3 + add.d a2, a2, t4 + FILTER_8TAP_8W vr0 + FILTER_8TAP_8W vr1 + FILTER_8TAP_8W vr2 + FILTER_8TAP_8W vr3 + FILTER_8TAP_8W vr4 + FILTER_8TAP_8W vr5 + FILTER_8TAP_8W vr6 + LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7,\ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7,\ + vr10,vr11,vr12,vr13,vr14,vr15,vr16,vr17 +.l_\lable\()put_hv_8w_loop: + vld vr20, a2, 0 + vldx vr21, a2, a3 + add.d a2, a2, t2 + FILTER_8TAP_8W vr20 + FILTER_8TAP_8W vr21 + VEXTRINS_Hx8 vr20 + FILTER_8TAP_8W_CLIP_STORE + VBSRL_Vx8 + VEXTRINS_Hx8 vr21 + FILTER_8TAP_8W_CLIP_STORE + VBSRL_Vx8 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_hv_8w_loop + addi.d a2, t0, 8 + addi.d t0, t0, 8 + addi.d a0, t8, 8 + addi.d t8, t8, 8 + addi.d a5, t5, 0 + addi.w a4, a4, -8 + bnez a4, .l_\lable\()put_hv_8w_loop0 +.l_\lable\()end_put_8tap: +.endm + +function put_8tap_regular_8bpc_lsx + addi.d sp, sp, -16 + st.d zero, sp, 0 + PUT_8TAP_8BPC_LSX 0 + addi.d sp, sp, 16 +endfunc + +function put_8tap_smooth_regular_8bpc_lsx + addi.d sp, sp, -16 + li.w t0, 1 + st.d t0, sp, 0 + PUT_8TAP_8BPC_LSX 1 + addi.d sp, sp, 16 +endfunc + +function put_8tap_sharp_regular_8bpc_lsx + addi.d sp, sp, -16 + li.w t0, 2 + st.d t0, sp, 0 + PUT_8TAP_8BPC_LSX 2 + addi.d sp, sp, 16 +endfunc + +function put_8tap_regular_smooth_8bpc_lsx + addi.d sp, sp, -16 + li.w t0, 4 + st.d t0, sp, 0 + PUT_8TAP_8BPC_LSX 4 + addi.d sp, sp, 16 +endfunc + +function put_8tap_smooth_8bpc_lsx + addi.d sp, sp, -16 + li.w t0, 5 + st.d t0, sp, 0 + PUT_8TAP_8BPC_LSX 5 + addi.d sp, sp, 16 +endfunc + +function put_8tap_sharp_smooth_8bpc_lsx + addi.d sp, sp, -16 + li.w t0, 6 + st.d t0, sp, 0 + PUT_8TAP_8BPC_LSX 6 + addi.d sp, sp, 16 +endfunc + +function put_8tap_regular_sharp_8bpc_lsx + addi.d sp, sp, -16 + li.w t0, 8 + st.d t0, sp, 0 + PUT_8TAP_8BPC_LSX 8 + addi.d sp, sp, 16 +endfunc + +function put_8tap_smooth_sharp_8bpc_lsx + addi.d sp, sp, -16 + li.w t0, 9 + st.d t0, sp, 0 + PUT_8TAP_8BPC_LSX 9 + addi.d sp, sp, 16 +endfunc + +function put_8tap_sharp_8bpc_lsx + addi.d sp, sp, -16 + li.w t0, 10 + st.d t0, sp, 0 + PUT_8TAP_8BPC_LSX 10 + addi.d sp, sp, 16 +endfunc diff --git a/src/loongarch/mc.h b/src/loongarch/mc.h index 56168e5..d5ac00f 100644 --- a/src/loongarch/mc.h +++ b/src/loongarch/mc.h @@ -32,6 +32,11 @@ #include "src/mc.h" #include "src/cpu.h" +#define init_mc_fn(type, name, suffix) \ + c->mc[type] = BF(dav1d_put_##name, suffix) +#define init_mct_fn(type, name, suffix) \ + c->mct[type] = BF(dav1d_prep_##name, suffix) + decl_avg_fn(BF(dav1d_avg, lsx)); decl_w_avg_fn(BF(dav1d_w_avg, lsx)); decl_mask_fn(BF(dav1d_mask, lsx)); @@ -39,6 +44,16 @@ decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, lsx)); decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, lsx)); decl_w_mask_fn(BF(dav1d_w_mask_420, lsx)); +decl_mc_fn(BF(dav1d_put_8tap_regular, lsx)); +decl_mc_fn(BF(dav1d_put_8tap_regular_smooth, lsx)); +decl_mc_fn(BF(dav1d_put_8tap_regular_sharp, lsx)); +decl_mc_fn(BF(dav1d_put_8tap_smooth, lsx)); +decl_mc_fn(BF(dav1d_put_8tap_smooth_regular, lsx)); +decl_mc_fn(BF(dav1d_put_8tap_smooth_sharp, lsx)); +decl_mc_fn(BF(dav1d_put_8tap_sharp, lsx)); +decl_mc_fn(BF(dav1d_put_8tap_sharp_regular, 
lsx)); +decl_mc_fn(BF(dav1d_put_8tap_sharp_smooth, lsx)); + decl_avg_fn(BF(dav1d_avg, lasx)); decl_w_avg_fn(BF(dav1d_w_avg, lasx)); decl_mask_fn(BF(dav1d_mask, lasx)); @@ -59,6 +74,16 @@ static ALWAYS_INLINE void mc_dsp_init_loongarch(Dav1dMCDSPContext *const c) { c->warp8x8t = BF(dav1d_warp_affine_8x8t, lsx); c->w_mask[2] = BF(dav1d_w_mask_420, lsx); + init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, lsx); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, lsx); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, lsx); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, lsx); + init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, lsx); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, lsx); + init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, lsx); + init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, lsx); + init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, lsx); + if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LASX)) return; c->avg = BF(dav1d_avg, lasx); -- cgit v1.2.3 From ae8756ed91e2acbfc23711132eebbdf52ea874b3 Mon Sep 17 00:00:00 2001 From: jinbo Date: Fri, 1 Dec 2023 11:20:59 +0800 Subject: loongarch: Improve the performance of mc_8bpc.mct functions Relative speedup over C code: mct_8tap_regular_w4_0_8bpc_c: 4.2 ( 1.00x) mct_8tap_regular_w4_0_8bpc_lasx: 0.5 ( 9.08x) mct_8tap_regular_w4_h_8bpc_c: 12.5 ( 1.00x) mct_8tap_regular_w4_h_8bpc_lasx: 1.6 ( 7.80x) mct_8tap_regular_w4_hv_8bpc_c: 33.5 ( 1.00x) mct_8tap_regular_w4_hv_8bpc_lasx: 6.0 ( 5.54x) mct_8tap_regular_w4_v_8bpc_c: 13.6 ( 1.00x) mct_8tap_regular_w4_v_8bpc_lasx: 2.2 ( 6.22x) mct_8tap_regular_w8_0_8bpc_c: 11.3 ( 1.00x) mct_8tap_regular_w8_0_8bpc_lasx: 0.7 (15.77x) mct_8tap_regular_w8_h_8bpc_c: 39.1 ( 1.00x) mct_8tap_regular_w8_h_8bpc_lasx: 4.7 ( 8.30x) mct_8tap_regular_w8_hv_8bpc_c: 90.9 ( 1.00x) mct_8tap_regular_w8_hv_8bpc_lasx: 17.2 ( 5.29x) mct_8tap_regular_w8_v_8bpc_c: 40.5 ( 1.00x) mct_8tap_regular_w8_v_8bpc_lasx: 6.9 ( 5.86x) mct_8tap_regular_w16_0_8bpc_c: 34.3 ( 1.00x) mct_8tap_regular_w16_0_8bpc_lasx: 1.3 (26.32x) mct_8tap_regular_w16_h_8bpc_c: 128.3 ( 1.00x) mct_8tap_regular_w16_h_8bpc_lasx: 20.5 ( 6.26x) mct_8tap_regular_w16_hv_8bpc_c: 273.5 ( 1.00x) mct_8tap_regular_w16_hv_8bpc_lasx: 54.5 ( 5.02x) mct_8tap_regular_w16_v_8bpc_c: 129.7 ( 1.00x) mct_8tap_regular_w16_v_8bpc_lasx: 22.8 ( 5.69x) mct_8tap_regular_w32_0_8bpc_c: 133.7 ( 1.00x) mct_8tap_regular_w32_0_8bpc_lasx: 5.4 (24.65x) mct_8tap_regular_w32_h_8bpc_c: 511.4 ( 1.00x) mct_8tap_regular_w32_h_8bpc_lasx: 85.1 ( 6.01x) mct_8tap_regular_w32_hv_8bpc_c: 1018.2 ( 1.00x) mct_8tap_regular_w32_hv_8bpc_lasx: 210.0 ( 4.85x) mct_8tap_regular_w32_v_8bpc_c: 513.6 ( 1.00x) mct_8tap_regular_w32_v_8bpc_lasx: 88.7 ( 5.79x) mct_8tap_regular_w64_0_8bpc_c: 315.4 ( 1.00x) mct_8tap_regular_w64_0_8bpc_lasx: 13.2 (23.86x) mct_8tap_regular_w64_h_8bpc_c: 1236.8 ( 1.00x) mct_8tap_regular_w64_h_8bpc_lasx: 208.2 ( 5.94x) mct_8tap_regular_w64_hv_8bpc_c: 2428.0 ( 1.00x) mct_8tap_regular_w64_hv_8bpc_lasx: 502.7 ( 4.83x) mct_8tap_regular_w64_v_8bpc_c: 1238.3 ( 1.00x) mct_8tap_regular_w64_v_8bpc_lasx: 214.0 ( 5.79x) mct_8tap_regular_w128_0_8bpc_c: 775.3 ( 1.00x) mct_8tap_regular_w128_0_8bpc_lasx: 32.5 (23.86x) mct_8tap_regular_w128_h_8bpc_c: 3077.5 ( 1.00x) mct_8tap_regular_w128_h_8bpc_lasx: 518.6 ( 5.93x) mct_8tap_regular_w128_hv_8bpc_c: 5987.0 ( 1.00x) mct_8tap_regular_w128_hv_8bpc_lasx: 1242.4 ( 4.82x) mct_8tap_regular_w128_v_8bpc_c: 3077.5 ( 1.00x) mct_8tap_regular_w128_v_8bpc_lasx: 530.3 ( 5.80x) --- src/loongarch/mc.S | 1106 
++++++++++++++++++++++++++++++++++++++++++++++++++++ src/loongarch/mc.h | 19 + 2 files changed, 1125 insertions(+) diff --git a/src/loongarch/mc.S b/src/loongarch/mc.S index 9e0dbff..97887de 100644 --- a/src/loongarch/mc.S +++ b/src/loongarch/mc.S @@ -3650,3 +3650,1109 @@ function put_8tap_sharp_8bpc_lsx PUT_8TAP_8BPC_LSX 10 addi.d sp, sp, 16 endfunc + +const shufb1 +.byte 0,1,2,3,4,5,6,7,1,2,3,4,5,6,7,8,0,1,2,3,4,5,6,7,1,2,3,4,5,6,7,8 +endconst + +.macro SHUFB in0, in1, tmp, out + xvbsrl.v \tmp, \in0, 2 + xvpermi.q \tmp, \in0, 0x20 + xvshuf.b \out, \tmp, \tmp, \in1 +.endm + +.macro HADDWDH in0 + xvhaddw.w.h \in0, \in0, \in0 + xvhaddw.d.w \in0, \in0, \in0 +.endm + +.macro HADDWQW in0 + xvhaddw.d.w \in0, \in0, \in0 + xvhaddw.q.d \in0, \in0, \in0 +.endm + +.macro PREP_W16_H in0 + xvbsrl.v xr4, \in0, 4 + xvbsrl.v xr5, \in0, 8 + xvpermi.q xr9, \in0, 0x31 + xvpackev.d xr5, xr9, xr5 + xvbsrl.v xr6, xr5, 4 + SHUFB \in0, xr23, xr9, \in0 + SHUFB xr4, xr23, xr9, xr4 + SHUFB xr5, xr23, xr9, xr5 + SHUFB xr6, xr23, xr9, xr6 + xvdp2.h.bu.b xr10, \in0, xr22 + xvdp2.h.bu.b xr11, xr4, xr22 + xvdp2.h.bu.b xr12, xr5, xr22 + xvdp2.h.bu.b xr13, xr6, xr22 + HADDWDH xr10 + HADDWDH xr11 + HADDWDH xr12 + HADDWDH xr13 + xvpickev.w xr10, xr11, xr10 + xvpickev.w xr11, xr13, xr12 + xvpermi.d xr10, xr10, 0xd8 + xvpermi.d xr11, xr11, 0xd8 + xvpickev.h xr10, xr11, xr10 + xvpermi.d xr10, xr10, 0xd8 + xvsrari.h \in0, xr10, 2 +.endm + +.macro PREP_8TAP_8BPC_LASX lable + li.w t0, 4 + la.local t6, dav1d_mc_subpel_filters + la.local t7, shufb1 + xvld xr23, t7, 0 + slli.d t2, a2, 1 //src_stride*2 + add.d t3, t2, a2 //src_stride*3 + slli.d t4, t2, 1 + + bnez a5, .l_\lable\()h //mx + bnez a6, .l_\lable\()v + + clz.w t1, a3 + li.w t5, 24 + sub.w t1, t1, t5 + la.local t5, .l_\lable\()prep_hv0_jtable + alsl.d t1, t1, t5, 1 + ld.h t8, t1, 0 + add.d t5, t5, t8 + jirl $r0, t5, 0 + + .align 3 +.l_\lable\()prep_hv0_jtable: + .hword .l_\lable\()hv0_128w - .l_\lable\()prep_hv0_jtable + .hword .l_\lable\()hv0_64w - .l_\lable\()prep_hv0_jtable + .hword .l_\lable\()hv0_32w - .l_\lable\()prep_hv0_jtable + .hword .l_\lable\()hv0_16w - .l_\lable\()prep_hv0_jtable + .hword .l_\lable\()hv0_8w - .l_\lable\()prep_hv0_jtable + .hword .l_\lable\()hv0_4w - .l_\lable\()prep_hv0_jtable + +.l_\lable\()hv0_4w: + fld.s f0, a1, 0 + fldx.s f1, a1, a2 + fldx.s f2, a1, t2 + fldx.s f3, a1, t3 + add.d a1, a1, t4 + xvpackev.w xr0, xr1, xr0 + xvpackev.w xr1, xr3, xr2 + xvpermi.q xr0, xr1, 0x02 + xvsllwil.hu.bu xr0, xr0, 4 + xvst xr0, a0, 0 + addi.d a0, a0, 32 + addi.d a4, a4, -4 + bnez a4, .l_\lable\()hv0_4w + b .l_\lable\()end_pre_8tap +.l_\lable\()hv0_8w: + fld.d f0, a1, 0 + fldx.d f1, a1, a2 + fldx.d f2, a1, t2 + fldx.d f3, a1, t3 + add.d a1, a1, t4 + xvpermi.q xr0, xr1, 0x02 + xvpermi.q xr2, xr3, 0x02 + xvsllwil.hu.bu xr0, xr0, 4 + xvsllwil.hu.bu xr2, xr2, 4 + xvst xr0, a0, 0 + xvst xr2, a0, 32 + addi.d a0, a0, 64 + addi.d a4, a4, -4 + bnez a4, .l_\lable\()hv0_8w + b .l_\lable\()end_pre_8tap +.l_\lable\()hv0_16w: + vld vr0, a1, 0 + vldx vr1, a1, a2 + vldx vr2, a1, t2 + vldx vr3, a1, t3 + add.d a1, a1, t4 + vext2xv.hu.bu xr0, xr0 + vext2xv.hu.bu xr1, xr1 + vext2xv.hu.bu xr2, xr2 + vext2xv.hu.bu xr3, xr3 + xvslli.h xr0, xr0, 4 + xvslli.h xr1, xr1, 4 + xvslli.h xr2, xr2, 4 + xvslli.h xr3, xr3, 4 + xvst xr0, a0, 0 + xvst xr1, a0, 32 + xvst xr2, a0, 64 + xvst xr3, a0, 96 + addi.d a0, a0, 128 + addi.d a4, a4, -4 + bnez a4, .l_\lable\()hv0_16w + b .l_\lable\()end_pre_8tap +.l_\lable\()hv0_32w: + xvld xr0, a1, 0 + xvldx xr1, a1, a2 + xvldx xr2, a1, t2 + xvldx xr3, a1, 
t3 + add.d a1, a1, t4 + xvpermi.d xr4, xr0, 0xD8 + xvpermi.d xr5, xr1, 0xD8 + xvpermi.d xr6, xr2, 0xD8 + xvpermi.d xr7, xr3, 0xD8 + xvpermi.d xr10, xr0, 0x32 + xvpermi.d xr11, xr1, 0x32 + xvpermi.d xr12, xr2, 0x32 + xvpermi.d xr13, xr3, 0x32 + xvsllwil.hu.bu xr0, xr4, 4 + xvsllwil.hu.bu xr1, xr5, 4 + xvsllwil.hu.bu xr2, xr6, 4 + xvsllwil.hu.bu xr3, xr7, 4 + xvsllwil.hu.bu xr4, xr10, 4 + xvsllwil.hu.bu xr5, xr11, 4 + xvsllwil.hu.bu xr6, xr12, 4 + xvsllwil.hu.bu xr7, xr13, 4 + xvst xr0, a0, 0 + xvst xr4, a0, 32 + xvst xr1, a0, 64 + xvst xr5, a0, 96 + xvst xr2, a0, 128 + xvst xr6, a0, 160 + xvst xr3, a0, 192 + xvst xr7, a0, 224 + addi.d a0, a0, 256 + addi.d a4, a4, -4 + bnez a4, .l_\lable\()hv0_32w + b .l_\lable\()end_pre_8tap +.l_\lable\()hv0_64w: +.l_\lable\()hv0_128w: + addi.d t0, a1, 0 + addi.d t5, a4, 0 + srli.w t7, a3, 5 + slli.w t7, t7, 6 + addi.d t8, a0, 0 +.l_\lable\()hv0_32_loop: + xvld xr0, a1, 0 + xvldx xr1, a1, a2 + xvldx xr2, a1, t2 + xvldx xr3, a1, t3 + add.d a1, a1, t4 + xvpermi.d xr4, xr0, 0xD8 + xvpermi.d xr5, xr1, 0xD8 + xvpermi.d xr6, xr2, 0xD8 + xvpermi.d xr7, xr3, 0xD8 + xvpermi.d xr10, xr0, 0x32 + xvpermi.d xr11, xr1, 0x32 + xvpermi.d xr12, xr2, 0x32 + xvpermi.d xr13, xr3, 0x32 + xvsllwil.hu.bu xr0, xr4, 4 + xvsllwil.hu.bu xr1, xr5, 4 + xvsllwil.hu.bu xr2, xr6, 4 + xvsllwil.hu.bu xr3, xr7, 4 + xvsllwil.hu.bu xr4, xr10, 4 + xvsllwil.hu.bu xr5, xr11, 4 + xvsllwil.hu.bu xr6, xr12, 4 + xvsllwil.hu.bu xr7, xr13, 4 + xvst xr0, a0, 0 + xvst xr4, a0, 32 + add.d t1, a0, t7 + xvst xr1, t1, 0 + xvst xr5, t1, 32 + add.d t1, t1, t7 + xvst xr2, t1, 0 + xvst xr6, t1, 32 + add.d t1, t1, t7 + xvst xr3, t1, 0 + xvst xr7, t1, 32 + add.d a0, t1, t7 + addi.d a4, a4, -4 + bnez a4, .l_\lable\()hv0_32_loop + addi.d a1, t0, 32 + addi.d t0, t0, 32 + addi.d a0, t8, 64 + addi.d t8, t8, 64 + addi.d a4, t5, 0 + addi.d a3, a3, -32 + bnez a3, .l_\lable\()hv0_32_loop + b .l_\lable\()end_pre_8tap + +.l_\lable\()h: + bnez a6, .l_\lable\()hv //if(fh) && if (fv) + + andi t1, a7, 3 + blt t0, a3, .l_\lable\()h_idx_fh + andi t1, a7, 1 + addi.w t1, t1, 3 +.l_\lable\()h_idx_fh: + addi.w t5, zero, 120 + mul.w t1, t1, t5 + addi.w t5, a5, -1 + slli.w t5, t5, 3 + add.w t1, t1, t5 + add.d t1, t6, t1 //fh's offset + xvldrepl.d xr22, t1, 0 + + addi.d a1, a1, -3 + clz.w t1, a3 + li.w t5, 24 + sub.w t1, t1, t5 + la.local t5, .l_\lable\()prep_h_jtable + alsl.d t1, t1, t5, 1 + ld.h t8, t1, 0 + add.d t5, t5, t8 + jirl $r0, t5, 0 + + .align 3 +.l_\lable\()prep_h_jtable: + .hword .l_\lable\()h_128w - .l_\lable\()prep_h_jtable + .hword .l_\lable\()h_64w - .l_\lable\()prep_h_jtable + .hword .l_\lable\()h_32w - .l_\lable\()prep_h_jtable + .hword .l_\lable\()h_16w - .l_\lable\()prep_h_jtable + .hword .l_\lable\()h_8w - .l_\lable\()prep_h_jtable + .hword .l_\lable\()h_4w - .l_\lable\()prep_h_jtable + +.l_\lable\()h_4w: + xvld xr0, a1, 0 + xvldx xr1, a1, a2 + xvldx xr2, a1, t2 + xvldx xr3, a1, t3 + add.d a1, a1, t4 + + SHUFB xr0, xr23, xr9, xr0 + SHUFB xr1, xr23, xr9, xr1 + SHUFB xr2, xr23, xr9, xr2 + SHUFB xr3, xr23, xr9, xr3 + + xvdp2.h.bu.b xr10, xr0, xr22 + xvdp2.h.bu.b xr12, xr1, xr22 + xvdp2.h.bu.b xr14, xr2, xr22 + xvdp2.h.bu.b xr16, xr3, xr22 + + HADDWDH xr10 //h0 mid0 mid1 mid2 mid3 + HADDWDH xr12 //h1 mid4 mid5 mid6 mid7 + HADDWDH xr14 //h2 + HADDWDH xr16 //h3 + + xvpickev.w xr10, xr12, xr10 + xvpickev.w xr14, xr16, xr14 + xvpermi.d xr10, xr10, 0xd8 + xvpermi.d xr14, xr14, 0xd8 + xvpickev.h xr10, xr14, xr10 + xvpermi.d xr10, xr10, 0xd8 + xvsrari.h xr10, xr10, 2 + + xvst xr10, a0, 0 + addi.d a0, a0, 32 + addi.w a4, a4, -4 
+ bnez a4, .l_\lable\()h_4w + b .l_\lable\()end_pre_8tap + +.l_\lable\()h_8w: + xvld xr0, a1, 0 + xvldx xr2, a1, a2 + xvldx xr4, a1, t2 + xvldx xr6, a1, t3 + add.d a1, a1, t4 + + xvbsrl.v xr1, xr0, 4 + xvbsrl.v xr3, xr2, 4 + xvbsrl.v xr5, xr4, 4 + xvbsrl.v xr7, xr6, 4 + + SHUFB xr0, xr23, xr9, xr10 + SHUFB xr1, xr23, xr9, xr11 + SHUFB xr2, xr23, xr9, xr12 + SHUFB xr3, xr23, xr9, xr13 + SHUFB xr4, xr23, xr9, xr14 + SHUFB xr5, xr23, xr9, xr15 + SHUFB xr6, xr23, xr9, xr16 + SHUFB xr7, xr23, xr9, xr17 + + xvdp2.h.bu.b xr0, xr10, xr22 + xvdp2.h.bu.b xr1, xr11, xr22 + xvdp2.h.bu.b xr2, xr12, xr22 + xvdp2.h.bu.b xr3, xr13, xr22 + xvdp2.h.bu.b xr4, xr14, xr22 + xvdp2.h.bu.b xr5, xr15, xr22 + xvdp2.h.bu.b xr6, xr16, xr22 + xvdp2.h.bu.b xr7, xr17, xr22 + + HADDWDH xr0 + HADDWDH xr1 + HADDWDH xr2 + HADDWDH xr3 + HADDWDH xr4 + HADDWDH xr5 + HADDWDH xr6 + HADDWDH xr7 + + xvpickev.w xr0, xr1, xr0 + xvpickev.w xr2, xr3, xr2 + xvpermi.d xr0, xr0, 0xd8 + xvpermi.d xr2, xr2, 0xd8 + xvpickev.h xr0, xr2, xr0 + xvpermi.d xr0, xr0, 0xd8 + xvsrari.h xr0, xr0, 2 + + xvpickev.w xr4, xr5, xr4 + xvpickev.w xr6, xr7, xr6 + xvpermi.d xr4, xr4, 0xd8 + xvpermi.d xr6, xr6, 0xd8 + xvpickev.h xr4, xr6, xr4 + xvpermi.d xr4, xr4, 0xd8 + xvsrari.h xr4, xr4, 2 + + xvst xr0, a0, 0 + xvst xr4, a0, 32 + addi.d a0, a0, 64 + addi.d a4, a4, -4 + bnez a4, .l_\lable\()h_8w + b .l_\lable\()end_pre_8tap + +.l_\lable\()h_16w: + xvld xr0, a1, 0 + xvldx xr1, a1, a2 + xvldx xr2, a1, t2 + xvldx xr3, a1, t3 + add.d a1, a1, t4 + + PREP_W16_H xr0 + PREP_W16_H xr1 + PREP_W16_H xr2 + PREP_W16_H xr3 + + xvst xr0, a0, 0 + xvst xr1, a0, 32 + xvst xr2, a0, 64 + xvst xr3, a0, 96 + + addi.d a0, a0, 128 + addi.w a4, a4, -4 + bnez a4, .l_\lable\()h_16w + b .l_\lable\()end_pre_8tap + +.l_\lable\()h_32w: +.l_\lable\()h_64w: +.l_\lable\()h_128w: + addi.d t0, a1, 0 //src + addi.d t5, a4, 0 //h + srli.w t7, a3, 4 //w + slli.w t7, t7, 5 //store offset + addi.d t8, a0, 0 //dst +.l_\lable\()h_16_loop: + xvld xr0, a1, 0 + xvldx xr1, a1, a2 + xvldx xr2, a1, t2 + xvldx xr3, a1, t3 + add.d a1, a1, t4 + + PREP_W16_H xr0 + PREP_W16_H xr1 + PREP_W16_H xr2 + PREP_W16_H xr3 + + xvst xr0, a0, 0 + xvstx xr1, a0, t7 + slli.w t1, t7, 1 + xvstx xr2, a0, t1 + add.w t1, t1, t7 + xvstx xr3, a0, t1 + slli.w t1, t7, 2 + add.d a0, a0, t1 + addi.d a4, a4, -4 + bnez a4, .l_\lable\()h_16_loop + + addi.d a1, t0, 16 + addi.d t0, t0, 16 + addi.d a0, t8, 32 + addi.d t8, t8, 32 + addi.d a4, t5, 0 + addi.d a3, a3, -16 + bnez a3, .l_\lable\()h_16_loop + b .l_\lable\()end_pre_8tap +.l_\lable\()hv: + andi t1, a7, 3 + blt t0, a3, .l_\lable\()hv_idx_fh + andi t1, a7, 1 + addi.w t1, t1, 3 +.l_\lable\()hv_idx_fh: + addi.w t5, zero, 120 + mul.w t1, t1, t5 + addi.w t5, a5, -1 + slli.w t5, t5, 3 + add.w t1, t1, t5 + add.d t1, t6, t1 //fh's offset + xvldrepl.d xr22, t1, 0 + srli.w a7, a7, 2 + blt t0, a4, .l_\lable\()hv_idx_fv + andi a7, a7, 1 + addi.w a7, a7, 3 +.l_\lable\()hv_idx_fv: + addi.w t5, zero, 120 + mul.w a7, a7, t5 + addi.w t5, a6, -1 + slli.w t5, t5, 3 + add.w a7, a7, t5 + add.d a7, t6, a7 //fv's offset + xvldrepl.d xr8, a7, 0 + xvsllwil.h.b xr8, xr8, 0 + + sub.d a1, a1, t3 + addi.d a1, a1, -3 + beq a3, t0, .l_\lable\()hv_4w + b .l_\lable\()hv_8w +.l_\lable\()hv_4w: + xvld xr0, a1, 0 + xvldx xr1, a1, a2 + xvldx xr2, a1, t2 + xvldx xr3, a1, t3 + add.d a1, a1, t4 + xvld xr4, a1, 0 + xvldx xr5, a1, a2 + xvldx xr6, a1, t2 + + SHUFB xr0, xr23, xr9, xr0 + SHUFB xr1, xr23, xr9, xr1 + SHUFB xr2, xr23, xr9, xr2 + SHUFB xr3, xr23, xr9, xr3 + + SHUFB xr4, xr23, xr9, xr4 + SHUFB xr5, xr23, xr9, xr5 + 
SHUFB xr6, xr23, xr9, xr6 + + xvdp2.h.bu.b xr10, xr0, xr22 + xvdp2.h.bu.b xr11, xr1, xr22 + xvdp2.h.bu.b xr12, xr2, xr22 + xvdp2.h.bu.b xr13, xr3, xr22 + + xvdp2.h.bu.b xr14, xr4, xr22 + xvdp2.h.bu.b xr15, xr5, xr22 + xvdp2.h.bu.b xr16, xr6, xr22 + + HADDWDH xr10 //h0 mid0 mid1 mid2 mid3 + HADDWDH xr11 //h1 mid4 mid5 mid6 mid7 + HADDWDH xr12 //h2 + HADDWDH xr13 //h3 + + xvpackev.w xr10, xr11, xr10 + xvpackev.w xr12, xr13, xr12 + xvpackev.d xr11, xr12, xr10 + xvpackod.d xr10, xr12, xr10 + xvpickev.h xr11, xr10, xr11 + xvsrari.h xr11, xr11, 2 + + HADDWDH xr14 //h4 + HADDWDH xr15 //h5 + HADDWDH xr16 //h6 + + xvpackev.w xr14, xr15, xr14 + xvpackev.w xr16, xr17, xr16 + xvpackev.d xr17, xr16, xr14 + xvpackod.d xr14, xr16, xr14 + xvpickev.h xr13, xr14, xr17 + xvsrari.h xr13, xr13, 2 + + xvpackev.d xr18, xr13, xr11 //0 4 8 12 16 20 24 * 2 6 10 14 18 22 26 * + xvpackod.d xr19, xr13, xr11 //1 5 9 13 17 21 25 * 3 7 11 15 19 23 27 * +.l_\lable\()hv_w4_loop: + xvldx xr0, a1, t3 + add.d a1, a1, t4 + xvld xr1, a1, 0 + xvldx xr2, a1, a2 + xvldx xr3, a1, t2 + + SHUFB xr0, xr23, xr9, xr0 + SHUFB xr1, xr23, xr9, xr1 + SHUFB xr2, xr23, xr9, xr2 + SHUFB xr3, xr23, xr9, xr3 + + xvdp2.h.bu.b xr10, xr0, xr22 + xvdp2.h.bu.b xr12, xr1, xr22 + xvdp2.h.bu.b xr14, xr2, xr22 + xvdp2.h.bu.b xr16, xr3, xr22 + + HADDWDH xr10 //h0 mid0 mid1 mid2 mid3 + HADDWDH xr12 //h1 mid4 mid5 mid6 mid7 + HADDWDH xr14 //h2 + HADDWDH xr16 //h3 + + xvpackev.w xr10, xr12, xr10 + xvpackev.w xr14, xr16, xr14 + xvpackev.d xr12, xr14, xr10 + xvpackod.d xr10, xr14, xr10 + xvpickev.h xr12, xr10, xr12 + xvsrari.h xr12, xr12, 2 + + xvextrins.h xr18, xr12, 0x70 //0 4 8 12 16 20 24 0(x0) 2 6 10 14 18 22 26 2(x2) + xvextrins.h xr19, xr12, 0x74 //1 5 9 13 17 21 25 0(x1) 3 7 11 15 19 23 27 2(x3) + + xvdp2.w.h xr0, xr18, xr8 + xvdp2.w.h xr2, xr19, xr8 + HADDWQW xr0 + HADDWQW xr2 + xvpackev.w xr0, xr2, xr0 + + xvbsrl.v xr18, xr18, 2 + xvbsrl.v xr19, xr19, 2 + xvextrins.h xr18, xr12, 0x71 + xvextrins.h xr19, xr12, 0x75 + xvdp2.w.h xr2, xr18, xr8 + xvdp2.w.h xr4, xr19, xr8 + HADDWQW xr2 + HADDWQW xr4 + xvpackev.w xr2, xr4, xr2 + + xvbsrl.v xr18, xr18, 2 + xvbsrl.v xr19, xr19, 2 + xvextrins.h xr18, xr12, 0x72 + xvextrins.h xr19, xr12, 0x76 + xvdp2.w.h xr4, xr18, xr8 + xvdp2.w.h xr9, xr19, xr8 + HADDWQW xr4 + HADDWQW xr9 + xvpackev.w xr4, xr9, xr4 + + xvbsrl.v xr18, xr18, 2 + xvbsrl.v xr19, xr19, 2 + xvextrins.h xr18, xr12, 0x73 + xvextrins.h xr19, xr12, 0x77 + xvdp2.w.h xr9, xr18, xr8 + xvdp2.w.h xr11, xr19, xr8 + HADDWQW xr9 + HADDWQW xr11 + xvpackev.w xr9, xr11, xr9 + + xvpackev.d xr0, xr2, xr0 + xvpackev.d xr4, xr9, xr4 + xvsrari.w xr0, xr0, 6 + xvsrari.w xr4, xr4, 6 + xvpermi.d xr0, xr0, 0xd8 + xvpermi.d xr4, xr4, 0xd8 + xvpickev.h xr0, xr4, xr0 + xvpermi.d xr0, xr0, 0xd8 + xvst xr0, a0, 0 + addi.d a0, a0, 32 + + xvbsrl.v xr18, xr18, 2 + xvbsrl.v xr19, xr19, 2 + + addi.d a4, a4, -4 + bnez a4, .l_\lable\()hv_w4_loop + b .l_\lable\()end_pre_8tap + +.l_\lable\()hv_8w: + addi.d t0, a1, 0 + addi.d t5, a4, 0 + srli.w t7, a3, 3 + slli.w t7, t7, 4 // store offset + addi.d t8, a0, 0 +.l_\lable\()hv_8w_loop0: + xvld xr0, a1, 0 + xvldx xr2, a1, a2 + xvldx xr4, a1, t2 + xvldx xr6, a1, t3 + + add.d a1, a1, t4 + xvld xr10, a1, 0 + xvldx xr11, a1, a2 + xvldx xr12, a1, t2 + + xvbsrl.v xr1, xr0, 4 + xvbsrl.v xr3, xr2, 4 + xvbsrl.v xr5, xr4, 4 + xvbsrl.v xr7, xr6, 4 + + SHUFB xr0, xr23, xr9, xr13 + SHUFB xr1, xr23, xr9, xr14 + SHUFB xr2, xr23, xr9, xr15 + SHUFB xr3, xr23, xr9, xr16 + SHUFB xr4, xr23, xr9, xr17 + SHUFB xr5, xr23, xr9, xr18 + SHUFB xr6, xr23, xr9, xr19 + 
SHUFB xr7, xr23, xr9, xr20 + + xvdp2.h.bu.b xr0, xr13, xr22 + xvdp2.h.bu.b xr1, xr14, xr22 + xvdp2.h.bu.b xr2, xr15, xr22 + xvdp2.h.bu.b xr3, xr16, xr22 + xvdp2.h.bu.b xr4, xr17, xr22 + xvdp2.h.bu.b xr5, xr18, xr22 + xvdp2.h.bu.b xr6, xr19, xr22 + xvdp2.h.bu.b xr7, xr20, xr22 + + HADDWDH xr0 + HADDWDH xr1 + HADDWDH xr2 + HADDWDH xr3 + HADDWDH xr4 + HADDWDH xr5 + HADDWDH xr6 + HADDWDH xr7 + + xvpackev.w xr0, xr2, xr0 + xvpackev.w xr2, xr6, xr4 + xvpackev.d xr16, xr2, xr0 + xvpackod.d xr0, xr2, xr0 + xvpickev.h xr0, xr0, xr16 + xvsrari.h xr0, xr0, 2 // 0 8 16 24 1 9 17 25 2 10 18 26 3 11 19 27 + + xvpackev.w xr1, xr3, xr1 + xvpackev.w xr3, xr7, xr5 + xvpackev.d xr16, xr3, xr1 + xvpackod.d xr1, xr3, xr1 + xvpickev.h xr1, xr1, xr16 + xvsrari.h xr1, xr1, 2 // 4 12 20 28 5 13 21 29 6 14 22 30 7 15 23 31 + + xvbsrl.v xr13, xr10, 4 + xvbsrl.v xr14, xr11, 4 + xvbsrl.v xr15, xr12, 4 + + SHUFB xr10, xr23, xr9, xr10 + SHUFB xr13, xr23, xr9, xr13 + SHUFB xr11, xr23, xr9, xr11 + SHUFB xr14, xr23, xr9, xr14 + SHUFB xr12, xr23, xr9, xr12 + SHUFB xr15, xr23, xr9, xr15 + + xvdp2.h.bu.b xr4, xr10, xr22 + xvdp2.h.bu.b xr5, xr13, xr22 + xvdp2.h.bu.b xr6, xr11, xr22 + xvdp2.h.bu.b xr7, xr14, xr22 + xvdp2.h.bu.b xr9, xr12, xr22 + xvdp2.h.bu.b xr10, xr15, xr22 + + HADDWDH xr4 + HADDWDH xr5 + HADDWDH xr6 + HADDWDH xr7 + HADDWDH xr9 + HADDWDH xr10 + + xvpackev.w xr4, xr6, xr4 + xvpackev.w xr9, xr12, xr9 + xvpackev.d xr16, xr9, xr4 + xvpackod.d xr11, xr9, xr4 + xvpickev.h xr2, xr11, xr16 + xvsrari.h xr2, xr2, 2 // 32 40 48 * 33 41 49 * 34 42 50 * 35 43 51 * + + xvpackev.w xr5, xr7, xr5 + xvpackev.w xr10, xr12, xr10 + xvpackev.d xr16, xr10, xr5 + xvpackod.d xr11, xr10, xr5 + xvpickev.h xr3, xr11, xr16 + xvsrari.h xr3, xr3, 2 // 36 44 52 * 37 45 53 * 38 46 54 * 39 47 56 * + + xvpackev.d xr18, xr2, xr0 // 0 8 16 24 32 40 48 * 2 10 18 26 34 42 50 * + xvpackod.d xr19, xr2, xr0 // 1 9 17 25 33 41 49 * 3 11 19 27 35 43 51 * + xvpackev.d xr20, xr3, xr1 // 4 12 20 28 36 44 52 * 6 14 22 30 38 46 54 * + xvpackod.d xr21, xr3, xr1 // 5 13 21 29 37 45 53 * 7 15 23 31 39 47 55 * + +.l_\lable\()hv_8w_loop: + xvldx xr0, a1, t3 + add.d a1, a1, t4 + xvld xr2, a1, 0 + xvldx xr4, a1, a2 + xvldx xr6, a1, t2 + + xvbsrl.v xr1, xr0, 4 + xvbsrl.v xr3, xr2, 4 + xvbsrl.v xr5, xr4, 4 + xvbsrl.v xr7, xr6, 4 + + SHUFB xr0, xr23, xr9, xr0 + SHUFB xr1, xr23, xr9, xr1 + SHUFB xr2, xr23, xr9, xr2 + SHUFB xr3, xr23, xr9, xr3 + SHUFB xr4, xr23, xr9, xr4 + SHUFB xr5, xr23, xr9, xr5 + SHUFB xr6, xr23, xr9, xr6 + SHUFB xr7, xr23, xr9, xr7 + + xvdp2.h.bu.b xr10, xr0, xr22 + xvdp2.h.bu.b xr11, xr1, xr22 + xvdp2.h.bu.b xr12, xr2, xr22 + xvdp2.h.bu.b xr13, xr3, xr22 + xvdp2.h.bu.b xr14, xr4, xr22 + xvdp2.h.bu.b xr15, xr5, xr22 + xvdp2.h.bu.b xr16, xr6, xr22 + xvdp2.h.bu.b xr17, xr7, xr22 + + HADDWDH xr10 + HADDWDH xr11 + HADDWDH xr12 + HADDWDH xr13 + HADDWDH xr14 + HADDWDH xr15 + HADDWDH xr16 + HADDWDH xr17 + + xvpackev.w xr0, xr12, xr10 + xvpackev.w xr2, xr16, xr14 + xvpackev.d xr9, xr2, xr0 + xvpackod.d xr0, xr2, xr0 + xvpickev.h xr0, xr0, xr9 + xvsrari.h xr0, xr0, 2 // 56 64 72 80 57 65 73 81 58 66 74 82 59 67 75 83 + + xvpackev.w xr1, xr13, xr11 + xvpackev.w xr3, xr17, xr15 + xvpackev.d xr9, xr3, xr1 + xvpackod.d xr1, xr3, xr1 + xvpickev.h xr1, xr1, xr9 + xvsrari.h xr1, xr1, 2 // 60 68 76 84 61 69 77 85 62 70 78 86 63 71 79 87 + + xvextrins.h xr18, xr0, 0x70 // 0 8 16 24 32 40 48 (56) 2 10 18 26 34 42 50 (58) + xvextrins.h xr19, xr0, 0x74 // 1 9 17 25 33 41 49 (57) 3 11 19 27 35 43 51 (59) + xvextrins.h xr20, xr1, 0x70 + xvextrins.h xr21, xr1, 0x74 + + 
//h - 1 + xvdp2.w.h xr10, xr18, xr8 + xvdp2.w.h xr11, xr19, xr8 + xvdp2.w.h xr12, xr20, xr8 + xvdp2.w.h xr13, xr21, xr8 + + HADDWQW xr10 + HADDWQW xr11 + HADDWQW xr12 + HADDWQW xr13 + + xvpackev.w xr2, xr11, xr10 //0 1 * * 2 3 * * + xvpackev.w xr3, xr13, xr12 //4 5 * * 6 7 * * + xvpackev.d xr2, xr3, xr2 //0 1 4 5 2 3 6 7 + //h - 2 + xvbsrl.v xr4, xr18, 2 + xvbsrl.v xr5, xr19, 2 + xvbsrl.v xr6, xr20, 2 + xvbsrl.v xr7, xr21, 2 + xvextrins.h xr4, xr0, 0x71 + xvextrins.h xr5, xr0, 0x75 + xvextrins.h xr6, xr1, 0x71 + xvextrins.h xr7, xr1, 0x75 + + xvdp2.w.h xr10, xr4, xr8 + xvdp2.w.h xr11, xr5, xr8 + xvdp2.w.h xr12, xr6, xr8 + xvdp2.w.h xr13, xr7, xr8 + + HADDWQW xr10 + HADDWQW xr11 + HADDWQW xr12 + HADDWQW xr13 + + xvpackev.w xr14, xr11, xr10 + xvpackev.w xr15, xr13, xr12 + xvpackev.d xr14, xr15, xr14 //8 9 12 13 10 11 14 15 + //h - 3 + xvbsrl.v xr4, xr4, 2 + xvbsrl.v xr5, xr5, 2 + xvbsrl.v xr6, xr6, 2 + xvbsrl.v xr7, xr7, 2 + xvextrins.h xr4, xr0, 0x72 + xvextrins.h xr5, xr0, 0x76 + xvextrins.h xr6, xr1, 0x72 + xvextrins.h xr7, xr1, 0x76 + + xvdp2.w.h xr10, xr4, xr8 + xvdp2.w.h xr11, xr5, xr8 + xvdp2.w.h xr12, xr6, xr8 + xvdp2.w.h xr13, xr7, xr8 + + HADDWQW xr10 + HADDWQW xr11 + HADDWQW xr12 + HADDWQW xr13 + + xvpackev.w xr15, xr11, xr10 + xvpackev.w xr16, xr13, xr12 + xvpackev.d xr15, xr16, xr15 //16 17 20 21 18 19 22 23 + //h - 4 + xvbsrl.v xr4, xr4, 2 + xvbsrl.v xr5, xr5, 2 + xvbsrl.v xr6, xr6, 2 + xvbsrl.v xr7, xr7, 2 + xvextrins.h xr4, xr0, 0x73 + xvextrins.h xr5, xr0, 0x77 + xvextrins.h xr6, xr1, 0x73 + xvextrins.h xr7, xr1, 0x77 + + xvdp2.w.h xr10, xr4, xr8 + xvdp2.w.h xr11, xr5, xr8 + xvdp2.w.h xr12, xr6, xr8 + xvdp2.w.h xr13, xr7, xr8 + + HADDWQW xr10 + HADDWQW xr11 + HADDWQW xr12 + HADDWQW xr13 + + xvpackev.w xr16, xr11, xr10 + xvpackev.w xr17, xr13, xr12 + xvpackev.d xr16, xr17, xr16 //24 25 28 29 26 27 30 31 + + xvsrari.w xr2, xr2, 6 + xvsrari.w xr14, xr14, 6 + xvsrari.w xr15, xr15, 6 + xvsrari.w xr16, xr16, 6 + + xvpermi.d xr2, xr2, 0xd8 + xvpermi.d xr14, xr14, 0xd8 + xvpermi.d xr15, xr15, 0xd8 + xvpermi.d xr16, xr16, 0xd8 + xvpickev.h xr2, xr14, xr2 + xvpickev.h xr3, xr16, xr15 + xvpermi.d xr2, xr2, 0xd8 + xvpermi.d xr3, xr3, 0xd8 + + xvpermi.q xr10, xr2, 0x31 + xvpermi.q xr11, xr3, 0x31 + + vst vr2, a0, 0 + vstx vr10, a0, t7 //32 + slli.w t1, t7, 1 //64 + vstx vr3, a0, t1 + add.w t1, t1, t7 //96 + vstx vr11, a0, t1 + slli.w t1, t7, 2 //128 + add.d a0, a0, t1 + + xvbsrl.v xr18, xr4, 2 + xvbsrl.v xr19, xr5, 2 + xvbsrl.v xr20, xr6, 2 + xvbsrl.v xr21, xr7, 2 + + addi.d a4, a4, -4 + bnez a4, .l_\lable\()hv_8w_loop + + addi.d a1, t0, 8 + addi.d t0, t0, 8 + addi.d a0, t8, 16 + addi.d t8, t8, 16 + addi.d a4, t5, 0 + addi.d a3, a3, -8 + bnez a3, .l_\lable\()hv_8w_loop0 + b .l_\lable\()end_pre_8tap +.l_\lable\()v: + + srli.w a7, a7, 2 + blt t0, a4, .l_\lable\()v_idx_fv + andi a7, a7, 1 + addi.w a7, a7, 3 +.l_\lable\()v_idx_fv: + addi.w t5, zero, 120 + mul.w a7, a7, t5 + addi.w t5, a6, -1 + slli.w t5, t5, 3 + add.w a7, a7, t5 + add.d a7, t6, a7 //fv's offset + xvldrepl.d xr8, a7, 0 + + sub.d a1, a1, t3 + beq a3, t0, .l_\lable\()v_4w + blt t0, a3, .l_\lable\()v_8w +.l_\lable\()v_4w: + fld.s f0, a1, 0 + fldx.s f1, a1, a2 + fldx.s f2, a1, t2 + add.d a1, a1, t3 + fld.s f3, a1, 0 + fldx.s f4, a1, a2 + fldx.s f5, a1, t2 + fldx.s f6, a1, t3 + + xvilvl.b xr0, xr1, xr0 // 0 1 8 9 16 17 24 25 + xvilvl.b xr1, xr3, xr2 // 2 3 10 11 18 19 26 27 + xvilvl.b xr2, xr5, xr4 // 4 5 12 13 20 21 28 29 + xvilvl.b xr3, xr7, xr6 // 6 7 14 15 22 23 30 31 + xvilvl.h xr0, xr1, xr0 // 0 1 2 3 8 9 10 11 16 17 18 19 
24 25 26 27 + xvilvl.h xr1, xr3, xr2 // 4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31 + xvilvl.w xr2, xr1, xr0 + xvilvh.w xr0, xr1, xr0 + xvpermi.q xr0, xr2, 0x20 + +.l_\lable\()v_4w_loop: + add.d a1, a1, t4 + fld.s f7, a1, 0 //h0 + fldx.s f10, a1, a2 //h1 + fldx.s f11, a1, t2 //h2 + fldx.s f12, a1, t3 //h3 + + xvbsrl.v xr9, xr7, 2 + xvpermi.q xr9, xr7, 0x20 + xvextrins.b xr0, xr9, 0x70 + xvextrins.b xr0, xr9, 0xf1 + + xvbsrl.v xr1, xr0, 1 + xvbsrl.v xr7, xr10, 2 + xvpermi.q xr7, xr10, 0x20 + xvextrins.b xr1, xr7, 0x70 + xvextrins.b xr1, xr7, 0xf1 + + xvbsrl.v xr2, xr1, 1 + xvbsrl.v xr7, xr11, 2 + xvpermi.q xr7, xr11, 0x20 + xvextrins.b xr2, xr7, 0x70 + xvextrins.b xr2, xr7, 0xf1 + + xvbsrl.v xr3, xr2, 1 + xvbsrl.v xr7, xr12, 2 + xvpermi.q xr7, xr12, 0x20 + xvextrins.b xr3, xr7, 0x70 + xvextrins.b xr3, xr7, 0xf1 + xvbsrl.v xr4, xr3, 1 + + xvdp2.h.bu.b xr10, xr0, xr8 + xvdp2.h.bu.b xr11, xr1, xr8 + xvdp2.h.bu.b xr12, xr2, xr8 + xvdp2.h.bu.b xr13, xr3, xr8 + HADDWDH xr10 + HADDWDH xr11 + HADDWDH xr12 + HADDWDH xr13 + xvpickev.w xr10, xr11, xr10 + xvpickev.w xr11, xr13, xr12 + xvpermi.d xr10, xr10, 0xd8 + xvpermi.d xr11, xr11, 0xd8 + xvpickev.h xr10, xr11, xr10 + xvpermi.d xr10, xr10, 0xd8 + xvsrari.h xr10, xr10, 2 + + xvaddi.bu xr0, xr4, 0 + + xvst xr10, a0, 0 + addi.d a0, a0, 32 + addi.w a4, a4, -4 + bnez a4, .l_\lable\()v_4w_loop + b .l_\lable\()end_pre_8tap + +.l_\lable\()v_8w: + addi.d t0, a1, 0 + addi.d t5, a4, 0 + srli.w t7, a3, 2 + slli.w t7, t7, 3 + addi.d t8, a0, 0 +.l_\lable\()v_8w_loop0: + fld.s f0, a1, 0 + fldx.s f1, a1, a2 + fldx.s f2, a1, t2 + add.d a1, a1, t3 + fld.s f3, a1, 0 + fldx.s f4, a1, a2 + fldx.s f5, a1, t2 + fldx.s f6, a1, t3 + + xvilvl.b xr0, xr1, xr0 // 0 1 8 9 16 17 24 25 + xvilvl.b xr1, xr3, xr2 // 2 3 10 11 18 19 26 27 + xvilvl.b xr2, xr5, xr4 // 4 5 12 13 20 21 28 29 + xvilvl.b xr3, xr7, xr6 // 6 7 14 15 22 23 30 31 + xvilvl.h xr0, xr1, xr0 // 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27 + xvilvl.h xr1, xr3, xr2 // 4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31 + xvilvl.w xr2, xr1, xr0 + xvilvh.w xr0, xr1, xr0 + xvpermi.q xr0, xr2, 0x20 + +.l_\lable\()v_8w_loop: + add.d a1, a1, t4 + fld.s f7, a1, 0 //h0 + fldx.s f10, a1, a2 //h1 + fldx.s f11, a1, t2 //h2 + fldx.s f12, a1, t3 //h3 + + xvbsrl.v xr9, xr7, 2 + xvpermi.q xr9, xr7, 0x20 + xvextrins.b xr0, xr9, 0x70 + xvextrins.b xr0, xr9, 0xf1 + + xvbsrl.v xr1, xr0, 1 + xvbsrl.v xr7, xr10, 2 + xvpermi.q xr7, xr10, 0x20 + xvextrins.b xr1, xr7, 0x70 + xvextrins.b xr1, xr7, 0xf1 + + xvbsrl.v xr2, xr1, 1 + xvbsrl.v xr7, xr11, 2 + xvpermi.q xr7, xr11, 0x20 + xvextrins.b xr2, xr7, 0x70 + xvextrins.b xr2, xr7, 0xf1 + + xvbsrl.v xr3, xr2, 1 + xvbsrl.v xr7, xr12, 2 + xvpermi.q xr7, xr12, 0x20 + xvextrins.b xr3, xr7, 0x70 + xvextrins.b xr3, xr7, 0xf1 + xvbsrl.v xr4, xr3, 1 + + xvdp2.h.bu.b xr10, xr0, xr8 + xvdp2.h.bu.b xr11, xr1, xr8 + xvdp2.h.bu.b xr12, xr2, xr8 + xvdp2.h.bu.b xr13, xr3, xr8 + HADDWDH xr10 + HADDWDH xr11 + HADDWDH xr12 + HADDWDH xr13 + xvpickev.w xr10, xr11, xr10 + xvpickev.w xr11, xr13, xr12 + xvpermi.d xr10, xr10, 0xd8 + xvpermi.d xr11, xr11, 0xd8 + xvpickev.h xr10, xr11, xr10 + xvpermi.d xr10, xr10, 0xd8 + xvsrari.h xr10, xr10, 2 + + xvaddi.bu xr0, xr4, 0 + + xvstelm.d xr10, a0, 0, 0 + add.d a0, a0, t7 + xvstelm.d xr10, a0, 0, 1 + add.d a0, a0, t7 + xvstelm.d xr10, a0, 0, 2 + add.d a0, a0, t7 + xvstelm.d xr10, a0, 0, 3 + add.d a0, a0, t7 + addi.w a4, a4, -4 + bnez a4, .l_\lable\()v_8w_loop + + addi.d a1, t0, 4 + addi.d t0, t0, 4 + addi.d a0, t8, 8 + addi.d t8, t8, 8 + addi.d a4, t5, 0 + addi.d a3, a3, -4 + bnez 
a3, .l_\lable\()v_8w_loop0 + +.l_\lable\()end_pre_8tap: +.endm + +function prep_8tap_regular_8bpc_lasx + addi.w a7, zero, 0 + PREP_8TAP_8BPC_LASX 0 +endfunc + +function prep_8tap_smooth_regular_8bpc_lasx + addi.w a7, zero, 1 + PREP_8TAP_8BPC_LASX 1 +endfunc + +function prep_8tap_sharp_regular_8bpc_lasx + addi.w a7, zero, 2 + PREP_8TAP_8BPC_LASX 2 +endfunc + +function prep_8tap_regular_smooth_8bpc_lasx + addi.w a7, zero, 4 + PREP_8TAP_8BPC_LASX 4 +endfunc + +function prep_8tap_smooth_8bpc_lasx + addi.w a7, zero, 5 + PREP_8TAP_8BPC_LASX 5 +endfunc + +function prep_8tap_sharp_smooth_8bpc_lasx + addi.w a7, zero, 6 + PREP_8TAP_8BPC_LASX 6 +endfunc + +function prep_8tap_regular_sharp_8bpc_lasx + addi.w a7, zero, 8 + PREP_8TAP_8BPC_LASX 8 +endfunc + +function prep_8tap_smooth_sharp_8bpc_lasx + addi.w a7, zero, 9 + PREP_8TAP_8BPC_LASX 9 +endfunc + +function prep_8tap_sharp_8bpc_lasx + addi.w a7, zero, 10 + PREP_8TAP_8BPC_LASX 10 +endfunc diff --git a/src/loongarch/mc.h b/src/loongarch/mc.h index d5ac00f..c64b7ef 100644 --- a/src/loongarch/mc.h +++ b/src/loongarch/mc.h @@ -61,6 +61,16 @@ decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, lasx)); decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, lasx)); decl_w_mask_fn(BF(dav1d_w_mask_420, lasx)); +decl_mct_fn(BF(dav1d_prep_8tap_regular, lasx)); +decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, lasx)); +decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, lasx)); +decl_mct_fn(BF(dav1d_prep_8tap_smooth, lasx)); +decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, lasx)); +decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, lasx)); +decl_mct_fn(BF(dav1d_prep_8tap_sharp, lasx)); +decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, lasx)); +decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, lasx)); + static ALWAYS_INLINE void mc_dsp_init_loongarch(Dav1dMCDSPContext *const c) { #if BITDEPTH == 8 const unsigned flags = dav1d_get_cpu_flags(); @@ -93,6 +103,15 @@ static ALWAYS_INLINE void mc_dsp_init_loongarch(Dav1dMCDSPContext *const c) { c->warp8x8t = BF(dav1d_warp_affine_8x8t, lasx); c->w_mask[2] = BF(dav1d_w_mask_420, lasx); + init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, lasx); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, lasx); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, lasx); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, lasx); + init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, lasx); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, lasx); + init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, lasx); + init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, lasx); + init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, lasx); #endif } -- cgit v1.2.3 From 78a776d253c8aae4e6f68d13780edf5eb0b3f7a3 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Fri, 1 Dec 2023 11:28:28 +0800 Subject: loongarch: Improve the performance of loopfilter_8bpc series functions Relative speedup over C code: lpf_h_sb_uv_w4_8bpc_c: 25.3 ( 1.00x) lpf_h_sb_uv_w4_8bpc_lsx: 6.7 ( 3.79x) lpf_h_sb_uv_w6_8bpc_c: 36.5 ( 1.00x) lpf_h_sb_uv_w6_8bpc_lsx: 11.0 ( 3.31x) lpf_h_sb_y_w4_8bpc_c: 47.7 ( 1.00x) lpf_h_sb_y_w4_8bpc_lsx: 12.5 ( 3.82x) lpf_h_sb_y_w8_8bpc_c: 81.9 ( 1.00x) lpf_h_sb_y_w8_8bpc_lsx: 22.2 ( 3.69x) lpf_h_sb_y_w16_8bpc_c: 85.1 ( 1.00x) lpf_h_sb_y_w16_8bpc_lsx: 18.1 ( 4.70x) lpf_v_sb_uv_w4_8bpc_c: 25.3 ( 1.00x) lpf_v_sb_uv_w4_8bpc_lsx: 5.7 ( 4.43x) lpf_v_sb_uv_w6_8bpc_c: 37.6 ( 1.00x) lpf_v_sb_uv_w6_8bpc_lsx: 9.5 ( 3.97x) lpf_v_sb_y_w4_8bpc_c: 59.4 ( 1.00x) lpf_v_sb_y_w4_8bpc_lsx: 15.7 ( 3.78x) lpf_v_sb_y_w8_8bpc_c: 94.5 ( 1.00x) 
lpf_v_sb_y_w8_8bpc_lsx: 29.4 ( 3.21x) lpf_v_sb_y_w16_8bpc_c: 97.8 ( 1.00x) lpf_v_sb_y_w16_8bpc_lsx: 36.3 ( 2.70x) --- src/loongarch/loopfilter.S | 1108 ++++++++++++++++++++++++++++++++++++++++++++ src/loongarch/loopfilter.h | 52 +++ src/loopfilter_tmpl.c | 4 + src/meson.build | 1 + 4 files changed, 1165 insertions(+) create mode 100644 src/loongarch/loopfilter.S create mode 100644 src/loongarch/loopfilter.h diff --git a/src/loongarch/loopfilter.S b/src/loongarch/loopfilter.S new file mode 100644 index 0000000..e71d5a7 --- /dev/null +++ b/src/loongarch/loopfilter.S @@ -0,0 +1,1108 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * Copyright © 2023, Loongson Technology Corporation Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/loongarch/loongson_asm.S" + +.macro FILTER_W4 DIR, TYPE +.ifc \DIR, h + addi.d t5, a0, -2 + fld.s f6, t5, 0 //p1 p0 q0 q1 + fldx.s f7, t5, a1 + alsl.d t5, a1, t5, 1 + fld.s f8, t5, 0 + fldx.s f9, t5, a1 + + vilvl.b vr6, vr7, vr6 + vilvl.b vr7, vr9, vr8 + vilvl.h vr6, vr7, vr6 //p1p1p1p1 + vbsrl.v vr7, vr6, 4 //p0p0p0p0 + vbsrl.v vr8, vr7, 4 //q0q0q0q0 + vbsrl.v vr9, vr8, 4 //q1q1q1q1 +.else + sub.d t5, a0, a1 + fld.s f7, t5, 0 + sub.d t5, t5, a1 + fld.s f6, t5, 0 + fld.s f8, a0, 0 + fldx.s f9, a0, a1 +.endif + + vabsd.bu vr10, vr6, vr7 // (p1 - p0) + vabsd.bu vr11, vr9, vr8 // (q1 - q0) + vabsd.bu vr12, vr7, vr8 // (p0 - q0) + vabsd.bu vr13, vr6, vr9 // (p1 - q1) + + vmax.bu vr14, vr10, vr11 + vsle.bu vr15, vr14, vr4 //abs(p1 - p0) <= I && abs(q1 - q0) <= I + vsadd.bu vr16, vr12, vr12 + vsrli.b vr17, vr13, 1 + vsadd.bu vr16, vr16, vr17 //abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) + vsle.bu vr16, vr16, vr3 + vand.v vr20, vr15, vr16 //fm + + vpickve2gr.wu t5, vr20, 0 + beqz t5, .END_FILTER_\DIR\()\TYPE\()_W4 + + vslt.bu vr16, vr2, vr14 //hev + + vsllwil.h.b vr30, vr20, 0 //expand fm to w + vsllwil.w.h vr30, vr30, 0 + + vsllwil.hu.bu vr17, vr6, 0 + vsllwil.hu.bu vr18, vr9, 0 + vsub.h vr17, vr17, vr18 + vssrarni.b.h vr17, vr17, 0 //f = iclip_diff(p1 - q1) + + vand.v vr17, vr17, vr16 + vsllwil.h.b vr18, vr17, 0 + + vsllwil.hu.bu vr10, vr8, 0 + vsllwil.hu.bu vr11, vr7, 0 + vsub.h vr10, vr10, vr11 + + vsadd.h vr11, vr10, vr10 + vsadd.h vr10, vr10, vr11 //3 * (q0 - p0) + vsadd.h vr10, vr10, vr18 //f = iclip_diff(3 * (q0 - p0) + f); + vssrani.b.h vr10, vr10, 0 + vsllwil.h.b vr10, vr10, 0 + + vaddi.hu vr11, vr10, 4 + vaddi.hu vr12, vr10, 3 + li.w t5, 127 + vreplgr2vr.h vr13, t5 + vmin.h vr11, vr11, vr13 + vmin.h vr12, vr12, vr13 + vsrai.h vr11, vr11, 3 //f1 + vsrai.h vr12, vr12, 3 //f2 + + vsllwil.hu.bu vr13, vr7, 0 //p0 + vsllwil.hu.bu vr14, vr8, 0 //q0 + vsadd.h vr13, vr13, vr12 + vssub.h vr14, vr14, vr11 + vssrani.bu.h vr13, vr13, 0 //dst-1 + vssrani.bu.h vr14, vr14, 0 //dst+0 + + vsrari.h vr15, vr11, 1 //f + vsllwil.hu.bu vr18, vr6, 0 //p1 + vsllwil.hu.bu vr19, vr9, 0 //q1 + vsadd.h vr18, vr18, vr15 + vssub.h vr19, vr19, vr15 + vssrani.bu.h vr18, vr18, 0 //dst-2 + vssrani.bu.h vr19, vr19, 0 //dst+1 + vbitsel.v vr26, vr18, vr6, vr16 + vbitsel.v vr29, vr19, vr9, vr16 + + vbitsel.v vr6, vr6, vr26, vr20 + vbitsel.v vr7, vr7, vr13, vr20 + vbitsel.v vr8, vr8, vr14, vr20 + vbitsel.v vr9, vr9, vr29, vr20 + +.ifc \DIR, h + vilvl.b vr6, vr7, vr6 + vilvl.b vr9, vr9, vr8 + vilvl.h vr6, vr9, vr6 + + addi.d t5, a0, -2 + vstelm.w vr6, t5, 0, 0 + add.d t5, t5, a1 + vstelm.w vr6, t5, 0, 1 + add.d t5, t5, a1 + vstelm.w vr6, t5, 0, 2 + add.d t5, t5, a1 + vstelm.w vr6, t5, 0, 3 +.else + fst.s f8, a0, 0 + fstx.s f9, a0, a1 + sub.d t5, a0, a1 + fst.s f7, t5, 0 + sub.d t5, t5, a1 + fst.s f6, t5, 0 +.endif +.END_FILTER_\DIR\()\TYPE\()_W4: +.endm + +.macro FILTER_W6 DIR, TYPE +.ifc \DIR, h + addi.d t5, a0, -3 + fld.d f6, t5, 0 //p2 p1 p0 q0 q1 q2 + fldx.d f7, t5, a1 + alsl.d t5, a1, t5, 1 + fld.d f8, t5, 0 + fldx.d f9, t5, a1 + + vilvl.b vr6, vr7, vr6 + vilvl.b vr7, vr9, vr8 + vilvh.h vr10, vr7, vr6 + vilvl.h vr6, vr7, vr6 + + vbsrl.v vr7, vr6, 4 //p1 + vbsrl.v vr8, vr7, 4 //p0 + vbsrl.v vr9, vr8, 4 //q0 + vbsrl.v vr11, vr10, 4 //q2 +.else + alsl.d t5, a1, a1, 1 + sub.d t5, a0, t5 + fld.d f6, t5, 0 + fldx.d f7, t5, a1 + alsl.d t5, a1, t5, 1 + fld.d f8, t5, 0 + fldx.d f9, t5, a1 + alsl.d t5, a1, t5, 1 + fld.d f10, t5, 0 + fldx.d f11, t5, a1 +.endif + + vabsd.bu vr12, vr7, vr8 //abs(p1-p0) + vabsd.bu 
vr13, vr10, vr9 //abs(q1-q0) + vmax.bu vr14, vr12, vr13 + vslt.bu vr2, vr2, vr14 //hev + vabsd.bu vr12, vr6, vr7 //abs(p2-p1) + vmax.bu vr12, vr12, vr14 + vabsd.bu vr13, vr11, vr10 //abs(q2-q1) + vmax.bu vr12, vr12, vr13 + vsle.bu vr0, vr12, vr4 // <=I + + vabsd.bu vr13, vr8, vr9 //abs(p0-q0) + vsadd.bu vr13, vr13, vr13 + vabsd.bu vr15, vr7, vr10 + vsrli.b vr15, vr15, 1 + vsadd.bu vr13, vr13, vr15 + vsle.bu vr13, vr13, vr3 //abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E + vand.v vr0, vr0, vr13 //fm + + vpickve2gr.wu t5, vr0, 0 + beqz t5, .END_FILTER_\DIR\()\TYPE\()_W6 + + vabsd.bu vr12, vr6, vr8 //abs(p2-p0) + vabsd.bu vr13, vr11, vr9 //abs(q2-q0) + vmax.bu vr12, vr12, vr14 + vmax.bu vr12, vr12, vr13 + vxor.v vr13, vr13, vr13 + vaddi.bu vr13, vr13, 1 + vsle.bu vr1, vr12, vr13 //flat8in + + //6789 10 11 --expand to h + vsllwil.hu.bu vr12, vr6, 0 + vsllwil.hu.bu vr13, vr7, 0 + vsllwil.hu.bu vr14, vr8, 0 + vsllwil.hu.bu vr15, vr9, 0 + vsllwil.hu.bu vr16, vr10, 0 + vsllwil.hu.bu vr17, vr11, 0 + + //dst-2 + vsadd.hu vr18, vr12, vr12 + vsadd.hu vr18, vr18, vr12 + vsadd.hu vr18, vr18, vr13 + vsadd.hu vr18, vr18, vr13 + vsadd.hu vr18, vr18, vr14 + vsadd.hu vr18, vr18, vr14 + vsadd.hu vr18, vr18, vr15 + + //dst-1 + vsadd.hu vr19, vr18, vr15 + vsadd.hu vr19, vr19, vr16 + vssub.hu vr19, vr19, vr12 + vssub.hu vr19, vr19, vr12 + + //dst+0 + vsadd.hu vr20, vr19, vr17 + vsadd.hu vr20, vr20, vr16 + vssub.hu vr20, vr20, vr12 + vssub.hu vr20, vr20, vr13 + + //dst+1 + vsadd.hu vr21, vr20, vr17 + vsadd.hu vr21, vr21, vr17 + vssub.hu vr21, vr21, vr13 + vssub.hu vr21, vr21, vr14 + + vsrari.h vr18, vr18, 3 + vsrari.h vr19, vr19, 3 + vsrari.h vr20, vr20, 3 + vsrari.h vr21, vr21, 3 + + vsub.h vr22, vr13, vr16 + vssrani.b.h vr22, vr22, 0 + vand.v vr22, vr22, vr2 + vsllwil.h.b vr22, vr22, 0 //f = iclip_diff(p1 - q1); + + vsub.h vr23, vr15, vr14 + vsadd.h vr24, vr23, vr23 + vsadd.h vr23, vr23, vr24 + vsadd.h vr23, vr23, vr22 + vssrani.b.h vr23, vr23, 0 + vsllwil.h.b vr23, vr23, 0 //f = iclip_diff(3 * (q0 - p0) + f); + + vaddi.hu vr24, vr23, 4 + vaddi.hu vr25, vr23, 3 + li.w t5, 127 + vreplgr2vr.h vr3, t5 + vmin.h vr24, vr24, vr3 + vmin.h vr25, vr25, vr3 + vsrai.h vr24, vr24, 3 //f1 + vsrai.h vr25, vr25, 3 //f2 + + vsadd.h vr26, vr14, vr25 //dst-1 + vssub.h vr27, vr15, vr24 //dst+0 + + vsrari.h vr24, vr24, 1 + vsadd.h vr28, vr13, vr24 + vssub.h vr29, vr16, vr24 + vsllwil.h.b vr2, vr2, 0 + vbitsel.v vr28, vr28, vr13, vr2 //dst-2 + vbitsel.v vr29, vr29, vr16, vr2 //dst+1 + + //flat8in + vsllwil.h.b vr1, vr1, 0 + vbitsel.v vr18, vr28, vr18, vr1 + vbitsel.v vr19, vr26, vr19, vr1 + vbitsel.v vr20, vr27, vr20, vr1 + vbitsel.v vr21, vr29, vr21, vr1 + + vssrani.bu.h vr18, vr18, 0 + vssrani.bu.h vr19, vr19, 0 + vssrani.bu.h vr20, vr20, 0 + vssrani.bu.h vr21, vr21, 0 + + vbitsel.v vr7, vr7, vr18, vr0 //p1 + vbitsel.v vr8, vr8, vr19, vr0 //p0 + vbitsel.v vr9, vr9, vr20, vr0 //q0 + vbitsel.v vr10, vr10, vr21, vr0 //q1 + +.ifc \DIR, h + vilvl.b vr7, vr8, vr7 + vilvl.b vr9, vr10, vr9 + vilvl.h vr7, vr9, vr7 + + addi.d t5, a0, -2 + vstelm.w vr7, t5, 0, 0 + add.d t5, t5, a1 + vstelm.w vr7, t5, 0, 1 + add.d t5, t5, a1 + vstelm.w vr7, t5, 0, 2 + add.d t5, t5, a1 + vstelm.w vr7, t5, 0, 3 +.else + fst.s f9, a0, 0 + fstx.s f10, a0, a1 + sub.d t5, a0, a1 + fst.s f8, t5, 0 + sub.d t5, t5, a1 + fst.s f7, t5, 0 +.endif +.END_FILTER_\DIR\()\TYPE\()_W6: +.endm + +.macro FILTER_W8 DIR, TYPE +.ifc \DIR, h + addi.d t5, a0, -4 + fld.d f6, t5, 0 //p3 p2 p1 p0 q0 q1 q2 q3 + fldx.d f7, t5, a1 + alsl.d t5, a1, t5, 1 + fld.d f8, t5, 0 + fldx.d f9, t5, a1 
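+    // Transpose the four 8-byte rows so that vr6..vr13 each hold the same
+    // column (p3, p2, p1, p0, q0, q1, q2, q3) across all four rows.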
+ + vilvl.b vr6, vr7, vr6 + vilvl.b vr7, vr9, vr8 + vilvh.h vr10, vr7, vr6 //q0 + vilvl.h vr6, vr7, vr6 //p3 + vbsrl.v vr7, vr6, 4 //p2 + vbsrl.v vr8, vr6, 8 //p1 + vbsrl.v vr9, vr6, 12 //p0 + vbsrl.v vr11, vr10, 4 //q1 + vbsrl.v vr12, vr10, 8 //q2 + vbsrl.v vr13, vr10, 12 //q3 +.else + fld.s f10, a0, 0 + fldx.s f11, a0, a1 + add.d t5, a0, a1 + fldx.s f12, t5, a1 + add.d t5, t5, a1 + fldx.s f13, t5, a1 + sub.d t5, a0, a1 + fld.s f9, t5, 0 + sub.d t5, t5, a1 + fld.s f8, t5, 0 + sub.d t5, t5, a1 + fld.s f7, t5, 0 + sub.d t5, t5, a1 + fld.s f6, t5, 0 +.endif + + vabsd.bu vr14, vr8, vr9 //p1-p0 + vabsd.bu vr15, vr11, vr10 //q1-q0 + vabsd.bu vr16, vr9, vr10 //p0-q0 + vabsd.bu vr17, vr8, vr11 //p1-q1 + vabsd.bu vr18, vr7, vr8 //p2-p1 + vabsd.bu vr19, vr12, vr11 //q2-q1 + vabsd.bu vr20, vr6, vr7 //p3-p2 + vabsd.bu vr21, vr13, vr12 //q3-q2 + + vmax.bu vr22, vr14, vr15 + vsle.bu vr23, vr22, vr4 //abs(p1 - p0) <= I && abs(q1 - q0) <= I + vsadd.bu vr16, vr16, vr16 + vsrli.b vr17, vr17, 1 + vsadd.bu vr16, vr16, vr17 + vsle.bu vr16, vr16, vr3 //abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E + vand.v vr16, vr16, vr23 //fm + + vpickve2gr.wu t5, vr16, 0 + beqz t5, .END_FILTER_\DIR\()\TYPE\()_W8 + + vmax.bu vr23, vr18, vr19 + vmax.bu vr23, vr23, vr20 + vmax.bu vr23, vr23, vr21 + vsle.bu vr23, vr23, vr4 + vand.v vr16, vr16, vr23 //fm + + vabsd.bu vr17, vr7, vr9 //abs(p2-p0) + vabsd.bu vr18, vr12, vr10 //abs(q2-q0) + vmax.bu vr17, vr17, vr14 + vmax.bu vr17, vr17, vr15 + vmax.bu vr17, vr17, vr18 + vabsd.bu vr18, vr6, vr9 //abs(p3 - p0) + vabsd.bu vr19, vr13, vr10 //abs(q3 - q0) + vmax.bu vr17, vr17, vr18 + vmax.bu vr17, vr17, vr19 + + vxor.v vr5, vr5, vr5 + vaddi.bu vr5, vr5, 1 //F + vsle.bu vr17, vr17, vr5 //flat8in + + vsllwil.hu.bu vr0, vr6, 0 //p3 + vsllwil.hu.bu vr1, vr7, 0 //p2 + vsllwil.hu.bu vr27, vr8, 0 //p1 + vsllwil.hu.bu vr3, vr9, 0 //p0 + vsllwil.hu.bu vr4, vr10, 0 //q0 + vsllwil.hu.bu vr5, vr11, 0 //q1 + vsllwil.hu.bu vr14, vr12, 0 //q2 + vsllwil.hu.bu vr15, vr13, 0 //q3 + + vsadd.hu vr18, vr0, vr0 //p3+p3 + vsadd.hu vr19, vr15, vr15 //q3+q3 + vsadd.hu vr20, vr0, vr1 //p3+p2 + vsadd.hu vr21, vr1, vr27 //p2+p1 + vsadd.hu vr28, vr27, vr3 //p1+p0 + vsadd.hu vr23, vr3, vr4 //p0+q0 + vsadd.hu vr24, vr4, vr5 //q0+q1 + vsadd.hu vr25, vr5, vr14 //q1+q2 + vsadd.hu vr26, vr14, vr15 //q2+q3 + + // dst-3 + vsadd.hu vr29, vr18, vr20 + vsadd.hu vr29, vr29, vr21 + vsadd.hu vr29, vr29, vr23 + + // dst-2 + vsadd.hu vr30, vr18, vr21 + vsadd.hu vr30, vr30, vr28 + vsadd.hu vr30, vr30, vr24 + + // dst-1 + vsadd.hu vr31, vr20, vr28 + vsadd.hu vr31, vr31, vr23 + vsadd.hu vr31, vr31, vr25 + + // dst+0 + vsadd.hu vr18, vr21, vr23 + vsadd.hu vr18, vr18, vr24 + vsadd.hu vr18, vr18, vr26 + + //dst+1 + vsadd.hu vr20, vr28, vr24 + vsadd.hu vr20, vr20, vr25 + vsadd.hu vr20, vr20, vr19 + + //dst+2 + vsadd.hu vr21, vr23, vr25 + vsadd.hu vr21, vr21, vr26 + vsadd.hu vr21, vr21, vr19 + + vssrarni.bu.h vr23, vr29, 3 + vssrarni.bu.h vr24, vr30, 3 + vssrarni.bu.h vr25, vr31, 3 + vssrarni.bu.h vr19, vr18, 3 + vssrarni.bu.h vr20, vr20, 3 + vssrarni.bu.h vr21, vr21, 3 + + // !flat8in + vslt.bu vr2, vr2, vr22 //hev + + vsub.h vr30, vr27, vr5 //p1-q1 + vssrani.b.h vr30, vr30, 0 + vand.v vr30, vr30, vr2 + vsllwil.h.b vr30, vr30, 0 + + vsub.h vr31, vr4, vr3 + vsadd.h vr0, vr31, vr31 + vsadd.h vr31, vr31, vr0 + vsadd.h vr31, vr31, vr30 + vssrani.b.h vr31, vr31, 0 + vsllwil.h.b vr31, vr31, 0 //f = iclip_diff(3 * (q0 - p0) + f); + + vaddi.hu vr14, vr31, 4 + vaddi.hu vr15, vr31, 3 + li.w t5, 127 + vreplgr2vr.h vr18, t5 + vmin.h vr14, vr14, vr18 + 
vmin.h vr15, vr15, vr18 + vsrai.h vr14, vr14, 3 //f1 + vsrai.h vr15, vr15, 3 //f2 + + vsadd.h vr3, vr3, vr15 + vssub.h vr4, vr4, vr14 + vssrani.bu.h vr3, vr3, 0 //dst-1 + vssrani.bu.h vr4, vr4, 0 //dst+0 + + vsrari.h vr14, vr14, 1 + vsadd.h vr18, vr27, vr14 + vssub.h vr26, vr5, vr14 + vssrani.bu.h vr18, vr18, 0 //dst-2 + vssrani.bu.h vr26, vr26, 0 //dst+1 + + vbitsel.v vr27, vr18, vr8, vr2 //dst-2 + vbitsel.v vr28, vr26, vr11, vr2 //dst+1 + + vbitsel.v vr23, vr7, vr23, vr17 //dst-3 (p2) + vbitsel.v vr24, vr27, vr24, vr17 //dst-2 + vbitsel.v vr25, vr3, vr25, vr17 //dst-1 + vbitsel.v vr19, vr4, vr19, vr17 //dst+0 + vbitsel.v vr20, vr28, vr20, vr17 //dst+1 + vbitsel.v vr21, vr12, vr21, vr17 //dst+2 + + vbitsel.v vr7, vr7, vr23, vr16 //-3 + vbitsel.v vr8, vr8, vr24, vr16 //-2 + vbitsel.v vr9, vr9, vr25, vr16 //-1 + vbitsel.v vr10, vr10, vr19, vr16 //+0 + vbitsel.v vr11, vr11, vr20, vr16 //+1 + vbitsel.v vr12, vr12, vr21, vr16 //+2 + +.ifc \DIR, h + vilvl.b vr6, vr7, vr6 + vilvl.b vr8, vr9, vr8 + vilvl.b vr10, vr11, vr10 + vilvl.b vr12, vr13, vr12 + vilvl.h vr6, vr8, vr6 //p3p2p1p0 -- -- -- + vilvl.h vr10, vr12, vr10 //q0q1q2q3 -- -- -- + vilvl.w vr0, vr10, vr6 //p3p2p1p0q0q1q2q3 -- + vilvh.w vr1, vr10, vr6 //-- + + addi.d t5, a0, -4 + vstelm.d vr0, t5, 0, 0 + add.d t5, t5, a1 + vstelm.d vr0, t5, 0, 1 + add.d t5, t5, a1 + vstelm.d vr1, t5, 0, 0 + add.d t5, t5, a1 + vstelm.d vr1, t5, 0, 1 +.else + alsl.d t5, a1, a1, 1 + sub.d t5, a0, t5 + fst.s f7, t5, 0 + fstx.s f8, t5, a1 + add.d t5, t5, a1 + fstx.s f9, t5, a1 + + fst.s f10, a0, 0 + add.d t5, a0, a1 + fst.s f11, t5, 0 + fstx.s f12, t5, a1 +.endif +.END_FILTER_\DIR\()\TYPE\()_W8: +.endm + +.macro FILTER_W16 DIR, TYPE +.ifc \DIR, h + addi.d t5, a0, -7 + vld vr6, t5, 0 //p6p5p4p3p2p1p0q0 q1q2q3q4q5q6 + vldx vr7, t5, a1 + add.d t5, t5, a1 + vldx vr8, t5, a1 + add.d t5, t5, a1 + vldx vr9, t5, a1 + + vilvl.b vr10, vr7, vr6 + vilvh.b vr11, vr7, vr6 + vilvl.b vr12, vr9, vr8 + vilvh.b vr13, vr9, vr8 + vilvl.h vr6, vr12, vr10 + vilvh.h vr10, vr12, vr10 //p2--- + vilvl.h vr15, vr13, vr11 //q1--- + vilvh.h vr19, vr13, vr11 + + vbsrl.v vr7, vr6, 4 //p5--- + vbsrl.v vr8, vr6, 8 //p4--- + vbsrl.v vr9, vr6, 12 //p3--- + vbsrl.v vr12, vr10, 4 //p1--- + vbsrl.v vr13, vr10, 8 //p0--- + vbsrl.v vr14, vr10, 12 //q0--- + vbsrl.v vr16, vr15, 4 //q2--- + vbsrl.v vr17, vr15, 8 //q3--- + vbsrl.v vr18, vr15, 12 //q4--- + vbsrl.v vr20, vr19, 4 //q6--- +.else + slli.d t5, a1, 3 + sub.d t5, a0, t5 + fldx.s f6, t5, a1 //p6 + alsl.d t5, a1, t5, 1 + fld.s f7, t5, 0 //p5 + fldx.s f8, t5, a1 //p4 + alsl.d t5, a1, t5, 1 + fld.s f9, t5, 0 //p3 + fldx.s f10, t5, a1 //p2 + alsl.d t5, a1, t5, 1 + fld.s f12, t5, 0 //p1 + fldx.s f13, t5, a1 //p0 + alsl.d t5, a1, t5, 1 + fld.s f14, t5, 0 //q0 + fldx.s f15, t5, a1 //q1 + alsl.d t5, a1, t5, 1 + fld.s f16, t5, 0 //q2 + fldx.s f17, t5, a1 //q3 + alsl.d t5, a1, t5, 1 + fld.s f18, t5, 0 //q4 + fldx.s f19, t5, a1 //q5 + add.d t5, t5, a1 + fldx.s f20, t5, a1 //q6 + + //temp store + addi.d sp, sp, -96 + fst.d f7, sp, 0 + fst.d f8, sp, 8 + fst.d f9, sp, 16 + fst.d f10, sp, 24 + fst.d f12, sp, 32 + fst.d f13, sp, 40 + fst.d f14, sp, 48 + fst.d f15, sp, 56 + fst.d f16, sp, 64 + fst.d f17, sp, 72 + fst.d f18, sp, 80 + fst.d f19, sp, 88 +.endif + + vabsd.bu vr21, vr12, vr13 //abs(p1-p0) + vabsd.bu vr22, vr15, vr14 //abs(q1-q0) + vmax.bu vr0, vr21, vr22 + vslt.bu vr2, vr2, vr0 //hev + vabsd.bu vr1, vr10, vr12 //abs(p2-p1) + vmax.bu vr0, vr0, vr1 + vabsd.bu vr1, vr16, vr15 //abs(q2-q1) + vmax.bu vr0, vr0, vr1 + vabsd.bu vr1, vr9, vr10 //abs(p3-p2) + 
vmax.bu vr0, vr0, vr1 + vabsd.bu vr1, vr17, vr16 //abs(q3-q2) + vmax.bu vr0, vr0, vr1 + vsle.bu vr0, vr0, vr4 //vr4 released I + vabsd.bu vr1, vr13, vr14 //abs(p0-q0) + vsadd.bu vr1, vr1, vr1 + vabsd.bu vr4, vr12, vr15 //abs(p1-q1) + vsrli.b vr4, vr4, 1 + vsadd.bu vr1, vr1, vr4 //abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) + vsle.bu vr1, vr1, vr3 //vr3 released E + vand.v vr0, vr0, vr1 //fm + + vpickve2gr.wu t5, vr0, 0 + beqz t5, .END_FILTER_\DIR\()\TYPE\()_W16 + + vabsd.bu vr1, vr6, vr13 //abs(p6-p0) + vabsd.bu vr4, vr7, vr13 //abs(p5-p0) + vmax.bu vr1, vr1, vr4 + vabsd.bu vr4, vr8, vr13 //abs(p4-p0) + vmax.bu vr1, vr1, vr4 + vabsd.bu vr4, vr18, vr14 //abs(q4-q0) + vmax.bu vr1, vr1, vr4 + vabsd.bu vr4, vr19, vr14 //abs(q5-q0) + vmax.bu vr1, vr1, vr4 + vabsd.bu vr4, vr20, vr14 + vmax.bu vr1, vr1, vr4 + vxor.v vr5, vr5, vr5 + vaddi.bu vr5, vr5, 1 //F + vsle.bu vr1, vr1, vr5 //flat8out + + vabsd.bu vr3, vr10, vr13 //abs(p2-p0) + vmax.bu vr3, vr3, vr21 + vmax.bu vr3, vr3, vr22 + vabsd.bu vr4, vr16, vr14 //abs(q2-q0) + vmax.bu vr3, vr3, vr4 + vabsd.bu vr4, vr9, vr13 //abs(p3-p0) + vmax.bu vr3, vr3, vr4 + vabsd.bu vr4, vr17, vr14 //abs(q3-q0) + vmax.bu vr3, vr3, vr4 + vsle.bu vr3, vr3, vr5 //flatin released vr5 + + vsllwil.hu.bu vr6, vr6, 0 //p6 + vsllwil.hu.bu vr7, vr7, 0 //p5 + vsllwil.hu.bu vr8, vr8, 0 //p4 + vsllwil.hu.bu vr9, vr9, 0 //p3 + vsllwil.hu.bu vr10, vr10, 0 //p2 + vsllwil.hu.bu vr12, vr12, 0 //p1 + vsllwil.hu.bu vr13, vr13, 0 //p0 + vsllwil.hu.bu vr14, vr14, 0 //q0 + vsllwil.hu.bu vr15, vr15, 0 //q1 + vsllwil.hu.bu vr16, vr16, 0 //q2 + vsllwil.hu.bu vr17, vr17, 0 //q3 + vsllwil.hu.bu vr18, vr18, 0 //q4 + vsllwil.hu.bu vr19, vr19, 0 //q5 + vsllwil.hu.bu vr20, vr20, 0 //q6 + + //dst-6 + vslli.w vr21, vr6, 3 + vssub.hu vr21, vr21, vr6 + vsadd.hu vr21, vr21, vr7 + vsadd.hu vr21, vr21, vr7 + vsadd.hu vr21, vr21, vr8 + vsadd.hu vr21, vr21, vr8 + vsadd.hu vr21, vr21, vr9 + vsadd.hu vr21, vr21, vr10 + vsadd.hu vr21, vr21, vr12 + vsadd.hu vr21, vr21, vr13 + vsadd.hu vr21, vr21, vr14 + + //dst-5 + vsadd.hu vr22, vr21, vr15 + vsadd.hu vr22, vr22, vr9 + vssub.hu vr22, vr22, vr6 + vssub.hu vr22, vr22, vr6 + + //dst-4 + vsadd.hu vr23, vr22, vr16 + vsadd.hu vr23, vr23, vr10 + vssub.hu vr23, vr23, vr7 + vssub.hu vr23, vr23, vr6 + + //dst-3 + vsadd.hu vr24, vr23, vr12 + vsadd.hu vr24, vr24, vr17 + vssub.hu vr24, vr24, vr6 + vssub.hu vr24, vr24, vr8 + + //dst-2 + vsadd.hu vr25, vr24, vr18 + vsadd.hu vr25, vr25, vr13 + vssub.hu vr25, vr25, vr6 + vssub.hu vr25, vr25, vr9 + + //dst-1 + vsadd.hu vr26, vr25, vr19 + vsadd.hu vr26, vr26, vr14 + vssub.hu vr26, vr26, vr6 + vssub.hu vr26, vr26, vr10 + + //dst+0 + vsadd.hu vr27, vr26, vr20 + vsadd.hu vr27, vr27, vr15 + vssub.hu vr27, vr27, vr6 + vssub.hu vr27, vr27, vr12 + + //dst+1 + vsadd.hu vr28, vr27, vr20 + vsadd.hu vr28, vr28, vr16 + vssub.hu vr28, vr28, vr7 + vssub.hu vr28, vr28, vr13 + + //dst+2 + vsadd.hu vr29, vr28, vr20 + vsadd.hu vr29, vr29, vr17 + vssub.hu vr29, vr29, vr8 + vssub.hu vr29, vr29, vr14 + + //dst+3 + vsadd.hu vr30, vr29, vr20 + vsadd.hu vr30, vr30, vr18 + vssub.hu vr30, vr30, vr9 + vssub.hu vr30, vr30, vr15 + + //dst+4 + vsadd.hu vr31, vr30, vr20 + vsadd.hu vr31, vr31, vr19 + vssub.hu vr31, vr31, vr10 + vssub.hu vr31, vr31, vr16 + + //dst+5 + vsadd.hu vr11, vr31, vr20 + vsadd.hu vr11, vr11, vr20 + vssub.hu vr11, vr11, vr12 + vssub.hu vr11, vr11, vr17 + + vsrari.h vr21, vr21, 4 + vsrari.h vr22, vr22, 4 + vsrari.h vr23, vr23, 4 + vsrari.h vr24, vr24, 4 + vsrari.h vr25, vr25, 4 + vsrari.h vr26, vr26, 4 + vsrari.h vr27, vr27, 4 + vsrari.h 
vr28, vr28, 4 + vsrari.h vr29, vr29, 4 + vsrari.h vr30, vr30, 4 + vsrari.h vr31, vr31, 4 + vsrari.h vr11, vr11, 4 + + vand.v vr1, vr1, vr3 + vsllwil.h.b vr1, vr1, 0 //expand to h + //(flat8out & flat8in) + vbitsel.v vr21, vr7, vr21, vr1 //dst-6 + vbitsel.v vr22, vr8, vr22, vr1 //dst-5 + vbitsel.v vr23, vr9, vr23, vr1 //dst-4 + vbitsel.v vr30, vr17, vr30, vr1 //dst+3 + vbitsel.v vr31, vr18, vr31, vr1 //dst+4 + vbitsel.v vr11, vr19, vr11, vr1 //dst+5 + + //flat8in + //dst-3 + vslli.h vr4, vr9, 1 + vsadd.hu vr4, vr4, vr9 //p3*3 + vsadd.hu vr4, vr4, vr10 + vsadd.hu vr4, vr4, vr10 + vsadd.hu vr4, vr4, vr12 + vsadd.hu vr4, vr4, vr13 + vsadd.hu vr4, vr4, vr14 + + //dst-2 + vsadd.hu vr5, vr4, vr12 + vsadd.hu vr5, vr5, vr15 + vssub.hu vr5, vr5, vr9 + vssub.hu vr5, vr5, vr10 + + //dst-1 + vsadd.hu vr18, vr5, vr13 + vsadd.hu vr18, vr18, vr16 + vssub.hu vr18, vr18, vr9 + vssub.hu vr18, vr18, vr12 + + //dst+0 + vsadd.hu vr7, vr18, vr14 + vsadd.hu vr7, vr7, vr17 + vssub.hu vr7, vr7, vr9 + vssub.hu vr7, vr7, vr13 + + //dst+1 + vsadd.hu vr8, vr7, vr15 + vsadd.hu vr8, vr8, vr17 + vssub.hu vr8, vr8, vr10 + vssub.hu vr8, vr8, vr14 + + //dst+2 + vsadd.hu vr9, vr8, vr16 + vsadd.hu vr9, vr9, vr17 + vssub.hu vr9, vr9, vr12 + vssub.hu vr9, vr9, vr15 + + vsrari.h vr4, vr4, 3 + vsrari.h vr5, vr5, 3 + vsrari.h vr18, vr18, 3 + vsrari.h vr7, vr7, 3 + vsrari.h vr8, vr8, 3 + vsrari.h vr9, vr9, 3 + + //flat8out & flat8in + vbitsel.v vr24, vr4, vr24, vr1 //dst-3 + vbitsel.v vr25, vr5, vr25, vr1 //dst-2 + vbitsel.v vr26, vr18, vr26, vr1 //dst-1 + vbitsel.v vr27, vr7, vr27, vr1 //dst+0 + vbitsel.v vr28, vr8, vr28, vr1 //dst+1 + vbitsel.v vr29, vr9, vr29, vr1 //dst+2 + + //!flat8in + vsub.h vr17, vr12, vr15 //p1-q1 + vsllwil.h.b vr2, vr2, 0 + vand.v vr17, vr17, vr2 //&hev + vssrani.b.h vr17, vr17, 0 + vsllwil.h.b vr17, vr17, 0 + + vsub.h vr7, vr14, vr13 + vsadd.h vr8, vr7, vr7 + vsadd.h vr7, vr7, vr8 + vsadd.h vr7, vr7, vr17 + vssrani.b.h vr7, vr7, 0 + vsllwil.h.b vr17, vr7, 0 //f = iclip_diff(3 * (q0 - p0) + f); + + vaddi.hu vr7, vr17, 4 + vaddi.hu vr8, vr17, 3 + li.w t5, 127 + vreplgr2vr.h vr9, t5 + vmin.h vr7, vr7, vr9 + vmin.h vr8, vr8, vr9 + vsrai.h vr7, vr7, 3 //f1 + vsrai.h vr8, vr8, 3 //f2 + + vsadd.h vr4, vr13, vr8 //dst-1 + vssub.h vr5, vr14, vr7 //dst+0 + + vsrari.h vr7, vr7, 1 + vsadd.h vr17, vr12, vr7 + vssub.h vr7, vr15, vr7 + vbitsel.v vr17, vr17, vr12, vr2 //dst-2 + vbitsel.v vr7, vr7, vr15, vr2 //dst+1 + + //flat8in or !flat8in + vsllwil.h.b vr3, vr3, 0 + vbitsel.v vr24, vr10, vr24, vr3 //dst-3 + vbitsel.v vr25, vr17, vr25, vr3 //dst-2 + vbitsel.v vr26, vr4, vr26, vr3 //dst-1 + vbitsel.v vr27, vr5, vr27, vr3 //dst+0 + vbitsel.v vr28, vr7, vr28, vr3 //dst+1 + vbitsel.v vr29, vr16, vr29, vr3 //dst+2 + +.ifc \DIR, h + //dst-6,dst-2,dst-5,dst-1 + vssrani.bu.h vr25, vr21, 0 + vssrani.bu.h vr26, vr22, 0 + vpermi.w vr25, vr25, 0xd8 + vpermi.w vr26, vr26, 0xd8 + vilvl.b vr6, vr26, vr25 //65656565 21212121 + + //dst-4,dst+0,dst-3,dst+1 + vssrani.bu.h vr27, vr23, 0 + vssrani.bu.h vr28, vr24, 0 + vpermi.w vr27, vr27, 0xd8 + vpermi.w vr28, vr28, 0xd8 + vilvl.b vr26, vr28, vr27 //43434343 01010101 + + vilvl.h vr21, vr26, vr6 //6543 -- -- -- + vilvh.h vr22, vr26, vr6 //2101 -- -- -- + vilvl.w vr20, vr22, vr21 //65432101 -- + vilvh.w vr22, vr22, vr21 //65432101 -- + vreplvei.d vr21, vr20, 1 + vreplvei.d vr23, vr22, 1 + + //dst+2,dst+4,dst+3,dst+5 + vssrani.bu.h vr31, vr29, 0 + vssrani.bu.h vr11, vr30, 0 + vpermi.w vr31, vr31, 0xd8 + vpermi.w vr11, vr11, 0xd8 + vilvl.b vr11, vr11, vr31 //23232323 45454545 + vshuf4i.w vr11, 
vr11, 0xd8 + vshuf4i.h vr11, vr11, 0xd8 //2345 -- -- -- + + vextrins.w vr20, vr11, 0x20 + vextrins.w vr21, vr11, 0x21 + vextrins.w vr22, vr11, 0x22 + vextrins.w vr23, vr11, 0x23 + + addi.d t5, a0, -6 + vld vr6, t5, 0 //p6p5p4p3p2p1p0q0 q1q2q3q4q5q6 + vldx vr7, t5, a1 + add.d t5, t5, a1 + vldx vr8, t5, a1 + add.d t5, t5, a1 + vldx vr9, t5, a1 + + //expand fm to 128 + vreplvei.b vr10, vr0, 0 + vreplvei.b vr11, vr0, 1 + vreplvei.b vr12, vr0, 2 + vreplvei.b vr13, vr0, 3 + + vbitsel.v vr20, vr6, vr20, vr10 + vbitsel.v vr21, vr7, vr21, vr11 + vbitsel.v vr22, vr8, vr22, vr12 + vbitsel.v vr23, vr9, vr23, vr13 + + addi.d t5, a0, -6 + vstelm.d vr20, t5, 0, 0 + vstelm.w vr20, t5, 8, 2 + add.d t5, t5, a1 + vstelm.d vr21, t5, 0, 0 + vstelm.w vr21, t5, 8, 2 + add.d t5, t5, a1 + vstelm.d vr22, t5, 0, 0 + vstelm.w vr22, t5, 8, 2 + add.d t5, t5, a1 + vstelm.d vr23, t5, 0, 0 + vstelm.w vr23, t5, 8, 2 +.else + //reload + fld.d f7, sp, 0 + fld.d f8, sp, 8 + fld.d f9, sp, 16 + fld.d f10, sp, 24 + fld.d f12, sp, 32 + fld.d f13, sp, 40 + fld.d f14, sp, 48 + fld.d f15, sp, 56 + fld.d f16, sp, 64 + fld.d f17, sp, 72 + fld.d f18, sp, 80 + fld.d f19, sp, 88 + + vssrarni.bu.h vr21, vr21, 0 + vssrarni.bu.h vr22, vr22, 0 + vssrarni.bu.h vr23, vr23, 0 + vssrarni.bu.h vr24, vr24, 0 + vssrarni.bu.h vr25, vr25, 0 + vssrarni.bu.h vr26, vr26, 0 + vssrarni.bu.h vr27, vr27, 0 + vssrarni.bu.h vr28, vr28, 0 + vssrarni.bu.h vr29, vr29, 0 + vssrarni.bu.h vr30, vr30, 0 + vssrarni.bu.h vr31, vr31, 0 + vssrarni.bu.h vr11, vr11, 0 + + vbitsel.v vr7, vr7, vr21, vr0 //p5 + vbitsel.v vr8, vr8, vr22, vr0 //p4 + vbitsel.v vr9, vr9, vr23, vr0 //p3 + vbitsel.v vr10, vr10, vr24, vr0 //p2 + vbitsel.v vr12, vr12, vr25, vr0 //p1 + vbitsel.v vr13, vr13, vr26, vr0 //p0 + vbitsel.v vr14, vr14, vr27, vr0 //q0 + vbitsel.v vr15, vr15, vr28, vr0 //q1 + vbitsel.v vr16, vr16, vr29, vr0 //q2 + vbitsel.v vr17, vr17, vr30, vr0 //q3 + vbitsel.v vr18, vr18, vr31, vr0 //q4 + vbitsel.v vr19, vr19, vr11, vr0 //q5 + + fst.s f14, a0, 0 + fstx.s f15, a0, a1 + alsl.d t5, a1, a0, 1 + fst.s f16, t5, 0 + fstx.s f17, t5, a1 + alsl.d t5, a1, t5, 1 + fst.s f18, t5, 0 + fstx.s f19, t5, a1 + + slli.w t5, a1, 2 + alsl.d t5, a1, t5, 1 + sub.d t5, a0, t5 + fst.s f7, t5, 0 + fstx.s f8, t5, a1 + alsl.d t5, a1, t5, 1 + fst.s f9, t5, 0 + fstx.s f10, t5, a1 + alsl.d t5, a1, t5, 1 + fst.s f12, t5, 0 + fstx.s f13, t5, a1 +.endif +.END_FILTER_\DIR\()\TYPE\()_W16: +.ifc \DIR, v + addi.d sp, sp, 96 +.endif +.endm + +.macro PUSH_REG + addi.d sp, sp, -64 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + fst.d f31, sp, 56 +.endm +.macro POP_REG + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + fld.d f30, sp, 48 + fld.d f31, sp, 56 + addi.d sp, sp, 64 +.endm + +.macro LPF_FUNC DIR, TYPE +function lpf_\DIR\()_sb_\TYPE\()_8bpc_lsx + PUSH_REG + vld vr0, a2, 0 //vmask + vpickve2gr.wu t0, vr0, 0 + vpickve2gr.wu t1, vr0, 1 + vpickve2gr.wu t2, vr0, 2 + li.w t3, 1 //y + or t0, t0, t1 +.ifc \TYPE, y + or t0, t0, t2 //vm +.endif + addi.w t8, t3, -1 + andn t8, t0, t8 + beqz t0, .\DIR\()\TYPE\()_END +.\DIR\()\TYPE\()_LOOP: + and t4, t0, t3 //vm & y + beqz t4, .\DIR\()\TYPE\()_LOOP_NEXT + vldrepl.b vr1, a3, 0 //l[0][0] +.ifc \DIR, h + addi.d t5, a3, -4 +.else + slli.d t5, a4, 2 + sub.d t5, a3, t5 +.endif + vldrepl.b vr2, t5, 0 //l[-1][0] + vseqi.b vr3, vr1, 0 + vbitsel.v vr1, vr1, vr2, vr3 //L + vpickve2gr.b t5, vr1, 0 + beqz t5, 
.\DIR\()\TYPE\()_LOOP_NEXT + vsrai.b vr2, vr1, 4 //H + add.d t6, a5, t5 + vldrepl.b vr3, t6, 0 //E + addi.d t6, t6, 64 + vldrepl.b vr4, t6, 0 //I +.ifc \TYPE, y + and t5, t2, t3 + bnez t5, .FILTER_\DIR\()\TYPE\()_16 +.endif + and t5, t1, t3 +.ifc \TYPE, y + bnez t5, .FILTER_\DIR\()\TYPE\()_8 +.else + bnez t5, .FILTER_\DIR\()\TYPE\()_6 +.endif + FILTER_W4 \DIR, \TYPE + b .\DIR\()\TYPE\()_LOOP_NEXT +.ifc \TYPE, uv +.FILTER_\DIR\()\TYPE\()_6: + FILTER_W6 \DIR, \TYPE +.endif +.ifc \TYPE, y +.FILTER_\DIR\()\TYPE\()_8: + FILTER_W8 \DIR, \TYPE + b .\DIR\()\TYPE\()_LOOP_NEXT +.FILTER_\DIR\()\TYPE\()_16: + FILTER_W16 \DIR, \TYPE +.endif +.\DIR\()\TYPE\()_LOOP_NEXT: + slli.w t3, t3, 1 +.ifc \DIR, h + alsl.d a0, a1, a0, 2 + slli.w t8, a4, 2 + add.d a3, a3, t8 +.else + addi.d a0, a0, 4 + addi.d a3, a3, 4 +.endif + addi.w t8, t3, -1 + andn t8, t0, t8 + bnez t8, .\DIR\()\TYPE\()_LOOP +.\DIR\()\TYPE\()_END: + POP_REG +endfunc +.endm + +LPF_FUNC h, y +LPF_FUNC v, y +LPF_FUNC h, uv +LPF_FUNC v, uv diff --git a/src/loongarch/loopfilter.h b/src/loongarch/loopfilter.h new file mode 100644 index 0000000..844faf0 --- /dev/null +++ b/src/loongarch/loopfilter.h @@ -0,0 +1,52 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * Copyright © 2023, Loongson Technology Corporation Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef DAV1D_SRC_LOONGARCH_LOOPFILTER_H +#define DAV1D_SRC_LOONGARCH_LOOPFILTER_H + +#include "src/cpu.h" +#include "src/loopfilter.h" + +decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, lsx)); +decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, lsx)); +decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, lsx)); +decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, lsx)); + +static ALWAYS_INLINE void loop_filter_dsp_init_loongarch(Dav1dLoopFilterDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return; + +#if BITDEPTH == 8 + c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, lsx); + c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, lsx); + c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, lsx); + c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, lsx); +#endif +} + +#endif /* DAV1D_SRC_LOONGARCH_LOOPFILTER_H */ diff --git a/src/loopfilter_tmpl.c b/src/loopfilter_tmpl.c index cacf258..7cc8964 100644 --- a/src/loopfilter_tmpl.c +++ b/src/loopfilter_tmpl.c @@ -247,6 +247,8 @@ static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride, #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM #include "src/arm/loopfilter.h" +#elif ARCH_LOONGARCH64 +#include "src/loongarch/loopfilter.h" #elif ARCH_X86 #include "src/x86/loopfilter.h" #endif @@ -261,6 +263,8 @@ COLD void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM loop_filter_dsp_init_arm(c); +#elif ARCH_LOONGARCH64 + loop_filter_dsp_init_loongarch(c); #elif ARCH_X86 loop_filter_dsp_init_x86(c); #endif diff --git a/src/meson.build b/src/meson.build index d12667c..88d4ff1 100644 --- a/src/meson.build +++ b/src/meson.build @@ -242,6 +242,7 @@ if is_asm_enabled libdav1d_sources_asm = files( 'loongarch/mc.S', + 'loongarch/loopfilter.S', ) libdav1d_asm_objs += libdav1d_sources_asm endif -- cgit v1.2.3 From b98ea43379ba46d06322074f29c51757b6c6e5b1 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Fri, 1 Dec 2023 13:38:07 +0800 Subject: loongarch: Improve the performance of looprestoration_8bpc series functions Relative speedup over C code: wiener_5tap_8bpc_c: 13358.0 ( 1.00x) wiener_5tap_8bpc_lsx: 2484.7 ( 5.38x) wiener_7tap_8bpc_c: 13358.4 ( 1.00x) wiener_7tap_8bpc_lsx: 2486.4 ( 5.37x) sgr_3x3_8bpc_c: 18989.2 ( 1.00x) sgr_3x3_8bpc_lsx: 7981.6 ( 2.38x) sgr_5x5_8bpc_c: 17242.0 ( 1.00x) sgr_5x5_8bpc_lsx: 5735.5 ( 3.01x) --- src/loongarch/looprestoration.S | 1407 ++++++++++++++++++++++++++++++++++ src/loongarch/looprestoration.h | 78 ++ src/loongarch/looprestoration_tmpl.c | 274 +++++++ src/looprestoration_tmpl.c | 4 + src/meson.build | 5 + 5 files changed, 1768 insertions(+) create mode 100644 src/loongarch/looprestoration.S create mode 100644 src/loongarch/looprestoration.h create mode 100644 src/loongarch/looprestoration_tmpl.c diff --git a/src/loongarch/looprestoration.S b/src/loongarch/looprestoration.S new file mode 100644 index 0000000..ab512d1 --- /dev/null +++ b/src/loongarch/looprestoration.S @@ -0,0 +1,1407 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * Copyright © 2023, Loongson Technology Corporation Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/loongarch/loongson_asm.S" + +#define REST_UNIT_STRIDE (400) + +.macro MADD_HU_BU in0, in1, out0, out1 + vsllwil.hu.bu vr12, \in0, 0 + vexth.hu.bu vr13, \in0 + vmadd.h \out0, vr12, \in1 + vmadd.h \out1, vr13, \in1 +.endm + +const wiener_shuf +.byte 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18 +endconst + +/* +void wiener_filter_h_lsx(int32_t *hor_ptr, + uint8_t *tmp_ptr, + const int16_t filterh[8], + const int w, const int h) +*/ +function wiener_filter_h_8bpc_lsx + addi.d sp, sp, -40 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + li.w t7, 1<<14 // clip_limit + + la.local t1, wiener_shuf + vld vr4, t1, 0 + vld vr14, a2, 0 // filter[0][k] + vreplvei.h vr21, vr14, 0 + vreplvei.h vr22, vr14, 1 + vreplvei.h vr23, vr14, 2 + vreplvei.h vr24, vr14, 3 + vreplvei.h vr25, vr14, 4 + vreplvei.h vr26, vr14, 5 + vreplvei.h vr27, vr14, 6 + vreplgr2vr.w vr0, t7 + +.WIENER_FILTER_H_H: + addi.w a4, a4, -1 // h + addi.w t0, a3, 0 // w + addi.d t1, a1, 0 // tmp_ptr + addi.d t2, a0, 0 // hor_ptr + +.WIENER_FILTER_H_W: + addi.w t0, t0, -16 + vld vr5, t1, 0 + vld vr13, t1, 16 + + vsubi.bu vr14, vr4, 2 + vsubi.bu vr15, vr4, 1 + vshuf.b vr6, vr13, vr5, vr14 // 1 ... 8, 9 ... 16 + vshuf.b vr7, vr13, vr5, vr15 // 2 ... 9, 10 ... 17 + vshuf.b vr8, vr13, vr5, vr4 // 3 ... 10, 11 ... 18 + vaddi.bu vr14, vr4, 1 + vaddi.bu vr15, vr4, 2 + vshuf.b vr9, vr13, vr5, vr14 // 4 ... 11, 12 ... 19 + vshuf.b vr10, vr13, vr5, vr15 // 5 ... 12, 13 ... 20 + vaddi.bu vr14, vr4, 3 + vshuf.b vr11, vr13, vr5, vr14 // 6 ... 13, 14 ... 
21 + + vsllwil.hu.bu vr15, vr8, 0 // 3 4 5 6 7 8 9 10 + vexth.hu.bu vr16, vr8 // 11 12 13 14 15 16 17 18 + vsllwil.wu.hu vr17, vr15, 0 // 3 4 5 6 + vexth.wu.hu vr18, vr15 // 7 8 9 10 + vsllwil.wu.hu vr19, vr16, 0 // 11 12 13 14 + vexth.wu.hu vr20, vr16 // 15 16 17 18 + vslli.w vr17, vr17, 7 + vslli.w vr18, vr18, 7 + vslli.w vr19, vr19, 7 + vslli.w vr20, vr20, 7 + vxor.v vr15, vr15, vr15 + vxor.v vr14, vr14, vr14 + + MADD_HU_BU vr5, vr21, vr14, vr15 + MADD_HU_BU vr6, vr22, vr14, vr15 + MADD_HU_BU vr7, vr23, vr14, vr15 + MADD_HU_BU vr8, vr24, vr14, vr15 + MADD_HU_BU vr9, vr25, vr14, vr15 + MADD_HU_BU vr10, vr26, vr14, vr15 + MADD_HU_BU vr11, vr27, vr14, vr15 + + vsllwil.w.h vr5, vr14, 0 // 0 1 2 3 + vexth.w.h vr6, vr14 // 4 5 6 7 + vsllwil.w.h vr7, vr15, 0 // 8 9 10 11 + vexth.w.h vr8, vr15 // 12 13 14 15 + vadd.w vr17, vr17, vr5 + vadd.w vr18, vr18, vr6 + vadd.w vr19, vr19, vr7 + vadd.w vr20, vr20, vr8 + vadd.w vr17, vr17, vr0 + vadd.w vr18, vr18, vr0 + vadd.w vr19, vr19, vr0 + vadd.w vr20, vr20, vr0 + + vsrli.w vr1, vr0, 1 + vsubi.wu vr1, vr1, 1 + vxor.v vr3, vr3, vr3 + vsrari.w vr17, vr17, 3 + vsrari.w vr18, vr18, 3 + vsrari.w vr19, vr19, 3 + vsrari.w vr20, vr20, 3 + vclip.w vr17, vr17, vr3, vr1 + vclip.w vr18, vr18, vr3, vr1 + vclip.w vr19, vr19, vr3, vr1 + vclip.w vr20, vr20, vr3, vr1 + + vst vr17, t2, 0 + vst vr18, t2, 16 + vst vr19, t2, 32 + vst vr20, t2, 48 + addi.d t1, t1, 16 + addi.d t2, t2, 64 + blt zero, t0, .WIENER_FILTER_H_W + + addi.d a1, a1, REST_UNIT_STRIDE + addi.d a0, a0, (REST_UNIT_STRIDE << 2) + bnez a4, .WIENER_FILTER_H_H + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + addi.d sp, sp, 40 +endfunc + +.macro APPLY_FILTER in0, in1, in2 + alsl.d t7, \in0, \in1, 2 + vld vr10, t7, 0 + vld vr11, t7, 16 + vld vr12, t7, 32 + vld vr13, t7, 48 + vmadd.w vr14, vr10, \in2 + vmadd.w vr15, vr11, \in2 + vmadd.w vr16, vr12, \in2 + vmadd.w vr17, vr13, \in2 +.endm + +.macro wiener_filter_v_8bpc_core_lsx + vreplgr2vr.w vr14, t6 + vreplgr2vr.w vr15, t6 + vreplgr2vr.w vr16, t6 + vreplgr2vr.w vr17, t6 + + addi.w t7, t2, 0 // j + index k + mul.w t7, t7, t8 // (j + index) * REST_UNIT_STRIDE + add.w t7, t7, t4 // (j + index) * REST_UNIT_STRIDE + i + + APPLY_FILTER t7, a2, vr2 + APPLY_FILTER t8, t7, vr3 + APPLY_FILTER t8, t7, vr4 + APPLY_FILTER t8, t7, vr5 + APPLY_FILTER t8, t7, vr6 + APPLY_FILTER t8, t7, vr7 + APPLY_FILTER t8, t7, vr8 + vssrarni.hu.w vr15, vr14, 11 + vssrarni.hu.w vr17, vr16, 11 + vssrlni.bu.h vr17, vr15, 0 +.endm + +/* +void wiener_filter_v_lsx(uint8_t *p, + const ptrdiff_t p_stride, + const int32_t *hor, + const int16_t filterv[8], + const int w, const int h) +*/ +function wiener_filter_v_8bpc_lsx + li.w t6, -(1 << 18) + + li.w t8, REST_UNIT_STRIDE + ld.h t0, a3, 0 + ld.h t1, a3, 2 + vreplgr2vr.w vr2, t0 + vreplgr2vr.w vr3, t1 + ld.h t0, a3, 4 + ld.h t1, a3, 6 + vreplgr2vr.w vr4, t0 + vreplgr2vr.w vr5, t1 + ld.h t0, a3, 8 + ld.h t1, a3, 10 + vreplgr2vr.w vr6, t0 + vreplgr2vr.w vr7, t1 + ld.h t0, a3, 12 + vreplgr2vr.w vr8, t0 + + andi t1, a4, 0xf + sub.w t0, a4, t1 // w-w%16 + or t2, zero, zero // j + or t4, zero, zero + beqz t0, .WIENER_FILTER_V_W_LT16 + +.WIENER_FILTER_V_H: + andi t1, a4, 0xf + add.d t3, zero, a0 // p + or t4, zero, zero // i + +.WIENER_FILTER_V_W: + + wiener_filter_v_8bpc_core_lsx + + mul.w t5, t2, a1 // j * stride + add.w t5, t5, t4 // j * stride + i + add.d t3, a0, t5 + addi.w t4, t4, 16 + vst vr17, t3, 0 + bne t0, t4, .WIENER_FILTER_V_W + + beqz t1, .WIENER_FILTER_V_W_EQ16 + + 
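+    // Tail handling: when w is not a multiple of 16, run the core once more
+    // for the trailing columns and store the remaining bytes one at a time.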
wiener_filter_v_8bpc_core_lsx + + addi.d t3, t3, 16 + andi t1, a4, 0xf + +.WIENER_FILTER_V_ST_REM: + vstelm.b vr17, t3, 0, 0 + vbsrl.v vr17, vr17, 1 + addi.d t3, t3, 1 + addi.w t1, t1, -1 + bnez t1, .WIENER_FILTER_V_ST_REM +.WIENER_FILTER_V_W_EQ16: + addi.w t2, t2, 1 + blt t2, a5, .WIENER_FILTER_V_H + b .WIENER_FILTER_V_END + +.WIENER_FILTER_V_W_LT16: + andi t1, a4, 0xf + add.d t3, zero, a0 + + wiener_filter_v_8bpc_core_lsx + + mul.w t5, t2, a1 // j * stride + add.d t3, a0, t5 + +.WIENER_FILTER_V_ST_REM_1: + vstelm.b vr17, t3, 0, 0 + vbsrl.v vr17, vr17, 1 + addi.d t3, t3, 1 + addi.w t1, t1, -1 + bnez t1, .WIENER_FILTER_V_ST_REM_1 + + addi.w t2, t2, 1 + blt t2, a5, .WIENER_FILTER_V_W_LT16 + +.WIENER_FILTER_V_END: +endfunc + +/* +void boxsum3_h(int32_t *sumsq, coef *sum, const pixel *src, + const int w, const int h) +*/ +function boxsum3_h_8bpc_lsx + addi.d a2, a2, REST_UNIT_STRIDE + li.w t0, 1 + addi.w a3, a3, -2 + addi.w a4, a4, -4 + +.LBS3_H_H: + alsl.d t1, t0, a1, 1 // sum_v *sum_v = sum + x + alsl.d t2, t0, a0, 2 // sumsq_v *sumsq_v = sumsq + x + add.d t3, t0, a2 // s + addi.w t5, a3, 0 +.LBS3_H_W: + vld vr0, t3, 0 + vld vr1, t3, REST_UNIT_STRIDE + vld vr2, t3, (REST_UNIT_STRIDE<<1) + + vilvl.b vr3, vr1, vr0 + vhaddw.hu.bu vr4, vr3, vr3 + vilvh.b vr5, vr1, vr0 + vhaddw.hu.bu vr6, vr5, vr5 + vsllwil.hu.bu vr7, vr2, 0 + vexth.hu.bu vr8, vr2 + // sum_v + vadd.h vr4, vr4, vr7 + vadd.h vr6, vr6, vr8 + vst vr4, t1, REST_UNIT_STRIDE<<1 + vst vr6, t1, (REST_UNIT_STRIDE<<1)+16 + addi.d t1, t1, 32 + // sumsq + vmulwev.h.bu vr9, vr3, vr3 + vmulwod.h.bu vr10, vr3, vr3 + vmulwev.h.bu vr11, vr5, vr5 + vmulwod.h.bu vr12, vr5, vr5 + vmul.h vr7, vr7, vr7 + vmul.h vr8, vr8, vr8 + vaddwev.w.hu vr13, vr10, vr9 + vaddwod.w.hu vr14, vr10, vr9 + vilvl.w vr3, vr14, vr13 + vilvh.w vr4, vr14, vr13 + vaddwev.w.hu vr13, vr12, vr11 + vaddwod.w.hu vr14, vr12, vr11 + vilvl.w vr15, vr14, vr13 + vilvh.w vr16, vr14, vr13 + vsllwil.wu.hu vr9, vr7, 0 + vexth.wu.hu vr10, vr7 + vsllwil.wu.hu vr11, vr8, 0 + vexth.wu.hu vr12, vr8 + vadd.w vr9, vr9, vr3 + vadd.w vr10, vr10, vr4 + vadd.w vr11, vr11, vr15 + vadd.w vr12, vr12, vr16 + vst vr9, t2, REST_UNIT_STRIDE<<2 + vst vr10, t2, (REST_UNIT_STRIDE<<2)+16 + vst vr11, t2, (REST_UNIT_STRIDE<<2)+32 + vst vr12, t2, (REST_UNIT_STRIDE<<2)+48 + addi.d t2, t2, 64 + + addi.w t5, t5, -16 + addi.d t3, t3, 16 + blt zero, t5, .LBS3_H_W + + addi.d a0, a0, REST_UNIT_STRIDE<<2 + addi.d a1, a1, REST_UNIT_STRIDE<<1 + addi.d a2, a2, REST_UNIT_STRIDE + addi.d a4, a4, -1 + blt zero, a4, .LBS3_H_H + +.LBS3_H_END: +endfunc + +/* +void boxsum3_v(int32_t *sumsq, coef *sum, + const int w, const int h) +*/ +function boxsum3_v_8bpc_lsx + addi.d a0, a0, (REST_UNIT_STRIDE<<2) + addi.d a1, a1, (REST_UNIT_STRIDE<<1) + addi.w a3, a3, -4 + addi.w a2, a2, -4 + +.LBS3_V_H: + sub.w t3, a2, zero + addi.d t0, a0, 4 + addi.d t1, a1, 2 + addi.d t5, a0, 8 + addi.d t6, a1, 4 + + vld vr0, t1, 0 // a 0 1 2 3 4 5 6 7 + vld vr1, t1, 2 // b 1 2 3 4 5 6 7 8 + vld vr2, t1, 4 // c 2 3 4 5 6 7 8 9 + vld vr3, t0, 0 // a2 0 1 2 3 + vld vr4, t0, 4 // b2 1 2 3 4 + vld vr5, t0, 8 // c2 2 3 4 5 + vld vr6, t0, 16 // 3 4 5 6 + vld vr7, t0, 20 // 4 5 6 7 + vld vr8, t0, 24 // 5 6 7 8 + vadd.h vr9, vr0, vr1 + vadd.h vr9, vr9, vr2 + vadd.w vr10, vr3, vr4 + vadd.w vr10, vr10, vr5 + vadd.w vr11, vr6, vr7 + vadd.w vr11, vr11, vr8 + vpickve2gr.h t7, vr2, 6 + vpickve2gr.w t8, vr8, 2 + vst vr9, t6, 0 + vst vr10, t5, 0 + vst vr11, t5, 16 + + addi.d t1, t1, 16 + addi.d t0, t0, 32 + addi.d t5, t5, 32 + addi.d t6, t6, 16 + addi.d t3, t3, -8 + ble t3, 
zero, .LBS3_V_H0 + +.LBS3_V_W8: + vld vr0, t1, 0 // a 0 1 2 3 4 5 6 7 + vld vr1, t1, 2 // b 1 2 3 4 5 6 7 8 + vld vr2, t1, 4 // c 2 3 4 5 6 7 8 9 + vld vr3, t0, 0 // a2 0 1 2 3 + vld vr4, t0, 4 // b2 1 2 3 4 + vld vr5, t0, 8 // c2 2 3 4 5 + vld vr6, t0, 16 // 3 4 5 6 + vld vr7, t0, 20 // 4 5 6 7 + vld vr8, t0, 24 // 5 6 7 8 + vinsgr2vr.h vr0, t7, 0 + vinsgr2vr.w vr3, t8, 0 + vpickve2gr.h t7, vr2, 6 + vpickve2gr.w t8, vr8, 2 + vadd.h vr9, vr0, vr1 + vadd.w vr10, vr3, vr4 + vadd.w vr11, vr6, vr7 + vadd.h vr9, vr9, vr2 + vadd.w vr10, vr10, vr5 + vadd.w vr11, vr11, vr8 + vst vr9, t6, 0 + vst vr10, t5, 0 + vst vr11, t5, 16 + addi.d t3, t3, -8 + addi.d t1, t1, 16 + addi.d t0, t0, 32 + addi.d t5, t5, 32 + addi.d t6, t6, 16 + blt zero, t3, .LBS3_V_W8 + +.LBS3_V_H0: + addi.d a1, a1, REST_UNIT_STRIDE<<1 + addi.d a0, a0, REST_UNIT_STRIDE<<2 + addi.w a3, a3, -1 + bnez a3, .LBS3_V_H + +.LBS3_V_END: +endfunc + +/* +boxsum3_selfguided_filter(int32_t *sumsq, coef *sum, + const int w, const int h, + const unsigned s) +*/ +function boxsum3_sgf_h_8bpc_lsx + addi.d a0, a0, REST_UNIT_STRIDE<<2 + addi.d a0, a0, 12 // AA + addi.d a1, a1, REST_UNIT_STRIDE<<1 + addi.d a1, a1, 6 // BB + la.local t8, dav1d_sgr_x_by_x + li.w t6, 455 + vreplgr2vr.w vr20, t6 + li.w t6, 255 + vreplgr2vr.w vr22, t6 + vaddi.wu vr21, vr22, 1 // 256 + vreplgr2vr.w vr6, a4 + vldi vr19, 0x809 + addi.w a2, a2, 2 // w + 2 + addi.w a3, a3, 2 // h + 2 + +.LBS3SGF_H_H: + addi.w t2, a2, 0 + addi.d t0, a0, -4 + addi.d t1, a1, -2 + +.LBS3SGF_H_W: + addi.w t2, t2, -8 + vld vr0, t0, 0 // AA[i] + vld vr1, t0, 16 + vld vr2, t1, 0 // BB[i] + + vmul.w vr4, vr0, vr19 // a * n + vmul.w vr5, vr1, vr19 // a * n + vsllwil.w.h vr9, vr2, 0 + vexth.w.h vr10, vr2 + vmsub.w vr4, vr9, vr9 // p + vmsub.w vr5, vr10, vr10 // p + vmaxi.w vr4, vr4, 0 + vmaxi.w vr5, vr5, 0 // p + vmul.w vr4, vr4, vr6 // p * s + vmul.w vr5, vr5, vr6 // p * s + vsrlri.w vr4, vr4, 20 + vsrlri.w vr5, vr5, 20 // z + vmin.w vr4, vr4, vr22 + vmin.w vr5, vr5, vr22 + + vpickve2gr.w t6, vr4, 0 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr7, t7, 0 + vpickve2gr.w t6, vr4, 1 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr7, t7, 1 + vpickve2gr.w t6, vr4, 2 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr7, t7, 2 + vpickve2gr.w t6, vr4, 3 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr7, t7, 3 + + vpickve2gr.w t6, vr5, 0 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr8, t7, 0 + vpickve2gr.w t6, vr5, 1 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr8, t7, 1 + vpickve2gr.w t6, vr5, 2 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr8, t7, 2 + vpickve2gr.w t6, vr5, 3 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr8, t7, 3 // x + + vmul.w vr9, vr7, vr9 // x * BB[i] + vmul.w vr10, vr8, vr10 + vmul.w vr9, vr9, vr20 // x * BB[i] * sgr_one_by_x + vmul.w vr10, vr10, vr20 + vsrlri.w vr9, vr9, 12 + vsrlri.w vr10, vr10, 12 + vsub.w vr7, vr21, vr7 + vsub.w vr8, vr21, vr8 + vpickev.h vr8, vr8, vr7 + + vst vr9, t0, 0 + vst vr10, t0, 16 + vst vr8, t1, 0 + addi.d t0, t0, 32 + addi.d t1, t1, 16 + blt zero, t2, .LBS3SGF_H_W + + addi.d a0, a0, REST_UNIT_STRIDE<<2 + addi.d a1, a1, REST_UNIT_STRIDE<<1 + addi.w a3, a3, -1 + bnez a3, .LBS3SGF_H_H +endfunc + +/* +boxsum3_selfguided_filter(coef *dst, pixel *src, + int32_t *sumsq, coef *sum, + const int w, const int h) +*/ +function boxsum3_sgf_v_8bpc_lsx + addi.d a1, a1, (3*REST_UNIT_STRIDE+3) // src + addi.d a2, a2, REST_UNIT_STRIDE<<2 + addi.d a2, a2, (REST_UNIT_STRIDE<<2)+12 + addi.d a3, a3, REST_UNIT_STRIDE<<2 + addi.d a3, a3, 6 +.LBS3SGF_V_H: + // A int32_t *sumsq + addi.d t0, a2, -(REST_UNIT_STRIDE<<2) // -stride + addi.d t1, a2, 0 // sumsq + addi.d t2, a2, 
REST_UNIT_STRIDE<<2 // +stride + addi.d t6, a1, 0 + addi.w t7, a4, 0 + addi.d t8, a0, 0 + // B coef *sum + addi.d t3, a3, -(REST_UNIT_STRIDE<<1) // -stride + addi.d t4, a3, 0 + addi.d t5, a3, REST_UNIT_STRIDE<<1 + +.LBS3SGF_V_W: + vld vr0, t0, 0 // P[i - REST_UNIT_STRIDE] + vld vr1, t0, 16 + vld vr2, t1, -4 // P[i-1] + vld vr3, t1, 12 + vld vr4, t2, 0 // P[i + REST_UNIT_STRIDE] + vld vr5, t2, 16 + vld vr6, t1, 0 // p[i] + vld vr7, t1, 16 + vld vr8, t1, 4 // p[i+1] + vld vr9, t1, 20 + + vld vr10, t0, -4 // P[i - 1 - REST_UNIT_STRIDE] + vld vr11, t0, 12 + vld vr12, t2, -4 // P[i - 1 + REST_UNIT_STRIDE] + vld vr13, t2, 12 + vld vr14, t0, 4 // P[i + 1 - REST_UNIT_STRIDE] + vld vr15, t0, 20 + vld vr16, t2, 4 // P[i + 1 + REST_UNIT_STRIDE] + vld vr17, t2, 20 + + vadd.w vr0, vr2, vr0 + vadd.w vr4, vr6, vr4 + vadd.w vr0, vr0, vr8 + vadd.w vr20, vr0, vr4 + vslli.w vr20, vr20, 2 // 0 1 2 3 + vadd.w vr0, vr1, vr3 + vadd.w vr4, vr5, vr7 + vadd.w vr0, vr0, vr9 + vadd.w vr21, vr0, vr4 + vslli.w vr21, vr21, 2 // 4 5 6 7 + vadd.w vr12, vr10, vr12 + vadd.w vr16, vr14, vr16 + vadd.w vr22, vr12, vr16 + vslli.w vr23, vr22, 1 + vadd.w vr22, vr23, vr22 + vadd.w vr11, vr11, vr13 + vadd.w vr15, vr15, vr17 + vadd.w vr0, vr11, vr15 + vslli.w vr23, vr0, 1 + vadd.w vr23, vr23, vr0 + vadd.w vr20, vr20, vr22 // b + vadd.w vr21, vr21, vr23 + + // B coef *sum + vld vr0, t3, 0 // P[i - REST_UNIT_STRIDE] + vld vr1, t4, -2 // p[i - 1] + vld vr2, t4, 0 // p[i] + vld vr3, t4, 2 // p[i + 1] + vld vr4, t5, 0 // P[i + REST_UNIT_STRIDE] + vld vr5, t3, -2 // P[i - 1 - REST_UNIT_STRIDE] + vld vr6, t5, -2 // P[i - 1 + REST_UNIT_STRIDE] + vld vr7, t3, 2 // P[i + 1 - REST_UNIT_STRIDE] + vld vr8, t5, 2 // P[i + 1 + REST_UNIT_STRIDE] + vaddwev.w.h vr9, vr0, vr1 + vaddwod.w.h vr10, vr0, vr1 + vaddwev.w.h vr11, vr2, vr3 + vaddwod.w.h vr12, vr2, vr3 + vadd.w vr9, vr11, vr9 + vadd.w vr10, vr12, vr10 + vilvl.w vr11, vr10, vr9 // 0 1 2 3 + vilvh.w vr12, vr10, vr9 // 4 5 6 7 + vsllwil.w.h vr0, vr4, 0 + vexth.w.h vr1, vr4 + vadd.w vr0, vr11, vr0 + vadd.w vr1, vr12, vr1 + vslli.w vr0, vr0, 2 + vslli.w vr1, vr1, 2 + vaddwev.w.h vr9, vr5, vr6 + vaddwod.w.h vr10, vr5, vr6 + vaddwev.w.h vr11, vr7, vr8 + vaddwod.w.h vr12, vr7, vr8 + vadd.w vr9, vr11, vr9 + vadd.w vr10, vr12, vr10 + vilvl.w vr13, vr10, vr9 + vilvh.w vr14, vr10, vr9 + vslli.w vr15, vr13, 1 + vslli.w vr16, vr14, 1 + vadd.w vr15, vr13, vr15 // a + vadd.w vr16, vr14, vr16 + vadd.w vr22, vr0, vr15 + vadd.w vr23, vr1, vr16 + vld vr0, t6, 0 // src + vsllwil.hu.bu vr0, vr0, 0 + vsllwil.wu.hu vr1, vr0, 0 + vexth.wu.hu vr2, vr0 + vmadd.w vr20, vr22, vr1 + vmadd.w vr21, vr23, vr2 + vssrlrni.h.w vr21, vr20, 9 + vst vr21, t8, 0 + addi.d t8, t8, 16 + + addi.d t0, t0, 32 + addi.d t1, t1, 32 + addi.d t2, t2, 32 + addi.d t3, t3, 16 + addi.d t4, t4, 16 + addi.d t5, t5, 16 + addi.d t6, t6, 8 + addi.w t7, t7, -8 + blt zero, t7, .LBS3SGF_V_W + + addi.w a5, a5, -1 + addi.d a0, a0, 384*2 + addi.d a1, a1, REST_UNIT_STRIDE + addi.d a3, a3, REST_UNIT_STRIDE<<1 + addi.d a2, a2, REST_UNIT_STRIDE<<2 + bnez a5, .LBS3SGF_V_H +endfunc + +#define FILTER_OUT_STRIDE (384) + +/* +sgr_3x3_finish_c(const pixel *p, const ptrdiff_t stride, + const int16_t *dst, const int w1; + const int w, const int h); +*/ +function sgr_3x3_finish_8bpc_lsx + vreplgr2vr.w vr3, a3 // w1 + andi t4, a4, 0x7 + sub.w t5, a4, t4 + + beq zero, t5, .LSGR3X3_REM + +.LSGR3X3_H: + addi.d t0, a0, 0 + addi.d t1, a2, 0 + addi.w t2, t5, 0 + andi t4, a4, 0x7 +.LSGR3X3_W: + vld vr0, t0, 0 + vld vr1, t1, 0 + vsllwil.hu.bu vr2, vr0, 4 // u 8 h + vsllwil.wu.hu 
vr4, vr2, 0 // p + vexth.wu.hu vr5, vr2 // p + vslli.w vr6, vr4, 7 + vslli.w vr7, vr5, 7 + vsllwil.w.h vr8, vr1, 0 // dst + vexth.w.h vr9, vr1 // dst + vsub.w vr8, vr8, vr4 + vsub.w vr9, vr9, vr5 + vmadd.w vr6, vr8, vr3 // v 0 - 3 + vmadd.w vr7, vr9, vr3 // v 4 - 7 + vssrarni.hu.w vr7, vr6, 11 + vssrlni.bu.h vr7, vr7, 0 + vstelm.d vr7, t0, 0, 0 + addi.d t0, t0, 8 + addi.d t1, t1, 16 + addi.d t2, t2, -8 + bne zero, t2, .LSGR3X3_W + + beq t4, zero, .LSGR3X3_NOREM + + vld vr0, t0, 0 + vld vr1, t1, 0 + vsllwil.hu.bu vr2, vr0, 4 // u 8 h + vsllwil.wu.hu vr4, vr2, 0 // p + vexth.wu.hu vr5, vr2 // p + vslli.w vr6, vr4, 7 + vslli.w vr7, vr5, 7 + vsllwil.w.h vr8, vr1, 0 // dst + vexth.w.h vr9, vr1 // dst + vsub.w vr8, vr8, vr4 + vsub.w vr9, vr9, vr5 + vmadd.w vr6, vr8, vr3 // v 0 - 3 + vmadd.w vr7, vr9, vr3 // v 4 - 7 + vssrarni.hu.w vr7, vr6, 11 + vssrlni.bu.h vr7, vr7, 0 + +.LSGR3X3_ST: + vstelm.b vr7, t0, 0, 0 + addi.d t0, t0, 1 + vbsrl.v vr7, vr7, 1 + addi.w t4, t4, -1 + bnez t4, .LSGR3X3_ST + +.LSGR3X3_NOREM: + addi.w a5, a5, -1 + add.d a0, a0, a1 + addi.d a2, a2, (FILTER_OUT_STRIDE<<1) + bnez a5, .LSGR3X3_H + b .LSGR3X3_END + +.LSGR3X3_REM: + andi t4, a4, 0x7 + addi.d t0, a0, 0 + vld vr0, t0, 0 + vld vr1, a2, 0 + vsllwil.hu.bu vr2, vr0, 4 // u 8 h + vsllwil.wu.hu vr4, vr2, 0 // p + vexth.wu.hu vr5, vr2 // p + vslli.w vr6, vr4, 7 + vslli.w vr7, vr5, 7 + vsllwil.w.h vr8, vr1, 0 // dst + vexth.w.h vr9, vr1 // dst + vsub.w vr8, vr8, vr4 + vsub.w vr9, vr9, vr5 + vmadd.w vr6, vr8, vr3 // v 0 - 3 + vmadd.w vr7, vr9, vr3 // v 4 - 7 + vssrarni.hu.w vr7, vr6, 11 + vssrlni.bu.h vr7, vr7, 0 + +.LSGR3X3_REM_ST: + vstelm.b vr7, t0, 0, 0 + addi.d t0, t0, 1 + vbsrl.v vr7, vr7, 1 + addi.w t4, t4, -1 + bnez t4, .LSGR3X3_REM_ST + addi.w a5, a5, -1 + add.d a0, a0, a1 + addi.d a2, a2, (FILTER_OUT_STRIDE<<1) + bnez a5, .LSGR3X3_REM + +.LSGR3X3_END: +endfunc + +/* +void boxsum5(int32_t *sumsq, coef *sum, + const pixel *const src, + const int w, const int h) +*/ +function boxsum5_h_8bpc_lsx + addi.w a4, a4, -4 + addi.d a0, a0, REST_UNIT_STRIDE<<2 + addi.d a1, a1, REST_UNIT_STRIDE<<1 + li.w t6, 1 +.LBOXSUM5_H_H: + addi.w t3, a3, 0 + addi.d t2, a2, 0 + addi.d t0, a0, 0 + addi.d t1, a1, 0 + +.LBOXSUM5_H_W: + vld vr0, t2, 0 // a + vld vr1, t2, REST_UNIT_STRIDE // b + vld vr2, t2, REST_UNIT_STRIDE<<1 // c + vld vr3, t2, REST_UNIT_STRIDE*3 // d + vld vr4, t2, REST_UNIT_STRIDE<<2 // e + + vilvl.b vr5, vr1, vr0 + vilvh.b vr6, vr1, vr0 + vilvl.b vr7, vr3, vr2 + vilvh.b vr8, vr3, vr2 + //sum_v + vhaddw.hu.bu vr9, vr5, vr5 // 0 1 2 3 4 5 6 7 + vhaddw.hu.bu vr10, vr6, vr6 // 8 9 10 11 12 13 14 15 a+b + vhaddw.hu.bu vr11, vr7, vr7 + vhaddw.hu.bu vr12, vr8, vr8 + vadd.h vr9, vr9, vr11 + vadd.h vr10, vr10, vr12 // a + b + c + d + vsllwil.hu.bu vr11, vr4, 0 + vexth.hu.bu vr12, vr4 + vadd.h vr9, vr9, vr11 + vadd.h vr10, vr10, vr12 + vst vr9, t1, 0 + vst vr10, t1, 16 + addi.d t1, t1, 32 + + // sumsq + vmulwev.h.bu vr9, vr5, vr5 // a*a 0 1 2 3 4 5 6 7 + vmulwev.h.bu vr10, vr6, vr6 // a*a 8 9 10 11 12 13 14 15 + vmulwod.h.bu vr13, vr5, vr5 // b*b 0 1 2 3 4 5 6 7 + vmulwod.h.bu vr14, vr6, vr6 // b*b 8 9 10 11 12 13 14 15 + vmulwev.h.bu vr15, vr7, vr7 // c*c 0 1 2 3 4 5 6 7 + vmulwev.h.bu vr16, vr8, vr8 // c*c 8 9 10 11 12 13 14 15 + vmulwod.h.bu vr17, vr7, vr7 // d*d 0 1 2 3 4 5 6 7 + vmulwod.h.bu vr18, vr8, vr8 // d*d 8 9 10 11 12 13 14 15 + vaddwev.w.hu vr5, vr9, vr13 // 0 2 4 6 + vaddwod.w.hu vr6, vr9, vr13 // 1 3 5 7 + vaddwev.w.hu vr7, vr10, vr14 // 8 10 12 14 + vaddwod.w.hu vr8, vr10, vr14 // 9 11 13 15 a + b + vaddwev.w.hu vr19, 
vr15, vr17 // 0 2 4 6 + vaddwod.w.hu vr20, vr15, vr17 // 1 3 5 7 + vaddwev.w.hu vr21, vr16, vr18 // 8 10 12 14 + vaddwod.w.hu vr22, vr16, vr18 // 9 11 13 15 c + d + vadd.w vr5, vr5, vr19 + vadd.w vr6, vr6, vr20 + vadd.w vr7, vr7, vr21 + vadd.w vr8, vr8, vr22 + vilvl.w vr19, vr6, vr5 + vilvh.w vr20, vr6, vr5 + vilvl.w vr21, vr8, vr7 + vilvh.w vr22, vr8, vr7 + vmul.h vr11, vr11, vr11 + vmul.h vr12, vr12, vr12 + vsllwil.wu.hu vr0, vr11, 0 + vexth.wu.hu vr1, vr11 + vsllwil.wu.hu vr2, vr12, 0 + vexth.wu.hu vr3, vr12 + vadd.w vr19, vr19, vr0 + vadd.w vr20, vr20, vr1 + vadd.w vr21, vr21, vr2 + vadd.w vr22, vr22, vr3 + vst vr19, t0, 0 + vst vr20, t0, 16 + vst vr21, t0, 32 + vst vr22, t0, 48 + addi.d t0, t0, 64 + addi.d t2, t2, 16 + addi.w t3, t3, -16 + blt zero, t3, .LBOXSUM5_H_W + + addi.d a0, a0, REST_UNIT_STRIDE<<2 + addi.d a1, a1, REST_UNIT_STRIDE<<1 + addi.d a2, a2, REST_UNIT_STRIDE + addi.d a4, a4, -1 + bnez a4, .LBOXSUM5_H_H +endfunc + +/* +void boxsum5_h(int32_t *sumsq, coef *sum, + const int w, const int h) +*/ +function boxsum5_v_8bpc_lsx + addi.d a0, a0, (REST_UNIT_STRIDE<<2) + addi.d a1, a1, (REST_UNIT_STRIDE<<1) + addi.w a3, a3, -4 + addi.w a2, a2, -4 + +.LBOXSUM5_V_H: + addi.w t3, a2, 0 + addi.d t0, a0, 0 + addi.d t1, a1, 0 + addi.d t2, a0, 8 + addi.d t3, a1, 4 + addi.d t4, a2, 0 + + vld vr0, t1, 0 // a 0 1 2 3 4 5 6 7 + vld vr1, t1, 2 // b 1 2 3 4 5 6 7 8 + vld vr2, t1, 4 // c 2 + vld vr3, t1, 6 // d 3 + vld vr4, t1, 8 // e 4 5 6 7 8 9 10 11 + vadd.h vr5, vr0, vr1 + vadd.h vr6, vr2, vr3 + vpickve2gr.w t5, vr4, 2 + vadd.h vr5, vr5, vr6 + vadd.h vr5, vr5, vr4 + vst vr5, t3, 0 + + vld vr0, t0, 0 // 0 1 2 3 a + vld vr1, t0, 4 // 1 2 3 4 b + vld vr2, t0, 8 // 2 3 4 5 c + vld vr3, t0, 12 // 3 4 5 6 d + vld vr4, t0, 16 // 4 5 6 7 e a + vld vr5, t0, 20 // 5 6 7 8 b + vld vr6, t0, 24 // 6 7 8 9 c + vld vr7, t0, 28 // 7 8 9 10 d + vld vr8, t0, 32 // 8 9 10 11 e + + vadd.w vr9, vr0, vr1 + vadd.w vr10, vr2, vr3 + vadd.w vr9, vr9, vr10 + vadd.w vr9, vr9, vr4 + vadd.w vr10, vr4, vr5 + vadd.w vr11, vr6, vr7 + vadd.w vr10, vr10, vr8 + vadd.w vr10, vr10, vr11 + vst vr9, t2, 0 + vst vr10, t2, 16 + + addi.d t3, t3, 16 + addi.d t1, t1, 16 + addi.d t0, t0, 32 + addi.d t2, t2, 32 + addi.w t4, t4, -8 + ble t4, zero, .LBOXSUM5_V_H1 + +.LBOXSUM5_V_W: + vld vr0, t1, 0 // a 0 1 2 3 4 5 6 7 + vld vr1, t1, 2 // b 1 2 3 4 5 6 7 8 + vld vr2, t1, 4 // c 2 + vld vr3, t1, 6 // d 3 + vld vr4, t1, 8 // e 4 5 6 7 8 9 10 11 + vinsgr2vr.w vr0, t5, 0 + vpickve2gr.w t5, vr4, 2 + vextrins.h vr1, vr0, 0x01 + vadd.h vr5, vr0, vr1 + vadd.h vr6, vr2, vr3 + vadd.h vr5, vr5, vr6 + vadd.h vr5, vr5, vr4 + vst vr5, t3, 0 + + vaddi.hu vr0, vr8, 0 // 8 9 10 11 a + vld vr1, t0, 4 // 9 10 11 12 b + vld vr2, t0, 8 // 10 11 12 13 c + vld vr3, t0, 12 // 14 15 16 17 d + vld vr4, t0, 16 // 15 16 17 18 e a + vld vr5, t0, 20 // 16 17 18 19 b + vld vr6, t0, 24 // 17 18 19 20 c + vld vr7, t0, 28 // 18 19 20 21 d + vld vr8, t0, 32 // 19 20 21 22 e + vextrins.w vr1, vr0, 0x01 + vadd.w vr9, vr0, vr1 + vadd.w vr10, vr2, vr3 + vadd.w vr9, vr9, vr10 + vadd.w vr9, vr9, vr4 + vadd.w vr10, vr4, vr5 + vadd.w vr11, vr6, vr7 + vadd.w vr10, vr10, vr8 + vadd.w vr10, vr10, vr11 + vst vr9, t2, 0 + vst vr10, t2, 16 + + addi.d t3, t3, 16 + addi.d t1, t1, 16 + addi.d t0, t0, 32 + addi.d t2, t2, 32 + addi.w t4, t4, -8 + blt zero, t4, .LBOXSUM5_V_W + +.LBOXSUM5_V_H1: + addi.d a1, a1, REST_UNIT_STRIDE<<1 + addi.d a0, a0, REST_UNIT_STRIDE<<2 + addi.w a3, a3, -1 + bnez a3, .LBOXSUM5_V_H +endfunc + +/* +selfguided_filter(int32_t *sumsq, coef *sum, + const int w, const 
int h, + const unsigned s) +*/ +function boxsum5_sgf_h_8bpc_lsx + addi.d a0, a0, REST_UNIT_STRIDE<<2 + addi.d a0, a0, 12 // AA + addi.d a1, a1, REST_UNIT_STRIDE<<1 + addi.d a1, a1, 6 // BB + la.local t8, dav1d_sgr_x_by_x + li.w t6, 164 + vreplgr2vr.w vr20, t6 + li.w t6, 255 + vreplgr2vr.w vr22, t6 + vaddi.wu vr21, vr22, 1 // 256 + vreplgr2vr.w vr6, a4 + vldi vr19, 0x819 + addi.w a2, a2, 2 // w + 2 + addi.w a3, a3, 2 // h + 2 + +.LBS5SGF_H_H: + addi.w t2, a2, 0 + addi.d t0, a0, -4 + addi.d t1, a1, -2 + +.LBS5SGF_H_W: + vld vr0, t0, 0 // AA[i] + vld vr1, t0, 16 + vld vr2, t1, 0 // BB[i] + + vmul.w vr4, vr0, vr19 // a * n + vmul.w vr5, vr1, vr19 // a * n + vsllwil.w.h vr9, vr2, 0 + vexth.w.h vr10, vr2 + vmsub.w vr4, vr9, vr9 // p + vmsub.w vr5, vr10, vr10 // p + vmaxi.w vr4, vr4, 0 + vmaxi.w vr5, vr5, 0 // p + vmul.w vr4, vr4, vr6 // p * s + vmul.w vr5, vr5, vr6 // p * s + vsrlri.w vr4, vr4, 20 + vsrlri.w vr5, vr5, 20 // z + vmin.w vr4, vr4, vr22 + vmin.w vr5, vr5, vr22 + + // load table data + vpickve2gr.w t6, vr4, 0 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr7, t7, 0 + vpickve2gr.w t6, vr4, 1 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr7, t7, 1 + vpickve2gr.w t6, vr4, 2 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr7, t7, 2 + vpickve2gr.w t6, vr4, 3 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr7, t7, 3 + + vpickve2gr.w t6, vr5, 0 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr8, t7, 0 + vpickve2gr.w t6, vr5, 1 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr8, t7, 1 + vpickve2gr.w t6, vr5, 2 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr8, t7, 2 + vpickve2gr.w t6, vr5, 3 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr8, t7, 3 // x + + vmul.w vr9, vr7, vr9 // x * BB[i] + vmul.w vr10, vr8, vr10 + vmul.w vr9, vr9, vr20 // x * BB[i] * sgr_one_by_x + vmul.w vr10, vr10, vr20 + vsrlri.w vr9, vr9, 12 + vsrlri.w vr10, vr10, 12 + vsub.w vr7, vr21, vr7 + vsub.w vr8, vr21, vr8 + vpickev.h vr8, vr8, vr7 + vst vr9, t0, 0 + vst vr10, t0, 16 + vst vr8, t1, 0 + addi.d t0, t0, 32 + addi.d t1, t1, 16 + addi.w t2, t2, -8 + blt zero, t2, .LBS5SGF_H_W + + addi.d a0, a0, REST_UNIT_STRIDE<<2 + addi.d a0, a0, REST_UNIT_STRIDE<<2 + addi.d a1, a1, REST_UNIT_STRIDE<<2 + addi.w a3, a3, -2 + blt zero, a3, .LBS5SGF_H_H +endfunc + +/* +selfguided_filter(coef *dst, pixel *src, + int32_t *sumsq, coef *sum, + const int w, const int h) +*/ +function boxsum5_sgf_v_8bpc_lsx + addi.d a1, a1, 3*REST_UNIT_STRIDE+3 // src + addi.d a2, a2, (2*REST_UNIT_STRIDE+3)<<1 // A + addi.d a2, a2, (2*REST_UNIT_STRIDE+3)<<1 + addi.d a3, a3, (2*REST_UNIT_STRIDE+3)<<1 // B + addi.w a5, a5, -1 + vldi vr10, 0x806 + vldi vr11, 0x805 + vldi vr22, 0x406 + +.LBS5SGF_V_H: + addi.d t0, a0, 0 + addi.d t1, a1, 0 + addi.d t2, a2, 0 + addi.d t3, a3, 0 + addi.w t4, a4, 0 + + addi.d t5, a0, 384*2 + addi.d t6, a1, REST_UNIT_STRIDE + addi.d t7, a2, REST_UNIT_STRIDE<<2 + addi.d t8, a3, REST_UNIT_STRIDE<<1 // B +.LBS5SGF_V_W: + // a + vld vr0, t3, -REST_UNIT_STRIDE*2 + vld vr1, t3, REST_UNIT_STRIDE*2 + vld vr2, t3, (-REST_UNIT_STRIDE-1)*2 + vld vr3, t3, (REST_UNIT_STRIDE-1)*2 + vld vr4, t3, (1-REST_UNIT_STRIDE)*2 + vld vr5, t3, (1+REST_UNIT_STRIDE)*2 + vaddwev.w.h vr6, vr0, vr1 + vaddwod.w.h vr7, vr0, vr1 + vmul.w vr6, vr6, vr10 + vmul.w vr7, vr7, vr10 + vaddwev.w.h vr8, vr2, vr3 + vaddwod.w.h vr9, vr2, vr3 + vaddwev.w.h vr12, vr4, vr5 + vaddwod.w.h vr13, vr4, vr5 + vadd.w vr8, vr8, vr12 + vadd.w vr9, vr9, vr13 + vmadd.w vr6, vr8, vr11 + vmadd.w vr7, vr9, vr11 + vilvl.w vr18, vr7, vr6 + vilvh.w vr19, vr7, vr6 + // b + vld vr0, t2, -REST_UNIT_STRIDE*4 + vld vr1, t2, -REST_UNIT_STRIDE*4+16 + vld vr2, t2, REST_UNIT_STRIDE*4 + vld vr3, t2, 
REST_UNIT_STRIDE*4+16 + vld vr4, t2, (-REST_UNIT_STRIDE-1)*4 + vld vr5, t2, (-REST_UNIT_STRIDE-1)*4+16 + vld vr8, t2, (REST_UNIT_STRIDE-1)*4 + vld vr9, t2, (REST_UNIT_STRIDE-1)*4+16 + vld vr12, t2, (1-REST_UNIT_STRIDE)*4 + vld vr13, t2, (1-REST_UNIT_STRIDE)*4+16 + vld vr14, t2, (1+REST_UNIT_STRIDE)*4 + vld vr15, t2, (1+REST_UNIT_STRIDE)*4+16 + vadd.w vr0, vr0, vr2 // 0 1 2 3 + vadd.w vr1, vr1, vr3 // 4 5 6 7 + vmul.w vr20, vr0, vr10 + vmul.w vr21, vr1, vr10 + vadd.w vr4, vr4, vr8 // 0 1 2 3 + vadd.w vr5, vr5, vr9 // 4 5 6 7 + vadd.w vr12, vr12, vr14 + vadd.w vr13, vr13, vr15 + vadd.w vr12, vr12, vr4 + vadd.w vr13, vr13, vr5 + vmadd.w vr20, vr12, vr11 + vmadd.w vr21, vr13, vr11 + vld vr2, t1, 0 + vsllwil.hu.bu vr2, vr2, 0 + vsllwil.wu.hu vr3, vr2, 0 + vexth.wu.hu vr4, vr2 + vmadd.w vr20, vr18, vr3 + vmadd.w vr21, vr19, vr4 + vssrlrni.h.w vr21, vr20, 9 + vst vr21, t0, 0 + + addi.d t1, t1, 8 + addi.d t2, t2, 32 + addi.d t3, t3, 16 + + // a + vld vr0, t8, 0 + vld vr1, t8, -2 + vld vr2, t8, 2 + vmulwev.w.h vr3, vr0, vr22 + vmulwod.w.h vr4, vr0, vr22 + vaddwev.w.h vr5, vr1, vr2 + vaddwod.w.h vr6, vr1, vr2 + vmadd.w vr3, vr5, vr11 + vmadd.w vr4, vr6, vr11 + vilvl.w vr19, vr4, vr3 + vilvh.w vr20, vr4, vr3 + // b + vld vr0, t7, 0 + vld vr1, t7, -4 + vld vr2, t7, 4 + vld vr5, t7, 16 + vld vr6, t7, 12 + vld vr7, t7, 20 + vmul.w vr8, vr0, vr10 + vmul.w vr9, vr5, vr10 + vadd.w vr12, vr1, vr2 + vadd.w vr13, vr6, vr7 + vmadd.w vr8, vr12, vr11 + vmadd.w vr9, vr13, vr11 + vld vr2, t6, 0 + vsllwil.hu.bu vr2, vr2, 0 + vsllwil.wu.hu vr3, vr2, 0 + vexth.wu.hu vr4, vr2 + vmadd.w vr8, vr19, vr3 + vmadd.w vr9, vr20, vr4 + vssrlrni.h.w vr9, vr8, 8 + vst vr9, t0, 384*2 + + addi.d t0, t0, 16 + addi.d t8, t8, 16 + addi.d t7, t7, 32 + addi.d t6, t6, 8 + addi.w t4, t4, -8 + blt zero, t4, .LBS5SGF_V_W + + addi.w a5, a5, -2 + addi.d a0, a0, 384*4 // dst + addi.d a1, a1, REST_UNIT_STRIDE<<1 // src + addi.d a2, a2, REST_UNIT_STRIDE<<2 // + addi.d a2, a2, REST_UNIT_STRIDE<<2 + addi.d a3, a3, REST_UNIT_STRIDE<<2 // + blt zero, a5, .LBS5SGF_V_H + bnez a5, .LBS5SGF_END +.LBS5SGF_V_W1: + // a + vld vr0, a3, -REST_UNIT_STRIDE*2 + vld vr1, a3, REST_UNIT_STRIDE*2 + vld vr2, a3, (-REST_UNIT_STRIDE-1)*2 + vld vr3, a3, (REST_UNIT_STRIDE-1)*2 + vld vr4, a3, (1-REST_UNIT_STRIDE)*2 + vld vr5, a3, (1+REST_UNIT_STRIDE)*2 + vaddwev.w.h vr6, vr0, vr1 + vaddwod.w.h vr7, vr0, vr1 + vmul.w vr6, vr6, vr10 + vmul.w vr7, vr7, vr10 + vaddwev.w.h vr8, vr2, vr3 + vaddwod.w.h vr9, vr2, vr3 + vaddwev.w.h vr12, vr4, vr5 + vaddwod.w.h vr13, vr4, vr5 + vadd.w vr8, vr8, vr12 + vadd.w vr9, vr9, vr13 + vmadd.w vr6, vr8, vr11 + vmadd.w vr7, vr9, vr11 + vilvl.w vr18, vr7, vr6 + vilvh.w vr19, vr7, vr6 + // b + vld vr0, a2, -REST_UNIT_STRIDE*4 + vld vr1, a2, -REST_UNIT_STRIDE*4+16 + vld vr2, a2, REST_UNIT_STRIDE*4 + vld vr3, a2, REST_UNIT_STRIDE*4+16 + vld vr4, a2, (-REST_UNIT_STRIDE-1)*4 + vld vr5, a2, (-REST_UNIT_STRIDE-1)*4+16 + vld vr8, a2, (REST_UNIT_STRIDE-1)*4 + vld vr9, a2, (REST_UNIT_STRIDE-1)*4+16 + vld vr12, a2, (1-REST_UNIT_STRIDE)*4 + vld vr13, a2, (1-REST_UNIT_STRIDE)*4+16 + vld vr14, a2, (1+REST_UNIT_STRIDE)*4 + vld vr15, a2, (1+REST_UNIT_STRIDE)*4+16 + vadd.w vr0, vr0, vr2 // 0 1 2 3 + vadd.w vr1, vr1, vr3 // 4 5 6 7 + vmul.w vr20, vr0, vr10 + vmul.w vr21, vr1, vr10 + vadd.w vr4, vr4, vr8 // 0 1 2 3 + vadd.w vr5, vr5, vr9 // 4 5 6 7 + vadd.w vr12, vr12, vr14 + vadd.w vr13, vr13, vr15 + vadd.w vr12, vr12, vr4 + vadd.w vr13, vr13, vr5 + vmadd.w vr20, vr12, vr11 + vmadd.w vr21, vr13, vr11 + vld vr2, a1, 0 + vsllwil.hu.bu vr2, vr2, 0 + vsllwil.wu.hu 
vr3, vr2, 0 + vexth.wu.hu vr4, vr2 + vmadd.w vr20, vr18, vr3 + vmadd.w vr21, vr19, vr4 + vssrlrni.h.w vr21, vr20, 9 + vst vr21, a0, 0 + addi.d a3, a3, 16 + addi.d a2, a2, 32 + addi.d a1, a1, 8 + addi.d a0, a0, 16 + addi.w a4, a4, -8 + blt zero, a4, .LBS5SGF_V_W1 +.LBS5SGF_END: +endfunc + +/* +void dav1d_sgr_mix_finish_lsx(uint8_t *p, const ptrdiff_t stride, + const int16_t *dst0, const int16_t *dst1, + const int w0, const int w1, + const int w, const int h); +*/ +function sgr_mix_finish_8bpc_lsx + vreplgr2vr.w vr3, a4 // w0 + vreplgr2vr.w vr13, a5 // w1 + andi t4, a6, 0x7 + sub.w t5, a6, t4 + + beq zero, t5, .LSGRMIX_REM + +.LSGRMIX_H: + addi.d t0, a0, 0 + addi.d t1, a2, 0 // dst0 + addi.d t3, a3, 0 // dst1 + addi.w t2, t5, 0 + andi t4, a6, 0x7 +.LSGRMIX_W: + vld vr0, t0, 0 + vld vr1, t1, 0 + vld vr10, t3, 0 + vsllwil.hu.bu vr2, vr0, 4 // u 8 h + vsllwil.wu.hu vr4, vr2, 0 // u 0 1 2 3 + vexth.wu.hu vr5, vr2 // u 4 5 6 7 + vslli.w vr6, vr4, 7 + vslli.w vr7, vr5, 7 + vsllwil.w.h vr8, vr1, 0 // dst0 + vexth.w.h vr9, vr1 // dst0 + vsub.w vr8, vr8, vr4 + vsub.w vr9, vr9, vr5 + vmadd.w vr6, vr8, vr3 // v 0 - 3 + vmadd.w vr7, vr9, vr3 // v 4 - 7 + + vsllwil.w.h vr11, vr10, 0 // dst1 + vexth.w.h vr12, vr10 // dst1 + vsub.w vr11, vr11, vr4 + vsub.w vr12, vr12, vr5 + vmadd.w vr6, vr11, vr13 + vmadd.w vr7, vr12, vr13 + + vssrarni.hu.w vr7, vr6, 11 + vssrlni.bu.h vr7, vr7, 0 + vstelm.d vr7, t0, 0, 0 + addi.d t0, t0, 8 + addi.d t1, t1, 16 + addi.d t3, t3, 16 + addi.d t2, t2, -8 + bne zero, t2, .LSGRMIX_W + + beq t4, zero, .LSGRMIX_W8 + + vld vr0, t0, 0 + vld vr1, t1, 0 + vld vr10, t3, 0 + vsllwil.hu.bu vr2, vr0, 4 // u 8 h + vsllwil.wu.hu vr4, vr2, 0 // p + vexth.wu.hu vr5, vr2 // p + vslli.w vr6, vr4, 7 + vslli.w vr7, vr5, 7 + vsllwil.w.h vr8, vr1, 0 // dst + vexth.w.h vr9, vr1 // dst + vsub.w vr8, vr8, vr4 + vsub.w vr9, vr9, vr5 + vmadd.w vr6, vr8, vr3 // v 0 - 3 + vmadd.w vr7, vr9, vr3 // v 4 - 7 + + vsllwil.w.h vr11, vr10, 0 // dst1 + vexth.w.h vr12, vr10 // dst1 + vsub.w vr11, vr11, vr4 + vsub.w vr12, vr12, vr5 + vmadd.w vr6, vr11, vr13 + vmadd.w vr7, vr12, vr13 + + vssrarni.hu.w vr7, vr6, 11 + vssrlni.bu.h vr7, vr7, 0 + +.LSGRMIX_ST: + vstelm.b vr7, t0, 0, 0 + addi.d t0, t0, 1 + vbsrl.v vr7, vr7, 1 + addi.w t4, t4, -1 + bnez t4, .LSGRMIX_ST + +.LSGRMIX_W8: + addi.w a7, a7, -1 + add.d a0, a0, a1 + addi.d a2, a2, (FILTER_OUT_STRIDE<<1) + addi.d a3, a3, (FILTER_OUT_STRIDE<<1) + bnez a7, .LSGRMIX_H + b .LSGR_MIX_END + +.LSGRMIX_REM: + andi t4, a6, 0x7 + vld vr0, a0, 0 + vld vr1, a2, 0 + vld vr10, a3, 0 + vsllwil.hu.bu vr2, vr0, 4 // u 8 h + vsllwil.wu.hu vr4, vr2, 0 // p + vexth.wu.hu vr5, vr2 // p + vslli.w vr6, vr4, 7 + vslli.w vr7, vr5, 7 + vsllwil.w.h vr8, vr1, 0 // dst + vexth.w.h vr9, vr1 // dst + vsub.w vr8, vr8, vr4 + vsub.w vr9, vr9, vr5 + vmadd.w vr6, vr8, vr3 // v 0 - 3 + vmadd.w vr7, vr9, vr3 // v 4 - 7 + + vsllwil.w.h vr11, vr10, 0 // dst1 + vexth.w.h vr12, vr10 // dst1 + vsub.w vr11, vr11, vr4 + vsub.w vr12, vr12, vr5 + vmadd.w vr6, vr11, vr13 + vmadd.w vr7, vr12, vr13 + + vssrarni.hu.w vr7, vr6, 11 + vssrlni.bu.h vr7, vr7, 0 + addi.d t0, a0, 0 +.LSGRMIX_REM_ST: + vstelm.b vr7, t0, 0, 0 + addi.d t0, t0, 1 + vbsrl.v vr7, vr7, 1 + addi.w t4, t4, -1 + bnez t4, .LSGRMIX_REM_ST + + addi.w a7, a7, -1 + add.d a0, a0, a1 + addi.d a2, a2, (FILTER_OUT_STRIDE<<1) + addi.d a3, a3, (FILTER_OUT_STRIDE<<1) + bnez a7, .LSGRMIX_REM + +.LSGR_MIX_END: +endfunc diff --git a/src/loongarch/looprestoration.h b/src/loongarch/looprestoration.h new file mode 100644 index 0000000..ac0cb06 --- /dev/null +++ 
b/src/loongarch/looprestoration.h @@ -0,0 +1,78 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * Copyright © 2023, Loongson Technology Corporation Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_LOONGARCH_LOOPRESTORATION_H +#define DAV1D_SRC_LOONGARCH_LOOPRESTORATION_H + +#include "common/intops.h" +#include "src/cpu.h" +#include "src/looprestoration.h" + +void dav1d_wiener_filter_lsx(uint8_t *p, const ptrdiff_t stride, + const uint8_t (*const left)[4], + const uint8_t *lpf, + const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); + +void dav1d_sgr_filter_3x3_lsx(pixel *p, const ptrdiff_t p_stride, + const pixel (*const left)[4], + const pixel *lpf, + const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); + +void dav1d_sgr_filter_5x5_lsx(pixel *p, const ptrdiff_t p_stride, + const pixel (*const left)[4], + const pixel *lpf, + const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); + +void dav1d_sgr_filter_mix_lsx(pixel *p, const ptrdiff_t p_stride, + const pixel (*const left)[4], + const pixel *lpf, + const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); + +static ALWAYS_INLINE void loop_restoration_dsp_init_loongarch(Dav1dLoopRestorationDSPContext *const c, int bpc) +{ + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return; + +#if BITDEPTH == 8 + c->wiener[0] = c->wiener[1] = dav1d_wiener_filter_lsx; + + c->sgr[0] = dav1d_sgr_filter_5x5_lsx; + c->sgr[1] = dav1d_sgr_filter_3x3_lsx; + c->sgr[2] = dav1d_sgr_filter_mix_lsx; +#endif +} + +#endif /* DAV1D_SRC_LOONGARCH_LOOPRESTORATION_H */ diff --git a/src/loongarch/looprestoration_tmpl.c b/src/loongarch/looprestoration_tmpl.c new file mode 100644 index 0000000..66d0d63 --- /dev/null +++ b/src/loongarch/looprestoration_tmpl.c @@ -0,0 +1,274 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * Copyright © 2023, Loongson Technology Corporation Limited + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/loongarch/looprestoration.h" + +#if BITDEPTH == 8 + +#define REST_UNIT_STRIDE (400) + +void BF(dav1d_wiener_filter_h, lsx)(int32_t *hor_ptr, + uint8_t *tmp_ptr, + const int16_t filterh[8], + const int w, const int h); + +void BF(dav1d_wiener_filter_v, lsx)(uint8_t *p, + const ptrdiff_t p_stride, + const int32_t *hor, + const int16_t filterv[8], + const int w, const int h); + +// This function refers to the function in the ppc/looprestoration_init_tmpl.c. 
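[Editorial sketch, not part of the patch] A small illustrative helper showing how the padded working buffer produced by padding() below is addressed: rows are REST_UNIT_STRIDE (400) bytes apart and the copies in padding() leave a 3-pixel apron on every side, so stripe pixel (x, y) sits at the offset computed here. The helper name is hypothetical and assumes <stdint.h> is available, as it is in this file.

    /* Illustrative only: coordinate mapping into the padded buffer built by
     * padding() below, assuming its 3-row top apron and 3-column left apron. */
    static inline const uint8_t *padded_px(const uint8_t *tmp, const int x, const int y)
    {
        return &tmp[(y + 3) * REST_UNIT_STRIDE + (x + 3)];
    }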
+static inline void padding(uint8_t *dst, const uint8_t *p, + const ptrdiff_t stride, const uint8_t (*left)[4], + const uint8_t *lpf, int unit_w, const int stripe_h, + const enum LrEdgeFlags edges) +{ + const int have_left = !!(edges & LR_HAVE_LEFT); + const int have_right = !!(edges & LR_HAVE_RIGHT); + + // Copy more pixels if we don't have to pad them + unit_w += 3 * have_left + 3 * have_right; + uint8_t *dst_l = dst + 3 * !have_left; + p -= 3 * have_left; + lpf -= 3 * have_left; + + if (edges & LR_HAVE_TOP) { + // Copy previous loop filtered rows + const uint8_t *const above_1 = lpf; + const uint8_t *const above_2 = above_1 + PXSTRIDE(stride); + pixel_copy(dst_l, above_1, unit_w); + pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w); + pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w); + } else { + // Pad with first row + pixel_copy(dst_l, p, unit_w); + pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w); + pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w); + if (have_left) { + pixel_copy(dst_l, &left[0][1], 3); + pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3); + pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3); + } + } + + uint8_t *dst_tl = dst_l + 3 * REST_UNIT_STRIDE; + if (edges & LR_HAVE_BOTTOM) { + // Copy next loop filtered rows + const uint8_t *const below_1 = lpf + 6 * PXSTRIDE(stride); + const uint8_t *const below_2 = below_1 + PXSTRIDE(stride); + pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w); + pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w); + pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w); + } else { + // Pad with last row + const uint8_t *const src = p + (stripe_h - 1) * PXSTRIDE(stride); + pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w); + pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w); + pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w); + if (have_left) { + pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3); + pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3); + pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3); + } + } + + // Inner UNIT_WxSTRIPE_H + for (int j = 0; j < stripe_h; j++) { + pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left); + dst_tl += REST_UNIT_STRIDE; + p += PXSTRIDE(stride); + } + + if (!have_right) { + uint8_t *pad = dst_l + unit_w; + uint8_t *row_last = &dst_l[unit_w - 1]; + // Pad 3x(STRIPE_H+6) with last column + for (int j = 0; j < stripe_h + 6; j++) { + pixel_set(pad, *row_last, 3); + pad += REST_UNIT_STRIDE; + row_last += REST_UNIT_STRIDE; + } + } + + if (!have_left) { + // Pad 3x(STRIPE_H+6) with first column + for (int j = 0; j < stripe_h + 6; j++) { + pixel_set(dst, *dst_l, 3); + dst += REST_UNIT_STRIDE; + dst_l += REST_UNIT_STRIDE; + } + } else { + dst += 3 * REST_UNIT_STRIDE; + for (int j = 0; j < stripe_h; j++) { + pixel_copy(dst, &left[j][1], 3); + dst += REST_UNIT_STRIDE; + } + } +} + +// This function refers to the function in the ppc/looprestoration_init_tmpl.c. 
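[Editorial sketch, not part of the patch] Before the wrapper itself, a plain scalar model of the separable filtering that the dav1d_wiener_filter_h/v LSX kernels declared above implement: a 7-tap horizontal pass into a 32-bit intermediate buffer, followed by a 7-tap vertical pass back to pixels. It assumes each 7-tap kernel sums to 128 (the AV1 Wiener normalization); the real kernels additionally apply dav1d's exact intermediate offsets, rounding and clipping, which are simplified into a single final shift here. The function and buffer names are hypothetical.

    /* Rough scalar model of the two-pass Wiener filtering; not dav1d's
     * reference implementation. padded is the 400-stride buffer from
     * padding(), with a 3-pixel apron on every side. */
    static void wiener_scalar_model(uint8_t *dst, const ptrdiff_t stride,
                                    const uint8_t *padded,
                                    const int16_t fh[8], const int16_t fv[8],
                                    const int w, const int h)
    {
        static int32_t mid[70 * 400]; /* one horizontal result per padded pixel */
        for (int y = 0; y < h + 6; y++)
            for (int x = 0; x < w; x++) {
                int32_t s = 0;
                for (int k = 0; k < 7; k++) /* taps cover x-3 .. x+3 of the source */
                    s += fh[k] * padded[y * 400 + x + k];
                mid[y * 400 + x] = s;
            }
        for (int y = 0; y < h; y++)
            for (int x = 0; x < w; x++) {
                int64_t s = 0;
                for (int k = 0; k < 7; k++) /* taps cover y-3 .. y+3 of the source */
                    s += (int64_t)fv[k] * mid[(y + k) * 400 + x];
                /* 128 * 128 normalization under the stated assumption */
                const int v = (int)((s + (1 << 13)) >> 14);
                dst[y * stride + x] = v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
            }
    }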
+ +// FIXME Could split into luma and chroma specific functions, +// (since first and last tops are always 0 for chroma) +// FIXME Could implement a version that requires less temporary memory +// (should be possible to implement with only 6 rows of temp storage) +void dav1d_wiener_filter_lsx(uint8_t *p, const ptrdiff_t p_stride, + const uint8_t (*const left)[4], + const uint8_t *lpf, + const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) +{ + const int16_t (*const filter)[8] = params->filter; + + // Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels + // of padding above and below + ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,); + padding(tmp, p, p_stride, left, lpf, w, h, edges); + ALIGN_STK_16(int32_t, hor, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE + 64,); + + BF(dav1d_wiener_filter_h, lsx)(hor, tmp, filter[0], w, h + 6); + BF(dav1d_wiener_filter_v, lsx)(p, p_stride, hor, filter[1], w, h); +} + +void BF(dav1d_boxsum3_h, lsx)(int32_t *sumsq, int16_t *sum, pixel *src, + const int w, const int h); +void BF(dav1d_boxsum3_v, lsx)(int32_t *sumsq, int16_t *sum, + const int w, const int h); + +void BF(dav1d_boxsum3_sgf_h, lsx)(int32_t *sumsq, int16_t *sum, + const int w, const int h, const int w1); +void BF(dav1d_boxsum3_sgf_v, lsx)(int16_t *dst, uint8_t *tmp, + int32_t *sumsq, int16_t *sum, + const int w, const int h); +void BF(dav1d_sgr_3x3_finish, lsx)(pixel *p, const ptrdiff_t p_stride, + int16_t *dst, int w1, + const int w, const int h); + + +static inline void boxsum3_lsx(int32_t *sumsq, coef *sum, pixel *src, + const int w, const int h) +{ + BF(dav1d_boxsum3_h, lsx)(sumsq, sum, src, w + 6, h + 6); + BF(dav1d_boxsum3_v, lsx)(sumsq, sum, w + 6, h + 6); +} + +void dav1d_sgr_filter_3x3_lsx(pixel *p, const ptrdiff_t p_stride, + const pixel (*const left)[4], + const pixel *lpf, + const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) +{ + ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,); + padding(tmp, p, p_stride, left, lpf, w, h, edges); + coef dst[64 * 384]; + + ALIGN_STK_16(int32_t, sumsq, 68 * REST_UNIT_STRIDE + 8, ); + ALIGN_STK_16(int16_t, sum, 68 * REST_UNIT_STRIDE + 16, ); + + boxsum3_lsx(sumsq, sum, tmp, w, h); + BF(dav1d_boxsum3_sgf_h, lsx)(sumsq, sum, w, h, params->sgr.s1); + BF(dav1d_boxsum3_sgf_v, lsx)(dst, tmp, sumsq, sum, w, h); + BF(dav1d_sgr_3x3_finish, lsx)(p, p_stride, dst, params->sgr.w1, w, h); +} + +void BF(dav1d_boxsum5_h, lsx)(int32_t *sumsq, int16_t *sum, + const uint8_t *const src, + const int w, const int h); + +void BF(dav1d_boxsum5_v, lsx)(int32_t *sumsq, int16_t *sum, + const int w, const int h); + +void BF(dav1d_boxsum5_sgf_h, lsx)(int32_t *sumsq, int16_t *sum, + const int w, const int h, + const unsigned s); + +void BF(dav1d_boxsum5_sgf_v, lsx)(int16_t *dst, uint8_t *src, + int32_t *sumsq, int16_t *sum, + const int w, const int h); + +void BF(dav1d_sgr_mix_finish, lsx)(uint8_t *p, const ptrdiff_t stride, + const int16_t *dst0, const int16_t *dst1, + const int w0, const int w1, + const int w, const int h); + +static inline void boxsum5_lsx(int32_t *sumsq, coef *sum, pixel *src, + const int w, const int h) +{ + BF(dav1d_boxsum5_h, lsx)(sumsq, sum, src, w + 6, h + 6); + BF(dav1d_boxsum5_v, lsx)(sumsq, sum, w + 6, h + 6); +} + +void dav1d_sgr_filter_5x5_lsx(pixel *p, const ptrdiff_t p_stride, + const pixel (*const left)[4], + const pixel *lpf, + const int w, const int h, + 
const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) +{ + ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,); + padding(tmp, p, p_stride, left, lpf, w, h, edges); + coef dst[64 * 384]; + + ALIGN_STK_16(int32_t, sumsq, 68 * REST_UNIT_STRIDE + 8, ); + ALIGN_STK_16(int16_t, sum, 68 * REST_UNIT_STRIDE + 16, ); + + boxsum5_lsx(sumsq, sum, tmp, w, h); + BF(dav1d_boxsum5_sgf_h, lsx)(sumsq, sum, w, h, params->sgr.s0); + BF(dav1d_boxsum5_sgf_v, lsx)(dst, tmp, sumsq, sum, w, h); + BF(dav1d_sgr_3x3_finish, lsx)(p, p_stride, dst, params->sgr.w0, w, h); +} + +void dav1d_sgr_filter_mix_lsx(pixel *p, const ptrdiff_t p_stride, + const pixel (*const left)[4], + const pixel *lpf, + const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) +{ + ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,); + padding(tmp, p, p_stride, left, lpf, w, h, edges); + coef dst0[64 * 384]; + coef dst1[64 * 384]; + + ALIGN_STK_16(int32_t, sumsq0, 68 * REST_UNIT_STRIDE + 8, ); + ALIGN_STK_16(int16_t, sum0, 68 * REST_UNIT_STRIDE + 16, ); + + boxsum5_lsx(sumsq0, sum0, tmp, w, h); + BF(dav1d_boxsum5_sgf_h, lsx)(sumsq0, sum0, w, h, params->sgr.s0); + BF(dav1d_boxsum5_sgf_v, lsx)(dst0, tmp, sumsq0, sum0, w, h); + + boxsum3_lsx(sumsq0, sum0, tmp, w, h); + BF(dav1d_boxsum3_sgf_h, lsx)(sumsq0, sum0, w, h, params->sgr.s1); + BF(dav1d_boxsum3_sgf_v, lsx)(dst1, tmp, sumsq0, sum0, w, h); + + BF(dav1d_sgr_mix_finish, lsx)(p, p_stride, dst0, dst1, params->sgr.w0, + params->sgr.w1, w, h); +} +#endif diff --git a/src/looprestoration_tmpl.c b/src/looprestoration_tmpl.c index d4d7867..9922908 100644 --- a/src/looprestoration_tmpl.c +++ b/src/looprestoration_tmpl.c @@ -527,6 +527,8 @@ static void sgr_mix_c(pixel *p, const ptrdiff_t stride, #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM #include "src/arm/looprestoration.h" +#elif ARCH_LOONGARCH64 +#include "src/loongarch/looprestoration.h" #elif ARCH_PPC64LE #include "src/ppc/looprestoration.h" #elif ARCH_X86 @@ -545,6 +547,8 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM loop_restoration_dsp_init_arm(c, bpc); +#elif ARCH_LOONGARCH64 + loop_restoration_dsp_init_loongarch(c, bpc); #elif ARCH_PPC64LE loop_restoration_dsp_init_ppc(c, bpc); #elif ARCH_X86 diff --git a/src/meson.build b/src/meson.build index 88d4ff1..a3c211e 100644 --- a/src/meson.build +++ b/src/meson.build @@ -240,9 +240,14 @@ if is_asm_enabled 'loongarch/cpu.c', ) + libdav1d_arch_tmpl_sources += files( + 'loongarch/looprestoration_tmpl.c', + ) + libdav1d_sources_asm = files( 'loongarch/mc.S', 'loongarch/loopfilter.S', + 'loongarch/looprestoration.S', ) libdav1d_asm_objs += libdav1d_sources_asm endif -- cgit v1.2.3 From 38bc00849a76bb6cee75d2874ef1a9cd67daab81 Mon Sep 17 00:00:00 2001 From: jinbo Date: Fri, 1 Dec 2023 14:49:40 +0800 Subject: loongarch: Improve the performance of msac series functions Relative speedup over C code: msac_decode_bool_c: 0.5 ( 1.00x) msac_decode_bool_lsx: 0.5 ( 1.09x) msac_decode_bool_adapt_c: 0.7 ( 1.00x) msac_decode_bool_adapt_lsx: 0.6 ( 1.20x) msac_decode_symbol_adapt4_c: 1.3 ( 1.00x) msac_decode_symbol_adapt4_lsx: 1.0 ( 1.30x) msac_decode_symbol_adapt8_c: 2.1 ( 1.00x) msac_decode_symbol_adapt8_lsx: 1.0 ( 2.05x) msac_decode_symbol_adapt16_c: 3.7 ( 1.00x) msac_decode_symbol_adapt16_lsx: 0.8 ( 4.77x) --- src/loongarch/msac.S | 368 ++++++++++++++++++++++++++++++++++++++++++++++++++ 
src/loongarch/msac.h | 46 +++++++ src/meson.build | 1 + src/msac.h | 2 + tests/checkasm/msac.c | 8 ++ 5 files changed, 425 insertions(+) create mode 100644 src/loongarch/msac.S create mode 100644 src/loongarch/msac.h diff --git a/src/loongarch/msac.S b/src/loongarch/msac.S new file mode 100644 index 0000000..c371eba --- /dev/null +++ b/src/loongarch/msac.S @@ -0,0 +1,368 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * Copyright © 2023, Loongson Technology Corporation Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "loongson_asm.S" + +const min_prob + .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0 +endconst + +.macro decode_symbol_adapt w + addi.d sp, sp, -48 + addi.d a4, a0, 24 + vldrepl.h vr0, a4, 0 //rng + fst.s f0, sp, 0 //val==0 + vld vr1, a1, 0 //cdf +.if \w == 16 + li.w t4, 16 + vldx vr11, a1, t4 +.endif + addi.d a6, a0, 16 + vldrepl.d vr2, a6, 0 //dif + addi.d t0, a0, 32 + ld.w t1, t0, 0 //allow_update_cdf + la.local t2, min_prob + addi.d t2, t2, 32 + addi.w t3, a2, 1 + slli.w t3, t3, 1 + sub.d t2, t2, t3 + vld vr3, t2, 0 //min_prob +.if \w == 16 + vldx vr13, t2, t4 +.endif + vsrli.h vr4, vr0, 8 //r = s->rng >> 8 + vslli.h vr4, vr4, 8 //r << 8 + vsrli.h vr5, vr1, 6 + vslli.h vr5, vr5, 7 +.if \w == 16 + vsrli.h vr15, vr11, 6 + vslli.h vr15, vr15, 7 +.endif + vmuh.hu vr5, vr4, vr5 + vadd.h vr5, vr5, vr3 //v +.if \w == 16 + vmuh.hu vr15, vr4, vr15 + vadd.h vr15, vr15, vr13 +.endif + addi.d t8, sp, 4 + vst vr5, t8, 0 //store v +.if \w == 16 + vstx vr15, t8, t4 +.endif + vreplvei.h vr20, vr2, 3 //c + vssub.hu vr6, vr5, vr20 //c >=v + vseqi.h vr6, vr6, 0 +.if \w == 16 + vssub.hu vr16, vr15, vr20 //c >=v + vseqi.h vr16, vr16, 0 + vpickev.b vr21, vr16, vr6 +.endif +.if \w <= 8 + vmskltz.h vr10, vr6 +.else + vmskltz.b vr10, vr21 +.endif + beqz t1, .renorm\()\w + + // update_cdf + alsl.d t1, a2, a1, 1 + ld.h t2, t1, 0 //count + srli.w t3, t2, 4 //count >> 4 + addi.w t3, t3, 4 + li.w t5, 2 + sltu t5, t5, a2 + add.w t3, t3, t5 //rate + sltui t5, t2, 32 + add.w t2, t2, t5 //count + (count < 32) + vreplgr2vr.h vr9, t3 + vseq.h vr7, vr7, vr7 + vavgr.hu vr5, vr6, vr7 //i >= val ? 
-1 : 32768 + vsub.h vr5, vr5, vr1 + vsub.h vr8, vr1, vr6 +.if \w == 16 + vavgr.hu vr15, vr16, vr7 + vsub.h vr15, vr15, vr11 + vsub.h vr18, vr11, vr16 +.endif + vsra.h vr5, vr5, vr9 + vadd.h vr8, vr8, vr5 +.if \w == 4 + fst.d f8, a1, 0 +.else + vst vr8, a1, 0 +.endif +.if \w == 16 + vsra.h vr15, vr15, vr9 + vadd.h vr18, vr18, vr15 + vstx vr18, a1, t4 +.endif + st.h t2, t1, 0 + +.renorm\()\w: + vpickve2gr.h t3, vr10, 0 + ctz.w a7, t3 // ret + alsl.d t3, a7, t8, 1 + ld.hu t4, t3, 0 // v + addi.d t3, t3, -2 + ld.hu t5, t3, 0 // u + sub.w t5, t5, t4 // rng + slli.d t4, t4, 48 + vpickve2gr.d t6, vr2, 0 + sub.d t6, t6, t4 // dif + addi.d t6, t6, 1 + clz.w t4, t5 // d + xori t4, t4, 16 // d + sll.d t6, t6, t4 + addi.d t6, t6, -1 // dif + addi.d a5, a0, 28 // cnt + ld.w t7, a5, 0 + sub.w t7, t7, t4 // cnt-d + sll.w t5, t5, t4 + st.w t5, a4, 0 // store rng + bge t7, zero, 9f + + // refill + ld.d t0, a0, 0 // buf_pos + addi.d t1, a0, 8 + ld.d t1, t1, 0 // buf_end + addi.d t2, t0, 8 + blt t1, t2, 1f + + ld.d t0, t0, 0 // next_bits + addi.w t3, t7, 23 // shift_bits = cnt + 23 + addi.w t7, t7, 16 // cnt += 16 + revb.d t0, t0 // next_bits = bswap(next_bits) + srli.w t4, t3, 3 + sub.d t2, t2, t4 // buf_pos -= shift_bits >> 3 + st.d t2, a0, 0 + andi t3, t3, 24 // shift_bits &= 24 + srl.d t0, t0, t3 // next_bits >>= shift_bits + sub.w t3, t3, t7 // shift_bits -= 16 + cnt + sll.d t0, t0, t3 // next_bits <<= shift_bits + li.w t5, 48 + sub.w t7, t5, t3 // cnt = cnt + 64 - shift_bits + xor t6, t6, t0 // dif ^= next_bits + b 9f +1: + li.w t4, 40 + sub.w t5, t4, t7 // c = 40 - cnt +2: + bge t0, t1, 3f + ld.bu t2, t0, 0 + addi.d t0, t0, 1 + sll.d t2, t2, t5 + xor t6, t6, t2 + addi.w t5, t5, -8 + bge t5, zero, 2b + // refill_eob_end +3: + st.d t0, a0, 0 // s->buf_pos = buf_pos + sub.w t7, t4, t5 // cnt = 40 - c +9: + st.w t7, a5, 0 // store cnt + st.d t6, a6, 0 // store dif + move a0, a7 + addi.d sp, sp, 48 +.endm + +function msac_decode_symbol_adapt4_lsx + decode_symbol_adapt 4 +endfunc + +function msac_decode_symbol_adapt8_lsx + decode_symbol_adapt 8 +endfunc + +function msac_decode_symbol_adapt16_lsx + decode_symbol_adapt 16 +endfunc + +function msac_decode_bool_lsx + ld.w t0, a0, 24 // rng + srli.w a1, a1, 6 + ld.d t1, a0, 16 // dif + srli.w t2, t0, 8 // r >> 8 + mul.w t2, t2, a1 + ld.w a5, a0, 28 // cnt + addi.d t1, t1, 1 // dif + 1 + srli.w t2, t2, 1 + addi.w t2, t2, 4 // v + slli.d t3, t2, 48 // vw + sltu t4, t1, t3 + move t8, t4 // ret + xori t4, t4, 1 + maskeqz t6, t3, t4 // if (ret) vw + sub.d t6, t1, t6 // dif + slli.w t5, t2, 1 + sub.w t5, t0, t5 // r - 2v + maskeqz t7, t5, t4 // if (ret) r - 2v + add.w t5, t2, t7 // v(rng) + + // renorm + clz.w t4, t5 // d + xori t4, t4, 16 // d + sll.d t6, t6, t4 + addi.d t6, t6, -1 // dif + sub.w t7, a5, t4 // cnt-d + sll.w t5, t5, t4 + st.w t5, a0, 24 // store rng + bge t7, zero, 9f + + // refill + ld.d t0, a0, 0 // buf_pos + addi.d t1, a0, 8 + ld.d t1, t1, 0 // buf_end + addi.d t2, t0, 8 + blt t1, t2, 1f + + ld.d t0, t0, 0 // next_bits + addi.w t3, t7, 23 // shift_bits = cnt + 23 + addi.w t7, t7, 16 // cnt += 16 + revb.d t0, t0 // next_bits = bswap(next_bits) + srli.w t4, t3, 3 + sub.d t2, t2, t4 // buf_pos -= shift_bits >> 3 + st.d t2, a0, 0 + andi t3, t3, 24 // shift_bits &= 24 + srl.d t0, t0, t3 // next_bits >>= shift_bits + sub.w t3, t3, t7 // shift_bits -= 16 + cnt + sll.d t0, t0, t3 // next_bits <<= shift_bits + li.w t5, 48 + sub.w t7, t5, t3 // cnt = cnt + 64 - shift_bits + xor t6, t6, t0 // dif ^= next_bits + b 9f +1: + li.w t4, 40 + sub.w t5, t4, t7 // c = 
40 - cnt +2: + bge t0, t1, 3f + ld.bu t2, t0, 0 + addi.d t0, t0, 1 + sll.d t2, t2, t5 + xor t6, t6, t2 + addi.w t5, t5, -8 + bge t5, zero, 2b + // refill_eob_end +3: + st.d t0, a0, 0 // s->buf_pos = buf_pos + sub.w t7, t4, t5 // cnt = 40 - c +9: + st.w t7, a0, 28 // store cnt + st.d t6, a0, 16 // store dif + move a0, t8 +endfunc + +function msac_decode_bool_adapt_lsx + ld.hu a3, a1, 0 // cdf[0] /f + ld.w t0, a0, 24 // rng + ld.d t1, a0, 16 // dif + srli.w t2, t0, 8 // r >> 8 + srli.w a7, a3, 6 + mul.w t2, t2, a7 + ld.w a4, a0, 32 // allow_update_cdf + ld.w a5, a0, 28 // cnt + srli.w t2, t2, 1 + addi.w t2, t2, 4 // v + slli.d t3, t2, 48 // vw + sltu t4, t1, t3 + move t8, t4 // bit + xori t4, t4, 1 + maskeqz t6, t3, t4 // if (ret) vw + sub.d t6, t1, t6 // dif + slli.w t5, t2, 1 + sub.w t5, t0, t5 // r - 2v + maskeqz t7, t5, t4 // if (ret) r - 2v + add.w t5, t2, t7 // v(rng) + beqz a4, .renorm + + // update_cdf + ld.hu t0, a1, 2 // cdf[1] + srli.w t1, t0, 4 + addi.w t1, t1, 4 // rate + sltui t2, t0, 32 // count < 32 + add.w t0, t0, t2 // count + (count < 32) + sub.w a3, a3, t8 // cdf[0] -= bit + slli.w t4, t8, 15 + sub.w t7, a3, t4 // cdf[0] - bit - 32768 + sra.w t7, t7, t1 // (cdf[0] - bit - 32768) >> rate + sub.w t7, a3, t7 // cdf[0] + st.h t7, a1, 0 + st.h t0, a1, 2 + +.renorm: + // renorm + addi.d t6, t6, 1 + clz.w t4, t5 // d + xori t4, t4, 16 // d + sll.d t6, t6, t4 + addi.d t6, t6, -1 // dif + sub.w t7, a5, t4 // cnt-d + sll.w t5, t5, t4 + st.w t5, a0, 24 // store rng + bge t7, zero, 9f + + // refill + ld.d t0, a0, 0 // buf_pos + addi.d t1, a0, 8 + ld.d t1, t1, 0 // buf_end + addi.d t2, t0, 8 + blt t1, t2, 1f + + ld.d t0, t0, 0 // next_bits + addi.w t3, t7, 23 // shift_bits = cnt + 23 + addi.w t7, t7, 16 // cnt += 16 + revb.d t0, t0 // next_bits = bswap(next_bits) + srli.w t4, t3, 3 + sub.d t2, t2, t4 // buf_pos -= shift_bits >> 3 + st.d t2, a0, 0 + andi t3, t3, 24 // shift_bits &= 24 + srl.d t0, t0, t3 // next_bits >>= shift_bits + sub.w t3, t3, t7 // shift_bits -= 16 + cnt + sll.d t0, t0, t3 // next_bits <<= shift_bits + li.w t5, 48 + sub.w t7, t5, t3 // cnt = cnt + 64 - shift_bits + xor t6, t6, t0 // dif ^= next_bits + b 9f +1: + li.w t4, 40 + sub.w t5, t4, t7 // c = 40 - cnt +2: + bge t0, t1, 3f + ld.bu t2, t0, 0 + addi.d t0, t0, 1 + sll.d t2, t2, t5 + xor t6, t6, t2 + addi.w t5, t5, -8 + bge t5, zero, 2b + // refill_eob_end +3: + st.d t0, a0, 0 // s->buf_pos = buf_pos + sub.w t7, t4, t5 // cnt = 40 - c +9: + st.w t7, a0, 28 // store cnt + st.d t6, a0, 16 // store dif + move a0, t8 +endfunc diff --git a/src/loongarch/msac.h b/src/loongarch/msac.h new file mode 100644 index 0000000..fdcff83 --- /dev/null +++ b/src/loongarch/msac.h @@ -0,0 +1,46 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * Copyright © 2023, Loongson Technology Corporation Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_LOONGARCH_MSAC_H +#define DAV1D_SRC_LOONGARCH_MSAC_H + +unsigned dav1d_msac_decode_symbol_adapt4_lsx(MsacContext *s, uint16_t *cdf, + size_t n_symbols); +unsigned dav1d_msac_decode_symbol_adapt8_lsx(MsacContext *s, uint16_t *cdf, + size_t n_symbols); +unsigned dav1d_msac_decode_symbol_adapt16_lsx(MsacContext *s, uint16_t *cdf, + size_t n_symbols); +unsigned dav1d_msac_decode_bool_adapt_lsx(MsacContext *s, uint16_t *cdf); +unsigned dav1d_msac_decode_bool_lsx(MsacContext *s, unsigned f); + +#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_lsx +#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_lsx +#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_lsx +#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_lsx +#define dav1d_msac_decode_bool dav1d_msac_decode_bool_lsx + +#endif /* DAV1D_SRC_LOONGARCH_MSAC_H */ diff --git a/src/meson.build b/src/meson.build index a3c211e..1716520 100644 --- a/src/meson.build +++ b/src/meson.build @@ -248,6 +248,7 @@ if is_asm_enabled 'loongarch/mc.S', 'loongarch/loopfilter.S', 'loongarch/looprestoration.S', + 'loongarch/msac.S', ) libdav1d_asm_objs += libdav1d_sources_asm endif diff --git a/src/msac.h b/src/msac.h index eb04f58..c3e07e1 100644 --- a/src/msac.h +++ b/src/msac.h @@ -51,6 +51,8 @@ typedef struct MsacContext { #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM #include "src/arm/msac.h" +#elif ARCH_LOONGARCH64 +#include "src/loongarch/msac.h" #elif ARCH_X86 #include "src/x86/msac.h" #endif diff --git a/tests/checkasm/msac.c b/tests/checkasm/msac.c index b9c89b4..81fd593 100644 --- a/tests/checkasm/msac.c +++ b/tests/checkasm/msac.c @@ -266,6 +266,14 @@ void checkasm_check_msac(void) { c.decode_bool = dav1d_msac_decode_bool_neon; c.decode_hi_tok = dav1d_msac_decode_hi_tok_neon; } +#elif ARCH_LOONGARCH64 && HAVE_ASM + if (dav1d_get_cpu_flags() & DAV1D_LOONGARCH_CPU_FLAG_LSX) { + c.decode_symbol_adapt4 = dav1d_msac_decode_symbol_adapt4_lsx; + c.decode_symbol_adapt8 = dav1d_msac_decode_symbol_adapt8_lsx; + c.decode_symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_lsx; + c.decode_bool_adapt = dav1d_msac_decode_bool_adapt_lsx; + c.decode_bool = dav1d_msac_decode_bool_lsx; + } #elif ARCH_X86 && HAVE_ASM if (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_SSE2) { c.decode_symbol_adapt4 = dav1d_msac_decode_symbol_adapt4_sse2; -- cgit v1.2.3 From 14df65f217c5fb03b640ddaeeb2c7a36b411bcab Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Fri, 1 Dec 2023 14:54:11 +0800 Subject: loongarch: Improve the performance of refmvs.splat_mv function Relative speedup over C code: splat_mv_w1_c: 0.6 ( 1.00x) splat_mv_w1_lsx: 0.4 ( 1.28x) splat_mv_w2_c: 0.9 ( 1.00x) splat_mv_w2_lsx: 0.6 ( 1.65x) 
splat_mv_w4_c: 2.2 ( 1.00x) splat_mv_w4_lsx: 0.8 ( 2.87x) splat_mv_w8_c: 7.7 ( 1.00x) splat_mv_w8_lsx: 2.0 ( 3.80x) splat_mv_w16_c: 19.1 ( 1.00x) splat_mv_w16_lsx: 4.6 ( 4.18x) splat_mv_w32_c: 49.0 ( 1.00x) splat_mv_w32_lsx: 10.3 ( 4.76x) --- src/loongarch/refmvs.S | 152 +++++++++++++++++++++++++++++++++++++++++++++++++ src/loongarch/refmvs.h | 44 ++++++++++++++ src/meson.build | 1 + src/refmvs.c | 4 ++ src/refmvs.h | 1 + 5 files changed, 202 insertions(+) create mode 100644 src/loongarch/refmvs.S create mode 100644 src/loongarch/refmvs.h diff --git a/src/loongarch/refmvs.S b/src/loongarch/refmvs.S new file mode 100644 index 0000000..63a83d3 --- /dev/null +++ b/src/loongarch/refmvs.S @@ -0,0 +1,152 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * Copyright © 2023, Loongson Technology Corporation Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/loongarch/loongson_asm.S" + +/* +static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv, + const int bx4, const int bw4, int bh4) +*/ + +function splat_mv_lsx + vld vr0, a1, 0 // 0 1 ... 11 ... + clz.w t4, a3 + vaddi.bu vr1, vr0, 0 + addi.w t4, t4, -26 + vextrins.w vr1, vr0, 0x30 // 0 1 2 ... 
11 0 1 2 3 + la.local t5, .SPLAT_LSX_JRTABLE + vbsrl.v vr2, vr1, 4 // 4 5 6 7...11 0 1 2 3 0 0 0 0 + alsl.d t6, t4, t5, 1 + vextrins.w vr2, vr0, 0x31 // 4 5 6 7...11 0 1 2 3 4 5 6 7 + ld.h t7, t6, 0 + vbsrl.v vr3, vr2, 4 // 8 9 10 11 0 1 2 3 4 5 6 7 0 0 0 0 + add.d t8, t5, t7 + alsl.d a2, a2, a2, 1 + vextrins.w vr3, vr0, 0x32 // 8 9 10 11 0 1 2 3 4 5 6 7 8 9 10 11 + slli.w a2, a2, 2 + jirl $r0, t8, 0 + +.SPLAT_LSX_JRTABLE: + .hword .SPLAT_W32_LSX - .SPLAT_LSX_JRTABLE + .hword .SPLAT_W16_LSX - .SPLAT_LSX_JRTABLE + .hword .SPLAT_W8_LSX - .SPLAT_LSX_JRTABLE + .hword .SPLAT_W4_LSX - .SPLAT_LSX_JRTABLE + .hword .SPLAT_W2_LSX - .SPLAT_LSX_JRTABLE + .hword .SPLAT_W1_LSX - .SPLAT_LSX_JRTABLE + +.SPLAT_W1_LSX: + ld.d t3, a0, 0 + addi.d a0, a0, 8 + addi.d a4, a4, -1 + add.d t3, t3, a2 + + fst.d f1, t3, 0 + fst.s f3, t3, 8 + blt zero, a4, .SPLAT_W1_LSX + b .splat_end +.SPLAT_W2_LSX: + ld.d t3, a0, 0 + addi.d a0, a0, 8 + addi.d a4, a4, -1 + add.d t3, t3, a2 + + vst vr1, t3, 0 + fst.d f2, t3, 16 + blt zero, a4, .SPLAT_W2_LSX + b .splat_end + +.SPLAT_W4_LSX: + ld.d t3, a0, 0 + addi.d a0, a0, 8 + addi.d a4, a4, -1 + add.d t3, t3, a2 + + vst vr1, t3, 0 + vst vr2, t3, 16 + vst vr3, t3, 32 + blt zero, a4, .SPLAT_W4_LSX + b .splat_end + +.SPLAT_W8_LSX: + ld.d t3, a0, 0 + addi.d a0, a0, 8 + addi.d a4, a4, -1 + add.d t3, t3, a2 + + vst vr1, t3, 0 + vst vr2, t3, 16 + vst vr3, t3, 32 + + vst vr1, t3, 48 + vst vr2, t3, 64 + vst vr3, t3, 80 + blt zero, a4, .SPLAT_W8_LSX + b .splat_end + +.SPLAT_W16_LSX: + ld.d t3, a0, 0 + addi.d a0, a0, 8 + addi.d a4, a4, -1 + add.d t3, t3, a2 + +.rept 2 + vst vr1, t3, 0 + vst vr2, t3, 16 + vst vr3, t3, 32 + + vst vr1, t3, 48 + vst vr2, t3, 64 + vst vr3, t3, 80 + + addi.d t3, t3, 96 +.endr + + blt zero, a4, .SPLAT_W16_LSX + b .splat_end + +.SPLAT_W32_LSX: + ld.d t3, a0, 0 + addi.d a0, a0, 8 + addi.d a4, a4, -1 + add.d t3, t3, a2 + +.rept 4 + vst vr1, t3, 0 + vst vr2, t3, 16 + vst vr3, t3, 32 + + vst vr1, t3, 48 + vst vr2, t3, 64 + vst vr3, t3, 80 + + addi.d t3, t3, 96 +.endr + + blt zero, a4, .SPLAT_W32_LSX + +.splat_end: +endfunc diff --git a/src/loongarch/refmvs.h b/src/loongarch/refmvs.h new file mode 100644 index 0000000..60ff435 --- /dev/null +++ b/src/loongarch/refmvs.h @@ -0,0 +1,44 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * Copyright © 2023, Loongson Technology Corporation Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_LOONGARCH_REFMVS_H +#define DAV1D_SRC_LOONGARCH_REFMVS_H + +#include "src/cpu.h" +#include "src/refmvs.h" + +decl_splat_mv_fn(dav1d_splat_mv_lsx); + +static ALWAYS_INLINE void refmvs_dsp_init_loongarch(Dav1dRefmvsDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return; + + c->splat_mv = dav1d_splat_mv_lsx; +} + +#endif /* DAV1D_SRC_LOONGARCH_REFMVS_H */ diff --git a/src/meson.build b/src/meson.build index 1716520..f07348b 100644 --- a/src/meson.build +++ b/src/meson.build @@ -249,6 +249,7 @@ if is_asm_enabled 'loongarch/loopfilter.S', 'loongarch/looprestoration.S', 'loongarch/msac.S', + 'loongarch/refmvs.S', ) libdav1d_asm_objs += libdav1d_sources_asm endif diff --git a/src/refmvs.c b/src/refmvs.c index 0b5ccd3..200afeb 100644 --- a/src/refmvs.c +++ b/src/refmvs.c @@ -919,6 +919,8 @@ static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv, #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM #include "src/arm/refmvs.h" +#elif ARCH_LOONGARCH64 +#include "src/loongarch/refmvs.h" #elif ARCH_X86 #include "src/x86/refmvs.h" #endif @@ -933,6 +935,8 @@ COLD void dav1d_refmvs_dsp_init(Dav1dRefmvsDSPContext *const c) #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM refmvs_dsp_init_arm(c); +#elif ARCH_LOONGARCH64 + refmvs_dsp_init_loongarch(c); #elif ARCH_X86 refmvs_dsp_init_x86(c); #endif diff --git a/src/refmvs.h b/src/refmvs.h index 70dc967..d63874d 100644 --- a/src/refmvs.h +++ b/src/refmvs.h @@ -171,6 +171,7 @@ void dav1d_refmvs_find(const refmvs_tile *rt, void dav1d_refmvs_dsp_init(Dav1dRefmvsDSPContext *dsp); void dav1d_refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *dsp); +void dav1d_refmvs_dsp_init_loongarch(Dav1dRefmvsDSPContext *dsp); void dav1d_refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *dsp); #endif /* DAV1D_SRC_REF_MVS_H */ -- cgit v1.2.3 From a4cd834991c3ef6b9b4922f0663f09ac8a865f88 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Sat, 2 Dec 2023 10:18:42 +0800 Subject: loongarch: Improve the performance of itx_8bpc.add_4x4 series functions Relative speedup over C code: inv_txfm_add_4x4_adst_adst_0_8bpc_c: 14.1 ( 1.00x) inv_txfm_add_4x4_adst_adst_0_8bpc_lsx: 1.3 (11.16x) inv_txfm_add_4x4_adst_adst_1_8bpc_c: 14.1 ( 1.00x) inv_txfm_add_4x4_adst_adst_1_8bpc_lsx: 1.3 (11.17x) inv_txfm_add_4x4_adst_dct_0_8bpc_c: 14.8 ( 1.00x) inv_txfm_add_4x4_adst_dct_0_8bpc_lsx: 1.4 (10.99x) inv_txfm_add_4x4_adst_dct_1_8bpc_c: 14.9 ( 1.00x) inv_txfm_add_4x4_adst_dct_1_8bpc_lsx: 1.3 (11.42x) inv_txfm_add_4x4_adst_flipadst_0_8bpc_c: 14.4 ( 1.00x) inv_txfm_add_4x4_adst_flipadst_0_8bpc_lsx: 1.2 (11.52x) inv_txfm_add_4x4_adst_flipadst_1_8bpc_c: 14.4 ( 1.00x) inv_txfm_add_4x4_adst_flipadst_1_8bpc_lsx: 1.2 (11.52x) inv_txfm_add_4x4_adst_identity_0_8bpc_c: 12.5 ( 1.00x) inv_txfm_add_4x4_adst_identity_0_8bpc_lsx: 1.2 (10.22x) inv_txfm_add_4x4_adst_identity_1_8bpc_c: 12.5 ( 1.00x) inv_txfm_add_4x4_adst_identity_1_8bpc_lsx: 1.2 (10.26x) inv_txfm_add_4x4_dct_adst_0_8bpc_c: 14.6 ( 1.00x) 
inv_txfm_add_4x4_dct_adst_0_8bpc_lsx: 1.3 (11.37x) inv_txfm_add_4x4_dct_adst_1_8bpc_c: 14.6 ( 1.00x) inv_txfm_add_4x4_dct_adst_1_8bpc_lsx: 1.3 (11.55x) inv_txfm_add_4x4_dct_dct_0_8bpc_c: 3.2 ( 1.00x) inv_txfm_add_4x4_dct_dct_0_8bpc_lsx: 0.5 ( 6.28x) inv_txfm_add_4x4_dct_dct_1_8bpc_c: 15.4 ( 1.00x) inv_txfm_add_4x4_dct_dct_1_8bpc_lsx: 1.2 (13.19x) inv_txfm_add_4x4_dct_flipadst_0_8bpc_c: 15.0 ( 1.00x) inv_txfm_add_4x4_dct_flipadst_0_8bpc_lsx: 1.3 (11.73x) inv_txfm_add_4x4_dct_flipadst_1_8bpc_c: 15.0 ( 1.00x) inv_txfm_add_4x4_dct_flipadst_1_8bpc_lsx: 1.3 (11.72x) inv_txfm_add_4x4_dct_identity_0_8bpc_c: 13.0 ( 1.00x) inv_txfm_add_4x4_dct_identity_0_8bpc_lsx: 1.1 (12.36x) inv_txfm_add_4x4_dct_identity_1_8bpc_c: 13.0 ( 1.00x) inv_txfm_add_4x4_dct_identity_1_8bpc_lsx: 1.0 (12.36x) inv_txfm_add_4x4_flipadst_adst_0_8bpc_c: 14.2 ( 1.00x) inv_txfm_add_4x4_flipadst_adst_0_8bpc_lsx: 1.3 (11.00x) inv_txfm_add_4x4_flipadst_adst_1_8bpc_c: 14.2 ( 1.00x) inv_txfm_add_4x4_flipadst_adst_1_8bpc_lsx: 1.3 (11.03x) inv_txfm_add_4x4_flipadst_dct_0_8bpc_c: 15.0 ( 1.00x) inv_txfm_add_4x4_flipadst_dct_0_8bpc_lsx: 1.3 (11.43x) inv_txfm_add_4x4_flipadst_dct_1_8bpc_c: 15.0 ( 1.00x) inv_txfm_add_4x4_flipadst_dct_1_8bpc_lsx: 1.3 (11.44x) inv_txfm_add_4x4_flipadst_flipadst_0_8bpc_c: 14.5 ( 1.00x) inv_txfm_add_4x4_flipadst_flipadst_0_8bpc_lsx: 1.3 (11.60x) inv_txfm_add_4x4_flipadst_flipadst_1_8bpc_c: 14.5 ( 1.00x) inv_txfm_add_4x4_flipadst_flipadst_1_8bpc_lsx: 1.2 (11.61x) inv_txfm_add_4x4_flipadst_identity_0_8bpc_c: 12.5 ( 1.00x) inv_txfm_add_4x4_flipadst_identity_0_8bpc_lsx: 1.1 (11.01x) inv_txfm_add_4x4_flipadst_identity_1_8bpc_c: 12.5 ( 1.00x) inv_txfm_add_4x4_flipadst_identity_1_8bpc_lsx: 1.1 (10.99x) inv_txfm_add_4x4_identity_adst_0_8bpc_c: 12.1 ( 1.00x) inv_txfm_add_4x4_identity_adst_0_8bpc_lsx: 1.1 (11.50x) inv_txfm_add_4x4_identity_adst_1_8bpc_c: 12.1 ( 1.00x) inv_txfm_add_4x4_identity_adst_1_8bpc_lsx: 1.1 (10.98x) inv_txfm_add_4x4_identity_dct_0_8bpc_c: 12.9 ( 1.00x) inv_txfm_add_4x4_identity_dct_0_8bpc_lsx: 1.0 (12.95x) inv_txfm_add_4x4_identity_dct_1_8bpc_c: 13.0 ( 1.00x) inv_txfm_add_4x4_identity_dct_1_8bpc_lsx: 1.0 (12.97x) inv_txfm_add_4x4_identity_flipadst_0_8bpc_c: 12.4 ( 1.00x) inv_txfm_add_4x4_identity_flipadst_0_8bpc_lsx: 1.1 (11.26x) inv_txfm_add_4x4_identity_flipadst_1_8bpc_c: 12.4 ( 1.00x) inv_txfm_add_4x4_identity_flipadst_1_8bpc_lsx: 1.1 (11.32x) inv_txfm_add_4x4_identity_identity_0_8bpc_c: 10.6 ( 1.00x) inv_txfm_add_4x4_identity_identity_0_8bpc_lsx: 0.9 (11.45x) inv_txfm_add_4x4_identity_identity_1_8bpc_c: 10.6 ( 1.00x) inv_txfm_add_4x4_identity_identity_1_8bpc_lsx: 0.9 (11.78x) inv_txfm_add_4x4_wht_wht_0_8bpc_c: 4.1 ( 1.00x) inv_txfm_add_4x4_wht_wht_0_8bpc_lsx: 0.6 ( 6.84x) inv_txfm_add_4x4_wht_wht_1_8bpc_c: 4.1 ( 1.00x) inv_txfm_add_4x4_wht_wht_1_8bpc_lsx: 0.6 ( 6.83x) --- src/itx_tmpl.c | 5 + src/loongarch/itx.S | 917 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/loongarch/itx.h | 80 +++++ src/meson.build | 1 + 4 files changed, 1003 insertions(+) create mode 100644 src/loongarch/itx.S create mode 100644 src/loongarch/itx.h diff --git a/src/itx_tmpl.c b/src/itx_tmpl.c index d385989..9ecf8bf 100644 --- a/src/itx_tmpl.c +++ b/src/itx_tmpl.c @@ -183,6 +183,8 @@ static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride, #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM #include "src/arm/itx.h" +#elif ARCH_LOONGARCH64 +#include "src/loongarch/itx.h" #elif ARCH_X86 #include "src/x86/itx.h" #endif @@ -257,6 +259,9 @@ COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, 
int bpc) { #if ARCH_AARCH64 || ARCH_ARM itx_dsp_init_arm(c, bpc); #endif +#if ARCH_LOONGARCH64 + itx_dsp_init_loongarch(c, bpc); +#endif #if ARCH_X86 itx_dsp_init_x86(c, bpc); #endif diff --git a/src/loongarch/itx.S b/src/loongarch/itx.S new file mode 100644 index 0000000..006f1d4 --- /dev/null +++ b/src/loongarch/itx.S @@ -0,0 +1,917 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * Copyright © 2023, Loongson Technology Corporation Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/loongarch/loongson_asm.S" + +/* +void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrlowff_t stride, + coef *const coeff, const int eob + HIGHBD_DECL_SUFFIX) +*/ +function inv_txfm_add_wht_wht_4x4_8bpc_lsx + vld vr0, a2, 0 + vld vr2, a2, 16 + + vreplgr2vr.h vr20, zero + + vsrai.h vr0, vr0, 2 + vsrai.h vr2, vr2, 2 + + vst vr20, a2, 0 + + vpickod.d vr1, vr0, vr0 + vpickod.d vr3, vr2, vr2 + + vadd.h vr4, vr0, vr1 + vsub.h vr5, vr2, vr3 + vsub.h vr6, vr4, vr5 + vsrai.h vr6, vr6, 1 + vsub.h vr0, vr6, vr3 + vsub.h vr2, vr6, vr1 + vsub.h vr1, vr4, vr0 + vadd.h vr3, vr5, vr2 + + vst vr20, a2, 16 + + vilvl.h vr4, vr0, vr1 + vilvl.h vr5, vr3, vr2 + vilvl.w vr0, vr5, vr4 + vilvh.w vr2, vr5, vr4 + vilvh.d vr1, vr0, vr0 + vilvh.d vr3, vr2, vr2 + + vadd.h vr4, vr0, vr1 + vsub.h vr5, vr2, vr3 + vsub.h vr6, vr4, vr5 + vsrai.h vr6, vr6, 1 + vsub.h vr0, vr6, vr3 + vsub.h vr2, vr6, vr1 + vsub.h vr1, vr4, vr0 + vadd.h vr3, vr5, vr2 + + vld vr4, a0, 0 + vldx vr5, a0, a1 + alsl.d t0, a1, a0, 1 + vld vr6, t0, 0 + vldx vr7, t0, a1 + + vsllwil.hu.bu vr4, vr4, 0 + vsllwil.hu.bu vr5, vr5, 0 + vsllwil.hu.bu vr6, vr6, 0 + vsllwil.hu.bu vr7, vr7, 0 + vilvl.d vr1, vr0, vr1 + vilvl.d vr2, vr3, vr2 + vilvl.d vr4, vr5, vr4 + vilvl.d vr6, vr7, vr6 + vadd.h vr1, vr1, vr4 + vadd.h vr2, vr2, vr6 + vssrani.bu.h vr2, vr1, 0 + + vstelm.w vr2, a0, 0, 0 + add.d a0, a0, a1 + vstelm.w vr2, a0, 0, 1 + add.d a0, a0, a1 + vstelm.w vr2, a0, 0, 2 + add.d a0, a0, a1 + vstelm.w vr2, a0, 0, 3 +endfunc + +const idct_coeffs, align=4 + // idct4 + .word 2896, 2896*8, 1567, 3784 + // idct8 + .word 799, 4017, 3406, 2276 + // idct16 + .word 401, 4076, 3166, 2598 + .word 1931, 3612, 3920, 1189 + // idct32 + .word 201, 4091, 3035, 2751 + .word 1751, 3703, 3857, 1380 + .word 995, 3973, 3513, 2106 + .word 2440, 3290, 4052, 601 +endconst + +.macro vld_x8 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7 + vld \in0, \src, \start + vld \in1, \src, \start+(\stride*1) + vld \in2, \src, \start+(\stride*2) + vld \in3, \src, \start+(\stride*3) + vld \in4, \src, \start+(\stride*4) + vld \in5, \src, \start+(\stride*5) + vld \in6, \src, \start+(\stride*6) + vld \in7, \src, \start+(\stride*7) +.endm + +.macro vst_x8 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7 + vst \in0, \src, \start + vst \in1, \src, \start+(\stride*1) + vst \in2, \src, \start+(\stride*2) + vst \in3, \src, \start+(\stride*3) + vst \in4, \src, \start+(\stride*4) + vst \in5, \src, \start+(\stride*5) + vst \in6, \src, \start+(\stride*6) + vst \in7, \src, \start+(\stride*7) +.endm + +.macro vld_x16 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7, \ + in8, in9, in10, in11, in12, in13, in14, in15 + + vld_x8 \src, \start, \stride, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7 + + vld \in8, \src, \start+(\stride*8) + vld \in9, \src, \start+(\stride*9) + vld \in10, \src, \start+(\stride*10) + vld \in11, \src, \start+(\stride*11) + vld \in12, \src, \start+(\stride*12) + vld \in13, \src, \start+(\stride*13) + vld \in14, \src, \start+(\stride*14) + vld \in15, \src, \start+(\stride*15) +.endm + +.macro vst_x16 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7, \ + in8, in9, in10, in11, in12, in13, in14, in15 + + vst_x8 \src, \start, \stride, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7 + + vst \in8, \src, \start+(\stride*8) + vst \in9, \src, \start+(\stride*9) + vst \in10, \src, \start+(\stride*10) + vst \in11, \src, \start+(\stride*11) + vst \in12, \src, \start+(\stride*12) + vst \in13, \src, \start+(\stride*13) + vst \in14, \src, 
\start+(\stride*14) + vst \in15, \src, \start+(\stride*15) +.endm + +.macro DST_ADD_W4 in0, in1, in2, in3, in4, in5 + vilvl.w vr10, \in1, \in0 // 0 1 2 3 4 5 6 7 x ... + vilvl.w vr12, \in3, \in2 // 8 9 10 11 12 13 14 15 x ... + vsllwil.hu.bu vr10, vr10, 0 + vsllwil.hu.bu vr12, vr12, 0 + vadd.h vr10, \in4, vr10 + vadd.h vr12, \in5, vr12 + vssrani.bu.h vr12, vr10, 0 + vstelm.w vr12, a0, 0, 0 + add.d t8, a0, a1 + vstelm.w vr12, t8, 0, 1 + vstelm.w vr12, t2, 0, 2 + add.d t8, t2, a1 + vstelm.w vr12, t8, 0, 3 +.endm + +.macro VLD_DST_ADD_W4 in0, in1 + vld vr0, a0, 0 + vldx vr1, a0, a1 + vld vr2, t2, 0 + vldx vr3, t2, a1 + + DST_ADD_W4 vr0, vr1, vr2, vr3, \in0, \in1 +.endm + +.macro dct_4x4_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, out0, out1 + vexth.w.h vr4, \in0 // in1 + vexth.w.h vr5, \in1 // in3 + vmul.w vr6, vr4, \in4 + vmul.w vr7, vr4, \in5 + vmadd.w vr6, vr5, \in5 // t3 + vmsub.w vr7, vr5, \in4 // t2 + vsllwil.w.h vr4, \in2, 0 // in0 + vsllwil.w.h vr5, \in3, 0 // in2 + vmul.w vr9, vr4, \in6 + vmul.w vr10, vr4, \in7 + vmadd.w vr9, vr5, \in7 // t0 + vmsub.w vr10, vr5, \in6 // t1 + vssrarni.h.w vr10, vr9, 12 // t0 t1 + vssrarni.h.w vr7, vr6, 12 // t3 t2 + vsadd.h \out0, vr10, vr7 // 0 4 8 12 1 5 9 13 c[0] c[1] + vssub.h \out1, vr10, vr7 // 3 7 11 15 2 6 10 14 c[3] c[2] +.endm + +.macro inv_dct_dct_4x4_lsx + la.local t0, idct_coeffs + + vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 + vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 + + vldrepl.w vr2, t0, 8 // 1567 + vldrepl.w vr3, t0, 12 // 3784 + vldrepl.w vr8, t0, 0 // 2896 + + dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr3, vr2, vr8, vr8, vr11, vr12 + + vreplgr2vr.h vr15, zero + vshuf4i.d vr12, vr12, 0x01 // 2 6 10 14 3 7 11 15 + vst vr15, a2, 0 + vst vr15, a2, 16 + + vilvl.h vr4, vr12, vr11 // 0 2 4 6 8 10 12 14 + vilvh.h vr5, vr12, vr11 // 1 3 5 7 9 11 13 15 + vilvl.h vr0, vr5, vr4 // 0 1 2 3 4 5 6 7 + vilvh.h vr1, vr5, vr4 // 8 9 10 11 12 13 14 15 + + dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr3, vr2, vr8, vr8, vr13, vr14 + vsrari.h vr13, vr13, 4 + vsrari.h vr14, vr14, 4 + vshuf4i.d vr14, vr14, 0x01 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W4 vr13, vr14 +.endm + +.macro identity_4x4_lsx in0, in1, in2, in3, out0 + vsllwil.w.h vr2, \in0, 0 + vexth.w.h vr3, \in1 + vmul.w vr4, vr2, \in2 + vmul.w vr5, vr3, \in2 + vssrarni.h.w vr5, vr4, 12 + vsadd.h \out0, vr5, \in3 +.endm + +.macro inv_identity_identity_4x4_lsx + vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 + vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 + + li.w t0, 1697 + vreplgr2vr.w vr20, t0 + + identity_4x4_lsx vr0, vr0, vr20, vr0, vr0 + identity_4x4_lsx vr1, vr1, vr20, vr1, vr1 + vreplgr2vr.h vr15, zero + vst vr15, a2, 0 + vst vr15, a2, 16 + identity_4x4_lsx vr0, vr0, vr20, vr0, vr6 + identity_4x4_lsx vr1, vr1, vr20, vr1, vr7 + + vsrari.h vr6, vr6, 4 + vsrari.h vr7, vr7, 4 + vilvh.d vr8, vr6, vr6 + vilvh.d vr9, vr7, vr7 + vilvl.h vr4, vr8, vr6 + vilvl.h vr5, vr9, vr7 + vilvl.w vr6, vr5, vr4 + vilvh.w vr7, vr5, vr4 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W4 vr6, vr7 +.endm + +const iadst4_coeffs, align=4 + .word 1321, 3803, 2482, 3344 +endconst + +.macro adst4x4_1d_lsx in0, in1, in2, in3, out0, out1, out2, out3 + vsub.w vr6, \in0, \in2 // in0-in2 + vmul.w vr7, \in0, vr20 // in0*1321 + vmadd.w vr7, \in2, vr21 // in0*1321+in2*3803 + vmadd.w vr7, \in3, vr22 // in0*1321+in2*3803+in3*2482 + vmul.w vr8, \in1, vr23 // in1*3344 + vadd.w vr6, vr6, \in3 // in0-in2+in3 + vmul.w vr9, \in0, vr22 // in0*2482 + vmsub.w vr9, \in2, vr20 // in2*1321 + vmsub.w vr9, \in3, vr21 // in0*2482-in2*1321-in3*3803 + vadd.w vr5, vr7, vr9 + vmul.w \out2, 
vr6, vr23 // out[2] 8 9 10 11 + vadd.w \out0, vr7, vr8 // out[0] 0 1 2 3 + vadd.w \out1, vr9, vr8 // out[1] 4 5 6 7 + vsub.w \out3, vr5, vr8 // out[3] 12 13 14 15 +.endm + +.macro inv_adst_dct_4x4_lsx + vld vr0, a2, 0 + vld vr1, a2, 16 + + la.local t0, iadst4_coeffs + vsllwil.w.h vr2, vr0, 0 // in0 + vexth.w.h vr3, vr0 // in1 + vsllwil.w.h vr4, vr1, 0 // in2 + vexth.w.h vr5, vr1 // in3 + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3 + + LSX_TRANSPOSE4x4_W vr0, vr1, vr2, vr3, vr11, vr13, vr12, vr14, vr6, vr7 + vssrarni.h.w vr13, vr11, 12 + vssrarni.h.w vr14, vr12, 12 + + vreplgr2vr.h vr15, zero + la.local t0, idct_coeffs + vst vr15, a2, 0 + vst vr15, a2, 16 + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_4x4_core_lsx vr13, vr14, vr13, vr14, vr21, vr20, vr22, vr22, vr13, vr14 + + vshuf4i.d vr14, vr14, 0x01 + vsrari.h vr13, vr13, 4 + vsrari.h vr14, vr14, 4 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W4 vr13, vr14 +.endm + +.macro inv_adst_adst_4x4_lsx + vld vr0, a2, 0 + vld vr1, a2, 16 + + la.local t0, iadst4_coeffs + vsllwil.w.h vr2, vr0, 0 // in0 + vexth.w.h vr3, vr0 // in1 + vsllwil.w.h vr4, vr1, 0 // in2 + vexth.w.h vr5, vr1 // in3 + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3 + + LSX_TRANSPOSE4x4_W vr0, vr1, vr2, vr3, vr11, vr13, vr12, vr14, vr6, vr7 + + vsrari.w vr11, vr11, 12 + vsrari.w vr13, vr13, 12 + vsrari.w vr12, vr12, 12 + vsrari.w vr14, vr14, 12 + + vreplgr2vr.h vr15, zero + vst vr15, a2, 0 + vst vr15, a2, 16 + + adst4x4_1d_lsx vr11, vr13, vr12, vr14, vr11, vr13, vr12, vr14 + + vssrarni.h.w vr13, vr11, 12 + vssrarni.h.w vr14, vr12, 12 + vsrari.h vr13, vr13, 4 + vsrari.h vr14, vr14, 4 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W4 vr13, vr14 +.endm + +.macro inv_dct_adst_4x4_lsx + la.local t0, idct_coeffs + + vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 + vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr21, vr20, vr22, vr22, vr11, vr12 + + vreplgr2vr.h vr15, zero + vst vr15, a2, 0 + vst vr15, a2, 16 + + vshuf4i.d vr12, vr12, 0x01 // 3 7 11 15 2 6 10 14 + + vilvl.h vr4, vr12, vr11 // 0 2 4 6 8 10 12 14 + vilvh.h vr5, vr12, vr11 // 1 3 5 7 9 11 13 15 + vilvl.h vr11, vr5, vr4 // 0 1 2 3 4 5 6 7 + vilvh.h vr12, vr5, vr4 // 8 9 10 11 12 13 14 15 + + vsllwil.w.h vr2, vr11, 0 // in0 + vexth.w.h vr3, vr11 // in1 + vsllwil.w.h vr4, vr12, 0 // in2 + vexth.w.h vr5, vr12 // in3 + + la.local t0, iadst4_coeffs + + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr11, vr13, vr12, vr14 + + vssrarni.h.w vr13, vr11, 12 + vssrarni.h.w vr14, vr12, 12 + vsrari.h vr13, vr13, 4 + vsrari.h vr14, vr14, 4 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W4 vr13, vr14 +.endm + +.macro inv_dct_flipadst_4x4_lsx + la.local t0, idct_coeffs + + vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 + vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr21, vr20, vr22, vr22, vr11, vr12 + + vreplgr2vr.h vr15, zero + vst vr15, a2, 0 + vst vr15, a2, 16 + + 
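+/* Illustrative only: a scalar C sketch of what the adst4x4_1d_lsx macro used
+ * in the second pass below computes per column. The names here are made up
+ * for this comment, and the >> 12 rounding/narrowing is applied separately
+ * by the callers via vssrarni.h.w:
+ *
+ *   static void iadst4_1d_sketch(const int in[4], int out[4]) {
+ *       const int x0 = 1321 * in[0] + 3803 * in[2] + 2482 * in[3];
+ *       const int x1 = 2482 * in[0] - 1321 * in[2] - 3803 * in[3];
+ *       const int x2 = 3344 * in[1];
+ *       out[0] = x0 + x2;                        // c[0]
+ *       out[1] = x1 + x2;                        // c[1]
+ *       out[2] = 3344 * (in[0] - in[2] + in[3]); // c[2]
+ *       out[3] = x0 + x1 - x2;                   // c[3]
+ *   }
+ */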
vshuf4i.d vr12, vr12, 0x01 // 3 7 11 15 2 6 10 14 + + vilvl.h vr4, vr12, vr11 // 0 2 4 6 8 10 12 14 + vilvh.h vr5, vr12, vr11 // 1 3 5 7 9 11 13 15 + vilvl.h vr11, vr5, vr4 // 0 1 2 3 4 5 6 7 + vilvh.h vr12, vr5, vr4 // 8 9 10 11 12 13 14 15 + vsllwil.w.h vr2, vr11, 0 // in0 + vexth.w.h vr3, vr11 // in1 + vsllwil.w.h vr4, vr12, 0 // in2 + vexth.w.h vr5, vr12 // in3 + + la.local t0, iadst4_coeffs + + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr11, vr12, vr13, vr14 + + vssrarni.h.w vr11, vr12, 12 // 0 1 2 3 4 5 6 7 + vssrarni.h.w vr13, vr14, 12 // 8 9 10 11 12 13 14 15 + vsrari.h vr11, vr11, 4 + vsrari.h vr13, vr13, 4 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W4 vr13, vr11 +.endm + +.macro inv_flipadst_adst_4x4_lsx + vld vr0, a2, 0 + vld vr1, a2, 16 + + la.local t0, iadst4_coeffs + vsllwil.w.h vr2, vr0, 0 // in0 + vexth.w.h vr3, vr0 // in1 + vsllwil.w.h vr4, vr1, 0 // in2 + vexth.w.h vr5, vr1 // in3 + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3 + + vsrari.w vr0, vr0, 12 + vsrari.w vr1, vr1, 12 + vsrari.w vr2, vr2, 12 + vsrari.w vr3, vr3, 12 + + vilvl.w vr4, vr0, vr1 + vilvh.w vr5, vr0, vr1 + vilvl.w vr6, vr2, vr3 + vilvh.w vr7, vr2, vr3 + vilvl.d vr11, vr4, vr6 + vilvh.d vr12, vr4, vr6 + vilvl.d vr13, vr5, vr7 + vilvh.d vr14, vr5, vr7 + + vreplgr2vr.h vr15, zero + vst vr15, a2, 0 + vst vr15, a2, 16 + + adst4x4_1d_lsx vr11, vr12, vr13, vr14, vr11, vr13, vr12, vr14 + + vssrarni.h.w vr13, vr11, 12 + vssrarni.h.w vr14, vr12, 12 + vsrari.h vr13, vr13, 4 + vsrari.h vr14, vr14, 4 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W4 vr13, vr14 +.endm + +.macro inv_adst_flipadst_4x4_lsx + vld vr0, a2, 0 + vld vr1, a2, 16 + + la.local t0, iadst4_coeffs + vsllwil.w.h vr2, vr0, 0 // in0 + vexth.w.h vr3, vr0 // in1 + vsllwil.w.h vr4, vr1, 0 // in2 + vexth.w.h vr5, vr1 // in3 + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3 + LSX_TRANSPOSE4x4_W vr0, vr1, vr2, vr3, vr11, vr13, vr12, vr14, vr6, vr7 + vsrari.w vr11, vr11, 12 + vsrari.w vr12, vr12, 12 + vsrari.w vr13, vr13, 12 + vsrari.w vr14, vr14, 12 + + vreplgr2vr.h vr15, zero + vst vr15, a2, 0 + vst vr15, a2, 16 + + adst4x4_1d_lsx vr11, vr13, vr12, vr14, vr11, vr12, vr13, vr14 + + vssrarni.h.w vr11, vr12, 12 + vssrarni.h.w vr13, vr14, 12 + vsrari.h vr11, vr11, 4 + vsrari.h vr13, vr13, 4 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W4 vr13, vr11 +.endm + +.macro inv_flipadst_dct_4x4_lsx + vld vr0, a2, 0 + vld vr1, a2, 16 + + la.local t0, iadst4_coeffs + vsllwil.w.h vr2, vr0, 0 // in0 + vexth.w.h vr3, vr0 // in1 + vsllwil.w.h vr4, vr1, 0 // in2 + vexth.w.h vr5, vr1 // in3 + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3 + + vilvl.w vr4, vr0, vr1 + vilvh.w vr5, vr0, vr1 + vilvl.w vr6, vr2, vr3 + vilvh.w vr7, vr2, vr3 + + vilvl.d vr11, vr4, vr6 + vilvh.d vr12, vr4, vr6 + vilvl.d vr13, vr5, vr7 + vilvh.d vr14, vr5, vr7 + + vssrarni.h.w vr12, vr11, 12 + vssrarni.h.w vr14, vr13, 12 + + vreplgr2vr.h vr15, zero + la.local t0, idct_coeffs + vst vr15, a2, 0 + vst vr15, a2, 16 + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w 
vr22, t0, 0 // 2896 + + dct_4x4_core_lsx vr12, vr14, vr12, vr14, vr21, vr20, vr22, vr22, vr13, vr14 + + vshuf4i.d vr14, vr14, 0x01 + vsrari.h vr13, vr13, 4 + vsrari.h vr14, vr14, 4 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W4 vr13, vr14 +.endm + +.macro inv_flipadst_flipadst_4x4_lsx + vld vr0, a2, 0 + vld vr1, a2, 16 + + la.local t0, iadst4_coeffs + vsllwil.w.h vr2, vr0, 0 // in0 + vexth.w.h vr3, vr0 // in1 + vsllwil.w.h vr4, vr1, 0 // in2 + vexth.w.h vr5, vr1 // in3 + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3 + + vilvl.w vr4, vr0, vr1 + vilvh.w vr5, vr0, vr1 + vilvl.w vr6, vr2, vr3 + vilvh.w vr7, vr2, vr3 + vilvl.d vr11, vr4, vr6 + vilvh.d vr12, vr4, vr6 + vilvl.d vr13, vr5, vr7 + vilvh.d vr14, vr5, vr7 + + vsrari.w vr11, vr11, 12 + vsrari.w vr12, vr12, 12 + vsrari.w vr13, vr13, 12 + vsrari.w vr14, vr14, 12 + + vreplgr2vr.h vr15, zero + vst vr15, a2, 0 + vst vr15, a2, 16 + + adst4x4_1d_lsx vr11, vr12, vr13, vr14, vr11, vr12, vr13, vr14 + + vssrarni.h.w vr11, vr12, 12 + vssrarni.h.w vr13, vr14, 12 + vsrari.h vr11, vr11, 4 + vsrari.h vr13, vr13, 4 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W4 vr13, vr11 +.endm + +.macro inv_dct_identity_4x4_lsx + la.local t0, idct_coeffs + + vld vr0, a2, 0 + vld vr1, a2, 16 + + vldrepl.w vr2, t0, 8 // 1567 + vldrepl.w vr3, t0, 12 // 3784 + vldrepl.w vr8, t0, 0 // 2896 + + dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr3, vr2, vr8, vr8, vr11, vr12 + vshuf4i.d vr12, vr12, 0x01 // 2 6 10 14 3 7 11 15 + + vreplgr2vr.h vr15, zero + li.w t0, 1697 + + vilvl.h vr4, vr12, vr11 // 0 2 4 6 8 10 12 14 + vilvh.h vr5, vr12, vr11 // 1 3 5 7 9 11 13 15 + vilvl.h vr10, vr5, vr4 // 0 1 2 3 4 5 6 7 + vilvh.h vr12, vr5, vr4 // 8 9 10 11 12 13 14 15 + + vst vr15, a2, 0 + vst vr15, a2, 16 + vreplgr2vr.w vr20, t0 + + identity_4x4_lsx vr10, vr10, vr20, vr10, vr6 + identity_4x4_lsx vr12, vr12, vr20, vr12, vr7 + vsrari.h vr11, vr6, 4 + vsrari.h vr13, vr7, 4 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W4 vr11, vr13 +.endm + +.macro inv_identity_dct_4x4_lsx + vld vr0, a2, 0 + vld vr1, a2, 16 + + li.w t0, 1697 + vreplgr2vr.w vr20, t0 + + identity_4x4_lsx vr0, vr0, vr20, vr0, vr0 + identity_4x4_lsx vr1, vr1, vr20, vr1, vr1 + + vreplgr2vr.h vr15, zero + + vilvl.h vr4, vr1, vr0 // 0 2 4 6 8 10 12 14 + vilvh.h vr5, vr1, vr0 // 1 3 5 7 9 11 13 15 + vilvl.h vr13, vr5, vr4 // 0 1 2 3 4 5 6 7 + vilvh.h vr14, vr5, vr4 // 8 9 10 11 12 13 14 15 + + vst vr15, a2, 0 + vst vr15, a2, 16 + + la.local t0, idct_coeffs + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_4x4_core_lsx vr13, vr14, vr13, vr14, vr21, vr20, vr22, vr22, vr13, vr14 + + vshuf4i.d vr14, vr14, 0x01 + vsrari.h vr13, vr13, 4 + vsrari.h vr14, vr14, 4 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W4 vr13, vr14 +.endm + +.macro inv_flipadst_identity_4x4_lsx + vld vr0, a2, 0 + vld vr1, a2, 16 + + la.local t0, iadst4_coeffs + vsllwil.w.h vr2, vr0, 0 // in0 + vexth.w.h vr3, vr0 // in1 + vsllwil.w.h vr4, vr1, 0 // in2 + vexth.w.h vr5, vr1 // in3 + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr10, vr11, vr12, vr13 + + vssrarni.h.w vr12, vr13, 12 + vssrarni.h.w vr10, vr11, 12 + + vilvl.h vr4, vr10, vr12 // 0 2 4 6 8 10 12 14 + vilvh.h vr5, vr10, vr12 // 1 3 5 7 9 11 13 15 + vilvl.h vr11, vr5, vr4 // 0 1 2 3 4 5 6 7 + vilvh.h vr13, vr5, vr4 // 8 9 10 11 12 
13 14 15 + + vreplgr2vr.h vr15, zero + li.w t0, 1697 + + vst vr15, a2, 0 + vst vr15, a2, 16 + vreplgr2vr.w vr20, t0 + + identity_4x4_lsx vr11, vr11, vr20, vr11, vr6 + identity_4x4_lsx vr13, vr13, vr20, vr13, vr7 + vsrari.h vr11, vr6, 4 + vsrari.h vr13, vr7, 4 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W4 vr11, vr13 +.endm + +.macro inv_identity_flipadst_4x4_lsx + vld vr0, a2, 0 + vld vr1, a2, 16 + + li.w t0, 1697 + vreplgr2vr.w vr20, t0 + + identity_4x4_lsx vr0, vr0, vr20, vr0, vr0 + identity_4x4_lsx vr1, vr1, vr20, vr1, vr1 + + vilvl.h vr4, vr1, vr0 + vilvh.h vr5, vr1, vr0 + vilvl.h vr11, vr5, vr4 + vilvh.h vr13, vr5, vr4 + + vreplgr2vr.h vr15, zero + vst vr15, a2, 0 + vst vr15, a2, 16 + + la.local t0, iadst4_coeffs + vsllwil.w.h vr2, vr11, 0 // in0 + vexth.w.h vr3, vr11 // in1 + vsllwil.w.h vr4, vr13, 0 // in2 + vexth.w.h vr5, vr13 // in3 + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3 + + vssrarni.h.w vr0, vr1, 12 // 8 9 10 11 12 13 14 15 + vssrarni.h.w vr2, vr3, 12 // 0 1 2 3 4 5 6 7 + vsrari.h vr11, vr0, 4 + vsrari.h vr13, vr2, 4 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W4 vr13, vr11 +.endm + +.macro inv_identity_adst_4x4_lsx + vld vr0, a2, 0 + vld vr1, a2, 16 + + li.w t0, 1697 + vreplgr2vr.w vr20, t0 + + identity_4x4_lsx vr0, vr0, vr20, vr0, vr0 + identity_4x4_lsx vr1, vr1, vr20, vr1, vr1 + + vilvl.h vr4, vr1, vr0 + vilvh.h vr5, vr1, vr0 + vilvl.h vr11, vr5, vr4 + vilvh.h vr13, vr5, vr4 + + vreplgr2vr.h vr15, zero + vst vr15, a2, 0 + vst vr15, a2, 16 + + la.local t0, iadst4_coeffs + vsllwil.w.h vr2, vr11, 0 // in0 + vexth.w.h vr3, vr11 // in1 + vsllwil.w.h vr4, vr13, 0 // in2 + vexth.w.h vr5, vr13 // in3 + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3 + + vssrarni.h.w vr1, vr0, 12 + vssrarni.h.w vr3, vr2, 12 + vsrari.h vr11, vr1, 4 + vsrari.h vr13, vr3, 4 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W4 vr11, vr13 +.endm + +.macro inv_adst_identity_4x4_lsx + vld vr0, a2, 0 + vld vr1, a2, 16 + + la.local t0, iadst4_coeffs + vsllwil.w.h vr2, vr0, 0 // in0 + vexth.w.h vr3, vr0 // in1 + vsllwil.w.h vr4, vr1, 0 // in2 + vexth.w.h vr5, vr1 // in3 + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3 + + LSX_TRANSPOSE4x4_W vr0, vr1, vr2, vr3, vr11, vr13, vr12, vr14, vr6, vr7 + + vssrarni.h.w vr13, vr11, 12 + vssrarni.h.w vr14, vr12, 12 + + vreplgr2vr.h vr15, zero + li.w t0, 1697 + + vst vr15, a2, 0 + vst vr15, a2, 16 + vreplgr2vr.w vr20, t0 + + identity_4x4_lsx vr13, vr13, vr20, vr13, vr6 + identity_4x4_lsx vr14, vr14, vr20, vr14, vr7 + vsrari.h vr11, vr6, 4 + vsrari.h vr13, vr7, 4 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W4 vr11, vr13 +.endm + +.macro fun4x4 type1, type2 +function inv_txfm_add_\type1\()_\type2\()_4x4_8bpc_lsx +.ifc \type1\()_\type2, dct_dct + bnez a3, .LLL + + vldi vr0, 0x8b5 // 181 + ld.h t2, a2, 0 // dc + st.h zero, a2, 0 + vreplgr2vr.w vr1, t2 + vldi vr3, 0x880 // 128 + vmul.w vr2, vr0, vr1 + vld vr10, a0, 0 + vsrari.w vr2, vr2, 8 + vldx vr11, a0, a1 + vmadd.w vr3, vr2, vr0 + alsl.d t2, a1, a0, 1 + vssrarni.h.w vr3, vr3, 12 + vld vr12, t2, 0 + vldx vr13, t2, a1 + + DST_ADD_W4 vr10, vr11, vr12, vr13, vr3, vr3 + + b .IDST_\type1\()_\type2\()_4X4_END +.LLL: +.endif + + 
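+    // fun4x4 expands to one exported entry point per (type1, type2) pair,
+    // e.g. inv_txfm_add_dct_adst_4x4_8bpc_lsx. Only the dct_dct variant has
+    // the dc-only fast path above (taken when the eob argument in a3 is
+    // zero); every other combination falls straight through to the matching
+    // inv_<type1>_<type2>_4x4_lsx macro invoked here.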
inv_\type1\()_\type2\()_4x4_lsx +.IDST_\type1\()_\type2\()_4X4_END: +endfunc +.endm + +fun4x4 dct, dct +fun4x4 identity, identity +fun4x4 adst, dct +fun4x4 dct, adst +fun4x4 adst, adst +fun4x4 dct, flipadst +fun4x4 flipadst, adst +fun4x4 adst, flipadst +fun4x4 flipadst, dct +fun4x4 flipadst, flipadst +fun4x4 dct, identity +fun4x4 identity, dct +fun4x4 flipadst, identity +fun4x4 identity, flipadst +fun4x4 identity, adst +fun4x4 adst, identity diff --git a/src/loongarch/itx.h b/src/loongarch/itx.h new file mode 100644 index 0000000..de42e00 --- /dev/null +++ b/src/loongarch/itx.h @@ -0,0 +1,80 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * Copyright © 2023, Loongson Technology Corporation Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef DAV1D_SRC_LOONGARCH_ITX_H +#define DAV1D_SRC_LOONGARCH_ITX_H + +#include "src/cpu.h" +#include "src/itx.h" + +decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_4x4, lsx)); + +static ALWAYS_INLINE void itx_dsp_init_loongarch(Dav1dInvTxfmDSPContext *const c, int bpc) { +#if BITDEPTH == 8 + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return; + + if (BITDEPTH != 8 ) return; + + c->itxfm_add[TX_4X4][WHT_WHT] = dav1d_inv_txfm_add_wht_wht_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][IDTX] = dav1d_inv_txfm_add_identity_identity_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][ADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_adst_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][FLIPADST_ADST] = dav1d_inv_txfm_add_adst_flipadst_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][FLIPADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_flipadst_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][H_DCT] = dav1d_inv_txfm_add_dct_identity_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][V_DCT] = dav1d_inv_txfm_add_identity_dct_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][H_FLIPADST] = dav1d_inv_txfm_add_flipadst_identity_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][V_FLIPADST] = dav1d_inv_txfm_add_identity_flipadst_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][V_ADST] = dav1d_inv_txfm_add_identity_adst_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][H_ADST] = dav1d_inv_txfm_add_adst_identity_4x4_8bpc_lsx; +#endif +} + +#endif /* DAV1D_SRC_LOONGARCH_ITX_H */ diff --git a/src/meson.build b/src/meson.build index f07348b..f443c05 100644 --- a/src/meson.build +++ b/src/meson.build @@ -250,6 +250,7 @@ if is_asm_enabled 'loongarch/looprestoration.S', 'loongarch/msac.S', 'loongarch/refmvs.S', + 'loongarch/itx.S', ) libdav1d_asm_objs += libdav1d_sources_asm endif -- cgit v1.2.3 From 951646ce567dc8ce045243db6f201f3cb15df171 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Fri, 1 Dec 2023 15:16:46 +0800 Subject: loongarch: Improve the performance of itx_8bpc.add_8x4 series functions Relative speedup over C code: inv_txfm_add_8x4_adst_adst_0_8bpc_c: 32.0 ( 1.00x) inv_txfm_add_8x4_adst_adst_0_8bpc_lsx: 4.1 ( 7.87x) inv_txfm_add_8x4_adst_adst_1_8bpc_c: 32.3 ( 
1.00x) inv_txfm_add_8x4_adst_adst_1_8bpc_lsx: 4.1 ( 7.92x) inv_txfm_add_8x4_adst_dct_0_8bpc_c: 33.7 ( 1.00x) inv_txfm_add_8x4_adst_dct_0_8bpc_lsx: 3.8 ( 8.77x) inv_txfm_add_8x4_adst_dct_1_8bpc_c: 33.1 ( 1.00x) inv_txfm_add_8x4_adst_dct_1_8bpc_lsx: 3.8 ( 8.63x) inv_txfm_add_8x4_adst_flipadst_0_8bpc_c: 32.7 ( 1.00x) inv_txfm_add_8x4_adst_flipadst_0_8bpc_lsx: 4.1 ( 7.99x) inv_txfm_add_8x4_adst_flipadst_1_8bpc_c: 32.8 ( 1.00x) inv_txfm_add_8x4_adst_flipadst_1_8bpc_lsx: 4.0 ( 8.16x) inv_txfm_add_8x4_adst_identity_0_8bpc_c: 31.2 ( 1.00x) inv_txfm_add_8x4_adst_identity_0_8bpc_lsx: 3.8 ( 8.29x) inv_txfm_add_8x4_adst_identity_1_8bpc_c: 28.7 ( 1.00x) inv_txfm_add_8x4_adst_identity_1_8bpc_lsx: 3.7 ( 7.78x) inv_txfm_add_8x4_dct_adst_0_8bpc_c: 32.0 ( 1.00x) inv_txfm_add_8x4_dct_adst_0_8bpc_lsx: 3.0 (10.76x) inv_txfm_add_8x4_dct_adst_1_8bpc_c: 31.5 ( 1.00x) inv_txfm_add_8x4_dct_adst_1_8bpc_lsx: 2.8 (11.46x) inv_txfm_add_8x4_dct_dct_0_8bpc_c: 5.5 ( 1.00x) inv_txfm_add_8x4_dct_dct_0_8bpc_lsx: 0.6 ( 9.22x) inv_txfm_add_8x4_dct_dct_1_8bpc_c: 33.1 ( 1.00x) inv_txfm_add_8x4_dct_dct_1_8bpc_lsx: 2.8 (11.89x) inv_txfm_add_8x4_dct_flipadst_0_8bpc_c: 32.4 ( 1.00x) inv_txfm_add_8x4_dct_flipadst_0_8bpc_lsx: 3.0 (10.81x) inv_txfm_add_8x4_dct_flipadst_1_8bpc_c: 32.4 ( 1.00x) inv_txfm_add_8x4_dct_flipadst_1_8bpc_lsx: 3.0 (10.81x) inv_txfm_add_8x4_dct_identity_0_8bpc_c: 27.9 ( 1.00x) inv_txfm_add_8x4_dct_identity_0_8bpc_lsx: 2.7 (10.35x) inv_txfm_add_8x4_dct_identity_1_8bpc_c: 28.5 ( 1.00x) inv_txfm_add_8x4_dct_identity_1_8bpc_lsx: 2.7 (10.53x) inv_txfm_add_8x4_flipadst_adst_0_8bpc_c: 32.2 ( 1.00x) inv_txfm_add_8x4_flipadst_adst_0_8bpc_lsx: 4.1 ( 7.86x) inv_txfm_add_8x4_flipadst_adst_1_8bpc_c: 32.2 ( 1.00x) inv_txfm_add_8x4_flipadst_adst_1_8bpc_lsx: 4.0 ( 7.95x) inv_txfm_add_8x4_flipadst_dct_0_8bpc_c: 33.6 ( 1.00x) inv_txfm_add_8x4_flipadst_dct_0_8bpc_lsx: 3.8 ( 8.73x) inv_txfm_add_8x4_flipadst_dct_1_8bpc_c: 33.6 ( 1.00x) inv_txfm_add_8x4_flipadst_dct_1_8bpc_lsx: 3.8 ( 8.74x) inv_txfm_add_8x4_flipadst_flipadst_0_8bpc_c: 32.6 ( 1.00x) inv_txfm_add_8x4_flipadst_flipadst_0_8bpc_lsx: 4.0 ( 8.16x) inv_txfm_add_8x4_flipadst_flipadst_1_8bpc_c: 32.6 ( 1.00x) inv_txfm_add_8x4_flipadst_flipadst_1_8bpc_lsx: 4.0 ( 8.15x) inv_txfm_add_8x4_flipadst_identity_0_8bpc_c: 28.7 ( 1.00x) inv_txfm_add_8x4_flipadst_identity_0_8bpc_lsx: 3.8 ( 7.64x) inv_txfm_add_8x4_flipadst_identity_1_8bpc_c: 28.7 ( 1.00x) inv_txfm_add_8x4_flipadst_identity_1_8bpc_lsx: 3.8 ( 7.55x) inv_txfm_add_8x4_identity_adst_0_8bpc_c: 21.9 ( 1.00x) inv_txfm_add_8x4_identity_adst_0_8bpc_lsx: 1.9 (11.81x) inv_txfm_add_8x4_identity_adst_1_8bpc_c: 26.9 ( 1.00x) inv_txfm_add_8x4_identity_adst_1_8bpc_lsx: 1.9 (14.39x) inv_txfm_add_8x4_identity_dct_0_8bpc_c: 23.3 ( 1.00x) inv_txfm_add_8x4_identity_dct_0_8bpc_lsx: 1.7 (13.53x) inv_txfm_add_8x4_identity_dct_1_8bpc_c: 23.3 ( 1.00x) inv_txfm_add_8x4_identity_dct_1_8bpc_lsx: 1.7 (13.53x) inv_txfm_add_8x4_identity_flipadst_0_8bpc_c: 22.3 ( 1.00x) inv_txfm_add_8x4_identity_flipadst_0_8bpc_lsx: 1.9 (11.46x) inv_txfm_add_8x4_identity_flipadst_1_8bpc_c: 23.4 ( 1.00x) inv_txfm_add_8x4_identity_flipadst_1_8bpc_lsx: 1.9 (12.02x) inv_txfm_add_8x4_identity_identity_0_8bpc_c: 18.5 ( 1.00x) inv_txfm_add_8x4_identity_identity_0_8bpc_lsx: 1.6 (11.23x) inv_txfm_add_8x4_identity_identity_1_8bpc_c: 18.5 ( 1.00x) inv_txfm_add_8x4_identity_identity_1_8bpc_lsx: 1.6 (11.57x) --- src/loongarch/itx.S | 1253 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/loongarch/itx.h | 35 ++ 2 files changed, 1288 insertions(+) diff --git a/src/loongarch/itx.S 
b/src/loongarch/itx.S index 006f1d4..4ea353e 100644 --- a/src/loongarch/itx.S +++ b/src/loongarch/itx.S @@ -915,3 +915,1256 @@ fun4x4 flipadst, identity fun4x4 identity, flipadst fun4x4 identity, adst fun4x4 adst, identity + +.macro rect2_w4_lsx in0, in1, in2, out0, out1 + vsllwil.w.h vr22, \in0, 0 + vexth.w.h vr23, \in1 + vmul.w vr22, vr22, \in2 + vmul.w vr23, vr23, \in2 + vsrari.w \out0, vr22, 12 + vsrari.w \out1, vr23, 12 +.endm + +.macro dct_8x4_core_lsx1 out0, out1, out2, out3 + // dct4 stride=1<<1 + vmul.w vr0, vr6, vr21 + vmul.w vr1, vr6, vr20 + vmadd.w vr0, vr10, vr20 // t3 + vmsub.w vr1, vr10, vr21 // t2 + vmul.w vr2, vr18, vr22 + vmul.w vr3, vr18, vr22 + vmadd.w vr2, vr8, vr22 // t0 + vmsub.w vr3, vr8, vr22 // t1 + vssrarni.h.w vr1, vr0, 12 // t3 t2 + vssrarni.h.w vr3, vr2, 12 // t0 t1 + vsadd.h vr8, vr3, vr1 // t0 t1 + vssub.h vr10, vr3, vr1 // t3 t2 + + vldrepl.w vr20, t0, 16 // 799 + vldrepl.w vr21, t0, 20 // 4017 + vldrepl.w vr22, t0, 24 // 3406 + vldrepl.w vr23, t0, 28 // 2276 + + vmul.w vr0, vr19, vr21 // in1 * 4017 + vmul.w vr1, vr19, vr20 // in1 * 799 + vmadd.w vr0, vr11, vr20 // in7 * 799 // t7a + vmsub.w vr1, vr11, vr21 // in7 * 4017 // t4a + vmul.w vr2, vr9, vr23 // in5 * 1138 + vmul.w vr3, vr9, vr22 // in5 * 1703 + vmadd.w vr2, vr7, vr22 // in3 * 1703 // t6a + vmsub.w vr3, vr7, vr23 // in3 * 1138 // t5a + vssrarni.h.w vr0, vr1, 12 // t4a t7a + vssrarni.h.w vr2, vr3, 12 // t5a t6a + vsadd.h vr9, vr0, vr2 // t4 t7 + vssub.h vr11, vr0, vr2 // t5a t6a + + vldrepl.w vr22, t0, 0 // 2896 + vexth.w.h vr18, vr11 // t6a + vsllwil.w.h vr19, vr11, 0 // t5a + vmul.w vr6, vr18, vr22 + vmul.w vr7, vr18, vr22 + vmadd.w vr6, vr19, vr22 // t6 + vmsub.w vr7, vr19, vr22 // t5 + vssrarni.h.w vr6, vr7, 12 // t5 t6 + + vilvh.d vr11, vr6, vr9 // t7 t6 + vilvl.d vr9, vr6, vr9 // t4 t5 + + vsadd.h \out0, vr8, vr11 // c[0] c[1] + vsadd.h \out1, vr10, vr9 // c[3] c[2] + vssub.h \out2, vr10, vr9 // c[4] c[5] + vssub.h \out3, vr8, vr11 // c[7] c[6] +.endm + +.macro dct_8x4_core_lsx2 in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 + vexth.w.h vr4, \in0 // in1 + vexth.w.h vr5, \in1 // in3 + vmul.w vr6, vr4, \in4 + vmul.w vr7, vr4, \in5 + vmadd.w vr6, vr5, \in5 // t3 + vmsub.w vr7, vr5, \in4 // t2 + vexth.w.h vr4, \in2 // in1 + vexth.w.h vr5, \in3 // in3 + vmul.w vr8, vr4, \in4 + vmul.w vr9, vr4, \in5 + vmadd.w vr8, vr5, \in5 // t3 + vmsub.w vr9, vr5, \in4 // t2 + vssrarni.h.w vr8, vr6, 12 // t3 + vssrarni.h.w vr9, vr7, 12 // t2 + + vsllwil.w.h vr4, \in0, 0 + vsllwil.w.h vr5, \in1, 0 + vmul.w vr11, vr4, \in6 + vmul.w vr12, vr4, \in7 + vmadd.w vr11, vr5, \in7 // t0 + vmsub.w vr12, vr5, \in6 // t1 + vsllwil.w.h vr4, \in2, 0 + vsllwil.w.h vr5, \in3, 0 + vmul.w vr13, vr4, \in6 + vmul.w vr14, vr4, \in7 + vmadd.w vr13, vr5, \in7 // t0 + vmsub.w vr14, vr5, \in6 // t1 + vssrarni.h.w vr13, vr11, 12 // t0 + vssrarni.h.w vr14, vr12, 12 // t1 + + vsadd.h \out0, vr13, vr8 + vsadd.h \out1, vr14, vr9 + vssub.h \out2, vr14, vr9 + vssub.h \out3, vr13, vr8 +.endm + +.macro DST_ADD_W8 in0, in1, in2, in3, in4, in5, in6, in7 + vsllwil.hu.bu vr10, \in0, 0 + vsllwil.hu.bu vr11, \in1, 0 + vsllwil.hu.bu vr12, \in2, 0 + vsllwil.hu.bu vr13, \in3, 0 + vadd.h vr10, \in4, vr10 + vadd.h vr11, \in5, vr11 + vadd.h vr12, \in6, vr12 + vadd.h vr13, \in7, vr13 + vssrani.bu.h vr11, vr10, 0 + vssrani.bu.h vr13, vr12, 0 + vstelm.d vr11, a0, 0, 0 + add.d t8, a0, a1 + vstelm.d vr11, t8, 0, 1 + vstelm.d vr13, t2, 0, 0 + add.d t8, t2, a1 + vstelm.d vr13, t8, 0, 1 +.endm + +.macro VLD_DST_ADD_W8 in0, in1, in2, in3 + vld vr0, 
a0, 0 + vldx vr1, a0, a1 + vld vr2, t2, 0 + vldx vr3, t2, a1 + + DST_ADD_W8 vr0, vr1, vr2, vr3, \in0, \in1, \in2, \in3 +.endm + +function inv_txfm_add_dct_dct_8x4_8bpc_lsx + bnez a3, .NO_HAS_DCONLY_8x4 + + ld.h t2, a2, 0 // dc + vldi vr0, 0x8b5 // 181 + vreplgr2vr.w vr1, t2 + vldi vr5, 0x880 // 128 + vmul.w vr2, vr0, vr1 + st.h zero, a2, 0 + vsrari.w vr2, vr2, 8 + vld vr10, a0, 0 + vmul.w vr2, vr2, vr0 + vldx vr11, a0, a1 + vsrari.w vr2, vr2, 8 + alsl.d t2, a1, a0, 1 + vmadd.w vr5, vr2, vr0 + vld vr12, t2, 0 + vssrarni.h.w vr5, vr5, 12 + vldx vr13, t2, a1 + + DST_ADD_W8 vr10, vr11, vr12, vr13, vr5, vr5, vr5, vr5 + + b .DCT_DCT_8X4_END + +.NO_HAS_DCONLY_8x4: + la.local t0, idct_coeffs + + vld vr0, a2, 0 + vld vr1, a2, 16 + vld vr2, a2, 32 + vld vr3, a2, 48 + + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_8x4_core_lsx1 vr0, vr1, vr2, vr3 + + vshuf4i.d vr1, vr1, 0x01 + vshuf4i.d vr3, vr3, 0x01 + + vilvl.h vr4, vr1, vr0 // 0 2 4 6 8 10 12 14 + vilvh.h vr5, vr1, vr0 // 1 3 5 7 9 11 13 15 + vilvl.h vr0, vr5, vr4 // 0 1 2 3 4 5 6 7 in0 + vilvh.h vr1, vr5, vr4 // 8 9 10 11 12 13 14 15 in1 + vilvl.h vr4, vr3, vr2 // 0 2 4 6 8 10 12 14 + vilvh.h vr5, vr3, vr2 // 1 3 5 7 9 11 13 15 + vilvl.h vr2, vr5, vr4 // 16 - 23 in2 + vilvh.h vr3, vr5, vr4 // 24 - 31 in3 + + la.local t0, idct_coeffs + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + + dct_8x4_core_lsx2 vr0, vr1, vr2, vr3, vr21, vr20, vr22, \ + vr22, vr15, vr16, vr17, vr18 + + vsrari.h vr15, vr15, 4 + vsrari.h vr16, vr16, 4 + vsrari.h vr17, vr17, 4 + vsrari.h vr18, vr18, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr15, vr16, vr17, vr18 + +.DCT_DCT_8X4_END: +endfunc + +.macro identity8_lsx in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 + vssrarni.h.w \in1, \in0, 0 + vssrarni.h.w \in3, \in2, 0 + vssrarni.h.w \in5, \in4, 0 + vssrarni.h.w \in7, \in6, 0 + vsadd.h \out0, \in1, \in1 + vsadd.h \out1, \in3, \in3 + vsadd.h \out2, \in5, \in5 + vsadd.h \out3, \in7, \in7 +.endm + +function inv_txfm_add_identity_identity_8x4_8bpc_lsx + la.local t0, idct_coeffs + + vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0 + vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1 + vld vr2, a2, 32 // 16 17 18 19 20 21 22 23 in2 + vld vr3, a2, 48 // 24 25 26 27 28 29 30 31 in3 + + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 + + identity8_lsx vr18, vr19, vr6, vr7, vr8, vr9, vr10, vr11, \ + vr19, vr7, vr9, vr11 + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + li.w t0, 1697 + vreplgr2vr.w vr20, t0 + identity_4x4_lsx vr19, vr19, vr20, vr19, vr19 + identity_4x4_lsx vr7, vr7, vr20, vr7, vr7 + identity_4x4_lsx vr9, vr9, vr20, vr9, vr9 + identity_4x4_lsx vr11, vr11, vr20, vr11, vr11 + + vsrari.h vr15, vr19, 4 + vsrari.h vr16, vr7, 4 + vsrari.h vr17, vr9, 4 + vsrari.h vr18, vr11, 4 + + vilvl.h vr4, vr16, vr15 + vilvh.h vr5, vr16, vr15 + vilvl.h vr11, vr5, vr4 + vilvh.h vr12, vr5, vr4 + vilvl.h vr4, vr18, vr17 + vilvh.h vr5, vr18, vr17 + vilvl.h vr13, vr5, vr4 + vilvh.h vr14, vr5, vr4 + vilvl.d vr15, 
vr13, vr11 + vilvh.d vr16, vr13, vr11 + vilvl.d vr17, vr14, vr12 + vilvh.d vr18, vr14, vr12 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr15, vr16, vr17, vr18 +endfunc + +const iadst8_coeffs, align=4 + .word 4076, 401, 3612, 1931 + .word 2598, 3166, 1189, 3920 + // idct_coeffs + .word 2896, 0, 1567, 3784, 0, 0, 0, 0 +endconst + +.macro vmadd_vmsub_vssrarni_hw_12 in0, in1, in2, in3, in4, in5, in6, in7, \ + in8, in9, in10, in11, out0, out1, out2, out3 + vmul.w \out0, \in0, \in4 + vmul.w \out1, \in0, \in5 + vmadd.w \out0, \in1, \in6 // t0a + vmsub.w \out1, \in1, \in7 // t1a + vmul.w \out2, \in2, \in8 + vmul.w \out3, \in2, \in9 + vmadd.w \out2, \in3, \in10 // t2a + vmsub.w \out3, \in3, \in11 // t3a + vssrarni.h.w \out1, \out0, 12 // t0a t1a + vssrarni.h.w \out3, \out2, 12 // t2a t3a +.endm + +.macro adst8x4_1d_lsx + la.local t0, iadst8_coeffs + + vldrepl.w vr20, t0, 0 // 4076 + vldrepl.w vr21, t0, 4 // 401 + vldrepl.w vr22, t0, 8 // 3612 + vldrepl.w vr23, t0, 12 // 1931 + + // vr13 t0a t1a vr15 t2a t3a + vmadd_vmsub_vssrarni_hw_12 vr11, vr18, vr9, vr6, vr20, vr21, vr21, vr20, \ + vr22, vr23, vr23, vr22, vr12, vr13, vr14, vr15 + vldrepl.w vr20, t0, 16 // 2598 + vldrepl.w vr21, t0, 20 // 3166 + vldrepl.w vr22, t0, 24 // 1189 + vldrepl.w vr23, t0, 28 // 3920 + + // vr18 t4a t5a vr6 t6a t7a + vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr19, vr10, vr20, vr21, vr21, vr20, \ + vr22, vr23, vr23, vr22, vr11, vr18, vr9, vr6 + + vsadd.h vr12, vr13, vr18 // t0 t1 + vsadd.h vr14, vr15, vr6 // t2 t3 + vssub.h vr16, vr13, vr18 // t4 t5 + vssub.h vr18, vr15, vr6 // t6 t7 + + la.local t0, idct_coeffs + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + vsllwil.w.h vr7, vr16, 0 // t4 + vexth.w.h vr8, vr16 // t5 + vsllwil.w.h vr10, vr18, 0 // t6 + vexth.w.h vr11, vr18 // t7 + + // vr13 out0 out7 vr17 out1 out6 + vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr11, vr10, vr21, vr20, vr20, vr21, \ + vr20, vr21, vr21, vr20, vr13, vr15, vr17, vr19 + vshuf4i.d vr19, vr19, 0x01 + + vsadd.h vr13, vr12, vr14 // out0 out7 + vssub.h vr16, vr12, vr14 // t2 t3 + vsadd.h vr17, vr15, vr19 // out1 out6 + vssub.h vr18, vr15, vr19 // t6 t7 + + vexth.w.h vr20, vr13 // out7 + vsllwil.w.h vr21, vr17, 0 // out1 + vneg.w vr20, vr20 + vneg.w vr21, vr21 + vssrarni.h.w vr21, vr20, 0 // out7 out1 + vilvl.d vr13, vr21, vr13 // out0 out7 + vilvh.d vr17, vr17, vr21 // out1 out6 + + vsllwil.w.h vr7, vr16, 0 // t2 + vexth.w.h vr8, vr16 // t3 + vsllwil.w.h vr10, vr18, 0 // t6 + vexth.w.h vr11, vr18 // t7 + + // vr15 out[3] out[4] vr18 out[2] out[5] + vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr10, vr11, vr22, vr22, vr22, vr22, \ + vr22, vr22, vr22, vr22, vr14, vr15, vr19, vr18 + + vexth.w.h vr20, vr18 // out5 + vsllwil.w.h vr21, vr15, 0 // out3 + vneg.w vr20, vr20 + vneg.w vr21, vr21 + vssrarni.h.w vr21, vr20, 0 // out5 out3 + vilvl.d vr18, vr21, vr18 // out2 out5 + vilvh.d vr15, vr15, vr21 // out3 out4 +.endm + +function inv_txfm_add_adst_dct_8x4_8bpc_lsx + vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0 + vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1 + vld vr2, a2, 32 // 16 17 18 19 20 21 22 23 in2 + vld vr3, a2, 48 // 24 25 26 27 28 29 30 31 in3 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 + + adst8x4_1d_lsx + + vilvl.h vr4, vr17, vr13 + vilvl.h vr5, vr15, vr18 + vilvl.w vr0, vr5, vr4 + vilvh.w vr1, vr5, vr4 + vilvh.h vr4, vr18, vr15 + vilvh.h 
vr5, vr13, vr17 + vilvl.w vr2, vr5, vr4 + vilvh.w vr3, vr5, vr4 + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + la.local t0, idct_coeffs + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_8x4_core_lsx2 vr0, vr1, vr2, vr3, vr21, vr20, vr22, \ + vr22, vr15, vr16, vr17, vr18 + + vsrari.h vr15, vr15, 4 + vsrari.h vr16, vr16, 4 + vsrari.h vr17, vr17, 4 + vsrari.h vr18, vr18, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr15, vr16, vr17, vr18 +endfunc + +function inv_txfm_add_dct_adst_8x4_8bpc_lsx + vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0 + vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1 + vld vr2, a2, 32 // 16 17 18 19 20 21 22 23 in2 + vld vr3, a2, 48 // 24 25 26 27 28 29 30 31 in3 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_8x4_core_lsx1 vr0, vr1, vr2, vr3 + + vshuf4i.d vr1, vr1, 0x01 + vshuf4i.d vr3, vr3, 0x01 + + vilvl.h vr4, vr1, vr0 + vilvh.h vr5, vr1, vr0 + vilvl.h vr0, vr5, vr4 + vilvh.h vr1, vr5, vr4 + vilvl.h vr4, vr3, vr2 + vilvh.h vr5, vr3, vr2 + vilvl.h vr2, vr5, vr4 + vilvh.h vr3, vr5, vr4 + + la.local t0, iadst4_coeffs + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + vsllwil.w.h vr10, vr0, 0 + vexth.w.h vr11, vr0 + vsllwil.w.h vr12, vr1, 0 + vexth.w.h vr13, vr1 + + adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13 + + vsllwil.w.h vr14, vr2, 0 + vexth.w.h vr15, vr2 + vsllwil.w.h vr16, vr3, 0 + vexth.w.h vr17, vr3 + + adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17 + + vssrarni.h.w vr14, vr10, 12 + vssrarni.h.w vr15, vr11, 12 + vssrarni.h.w vr16, vr12, 12 + vssrarni.h.w vr17, vr13, 12 + + vsrari.h vr14, vr14, 4 + vsrari.h vr15, vr15, 4 + vsrari.h vr16, vr16, 4 + vsrari.h vr17, vr17, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 +endfunc + +function inv_txfm_add_adst_adst_8x4_8bpc_lsx + vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0 + vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1 + vld vr2, a2, 32 // 16 17 18 19 20 21 22 23 in2 + vld vr3, a2, 48 // 24 25 26 27 28 29 30 31 in3 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 + + adst8x4_1d_lsx + + vilvl.h vr4, vr17, vr13 + vilvl.h vr5, vr15, vr18 + vilvl.w vr0, vr5, vr4 + vilvh.w vr1, vr5, vr4 + vilvh.h vr4, vr18, vr15 + vilvh.h vr5, vr13, vr17 + vilvl.w vr2, vr5, vr4 + vilvh.w vr3, vr5, vr4 + + la.local t0, iadst4_coeffs + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + vsllwil.w.h vr10, vr0, 0 + vexth.w.h vr11, vr0 + vsllwil.w.h vr12, vr1, 0 + vexth.w.h vr13, vr1 + + adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13 + + vsllwil.w.h vr14, vr2, 0 + vexth.w.h vr15, vr2 + vsllwil.w.h vr16, vr3, 0 + vexth.w.h vr17, vr3 + + adst4x4_1d_lsx vr14, vr15, 
vr16, vr17, vr14, vr15, vr16, vr17 + + vssrarni.h.w vr14, vr10, 12 + vssrarni.h.w vr15, vr11, 12 + vssrarni.h.w vr16, vr12, 12 + vssrarni.h.w vr17, vr13, 12 + + vsrari.h vr14, vr14, 4 + vsrari.h vr15, vr15, 4 + vsrari.h vr16, vr16, 4 + vsrari.h vr17, vr17, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 +endfunc + +function inv_txfm_add_flipadst_adst_8x4_8bpc_lsx + vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0 + vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1 + vld vr2, a2, 32 // 16 17 18 19 20 21 22 23 in2 + vld vr3, a2, 48 // 24 25 26 27 28 29 30 31 in3 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 + + adst8x4_1d_lsx + + vilvl.h vr20, vr15, vr13 + vilvl.h vr21, vr18, vr17 + vilvl.w vr0, vr21, vr20 + vilvh.w vr1, vr21, vr20 + vilvh.h vr20, vr15, vr13 + vilvh.h vr21, vr18, vr17 + vilvl.w vr2, vr21, vr20 + vilvh.w vr3, vr21, vr20 + vshuf4i.h vr0, vr0, 0x2d + vshuf4i.h vr1, vr1, 0x2d + vshuf4i.h vr2, vr2, 0x78 + vshuf4i.h vr3, vr3, 0x78 + + la.local t0, iadst4_coeffs + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + vsllwil.w.h vr10, vr2, 0 + vexth.w.h vr11, vr2 + vsllwil.w.h vr12, vr3, 0 + vexth.w.h vr13, vr3 + + adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13 + + vsllwil.w.h vr14, vr0, 0 + vexth.w.h vr15, vr0 + vsllwil.w.h vr16, vr1, 0 + vexth.w.h vr17, vr1 + + adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17 + + vssrarni.h.w vr14, vr10, 12 + vssrarni.h.w vr15, vr11, 12 + vssrarni.h.w vr16, vr12, 12 + vssrarni.h.w vr17, vr13, 12 + + vsrari.h vr14, vr14, 4 + vsrari.h vr15, vr15, 4 + vsrari.h vr16, vr16, 4 + vsrari.h vr17, vr17, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 +endfunc + +function inv_txfm_add_adst_flipadst_8x4_8bpc_lsx + vld vr0, a2, 0 // in0 + vld vr1, a2, 16 // in1 + vld vr2, a2, 32 // in2 + vld vr3, a2, 48 // in3 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // 0 8 16 24 1 9 17 25 in0 in1 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // 2 10 18 26 3 11 19 27 in2 in3 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // 4 12 20 28 5 13 21 29 in4 in5 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7 + + adst8x4_1d_lsx + + vilvl.h vr4, vr17, vr13 + vilvl.h vr5, vr15, vr18 + vilvl.w vr0, vr5, vr4 + vilvh.w vr1, vr5, vr4 + vilvh.h vr4, vr18, vr15 + vilvh.h vr5, vr13, vr17 + vilvl.w vr2, vr5, vr4 + vilvh.w vr3, vr5, vr4 + + la.local t0, iadst4_coeffs + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + vsllwil.w.h vr10, vr0, 0 + vexth.w.h vr11, vr0 + vsllwil.w.h vr12, vr1, 0 + vexth.w.h vr13, vr1 + + adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13 + + vsllwil.w.h vr14, vr2, 0 + vexth.w.h vr15, vr2 + vsllwil.w.h vr16, vr3, 0 + vexth.w.h vr17, vr3 + + adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17 + + vssrarni.h.w vr14, vr10, 12 + vssrarni.h.w vr15, vr11, 12 + vssrarni.h.w vr16, vr12, 12 + vssrarni.h.w vr17, vr13, 12 + + vsrari.h vr14, vr14, 4 + vsrari.h vr15, vr15, 4 + vsrari.h vr16, 
vr16, 4 + vsrari.h vr17, vr17, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr17, vr16, vr15, vr14 +endfunc + +function inv_txfm_add_flipadst_dct_8x4_8bpc_lsx + vld vr0, a2, 0 // in0 + vld vr1, a2, 16 // in1 + vld vr2, a2, 32 // in2 + vld vr3, a2, 48 // in3 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // 0 8 16 24 1 9 17 25 in0 in1 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // 2 10 18 26 3 11 19 27 in2 in3 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // 4 12 20 28 5 13 21 29 in4 in5 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7 + + adst8x4_1d_lsx + + vilvl.h vr20, vr15, vr13 + vilvl.h vr21, vr18, vr17 + vilvl.w vr0, vr21, vr20 + vilvh.w vr1, vr21, vr20 + vilvh.h vr20, vr15, vr13 + vilvh.h vr21, vr18, vr17 + vilvl.w vr2, vr21, vr20 + vilvh.w vr3, vr21, vr20 + vshuf4i.h vr0, vr0, 0x2d + vshuf4i.h vr1, vr1, 0x2d + vshuf4i.h vr2, vr2, 0x78 + vshuf4i.h vr3, vr3, 0x78 + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + la.local t0, idct_coeffs + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_8x4_core_lsx2 vr2, vr3, vr0, vr1, vr21, vr20, vr22, \ + vr22, vr15, vr16, vr17, vr18 + + vsrari.h vr15, vr15, 4 + vsrari.h vr16, vr16, 4 + vsrari.h vr17, vr17, 4 + vsrari.h vr18, vr18, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr15, vr16, vr17, vr18 +endfunc + +function inv_txfm_add_dct_flipadst_8x4_8bpc_lsx + la.local t0, idct_coeffs + + vld vr0, a2, 0 // in0 + vld vr1, a2, 16 // in1 + vld vr2, a2, 32 // in2 + vld vr3, a2, 48 // in3 + + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // in1 8 - 15 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // in2 16 - 23 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31 + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_8x4_core_lsx1 vr0, vr1, vr2, vr3 + + vshuf4i.d vr1, vr1, 0x01 + vshuf4i.d vr3, vr3, 0x01 + + vilvl.h vr4, vr1, vr0 + vilvh.h vr5, vr1, vr0 + vilvl.h vr0, vr5, vr4 + vilvh.h vr1, vr5, vr4 + vilvl.h vr4, vr3, vr2 + vilvh.h vr5, vr3, vr2 + vilvl.h vr2, vr5, vr4 + vilvh.h vr3, vr5, vr4 + + la.local t0, iadst4_coeffs + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + vsllwil.w.h vr10, vr0, 0 // in0 + vexth.w.h vr11, vr0 // in1 + vsllwil.w.h vr12, vr1, 0 // in2 + vexth.w.h vr13, vr1 // in3 + adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13 + + vsllwil.w.h vr14, vr2, 0 + vexth.w.h vr15, vr2 + vsllwil.w.h vr16, vr3, 0 + vexth.w.h vr17, vr3 + adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17 + + vssrarni.h.w vr14, vr10, 12 + vssrarni.h.w vr15, vr11, 12 + vssrarni.h.w vr16, vr12, 12 + vssrarni.h.w vr17, vr13, 12 + vsrari.h vr14, vr14, 4 + vsrari.h vr15, vr15, 4 + vsrari.h vr16, vr16, 4 + vsrari.h vr17, vr17, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr17, vr16, vr15, vr14 +endfunc + +function inv_txfm_add_flipadst_flipadst_8x4_8bpc_lsx + vld vr0, a2, 0 // in0 + vld vr1, a2, 16 // in1 + vld vr2, a2, 32 // in2 + vld vr3, a2, 48 // in3 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // 0 8 16 24 1 9 17 25 in0 in1 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // 2 10 18 26 3 11 
19 27 in2 in3 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // 4 12 20 28 5 13 21 29 in4 in5 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7 + + adst8x4_1d_lsx + + vilvl.h vr20, vr15, vr13 + vilvl.h vr21, vr18, vr17 + vilvl.w vr0, vr21, vr20 + vilvh.w vr1, vr21, vr20 + vilvh.h vr20, vr15, vr13 + vilvh.h vr21, vr18, vr17 + vilvl.w vr2, vr21, vr20 + vilvh.w vr3, vr21, vr20 + vshuf4i.h vr0, vr0, 0x2d + vshuf4i.h vr1, vr1, 0x2d + vshuf4i.h vr2, vr2, 0x78 + vshuf4i.h vr3, vr3, 0x78 + + la.local t0, iadst4_coeffs + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + vsllwil.w.h vr10, vr2, 0 // in0 + vexth.w.h vr11, vr2 // in1 + vsllwil.w.h vr12, vr3, 0 // in2 + vexth.w.h vr13, vr3 // in3 + adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13 + + vsllwil.w.h vr14, vr0, 0 + vexth.w.h vr15, vr0 + vsllwil.w.h vr16, vr1, 0 + vexth.w.h vr17, vr1 + adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17 + + vssrarni.h.w vr14, vr10, 12 + vssrarni.h.w vr15, vr11, 12 + vssrarni.h.w vr16, vr12, 12 + vssrarni.h.w vr17, vr13, 12 + + vsrari.h vr14, vr14, 4 + vsrari.h vr15, vr15, 4 + vsrari.h vr16, vr16, 4 + vsrari.h vr17, vr17, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr17, vr16, vr15, vr14 +endfunc + +function inv_txfm_add_dct_identity_8x4_8bpc_lsx + vld vr0, a2, 0 // in0 + vld vr1, a2, 16 // in1 + vld vr2, a2, 32 // in2 + vld vr3, a2, 48 // in3 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // in1 8 - 15 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // in2 16 - 23 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31 + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_8x4_core_lsx1 vr0, vr1, vr2, vr3 + + vshuf4i.d vr1, vr1, 0x01 + vshuf4i.d vr3, vr3, 0x01 + + vilvl.h vr4, vr1, vr0 + vilvh.h vr5, vr1, vr0 + vilvl.h vr0, vr5, vr4 + vilvh.h vr1, vr5, vr4 + vilvl.h vr4, vr3, vr2 + vilvh.h vr5, vr3, vr2 + vilvl.h vr2, vr5, vr4 + vilvh.h vr3, vr5, vr4 + vilvl.d vr14, vr2, vr0 + vilvh.d vr15, vr2, vr0 + vilvl.d vr16, vr3, vr1 + vilvh.d vr17, vr3, vr1 + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + li.w t0, 1697 + vreplgr2vr.w vr20, t0 + + identity_4x4_lsx vr14, vr14, vr20, vr14, vr14 + identity_4x4_lsx vr15, vr15, vr20, vr15, vr15 + identity_4x4_lsx vr16, vr16, vr20, vr16, vr16 + identity_4x4_lsx vr17, vr17, vr20, vr17, vr17 + + vsrari.h vr14, vr14, 4 + vsrari.h vr15, vr15, 4 + vsrari.h vr16, vr16, 4 + vsrari.h vr17, vr17, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 +endfunc + +function inv_txfm_add_identity_dct_8x4_8bpc_lsx + vld vr0, a2, 0 // in0 + vld vr1, a2, 16 // in1 + vld vr2, a2, 32 // in2 + vld vr3, a2, 48 // in3 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // in1 8 - 15 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // in2 16 - 23 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31 + + identity8_lsx vr18, vr19, vr6, vr7, vr8, vr9, vr10, vr11, \ + vr19, vr7, vr9, vr11 + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + vilvl.h vr4, vr7, vr19 + vilvh.h vr5, vr7, vr19 + vilvl.h 
vr0, vr5, vr4 + vilvh.h vr1, vr5, vr4 + vilvl.h vr4, vr11, vr9 + vilvh.h vr5, vr11, vr9 + vilvl.h vr2, vr5, vr4 + vilvh.h vr3, vr5, vr4 + + la.local t0, idct_coeffs + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_8x4_core_lsx2 vr0, vr1, vr2, vr3, vr21, vr20, vr22, \ + vr22, vr15, vr16, vr17, vr18 + + vsrari.h vr15, vr15, 4 + vsrari.h vr16, vr16, 4 + vsrari.h vr17, vr17, 4 + vsrari.h vr18, vr18, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr15, vr16, vr17, vr18 +endfunc + +function inv_txfm_add_flipadst_identity_8x4_8bpc_lsx + vld vr0, a2, 0 // in0 + vld vr1, a2, 16 // in1 + vld vr2, a2, 32 // in2 + vld vr3, a2, 48 // in3 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // 0 8 16 24 1 9 17 25 in0 in1 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // 2 10 18 26 3 11 19 27 in2 in3 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // 4 12 20 28 5 13 21 29 in4 in5 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7 + + adst8x4_1d_lsx + + vilvl.h vr20, vr15, vr13 + vilvl.h vr21, vr18, vr17 + vilvl.w vr0, vr21, vr20 + vilvh.w vr1, vr21, vr20 + vilvh.h vr20, vr15, vr13 + vilvh.h vr21, vr18, vr17 + vilvl.w vr2, vr21, vr20 + vilvh.w vr3, vr21, vr20 + vshuf4i.h vr0, vr0, 0x2d + vshuf4i.h vr1, vr1, 0x2d + vshuf4i.h vr2, vr2, 0x78 + vshuf4i.h vr3, vr3, 0x78 + vilvl.d vr14, vr0, vr2 // in0 + vilvh.d vr15, vr0, vr2 // in1 + vilvl.d vr16, vr1, vr3 // in2 + vilvh.d vr17, vr1, vr3 // in3 + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + li.w t0, 1697 + vreplgr2vr.w vr20, t0 + + identity_4x4_lsx vr14, vr14, vr20, vr14, vr14 + identity_4x4_lsx vr15, vr15, vr20, vr15, vr15 + identity_4x4_lsx vr16, vr16, vr20, vr16, vr16 + identity_4x4_lsx vr17, vr17, vr20, vr17, vr17 + + vsrari.h vr14, vr14, 4 + vsrari.h vr15, vr15, 4 + vsrari.h vr16, vr16, 4 + vsrari.h vr17, vr17, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 +endfunc + +function inv_txfm_add_identity_flipadst_8x4_8bpc_lsx + vld vr0, a2, 0 // in0 + vld vr1, a2, 16 // in1 + vld vr2, a2, 32 // in2 + vld vr3, a2, 48 // in3 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // in1 8 - 15 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // in2 16 - 23 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31 + + identity8_lsx vr18, vr19, vr6, vr7, vr8, vr9, vr10, vr11, \ + vr19, vr7, vr9, vr11 + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + vilvl.h vr4, vr7, vr19 + vilvh.h vr5, vr7, vr19 + vilvl.h vr0, vr5, vr4 + vilvh.h vr1, vr5, vr4 + vilvl.h vr4, vr11, vr9 + vilvh.h vr5, vr11, vr9 + vilvl.h vr2, vr5, vr4 + vilvh.h vr3, vr5, vr4 + + la.local t0, iadst4_coeffs + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + vsllwil.w.h vr10, vr0, 0 // in0 + vexth.w.h vr11, vr0 // in1 + vsllwil.w.h vr12, vr1, 0 // in2 + vexth.w.h vr13, vr1 // in3 + adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13 + + vsllwil.w.h vr14, vr2, 0 + vexth.w.h vr15, vr2 + vsllwil.w.h vr16, vr3, 0 + vexth.w.h vr17, vr3 + adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17 + + vssrarni.h.w vr14, vr10, 12 + vssrarni.h.w vr15, vr11, 12 + 
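+    // The vssrarni.h.w ..., 12 instructions around this point narrow the
+    // 32-bit ADST results back to 16 bits with rounding and saturation,
+    // roughly out = clip_int16((x + 2048) >> 12) per lane, before the final
+    // vsrari.h ..., 4 shift and the add to the destination pixels.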
vssrarni.h.w vr16, vr12, 12 + vssrarni.h.w vr17, vr13, 12 + + vsrari.h vr14, vr14, 4 + vsrari.h vr15, vr15, 4 + vsrari.h vr16, vr16, 4 + vsrari.h vr17, vr17, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr17, vr16, vr15, vr14 +endfunc + +function inv_txfm_add_adst_identity_8x4_8bpc_lsx + vld vr0, a2, 0 // in0 + vld vr1, a2, 16 // in1 + vld vr2, a2, 32 // in2 + vld vr3, a2, 48 // in3 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // 0 8 16 24 1 9 17 25 in0 in1 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // 2 10 18 26 3 11 19 27 in2 in3 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // 4 12 20 28 5 13 21 29 in4 in5 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7 + + adst8x4_1d_lsx + + vilvl.h vr4, vr17, vr13 + vilvl.h vr5, vr15, vr18 + vilvl.w vr14, vr5, vr4 // in0 in1 + vilvh.w vr16, vr5, vr4 // in2 in3 + vilvh.h vr4, vr18, vr15 + vilvh.h vr5, vr13, vr17 + vilvl.w vr17, vr5, vr4 + vilvh.w vr18, vr5, vr4 + vilvl.d vr10, vr17, vr14 // in0 + vilvh.d vr11, vr17, vr14 // in1 + vilvl.d vr12, vr18, vr16 // in2 + vilvh.d vr13, vr18, vr16 // in3 + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + li.w t0, 1697 + vreplgr2vr.w vr20, t0 + + identity_4x4_lsx vr10, vr10, vr20, vr10, vr15 + identity_4x4_lsx vr11, vr11, vr20, vr11, vr16 + identity_4x4_lsx vr12, vr12, vr20, vr12, vr17 + identity_4x4_lsx vr13, vr13, vr20, vr13, vr18 + + vsrari.h vr15, vr15, 4 + vsrari.h vr16, vr16, 4 + vsrari.h vr17, vr17, 4 + vsrari.h vr18, vr18, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr15, vr16, vr17, vr18 +endfunc + +function inv_txfm_add_identity_adst_8x4_8bpc_lsx + vld vr0, a2, 0 // in0 + vld vr1, a2, 16 // in1 + vld vr2, a2, 32 // in2 + vld vr3, a2, 48 // in3 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // in1 8 - 15 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // in2 16 - 23 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31 + + identity8_lsx vr18, vr19, vr6, vr7, vr8, vr9, vr10, vr11, \ + vr0, vr1, vr2, vr3 + + vilvl.h vr4, vr1, vr0 // 0 2 4 6 8 10 12 14 + vilvh.h vr5, vr1, vr0 // 1 3 5 7 9 11 13 15 + vilvl.h vr0, vr5, vr4 // 0 1 2 3 4 5 6 7 + vilvh.h vr1, vr5, vr4 // 8 9 10 11 12 13 14 15 + vilvl.h vr4, vr3, vr2 // 0 2 4 6 8 10 12 14 + vilvh.h vr5, vr3, vr2 // 1 3 5 7 9 11 13 15 + vilvl.h vr2, vr5, vr4 // 0 1 2 3 4 5 6 7 + vilvh.h vr3, vr5, vr4 // 8 9 10 11 12 13 14 15 + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + la.local t0, iadst4_coeffs + + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + vsllwil.w.h vr10, vr0, 0 + vexth.w.h vr11, vr0 + vsllwil.w.h vr12, vr1, 0 + vexth.w.h vr13, vr1 + + adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13 + + vsllwil.w.h vr14, vr2, 0 + vexth.w.h vr15, vr2 + vsllwil.w.h vr16, vr3, 0 + vexth.w.h vr17, vr3 + + adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17 + + vssrarni.h.w vr14, vr10, 12 + vssrarni.h.w vr15, vr11, 12 + vssrarni.h.w vr16, vr12, 12 + vssrarni.h.w vr17, vr13, 12 + + vsrari.h vr14, vr14, 4 + vsrari.h vr15, vr15, 4 + vsrari.h vr16, vr16, 4 + vsrari.h vr17, vr17, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 +endfunc diff --git a/src/loongarch/itx.h b/src/loongarch/itx.h index de42e00..880a8f1 100644 --- a/src/loongarch/itx.h +++ 
b/src/loongarch/itx.h
@@ -49,6 +49,23 @@ decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_4x4, lsx));

+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_8x4, lsx));
+
static ALWAYS_INLINE void itx_dsp_init_loongarch(Dav1dInvTxfmDSPContext *const c, int bpc) {
#if BITDEPTH == 8
  const unsigned flags = dav1d_get_cpu_flags();
@@ -74,6 +91,24 @@ static ALWAYS_INLINE void itx_dsp_init_loongarch(Dav1dInvTxfmDSPContext *const c
  c->itxfm_add[TX_4X4][V_FLIPADST] = dav1d_inv_txfm_add_identity_flipadst_4x4_8bpc_lsx;
  c->itxfm_add[TX_4X4][V_ADST] = dav1d_inv_txfm_add_identity_adst_4x4_8bpc_lsx;
  c->itxfm_add[TX_4X4][H_ADST] = dav1d_inv_txfm_add_adst_identity_4x4_8bpc_lsx;
+
+ c->itxfm_add[RTX_8X4][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][IDTX] = dav1d_inv_txfm_add_identity_identity_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][ADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_adst_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][FLIPADST_ADST] = dav1d_inv_txfm_add_adst_flipadst_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][FLIPADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_flipadst_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][H_DCT] = dav1d_inv_txfm_add_dct_identity_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][V_DCT] = dav1d_inv_txfm_add_identity_dct_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][H_FLIPADST] = dav1d_inv_txfm_add_flipadst_identity_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][V_FLIPADST] = dav1d_inv_txfm_add_identity_flipadst_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][H_ADST] = dav1d_inv_txfm_add_adst_identity_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][V_ADST] = dav1d_inv_txfm_add_identity_adst_8x4_8bpc_lsx;
+
#endif
}

-- cgit v1.2.3

From 32809a022245f0ffca1f0210dd2e75cd55985517 Mon Sep 17 00:00:00 2001
From: yuanhecai
Date: Fri, 1 Dec 2023 15:20:02 +0800
Subject: loongarch: Improve the performance of itx_8bpc.add_8x8 series functions

Relative speedup over C code:
inv_txfm_add_8x8_adst_adst_0_8bpc_c: 70.1 ( 1.00x)
inv_txfm_add_8x8_adst_adst_0_8bpc_lsx: 9.4 ( 7.45x)
inv_txfm_add_8x8_adst_adst_1_8bpc_c: 70.1 ( 1.00x)
inv_txfm_add_8x8_adst_adst_1_8bpc_lsx: 9.4 ( 7.43x)
inv_txfm_add_8x8_adst_dct_0_8bpc_c: 68.7 ( 1.00x)
inv_txfm_add_8x8_adst_dct_0_8bpc_lsx: 7.6 ( 9.08x)
inv_txfm_add_8x8_adst_dct_1_8bpc_c: 68.7 ( 1.00x)
inv_txfm_add_8x8_adst_dct_1_8bpc_lsx: 7.6 ( 9.00x)
inv_txfm_add_8x8_adst_flipadst_0_8bpc_c: 70.3 ( 1.00x)
inv_txfm_add_8x8_adst_flipadst_0_8bpc_lsx: 9.4 ( 7.47x)
inv_txfm_add_8x8_adst_flipadst_1_8bpc_c: 70.3 ( 1.00x)
inv_txfm_add_8x8_adst_flipadst_1_8bpc_lsx: 9.4 ( 7.47x)
inv_txfm_add_8x8_adst_identity_0_8bpc_c: 50.6 ( 1.00x)
inv_txfm_add_8x8_adst_identity_0_8bpc_lsx: 5.7 ( 8.88x)
inv_txfm_add_8x8_adst_identity_1_8bpc_c: 49.8 ( 1.00x)
inv_txfm_add_8x8_adst_identity_1_8bpc_lsx: 5.7 ( 8.73x)
inv_txfm_add_8x8_dct_adst_0_8bpc_c: 67.9 ( 1.00x)
inv_txfm_add_8x8_dct_adst_0_8bpc_lsx: 7.5 ( 9.05x)
inv_txfm_add_8x8_dct_adst_1_8bpc_c: 67.9 ( 1.00x)
inv_txfm_add_8x8_dct_adst_1_8bpc_lsx: 7.4 ( 9.13x)
inv_txfm_add_8x8_dct_dct_0_8bpc_c: 9.1 ( 1.00x)
inv_txfm_add_8x8_dct_dct_0_8bpc_lsx: 0.8 (11.20x)
inv_txfm_add_8x8_dct_dct_1_8bpc_c: 66.5 ( 1.00x)
inv_txfm_add_8x8_dct_dct_1_8bpc_lsx: 5.0 (13.42x)
inv_txfm_add_8x8_dct_flipadst_0_8bpc_c: 67.9 ( 1.00x)
inv_txfm_add_8x8_dct_flipadst_0_8bpc_lsx: 7.5 ( 9.06x)
inv_txfm_add_8x8_dct_flipadst_1_8bpc_c: 67.9 ( 1.00x)
inv_txfm_add_8x8_dct_flipadst_1_8bpc_lsx: 7.5 ( 9.06x)
inv_txfm_add_8x8_dct_identity_0_8bpc_c: 47.3 ( 1.00x)
inv_txfm_add_8x8_dct_identity_0_8bpc_lsx: 3.7 (12.70x)
inv_txfm_add_8x8_dct_identity_1_8bpc_c: 47.3 ( 1.00x)
inv_txfm_add_8x8_dct_identity_1_8bpc_lsx: 3.7 (12.70x)
inv_txfm_add_8x8_flipadst_adst_0_8bpc_c: 70.3 ( 1.00x)
inv_txfm_add_8x8_flipadst_adst_0_8bpc_lsx: 9.6 ( 7.35x)
inv_txfm_add_8x8_flipadst_adst_1_8bpc_c: 70.3 ( 1.00x)
inv_txfm_add_8x8_flipadst_adst_1_8bpc_lsx: 9.6 ( 7.33x)
inv_txfm_add_8x8_flipadst_dct_0_8bpc_c: 68.9 ( 1.00x)
inv_txfm_add_8x8_flipadst_dct_0_8bpc_lsx: 7.6 ( 9.10x)
inv_txfm_add_8x8_flipadst_dct_1_8bpc_c: 68.9 ( 1.00x)
inv_txfm_add_8x8_flipadst_dct_1_8bpc_lsx: 7.6 ( 9.11x)
inv_txfm_add_8x8_flipadst_flipadst_0_8bpc_c: 70.4 ( 1.00x)
inv_txfm_add_8x8_flipadst_flipadst_0_8bpc_lsx: 9.6 ( 7.32x)
inv_txfm_add_8x8_flipadst_flipadst_1_8bpc_c: 70.4 ( 1.00x)
inv_txfm_add_8x8_flipadst_flipadst_1_8bpc_lsx: 9.6 ( 7.34x)
inv_txfm_add_8x8_flipadst_identity_0_8bpc_c: 49.9 ( 1.00x)
inv_txfm_add_8x8_flipadst_identity_0_8bpc_lsx: 5.6 ( 8.91x)
inv_txfm_add_8x8_flipadst_identity_1_8bpc_c: 49.9 ( 1.00x)
inv_txfm_add_8x8_flipadst_identity_1_8bpc_lsx: 5.6 ( 8.91x)
inv_txfm_add_8x8_identity_adst_0_8bpc_c: 51.3 ( 1.00x)
inv_txfm_add_8x8_identity_adst_0_8bpc_lsx: 5.5 ( 9.28x)
inv_txfm_add_8x8_identity_adst_1_8bpc_c: 51.3 ( 1.00x)
inv_txfm_add_8x8_identity_adst_1_8bpc_lsx: 5.5 ( 9.28x)
inv_txfm_add_8x8_identity_dct_0_8bpc_c: 50.5 ( 1.00x)
inv_txfm_add_8x8_identity_dct_0_8bpc_lsx: 3.6 (13.83x)
inv_txfm_add_8x8_identity_dct_1_8bpc_c: 50.6 ( 1.00x)
inv_txfm_add_8x8_identity_dct_1_8bpc_lsx: 3.6 (13.87x)
inv_txfm_add_8x8_identity_flipadst_0_8bpc_c: 52.0 ( 1.00x)
inv_txfm_add_8x8_identity_flipadst_0_8bpc_lsx: 5.5 ( 9.40x)
inv_txfm_add_8x8_identity_flipadst_1_8bpc_c: 52.0 ( 1.00x)
inv_txfm_add_8x8_identity_flipadst_1_8bpc_lsx: 5.5 ( 9.39x)
inv_txfm_add_8x8_identity_identity_0_8bpc_c: 31.1 ( 1.00x)
inv_txfm_add_8x8_identity_identity_0_8bpc_lsx: 1.8 (17.06x)
inv_txfm_add_8x8_identity_identity_1_8bpc_c: 31.1 ( 1.00x)
inv_txfm_add_8x8_identity_identity_1_8bpc_lsx: 1.8 (16.97x)
---
 src/loongarch/itx.S | 1853 +++++++++++++++++++++++++++++++++++++++++++++++++++
 src/loongarch/itx.h |   33 +
 2 files changed, 1886 insertions(+)

diff --git a/src/loongarch/itx.S b/src/loongarch/itx.S
index 4ea353e..b14e1c4 100644
--- a/src/loongarch/itx.S
+++ b/src/loongarch/itx.S
@@ 
-2168,3 +2168,1856 @@ function inv_txfm_add_identity_adst_8x4_8bpc_lsx VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 endfunc + +function inv_txfm_add_identity_identity_8x8_8bpc_lsx + + vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr14, vr15 + + // identity8 + vsllwil.w.h vr6, vr0, 1 + vsllwil.w.h vr7, vr1, 1 + vsllwil.w.h vr8, vr2, 1 + vsllwil.w.h vr9, vr3, 1 + vsllwil.w.h vr10, vr4, 1 + vsllwil.w.h vr11, vr5, 1 + vsllwil.w.h vr12, vr14, 1 + vsllwil.w.h vr13, vr15, 1 + +.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr14, vr15 + vexth.w.h \i, \i +.endr + +.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr14, vr15 + vslli.w \i, \i, 1 +.endr + + vssrarni.h.w vr0, vr6, 1 // in0 + vssrarni.h.w vr1, vr7, 1 // in1 + vssrarni.h.w vr2, vr8, 1 // in2 + vssrarni.h.w vr3, vr9, 1 // in3 + vssrarni.h.w vr4, vr10, 1 // in4 + vssrarni.h.w vr5, vr11, 1 // in5 + vssrarni.h.w vr14, vr12, 1 // in6 + vssrarni.h.w vr15, vr13, 1 // in7 + + vreplgr2vr.h vr23, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr23, a2, \i +.endr + + LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr14, vr15, \ + vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, \ + vr6, vr7, vr8, vr9, vr10, vr11, vr12 vr13 + + vsllwil.w.h vr6, vr16, 1 + vsllwil.w.h vr7, vr17, 1 + vsllwil.w.h vr8, vr18, 1 + vsllwil.w.h vr9, vr19, 1 + vsllwil.w.h vr10, vr20, 1 + vsllwil.w.h vr11, vr21, 1 + vsllwil.w.h vr12, vr22, 1 + vsllwil.w.h vr13, vr23, 1 + +.irp i, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 + vexth.w.h \i, \i +.endr + +.irp i, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 + vslli.w \i, \i, 1 +.endr + + vssrarni.h.w vr16, vr6, 4 // in0 + vssrarni.h.w vr17, vr7, 4 // in1 + vssrarni.h.w vr18, vr8, 4 // in2 + vssrarni.h.w vr19, vr9, 4 // in3 + vssrarni.h.w vr20, vr10, 4 // in4 + vssrarni.h.w vr21, vr11, 4 // in5 + vssrarni.h.w vr22, vr12, 4 // in6 + vssrarni.h.w vr23, vr13, 4 // in7 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr16, vr17, vr18, vr19 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr20, vr21, vr22, vr23 + +endfunc + +.macro adst8x8_1d_lsx out0, out1, out2, out3 + la.local t0, iadst8_coeffs + + vldrepl.w vr20, t0, 0 // 4076 + vldrepl.w vr21, t0, 4 // 401 + vldrepl.w vr22, t0, 8 // 3612 + vldrepl.w vr23, t0, 12 // 1931 + + // vr13 t0a t1a vr15 t2a t3a + vmadd_vmsub_vssrarni_hw_12 vr11, vr18, vr9, vr6, vr20, vr21, vr21, vr20, \ + vr22, vr23, vr23, vr22, vr12, vr13, vr14, vr15 + vldrepl.w vr20, t0, 16 // 2598 + vldrepl.w vr21, t0, 20 // 3166 + vldrepl.w vr22, t0, 24 // 1189 + vldrepl.w vr23, t0, 28 // 3920 + + // vr18 t4a t5a vr6 t6a t7a + vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr19, vr10, vr20, vr21, vr21, vr20, \ + vr22, vr23, vr23, vr22, vr11, vr18, vr9, vr6 + + vsadd.h vr12, vr13, vr18 // t0 t1 + vsadd.h vr14, vr15, vr6 // t2 t3 + vssub.h vr9, vr13, vr18 // t4 t5 + vssub.h vr18, vr15, vr6 // t6 t7 + + la.local t0, idct_coeffs + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + vsllwil.w.h vr7, vr9, 0 // t4 + vexth.w.h vr8, vr9 // t5 + vsllwil.w.h vr10, vr18, 0 // t6 + vexth.w.h vr11, vr18 // t7 + + // vr13 out0 out7 vr17 out1 out6 + vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr11, vr10, vr21, vr20, vr20, vr21, \ + vr20, vr21, vr21, vr20, vr13, vr15, vr18, vr19 + vshuf4i.d vr19, vr19, 0x01 + + vsadd.h vr13, vr12, vr14 // out0 out7 + vssub.h vr6, vr12, vr14 // t2 t3 + vsadd.h vr7, vr15, vr19 // out1 out6 + vssub.h vr18, vr15, vr19 // t6 t7 + + vexth.w.h vr20, vr13 // out7 + vsllwil.w.h vr21, vr7, 0 // out1 + vneg.w vr20, vr20 + vneg.w vr21, vr21 + vssrarni.h.w vr21, vr20, 0 // out7 out1 + 
vilvl.d \out0, vr21, vr13 // out0 out7 + vilvh.d \out1, vr7, vr21 // out1 out6 + + vsllwil.w.h vr7, vr6, 0 // t2 + vexth.w.h vr8, vr6 // t3 + vsllwil.w.h vr10, vr18, 0 // t6 + vexth.w.h vr11, vr18 // t7 + + // vr15 out[3] out[4] vr18 out[2] out[5] + vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr10, vr11, vr22, vr22, vr22, vr22, \ + vr22, vr22, vr22, vr22, vr14, vr15, vr19, vr18 + + vexth.w.h vr20, vr18 // out5 + vsllwil.w.h vr21, vr15, 0 // out3 + vneg.w vr20, vr20 + vneg.w vr21, vr21 + vssrarni.h.w vr21, vr20, 0 // out5 out3 + vilvl.d \out2, vr21, vr18 // out2 out5 + vilvh.d \out3, vr15, vr21 // out3 out4 +.endm + +function inv_txfm_add_adst_dct_8x8_8bpc_lsx + addi.d sp, sp, -32 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + + vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17 + + vsllwil.w.h vr18, vr0, 0 + vsllwil.w.h vr19, vr1, 0 + vsllwil.w.h vr6, vr2, 0 + vsllwil.w.h vr7, vr3, 0 + vsllwil.w.h vr8, vr4, 0 + vsllwil.w.h vr9, vr5, 0 + vsllwil.w.h vr10, vr16, 0 + vsllwil.w.h vr11, vr17, 0 + adst8x8_1d_lsx vr24, vr25, vr26, vr27 + + vexth.w.h vr18, vr0 + vexth.w.h vr19, vr1 + vexth.w.h vr6, vr2 + vexth.w.h vr7, vr3 + vexth.w.h vr8, vr4 + vexth.w.h vr9, vr5 + vexth.w.h vr10, vr16 + vexth.w.h vr11, vr17 + adst8x8_1d_lsx vr0, vr1, vr2, vr3 + + vreplgr2vr.h vr23, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr23, a2, \i +.endr + +.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3 + vsrari.h \i, \i, 1 +.endr + + LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \ + vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25, \ + vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17 + + vshuf4i.h vr14, vr14, 0x1b + vshuf4i.h vr15, vr15, 0x1b + vshuf4i.h vr24, vr24, 0x1b + vshuf4i.h vr25, vr25, 0x1b + + vsllwil.w.h vr18, vr4, 0 + vsllwil.w.h vr19, vr5, 0 + vsllwil.w.h vr6, vr12, 0 + vsllwil.w.h vr7, vr13, 0 + vexth.w.h vr8, vr4 + vexth.w.h vr9, vr5 + vexth.w.h vr10, vr12 + vexth.w.h vr11, vr13 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_8x4_core_lsx1 vr4, vr5, vr12, vr13 + + vshuf4i.d vr5, vr5, 0x01 + vshuf4i.d vr13, vr13, 0x01 + + vsllwil.w.h vr18, vr14, 0 + vsllwil.w.h vr19, vr15, 0 + vsllwil.w.h vr6, vr24, 0 + vsllwil.w.h vr7, vr25, 0 + vexth.w.h vr8, vr14 + vexth.w.h vr9, vr15 + vexth.w.h vr10, vr24 + vexth.w.h vr11, vr25 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_8x4_core_lsx1 vr14, vr15, vr24, vr25 + + vshuf4i.d vr15, vr15, 0x01 + vshuf4i.d vr25, vr25, 0x01 + + vilvl.d vr20, vr14, vr4 + vilvh.d vr21, vr14, vr4 + vilvl.d vr22, vr15, vr5 + vilvh.d vr23, vr15, vr5 + vilvl.d vr16, vr24, vr12 + vilvh.d vr17, vr24, vr12 + vilvl.d vr18, vr25, vr13 + vilvh.d vr19, vr25, vr13 + +.irp i, vr20, vr21, vr22, vr23, vr16, vr17, vr18, vr19 + vsrari.h \i, \i, 4 +.endr + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr20, vr21, vr22, vr23 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr16, vr17, vr18, vr19 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + addi.d sp, sp, 32 +endfunc + +function inv_txfm_add_dct_adst_8x8_8bpc_lsx + addi.d sp, sp, -48 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + + vld_x8 a2, 0, 16, vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + 
+ vsllwil.w.h vr18, vr4, 0 + vsllwil.w.h vr19, vr5, 0 + vsllwil.w.h vr6, vr12, 0 + vsllwil.w.h vr7, vr13, 0 + vsllwil.w.h vr8, vr14, 0 + vsllwil.w.h vr9, vr15, 0 + vsllwil.w.h vr10, vr24, 0 + vsllwil.w.h vr11, vr25, 0 + + dct_8x4_core_lsx1 vr26, vr27, vr28, vr29 + + vshuf4i.d vr27, vr27, 0x01 + vshuf4i.d vr29, vr29, 0x01 + + vilvl.h vr8, vr27, vr26 // 0 2 4 6 8 10 12 14 + vilvh.h vr9, vr27, vr26 // 1 3 5 7 9 11 13 15 + vilvl.h vr26, vr9, vr8 // 0 - 7 in0 + vilvh.h vr27, vr9, vr8 // 8 - 15 in1 + vilvl.h vr8, vr29, vr28 // 0 2 4 6 8 10 12 14 + vilvh.h vr9, vr29, vr28 // 1 3 5 7 9 11 13 15 + vilvl.h vr28, vr9, vr8 // 16 - 23 in2 + vilvh.h vr29, vr9, vr8 // 24 - 31 in3 + + vsrari.h vr26, vr26, 1 // in0low in1low + vsrari.h vr27, vr27, 1 // in2low in3low + vsrari.h vr28, vr28, 1 // in0high in1high + vsrari.h vr29, vr29, 1 // in2high in3high + + vexth.w.h vr18, vr4 + vexth.w.h vr19, vr5 + vexth.w.h vr6, vr12 + vexth.w.h vr7, vr13 + vexth.w.h vr8, vr14 + vexth.w.h vr9, vr15 + vexth.w.h vr10, vr24 + vexth.w.h vr11, vr25 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_8x4_core_lsx1 vr12, vr13, vr14, vr15 + + vshuf4i.d vr13, vr13, 0x01 + vshuf4i.d vr15, vr15, 0x01 + + vilvl.h vr8, vr13, vr12 // 0 2 4 6 8 10 12 14 + vilvh.h vr9, vr13, vr12 // 1 3 5 7 9 11 13 15 + vilvl.h vr12, vr9, vr8 // 0 - 7 in0 + vilvh.h vr13, vr9, vr8 // 8 - 15 in1 + vilvl.h vr8, vr15, vr14 // 0 2 4 6 8 10 12 14 + vilvh.h vr9, vr15, vr14 // 1 3 5 7 9 11 13 15 + vilvl.h vr14, vr9, vr8 // 16 - 23 in2 + vilvh.h vr15, vr9, vr8 // 24 - 31 in3 + + vsrari.h vr0, vr12, 1 // in4low in5low + vsrari.h vr1, vr13, 1 // in6low in7low + vsrari.h vr2, vr14, 1 // in4high in5high + vsrari.h vr3, vr15, 1 // in6high in7high + + vreplgr2vr.h vr23, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr23, a2, \i +.endr + + vsllwil.w.h vr18, vr26, 0 // in0 + vexth.w.h vr19, vr26 // in1 + vsllwil.w.h vr6, vr27, 0 // in2 + vexth.w.h vr7, vr27 // in3 + vsllwil.w.h vr8, vr0, 0 // in3 + vexth.w.h vr9, vr0 // in4 + vsllwil.w.h vr10, vr1, 0 // in5 + vexth.w.h vr11, vr1 // in6 + adst8x8_1d_lsx vr26, vr27, vr0, vr1 + + vsllwil.w.h vr18, vr28, 0 // in0 + vexth.w.h vr19, vr28 // in1 + vsllwil.w.h vr6, vr29, 0 // in2 + vexth.w.h vr7, vr29 // in3 + vsllwil.w.h vr8, vr2, 0 // in4 + vexth.w.h vr9, vr2 // in5 + vsllwil.w.h vr10, vr3, 0 // in6 + vexth.w.h vr11, vr3 // in7 + adst8x8_1d_lsx vr28, vr29, vr16, vr17 + + vilvl.d vr4, vr28, vr26 // 0 ... 7 + vilvl.d vr5, vr29, vr27 // 8 ... 15 + vilvl.d vr6, vr16, vr0 // 16 ... 23 + vilvl.d vr7, vr17, vr1 // 24 ... 31 + vilvh.d vr14, vr17, vr1 // 32 ... 39 + vilvh.d vr15, vr16, vr0 // 40 ... 47 + vilvh.d vr16, vr29, vr27 // 48 ... 55 + vilvh.d vr17, vr28, vr26 // 56 ... 
63 + +.irp i, vr4, vr5, vr6, vr7, vr14, vr15, vr16, vr17 + vsrari.h \i, \i, 4 +.endr + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + addi.d sp, sp, 48 +endfunc + +function inv_txfm_add_adst_adst_8x8_8bpc_lsx + addi.d sp, sp, -32 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + + vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17 + + vsllwil.w.h vr18, vr0, 0 + vsllwil.w.h vr19, vr1, 0 + vsllwil.w.h vr6, vr2, 0 + vsllwil.w.h vr7, vr3, 0 + vsllwil.w.h vr8, vr4, 0 + vsllwil.w.h vr9, vr5, 0 + vsllwil.w.h vr10, vr16, 0 + vsllwil.w.h vr11, vr17, 0 + adst8x8_1d_lsx vr24, vr25, vr26, vr27 + + vexth.w.h vr18, vr0 // in0 + vexth.w.h vr19, vr1 // in1 + vexth.w.h vr6, vr2 // in2 + vexth.w.h vr7, vr3 // in3 + vexth.w.h vr8, vr4 // in3 + vexth.w.h vr9, vr5 // in4 + vexth.w.h vr10, vr16 // in5 + vexth.w.h vr11, vr17 // in6 + adst8x8_1d_lsx vr0, vr1, vr2, vr3 + + vreplgr2vr.h vr23, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr23, a2, \i +.endr + +.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3 + vsrari.h \i, \i, 1 +.endr + + LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \ + vr14, vr15, vr12, vr13, vr4, vr5, vr24, vr25, \ + vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17 + + vshuf4i.h vr4, vr4, 0x1b + vshuf4i.h vr5, vr5, 0x1b + vshuf4i.h vr24, vr24, 0x1b + vshuf4i.h vr25, vr25, 0x1b + + vsllwil.w.h vr18, vr14, 0 + vsllwil.w.h vr19, vr15, 0 + vsllwil.w.h vr6, vr12, 0 + vsllwil.w.h vr7, vr13, 0 + vexth.w.h vr8, vr14 // in3 + vexth.w.h vr9, vr15 // in4 + vexth.w.h vr10, vr12 // in5 + vexth.w.h vr11, vr13 // in6 + + adst8x8_1d_lsx vr26, vr27, vr0, vr1 + + vsllwil.w.h vr18, vr4, 0 + vsllwil.w.h vr19, vr5, 0 + vsllwil.w.h vr6, vr24, 0 + vsllwil.w.h vr7, vr25, 0 + vexth.w.h vr8, vr4 // in3 + vexth.w.h vr9, vr5 // in4 + vexth.w.h vr10, vr24 // in5 + vexth.w.h vr11, vr25 // in6 + + adst8x8_1d_lsx vr24, vr25, vr16, vr17 + + vilvl.d vr4, vr24, vr26 // 0 ... 7 + vilvl.d vr5, vr25, vr27 // 8 ... 15 + vilvl.d vr6, vr16, vr0 // 16 ... 23 + vilvl.d vr7, vr17, vr1 // 24 ... 31 + vilvh.d vr14, vr17, vr1 // 32 ... 39 + vilvh.d vr15, vr16, vr0 // 40 ... 47 + vilvh.d vr16, vr25, vr27 // 48 ... 55 + vilvh.d vr17, vr24, vr26 // 56 ... 
63 + +.irp i, vr4, vr5, vr6, vr7, vr14, vr15, vr16, vr17 + vsrari.h \i, \i, 4 +.endr + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + addi.d sp, sp, 32 +endfunc + +function inv_txfm_add_flipadst_adst_8x8_8bpc_lsx + addi.d sp, sp, -32 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + + vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17 + + vsllwil.w.h vr18, vr0, 0 + vsllwil.w.h vr19, vr1, 0 + vsllwil.w.h vr6, vr2, 0 + vsllwil.w.h vr7, vr3, 0 + vsllwil.w.h vr8, vr4, 0 + vsllwil.w.h vr9, vr5, 0 + vsllwil.w.h vr10, vr16, 0 + vsllwil.w.h vr11, vr17, 0 + adst8x8_1d_lsx vr12, vr13, vr14, vr15 + + vilvl.h vr20, vr12, vr13 + vilvl.h vr21, vr14, vr15 + vilvl.w vr24, vr20, vr21 + vilvh.w vr25, vr20, vr21 + vilvh.h vr20, vr12, vr13 + vilvh.h vr21, vr14, vr15 + vilvl.w vr26, vr20, vr21 + vilvh.w vr27, vr20, vr21 + vshuf4i.h vr26, vr26, 0x1b + vshuf4i.h vr27, vr27, 0x1b + + vexth.w.h vr18, vr0 + vexth.w.h vr19, vr1 + vexth.w.h vr6, vr2 + vexth.w.h vr7, vr3 + vexth.w.h vr8, vr4 + vexth.w.h vr9, vr5 + vexth.w.h vr10, vr16 + vexth.w.h vr11, vr17 + adst8x8_1d_lsx vr12, vr13, vr14, vr15 + + vilvl.h vr20, vr12, vr13 + vilvl.h vr21, vr14, vr15 + vilvl.w vr0, vr20, vr21 + vilvh.w vr1, vr20, vr21 + vilvh.h vr20, vr12, vr13 + vilvh.h vr21, vr14, vr15 + vilvl.w vr2, vr20, vr21 + vilvh.w vr3, vr20, vr21 + vshuf4i.h vr2, vr2, 0x1b + vshuf4i.h vr3, vr3, 0x1b + + vreplgr2vr.h vr23, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr23, a2, \i +.endr + +.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3 + vsrari.h \i, \i, 1 +.endr + + vsllwil.w.h vr18, vr26, 0 // in0 + vexth.w.h vr19, vr26 // in1 + vsllwil.w.h vr6, vr27, 0 // in2 + vexth.w.h vr7, vr27 // in3 + vsllwil.w.h vr8, vr2, 0 // in4 + vexth.w.h vr9, vr2 // in5 + vsllwil.w.h vr10, vr3, 0 // in6 + vexth.w.h vr11, vr3 // in7 + adst8x8_1d_lsx vr4, vr5, vr16, vr17 + + vsllwil.w.h vr18, vr24, 0 // in0 + vexth.w.h vr19, vr24 // in1 + vsllwil.w.h vr6, vr25, 0 // in2 + vexth.w.h vr7, vr25 // in3 + vsllwil.w.h vr8, vr0, 0 // in4 + vexth.w.h vr9, vr0 // in5 + vsllwil.w.h vr10, vr1, 0 // in6 + vexth.w.h vr11, vr1 // in7 + adst8x8_1d_lsx vr0, vr1, vr2, vr3 + + vilvl.d vr20, vr0, vr4 // 0 ... 7 + vilvl.d vr21, vr1, vr5 // 8 ... 15 + vilvl.d vr22, vr2, vr16 // 16 ... 23 + vilvl.d vr23, vr3, vr17 // 24 ... 31 + vilvh.d vr14, vr3, vr17 // 32 ... 39 + vilvh.d vr15, vr2, vr16 // 40 ... 47 + vilvh.d vr16, vr1, vr5 // 48 ... 55 + vilvh.d vr17, vr0, vr4 // 56 ... 
63 + +.irp i, vr20, vr21, vr22, vr23, vr14, vr15, vr16, vr17 + vsrari.h \i, \i, 4 +.endr + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr20, vr21, vr22, vr23 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + addi.d sp, sp, 32 +endfunc + +function inv_txfm_add_adst_flipadst_8x8_8bpc_lsx + addi.d sp, sp, -32 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + + vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17 + + vsllwil.w.h vr18, vr0, 0 + vsllwil.w.h vr19, vr1, 0 + vsllwil.w.h vr6, vr2, 0 + vsllwil.w.h vr7, vr3, 0 + vsllwil.w.h vr8, vr4, 0 + vsllwil.w.h vr9, vr5, 0 + vsllwil.w.h vr10, vr16, 0 + vsllwil.w.h vr11, vr17, 0 + adst8x8_1d_lsx vr24, vr25, vr26, vr27 + + vexth.w.h vr18, vr0 + vexth.w.h vr19, vr1 + vexth.w.h vr6, vr2 + vexth.w.h vr7, vr3 + vexth.w.h vr8, vr4 + vexth.w.h vr9, vr5 + vexth.w.h vr10, vr16 + vexth.w.h vr11, vr17 + adst8x8_1d_lsx vr0, vr1, vr2, vr3 + + vreplgr2vr.h vr23, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr23, a2, \i +.endr + +.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3 + vsrari.h \i, \i, 1 +.endr + + LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \ + vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \ + vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17 + + vshuf4i.h vr0, vr0, 0x1b + vshuf4i.h vr1, vr1, 0x1b + vshuf4i.h vr2, vr2, 0x1b + vshuf4i.h vr3, vr3, 0x1b + + vsllwil.w.h vr18, vr0, 0 // in0 + vsllwil.w.h vr19, vr1, 0 // in1 + vsllwil.w.h vr6, vr2, 0 // in2 + vsllwil.w.h vr7, vr3, 0 // in3 + vexth.w.h vr8, vr0 // in4 + vexth.w.h vr9, vr1 // in5 + vexth.w.h vr10, vr2 // in6 + vexth.w.h vr11, vr3 // in7 + adst8x8_1d_lsx vr4, vr5, vr16, vr17 + + vsllwil.w.h vr18, vr24, 0 // in0 + vsllwil.w.h vr19, vr25, 0 // in1 + vsllwil.w.h vr6, vr26, 0 // in2 + vsllwil.w.h vr7, vr27, 0 // in3 + vexth.w.h vr8, vr24 // in4 + vexth.w.h vr9, vr25 // in5 + vexth.w.h vr10, vr26 // in6 + vexth.w.h vr11, vr27 // in7 + adst8x8_1d_lsx vr0, vr1, vr2, vr3 + + vilvh.d vr20, vr4, vr0 + vilvh.d vr21, vr5, vr1 + vilvh.d vr22, vr16, vr2 + vilvh.d vr23, vr17, vr3 + vilvl.d vr14, vr17, vr3 + vilvl.d vr15, vr16, vr2 + vilvl.d vr18, vr5, vr1 + vilvl.d vr19, vr4, vr0 + +.irp i, vr20, vr21, vr22, vr23, vr14, vr15, vr18, vr19 + vsrari.h \i, \i, 4 +.endr + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr20, vr21, vr22, vr23 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr14, vr15, vr18, vr19 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + addi.d sp, sp, 32 +endfunc + +function inv_txfm_add_flipadst_dct_8x8_8bpc_lsx + addi.d sp, sp, -32 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + + vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17 + + vsllwil.w.h vr18, vr0, 0 + vsllwil.w.h vr19, vr1, 0 + vsllwil.w.h vr6, vr2, 0 + vsllwil.w.h vr7, vr3, 0 + vsllwil.w.h vr8, vr4, 0 + vsllwil.w.h vr9, vr5, 0 + vsllwil.w.h vr10, vr16, 0 + vsllwil.w.h vr11, vr17, 0 + adst8x8_1d_lsx vr12, vr13, vr14, vr15 + + vilvl.h vr20, vr12, vr13 + vilvl.h vr21, vr14, vr15 + vilvl.w vr24, vr20, vr21 + vilvh.w vr25, vr20, vr21 + vilvh.h vr20, vr12, vr13 + vilvh.h vr21, vr14, vr15 + vilvl.w vr26, vr20, vr21 + vilvh.w vr27, vr20, vr21 + + vexth.w.h vr18, vr0 + vexth.w.h vr19, vr1 + vexth.w.h vr6, vr2 + vexth.w.h vr7, vr3 + vexth.w.h vr8, vr4 + vexth.w.h vr9, vr5 + vexth.w.h vr10, vr16 + vexth.w.h vr11, vr17 + adst8x8_1d_lsx vr12, vr13, vr14, vr15 + + vilvl.h vr20, vr12, vr13 
+ vilvl.h vr21, vr14, vr15 + vilvl.w vr0, vr20, vr21 + vilvh.w vr1, vr20, vr21 + vilvh.h vr20, vr12, vr13 + vilvh.h vr21, vr14, vr15 + vilvl.w vr2, vr20, vr21 + vilvh.w vr3, vr20, vr21 + + vreplgr2vr.h vr23, zero + +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr23, a2, \i +.endr + + vsrari.h vr24, vr24, 1 + vsrari.h vr25, vr25, 1 + vsrari.h vr26, vr26, 1 + vsrari.h vr27, vr27, 1 + vsrari.h vr14, vr0, 1 + vsrari.h vr15, vr1, 1 + vsrari.h vr16, vr2, 1 + vsrari.h vr17, vr3, 1 + + vsllwil.w.h vr18, vr26, 0 + vexth.w.h vr19, vr26 + vsllwil.w.h vr6, vr27, 0 + vexth.w.h vr7, vr27 + vsllwil.w.h vr8, vr16, 0 + vexth.w.h vr9, vr16 + vsllwil.w.h vr10, vr17, 0 + vexth.w.h vr11, vr17 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_8x4_core_lsx1 vr26, vr27, vr16, vr17 + + vshuf4i.h vr26, vr26, 0x1b + vshuf4i.h vr27, vr27, 0x1b + vshuf4i.h vr16, vr16, 0x1b + vshuf4i.h vr17, vr17, 0x1b + + vsllwil.w.h vr18, vr24, 0 + vexth.w.h vr19, vr24 + vsllwil.w.h vr6, vr25, 0 + vexth.w.h vr7, vr25 + vsllwil.w.h vr8, vr14, 0 + vexth.w.h vr9, vr14 + vsllwil.w.h vr10, vr15, 0 + vexth.w.h vr11, vr15 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_8x4_core_lsx1 vr24, vr25, vr14, vr15 + + vilvl.d vr4, vr24, vr26 + vilvh.d vr5, vr24, vr26 + vilvh.d vr6, vr25, vr27 + vilvl.d vr7, vr25, vr27 + vilvl.d vr24, vr14, vr16 + vilvh.d vr25, vr14, vr16 + vilvh.d vr26, vr15, vr17 + vilvl.d vr27, vr15, vr17 + +.irp i, vr4, vr5, vr6, vr7, vr24, vr25, vr26, vr27 + vsrari.h \i, \i, 4 +.endr + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr24, vr25, vr26, vr27 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + addi.d sp, sp, 32 +endfunc + +function inv_txfm_add_dct_flipadst_8x8_8bpc_lsx + addi.d sp, sp, -48 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + + vld_x8 a2, 0, 16, vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + vsllwil.w.h vr18, vr4, 0 + vsllwil.w.h vr19, vr5, 0 + vsllwil.w.h vr6, vr12, 0 + vsllwil.w.h vr7, vr13, 0 + vsllwil.w.h vr8, vr14, 0 + vsllwil.w.h vr9, vr15, 0 + vsllwil.w.h vr10, vr24, 0 + vsllwil.w.h vr11, vr25, 0 + dct_8x4_core_lsx1 vr26, vr27, vr28, vr29 + vshuf4i.d vr27, vr27, 0x01 + vshuf4i.d vr29, vr29, 0x01 + + vilvl.h vr8, vr27, vr26 + vilvh.h vr9, vr27, vr26 + vilvl.h vr26, vr9, vr8 + vilvh.h vr27, vr9, vr8 + vilvl.h vr8, vr29, vr28 + vilvh.h vr9, vr29, vr28 + vilvl.h vr28, vr9, vr8 + vilvh.h vr29, vr9, vr8 + + vsrari.h vr26, vr26, 1 // in0low in1low + vsrari.h vr27, vr27, 1 // in2low in3low + vsrari.h vr28, vr28, 1 // in0high in1high + vsrari.h vr29, vr29, 1 // in2high in3high + + vexth.w.h vr18, vr4 + vexth.w.h vr19, vr5 + vexth.w.h vr6, vr12 + vexth.w.h vr7, vr13 + vexth.w.h vr8, vr14 + vexth.w.h vr9, vr15 + vexth.w.h vr10, vr24 + vexth.w.h vr11, vr25 + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + dct_8x4_core_lsx1 vr12, vr13, vr14, vr15 + vshuf4i.d vr13, vr13, 0x01 + vshuf4i.d vr15, vr15, 0x01 + + vilvl.h vr8, vr13, vr12 + vilvh.h vr9, vr13, vr12 + vilvl.h vr12, vr9, vr8 + vilvh.h vr13, vr9, vr8 + vilvl.h vr8, vr15, vr14 + vilvh.h vr9, vr15, vr14 + 
vilvl.h vr14, vr9, vr8 + vilvh.h vr15, vr9, vr8 + + vsrari.h vr0, vr12, 1 + vsrari.h vr1, vr13, 1 + vsrari.h vr2, vr14, 1 + vsrari.h vr3, vr15, 1 + + vreplgr2vr.h vr23, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr23, a2, \i +.endr + + vsllwil.w.h vr18, vr28, 0 // in0 + vexth.w.h vr19, vr28 // in1 + vsllwil.w.h vr6, vr29, 0 // in2 + vexth.w.h vr7, vr29 // in3 + vsllwil.w.h vr8, vr2, 0 // in4 + vexth.w.h vr9, vr2 // in5 + vsllwil.w.h vr10, vr3, 0 // in6 + vexth.w.h vr11, vr3 // in7 + adst8x8_1d_lsx vr4, vr5, vr16, vr17 + + vsllwil.w.h vr18, vr26, 0 // in0 + vexth.w.h vr19, vr26 // in1 + vsllwil.w.h vr6, vr27, 0 // in2 + vexth.w.h vr7, vr27 // in3 + vsllwil.w.h vr8, vr0, 0 // in4 + vexth.w.h vr9, vr0 // in5 + vsllwil.w.h vr10, vr1, 0 // in6 + vexth.w.h vr11, vr1 // in7 + adst8x8_1d_lsx vr0, vr1, vr2, vr3 + + vilvh.d vr26, vr4, vr0 + vilvh.d vr27, vr5, vr1 + vilvh.d vr28, vr16, vr2 + vilvh.d vr29, vr17, vr3 + vilvl.d vr20, vr17, vr3 + vilvl.d vr21, vr16, vr2 + vilvl.d vr22, vr5, vr1 + vilvl.d vr23, vr4, vr0 + +.irp i, vr26, vr27, vr28, vr29, vr20, vr21, vr22, vr23 + vsrari.h \i, \i, 4 +.endr + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr26, vr27, vr28, vr29 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr20, vr21, vr22, vr23 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + addi.d sp, sp, 48 +endfunc + +function inv_txfm_add_flipadst_flipadst_8x8_8bpc_lsx + addi.d sp, sp, -32 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + + vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17 + + vsllwil.w.h vr18, vr0, 0 + vsllwil.w.h vr19, vr1, 0 + vsllwil.w.h vr6, vr2, 0 + vsllwil.w.h vr7, vr3, 0 + vsllwil.w.h vr8, vr4, 0 + vsllwil.w.h vr9, vr5, 0 + vsllwil.w.h vr10, vr16, 0 + vsllwil.w.h vr11, vr17, 0 + adst8x8_1d_lsx vr12, vr13, vr14, vr15 + + vilvl.h vr20, vr12, vr13 + vilvl.h vr21, vr14, vr15 + vilvl.w vr24, vr20, vr21 + vilvh.w vr25, vr20, vr21 + vilvh.h vr20, vr12, vr13 + vilvh.h vr21, vr14, vr15 + vilvl.w vr26, vr20, vr21 + vilvh.w vr27, vr20, vr21 + vshuf4i.h vr26, vr26, 0x1b + vshuf4i.h vr27, vr27, 0x1b + + vexth.w.h vr18, vr0 + vexth.w.h vr19, vr1 + vexth.w.h vr6, vr2 + vexth.w.h vr7, vr3 + vexth.w.h vr8, vr4 + vexth.w.h vr9, vr5 + vexth.w.h vr10, vr16 + vexth.w.h vr11, vr17 + adst8x8_1d_lsx vr12, vr13, vr14, vr15 + + vilvl.h vr20, vr12, vr13 + vilvl.h vr21, vr14, vr15 + vilvl.w vr0, vr20, vr21 + vilvh.w vr1, vr20, vr21 + vilvh.h vr20, vr12, vr13 + vilvh.h vr21, vr14, vr15 + vilvl.w vr2, vr20, vr21 + vilvh.w vr3, vr20, vr21 + vshuf4i.h vr2, vr2, 0x1b + vshuf4i.h vr3, vr3, 0x1b + +.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3 + vsrari.h \i, \i, 1 +.endr + + vreplgr2vr.h vr23, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr23, a2, \i +.endr + + vsllwil.w.h vr18, vr26, 0 // in0 + vexth.w.h vr19, vr26 // in1 + vsllwil.w.h vr6, vr27, 0 // in2 + vexth.w.h vr7, vr27 // in3 + vsllwil.w.h vr8, vr2, 0 // in4 + vexth.w.h vr9, vr2 // in5 + vsllwil.w.h vr10, vr3, 0 // in6 + vexth.w.h vr11, vr3 // in7 + adst8x8_1d_lsx vr4, vr5, vr16, vr17 + + vsllwil.w.h vr18, vr24, 0 // in0 + vexth.w.h vr19, vr24 // in1 + vsllwil.w.h vr6, vr25, 0 // in2 + vexth.w.h vr7, vr25 // in3 + vsllwil.w.h vr8, vr0, 0 // in4 + vexth.w.h vr9, vr0 // in5 + vsllwil.w.h vr10, vr1, 0 // in6 + vexth.w.h vr11, vr1 // in7 + adst8x8_1d_lsx vr0, vr1, vr2, vr3 + + vilvh.d vr24, vr0, vr4 + vilvh.d vr25, vr1, vr5 + vilvh.d vr26, vr2, vr16 + vilvh.d vr27, vr3, vr17 + vilvl.d vr20, vr3, vr17 + vilvl.d 
vr21, vr2, vr16 + vilvl.d vr22, vr1, vr5 + vilvl.d vr23, vr0, vr4 + +.irp i, vr24, vr25, vr26, vr27, vr20, vr21, vr22, vr23 + vsrari.h \i, \i, 4 +.endr + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr24, vr25, vr26, vr27 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr20, vr21, vr22, vr23 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + addi.d sp, sp, 32 +endfunc + +function inv_txfm_add_dct_identity_8x8_8bpc_lsx + addi.d sp, sp, -48 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + + vld_x8 a2, 0, 16, vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + vsllwil.w.h vr18, vr4, 0 + vsllwil.w.h vr19, vr5, 0 + vsllwil.w.h vr6, vr12, 0 + vsllwil.w.h vr7, vr13, 0 + vsllwil.w.h vr8, vr14, 0 + vsllwil.w.h vr9, vr15, 0 + vsllwil.w.h vr10, vr24, 0 + vsllwil.w.h vr11, vr25, 0 + dct_8x4_core_lsx1 vr26, vr27, vr28, vr29 + vshuf4i.d vr27, vr27, 0x01 + vshuf4i.d vr29, vr29, 0x01 + + vilvl.h vr8, vr27, vr26 + vilvh.h vr9, vr27, vr26 + vilvl.h vr26, vr9, vr8 + vilvh.h vr27, vr9, vr8 + vilvl.h vr8, vr29, vr28 + vilvh.h vr9, vr29, vr28 + vilvl.h vr28, vr9, vr8 + vilvh.h vr29, vr9, vr8 + + vsrari.h vr26, vr26, 1 // in0low in1low + vsrari.h vr27, vr27, 1 // in2low in3low + vsrari.h vr28, vr28, 1 // in0high in1high + vsrari.h vr29, vr29, 1 // in2high in3high + + vexth.w.h vr18, vr4 + vexth.w.h vr19, vr5 + vexth.w.h vr6, vr12 + vexth.w.h vr7, vr13 + vexth.w.h vr8, vr14 + vexth.w.h vr9, vr15 + vexth.w.h vr10, vr24 + vexth.w.h vr11, vr25 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_8x4_core_lsx1 vr12, vr13, vr14, vr15 + + vshuf4i.d vr13, vr13, 0x01 + vshuf4i.d vr15, vr15, 0x01 + + vilvl.h vr8, vr13, vr12 + vilvh.h vr9, vr13, vr12 + vilvl.h vr12, vr9, vr8 + vilvh.h vr13, vr9, vr8 + vilvl.h vr8, vr15, vr14 + vilvh.h vr9, vr15, vr14 + vilvl.h vr14, vr9, vr8 + vilvh.h vr15, vr9, vr8 + + vsrari.h vr20, vr12, 1 + vsrari.h vr21, vr13, 1 + vsrari.h vr22, vr14, 1 + vsrari.h vr23, vr15, 1 + + vreplgr2vr.h vr19, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr19, a2, \i +.endr + // identity8 + vsllwil.w.h vr10, vr26, 1 + vsllwil.w.h vr11, vr27, 1 + vsllwil.w.h vr16, vr28, 1 + vsllwil.w.h vr17, vr29, 1 + vsllwil.w.h vr6, vr20, 1 + vsllwil.w.h vr7, vr21, 1 + vsllwil.w.h vr18, vr22, 1 + vsllwil.w.h vr19, vr23, 1 + +.irp i, vr26, vr27, vr28, vr29, vr20, vr21, vr22, vr23 + vexth.w.h \i, \i +.endr + +.irp i, vr26, vr27, vr28, vr29, vr20, vr21, vr22, vr23 + vslli.w \i, \i, 1 +.endr + + vssrarni.h.w vr16, vr10, 4 // in0 + vssrarni.h.w vr28, vr26, 4 // in1 + vssrarni.h.w vr17, vr11, 4 // in2 + vssrarni.h.w vr29, vr27, 4 // in3 + vssrarni.h.w vr18, vr6, 4 // in4 + vssrarni.h.w vr22, vr20, 4 // in5 + vssrarni.h.w vr19, vr7, 4 // in6 + vssrarni.h.w vr23, vr21, 4 // in7 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr16, vr28, vr17, vr29 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr18, vr22, vr19, vr23 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + addi.d sp, sp, 48 +endfunc + +function inv_txfm_add_identity_dct_8x8_8bpc_lsx + addi.d sp, sp, -48 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + + vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, 
vr5, vr24, vr25 + + // identity8 + vsllwil.w.h vr6, vr0, 1 + vsllwil.w.h vr7, vr1, 1 + vsllwil.w.h vr8, vr2, 1 + vsllwil.w.h vr9, vr3, 1 + vsllwil.w.h vr10, vr4, 1 + vsllwil.w.h vr11, vr5, 1 + vsllwil.w.h vr12, vr24, 1 + vsllwil.w.h vr13, vr25, 1 + +.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25 + vexth.w.h \i, \i +.endr + +.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25 + vslli.w \i, \i, 1 +.endr + vssrarni.h.w vr0, vr6, 1 // in0 + vssrarni.h.w vr1, vr7, 1 // in1 + vssrarni.h.w vr2, vr8, 1 // in2 + vssrarni.h.w vr3, vr9, 1 // in3 + vssrarni.h.w vr4, vr10, 1 // in4 + vssrarni.h.w vr5, vr11, 1 // in5 + vssrarni.h.w vr24, vr12, 1 // in6 + vssrarni.h.w vr25, vr13, 1 // in7 + + vreplgr2vr.h vr23, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr23, a2, \i +.endr + + LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \ + vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25, \ + vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + // dct4 in0 in2 in4 in6 + vsllwil.w.h vr18, vr4, 0 + vsllwil.w.h vr19, vr5, 0 + vsllwil.w.h vr6, vr12, 0 + vsllwil.w.h vr7, vr13, 0 + vsllwil.w.h vr8, vr14, 0 + vsllwil.w.h vr9, vr15, 0 + vsllwil.w.h vr10, vr24, 0 + vsllwil.w.h vr11, vr25, 0 + dct_8x4_core_lsx1 vr16, vr17, vr26, vr27 + + vexth.w.h vr18, vr4 + vexth.w.h vr19, vr5 + vexth.w.h vr6, vr12 + vexth.w.h vr7, vr13 + vexth.w.h vr8, vr14 + vexth.w.h vr9, vr15 + vexth.w.h vr10, vr24 + vexth.w.h vr11, vr25 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + dct_8x4_core_lsx1 vr4, vr5, vr24, vr25 + + vilvl.d vr8, vr4, vr16 + vilvh.d vr9, vr4, vr16 + vilvh.d vr6, vr5, vr17 + vilvl.d vr7, vr5, vr17 + vilvl.d vr16, vr24, vr26 + vilvh.d vr17, vr24, vr26 + vilvh.d vr18, vr25, vr27 + vilvl.d vr19, vr25, vr27 + +.irp i, vr8, vr9, vr6, vr7, vr16, vr17, vr18, vr19 + vsrari.h \i, \i, 4 +.endr + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr8, vr9, vr6, vr7 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr16, vr17, vr18, vr19 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + addi.d sp, sp, 48 +endfunc + +function inv_txfm_add_flipadst_identity_8x8_8bpc_lsx + addi.d sp, sp, -32 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + + vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17 + + vsllwil.w.h vr18, vr0, 0 + vsllwil.w.h vr19, vr1, 0 + vsllwil.w.h vr6, vr2, 0 + vsllwil.w.h vr7, vr3, 0 + vsllwil.w.h vr8, vr4, 0 + vsllwil.w.h vr9, vr5, 0 + vsllwil.w.h vr10, vr16, 0 + vsllwil.w.h vr11, vr17, 0 + adst8x8_1d_lsx vr12, vr13, vr14, vr15 + + vilvl.h vr20, vr12, vr13 + vilvl.h vr21, vr14, vr15 + vilvl.w vr24, vr20, vr21 + vilvh.w vr25, vr20, vr21 + vilvh.h vr20, vr12, vr13 + vilvh.h vr21, vr14, vr15 + vilvl.w vr26, vr20, vr21 + vilvh.w vr27, vr20, vr21 + vshuf4i.h vr26, vr26, 0x1b + vshuf4i.h vr27, vr27, 0x1b + + vexth.w.h vr18, vr0 // in0 + vexth.w.h vr19, vr1 // in1 + vexth.w.h vr6, vr2 // in2 + vexth.w.h vr7, vr3 // in3 + vexth.w.h vr8, vr4 // in3 + vexth.w.h vr9, vr5 // in4 + vexth.w.h vr10, vr16 // in5 + vexth.w.h vr11, vr17 // in6 + adst8x8_1d_lsx vr12, vr13, vr14, vr15 + + vilvl.h vr20, vr12, vr13 + vilvl.h vr21, vr14, vr15 + vilvl.w vr16, vr20, vr21 + vilvh.w vr17, vr20, vr21 + vilvh.h vr20, vr12, vr13 + vilvh.h vr21, vr14, vr15 + vilvl.w vr18, vr20, vr21 + vilvh.w vr19, vr20, vr21 + vshuf4i.h vr18, 
vr18, 0x1b + vshuf4i.h vr19, vr19, 0x1b + + vreplgr2vr.h vr23, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr23, a2, \i +.endr + +.irp i, vr24, vr25, vr26, vr27, vr16, vr17, vr18, vr19 + vsrari.h \i, \i, 1 +.endr + + // identity8 + vsllwil.w.h vr20, vr24, 1 + vsllwil.w.h vr21, vr25, 1 + vsllwil.w.h vr12, vr26, 1 + vsllwil.w.h vr13, vr27, 1 + vsllwil.w.h vr22, vr16, 1 + vsllwil.w.h vr23, vr17, 1 + vsllwil.w.h vr14, vr18, 1 + vsllwil.w.h vr15, vr19, 1 + +.irp i, vr24, vr25, vr26, vr27, vr16, vr17, vr18, vr19 + vexth.w.h \i, \i +.endr + +.irp i, vr24, vr25, vr26, vr27, vr16, vr17, vr18, vr19 + vslli.w \i, \i, 1 +.endr + + vssrarni.h.w vr20, vr12, 4 // in0 + vssrarni.h.w vr24, vr26, 4 // in1 + vssrarni.h.w vr21, vr13, 4 // in2 + vssrarni.h.w vr25, vr27, 4 // in3 + vssrarni.h.w vr22, vr14, 4 // in4 + vssrarni.h.w vr16, vr18, 4 // in5 + vssrarni.h.w vr23, vr15, 4 // in6 + vssrarni.h.w vr17, vr19, 4 // in7 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr20, vr24, vr21, vr25 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr22, vr16, vr23, vr17 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + addi.d sp, sp, 32 +endfunc + +function inv_txfm_add_identity_flipadst_8x8_8bpc_lsx + addi.d sp, sp, -48 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + + vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25 + + // identity8 + vsllwil.w.h vr6, vr0, 1 + vsllwil.w.h vr7, vr1, 1 + vsllwil.w.h vr8, vr2, 1 + vsllwil.w.h vr9, vr3, 1 + vsllwil.w.h vr10, vr4, 1 + vsllwil.w.h vr11, vr5, 1 + vsllwil.w.h vr12, vr24, 1 + vsllwil.w.h vr13, vr25, 1 + +.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25 + vexth.w.h \i, \i +.endr + +.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25 + vslli.w \i, \i, 1 +.endr + + vssrarni.h.w vr0, vr6, 1 // in0 + vssrarni.h.w vr1, vr7, 1 // in1 + vssrarni.h.w vr2, vr8, 1 // in2 + vssrarni.h.w vr3, vr9, 1 // in3 + vssrarni.h.w vr4, vr10, 1 // in4 + vssrarni.h.w vr5, vr11, 1 // in5 + vssrarni.h.w vr24, vr12, 1 // in6 + vssrarni.h.w vr25, vr13, 1 // in7 + + LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \ + vr6, vr7, vr8, vr9, vr10, vr11, vr12, vr13 + + vreplgr2vr.h vr23, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr23, a2, \i +.endr + + vsllwil.w.h vr18, vr0, 0 // in0 + vsllwil.w.h vr19, vr1, 0 // in1 + vsllwil.w.h vr6, vr2, 0 // in2 + vsllwil.w.h vr7, vr3, 0 // in3 + vsllwil.w.h vr8, vr4, 0 // in3 + vsllwil.w.h vr9, vr5, 0 // in4 + vsllwil.w.h vr10, vr24, 0 // in5 + vsllwil.w.h vr11, vr25, 0 // in6 + adst8x8_1d_lsx vr26, vr27, vr28, vr29 + + vexth.w.h vr18, vr0 // in0 + vexth.w.h vr19, vr1 // in1 + vexth.w.h vr6, vr2 // in2 + vexth.w.h vr7, vr3 // in3 + vexth.w.h vr8, vr4 // in3 + vexth.w.h vr9, vr5 // in4 + vexth.w.h vr10, vr24 // in5 + vexth.w.h vr11, vr25 // in6 + adst8x8_1d_lsx vr0, vr1, vr2, vr3 + + vilvh.d vr4, vr0, vr26 + vilvh.d vr5, vr1, vr27 + vilvh.d vr6, vr2, vr28 + vilvh.d vr7, vr3, vr29 + vilvl.d vr14, vr3, vr29 + vilvl.d vr15, vr2, vr28 + vilvl.d vr16, vr1, vr27 + vilvl.d vr17, vr0, vr26 + +.irp i, vr4, vr5, vr6, vr7, vr14, vr15, vr16, vr17 + vsrari.h \i, \i, 4 +.endr + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + addi.d sp, sp, 48 + +endfunc + +function 
inv_txfm_add_adst_identity_8x8_8bpc_lsx + addi.d sp, sp, -32 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + + vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17 + + vsllwil.w.h vr18, vr0, 0 + vsllwil.w.h vr19, vr1, 0 + vsllwil.w.h vr6, vr2, 0 + vsllwil.w.h vr7, vr3, 0 + vsllwil.w.h vr8, vr4, 0 + vsllwil.w.h vr9, vr5, 0 + vsllwil.w.h vr10, vr16, 0 + vsllwil.w.h vr11, vr17, 0 + adst8x8_1d_lsx vr24, vr25, vr26, vr27 + + vexth.w.h vr18, vr0 + vexth.w.h vr19, vr1 + vexth.w.h vr6, vr2 + vexth.w.h vr7, vr3 + vexth.w.h vr8, vr4 + vexth.w.h vr9, vr5 + vexth.w.h vr10, vr16 + vexth.w.h vr11, vr17 + adst8x8_1d_lsx vr0, vr1, vr2, vr3 + + vreplgr2vr.h vr23, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr23, a2, \i +.endr + +.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3 + vsrari.h \i, \i, 1 +.endr + + LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \ + vr24, vr25, vr20, vr21, vr26, vr27, vr22, vr23, \ + vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17 + + vshuf4i.h vr26, vr26, 0x1b + vshuf4i.h vr27, vr27, 0x1b + vshuf4i.h vr22, vr22, 0x1b + vshuf4i.h vr23, vr23, 0x1b + + // identity8 + vsllwil.w.h vr16, vr24, 1 + vsllwil.w.h vr17, vr25, 1 + vsllwil.w.h vr10, vr20, 1 + vsllwil.w.h vr11, vr21, 1 + vsllwil.w.h vr18, vr26, 1 + vsllwil.w.h vr19, vr27, 1 + vsllwil.w.h vr14, vr22, 1 + vsllwil.w.h vr15, vr23, 1 + +.irp i, vr24, vr25, vr20, vr21, vr26, vr27, vr22, vr23 + vexth.w.h \i, \i +.endr + +.irp i, vr24, vr25, vr20, vr21, vr26, vr27, vr22, vr23 + vslli.w \i, \i, 1 +.endr + + vssrarni.h.w vr18, vr16, 4 // in0 + vssrarni.h.w vr19, vr17, 4 // in1 + vssrarni.h.w vr14, vr10, 4 // in2 + vssrarni.h.w vr15, vr11, 4 // in3 + vssrarni.h.w vr26, vr24, 4 // in4 + vssrarni.h.w vr27, vr25, 4 // in5 + vssrarni.h.w vr22, vr20, 4 // in6 + vssrarni.h.w vr23, vr21, 4 // in7 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr18, vr19, vr14, vr15 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr26, vr27, vr22, vr23 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + addi.d sp, sp, 32 +endfunc + +function inv_txfm_add_identity_adst_8x8_8bpc_lsx + addi.d sp, sp, -48 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + + vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25 + + // identity8 + vsllwil.w.h vr6, vr0, 1 + vsllwil.w.h vr7, vr1, 1 + vsllwil.w.h vr8, vr2, 1 + vsllwil.w.h vr9, vr3, 1 + vsllwil.w.h vr10, vr4, 1 + vsllwil.w.h vr11, vr5, 1 + vsllwil.w.h vr12, vr24, 1 + vsllwil.w.h vr13, vr25, 1 + +.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25 + vexth.w.h \i, \i +.endr + +.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25 + vslli.w \i, \i, 1 +.endr + + vssrarni.h.w vr0, vr6, 1 // in0 + vssrarni.h.w vr1, vr7, 1 // in1 + vssrarni.h.w vr2, vr8, 1 // in2 + vssrarni.h.w vr3, vr9, 1 // in3 + vssrarni.h.w vr4, vr10, 1 // in4 + vssrarni.h.w vr5, vr11, 1 // in5 + vssrarni.h.w vr24, vr12, 1 // in6 + vssrarni.h.w vr25, vr13, 1 // in7 + + LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \ + vr6, vr7, vr8, vr9, vr10, vr11, vr12, vr13 + + vreplgr2vr.h vr23, zero + +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr23, a2, \i +.endr + + vsllwil.w.h vr18, vr0, 0 + vsllwil.w.h vr19, vr1, 0 + vsllwil.w.h vr6, vr2, 0 + vsllwil.w.h vr7, vr3, 0 + vsllwil.w.h vr8, vr4, 0 + vsllwil.w.h vr9, vr5, 0 + vsllwil.w.h vr10, vr24, 0 + vsllwil.w.h vr11, vr25, 0 + adst8x8_1d_lsx vr26, vr27, vr28, vr29 + + vexth.w.h vr18, vr0 + 
vexth.w.h vr19, vr1 + vexth.w.h vr6, vr2 + vexth.w.h vr7, vr3 + vexth.w.h vr8, vr4 + vexth.w.h vr9, vr5 + vexth.w.h vr10, vr24 + vexth.w.h vr11, vr25 + + adst8x8_1d_lsx vr0, vr1, vr2, vr3 + + vilvl.d vr4, vr0, vr26 // 0 ... 7 + vilvl.d vr5, vr1, vr27 // 8 ... 15 + vilvl.d vr6, vr2, vr28 // 16 ... 23 + vilvl.d vr7, vr3, vr29 // 24 ... 31 + vilvh.d vr14, vr3, vr29 // 32 ... 39 + vilvh.d vr15, vr2, vr28 // 40 ... 47 + vilvh.d vr16, vr1, vr27 // 48 ... 55 + vilvh.d vr17, vr0, vr26 // 56 ... 63 + +.irp i, vr4, vr5, vr6, vr7, vr14, vr15, vr16, vr17 + vsrari.h \i, \i, 4 +.endr + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + addi.d sp, sp, 48 +endfunc + +.macro vmul_vmadd_w in0, in1, in2, in3, out0, out1 + vsllwil.w.h vr22, \in0, 0 + vexth.w.h vr23, \in0 + vmul.w \out0, vr22, \in2 + vmul.w \out1, vr23, \in2 + vsllwil.w.h vr22, \in1, 0 + vexth.w.h vr23, \in1 + vmadd.w \out0, vr22, \in3 + vmadd.w \out1, vr23, \in3 +.endm + +.macro vmul_vmsub_w in0, in1, in2, in3, out0, out1 + vsllwil.w.h vr22, \in0, 0 + vexth.w.h vr23, \in0 + vmul.w \out0, vr22, \in2 + vmul.w \out1, vr23, \in2 + vsllwil.w.h vr22, \in1, 0 + vexth.w.h vr23, \in1 + vmsub.w \out0, vr22, \in3 + vmsub.w \out1, vr23, \in3 +.endm + +.macro rect2_lsx in0, in1, out0 + vsllwil.w.h vr22, \in0, 0 // in1 + vexth.w.h \in0, \in0 // in1 + vmul.w vr22, vr22, \in1 + vmul.w \out0, \in0, \in1 + vssrarni.h.w \out0, vr22, 12 +.endm + +.macro dct_8x8_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, out0, \ + out1, out2, out3, out4, out5, out6, out7, rect2 + + la.local t0, idct_coeffs + +.ifc \rect2, rect2_lsx + vldrepl.w vr23, t0, 0 // 2896 +.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7 + rect2_lsx \i, vr23, \i +.endr +.endif + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + + vmul_vmadd_w \in2, \in6, vr21, vr20, vr8, vr9 + vssrarni.h.w vr9, vr8, 12 // t3 + vmul_vmsub_w \in2, \in6, vr20, vr21, vr8, vr10 + vssrarni.h.w vr10, vr8, 12 // t2 + + vldrepl.w vr20, t0, 0 // 2896 + vmul_vmadd_w \in0, \in4, vr20, vr20, vr8, \in2 + vssrarni.h.w \in2, vr8, 12 // t0 + vmul_vmsub_w \in0, \in4, vr20, vr20, vr8, \in6 + vssrarni.h.w \in6, vr8, 12 // t1 + + vsadd.h vr8, \in2, vr9 // c[0] + vssub.h vr9, \in2, vr9 // c[3] + vsadd.h \in0, \in6, vr10 // c[1] + vssub.h vr10, \in6, vr10 // c[2] + + vldrepl.w vr20, t0, 16 // 799 + vldrepl.w vr21, t0, 20 // 4017 + vmul_vmadd_w \in1, \in7, vr21, vr20, \in2, \in4 + vssrarni.h.w \in4, \in2, 12 // t7a + vmul_vmsub_w \in1, \in7, vr20, vr21, \in2, \in6 + vssrarni.h.w \in6, \in2, 12 // t4a + + vldrepl.w vr20, t0, 24 // 3406 + vldrepl.w vr21, t0, 28 // 2276 + vmul_vmadd_w \in5, \in3, vr21, vr20, \in2, \in1 + vssrarni.h.w \in1, \in2, 12 // t6a + vmul_vmsub_w \in5, \in3, vr20, vr21, \in2, \in7 + vssrarni.h.w \in7, \in2, 12 // t5a + + vsadd.h \in3, \in6, \in7 // t4 + vssub.h \in6, \in6, \in7 // t5a + vsadd.h \in5, \in4, \in1 // t7 + vssub.h \in4, \in4, \in1 // t6a + + vldrepl.w vr20, t0, 0 // 2896 + vmul_vmadd_w \in4, \in6, vr20, vr20, \in2, \in1 + vssrarni.h.w \in1, \in2, 12 // t6 + vmul_vmsub_w \in4, \in6, vr20, vr20, \in2, \in7 + vssrarni.h.w \in7, \in2, 12 // t5 + + vsadd.h \out0, vr8, \in5 // c[0] + vssub.h \out7, vr8, \in5 // c[7] + vsadd.h \out1, \in0, \in1 // c[1] + vssub.h \out6, \in0, \in1 // c[6] + vsadd.h \out2, vr10, \in7 // c[2] + vssub.h \out5, vr10, \in7 // c[5] + vsadd.h \out3, 
vr9, \in3 // c[3] + vssub.h \out4, vr9, \in3 // c[4] +.endm + +function inv_txfm_add_dct_dct_8x8_8bpc_lsx + bnez a3, .NO_HAS_DCONLY_8x8 + + ld.h t2, a2, 0 // dc + vldi vr0, 0x8b5 // 181 + vreplgr2vr.w vr1, t2 + vldi vr5, 0x880 // 128 + vmul.w vr2, vr0, vr1 // dc * 181 + st.h zero, a2, 0 + vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8 + vld vr10, a0, 0 // 0 1 2 3 4 5 6 7 + vsrari.w vr2, vr2, 1 // (dc + rnd) >> shift + vldx vr11, a0, a1 // 8 9 10 11 12 13 14 15 + alsl.d t2, a1, a0, 1 + vmadd.w vr5, vr2, vr0 + vld vr12, t2, 0 // 16 17 18 19 20 21 22 23 + vssrarni.h.w vr5, vr5, 12 + vldx vr13, t2, a1 // 24 25 26 27 28 29 30 31 + + DST_ADD_W8 vr10, vr11, vr12, vr13, vr5, vr5, vr5, vr5 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr5, vr5, vr5, vr5 + + b .DCT_DCT_8X8_END + +.NO_HAS_DCONLY_8x8: + + vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + la.local t0, idct_coeffs + + dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2 + + LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + +.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 + vsrari.h \i, \i, 1 +.endr + + vreplgr2vr.h vr23, zero + +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr23, a2, \i +.endr + + dct_8x8_core_lsx vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr4, vr5, vr6, vr7, vr20, vr21, vr22, vr23, no_rect2 + +.irp i, vr4, vr5, vr6, vr7, vr20, vr21, vr22, vr23 + vsrari.h \i, \i, 4 +.endr + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr20, vr21, vr22, vr23 + +.DCT_DCT_8X8_END: + +endfunc diff --git a/src/loongarch/itx.h b/src/loongarch/itx.h index 880a8f1..7fdd3f7 100644 --- a/src/loongarch/itx.h +++ b/src/loongarch/itx.h @@ -66,6 +66,22 @@ decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_8x4, lsx)); decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_8x4, lsx)); decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_8x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_8x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_8x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_8x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_8x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_8x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_8x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_8x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_8x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_8x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_8x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_8x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_8x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_8x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_8x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_8x8, lsx)); static ALWAYS_INLINE void itx_dsp_init_loongarch(Dav1dInvTxfmDSPContext *const c, int bpc) { #if BITDEPTH == 8 const unsigned flags = dav1d_get_cpu_flags(); @@ -109,6 +125,23 @@ static ALWAYS_INLINE void itx_dsp_init_loongarch(Dav1dInvTxfmDSPContext *const c c->itxfm_add[RTX_8X4][H_ADST] = dav1d_inv_txfm_add_adst_identity_8x4_8bpc_lsx; c->itxfm_add[RTX_8X4][V_ADST] = dav1d_inv_txfm_add_identity_adst_8x4_8bpc_lsx; + c->itxfm_add[TX_8X8][DCT_DCT] = 
dav1d_inv_txfm_add_dct_dct_8x8_8bpc_lsx; + c->itxfm_add[TX_8X8][IDTX] = dav1d_inv_txfm_add_identity_identity_8x8_8bpc_lsx; + c->itxfm_add[TX_8X8][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_8x8_8bpc_lsx; + c->itxfm_add[TX_8X8][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_8x8_8bpc_lsx; + c->itxfm_add[TX_8X8][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_8x8_8bpc_lsx; + c->itxfm_add[TX_8X8][ADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_adst_8x8_8bpc_lsx; + c->itxfm_add[TX_8X8][FLIPADST_ADST] = dav1d_inv_txfm_add_adst_flipadst_8x8_8bpc_lsx; + c->itxfm_add[TX_8X8][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_8x8_8bpc_lsx; + c->itxfm_add[TX_8X8][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_8x8_8bpc_lsx; + c->itxfm_add[TX_8X8][FLIPADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_flipadst_8x8_8bpc_lsx; + c->itxfm_add[TX_8X8][H_DCT] = dav1d_inv_txfm_add_dct_identity_8x8_8bpc_lsx; + c->itxfm_add[TX_8X8][V_DCT] = dav1d_inv_txfm_add_identity_dct_8x8_8bpc_lsx; + c->itxfm_add[TX_8X8][H_FLIPADST] = dav1d_inv_txfm_add_flipadst_identity_8x8_8bpc_lsx; + c->itxfm_add[TX_8X8][V_FLIPADST] = dav1d_inv_txfm_add_identity_flipadst_8x8_8bpc_lsx; + c->itxfm_add[TX_8X8][H_ADST] = dav1d_inv_txfm_add_adst_identity_8x8_8bpc_lsx; + c->itxfm_add[TX_8X8][V_ADST] = dav1d_inv_txfm_add_identity_adst_8x8_8bpc_lsx; + #endif } -- cgit v1.2.3 From 5ebe32283e5f0354430c645e6e78be1820a64432 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Fri, 1 Dec 2023 15:39:13 +0800 Subject: loongarch: Improve four functions in itx_8bpc.add_8x16 series 1. inv_txfm_add_dct_dct_8x16 2. inv_txfm_add_identity_identity_8x16 3. inv_txfm_add_adst_dct_8x16 4 .inv_txfm_add_dct_adst_8x16 Relative speedup over C code: inv_txfm_add_8x16_adst_dct_0_8bpc_c: 151.0 ( 1.00x) inv_txfm_add_8x16_adst_dct_0_8bpc_lsx: 14.9 (10.10x) inv_txfm_add_8x16_adst_dct_1_8bpc_c: 151.0 ( 1.00x) inv_txfm_add_8x16_adst_dct_1_8bpc_lsx: 15.0 (10.10x) inv_txfm_add_8x16_adst_dct_2_8bpc_c: 151.0 ( 1.00x) inv_txfm_add_8x16_adst_dct_2_8bpc_lsx: 15.0 (10.10x) inv_txfm_add_8x16_dct_adst_0_8bpc_c: 157.3 ( 1.00x) inv_txfm_add_8x16_dct_adst_0_8bpc_lsx: 13.6 (11.59x) inv_txfm_add_8x16_dct_adst_1_8bpc_c: 154.9 ( 1.00x) inv_txfm_add_8x16_dct_adst_1_8bpc_lsx: 13.6 (11.38x) inv_txfm_add_8x16_dct_adst_2_8bpc_c: 154.8 ( 1.00x) inv_txfm_add_8x16_dct_adst_2_8bpc_lsx: 13.5 (11.46x) inv_txfm_add_8x16_dct_dct_0_8bpc_c: 17.8 ( 1.00x) inv_txfm_add_8x16_dct_dct_0_8bpc_lsx: 1.5 (11.75x) inv_txfm_add_8x16_dct_dct_1_8bpc_c: 149.4 ( 1.00x) inv_txfm_add_8x16_dct_dct_1_8bpc_lsx: 12.0 (12.49x) inv_txfm_add_8x16_dct_dct_2_8bpc_c: 159.5 ( 1.00x) inv_txfm_add_8x16_dct_dct_2_8bpc_lsx: 12.0 (13.33x) inv_txfm_add_8x16_identity_identity_0_8bpc_c: 75.0 ( 1.00x) inv_txfm_add_8x16_identity_identity_0_8bpc_lsx: 6.0 (12.50x) inv_txfm_add_8x16_identity_identity_1_8bpc_c: 67.4 ( 1.00x) inv_txfm_add_8x16_identity_identity_1_8bpc_lsx: 6.0 (11.26x) inv_txfm_add_8x16_identity_identity_2_8bpc_c: 66.7 ( 1.00x) inv_txfm_add_8x16_identity_identity_2_8bpc_lsx: 5.9 (11.40x) --- src/loongarch/itx.S | 1002 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/loongarch/itx.h | 10 + 2 files changed, 1012 insertions(+) diff --git a/src/loongarch/itx.S b/src/loongarch/itx.S index b14e1c4..f82c776 100644 --- a/src/loongarch/itx.S +++ b/src/loongarch/itx.S @@ -4021,3 +4021,1005 @@ function inv_txfm_add_dct_dct_8x8_8bpc_lsx .DCT_DCT_8X8_END: endfunc + +.macro dct_8x16_core_lsx + dct_8x8_core_lsx vr0, vr2, vr4, vr6, vr19, vr25, vr27, vr29, \ + vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 32 // 
401 + vldrepl.w vr21, t0, 36 // 4076 + vmul_vmadd_w vr1, vr30, vr21, vr20, vr0, vr10 + vssrarni.h.w vr10, vr0, 12 // t15a + vmul_vmsub_w vr1, vr30, vr20, vr21, vr0, vr29 + vssrarni.h.w vr29, vr0, 12 // t8a + + vldrepl.w vr20, t0, 40 // 3166 -> 1583 + vldrepl.w vr21, t0, 44 // 2598 -> 1299 + vmul_vmadd_w vr24, vr7, vr21, vr20, vr0, vr30 + vssrarni.h.w vr30, vr0, 12 // t14a + vmul_vmsub_w vr24, vr7, vr20, vr21, vr0, vr31 + vssrarni.h.w vr31, vr0, 12 // t9a + + vldrepl.w vr20, t0, 48 // 1931 + vldrepl.w vr21, t0, 52 // 3612 + vmul_vmadd_w vr5, vr26, vr21, vr20, vr0, vr24 + vssrarni.h.w vr24, vr0, 12 // t13a + vmul_vmsub_w vr5, vr26, vr20, vr21, vr0, vr25 + vssrarni.h.w vr25, vr0, 12 // t10a + + vldrepl.w vr20, t0, 56 // 3920 + vldrepl.w vr21, t0, 60 // 1189 + vmul_vmadd_w vr28, vr3, vr21, vr20, vr0, vr26 + vssrarni.h.w vr26, vr0, 12 // t12a + vmul_vmsub_w vr28, vr3, vr20, vr21, vr0, vr27 + vssrarni.h.w vr27, vr0, 12 // t11a + + // vr22 vr23 vr30 vr31 vr24 vr25 vr26 vr27 + vsadd.h vr28, vr29, vr31 // t8 + vssub.h vr19, vr29, vr31 // t9 + vssub.h vr29, vr27, vr25 // t10 + vsadd.h vr9, vr27, vr25 // t11 + vsadd.h vr31, vr26, vr24 // t12 + vssub.h vr25, vr26, vr24 // t13 + vssub.h vr27, vr10, vr30 // t14 + vsadd.h vr24, vr10, vr30 // t15 + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vmul_vmadd_w vr27, vr19, vr21, vr20, vr0, vr26 + vssrarni.h.w vr26, vr0, 12 // t14a + vmul_vmsub_w vr27, vr19, vr20, vr21, vr0, vr30 + vssrarni.h.w vr30, vr0, 12 // t9a + + vmul_vmadd_w vr25, vr29, vr21, vr20, vr0, vr19 + vneg.w vr0, vr0 + vneg.w vr19, vr19 + vssrarni.h.w vr19, vr0, 12 // t10a + vmul_vmsub_w vr25, vr29, vr20, vr21, vr0, vr27 + vssrarni.h.w vr27, vr0, 12 // t13a + + vsadd.h vr25, vr28, vr9 // t8a + vssub.h vr29, vr28, vr9 // t11a + vssub.h vr28, vr24, vr31 // t12a + vsadd.h vr10, vr24, vr31 // t15a + vsadd.h vr9, vr30, vr19 // t9 + vssub.h vr31, vr30, vr19 // t10 + vssub.h vr30, vr26, vr27 // t13 + vsadd.h vr24, vr26, vr27 // t14 + + vldrepl.w vr20, t0, 0 // 2896 + vmul_vmadd_w vr30, vr31, vr20, vr20, vr0, vr26 + vssrarni.h.w vr26, vr0, 12 // t13a + vmul_vmsub_w vr30, vr31, vr20, vr20, vr0, vr27 + vssrarni.h.w vr27, vr0, 12 // t10a + + vmul_vmadd_w vr28, vr29, vr20, vr20, vr0, vr31 + vssrarni.h.w vr31, vr0, 12 // t12 + vmul_vmsub_w vr28, vr29, vr20, vr20, vr0, vr30 + vssrarni.h.w vr30, vr0, 12 // t11 + + // vr11 vr12 ... 
vr18 + vsadd.h vr28, vr14, vr31 // c[3] + vssub.h vr29, vr14, vr31 // c[12] + vsadd.h vr20, vr15, vr30 // c[4] + vssub.h vr21, vr15, vr30 // c[11] + vsadd.h vr14, vr16, vr27 // c[5] + vssub.h vr23, vr16, vr27 // c[10] + vsadd.h vr15, vr17, vr9 // c[6] + vssub.h vr30, vr17, vr9 // c[9] + vsadd.h vr16, vr18, vr25 // c[7] + vssub.h vr27, vr18, vr25 // c[8] + vsadd.h vr17, vr13, vr26 // c[2] + vssub.h vr26, vr13, vr26 // c[13] + vsadd.h vr18, vr12, vr24 // c[1] + vssub.h vr25, vr12, vr24 // c[14] + vsadd.h vr22, vr11, vr10 // c[0] + vssub.h vr24, vr11, vr10 // c[15] +.endm + +function inv_txfm_add_dct_dct_8x16_8bpc_lsx + bnez a3, .NO_HAS_DCONLY_8x16 + + ld.h t2, a2, 0 // dc + vldi vr0, 0x8b5 // 181 + vreplgr2vr.w vr1, t2 + vldi vr5, 0x880 // 128 + vmul.w vr2, vr0, vr1 // dc * 181 + st.h zero, a2, 0 + vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8 + vld vr10, a0, 0 // 0 1 2 3 4 5 6 7 + vmul.w vr2, vr0, vr2 + vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8 + vsrari.w vr2, vr2, 1 // (dc + rnd) >> shift + vldx vr11, a0, a1 // 8 9 10 11 12 13 14 15 + alsl.d t2, a1, a0, 1 + vmadd.w vr5, vr2, vr0 + vld vr12, t2, 0 // 16 17 18 19 20 21 22 23 + vssrarni.h.w vr5, vr5, 12 + vldx vr13, t2, a1 // 24 25 26 27 28 29 30 31 + + DST_ADD_W8 vr10, vr11, vr12, vr13, vr5, vr5, vr5, vr5 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr5, vr5, vr5, vr5 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr5, vr5, vr5, vr5 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr5, vr5, vr5, vr5 + + b .DCT_DCT_8X16_END + +.NO_HAS_DCONLY_8x16: + addi.d sp, sp, -64 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + fst.d f31, sp, 56 + + vld_x8 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + la.local t0, idct_coeffs + + dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, rect2_lsx + + vld_x8 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, rect2_lsx + +.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + vsrari.h \i, \i, 1 +.endr + + vreplgr2vr.h vr23, zero + +.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240 + vst vr23, a2, \i +.endr + + LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31 + + LSX_TRANSPOSE8x8_H vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \ + vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31 + + dct_8x16_core_lsx + +.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + vsrari.h \i, \i, 4 +.endr + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr22, vr18, vr17, vr28 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr20, vr14, vr15, vr16 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr27, vr30, vr23, vr21 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr29, vr26, vr25, vr24 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + fld.d f30, sp, 48 + fld.d f31, sp, 56 + addi.d sp, sp, 64 +.DCT_DCT_8X16_END: +endfunc + +.macro identity_8x8_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, rect2 
+ + la.local t0, idct_coeffs + +.ifc \rect2, rect2_lsx + vldrepl.w vr23, t0, 0 // 2896 +.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7 + rect2_lsx \i, vr23, \i +.endr +.endif + vsllwil.w.h vr8, \in0, 1 + vsllwil.w.h vr9, \in1, 1 + vsllwil.w.h vr10, \in2, 1 + vsllwil.w.h vr11, \in3, 1 + vsllwil.w.h vr12, \in4, 1 + vsllwil.w.h vr13, \in5, 1 + vsllwil.w.h vr14, \in6, 1 + vsllwil.w.h vr15, \in7, 1 + +.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7 + vexth.w.h \i, \i +.endr + +.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7 + vslli.w \i, \i, 1 +.endr + + vssrarni.h.w \in0, vr8, 1 + vssrarni.h.w \in1, vr9, 1 + vssrarni.h.w \in2, vr10, 1 + vssrarni.h.w \in3, vr11, 1 + vssrarni.h.w \in4, vr12, 1 + vssrarni.h.w \in5, vr13, 1 + vssrarni.h.w \in6, vr14, 1 + vssrarni.h.w \in7, vr15, 1 +.endm + +.macro identity_8x16_core_lsx in0, out0 + vsadd.h vr10, \in0, \in0 + vsllwil.w.h vr8, \in0, 0 + vexth.w.h \out0, \in0 + vmul.w vr8, vr8, vr20 + vmul.w \out0, \out0, vr20 + vssrarni.h.w \out0, vr8, 11 + vsadd.h \out0, \out0, vr10 +.endm + +function inv_txfm_add_identity_identity_8x16_8bpc_lsx + addi.d sp, sp, -64 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + fst.d f31, sp, 56 + + vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + identity_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, rect2_lsx + + vld_x8 a2, 128, 16, vr16, vr17, vr18, vr19, vr24, vr25, vr26, vr27 + + identity_8x8_core_lsx vr16, vr17, vr18, vr19, vr24, vr25, vr26, vr27, rect2_lsx + + vreplgr2vr.h vr23, zero + +.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240 + vst vr23, a2, \i +.endr + + + LSX_TRANSPOSE8x8_H vr0, vr2, vr4, vr6, vr16, vr18, vr24, vr26, \ + vr14, vr15, vr22, vr23, vr16, vr18, vr24, vr26, \ + vr8, vr9, vr10, vr11, vr12, vr13, vr20, vr21 + + LSX_TRANSPOSE8x8_H vr1, vr3, vr5, vr7, vr17, vr19, vr25, vr27, \ + vr28, vr29, vr30, vr31, vr17, vr19, vr25, vr27, \ + vr8, vr9, vr10, vr11, vr12, vr13, vr20, vr21 + + li.w t0, 1697 + vreplgr2vr.w vr20, t0 + +.irp i, vr14, vr15, vr22, vr23, vr16, vr18, vr24, vr26, \ + vr28, vr29, vr30, vr31, vr17, vr19, vr25, vr27 + identity_8x16_core_lsx \i, \i + vsrari.h \i, \i, 4 +.endr + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr14, vr15, vr22, vr23 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr16, vr18, vr24, vr26 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr28, vr29, vr30, vr31 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr17, vr19, vr25, vr27 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + fld.d f30, sp, 48 + fld.d f31, sp, 56 + addi.d sp, sp, 64 +endfunc + +.macro adst_8x8_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7, rect2 + + la.local t0, iadst8_coeffs + +.ifc \rect2, rect2_lsx + vldrepl.w vr23, t0, 32 // 2896 +.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7 + rect2_lsx \i, vr23, \i +.endr +.endif + + vldrepl.w vr20, t0, 0 // 4076 + vldrepl.w vr21, t0, 4 // 401 + + vmul_vmadd_w vr7, vr0, vr20, vr21, vr8, vr9 + vssrarni.h.w vr9, vr8, 12 // t0a low + vmul_vmsub_w vr7, vr0, vr21, vr20, vr8, vr10 + vssrarni.h.w vr10, vr8, 12 // t1a low + + vldrepl.w vr20, t0, 8 // 3612 + vldrepl.w vr21, t0, 12 // 1931 + vmul_vmadd_w vr5, vr2, vr20, vr21, vr8, vr0 + vssrarni.h.w vr0, vr8, 12 // t2a low + vmul_vmsub_w vr5, vr2, vr21, vr20, vr8, vr7 + 
vssrarni.h.w vr7, vr8, 12 // t3a low + + vldrepl.w vr20, t0, 16 // 2598 -> 1299 + vldrepl.w vr21, t0, 20 // 3166 -> 1583 + vmul_vmadd_w vr3, vr4, vr20, vr21, vr8, vr2 + vssrarni.h.w vr2, vr8, 12 // t4a low + vmul_vmsub_w vr3, vr4, vr21, vr20, vr8, vr5 + vssrarni.h.w vr5, vr8, 12 // t5a low + + vldrepl.w vr20, t0, 24 // 1189 + vldrepl.w vr21, t0, 28 // 3920 + vmul_vmadd_w vr1, vr6, vr20, vr21, vr8, vr3 + vssrarni.h.w vr3, vr8, 12 // t6a low + vmul_vmsub_w vr1, vr6, vr21, vr20, vr8, vr4 + vssrarni.h.w vr4, vr8, 12 // t7a low + + vsadd.h vr1, vr9, vr2 // t0 + vssub.h vr6, vr9, vr2 // t4 + vsadd.h vr8, vr10, vr5 // t1 + vssub.h vr2, vr10, vr5 // t5 + vsadd.h vr9, vr0, vr3 // t2 + vssub.h vr5, vr0, vr3 // t6 + vsadd.h vr10, vr7, vr4 // t3 + vssub.h vr0, vr7, vr4 // t7 + + vldrepl.w vr20, t0, 40 // 1567 + vldrepl.w vr21, t0, 44 // 3784 + vmul_vmadd_w vr6, vr2, vr21, vr20, vr3, vr4 + vssrarni.h.w vr4, vr3, 12 // t4a low + vmul_vmsub_w vr6, vr2, vr20, vr21, vr3, vr7 + vssrarni.h.w vr7, vr3, 12 // t5a low + + vmul_vmadd_w vr0, vr5, vr20, vr21, vr3, vr2 + vssrarni.h.w vr2, vr3, 12 // t7a low + vmul_vmsub_w vr0, vr5, vr21, vr20, vr3, vr6 + vssrarni.h.w vr6, vr3, 12 // t6a low + + vsadd.h \out0, vr1, vr9 // out[0] + vssub.h vr5, vr1, vr9 // t2 + vsadd.h vr3, vr8, vr10 // out[7] + vssub.h vr1, vr8, vr10 // t3 + vexth.w.h vr9, vr3 + vsllwil.w.h vr21, vr3, 0 + vneg.w \out7, vr9 + vneg.w vr21, vr21 + vssrarni.h.w \out7, vr21, 0 // out[7] + + vsadd.h vr8, vr4, vr6 // out[1] + vssub.h vr10, vr4, vr6 // t6 + vexth.w.h vr20, vr8 + vsllwil.w.h vr21, vr8, 0 + vneg.w \out1, vr20 + vneg.w vr21, vr21 + vssrarni.h.w \out1, vr21, 0 // out[1] + vsadd.h \out6, vr7, vr2 // out[6] + vssub.h vr4, vr7, vr2 // t7 + + vldrepl.w vr20, t0, 32 // 2896 + vmul_vmadd_w vr5, vr1, vr20, vr20, vr9, vr6 + vssrarni.h.w vr6, vr9, 12 // out[3] + vmul_vmsub_w vr5, vr1, vr20, vr20, vr9, \out4 + vssrarni.h.w \out4, vr9, 12 // out[4] + + vmul_vmadd_w vr10, vr4, vr20, vr20, vr9, \out2 + vssrarni.h.w \out2, vr9, 12 // out[2] + vmul_vmsub_w vr10, vr4, vr20, vr20, vr9, vr5 + vssrarni.h.w vr5, vr9, 12 // out[5] + + vexth.w.h vr20, vr6 + vsllwil.w.h vr21, vr6, 0 + vneg.w \out3, vr20 + vneg.w vr21, vr21 + vssrarni.h.w \out3, vr21, 0 // out[3] + + vexth.w.h vr20, vr5 + vsllwil.w.h vr21, vr5, 0 + vneg.w \out5, vr20 + vneg.w vr21, vr21 + vssrarni.h.w \out5, vr21, 0 // out[5] +.endm + +function inv_txfm_add_adst_dct_8x16_8bpc_lsx + addi.d sp, sp, -64 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + fst.d f31, sp, 56 + + vld_x8 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + adst_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, rect2_lsx + + vld_x8 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + adst_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, rect2_lsx + +.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + vsrari.h \i, \i, 1 +.endr + + vreplgr2vr.h vr23, zero + +.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240 + vst vr23, a2, \i +.endr + + LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31 + + LSX_TRANSPOSE8x8_H vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \ + vr8, vr9, vr10, vr20, vr21, 
vr22, vr23, vr31 + + dct_8x8_core_lsx vr0, vr2, vr4, vr6, vr19, vr25, vr27, vr29, \ + vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 32 // 401 + vldrepl.w vr21, t0, 36 // 4076 + vmul_vmadd_w vr1, vr30, vr21, vr20, vr0, vr10 + vssrarni.h.w vr10, vr0, 12 // t15a + vmul_vmsub_w vr1, vr30, vr20, vr21, vr0, vr29 + vssrarni.h.w vr29, vr0, 12 // t8a + + vldrepl.w vr20, t0, 40 // 3166 -> 1583 + vldrepl.w vr21, t0, 44 // 2598 -> 1299 + vmul_vmadd_w vr24, vr7, vr21, vr20, vr0, vr30 + vssrarni.h.w vr30, vr0, 12 // t14a + vmul_vmsub_w vr24, vr7, vr20, vr21, vr0, vr31 + vssrarni.h.w vr31, vr0, 12 // t9a + + vldrepl.w vr20, t0, 48 // 1931 + vldrepl.w vr21, t0, 52 // 3612 + vmul_vmadd_w vr5, vr26, vr21, vr20, vr0, vr24 + vssrarni.h.w vr24, vr0, 12 // t13a + vmul_vmsub_w vr5, vr26, vr20, vr21, vr0, vr25 + vssrarni.h.w vr25, vr0, 12 // t10a + + vldrepl.w vr20, t0, 56 // 3920 + vldrepl.w vr21, t0, 60 // 1189 + vmul_vmadd_w vr28, vr3, vr21, vr20, vr0, vr26 + vssrarni.h.w vr26, vr0, 12 // t12a + vmul_vmsub_w vr28, vr3, vr20, vr21, vr0, vr27 + vssrarni.h.w vr27, vr0, 12 // t11a + + // vr22 vr23 vr30 vr31 vr24 vr25 vr26 vr27 + vsadd.h vr28, vr29, vr31 // t8 + vssub.h vr19, vr29, vr31 // t9 + vssub.h vr29, vr27, vr25 // t10 + vsadd.h vr9, vr27, vr25 // t11 + vsadd.h vr31, vr26, vr24 // t12 + vssub.h vr25, vr26, vr24 // t13 + vssub.h vr27, vr10, vr30 // t14 + vsadd.h vr24, vr10, vr30 // t15 + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vmul_vmadd_w vr27, vr19, vr21, vr20, vr0, vr26 + vssrarni.h.w vr26, vr0, 12 // t14a + vmul_vmsub_w vr27, vr19, vr20, vr21, vr0, vr30 + vssrarni.h.w vr30, vr0, 12 // t9a + + vmul_vmadd_w vr25, vr29, vr21, vr20, vr0, vr19 + vneg.w vr0, vr0 + vneg.w vr19, vr19 + vssrarni.h.w vr19, vr0, 12 // t10a + vmul_vmsub_w vr25, vr29, vr20, vr21, vr0, vr27 + vssrarni.h.w vr27, vr0, 12 // t13a + + vsadd.h vr25, vr28, vr9 // t8a + vssub.h vr29, vr28, vr9 // t11a + vssub.h vr28, vr24, vr31 // t12a + vsadd.h vr10, vr24, vr31 // t15a + vsadd.h vr9, vr30, vr19 // t9 + vssub.h vr31, vr30, vr19 // t10 + vssub.h vr30, vr26, vr27 // t13 + vsadd.h vr24, vr26, vr27 // t14 + + vldrepl.w vr20, t0, 0 // 2896 + vmul_vmadd_w vr30, vr31, vr20, vr20, vr0, vr26 + vssrarni.h.w vr26, vr0, 12 // t13a + vmul_vmsub_w vr30, vr31, vr20, vr20, vr0, vr27 + vssrarni.h.w vr27, vr0, 12 // t10a + + vmul_vmadd_w vr28, vr29, vr20, vr20, vr0, vr31 + vssrarni.h.w vr31, vr0, 12 // t12 + vmul_vmsub_w vr28, vr29, vr20, vr20, vr0, vr30 + vssrarni.h.w vr30, vr0, 12 // t11 + + // vr11 vr12 ... 
vr18 + vsadd.h vr28, vr14, vr31 // c[3] + vssub.h vr29, vr14, vr31 // c[12] + vsadd.h vr20, vr15, vr30 // c[4] + vssub.h vr21, vr15, vr30 // c[11] + vsadd.h vr14, vr16, vr27 // c[5] + vssub.h vr23, vr16, vr27 // c[10] + vsadd.h vr15, vr17, vr9 // c[6] + vssub.h vr30, vr17, vr9 // c[9] + vsadd.h vr16, vr18, vr25 // c[7] + vssub.h vr27, vr18, vr25 // c[8] + vsadd.h vr17, vr13, vr26 // c[2] + vssub.h vr26, vr13, vr26 // c[13] + vsadd.h vr18, vr12, vr24 // c[1] + vssub.h vr25, vr12, vr24 // c[14] + vsadd.h vr22, vr11, vr10 // c[0] + vssub.h vr24, vr11, vr10 // c[15] + +.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + vsrari.h \i, \i, 4 +.endr + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr22, vr18, vr17, vr28 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr20, vr14, vr15, vr16 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr27, vr30, vr23, vr21 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr29, vr26, vr25, vr24 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + fld.d f30, sp, 48 + fld.d f31, sp, 56 + addi.d sp, sp, 64 +endfunc + +const iadst16_coeffs, align=4 + .word 4091, 201, 3973, 995 + .word 3703, 1751, 3290, 2440 + .word 2751, 3035, 2106, 3513 + .word 1380, 3857, 601, 4052 +endconst + +.macro adst16_core_lsx transpose8x8, shift, vst + la.local t0, iadst16_coeffs + vldrepl.w vr20, t0, 0 // 4091 + vldrepl.w vr21, t0, 4 // 201 + + vmul_vmadd_w vr15, vr0, vr20, vr21, vr16, vr18 + vmul_vmsub_w vr15, vr0, vr21, vr20, vr17, vr19 + vssrarni.h.w vr18, vr16, 12 // t0 + vssrarni.h.w vr19, vr17, 12 // t1 + + vldrepl.w vr20, t0, 8 // 3973 + vldrepl.w vr21, t0, 12 // 995 + vmul_vmadd_w vr13, vr2, vr20, vr21, vr16, vr0 + vmul_vmsub_w vr13, vr2, vr21, vr20, vr17, vr15 + vssrarni.h.w vr0, vr16, 12 // t2 + vssrarni.h.w vr15, vr17, 12 // t3 + + vldrepl.w vr20, t0, 16 // 3703 + vldrepl.w vr21, t0, 20 // 1751 + vmul_vmadd_w vr11, vr4, vr20, vr21, vr16, vr2 + vmul_vmsub_w vr11, vr4, vr21, vr20, vr17, vr13 + vssrarni.h.w vr2, vr16, 12 // t4 + vssrarni.h.w vr13, vr17, 12 // t5 + + vldrepl.w vr20, t0, 24 // 3290 -> 1645 + vldrepl.w vr21, t0, 28 // 2440 -> 1220 + vmul_vmadd_w vr9, vr6, vr20, vr21, vr16, vr4 + vmul_vmsub_w vr9, vr6, vr21, vr20, vr17, vr11 + vssrarni.h.w vr4, vr16, 12 // t6 + vssrarni.h.w vr11, vr17, 12 // t7 + + vldrepl.w vr20, t0, 32 // 2751 + vldrepl.w vr21, t0, 36 // 3035 + vmul_vmadd_w vr7, vr8, vr20, vr21, vr16, vr6 + vmul_vmsub_w vr7, vr8, vr21, vr20, vr17, vr9 + vssrarni.h.w vr6, vr16, 12 // t8 + vssrarni.h.w vr9, vr17, 12 // t9 + + vldrepl.w vr20, t0, 40 // 2106 + vldrepl.w vr21, t0, 44 // 3513 + vmul_vmadd_w vr5, vr10, vr20, vr21, vr16, vr7 + vmul_vmsub_w vr5, vr10, vr21, vr20, vr17, vr8 + vssrarni.h.w vr7, vr16, 12 // t10 + vssrarni.h.w vr8, vr17, 12 // t11 + + vldrepl.w vr20, t0, 48 // 1380 + vldrepl.w vr21, t0, 52 // 3857 + vmul_vmadd_w vr3, vr12, vr20, vr21, vr16, vr5 + vmul_vmsub_w vr3, vr12, vr21, vr20, vr17, vr10 + vssrarni.h.w vr5, vr16, 12 // t12 + vssrarni.h.w vr10, vr17, 12 // t13 + + vldrepl.w vr20, t0, 56 // 601 + vldrepl.w vr21, t0, 60 // 4052 + vmul_vmadd_w vr1, vr14, vr20, vr21, vr16, vr3 + vmul_vmsub_w vr1, vr14, vr21, vr20, vr17, vr12 + vssrarni.h.w vr3, vr16, 12 // t14 + vssrarni.h.w vr12, vr17, 12 // t15 + + vsadd.h vr1, vr18, vr6 // t0a + vssub.h vr14, vr18, vr6 // t8a + vsadd.h vr16, vr19, vr9 // t1a + vssub.h vr17, vr19, vr9 // t9a + vsadd.h vr6, vr0, vr7 // t2a + vssub.h 
vr18, vr0, vr7 // t10a + vsadd.h vr9, vr15, vr8 // t3a + vssub.h vr19, vr15, vr8 // t11a + vsadd.h vr0, vr2, vr5 // t4a + vssub.h vr7, vr2, vr5 // t12a + vsadd.h vr8, vr13, vr10 // t5a + vssub.h vr15, vr13, vr10 // t13a + vsadd.h vr2, vr4, vr3 // t6a + vssub.h vr5, vr4, vr3 // t14a + vsadd.h vr10, vr11, vr12 // t7a + vssub.h vr13, vr11, vr12 // t15a + + la.local t0, idct_coeffs + + vldrepl.w vr20, t0, 16 // 799 + vldrepl.w vr21, t0, 20 // 4017 + vmul_vmadd_w vr14, vr17, vr21, vr20, vr3, vr11 + vmul_vmsub_w vr14, vr17, vr20, vr21, vr4, vr12 + vssrarni.h.w vr11, vr3, 12 // t8 + vssrarni.h.w vr12, vr4, 12 // t9 + + vmul_vmadd_w vr15, vr7, vr20, vr21, vr3, vr14 + vmul_vmsub_w vr15, vr7, vr21, vr20, vr4, vr17 + vssrarni.h.w vr14, vr3, 12 // t13 + vssrarni.h.w vr17, vr4, 12 // t12 + + vldrepl.w vr20, t0, 24 // 3406 + vldrepl.w vr21, t0, 28 // 2276 + vmul_vmadd_w vr18, vr19, vr21, vr20, vr3, vr7 + vmul_vmsub_w vr18, vr19, vr20, vr21, vr4, vr15 + vssrarni.h.w vr7, vr3, 12 // t10 + vssrarni.h.w vr15, vr4, 12 // t11 + + vmul_vmadd_w vr13, vr5, vr20, vr21, vr3, vr18 + vmul_vmsub_w vr13, vr5, vr21, vr20, vr4, vr19 + vssrarni.h.w vr18, vr3, 12 // t15 + vssrarni.h.w vr19, vr4, 12 // t14 + + vsadd.h vr5, vr1, vr0 // t0 + vssub.h vr13, vr1, vr0 // t4 + vsadd.h vr3, vr16, vr8 // t1 + vssub.h vr4, vr16, vr8 // t5 + vsadd.h vr0, vr6, vr2 // t2 + vssub.h vr1, vr6, vr2 // t6 + vsadd.h vr8, vr9, vr10 // t3 + vssub.h vr16, vr9, vr10 // t7 + vsadd.h vr2, vr11, vr17 // t8a + vssub.h vr6, vr11, vr17 // t12a + vsadd.h vr9, vr12, vr14 // t9a + vssub.h vr10, vr12, vr14 // t13a + vsadd.h vr11, vr7, vr19 // t10a + vssub.h vr17, vr7, vr19 // t14a + vsadd.h vr12, vr15, vr18 // t11a + vssub.h vr14, vr15, vr18 // t15a + + la.local t0, idct_coeffs + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vmul_vmadd_w vr13, vr4, vr21, vr20, vr7, vr18 + vmul_vmsub_w vr13, vr4, vr20, vr21, vr15, vr19 + vssrarni.h.w vr18, vr7, 12 // t4a + vssrarni.h.w vr19, vr15, 12 // t5a + + vmul_vmadd_w vr16, vr1, vr20, vr21, vr7, vr4 + vmul_vmsub_w vr16, vr1, vr21, vr20, vr15, vr13 + vssrarni.h.w vr4, vr7, 12 // t7a + vssrarni.h.w vr13, vr15, 12 // t6a + + vmul_vmadd_w vr6, vr10, vr21, vr20, vr7, vr1 + vmul_vmsub_w vr6, vr10, vr20, vr21, vr15, vr16 + vssrarni.h.w vr1, vr7, 12 // t12 + vssrarni.h.w vr16, vr15, 12 // t13 + + vmul_vmadd_w vr14, vr17, vr20, vr21, vr7, vr6 + vmul_vmsub_w vr14, vr17, vr21, vr20, vr15, vr10 + vssrarni.h.w vr6, vr7, 12 // t15 + vssrarni.h.w vr10, vr15, 12 // t14 + + vsadd.h vr14, vr5, vr0 // out[0] + vssub.h vr17, vr5, vr0 // t2a + vssub.h vr7, vr3, vr8 // t3a + vsadd.h vr15, vr3, vr8 // out[15] + vsllwil.w.h vr22, vr15, 0 + vexth.w.h vr15, vr15 + vneg.w vr22, vr22 + vneg.w vr15, vr15 + vssrarni.h.w vr15, vr22, 0 // out[15] + vsadd.h vr14, vr5, vr0 // out[0] + vssub.h vr17, vr5, vr0 // t2a + vssub.h vr7, vr3, vr8 // t3a + + vsadd.h vr3, vr19, vr4 // out[12] + vssub.h vr8, vr19, vr4 // t7 + vssub.h vr0, vr18, vr13 // t6 + vsadd.h vr5, vr18, vr13 // out[3] + vsllwil.w.h vr22, vr5, 0 + vexth.w.h vr5, vr5 + vneg.w vr22, vr22 + vneg.w vr5, vr5 + vssrarni.h.w vr5, vr22, 0 // out[3] + + vsadd.h vr13, vr9, vr12 // out[14] + vssub.h vr19, vr9, vr12 // t11 + vssub.h vr4, vr2, vr11 // t10 + vsadd.h vr18, vr2, vr11 // out[1] + vsllwil.w.h vr22, vr18, 0 + vexth.w.h vr18, vr18 + vneg.w vr22, vr22 + vneg.w vr18, vr18 + vssrarni.h.w vr18, vr22, 0 // out[1] + + vsadd.h vr2, vr1, vr10 // out[2] + vssub.h vr11, vr1, vr10 // t14a + vssub.h vr12, vr16, vr6 // t15a + vsadd.h vr9, vr16, vr6 // out[13] + vsllwil.w.h vr22, vr9, 0 
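+// Note: this widen / vneg.w / vssrarni.h.w(..., 0) pattern (used here and
+// for the other odd-indexed outputs of the adst16) is effectively a
+// saturating 16-bit negate: negating in 32 bits and narrowing with
+// saturation clamps -32768 to 32767 instead of letting it wrap.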
+ vexth.w.h vr9, vr9 + vneg.w vr22, vr22 + vneg.w vr9, vr9 + vssrarni.h.w vr9, vr22, 0 // out[13] + + vldrepl.w vr20, t0, 0 // 2896 + vmul_vmadd_w vr17, vr7, vr20, vr20, vr6, vr10 + vmul_vmsub_w vr17, vr7, vr20, vr20, vr16, vr1 + vssrarni.h.w vr10, vr6, 12 // out[7] + + vsllwil.w.h vr7, vr10, 0 + vexth.w.h vr10, vr10 + vneg.w vr7, vr7 + vneg.w vr10, vr10 + vssrarni.h.w vr10, vr7, 0 + vssrarni.h.w vr1, vr16, 12 // out[8] + + vmul_vmsub_w vr0, vr8, vr20, vr20, vr16, vr17 + vmul_vmadd_w vr0, vr8, vr20, vr20, vr6, vr7 + vssrarni.h.w vr17, vr16, 12 // out[11] + + vsllwil.w.h vr0, vr17, 0 + vexth.w.h vr17, vr17 + vneg.w vr0, vr0 + vneg.w vr17, vr17 + vssrarni.h.w vr17, vr0, 0 + vssrarni.h.w vr7, vr6, 12 // out[4] + + vmul_vmsub_w vr4, vr19, vr20, vr20, vr16, vr0 + vmul_vmadd_w vr4, vr19, vr20, vr20, vr6, vr8 + vssrarni.h.w vr0, vr16, 12 // out[9] + + vsllwil.w.h vr4, vr0, 0 + vexth.w.h vr0, vr0 + vneg.w vr4, vr4 + vneg.w vr0, vr0 + vssrarni.h.w vr0, vr4, 0 + vssrarni.h.w vr8, vr6, 12 // out[6] + + vmul_vmadd_w vr11, vr12, vr20, vr20, vr6, vr4 + vmul_vmsub_w vr11, vr12, vr20, vr20, vr16, vr19 + vssrarni.h.w vr4, vr6, 12 // out[5] + + vsllwil.w.h vr24, vr4, 0 + vexth.w.h vr4, vr4 + vneg.w vr24, vr24 + vneg.w vr4, vr4 + vssrarni.h.w vr4, vr24, 0 + vssrarni.h.w vr19, vr16, 12 // out[10] + +.ifnb \transpose8x8 + LSX_TRANSPOSE8x8_H vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \ + vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \ + vr6, vr11, vr12, vr16, vr20, vr21, vr22, vr23 + + LSX_TRANSPOSE8x8_H vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15, \ + vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15, \ + vr6, vr11, vr12, vr16, vr20, vr21, vr22, vr23 +.endif + +.ifnb \shift +.irp i, vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \ + vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15 + vsrari.h \i, \i, \shift +.endr +.endif + +.ifnb \vst + vst_x16 t1, 0, 16, vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \ + vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15 +.endif +// out0 out1 out2 out3 out4 out5 out6 out7 +// vr14 vr18 vr2 vr5 vr7 vr4 vr8 vr10 +// out8 out9 out10 out11 out12 out13 out14 out15 +// vr1 vr0 vr19 vr17 vr3 vr9 vr13 vr15 +.endm // adst16_core_lsx + +.macro adst16_core_finish_lsx in0, in1, in2, in3, in4, in5, in6, in7 + fld.d f20, t2, 0 + fldx.d f21, t2, a1 + fld.d f22, t3, 0 + fldx.d f23, t3, a1 + + alsl.d t2, a1, t2, 2 + alsl.d t3, a1, t3, 2 + + fld.d f24, t2, 0 + fldx.d f25, t2, a1 + fld.d f26, t3, 0 + fldx.d f27, t3, a1 + +.irp i, vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27 + vsllwil.hu.bu \i, \i, 0 +.endr + +.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7 + vsrari.h \i, \i, 4 +.endr + + vadd.h vr20, vr20, \in0 + vadd.h vr21, vr21, \in1 + vadd.h vr22, vr22, \in2 + vadd.h vr23, vr23, \in3 + vadd.h vr24, vr24, \in4 + vadd.h vr25, vr25, \in5 + vadd.h vr26, vr26, \in6 + vadd.h vr27, vr27, \in7 + + vssrani.bu.h vr21, vr20, 0 + vssrani.bu.h vr23, vr22, 0 + vssrani.bu.h vr25, vr24, 0 + vssrani.bu.h vr27, vr26, 0 + + vstelm.d vr21, t4, 0, 0 + vstelm.d vr21, t5, 0, 1 + + alsl.d t4, a1, t4, 1 + alsl.d t5, a1, t5, 1 + vstelm.d vr23, t4, 0, 0 + vstelm.d vr23, t5, 0, 1 + + alsl.d t4, a1, t4, 1 + alsl.d t5, a1, t5, 1 + vstelm.d vr25, t4, 0, 0 + vstelm.d vr25, t5, 0, 1 + + alsl.d t4, a1, t4, 1 + alsl.d t5, a1, t5, 1 + vstelm.d vr27, t4, 0, 0 + vstelm.d vr27, t5, 0, 1 + +.endm // adst16_core_finish_lsx + +function inv_txfm_add_dct_adst_8x16_8bpc_lsx + addi.d sp, sp, -64 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + fst.d f31, sp, 56 + + 
vld_x8 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + la.local t0, idct_coeffs + + dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, rect2_lsx + + vld_x8 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, rect2_lsx + +.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + vsrari.h \i, \i, 1 +.endr + + vreplgr2vr.h vr23, zero + +.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240 + vst vr23, a2, \i +.endr + + LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31 + + LSX_TRANSPOSE8x8_H vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \ + vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \ + vr16, vr17, vr18, vr20, vr21, vr22, vr23, vr31 + + adst16_core_lsx , , + + addi.d t2, a0, 0 + alsl.d t3, a1, a0, 1 + addi.d t4, a0, 0 + add.d t5, a1, a0 + + adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10 + + alsl.d t2, a1, t2, 2 + alsl.d t3, a1, t3, 2 + + alsl.d t4, a1, t4, 1 + alsl.d t5, a1, t5, 1 + + adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + fld.d f30, sp, 48 + fld.d f31, sp, 56 + addi.d sp, sp, 64 +endfunc diff --git a/src/loongarch/itx.h b/src/loongarch/itx.h index 7fdd3f7..c2fd2a9 100644 --- a/src/loongarch/itx.h +++ b/src/loongarch/itx.h @@ -82,6 +82,12 @@ decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_8x8, lsx)); decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_8x8, lsx)); decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_8x8, lsx)); decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_8x8, lsx)); + +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x16, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_8x16, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_8x16, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_8x16, lsx)); + static ALWAYS_INLINE void itx_dsp_init_loongarch(Dav1dInvTxfmDSPContext *const c, int bpc) { #if BITDEPTH == 8 const unsigned flags = dav1d_get_cpu_flags(); @@ -142,6 +148,10 @@ static ALWAYS_INLINE void itx_dsp_init_loongarch(Dav1dInvTxfmDSPContext *const c c->itxfm_add[TX_8X8][H_ADST] = dav1d_inv_txfm_add_adst_identity_8x8_8bpc_lsx; c->itxfm_add[TX_8X8][V_ADST] = dav1d_inv_txfm_add_identity_adst_8x8_8bpc_lsx; + c->itxfm_add[RTX_8X16][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x16_8bpc_lsx; + c->itxfm_add[RTX_8X16][IDTX] = dav1d_inv_txfm_add_identity_identity_8x16_8bpc_lsx; + c->itxfm_add[RTX_8X16][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_8x16_8bpc_lsx; + c->itxfm_add[RTX_8X16][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_8x16_8bpc_lsx; #endif } -- cgit v1.2.3 From 8626d9f9a6e8c1e2164e55177c64b69f51204b4e Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Fri, 1 Dec 2023 15:50:15 +0800 Subject: loongarch: Improve two functions in itx_8bpc.add_16x8 series 1. inv_txfm_add_dct_dct_16x8 2. 
inv_txfm_add_adst_dct_16x8 Relative speedup over C code: inv_txfm_add_16x8_adst_dct_0_8bpc_c: 152.1 ( 1.00x) inv_txfm_add_16x8_adst_dct_0_8bpc_lsx: 13.7 (11.08x) inv_txfm_add_16x8_adst_dct_1_8bpc_c: 152.1 ( 1.00x) inv_txfm_add_16x8_adst_dct_1_8bpc_lsx: 13.7 (11.07x) inv_txfm_add_16x8_adst_dct_2_8bpc_c: 152.1 ( 1.00x) inv_txfm_add_16x8_adst_dct_2_8bpc_lsx: 13.7 (11.08x) inv_txfm_add_16x8_dct_dct_0_8bpc_c: 17.0 ( 1.00x) inv_txfm_add_16x8_dct_dct_0_8bpc_lsx: 1.3 (13.10x) inv_txfm_add_16x8_dct_dct_1_8bpc_c: 147.5 ( 1.00x) inv_txfm_add_16x8_dct_dct_1_8bpc_lsx: 11.6 (12.73x) inv_txfm_add_16x8_dct_dct_2_8bpc_c: 147.5 ( 1.00x) inv_txfm_add_16x8_dct_dct_2_8bpc_lsx: 11.6 (12.74x) --- src/loongarch/itx.S | 226 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/loongarch/itx.h | 7 ++ 2 files changed, 233 insertions(+) diff --git a/src/loongarch/itx.S b/src/loongarch/itx.S index f82c776..f5dc824 100644 --- a/src/loongarch/itx.S +++ b/src/loongarch/itx.S @@ -5023,3 +5023,229 @@ function inv_txfm_add_dct_adst_8x16_8bpc_lsx fld.d f31, sp, 56 addi.d sp, sp, 64 endfunc + +.macro malloc_space number + li.w t0, \number + sub.d sp, sp, t0 + addi.d sp, sp, -64 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + fst.d f31, sp, 56 +.endm + +.macro free_space number + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + fld.d f30, sp, 48 + fld.d f31, sp, 56 + li.w t0, \number + add.d sp, sp, t0 + addi.d sp, sp, 64 +.endm + +.macro DST_ADD_W16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11 + vsllwil.hu.bu vr10, \in0, 0 + vexth.hu.bu vr0, \in0 + vsllwil.hu.bu vr11, \in1, 0 + vexth.hu.bu vr1, \in1 + vsllwil.hu.bu vr12, \in2, 0 + vexth.hu.bu vr2, \in2 + vsllwil.hu.bu vr13, \in3, 0 + vexth.hu.bu vr3, \in3 + vadd.h vr10, vr10, \in4 + vadd.h vr0, vr0, \in5 + vadd.h vr11, vr11, \in6 + vadd.h vr1, vr1, \in7 + vadd.h vr12, vr12, \in8 + vadd.h vr2, vr2, \in9 + vadd.h vr13, vr13, \in10 + vadd.h vr3, vr3, \in11 + vssrani.bu.h vr0, vr10, 0 + vssrani.bu.h vr1, vr11, 0 + vssrani.bu.h vr2, vr12, 0 + vssrani.bu.h vr3, vr13, 0 + vst vr0, a0, 0 + vstx vr1, a0, a1 + vst vr2, t2, 0 + vstx vr3, t2, a1 +.endm + +.macro VLD_DST_ADD_W16 in0, in1, in2, in3, in4, in5, in6, in7, shift + +.ifnb \shift +.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7 + vsrari.h \i, \i, \shift +.endr +.endif + + vld vr0, a0, 0 + vldx vr1, a0, a1 + vld vr2, t2, 0 + vldx vr3, t2, a1 + DST_ADD_W16 vr0, vr1, vr2, vr3, \in0, \in1, \in2, \in3, \ + \in4, \in5, \in6, \in7 +.endm + +function inv_txfm_add_dct_dct_16x8_8bpc_lsx + bnez a3, .NO_HAS_DCONLY_16x8 + + ld.h t2, a2, 0 // dc + vldi vr0, 0x8b5 // 181 + vreplgr2vr.w vr1, t2 + vldi vr5, 0x880 // 128 + vmul.w vr2, vr0, vr1 // dc * 181 + st.h zero, a2, 0 + vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8 + alsl.d t2, a1, a0, 1 + vmul.w vr2, vr2, vr0 + vldx vr1, a0, a1 + vsrari.w vr2, vr2, 8 + vldx vr3, t2, a1 + vsrari.w vr2, vr2, 1 // (dc + rnd) >> shift + vmadd.w vr5, vr2, vr0 + vld vr0, a0, 0 + vssrarni.h.w vr5, vr5, 12 + vld vr2, t2, 0 + + DST_ADD_W16 vr0, vr1, vr2, vr3, vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W16 vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5, + + b .DCT_DCT_16x8_END + +.NO_HAS_DCONLY_16x8: + malloc_space 512 + + vld_x16 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + la.local t0, idct_coeffs + + vldrepl.w vr23, 
t0, 0 //2896 +.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + rect2_lsx \i, vr23, \i +.endr + + dct_8x16_core_lsx + + LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr0, vr18, vr17, vr28, vr11, vr14, vr15, vr16, \ + vr13, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ + vr27, vr30, vr1, vr12, vr29, vr26, vr25, vr24, \ + vr13, vr31, vr2, vr3, vr4, vr5, vr6, vr7 + +.irp i, vr0, vr18, vr17, vr28, vr11, vr14, vr15, vr16, \ + vr27, vr30, vr1, vr12, vr29, vr26, vr25, vr24 + vsrari.h \i, \i, 1 +.endr + + vst_x16 sp, 64, 16, vr13, vr18, vr17, vr28, vr11, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr12, vr29, vr26, vr25, vr24 + + vreplgr2vr.h vr23, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240 + vst vr23, a2, \i +.endr + + dct_8x8_core_lsx vr0, vr18, vr17, vr28, vr11, vr14, vr15, vr16, \ + vr4, vr5, vr6, vr16, vr7, vr18, vr19, vr31, no_rect2 + + dct_8x8_core_lsx vr27, vr30, vr1, vr12, vr29, vr26, vr25, vr24, \ + vr14, vr15, vr17, vr20, vr21, vr22, vr23, vr28, no_rect2 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W16 vr4, vr14, vr5, vr15, vr6, vr17, vr16, vr20, 4 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W16 vr7, vr21, vr18, vr22, vr19, vr23, vr31, vr28, 4 + + free_space 512 + +.DCT_DCT_16x8_END: + +endfunc + +function inv_txfm_add_adst_dct_16x8_8bpc_lsx + addi.d sp, sp, -64 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + fst.d f31, sp, 56 + + addi.d t1, sp, 64 + addi.d t2, a2, 0 + + vld_x16 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 + + la.local t0, idct_coeffs + + vldrepl.w vr23, t0, 0 //2896 +.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 + rect2_lsx \i, vr23, \i +.endr + + adst16_core_lsx , 1, + + // out0 out1 out2 out3 out4 out5 out6 out7 + // vr14 vr18 vr2 vr5 vr7 vr4 vr8 vr10 + // out8 out9 out10 out11 out12 out13 out14 out15 + // vr1 vr0 vr19 vr17 vr3 vr9 vr13 vr15 + + LSX_TRANSPOSE8x8_H vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \ + vr14, vr18, vr2, vr5, vr7, vr4, vr24, vr25, \ + vr6, vr11, vr12, vr16, vr20, vr21, vr22, vr23 + + LSX_TRANSPOSE8x8_H vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15, \ + vr1, vr0, vr19, vr17, vr3, vr26, vr13, vr15, \ + vr6, vr11, vr12, vr16, vr20, vr21, vr22, vr23 + + vreplgr2vr.h vr23, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240 + vst vr23, a2, \i +.endr + + dct_8x8_core_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr24, vr25, \ + vr27, vr28, vr29, vr25, vr30, vr31, vr6, vr16, no_rect2 + + dct_8x8_core_lsx vr1, vr0, vr19, vr17, vr3, vr26, vr13, vr15, \ + vr5, vr7, vr18, vr20, vr21, vr22, vr23, vr24, no_rect2 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W16 vr27, vr5, vr28, vr7, vr29, vr18, vr25, vr20, 4 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W16 vr30, vr21, vr31, vr22, vr6, vr23, vr16, vr24, 4 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + fld.d f30, sp, 48 + fld.d f31, sp, 56 + addi.d sp, sp, 64 +endfunc diff --git a/src/loongarch/itx.h b/src/loongarch/itx.h index c2fd2a9..ef5df65 100644 --- a/src/loongarch/itx.h +++ b/src/loongarch/itx.h @@ -88,6 +88,9 @@ decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_8x16, lsx)); 
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_8x16, lsx)); decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_8x16, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_16x8, lsx)); + static ALWAYS_INLINE void itx_dsp_init_loongarch(Dav1dInvTxfmDSPContext *const c, int bpc) { #if BITDEPTH == 8 const unsigned flags = dav1d_get_cpu_flags(); @@ -152,6 +155,10 @@ static ALWAYS_INLINE void itx_dsp_init_loongarch(Dav1dInvTxfmDSPContext *const c c->itxfm_add[RTX_8X16][IDTX] = dav1d_inv_txfm_add_identity_identity_8x16_8bpc_lsx; c->itxfm_add[RTX_8X16][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_8x16_8bpc_lsx; c->itxfm_add[RTX_8X16][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_8x16_8bpc_lsx; + + c->itxfm_add[RTX_16X8][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_16x8_8bpc_lsx; + c->itxfm_add[RTX_16X8][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_16x8_8bpc_lsx; + #endif } -- cgit v1.2.3 From 233be20140793a09a9a8e56d3b41d1622fddbc25 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Fri, 1 Dec 2023 16:05:26 +0800 Subject: loongarch: Improve one functions in itx_8bpc.add_4x8 series 1. inv_txfm_add_dct_dct_4x8 Relative speedup over C code: inv_txfm_add_4x8_dct_dct_0_8bpc_c: 5.7 ( 1.00x) inv_txfm_add_4x8_dct_dct_0_8bpc_lsx: 0.8 ( 7.12x) inv_txfm_add_4x8_dct_dct_1_8bpc_c: 34.5 ( 1.00x) inv_txfm_add_4x8_dct_dct_1_8bpc_lsx: 3.0 (11.64x) --- src/loongarch/itx.S | 182 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/loongarch/itx.h | 4 ++ 2 files changed, 186 insertions(+) diff --git a/src/loongarch/itx.S b/src/loongarch/itx.S index f5dc824..a14d77c 100644 --- a/src/loongarch/itx.S +++ b/src/loongarch/itx.S @@ -916,6 +916,188 @@ fun4x4 identity, flipadst fun4x4 identity, adst fun4x4 adst, identity +function inv_txfm_add_dct_dct_4x8_8bpc_lsx + bnez a3, .NO_HAS_DCONLY_4x8 + + ld.h t2, a2, 0 // dc + vldi vr0, 0x8b5 // 181 + vreplgr2vr.w vr1, t2 + vldi vr5, 0x880 // 128 + vmul.w vr2, vr0, vr1 + st.h zero, a2, 0 + vsrari.w vr2, vr2, 8 + vld vr10, a0, 0 + vmul.w vr2, vr2, vr0 + vldx vr11, a0, a1 + vsrari.w vr2, vr2, 8 + alsl.d t2, a1, a0, 1 + vmadd.w vr5, vr2, vr0 + vld vr12, t2, 0 + vssrarni.h.w vr5, vr5, 12 + vldx vr13, t2, a1 + + DST_ADD_W4 vr10, vr11, vr12, vr13, vr5, vr5 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, t2, 2 + + VLD_DST_ADD_W4 vr5, vr5 + b .DCT_DCT_4x8_END + +.NO_HAS_DCONLY_4x8: + // sh=8 sw=4 + la.local t0, idct_coeffs + + vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0 + vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1 + vld vr20, a2, 32 // 16 17 18 19 20 21 22 23 in2 + vld vr21, a2, 48 // 24 25 26 27 28 29 30 31 in3 + + vldrepl.w vr2, t0, 8 // 1567 + vldrepl.w vr3, t0, 12 // 3784 + vldrepl.w vr8, t0, 0 // 2896 + +.macro DCT4_4Wx8H_1D_LSX + // in1 in3 + vsllwil.w.h vr4, vr1, 0 // in1 + vsllwil.w.h vr5, vr21, 0 // in3 + vmul.w vr4, vr4, vr8 + vmul.w vr5, vr5, vr8 + vsrari.w vr4, vr4, 12 + vsrari.w vr5, vr5, 12 + vmul.w vr6, vr4, vr3 + vmul.w vr7, vr4, vr2 + vmadd.w vr6, vr5, vr2 // t3 0 1 2 3 + vmsub.w vr7, vr5, vr3 // t2 0 1 2 3 + vexth.w.h vr4, vr1 // in1 + vexth.w.h vr5, vr21 // in3 + vmul.w vr4, vr4, vr8 + vmul.w vr5, vr5, vr8 + vsrari.w vr4, vr4, 12 + vsrari.w vr5, vr5, 12 + vmul.w vr9, vr4, vr3 + vmul.w vr10, vr4, vr2 + vmadd.w vr9, vr5, vr2 // t3 4 5 6 7 + vmsub.w vr10, vr5, vr3 // t2 4 5 6 7 + + // in0 in2 + vsllwil.w.h vr4, vr0, 0 // in0 + vsllwil.w.h vr5, vr20, 0 // in2 + vmul.w vr4, vr4, vr8 + vmul.w vr5, vr5, vr8 + vsrari.w vr4, vr4, 12 + vsrari.w vr5, vr5, 12 + vmul.w vr11, vr4, vr8 + vmul.w vr12, vr4, vr8 + vmadd.w vr11, vr5, vr8 // t0 0 1 2 3 + vmsub.w vr12, vr5, vr8 // t1 
0 1 2 3 + vexth.w.h vr4, vr0 // in0 + vexth.w.h vr5, vr20 // in2 + vmul.w vr4, vr4, vr8 + vmul.w vr5, vr5, vr8 + vsrari.w vr4, vr4, 12 + vsrari.w vr5, vr5, 12 + vmul.w vr13, vr4, vr8 + vmul.w vr14, vr4, vr8 + vmadd.w vr13, vr5, vr8 // t0 4 5 6 7 + vmsub.w vr14, vr5, vr8 // t1 4 5 6 7 + vssrarni.h.w vr9, vr6, 12 // t3 + vssrarni.h.w vr10, vr7, 12 // t2 + vssrarni.h.w vr14, vr12, 12 // t1 + vssrarni.h.w vr13, vr11, 12 // t0 + vsadd.h vr4, vr13, vr9 // c[0] 0 4 8 12 16 20 24 28 + vsadd.h vr5, vr14, vr10 // c[1] 1 5 9 13 17 21 25 29 + vssub.h vr20, vr14, vr10 // c[2] 2 6 10 14 18 22 26 30 + vssub.h vr21, vr13, vr9 // c[3] 3 7 11 15 19 23 27 31 +.endm + + DCT4_4Wx8H_1D_LSX + + vreplgr2vr.h vr22, zero + vst vr22, a2, 0 + vst vr22, a2, 16 + vst vr22, a2, 32 + vst vr22, a2, 48 + + vilvl.h vr0, vr5, vr4 // 0 1 4 5 8 9 12 13 + vilvl.h vr1, vr21, vr20 // 2 3 6 7 10 11 14 15 + vilvh.h vr6, vr5, vr4 // 16 17 20 21 24 25 28 29 + vilvh.h vr7, vr21, vr20 // 18 19 22 23 26 27 30 31 + vilvl.w vr9, vr1, vr0 // 0 1 2 3 4 5 6 7 in0 + vilvh.w vr10, vr1, vr0 // 8 9 10 11 12 13 14 15 in1 + vilvl.w vr11, vr7, vr6 // 16 17 18 19 20 21 22 23 in2 + vilvh.w vr12, vr7, vr6 // 24 25 26 27 28 29 30 31 in3 + + vilvl.d vr0, vr10, vr9 + vilvl.d vr1, vr12, vr11 + vilvh.d vr20, vr9, vr11 // in5 in1 + vilvh.d vr21, vr12, vr10 // in3 in7 + +.macro DCT8_4Wx8H_1D_LSX + dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr3, vr2, vr8, vr8, vr13, vr14 + + vldrepl.w vr17, t0, 16 // 799 + vldrepl.w vr18, t0, 20 // 4017 + vldrepl.w vr11, t0, 24 // 3406 + vldrepl.w vr12, t0, 28 // 2276 + + vexth.w.h vr4, vr20 + vexth.w.h vr5, vr21 + vmul.w vr6, vr4, vr18 // in1 * 4017 + vmul.w vr7, vr4, vr17 // in1 * 799 + vmadd.w vr6, vr5, vr17 // in7 * 799 + vmsub.w vr7, vr5, vr18 // in7 * 4017 + vsllwil.w.h vr4, vr20, 0 + vsllwil.w.h vr5, vr21, 0 + vmul.w vr9, vr4, vr12 + vmul.w vr10, vr4, vr11 + vmadd.w vr9, vr5, vr11 + vmsub.w vr10, vr5, vr12 + vssrarni.h.w vr10, vr9, 12 // t6a t5a + vssrarni.h.w vr7, vr6, 12 // t7a t4a + vsadd.h vr15, vr7, vr10 // t7 t4 + vssub.h vr16, vr7, vr10 // t6a t5a + + vexth.w.h vr4, vr16 // t5a + vsllwil.w.h vr5, vr16, 0 // t6a + vldi vr2, 0x8b5 // 181 + vsub.w vr6, vr5, vr4 + vadd.w vr7, vr5, vr4 + vmul.w vr6, vr6, vr2 + vmul.w vr7, vr7, vr2 + vssrarni.h.w vr7, vr6, 8 // t5 t6 + vaddi.hu vr18, vr7, 0 + vshuf4i.d vr7, vr15, 0x06 // t7 t6 + vshuf4i.d vr15, vr18, 0x09 // t4 t5 + + // vr17 -> vr7 vr18 -> vr15 + vsadd.h vr4, vr13, vr7 + vsadd.h vr5, vr14, vr15 + vssub.h vr6, vr14, vr15 + vssub.h vr7, vr13, vr7 +.endm + + DCT8_4Wx8H_1D_LSX + + vshuf4i.d vr5, vr5, 0x01 + vshuf4i.d vr7, vr7, 0x01 + + vsrari.h vr4, vr4, 4 + vsrari.h vr5, vr5, 4 + vsrari.h vr6, vr6, 4 + vsrari.h vr7, vr7, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W4 vr4, vr5 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, t2, 2 + + VLD_DST_ADD_W4 vr6, vr7 +.DCT_DCT_4x8_END: +endfunc + .macro rect2_w4_lsx in0, in1, in2, out0, out1 vsllwil.w.h vr22, \in0, 0 vexth.w.h vr23, \in1 diff --git a/src/loongarch/itx.h b/src/loongarch/itx.h index ef5df65..ed35724 100644 --- a/src/loongarch/itx.h +++ b/src/loongarch/itx.h @@ -49,6 +49,8 @@ decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_4x4, lsx)); decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_4x4, lsx)); decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_4x8, lsx)); + decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x4, lsx)); decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_8x4, lsx)); decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_8x4, lsx)); @@ -117,6 +119,8 @@ static ALWAYS_INLINE void 
itx_dsp_init_loongarch(Dav1dInvTxfmDSPContext *const c c->itxfm_add[TX_4X4][V_ADST] = dav1d_inv_txfm_add_identity_adst_4x4_8bpc_lsx; c->itxfm_add[TX_4X4][H_ADST] = dav1d_inv_txfm_add_adst_identity_4x4_8bpc_lsx; + c->itxfm_add[RTX_4X8][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_4x8_8bpc_lsx; + c->itxfm_add[RTX_8X4][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x4_8bpc_lsx; c->itxfm_add[RTX_8X4][IDTX] = dav1d_inv_txfm_add_identity_identity_8x4_8bpc_lsx; c->itxfm_add[RTX_8X4][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_8x4_8bpc_lsx; -- cgit v1.2.3 From 8c32cde7c10e259b6035c97ee8f5e69c3644c202 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Fri, 1 Dec 2023 16:16:20 +0800 Subject: loongarch: Improve six functions in itx_8bpc.add_16x16 series 1. inv_txfm_add_dct_dct_16x16 2. inv_txfm_add_adst_adst_16x16 3. inv_txfm_add_adst_dct_16x16 4. inv_txfm_add_dct_adst_16x16 5. inv_txfm_add_flipadst_dct_16x16 6. inv_txfm_add_dct_flipadst_16x16 Relative speedup over C code: inv_txfm_add_16x16_adst_adst_0_8bpc_c: 327.6 ( 1.00x) inv_txfm_add_16x16_adst_adst_0_8bpc_lsx: 30.5 (10.74x) inv_txfm_add_16x16_adst_adst_1_8bpc_c: 327.6 ( 1.00x) inv_txfm_add_16x16_adst_adst_1_8bpc_lsx: 30.7 (10.67x) inv_txfm_add_16x16_adst_adst_2_8bpc_c: 327.6 ( 1.00x) inv_txfm_add_16x16_adst_adst_2_8bpc_lsx: 30.5 (10.73x) inv_txfm_add_16x16_adst_dct_0_8bpc_c: 321.0 ( 1.00x) inv_txfm_add_16x16_adst_dct_0_8bpc_lsx: 27.6 (11.64x) inv_txfm_add_16x16_adst_dct_1_8bpc_c: 320.9 ( 1.00x) inv_txfm_add_16x16_adst_dct_1_8bpc_lsx: 27.4 (11.70x) inv_txfm_add_16x16_adst_dct_2_8bpc_c: 320.8 ( 1.00x) inv_txfm_add_16x16_adst_dct_2_8bpc_lsx: 27.5 (11.67x) inv_txfm_add_16x16_dct_adst_0_8bpc_c: 321.1 ( 1.00x) inv_txfm_add_16x16_dct_adst_0_8bpc_lsx: 27.1 (11.85x) inv_txfm_add_16x16_dct_adst_1_8bpc_c: 321.1 ( 1.00x) inv_txfm_add_16x16_dct_adst_1_8bpc_lsx: 27.2 (11.80x) inv_txfm_add_16x16_dct_adst_2_8bpc_c: 329.4 ( 1.00x) inv_txfm_add_16x16_dct_adst_2_8bpc_lsx: 27.2 (12.10x) inv_txfm_add_16x16_dct_dct_0_8bpc_c: 31.9 ( 1.00x) inv_txfm_add_16x16_dct_dct_0_8bpc_lsx: 1.8 (18.14x) inv_txfm_add_16x16_dct_dct_1_8bpc_c: 314.3 ( 1.00x) inv_txfm_add_16x16_dct_dct_1_8bpc_lsx: 23.9 (13.16x) inv_txfm_add_16x16_dct_dct_2_8bpc_c: 314.3 ( 1.00x) inv_txfm_add_16x16_dct_dct_2_8bpc_lsx: 24.1 (13.05x) inv_txfm_add_16x16_dct_flipadst_0_8bpc_c: 321.0 ( 1.00x) inv_txfm_add_16x16_dct_flipadst_0_8bpc_lsx: 27.1 (11.83x) inv_txfm_add_16x16_dct_flipadst_1_8bpc_c: 321.0 ( 1.00x) inv_txfm_add_16x16_dct_flipadst_1_8bpc_lsx: 27.1 (11.84x) inv_txfm_add_16x16_dct_flipadst_2_8bpc_c: 327.7 ( 1.00x) inv_txfm_add_16x16_dct_flipadst_2_8bpc_lsx: 27.1 (12.07x) inv_txfm_add_16x16_flipadst_dct_0_8bpc_c: 322.6 ( 1.00x) inv_txfm_add_16x16_flipadst_dct_0_8bpc_lsx: 28.1 (11.49x) inv_txfm_add_16x16_flipadst_dct_1_8bpc_c: 322.5 ( 1.00x) inv_txfm_add_16x16_flipadst_dct_1_8bpc_lsx: 28.1 (11.48x) inv_txfm_add_16x16_flipadst_dct_2_8bpc_c: 322.7 ( 1.00x) inv_txfm_add_16x16_flipadst_dct_2_8bpc_lsx: 28.0 (11.53x) --- src/loongarch/itx.S | 618 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/loongarch/itx.h | 13 ++ 2 files changed, 631 insertions(+) diff --git a/src/loongarch/itx.S b/src/loongarch/itx.S index a14d77c..eab879a 100644 --- a/src/loongarch/itx.S +++ b/src/loongarch/itx.S @@ -5431,3 +5431,621 @@ function inv_txfm_add_adst_dct_16x8_8bpc_lsx fld.d f31, sp, 56 addi.d sp, sp, 64 endfunc + +function inv_txfm_add_dct_dct_16x16_8bpc_lsx + bnez a3, .NO_HAS_DCONLY_16x16 + + ld.h t2, a2, 0 // dc + vldi vr0, 0x8b5 // 181 + vreplgr2vr.w vr1, t2 + vldi vr5, 0x880 // 128 + vmul.w vr2, vr0, vr1 // dc * 181 + st.h zero, a2, 0 
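+// Note: DC-only shortcut; in scalar terms this branch amounts to
+//   dc  = (dc * 181 + 128) >> 8            // dct16 DC gain
+//   dc  = (dc + 2) >> 2                    // first-pass rounding shift
+//   out = sat16((dc * 181 + 128 + 2048) >> 12)
+// which is then added to each destination pixel with unsigned saturation
+// (DST_ADD_W16 / VLD_DST_ADD_W16).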
+ vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8 + alsl.d t2, a1, a0, 1 + vsrari.w vr2, vr2, 2 // (dc + rnd) >> shift + vldx vr1, a0, a1 + vmadd.w vr5, vr2, vr0 + vldx vr3, t2, a1 + vssrarni.h.w vr5, vr5, 12 + vld vr0, a0, 0 + vld vr2, t2, 0 + + DST_ADD_W16 vr0, vr1, vr2, vr3, vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W16 vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5, + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W16 vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5, + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W16 vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5, + + b .DCT_DCT_16x16_END + +.NO_HAS_DCONLY_16x16: + + malloc_space 512 + + vld_x16 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + +.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + vsrari.h \i, \i, 2 +.endr + + vst_x16 sp, 64, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + + vld_x16 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + +.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + vsrari.h \i, \i, 2 +.endr + + vst_x16 sp, 320, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + + vreplgr2vr.h vr31, zero + +.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \ + 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \ + 464, 480, 496 + vst vr31, a2, \i +.endr + + vld_x8 sp, 64, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + vld_x8 sp, 320, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + vst_x8 sp, 64, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16 + vst_x8 sp, 320, 16, vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + + vld_x8 sp, 192, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + vld_x8 sp, 448, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + alsl.d t2, a1, a0, 1 + vld vr4, sp, 64 + vld vr5, sp, 80 + vld vr6, sp, 96 + vld vr7, sp, 112 + VLD_DST_ADD_W16 vr4, vr22, vr5, vr18, vr6, vr17, vr7, vr28, 4 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + vld vr4, sp, 128 + vld vr5, sp, 144 + vld vr6, sp, 160 + vld vr7, sp, 176 + VLD_DST_ADD_W16 vr4, vr20, vr5, vr14, vr6, vr15, vr7, vr16, 4 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + vld vr4, sp, 320 + vld vr5, sp, 336 + vld vr6, sp, 352 + vld vr7, sp, 368 + VLD_DST_ADD_W16 vr4, vr27, vr5, vr30, vr6, vr23, vr7, vr21, 4 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + vld vr4, sp, 384 + vld vr5, sp, 400 + vld vr6, sp, 416 + vld vr7, sp, 432 + VLD_DST_ADD_W16 vr4, vr29, vr5, vr26, vr6, vr25, 
vr7, vr24, 4 + + free_space 512 + +.DCT_DCT_16x16_END: +endfunc + +function inv_txfm_add_adst_adst_16x16_8bpc_lsx + + malloc_space 256+256 + + addi.d t1, sp, 64 + addi.d t2, a2, 0 + + vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 + + adst16_core_lsx transpose8x8, 2, vst_x16 + + addi.d t2, a2, 16 + addi.d t1, t1, 256 + + vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 + + adst16_core_lsx transpose8x8, 2, vst_x16 + + vreplgr2vr.h vr23, zero + +.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \ + 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \ + 464, 480, 496 + vst vr23, a2, \i +.endr + + addi.d t2, sp, 64 + + vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 + + adst16_core_lsx , , + + // out0 out1 out2 out3 out4 out5 out6 out7 + // vr14 vr18 vr2 vr5 vr7 vr4 vr8 vr10 + // out8 out9 out10 out11 out12 out13 out14 out15 + // vr1 vr0 vr19 vr17 vr3 vr9 vr13 vr15 + + addi.d t2, a0, 0 + alsl.d t3, a1, a0, 1 + addi.d t4, a0, 0 + add.d t5, a1, a0 + + adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10 + + alsl.d t2, a1, t2, 2 + alsl.d t3, a1, t3, 2 + + alsl.d t4, a1, t4, 1 + alsl.d t5, a1, t5, 1 + + adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15 + + addi.d t2, sp, 64+128 + + vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 + + adst16_core_lsx , , + + addi.d a0, a0, 8 + + addi.d t2, a0, 0 + alsl.d t3, a1, a0, 1 + addi.d t4, a0, 0 + add.d t5, a1, a0 + + adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10 + + alsl.d t2, a1, t2, 2 + alsl.d t3, a1, t3, 2 + + alsl.d t4, a1, t4, 1 + alsl.d t5, a1, t5, 1 + + adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15 + + free_space 256+256 +endfunc + +function inv_txfm_add_adst_dct_16x16_8bpc_lsx + malloc_space 256+256 + + addi.d t1, sp, 64 + addi.d t2, a2, 0 + + vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 + + adst16_core_lsx transpose8x8, 2, vst_x16 + + addi.d t2, a2, 16 + addi.d t1, t1, 256 + + vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 + + adst16_core_lsx transpose8x8, 2, vst_x16 + + vreplgr2vr.h vr23, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \ + 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \ + 464, 480, 496 + vst vr23, a2, \i +.endr + + addi.d t2, sp, 64 + + vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + vld_x8 t2, 256, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + vst_x8 t2, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16 + vst_x8 t2, 256, 16, vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + + addi.d t2, sp, 64+128 + + vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + vld_x8 t2, 256, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + alsl.d t2, a1, a0, 1 + vld vr4, sp, 64 + vld vr5, sp, 80 + vld vr6, sp, 96 + vld vr7, sp, 112 + VLD_DST_ADD_W16 vr4, vr22, vr5, vr18, vr6, vr17, vr7, vr28, 4 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + vld vr4, sp, 128 + vld vr5, sp, 144 + vld vr6, sp, 160 + vld vr7, sp, 176 + VLD_DST_ADD_W16 vr4, vr20, vr5, vr14, vr6, vr15, vr7, vr16, 4 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, 
a0, 1 + vld vr4, sp, 320 + vld vr5, sp, 336 + vld vr6, sp, 352 + vld vr7, sp, 368 + VLD_DST_ADD_W16 vr4, vr27, vr5, vr30, vr6, vr23, vr7, vr21, 4 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + vld vr4, sp, 384 + vld vr5, sp, 400 + vld vr6, sp, 416 + vld vr7, sp, 432 + VLD_DST_ADD_W16 vr4, vr29, vr5, vr26, vr6, vr25, vr7, vr24, 4 + + free_space 256+256 +endfunc + +function inv_txfm_add_dct_adst_16x16_8bpc_lsx + malloc_space 256+256 + + vld_x16 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + +.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + vsrari.h \i, \i, 2 +.endr + + vst_x16 sp, 64, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + + vld_x16 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + +.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + vsrari.h \i, \i, 2 +.endr + + vst_x16 sp, 320, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + + vreplgr2vr.h vr31, zero + +.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \ + 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \ + 464, 480, 496 + vst vr31, a2, \i +.endr + + addi.d t2, sp, 64 + + vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 + + adst16_core_lsx , , + + // out0 out1 out2 out3 out4 out5 out6 out7 + // vr14 vr18 vr2 vr5 vr7 vr4 vr8 vr10 + // out8 out9 out10 out11 out12 out13 out14 out15 + // vr1 vr0 vr19 vr17 vr3 vr9 vr13 vr15 + + addi.d t2, a0, 0 + alsl.d t3, a1, a0, 1 + addi.d t4, a0, 0 + add.d t5, a1, a0 + + adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10 + + alsl.d t2, a1, t2, 2 + alsl.d t3, a1, t3, 2 + + alsl.d t4, a1, t4, 1 + alsl.d t5, a1, t5, 1 + + adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15 + + addi.d t2, sp, 64+128 + + vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 + + adst16_core_lsx , , + + addi.d a0, a0, 8 + + addi.d t2, a0, 0 + alsl.d t3, a1, a0, 1 + addi.d t4, a0, 0 + add.d t5, a1, a0 + + adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10 + + alsl.d t2, a1, t2, 2 + alsl.d t3, a1, t3, 2 + + alsl.d t4, a1, t4, 1 + alsl.d t5, a1, t5, 1 + + adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15 + + free_space 256+256 +endfunc + +const shufb + .byte 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 +endconst + +function inv_txfm_add_flipadst_dct_16x16_8bpc_lsx + malloc_space 256+256 + + addi.d t1, sp, 64 + addi.d t2, a2, 0 + + vld_x16 
t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 + + adst16_core_lsx transpose8x8, 2, vst_x16 + + addi.d t2, a2, 16 + addi.d t1, t1, 256 + + vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 + + adst16_core_lsx transpose8x8, 2, vst_x16 + + vreplgr2vr.h vr23, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \ + 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \ + 464, 480, 496 + vst vr23, a2, \i +.endr + + addi.d t2, sp, 64 + + vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + vld_x8 t2, 256, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + la.local t0, shufb + vld vr0, t0, 0 + +.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + vshuf.b \i, \i, \i, vr0 +.endr + + vst_x8 t2, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16 + vst_x8 t2, 256, 16, vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + + addi.d t2, sp, 64+128 + + vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + vld_x8 t2, 256, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + la.local t0, shufb + vld vr0, t0, 0 + +.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + vshuf.b \i, \i, \i, vr0 +.endr + + alsl.d t2, a1, a0, 1 + vld vr4, sp, 64 + vld vr5, sp, 80 + vld vr6, sp, 96 + vld vr7, sp, 112 + VLD_DST_ADD_W16 vr22, vr4, vr18, vr5, vr17, vr6, vr28, vr7, 4 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + vld vr4, sp, 128 + vld vr5, sp, 144 + vld vr6, sp, 160 + vld vr7, sp, 176 + VLD_DST_ADD_W16 vr20, vr4, vr14, vr5, vr15, vr6, vr16, vr7, 4 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + vld vr4, sp, 320 + vld vr5, sp, 336 + vld vr6, sp, 352 + vld vr7, sp, 368 + VLD_DST_ADD_W16 vr27, vr4, vr30, vr5, vr23, vr6, vr21, vr7, 4 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + vld vr4, sp, 384 + vld vr5, sp, 400 + vld vr6, sp, 416 + vld vr7, sp, 432 + VLD_DST_ADD_W16 vr29, vr4, vr26, vr5, vr25, vr6, vr24, vr7, 4 + + free_space 256+256 +endfunc + +function inv_txfm_add_dct_flipadst_16x16_8bpc_lsx + malloc_space 256+256 + + vld_x16 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + +.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + vsrari.h \i, \i, 2 +.endr + + vst_x16 sp, 64, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + + vld_x16 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + +.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, 
vr25, vr24 + vsrari.h \i, \i, 2 +.endr + + vst_x16 sp, 320, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + + vreplgr2vr.h vr31, zero + +.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \ + 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \ + 464, 480, 496 + vst vr31, a2, \i +.endr + + addi.d t2, sp, 64 + + vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 + + adst16_core_lsx , , + + // out0 out1 out2 out3 out4 out5 out6 out7 + // vr14 vr18 vr2 vr5 vr7 vr4 vr8 vr10 + // out8 out9 out10 out11 out12 out13 out14 out15 + // vr1 vr0 vr19 vr17 vr3 vr9 vr13 vr15 + + la.local t0, shufb + vld vr31, t0, 0 + + addi.d t2, a0, 0 + alsl.d t3, a1, a0, 1 + addi.d t4, a0, 0 + add.d t5, a1, a0 + + adst16_core_finish_lsx vr15, vr13, vr9, vr3, vr17, vr19, vr0, vr1 + + alsl.d t2, a1, t2, 2 + alsl.d t3, a1, t3, 2 + + alsl.d t4, a1, t4, 1 + alsl.d t5, a1, t5, 1 + + adst16_core_finish_lsx vr10, vr8, vr4, vr7, vr5, vr2, vr18, vr14 + + addi.d t2, sp, 64+128 + + vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 + + adst16_core_lsx , , + + addi.d a0, a0, 8 + + la.local t0, shufb + vld vr31, t0, 0 + + addi.d t2, a0, 0 + alsl.d t3, a1, a0, 1 + addi.d t4, a0, 0 + add.d t5, a1, a0 + + adst16_core_finish_lsx vr15, vr13, vr9, vr3, vr17, vr19, vr0, vr1 + + alsl.d t2, a1, t2, 2 + alsl.d t3, a1, t3, 2 + + alsl.d t4, a1, t4, 1 + alsl.d t5, a1, t5, 1 + + adst16_core_finish_lsx vr10, vr8, vr4, vr7, vr5, vr2, vr18, vr14 + + free_space 256+256 + +endfunc diff --git a/src/loongarch/itx.h b/src/loongarch/itx.h index ed35724..e48cef7 100644 --- a/src/loongarch/itx.h +++ b/src/loongarch/itx.h @@ -93,6 +93,13 @@ decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_8x16, lsx)); decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x8, lsx)); decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_16x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x16, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_16x16, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_16x16, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_16x16, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_16x16, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_16x16, lsx)); + static ALWAYS_INLINE void itx_dsp_init_loongarch(Dav1dInvTxfmDSPContext *const c, int bpc) { #if BITDEPTH == 8 const unsigned flags = dav1d_get_cpu_flags(); @@ -163,6 +170,12 @@ static ALWAYS_INLINE void itx_dsp_init_loongarch(Dav1dInvTxfmDSPContext *const c c->itxfm_add[RTX_16X8][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_16x8_8bpc_lsx; c->itxfm_add[RTX_16X8][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_16x8_8bpc_lsx; + c->itxfm_add[TX_16X16][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_16x16_8bpc_lsx; + c->itxfm_add[TX_16X16][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_16x16_8bpc_lsx; + c->itxfm_add[TX_16X16][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_16x16_8bpc_lsx; + c->itxfm_add[TX_16X16][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_16x16_8bpc_lsx; + c->itxfm_add[TX_16X16][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_16x16_8bpc_lsx; + c->itxfm_add[TX_16X16][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_16x16_8bpc_lsx; #endif } -- cgit v1.2.3 From fbefb34ae91c3a144926e760008713f7bcde981c Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Fri, 1 Dec 2023 16:19:13 +0800 Subject: loongarch: Improve one functions in itx_8bpc.add_8x32 series 1. 
inv_txfm_add_dct_dct_8x32 Relative speedup over C code: inv_txfm_add_8x32_dct_dct_0_8bpc_c: 33.3 ( 1.00x) inv_txfm_add_8x32_dct_dct_0_8bpc_lsx: 2.1 (15.58x) inv_txfm_add_8x32_dct_dct_1_8bpc_c: 311.1 ( 1.00x) inv_txfm_add_8x32_dct_dct_1_8bpc_lsx: 24.9 (12.49x) inv_txfm_add_8x32_dct_dct_2_8bpc_c: 308.4 ( 1.00x) inv_txfm_add_8x32_dct_dct_2_8bpc_lsx: 24.9 (12.37x) inv_txfm_add_8x32_dct_dct_3_8bpc_c: 309.3 ( 1.00x) inv_txfm_add_8x32_dct_dct_3_8bpc_lsx: 25.0 (12.37x) inv_txfm_add_8x32_dct_dct_4_8bpc_c: 308.4 ( 1.00x) inv_txfm_add_8x32_dct_dct_4_8bpc_lsx: 25.0 (12.35x) --- src/loongarch/itx.S | 442 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/loongarch/itx.h | 5 + 2 files changed, 447 insertions(+) diff --git a/src/loongarch/itx.S b/src/loongarch/itx.S index eab879a..df6ba6e 100644 --- a/src/loongarch/itx.S +++ b/src/loongarch/itx.S @@ -6049,3 +6049,445 @@ function inv_txfm_add_dct_flipadst_16x16_8bpc_lsx free_space 256+256 endfunc + +function inv_txfm_add_dct_dct_8x32_8bpc_lsx + bnez a3, .NO_HAS_DCONLY_8x32 + + ld.h t2, a2, 0 // dc + vldi vr0, 0x8b5 // 181 + vreplgr2vr.w vr1, t2 + vldi vr5, 0x880 // 128 + vmul.w vr2, vr0, vr1 // dc * 181 + st.h zero, a2, 0 + vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8 + vld vr10, a0, 0 // 0 1 2 3 4 5 6 7 + vsrari.w vr2, vr2, 2 // (dc + rnd) >> shift + vldx vr11, a0, a1 // 8 9 10 11 12 13 14 15 + alsl.d t2, a1, a0, 1 + vmadd.w vr5, vr2, vr0 + vld vr12, t2, 0 // 16 17 18 19 20 21 22 23 + vssrarni.h.w vr5, vr5, 12 + vldx vr13, t2, a1 // 24 25 26 27 28 29 30 31 + + DST_ADD_W8 vr10, vr11, vr12, vr13, vr5, vr5, vr5, vr5 + +.rept 7 + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr5, vr5, vr5, vr5 +.endr + + b .DCT_DCT_8X32_END + +.NO_HAS_DCONLY_8x32: + malloc_space 512 + + vld_x8 a2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + la.local t0, idct_coeffs + + dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2 + +.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 + vsrari.h \i, \i, 2 +.endr + + LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + vst_x8 sp, 64, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 + + vld_x8 a2, 16, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2 + +.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 + vsrari.h \i, \i, 2 +.endr + + LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + vst_x8 sp, 192, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 + + vld_x8 a2, 32, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + la.local t0, idct_coeffs + + dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2 + +.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 + vsrari.h \i, \i, 2 +.endr + + LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + vst_x8 sp, 320, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 + + vld_x8 a2, 48, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2 + +.irp i, vr11, 
vr12, vr13, vr14, vr15, vr16, vr17, vr18 + vsrari.h \i, \i, 2 +.endr + + LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + vst_x8 sp, 448, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 + + vreplgr2vr.h vr31, zero + +.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \ + 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \ + 464, 480, 496 + vst vr31, a2, \i +.endr + + addi.d t2, sp, 64 + addi.d t3, sp, 64 + + vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + vst_x16 t3, 0, 32, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + + vld_x16 t2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + // vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + // in1 in3 in5 in7 in9 in11 in13 in15 + // vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + // in17 in19 in21 in23 in25 in27 in29 in31 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 64 // 201 + vldrepl.w vr21, t0, 68 // 4091 + + vmul_vmadd_w vr0, vr30, vr21, vr20, vr8, vr9 + vssrarni.h.w vr9, vr8, 12 // t31a + vmul_vmsub_w vr0, vr30, vr20, vr21, vr11, vr10 + vssrarni.h.w vr10, vr11, 12 // t16a + + vldrepl.w vr20, t0, 72 // 3035 + vldrepl.w vr21, t0, 76 // 2751 + vmul_vmadd_w vr19, vr7, vr21, vr20, vr11, vr0 + vssrarni.h.w vr0, vr11, 12 // t30a + vmul_vmsub_w vr19, vr7, vr20, vr21, vr11, vr30 + vssrarni.h.w vr30, vr11, 12 // t17a + + vldrepl.w vr20, t0, 80 // 1751 + vldrepl.w vr21, t0, 84 // 3703 + vmul_vmadd_w vr4, vr26, vr21, vr20, vr8, vr7 + vssrarni.h.w vr7, vr8, 12 // t29a + vmul_vmsub_w vr4, vr26, vr20, vr21, vr8, vr19 + vssrarni.h.w vr19, vr8, 12 // t18a + + vldrepl.w vr20, t0, 88 // 3857 + vldrepl.w vr21, t0, 92 // 1380 + vmul_vmadd_w vr27, vr3, vr21, vr20, vr8, vr4 + vssrarni.h.w vr4, vr8, 12 // t28a + vmul_vmsub_w vr27, vr3, vr20, vr21, vr8, vr26 + vssrarni.h.w vr26, vr8, 12 // t19a + + vldrepl.w vr20, t0, 96 // 995 + vldrepl.w vr21, t0, 100 // 3973 + vmul_vmadd_w vr2, vr28, vr21, vr20, vr8, vr3 + vssrarni.h.w vr3, vr8, 12 // t27a + vmul_vmsub_w vr2, vr28, vr20, vr21, vr8, vr27 + vssrarni.h.w vr27, vr8, 12 // t20a + + vldrepl.w vr20, t0, 104 // 3513 + vldrepl.w vr21, t0, 108 // 2106 + vmul_vmadd_w vr25, vr5, vr21, vr20, vr8, vr2 + vssrarni.h.w vr2, vr8, 12 // t26a + vmul_vmsub_w vr25, vr5, vr20, vr21, vr8, vr28 + vssrarni.h.w vr28, vr8, 12 // t21a + + vldrepl.w vr20, t0, 112 // 2440 -> 1220 + vldrepl.w vr21, t0, 116 // 3290 -> 1645 + vmul_vmadd_w vr6, vr24, vr21, vr20, vr8, vr5 + vssrarni.h.w vr5, vr8, 12 // t25a + vmul_vmsub_w vr6, vr24, vr20, vr21, vr8, vr25 + vssrarni.h.w vr25, vr8, 12 // t22a + + vldrepl.w vr20, t0, 120 // 4052 + vldrepl.w vr21, t0, 124 // 601 + vmul_vmadd_w vr29, vr1, vr21, vr20, vr8, vr6 + vssrarni.h.w vr6, vr8, 12 // t24a + vmul_vmsub_w vr29, vr1, vr20, vr21, vr8, vr24 + vssrarni.h.w vr24, vr8, 12 // t23a + + vsadd.h vr1, vr10, vr30 // t16 + vssub.h vr29, vr10, vr30 // t17 + vssub.h vr8, vr26, vr19 // t18 + vsadd.h vr31, vr26, vr19 // t19 + vsadd.h vr10, vr27, vr28 // t20 + vssub.h vr30, vr27, vr28 // t21 + vssub.h vr19, vr24, vr25 // t22 + vsadd.h vr26, vr24, vr25 // t23 + vsadd.h vr27, vr6, vr5 // t24 + vssub.h vr28, vr6, vr5 // t25 + vssub.h vr24, vr3, vr2 // t26 + vsadd.h vr25, vr3, vr2 // t27 + vsadd.h vr5, vr4, vr7 // t28 + vssub.h vr6, vr4, vr7 // t29 + vssub.h 
vr2, vr9, vr0 // t30 + vsadd.h vr3, vr9, vr0 // t31 + + vldrepl.w vr20, t0, 16 // 799 + vldrepl.w vr21, t0, 20 // 4017 + vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7 + vssrarni.h.w vr7, vr4, 12 // t30a + vmul_vmsub_w vr2, vr29, vr20, vr21, vr4, vr0 + vssrarni.h.w vr0, vr4, 12 // t17a + vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9 + vneg.w vr4, vr4 + vneg.w vr9, vr9 + vssrarni.h.w vr9, vr4, 12 // t18a + vmul_vmsub_w vr6, vr8, vr20, vr21, vr4, vr2 + vssrarni.h.w vr2, vr4, 12 // t29a + + vldrepl.w vr20, t0, 24 // 3406 -> 1703 + vldrepl.w vr21, t0, 28 // 2276 -> 1138 + vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29 + vssrarni.h.w vr29, vr4, 12 // t26a + vmul_vmsub_w vr24, vr30, vr20, vr21, vr4, vr6 + vssrarni.h.w vr6, vr4, 12 // t21a + + vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8 + vneg.w vr4, vr4 + vneg.w vr8, vr8 + vssrarni.h.w vr8, vr4, 12 // t22a + vmul_vmsub_w vr28, vr19, vr20, vr21, vr4, vr24 + vssrarni.h.w vr24, vr4, 12 // t25a + + vsadd.h vr4, vr1, vr31 // t16a + vssub.h vr30, vr1, vr31 // t19a + vsadd.h vr19, vr0, vr9 // t17 + vssub.h vr28, vr0, vr9 // t18 + vssub.h vr1, vr26, vr10 // t20a + vsadd.h vr31, vr26, vr10 // t23a + vssub.h vr0, vr8, vr6 // t21 + vsadd.h vr9, vr8, vr6 // t22 + vsadd.h vr10, vr27, vr25 // t24a + vssub.h vr26, vr27, vr25 // t27a + vsadd.h vr6, vr24, vr29 // t25 + vssub.h vr8, vr24, vr29 // t26 + vssub.h vr25, vr3, vr5 // t28a + vsadd.h vr27, vr3, vr5 // t31a + vssub.h vr24, vr7, vr2 // t29 + vsadd.h vr29, vr7, vr2 // t30 + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5 + vssrarni.h.w vr5, vr3, 12 // t29a + vmul_vmsub_w vr24, vr28, vr20, vr21, vr3, vr2 + vssrarni.h.w vr2, vr3, 12 // 18a + + vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7 + vssrarni.h.w vr7, vr3, 12 // t28 + vmul_vmsub_w vr25, vr30, vr20, vr21, vr3, vr24 + vssrarni.h.w vr24, vr3, 12 // t19 + + vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28 + vneg.w vr3, vr3 + vneg.w vr28, vr28 + vssrarni.h.w vr28, vr3, 12 // t20 + vmul_vmsub_w vr26, vr1, vr20, vr21, vr3, vr25 + vssrarni.h.w vr25, vr3, 12 // t27 + + vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30 + vneg.w vr3, vr3 + vneg.w vr30, vr30 + vssrarni.h.w vr30, vr3, 12 // t21a + vmul_vmsub_w vr8, vr0, vr20, vr21, vr3, vr1 + vssrarni.h.w vr1, vr3, 12 // t26a + + vsadd.h vr3, vr4, vr31 // t16 + vssub.h vr26, vr4, vr31 // t23 + vsadd.h vr0, vr19, vr9 // t17a + vssub.h vr8, vr19, vr9 // t22a + vsadd.h vr4, vr2, vr30 // t18 + vssub.h vr31, vr2, vr30 // t21 + vsadd.h vr9, vr24, vr28 // t19a + vssub.h vr19, vr24, vr28 // t20a + vssub.h vr2, vr27, vr10 // t24 + vsadd.h vr30, vr27, vr10 // t31 + vssub.h vr24, vr29, vr6 // t25a + vsadd.h vr28, vr29, vr6 // t30a + vssub.h vr10, vr5, vr1 // t26 + vsadd.h vr27, vr5, vr1 // t29 + vssub.h vr6, vr7, vr25 // t27a + vsadd.h vr29, vr7, vr25 // t28a + + vldrepl.w vr20, t0, 0 // 2896 + vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5 + vssrarni.h.w vr5, vr1, 12 // t20 + vmul_vmadd_w vr6, vr19, vr20, vr20, vr1, vr7 + vssrarni.h.w vr7, vr1, 12 // t27 + + vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25 + vssrarni.h.w vr25, vr1, 12 // t21a + vmul_vmadd_w vr10, vr31, vr20, vr20, vr1, vr6 + vssrarni.h.w vr6, vr1, 12 // t26a + + vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19 + vssrarni.h.w vr19, vr1, 12 // t22 + vmul_vmadd_w vr24, vr8, vr20, vr20, vr1, vr10 + vssrarni.h.w vr10, vr1, 12 // t25 + + vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31 + vssrarni.h.w vr31, vr1, 12 // t23a + vmul_vmadd_w vr2, vr26, vr20, vr20, vr1, vr8 + vssrarni.h.w vr8, vr1, 12 // t24a + + // t31 t30a 
t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16 + // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3 + + vld_x8 t3, 0, 32, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 + + vsadd.h vr1, vr11, vr30 // c[0] + vssub.h vr2, vr11, vr30 // c[31] + vsadd.h vr24, vr12, vr28 // c[1] + vssub.h vr26, vr12, vr28 // c[30] + vsadd.h vr11, vr13, vr27 // c[2] + vssub.h vr30, vr13, vr27 // c[29] + vsadd.h vr12, vr14, vr29 // c[3] + vssub.h vr28, vr14, vr29 // c[28] + vsadd.h vr13, vr15, vr7 // c[4] + vssub.h vr27, vr15, vr7 // c[27] + vsadd.h vr14, vr16, vr6 // c[5] + vssub.h vr29, vr16, vr6 // c[26] + vsadd.h vr7, vr17, vr10 // c[6] + vssub.h vr15, vr17, vr10 // c[25] + vsadd.h vr6, vr18, vr8 // c[7] + vssub.h vr16, vr18, vr8 // c[24] + +.irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ + vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 + vsrari.h \i, \i, 4 +.endr + + vst_x8 t2, 0, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 + + vst_x8 t2, 128, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 + + vld_x8 t3, 256, 32, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 + + vsadd.h vr1, vr11, vr31 // c[8] + vssub.h vr2, vr11, vr31 // c[23] + vsadd.h vr24, vr12, vr19 // c[9] + vssub.h vr26, vr12, vr19 // c[22] + vsadd.h vr11, vr13, vr25 // c[10] + vssub.h vr30, vr13, vr25 // c[21] + vsadd.h vr12, vr14, vr5 // c[11] + vssub.h vr28, vr14, vr5 // c[20] + vsadd.h vr13, vr15, vr9 // c[12] + vssub.h vr27, vr15, vr9 // c[19] + vsadd.h vr14, vr16, vr4 // c[13] + vssub.h vr29, vr16, vr4 // c[18] + vsadd.h vr7, vr17, vr0 // c[14] + vssub.h vr15, vr17, vr0 // c[17] + vsadd.h vr6, vr18, vr3 // c[15] + vssub.h vr16, vr18, vr3 // c[16] + +.irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ + vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 + vsrari.h \i, \i, 4 +.endr + + vst_x8 t2, 256, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 + + vst_x8 t2, 384, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 + + alsl.d t2, a1, a0, 1 + addi.d t3, sp, 64 + + vld vr4, t3, 0 + vld vr5, t3, 16 + vld vr6, t3, 32 + vld vr7, t3, 48 + VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 + + addi.d t3, sp, 64+64 + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, t2, 2 + vld vr4, t3, 0 + vld vr5, t3, 16 + vld vr6, t3, 32 + vld vr7, t3, 48 + VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 + + addi.d t3, sp, 64+256 + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, t2, 2 + vld vr4, t3, 0 + vld vr5, t3, 16 + vld vr6, t3, 32 + vld vr7, t3, 48 + VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 + + addi.d t3, t3, 64 + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, t2, 2 + vld vr4, t3, 0 + vld vr5, t3, 16 + vld vr6, t3, 32 + vld vr7, t3, 48 + VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 + + addi.d t3, sp, 64+384 + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, t2, 2 + vld vr4, t3, 0 + vld vr5, t3, 16 + vld vr6, t3, 32 + vld vr7, t3, 48 + VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 + + addi.d t3, t3, 64 + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, t2, 2 + vld vr4, t3, 0 + vld vr5, t3, 16 + vld vr6, t3, 32 + vld vr7, t3, 48 + VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 + + addi.d t3, sp, 64+128 + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, t2, 2 + vld vr4, t3, 0 + vld vr5, t3, 16 + vld vr6, t3, 32 + vld vr7, t3, 48 + VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 + + addi.d t3, t3, 64 + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, t2, 2 + vld vr4, t3, 0 + vld vr5, t3, 16 + vld vr6, t3, 32 + vld vr7, t3, 48 + VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 + + free_space 512 +.DCT_DCT_8X32_END: +endfunc diff --git a/src/loongarch/itx.h b/src/loongarch/itx.h index e48cef7..eb76881 100644 --- a/src/loongarch/itx.h +++ b/src/loongarch/itx.h @@ -100,6 +100,8 @@ 
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_16x16, lsx)); decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_16x16, lsx)); decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_16x16, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x32, lsx)); + static ALWAYS_INLINE void itx_dsp_init_loongarch(Dav1dInvTxfmDSPContext *const c, int bpc) { #if BITDEPTH == 8 const unsigned flags = dav1d_get_cpu_flags(); @@ -176,6 +178,9 @@ static ALWAYS_INLINE void itx_dsp_init_loongarch(Dav1dInvTxfmDSPContext *const c c->itxfm_add[TX_16X16][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_16x16_8bpc_lsx; c->itxfm_add[TX_16X16][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_16x16_8bpc_lsx; c->itxfm_add[TX_16X16][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_16x16_8bpc_lsx; + + c->itxfm_add[RTX_8X32][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x32_8bpc_lsx; + #endif } -- cgit v1.2.3 From 90f8fb77c7f2bd619d6a0abff7f79cb3daa9323b Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Fri, 1 Dec 2023 16:22:35 +0800 Subject: loongarch: Improve one functions in itx_8bpc.add_32x32 series 1. inv_txfm_add_dct_dct_32x32 Relative speedup over C code: inv_txfm_add_32x32_dct_dct_0_8bpc_c: 122.2 ( 1.00x) inv_txfm_add_32x32_dct_dct_0_8bpc_lsx: 5.5 (22.06x) inv_txfm_add_32x32_dct_dct_1_8bpc_c: 1466.2 ( 1.00x) inv_txfm_add_32x32_dct_dct_1_8bpc_lsx: 108.7 (13.49x) inv_txfm_add_32x32_dct_dct_2_8bpc_c: 1465.5 ( 1.00x) inv_txfm_add_32x32_dct_dct_2_8bpc_lsx: 108.8 (13.47x) inv_txfm_add_32x32_dct_dct_3_8bpc_c: 1465.6 ( 1.00x) inv_txfm_add_32x32_dct_dct_3_8bpc_lsx: 108.8 (13.46x) inv_txfm_add_32x32_dct_dct_4_8bpc_c: 1466.2 ( 1.00x) inv_txfm_add_32x32_dct_dct_4_8bpc_lsx: 108.8 (13.47x) --- src/loongarch/itx.S | 491 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/loongarch/itx.h | 2 + 2 files changed, 493 insertions(+) diff --git a/src/loongarch/itx.S b/src/loongarch/itx.S index df6ba6e..0c5dc69 100644 --- a/src/loongarch/itx.S +++ b/src/loongarch/itx.S @@ -6491,3 +6491,494 @@ function inv_txfm_add_dct_dct_8x32_8bpc_lsx free_space 512 .DCT_DCT_8X32_END: endfunc + +.macro dct_8x32_core_lsx in1, in2, vst_start0, vst_start1, vst_start2, \ + vst_start3, transpose8x8, shift + + // vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + // in1 in3 in5 in7 in9 in11 in13 in15 + // vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + // in17 in19 in21 in23 in25 in27 in29 in31 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 64 // 201 + vldrepl.w vr21, t0, 68 // 4091 + + vmul_vmadd_w vr0, vr30, vr21, vr20, vr8, vr9 + vmul_vmsub_w vr0, vr30, vr20, vr21, vr11, vr10 + vssrarni.h.w vr9, vr8, 12 // t31a + vssrarni.h.w vr10, vr11, 12 // t16a + + vldrepl.w vr20, t0, 72 // 3035 + vldrepl.w vr21, t0, 76 // 2751 + vmul_vmadd_w vr19, vr7, vr21, vr20, vr8, vr0 + vmul_vmsub_w vr19, vr7, vr20, vr21, vr11, vr30 + vssrarni.h.w vr0, vr8, 12 // t30a + vssrarni.h.w vr30, vr11, 12 // t17a + + vldrepl.w vr20, t0, 80 // 1751 + vldrepl.w vr21, t0, 84 // 3703 + vmul_vmadd_w vr4, vr26, vr21, vr20, vr8, vr7 + vmul_vmsub_w vr4, vr26, vr20, vr21, vr11, vr19 + vssrarni.h.w vr7, vr8, 12 // t29a + vssrarni.h.w vr19, vr11, 12 // t18a + + vldrepl.w vr20, t0, 88 // 3857 + vldrepl.w vr21, t0, 92 // 1380 + vmul_vmadd_w vr27, vr3, vr21, vr20, vr8, vr4 + vmul_vmsub_w vr27, vr3, vr20, vr21, vr11, vr26 + vssrarni.h.w vr4, vr8, 12 // t28a + vssrarni.h.w vr26, vr11, 12 // t19a + + vldrepl.w vr20, t0, 96 // 995 + vldrepl.w vr21, t0, 100 // 3973 + vmul_vmadd_w vr2, vr28, vr21, vr20, vr8, vr3 + vmul_vmsub_w vr2, vr28, vr20, vr21, vr11, vr27 + vssrarni.h.w vr3, vr8, 12 // t27a + vssrarni.h.w vr27, vr11, 12 // t20a + + 
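+    // Each vmul_vmadd_w/vmul_vmsub_w pair above, followed by vssrarni.h.w with an
+    // immediate of 12, performs one fixed-point rotation of the 32-point DCT. A
+    // rough C sketch of the pattern, assuming the helpers widen to 32 bit as their
+    // names suggest (variable names here are illustrative, not from the source):
+    //     t_hi = (int16_t)((a * c1 + b * c0 + 2048) >> 12); // vmul_vmadd_w + vssrarni.h.w
+    //     t_lo = (int16_t)((a * c0 - b * c1 + 2048) >> 12); // vmul_vmsub_w + vssrarni.h.w
+    // where c0/c1 come from idct_coeffs and the narrowing saturates to int16_t.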
vldrepl.w vr20, t0, 104 // 3513 + vldrepl.w vr21, t0, 108 // 2106 + vmul_vmadd_w vr25, vr5, vr21, vr20, vr8, vr2 + vmul_vmsub_w vr25, vr5, vr20, vr21, vr11, vr28 + vssrarni.h.w vr2, vr8, 12 // t26a + vssrarni.h.w vr28, vr11, 12 // t21a + + vldrepl.w vr20, t0, 112 // 2440 -> 1220 + vldrepl.w vr21, t0, 116 // 3290 -> 1645 + vmul_vmadd_w vr6, vr24, vr21, vr20, vr8, vr5 + vmul_vmsub_w vr6, vr24, vr20, vr21, vr11, vr25 + vssrarni.h.w vr5, vr8, 12 // t25a + vssrarni.h.w vr25, vr11, 12 // t22a + + vldrepl.w vr20, t0, 120 // 4052 + vldrepl.w vr21, t0, 124 // 601 + vmul_vmadd_w vr29, vr1, vr21, vr20, vr8, vr6 + vmul_vmsub_w vr29, vr1, vr20, vr21, vr11, vr24 + vssrarni.h.w vr6, vr8, 12 // t24a + vssrarni.h.w vr24, vr11, 12 // t23a + + vsadd.h vr1, vr10, vr30 // t16 + vssub.h vr29, vr10, vr30 // t17 + vssub.h vr8, vr26, vr19 // t18 + vsadd.h vr31, vr26, vr19 // t19 + vsadd.h vr10, vr27, vr28 // t20 + vssub.h vr30, vr27, vr28 // t21 + vssub.h vr19, vr24, vr25 // t22 + vsadd.h vr26, vr24, vr25 // t23 + vsadd.h vr27, vr6, vr5 // t24 + vssub.h vr28, vr6, vr5 // t25 + vssub.h vr24, vr3, vr2 // t26 + vsadd.h vr25, vr3, vr2 // t27 + vsadd.h vr5, vr4, vr7 // t28 + vssub.h vr6, vr4, vr7 // t29 + vssub.h vr2, vr9, vr0 // t30 + vsadd.h vr3, vr9, vr0 // t31 + + vldrepl.w vr20, t0, 16 // 799 + vldrepl.w vr21, t0, 20 // 4017 + vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7 + vmul_vmsub_w vr2, vr29, vr20, vr21, vr11, vr0 + vssrarni.h.w vr7, vr4, 12 // t30a + vssrarni.h.w vr0, vr11, 12 // t17a + vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9 + vneg.w vr4, vr4 + vneg.w vr9, vr9 + vmul_vmsub_w vr6, vr8, vr20, vr21, vr11, vr2 + vssrarni.h.w vr9, vr4, 12 // t18a + vssrarni.h.w vr2, vr11, 12 // t29a + + vldrepl.w vr20, t0, 24 // 3406 -> 1703 + vldrepl.w vr21, t0, 28 // 2276 -> 1138 + vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29 + vmul_vmsub_w vr24, vr30, vr20, vr21, vr11, vr6 + vssrarni.h.w vr29, vr4, 12 // t26a + vssrarni.h.w vr6, vr11, 12 // t21a + + vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8 + vneg.w vr4, vr4 + vneg.w vr8, vr8 + vmul_vmsub_w vr28, vr19, vr20, vr21, vr11, vr24 + vssrarni.h.w vr8, vr4, 12 // t22a + vssrarni.h.w vr24, vr11, 12 // t25a + + vsadd.h vr4, vr1, vr31 // t16a + vssub.h vr30, vr1, vr31 // t19a + vsadd.h vr19, vr0, vr9 // t17 + vssub.h vr28, vr0, vr9 // t18 + vssub.h vr1, vr26, vr10 // t20a + vsadd.h vr31, vr26, vr10 // t23a + vssub.h vr0, vr8, vr6 // t21 + vsadd.h vr9, vr8, vr6 // t22 + vsadd.h vr10, vr27, vr25 // t24a + vssub.h vr26, vr27, vr25 // t27a + vsadd.h vr6, vr24, vr29 // t25 + vssub.h vr8, vr24, vr29 // t26 + vssub.h vr25, vr3, vr5 // t28a + vsadd.h vr27, vr3, vr5 // t31a + vssub.h vr24, vr7, vr2 // t29 + vsadd.h vr29, vr7, vr2 // t30 + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5 + vmul_vmsub_w vr24, vr28, vr20, vr21, vr11, vr2 + vssrarni.h.w vr5, vr3, 12 // t29a + vssrarni.h.w vr2, vr11, 12 // 18a + + vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7 + vmul_vmsub_w vr25, vr30, vr20, vr21, vr11, vr24 + vssrarni.h.w vr7, vr3, 12 // t28 + vssrarni.h.w vr24, vr11, 12 // t19 + + vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28 + vneg.w vr3, vr3 + vneg.w vr28, vr28 + vmul_vmsub_w vr26, vr1, vr20, vr21, vr11, vr25 + vssrarni.h.w vr28, vr3, 12 // t20 + vssrarni.h.w vr25, vr11, 12 // t27 + + vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30 + vneg.w vr3, vr3 + vneg.w vr30, vr30 + vmul_vmsub_w vr8, vr0, vr20, vr21, vr11, vr1 + vssrarni.h.w vr30, vr3, 12 // t21a + vssrarni.h.w vr1, vr11, 12 // t26a + + vsadd.h vr3, vr4, vr31 // t16 + 
vssub.h vr26, vr4, vr31 // t23 + vsadd.h vr0, vr19, vr9 // t17a + vssub.h vr8, vr19, vr9 // t22a + vsadd.h vr4, vr2, vr30 // t18 + vssub.h vr31, vr2, vr30 // t21 + vsadd.h vr9, vr24, vr28 // t19a + vssub.h vr19, vr24, vr28 // t20a + vssub.h vr2, vr27, vr10 // t24 + vsadd.h vr30, vr27, vr10 // t31 + vssub.h vr24, vr29, vr6 // t25a + vsadd.h vr28, vr29, vr6 // t30a + vssub.h vr10, vr5, vr1 // t26 + vsadd.h vr27, vr5, vr1 // t29 + vssub.h vr6, vr7, vr25 // t27a + vsadd.h vr29, vr7, vr25 // t28a + + vldrepl.w vr20, t0, 0 // 2896 + vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5 + vmul_vmadd_w vr6, vr19, vr20, vr20, vr11, vr7 + vssrarni.h.w vr5, vr1, 12 // t20 + vssrarni.h.w vr7, vr11, 12 // t27 + + vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25 + vmul_vmadd_w vr10, vr31, vr20, vr20, vr11, vr6 + vssrarni.h.w vr25, vr1, 12 // t21a + vssrarni.h.w vr6, vr11, 12 // t26a + + vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19 + vmul_vmadd_w vr24, vr8, vr20, vr20, vr11, vr10 + vssrarni.h.w vr19, vr1, 12 // t22 + vssrarni.h.w vr10, vr11, 12 // t25 + + vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31 + vmul_vmadd_w vr2, vr26, vr20, vr20, vr11, vr8 + vssrarni.h.w vr31, vr1, 12 // t23a + vssrarni.h.w vr8, vr11, 12 // t24a + + // t31 t30a t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16 + // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3 + + vld_x8 \in2, 0, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 + + vsadd.h vr1, vr11, vr30 // c[0] + vssub.h vr2, vr11, vr30 // c[31] + vsadd.h vr24, vr12, vr28 // c[1] + vssub.h vr26, vr12, vr28 // c[30] + vsadd.h vr11, vr13, vr27 // c[2] + vssub.h vr30, vr13, vr27 // c[29] + vsadd.h vr12, vr14, vr29 // c[3] + vssub.h vr28, vr14, vr29 // c[28] + vsadd.h vr13, vr15, vr7 // c[4] + vssub.h vr27, vr15, vr7 // c[27] + vsadd.h vr14, vr16, vr6 // c[5] + vssub.h vr29, vr16, vr6 // c[26] + vsadd.h vr7, vr17, vr10 // c[6] + vssub.h vr15, vr17, vr10 // c[25] + vsadd.h vr6, vr18, vr8 // c[7] + vssub.h vr16, vr18, vr8 // c[24] + +.ifnb \transpose8x8 + LSX_TRANSPOSE8x8_H vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ + vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ + vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23 +.endif + +.ifnb \shift +.irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 + vsrari.h \i, \i, \shift +.endr +.endif + + vst_x8 \in1, \vst_start0, 64, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 + +.ifnb \transpose8x8 + LSX_TRANSPOSE8x8_H vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \ + vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \ + vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23 +.endif + +.ifnb \shift +.irp i, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 + vsrari.h \i, \i, \shift +.endr +.endif + + vst_x8 \in1, \vst_start3, 64, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 + + vld_x8 \in2, 128, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 + + vsadd.h vr1, vr11, vr31 // c[8] + vssub.h vr2, vr11, vr31 // c[23] + vsadd.h vr24, vr12, vr19 // c[9] + vssub.h vr26, vr12, vr19 // c[22] + vsadd.h vr11, vr13, vr25 // c[10] + vssub.h vr30, vr13, vr25 // c[21] + vsadd.h vr12, vr14, vr5 // c[11] + vssub.h vr28, vr14, vr5 // c[20] + vsadd.h vr13, vr15, vr9 // c[12] + vssub.h vr27, vr15, vr9 // c[19] + vsadd.h vr14, vr16, vr4 // c[13] + vssub.h vr29, vr16, vr4 // c[18] + vsadd.h vr7, vr17, vr0 // c[14] + vssub.h vr15, vr17, vr0 // c[17] + vsadd.h vr6, vr18, vr3 // c[15] + vssub.h vr16, vr18, vr3 // c[16] + +.ifnb \transpose8x8 + LSX_TRANSPOSE8x8_H vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ + vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ + 
vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23 +.endif + +.ifnb \shift +.irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 + vsrari.h \i, \i, \shift +.endr +.endif + + vst_x8 \in1, \vst_start1, 64, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 + +.ifnb \transpose8x8 + LSX_TRANSPOSE8x8_H vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \ + vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \ + vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23 +.endif + +.ifnb \shift +.irp i, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 + vsrari.h \i, \i, \shift +.endr +.endif + + vst_x8 \in1, \vst_start2, 64, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 +.endm + +function inv_txfm_add_dct_dct_32x32_8bpc_lsx + bnez a3, .NO_HAS_DCONLY_32x32 + + ld.h t2, a2, 0 // dc + vldi vr0, 0x8b5 // 181 + vreplgr2vr.w vr1, t2 + vldi vr20, 0x880 // 128 + vmul.w vr2, vr0, vr1 // dc * 181 + st.h zero, a2, 0 + add.d t0, a0, a1 + vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8 + vld vr3, t0, 16 + vsrari.w vr2, vr2, 2 // (dc + rnd) >> shift + vld vr1, a0, 16 + vmadd.w vr20, vr2, vr0 + vld vr2, t0, 0 + vssrarni.h.w vr20, vr20, 12 + vld vr0, a0, 0 + + vsllwil.hu.bu vr4, vr0, 0 + vsllwil.hu.bu vr5, vr1, 0 + vsllwil.hu.bu vr6, vr2, 0 + vsllwil.hu.bu vr7, vr3, 0 + vexth.hu.bu vr0, vr0 + vexth.hu.bu vr1, vr1 + vexth.hu.bu vr2, vr2 + vexth.hu.bu vr3, vr3 + vadd.h vr8, vr4, vr20 + vadd.h vr9, vr0, vr20 + vadd.h vr10, vr5, vr20 + vadd.h vr11, vr1, vr20 + vadd.h vr12, vr6, vr20 + vadd.h vr13, vr2, vr20 + vadd.h vr14, vr7, vr20 + vadd.h vr15, vr3, vr20 + vssrani.bu.h vr9, vr8, 0 + vssrani.bu.h vr11, vr10, 0 + vssrani.bu.h vr13, vr12, 0 + vssrani.bu.h vr15, vr14, 0 + vst vr9, a0, 0 + vst vr11, a0, 16 + vst vr13, t0, 0 + vst vr15, t0, 16 + +.rept 15 + alsl.d a0, a1, a0, 1 + add.d t0, a0, a1 + + vld vr0, a0, 0 + vld vr1, a0, 16 + vld vr2, t0, 0 + vld vr3, t0, 16 + vsllwil.hu.bu vr4, vr0, 0 + vsllwil.hu.bu vr5, vr1, 0 + vsllwil.hu.bu vr6, vr2, 0 + vsllwil.hu.bu vr7, vr3, 0 + vexth.hu.bu vr0, vr0 + vexth.hu.bu vr1, vr1 + vexth.hu.bu vr2, vr2 + vexth.hu.bu vr3, vr3 + vadd.h vr8, vr4, vr20 + vadd.h vr9, vr0, vr20 + vadd.h vr10, vr5, vr20 + vadd.h vr11, vr1, vr20 + vadd.h vr12, vr6, vr20 + vadd.h vr13, vr2, vr20 + vadd.h vr14, vr7, vr20 + vadd.h vr15, vr3, vr20 + vssrani.bu.h vr9, vr8, 0 + vssrani.bu.h vr11, vr10, 0 + vssrani.bu.h vr13, vr12, 0 + vssrani.bu.h vr15, vr14, 0 + vst vr9, a0, 0 + vst vr11, a0, 16 + vst vr13, t0, 0 + vst vr15, t0, 16 +.endr + + b .DCT_DCT_32X32_END +.NO_HAS_DCONLY_32x32: + + malloc_space 2560 // 32*32*2+512 + + addi.d t1, sp, 64 + addi.d t2, a2, 0 + addi.d t3, sp, 1024 + addi.d t3, t3, 1024 + addi.d t3, t3, 64 + + vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + + vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x32_core_lsx t1, t3, 0, 16, 32, 48, transpose8x8, 2 + +.rept 3 + addi.d t2, t2, 16 + addi.d t1, t1, 512 + + vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + + vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x32_core_lsx t1, t3, 0, 16, 32, 48, transpose8x8, 2 +.endr + + 
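+    // First pass complete: the unrolled slice above plus the .rept 3 loop have run
+    // dct_8x16_core_lsx on the even-indexed inputs and dct_8x32_core_lsx on the
+    // odd-indexed inputs for all four 8-lane slices, transposing the 8x8 blocks and
+    // rounding with (x + 2) >> 2 into the stack scratch buffer. A loose C-style
+    // outline of the rest of the function, on that reading (names are illustrative):
+    //     memset(coef, 0, 32 * 32 * sizeof(int16_t)); // the .irp store loop below
+    //     second_pass(scratch);                       // same cores, rounded with (x + 8) >> 4
+    //     add_to_dst(dst, scratch);                   // the .rept 16 loop: dst = clip_u8(dst + res)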
vreplgr2vr.h vr31, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024, 1040, 1056, 1072, 1088, 1104, 1120, 1136, 1152, 1168, 1184, 1200, 1216, 1232, 1248, 1264, 1280, 1296, 1312, 1328, 1344, 1360, 1376, 1392, 1408, 1424, 1440, 1456, 1472, 1488, 1504, 1520, 1536, 1552, 1568, 1584, 1600, 1616, 1632, 1648, 1664, 1680, 1696, 1712, 1728, 1744, 1760, 1776, 1792, 1808, 1824, 1840, 1856, 1872, 1888, 1904, 1920, 1936, 1952, 1968, 1984, 2000, 2016, 2032 + vst vr31, a2, \i +.endr + + addi.d t2, sp, 64 + addi.d t1, sp, 64 + + vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + + vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x32_core_lsx t1, t3, 0, 512, 1024, 1536, , 4 + +.rept 3 + addi.d t2, t2, 16 + addi.d t1, t1, 16 + + vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + + vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x32_core_lsx t1, t3, 0, 512, 1024, 1536, , 4 +.endr + + addi.d t2, sp, 64 + +.rept 16 + add.d t0, a0, a1 + vld vr0, a0, 0 + vld vr1, a0, 16 + vld vr2, t0, 0 + vld vr3, t0, 16 + vsllwil.hu.bu vr4, vr0, 0 + vsllwil.hu.bu vr5, vr1, 0 + vsllwil.hu.bu vr6, vr2, 0 + vsllwil.hu.bu vr7, vr3, 0 + vexth.hu.bu vr0, vr0 + vexth.hu.bu vr1, vr1 + vexth.hu.bu vr2, vr2 + vexth.hu.bu vr3, vr3 + vld_x8 t2, 0, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 + vadd.h vr8, vr4, vr8 + vadd.h vr9, vr0, vr9 + vadd.h vr10, vr5, vr10 + vadd.h vr11, vr1, vr11 + vadd.h vr12, vr6, vr12 + vadd.h vr13, vr2, vr13 + vadd.h vr14, vr7, vr14 + vadd.h vr15, vr3, vr15 + vssrani.bu.h vr9, vr8, 0 + vssrani.bu.h vr11, vr10, 0 + vssrani.bu.h vr13, vr12, 0 + vssrani.bu.h vr15, vr14, 0 + vst vr9, a0, 0 + vst vr11, a0, 16 + vst vr13, t0, 0 + vst vr15, t0, 16 + + alsl.d a0, a1, a0, 1 + addi.d t2, t2, 128 +.endr + + free_space 2560 // 32*32*2+512 + +.DCT_DCT_32X32_END: +endfunc diff --git a/src/loongarch/itx.h b/src/loongarch/itx.h index eb76881..7ed3e09 100644 --- a/src/loongarch/itx.h +++ b/src/loongarch/itx.h @@ -101,6 +101,7 @@ decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_16x16, lsx)); decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_16x16, lsx)); decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x32, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x32, lsx)); static ALWAYS_INLINE void itx_dsp_init_loongarch(Dav1dInvTxfmDSPContext *const c, int bpc) { #if BITDEPTH == 8 @@ -181,6 +182,7 @@ static ALWAYS_INLINE void itx_dsp_init_loongarch(Dav1dInvTxfmDSPContext *const c c->itxfm_add[RTX_8X32][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x32_8bpc_lsx; + c->itxfm_add[TX_32X32][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_32x32_8bpc_lsx; #endif } -- cgit v1.2.3 From 110c23f68cc5237903b8b79264bf3d70fe85a556 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Fri, 1 Dec 2023 19:54:41 +0800 Subject: loongarch: Improve the 
performance of itx_8bpc.add_64x64 functions Relative speedup over C code: inv_txfm_add_64x64_dct_dct_0_8bpc_c: 485.6 ( 1.00x) inv_txfm_add_64x64_dct_dct_0_8bpc_lsx: 20.3 (23.89x) inv_txfm_add_64x64_dct_dct_1_8bpc_c: 4547.4 ( 1.00x) inv_txfm_add_64x64_dct_dct_1_8bpc_lsx: 363.9 (12.50x) inv_txfm_add_64x64_dct_dct_2_8bpc_c: 4547.5 ( 1.00x) inv_txfm_add_64x64_dct_dct_2_8bpc_lsx: 363.8 (12.50x) inv_txfm_add_64x64_dct_dct_3_8bpc_c: 4547.9 ( 1.00x) inv_txfm_add_64x64_dct_dct_3_8bpc_lsx: 363.8 (12.50x) inv_txfm_add_64x64_dct_dct_4_8bpc_c: 4547.1 ( 1.00x) inv_txfm_add_64x64_dct_dct_4_8bpc_lsx: 363.8 (12.50x) --- src/loongarch/itx.S | 1120 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/loongarch/itx.h | 6 + 2 files changed, 1126 insertions(+) diff --git a/src/loongarch/itx.S b/src/loongarch/itx.S index 0c5dc69..fc0c79e 100644 --- a/src/loongarch/itx.S +++ b/src/loongarch/itx.S @@ -6982,3 +6982,1123 @@ function inv_txfm_add_dct_dct_32x32_8bpc_lsx .DCT_DCT_32X32_END: endfunc + +.macro dct_8x8_tx64_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7 + + // in0 in1 in2 in3 + // dct4 in0 in2 + la.local t0, idct_coeffs + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vsllwil.w.h vr22, \in2, 0 + vexth.w.h vr23, \in2 + vmul.w vr8, vr22, vr20 + vmul.w vr10, vr23, vr20 + vmul.w \in2, vr22, vr21 + vmul.w vr9, vr23, vr21 + vssrarni.h.w vr10, vr8, 12 // t2 + vssrarni.h.w vr9, \in2, 12 // t3 + + vldrepl.w vr20, t0, 0 // 2896 + vsllwil.w.h vr22, \in0, 0 + vexth.w.h vr23, \in0 + vmul.w vr8, vr22, vr20 + vmul.w \in2, vr23, vr20 + vssrarni.h.w \in2, vr8, 12 + + vsadd.h vr8, \in2, vr9 // c[0] + vssub.h vr9, \in2, vr9 // c[3] + vsadd.h \in0, \in2, vr10 // c[1] + vssub.h vr10, \in2, vr10 // c[2] + + // inv_dct8_1d_internal_c tx64 + // in1 in3 + vldrepl.w vr20, t0, 16 // 799 + vldrepl.w vr21, t0, 20 // 4017 + + vsllwil.w.h vr22, \in1, 0 + vexth.w.h vr23, \in1 + vmul.w \in2, vr22, vr21 + vmul.w \in4, vr23, vr21 + vmul.w \in1, vr22, vr20 + vmul.w \in6, vr23, vr20 + vssrarni.h.w \in4, \in2, 12 // t7a + vssrarni.h.w \in6, \in1, 12 // t4a + + vldrepl.w vr20, t0, 24 // 3406 + vldrepl.w vr21, t0, 28 // 2276 + + vsllwil.w.h vr22, \in3, 0 + vexth.w.h vr23, \in3 + vneg.w vr21, vr21 + vmul.w \in2, vr22, vr20 + vmul.w \in1, vr23, vr20 + vmul.w \in3, vr22, vr21 + vmul.w \in7, vr23, vr21 + vssrarni.h.w \in1, \in2, 12 // t6a + vssrarni.h.w \in7, \in3, 12 // t5a + + vsadd.h \in3, \in6, \in7 // t4 + vssub.h \in6, \in6, \in7 // t5a + vsadd.h \in5, \in4, \in1 // t7 + vssub.h \in4, \in4, \in1 // t6a + + vldrepl.w vr20, t0, 0 // 2896 + vmul_vmadd_w \in4, \in6, vr20, vr20, vr21, \in1 + vmul_vmsub_w \in4, \in6, vr20, vr20, \in2, \in7 + vssrarni.h.w \in1, vr21, 12 // t6 + vssrarni.h.w \in7, \in2, 12 // t5 + + vsadd.h \out0, vr8, \in5 // c[0] + vssub.h \out7, vr8, \in5 // c[7] + vsadd.h \out1, \in0, \in1 // c[1] + vssub.h \out6, \in0, \in1 // c[6] + vsadd.h \out2, vr10, \in7 // c[2] + vssub.h \out5, vr10, \in7 // c[5] + vsadd.h \out3, vr9, \in3 // c[3] + vssub.h \out4, vr9, \in3 // c[4] +.endm + +.macro dct_8x16_tx64_core_lsx + dct_8x8_tx64_core_lsx vr0, vr2, vr4, vr6, vr19, vr25, vr27, vr29, vr11, \ + vr12, vr13, vr14, vr15, vr16, vr17, vr18 + + // in1 in3 in5 in7 in9 in11 in13 in15 + // vr1 vr3 vr5 vr7 vr24 vr26 vr28 vr30 + la.local t0, idct_coeffs + + vldrepl.w vr20, t0, 32 // 401 + vldrepl.w vr21, t0, 36 // 4076 + vsllwil.w.h vr22, vr1, 0 + vexth.w.h vr23, vr1 + vmul.w vr0, vr22, vr21 + vmul.w vr10, vr23, vr21 + vmul.w vr1, vr22, vr20 + vmul.w vr29, vr23, vr20 + 
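+    // In this tx64 variant only the low half of the inputs is non-zero (a 64-point
+    // transform keeps just its first 32 coefficients), so each first-stage rotation
+    // collapses to scaling one input by two constants. A rough C sketch of the step
+    // above, on that reading (names are illustrative):
+    //     t15a = (int16_t)((in1 * 4076 + 2048) >> 12);
+    //     t8a  = (int16_t)((in1 * 401  + 2048) >> 12);
+    // which is why vr1 alone is widened and multiplied by both 401 and 4076 here.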
vssrarni.h.w vr10, vr0, 12 // t15a + vssrarni.h.w vr29, vr1, 12 // t8a + + vldrepl.w vr20, t0, 40 // 3166 -> 1583 + vldrepl.w vr21, t0, 44 // 2598 -> 1299 + vsllwil.w.h vr22, vr7, 0 + vexth.w.h vr23, vr7 + vneg.w vr21, vr21 + vmul.w vr0, vr22, vr20 + vmul.w vr30, vr23, vr20 + vmul.w vr7, vr22, vr21 + vmul.w vr31, vr23, vr21 + vssrarni.h.w vr30, vr0, 12 // t14a + vssrarni.h.w vr31, vr7, 12 // t9a + + vldrepl.w vr20, t0, 48 // 1931 + vldrepl.w vr21, t0, 52 // 3612 + vsllwil.w.h vr22, vr5, 0 + vexth.w.h vr23, vr5 + vmul.w vr0, vr22, vr21 + vmul.w vr24, vr23, vr21 + vmul.w vr5, vr22, vr20 + vmul.w vr25, vr23, vr20 + vssrarni.h.w vr24, vr0, 12 // t13a + vssrarni.h.w vr25, vr5, 12 // t10a + + vldrepl.w vr20, t0, 56 // 3920 + vldrepl.w vr21, t0, 60 // 1189 + vsllwil.w.h vr22, vr3, 0 + vexth.w.h vr23, vr3 + vneg.w vr21, vr21 + vmul.w vr0, vr22, vr20 + vmul.w vr26, vr23, vr20 + vmul.w vr3, vr22, vr21 + vmul.w vr27, vr23, vr21 + vssrarni.h.w vr26, vr0, 12 // t12a + vssrarni.h.w vr27, vr3, 12 // t11a + + // vr22 vr23 vr30 vr31 vr24 vr25 vr26 vr27 + vsadd.h vr28, vr29, vr31 // t8 + vssub.h vr19, vr29, vr31 // t9 + vssub.h vr29, vr27, vr25 // t10 + vsadd.h vr9, vr27, vr25 // t11 + vsadd.h vr31, vr26, vr24 // t12 + vssub.h vr25, vr26, vr24 // t13 + vssub.h vr27, vr10, vr30 // t14 + vsadd.h vr24, vr10, vr30 // t15 + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vmul_vmadd_w vr27, vr19, vr21, vr20, vr0, vr26 + vmul_vmsub_w vr27, vr19, vr20, vr21, vr1, vr30 + vssrarni.h.w vr26, vr0, 12 // t14a + vssrarni.h.w vr30, vr1, 12 // t9a + + vmul_vmadd_w vr25, vr29, vr21, vr20, vr0, vr19 + vneg.w vr0, vr0 + vneg.w vr19, vr19 + vmul_vmsub_w vr25, vr29, vr20, vr21, vr1, vr27 + vssrarni.h.w vr19, vr0, 12 // t10a + vssrarni.h.w vr27, vr1, 12 // t13a + + vsadd.h vr25, vr28, vr9 // t8a + vssub.h vr29, vr28, vr9 // t11a + vssub.h vr28, vr24, vr31 // t12a + vsadd.h vr10, vr24, vr31 // t15a + vsadd.h vr9, vr30, vr19 // t9 + vssub.h vr31, vr30, vr19 // t10 + vssub.h vr30, vr26, vr27 // t13 + vsadd.h vr24, vr26, vr27 // t14 + + vldrepl.w vr20, t0, 0 // 2896 + vmul_vmadd_w vr30, vr31, vr20, vr20, vr0, vr26 + vmul_vmsub_w vr30, vr31, vr20, vr20, vr1, vr27 + vssrarni.h.w vr26, vr0, 12 // t13a + vssrarni.h.w vr27, vr1, 12 // t10a + + vmul_vmadd_w vr28, vr29, vr20, vr20, vr0, vr31 + vmul_vmsub_w vr28, vr29, vr20, vr20, vr1, vr30 + vssrarni.h.w vr31, vr0, 12 // t12 + vssrarni.h.w vr30, vr1, 12 // t11 + + // vr11 vr12 ... 
vr18 + vsadd.h vr28, vr14, vr31 // c[3] + vssub.h vr29, vr14, vr31 // c[12] + vsadd.h vr20, vr15, vr30 // c[4] + vssub.h vr21, vr15, vr30 // c[11] + vsadd.h vr14, vr16, vr27 // c[5] + vssub.h vr23, vr16, vr27 // c[10] + vsadd.h vr15, vr17, vr9 // c[6] + vssub.h vr30, vr17, vr9 // c[9] + vsadd.h vr16, vr18, vr25 // c[7] + vssub.h vr27, vr18, vr25 // c[8] + vsadd.h vr17, vr13, vr26 // c[2] + vssub.h vr26, vr13, vr26 // c[13] + vsadd.h vr18, vr12, vr24 // c[1] + vssub.h vr25, vr12, vr24 // c[14] + vsadd.h vr22, vr11, vr10 // c[0] + vssub.h vr24, vr11, vr10 // c[15] +.endm // dct_8x16_tx64_core_lsx + +.macro vmul_vssrarni_hw in0, in1, in2, tmp0, tmp1, out0, out1 + vsllwil.w.h vr22, \in0, 0 + vexth.w.h vr23, \in0 + vmul.w \tmp0, vr22, \in1 + vmul.w \out0, vr23, \in1 + vmul.w \tmp1, vr22, \in2 + vmul.w \out1, vr23, \in2 + vssrarni.h.w \out0, \tmp0, 12 + vssrarni.h.w \out1, \tmp1, 12 +.endm + +const idct64_coeffs, align=4 + .word 101, 4095, 2967, -2824 + .word 1660, 3745, 3822, -1474 + .word 4076, 401, 4017, 799 + + .word 4036, -700, 2359, 3349 + .word 3461, -2191, 897, 3996 + .word -3166, -2598, -799, -4017 + + .word 501, 4065, 3229, -2520 + .word 2019, 3564, 3948, -1092 + .word 3612, 1931, 2276, 3406 + + .word 4085, -301, 2675, 3102 + .word 3659, -1842, 1285, 3889 + .word -3920, -1189, -3406, -2276 +endconst + +// in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a +// in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a +// in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a +// in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a + +.macro dct64_step1_lsx + + vldrepl.w vr20, t0, 0 // 101 + vldrepl.w vr21, t0, 4 // 4095 + vmul_vssrarni_hw vr0, vr20, vr21, vr16, vr0, vr8, vr9 // vr8 t32a vr9 t63a + + vldrepl.w vr20, t0, 8 // 2967 + vldrepl.w vr21, t0, 12 // -2824 + vmul_vssrarni_hw vr1, vr20, vr21, vr16, vr1, vr10, vr11 // vr10 t62a vr11 t33a + + vldrepl.w vr20, t0, 16 // 1660 + vldrepl.w vr21, t0, 20 // 3745 + vmul_vssrarni_hw vr2, vr20, vr21, vr16, vr2, vr12, vr13 // vr12 t34a vr13 t61a + + vldrepl.w vr20, t0, 24 // 3822 + vldrepl.w vr21, t0, 28 // -1474 + vmul_vssrarni_hw vr3, vr20, vr21, vr16, vr3, vr14, vr15 // vr14 t60a vr15 t35a + + vsadd.h vr0, vr8, vr11 // t32 + vssub.h vr1, vr8, vr11 // t33 + vssub.h vr2, vr15, vr12 // t34 + vsadd.h vr3, vr15, vr12 // t35 + vsadd.h vr4, vr14, vr13 // t60 + vssub.h vr5, vr14, vr13 // t61 + vssub.h vr6, vr9, vr10 // t62 + vsadd.h vr7, vr9, vr10 // t63 + + vldrepl.w vr20, t0, 32 // 4076 + vldrepl.w vr21, t0, 36 // 401 + vmul_vmadd_w vr6, vr1, vr20, vr21, vr9, vr10 + vmul_vmsub_w vr6, vr1, vr21, vr20, vr13, vr11 + vssrarni.h.w vr10, vr9, 12 // t62a + vssrarni.h.w vr11, vr13, 12 // t33a + + vmul_vmadd_w vr5, vr2, vr20, vr21, vr9, vr1 + vmul_vmsub_w vr5, vr2, vr21, vr20, vr13, vr6 + vneg.w vr9, vr9 + vneg.w vr1, vr1 + vssrarni.h.w vr6, vr13, 12 // t61a + vssrarni.h.w vr1, vr9, 12 // t34a + + vsadd.h vr2, vr0, vr3 // t32a + vssub.h vr5, vr0, vr3 // t35a + vsadd.h vr9, vr11, vr1 // t33 + vssub.h vr13, vr11, vr1 // t34 + vssub.h vr0, vr7, vr4 // t60a + vsadd.h vr3, vr7, vr4 // t63a + vssub.h vr1, vr10, vr6 // t61 + vsadd.h vr11, vr10, vr6 // t62 + + vldrepl.w vr20, t0, 40 // 4017 + vldrepl.w vr21, t0, 44 // 799 + + vmul_vmadd_w vr1, vr13, vr20, vr21, vr8, vr4 + vmul_vmsub_w vr1, vr13, vr21, vr20, vr12, vr7 + vssrarni.h.w vr4, vr8, 12 // t61a + vssrarni.h.w vr7, vr12, 12 // t34a + + vmul_vmadd_w vr0, vr5, vr20, vr21, vr8, vr6 + vmul_vmsub_w vr0, vr5, vr21, vr20, vr12, vr10 + vssrarni.h.w vr6, vr8, 12 // t60 + vssrarni.h.w vr10, vr12, 12 // t35 + + vst_x8 t6, 0, 16, vr2, vr9, vr7, vr10, vr6, 
vr4, vr11, vr3 +.endm // dct64_step1 + + // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a + // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a + // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a + // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a +.macro dct64_step2_lsx + vld vr0, t5, 0 // t32a + vld vr2, t4, 0 // t63a + vld vr3, t5, 16*8 // t56a + vld vr1, t4, 16*8 // t39a + vld vr4, t5, 16*16 // t40a + vld vr6, t4, 16*16 // t55a + vld vr7, t5, 16*24 // t48a + vld vr5, t4, 16*24 // t47a + + vsadd.h vr8, vr0, vr1 // t32 + vssub.h vr9, vr0, vr1 // t39 + vsadd.h vr10, vr2, vr3 // t63 + vssub.h vr11, vr2, vr3 // t56 + vssub.h vr12, vr5, vr4 // t40 + vsadd.h vr13, vr5, vr4 // t47 + vsadd.h vr14, vr7, vr6 // t48 + vssub.h vr15, vr7, vr6 // t55 + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vmul_vmadd_w vr11, vr9, vr21, vr20, vr0, vr2 + vmul_vmsub_w vr11, vr9, vr20, vr21, vr1, vr3 + vssrarni.h.w vr2, vr0, 12 // t56a + vssrarni.h.w vr3, vr1, 12 // t39a + + vmul_vmadd_w vr15, vr12, vr21, vr20, vr0, vr4 + vmul_vmsub_w vr15, vr12, vr20, vr21, vr1, vr5 + vneg.w vr0, vr0 + vneg.w vr4, vr4 + vssrarni.h.w vr5, vr1, 12 // t55a + vssrarni.h.w vr4, vr0, 12 // t40a + + vsadd.h vr9, vr8, vr13 // t32a + vssub.h vr11, vr8, vr13 // t47a + vsadd.h vr6, vr3, vr4 // t39 + vssub.h vr7, vr3, vr4 // t40 + vssub.h vr12, vr10, vr14 // t48a + vsadd.h vr15, vr10, vr14 // t63a + vssub.h vr0, vr2, vr5 // t55 + vsadd.h vr1, vr2, vr5 // t56 + + vldrepl.w vr20, t0, 0 // 2896 + vmul_vmsub_w vr0, vr7, vr20, vr20, vr8, vr13 + vmul_vmadd_w vr0, vr7, vr20, vr20, vr3, vr4 + vssrarni.h.w vr13, vr8, 12 // t40a + vssrarni.h.w vr4, vr3, 12 // t55a + vmul_vmsub_w vr12, vr11, vr20, vr20, vr8, vr10 + vmul_vmadd_w vr12, vr11, vr20, vr20, vr3, vr14 + vssrarni.h.w vr10, vr8, 12 // t47 + vssrarni.h.w vr14, vr3, 12 // t48 + + // t32a t39 t40a t47 t48 t55a t56 t63a + // vr9 vr6 vr13 vr10 vr14 vr4 vr1 vr15 + vst vr9, t5, 0 // t32a + vst vr6, t4, 0 // t39 + vst vr13, t5, 16*8 // t40a + vst vr10, t4, 16*8 // t47 + vst vr14, t5, 16*16 // t48 + vst vr4, t4, 16*16 // t55a + vst vr1, t5, 16*24 // t56 + vst vr15, t4, 16*24 // t63a +.endm // dct64_step2_lsx + +.macro dct64_step3_lsx + // t0 t1 t2 t3 t4 t5 t6 t7 + vld_x8 t3, 0, 16, vr2, vr3, vr7, vr8, vr11, vr12, vr16, vr17 + + vld vr9, t5, 16*24 // t56 + vld vr6, t5, 16*24+16 // t57a + vld vr13, t5, 16*24+32 // t58 + vld vr10, t5, 16*24+48 // t59a + vld vr14, t4, 16*24-48 // t60 + vld vr4, t4, 16*24-32 // t61a + vld vr1, t4, 16*24-16 // t62 + vld vr15, t4, 16*24 // t63a + + vsadd.h vr20, vr2, vr15 // c[0] + vssub.h vr21, vr2, vr15 // c[63] + vsadd.h vr22, vr3, vr1 // c[1] + vssub.h vr23, vr3, vr1 // c[62] + vsadd.h vr24, vr7, vr4 // c[2] + vssub.h vr25, vr7, vr4 // c[61] + vsadd.h vr26, vr8, vr14 // c[3] + vssub.h vr27, vr8, vr14 // c[60] + + vsadd.h vr28, vr11, vr10 // c[4] + vssub.h vr29, vr11, vr10 // c[59] + vsadd.h vr30, vr12, vr13 // c[5] + vssub.h vr31, vr12, vr13 // c[58] + vsadd.h vr2, vr16, vr6 // c[6] + vssub.h vr15, vr16, vr6 // c[57] + vsadd.h vr1, vr17, vr9 // c[7] + vssub.h vr3, vr17, vr9 // c[56] +.endm // dct64_step3_lsx + +.macro dct64_step4_lsx transpose8x8, shift, start0, stride0, start1, stride1 + + dct64_step3_lsx + +.ifnb \transpose8x8 + LSX_TRANSPOSE8x8_H vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \ + vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \ + vr4, vr7, vr8, vr14, vr10, vr11, vr12, vr13 + + LSX_TRANSPOSE8x8_H vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21, \ + vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21, \ + vr4, vr7, vr8, vr14, vr10, vr11, vr12, vr13 
+.endif + +.ifnb \shift +.irp i, vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \ + vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 + vsrari.h \i, \i, \shift +.endr +.endif + + vst_x8 t7, \start0, \stride0, vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1 + + vst_x8 t7, \start1, \stride1, vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 + +.endm // dct64_step4_lsx + +.macro dct64_step5_lsx in0, in1, in2, in3, in4, in5, in6, in7 + + fld.d f4, t0, 0 + fldx.d f5, t0, a1 + fld.d f6, t6, 0 + fldx.d f7, t6, a1 + alsl.d t0, a1, t0, 2 + alsl.d t6, a1, t6, 2 + fld.d f8, t0, 0 + fldx.d f9, t0, a1 + fld.d f10, t6, 0 + fldx.d f11, t6, a1 + +.irp i, vr4, vr5, vr6, vr7, vr8, vr9, vr10, vr11 + vsllwil.hu.bu \i, \i, 0 +.endr + + vsrari.h vr20, \in0, 4 + vsrari.h vr22, \in1, 4 + vsrari.h vr24, \in2, 4 + vsrari.h vr26, \in3, 4 + vsrari.h vr28, \in4, 4 + vsrari.h vr30, \in5, 4 + vsrari.h vr2, \in6, 4 + vsrari.h vr1, \in7, 4 + + vadd.h vr4, vr4, vr20 + vadd.h vr5, vr5, vr22 + vadd.h vr6, vr6, vr24 + vadd.h vr7, vr7, vr26 + vadd.h vr8, vr8, vr28 + vadd.h vr9, vr9, vr30 + vadd.h vr10, vr10, vr2 + vadd.h vr11, vr11, vr1 + + vssrani.bu.h vr5, vr4, 0 + vssrani.bu.h vr7, vr6, 0 + vssrani.bu.h vr9, vr8, 0 + vssrani.bu.h vr11, vr10, 0 + + vstelm.d vr5, t1, 0, 0 + vstelm.d vr5, t2, 0, 1 + + alsl.d t1, a1, t1, 1 + alsl.d t2, a1, t2, 1 + vstelm.d vr7, t1, 0, 0 + vstelm.d vr7, t2, 0, 1 + + alsl.d t1, a1, t1, 1 + alsl.d t2, a1, t2, 1 + vstelm.d vr9, t1, 0, 0 + vstelm.d vr9, t2, 0, 1 + + alsl.d t1, a1, t1, 1 + alsl.d t2, a1, t2, 1 + vstelm.d vr11, t1, 0, 0 + vstelm.d vr11, t2, 0, 1 +.endm // dct64_step5_lsx + +.macro dct_8x32_tx64_new_lsx vld_loc0, stride0, vld_loc1, stride1 + vld_x8 t2, \vld_loc0, \stride0, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + dct_8x16_tx64_core_lsx + + vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + + vld_x8 t2, \vld_loc1, \stride1, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + la.local t0, idct_coeffs + + vldrepl.w vr20, t0, 64 // 201 + vldrepl.w vr21, t0, 68 // 4091 + vsllwil.w.h vr22, vr0, 0 + vexth.w.h vr23, vr0 + vmul.w vr8, vr22, vr21 + vmul.w vr9, vr23, vr21 + vmul.w vr0, vr22, vr20 + vmul.w vr10, vr23, vr20 + vssrarni.h.w vr9, vr8, 12 // t31a + vssrarni.h.w vr10, vr0, 12 // t16a + + vldrepl.w vr20, t0, 72 // 3035 + vldrepl.w vr21, t0, 76 // 2751 + vsllwil.w.h vr22, vr7, 0 + vexth.w.h vr23, vr7 + vneg.w vr21, vr21 + vmul.w vr8, vr22, vr20 + vmul.w vr0, vr23, vr20 + vmul.w vr7, vr22, vr21 + vmul.w vr30, vr23, vr21 + vssrarni.h.w vr0, vr8, 12 // t30a + vssrarni.h.w vr30, vr7, 12 // t17a + + vldrepl.w vr20, t0, 80 // 1751 + vldrepl.w vr21, t0, 84 // 3703 + vsllwil.w.h vr22, vr4, 0 + vexth.w.h vr23, vr4 + vmul.w vr8, vr22, vr21 + vmul.w vr7, vr23, vr21 + vmul.w vr4, vr22, vr20 + vmul.w vr19, vr23, vr20 + vssrarni.h.w vr7, vr8, 12 // t29a + vssrarni.h.w vr19, vr4, 12 // t18a + + vldrepl.w vr20, t0, 88 // 3857 + vldrepl.w vr21, t0, 92 // 1380 + vsllwil.w.h vr22, vr3, 0 + vexth.w.h vr23, vr3 + vneg.w vr21, vr21 + vmul.w vr8, vr22, vr20 + vmul.w vr4, vr23, vr20 + vmul.w vr3, vr22, vr21 + vmul.w vr26, vr23, vr21 + vssrarni.h.w vr4, vr8, 12 // t28a + vssrarni.h.w vr26, vr3, 12 // t19a + + vldrepl.w vr20, t0, 96 // 995 + vldrepl.w vr21, t0, 100 // 3973 + vsllwil.w.h vr22, vr2, 0 + vexth.w.h vr23, vr2 + vmul.w vr8, vr22, vr21 + vmul.w vr3, vr23, vr21 + vmul.w vr2, vr22, vr20 + vmul.w vr27, vr23, vr20 + vssrarni.h.w vr3, vr8, 12 // t27a + vssrarni.h.w vr27, vr2, 12 // t20a + + vldrepl.w vr20, t0, 104 // 3513 + vldrepl.w vr21, t0, 108 // 2106 + 
vsllwil.w.h vr22, vr5, 0 + vexth.w.h vr23, vr5 + vneg.w vr21, vr21 + vmul.w vr8, vr22, vr20 + vmul.w vr2, vr23, vr20 + vmul.w vr5, vr22, vr21 + vmul.w vr28, vr23, vr21 + vssrarni.h.w vr2, vr8, 12 // t26a + vssrarni.h.w vr28, vr5, 12 // t21a + + vldrepl.w vr20, t0, 112 // 2440 -> 1220 + vldrepl.w vr21, t0, 116 // 3290 -> 1645 + vsllwil.w.h vr22, vr6, 0 + vexth.w.h vr23, vr6 + vmul.w vr8, vr22, vr21 + vmul.w vr5, vr23, vr21 + vmul.w vr6, vr22, vr20 + vmul.w vr25, vr23, vr20 + vssrarni.h.w vr5, vr8, 12 // t25a + vssrarni.h.w vr25, vr6, 12 // t22a + + vldrepl.w vr20, t0, 120 // 4052 + vldrepl.w vr21, t0, 124 // 601 + vsllwil.w.h vr22, vr1, 0 + vexth.w.h vr23, vr1 + vneg.w vr21, vr21 + vmul.w vr8, vr22, vr20 + vmul.w vr6, vr23, vr20 + vmul.w vr1, vr22, vr21 + vmul.w vr24, vr23, vr21 + vssrarni.h.w vr6, vr8, 12 // t24a + vssrarni.h.w vr24, vr1, 12 // t23a + + vsadd.h vr1, vr10, vr30 // t16 + vssub.h vr29, vr10, vr30 // t17 + vssub.h vr8, vr26, vr19 // t18 + vsadd.h vr31, vr26, vr19 // t19 + vsadd.h vr10, vr27, vr28 // t20 + vssub.h vr30, vr27, vr28 // t21 + vssub.h vr19, vr24, vr25 // t22 + vsadd.h vr26, vr24, vr25 // t23 + vsadd.h vr27, vr6, vr5 // t24 + vssub.h vr28, vr6, vr5 // t25 + vssub.h vr24, vr3, vr2 // t26 + vsadd.h vr25, vr3, vr2 // t27 + vsadd.h vr5, vr4, vr7 // t28 + vssub.h vr6, vr4, vr7 // t29 + vssub.h vr2, vr9, vr0 // t30 + vsadd.h vr3, vr9, vr0 // t31 + + vldrepl.w vr20, t0, 16 // 799 + vldrepl.w vr21, t0, 20 // 4017 + vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7 + vmul_vmsub_w vr2, vr29, vr20, vr21, vr11, vr0 + vssrarni.h.w vr7, vr4, 12 // t30a + vssrarni.h.w vr0, vr11, 12 // t17a + vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9 + vneg.w vr4, vr4 + vneg.w vr9, vr9 + vmul_vmsub_w vr6, vr8, vr20, vr21, vr11, vr2 + vssrarni.h.w vr9, vr4, 12 // t18a + vssrarni.h.w vr2, vr11, 12 // t29a + + vldrepl.w vr20, t0, 24 // 3406 -> 1703 + vldrepl.w vr21, t0, 28 // 2276 -> 1138 + vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29 + vmul_vmsub_w vr24, vr30, vr20, vr21, vr11, vr6 + vssrarni.h.w vr29, vr4, 12 // t26a + vssrarni.h.w vr6, vr11, 12 // t21a + + vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8 + vneg.w vr4, vr4 + vneg.w vr8, vr8 + vmul_vmsub_w vr28, vr19, vr20, vr21, vr11, vr24 + vssrarni.h.w vr8, vr4, 12 // t22a + vssrarni.h.w vr24, vr11, 12 // t25a + + vsadd.h vr4, vr1, vr31 // t16a + vssub.h vr30, vr1, vr31 // t19a + vsadd.h vr19, vr0, vr9 // t17 + vssub.h vr28, vr0, vr9 // t18 + vssub.h vr1, vr26, vr10 // t20a + vsadd.h vr31, vr26, vr10 // t23a + vssub.h vr0, vr8, vr6 // t21 + vsadd.h vr9, vr8, vr6 // t22 + vsadd.h vr10, vr27, vr25 // t24a + vssub.h vr26, vr27, vr25 // t27a + vsadd.h vr6, vr24, vr29 // t25 + vssub.h vr8, vr24, vr29 // t26 + vssub.h vr25, vr3, vr5 // t28a + vsadd.h vr27, vr3, vr5 // t31a + vssub.h vr24, vr7, vr2 // t29 + vsadd.h vr29, vr7, vr2 // t30 + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5 + vmul_vmsub_w vr24, vr28, vr20, vr21, vr11, vr2 + vssrarni.h.w vr5, vr3, 12 // t29a + vssrarni.h.w vr2, vr11, 12 // 18a + + vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7 + vmul_vmsub_w vr25, vr30, vr20, vr21, vr11, vr24 + vssrarni.h.w vr7, vr3, 12 // t28 + vssrarni.h.w vr24, vr11, 12 // t19 + + vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28 + vneg.w vr3, vr3 + vneg.w vr28, vr28 + vmul_vmsub_w vr26, vr1, vr20, vr21, vr11, vr25 + vssrarni.h.w vr28, vr3, 12 // t20 + vssrarni.h.w vr25, vr11, 12 // t27 + + vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30 + vneg.w vr3, vr3 + vneg.w vr30, vr30 + vmul_vmsub_w vr8, vr0, vr20, 
vr21, vr11, vr1 + vssrarni.h.w vr30, vr3, 12 // t21a + vssrarni.h.w vr1, vr11, 12 // t26a + + vsadd.h vr3, vr4, vr31 // t16 + vssub.h vr26, vr4, vr31 // t23 + vsadd.h vr0, vr19, vr9 // t17a + vssub.h vr8, vr19, vr9 // t22a + vsadd.h vr4, vr2, vr30 // t18 + vssub.h vr31, vr2, vr30 // t21 + vsadd.h vr9, vr24, vr28 // t19a + vssub.h vr19, vr24, vr28 // t20a + vssub.h vr2, vr27, vr10 // t24 + vsadd.h vr30, vr27, vr10 // t31 + vssub.h vr24, vr29, vr6 // t25a + vsadd.h vr28, vr29, vr6 // t30a + vssub.h vr10, vr5, vr1 // t26 + vsadd.h vr27, vr5, vr1 // t29 + vssub.h vr6, vr7, vr25 // t27a + vsadd.h vr29, vr7, vr25 // t28a + + vldrepl.w vr20, t0, 0 // 2896 + vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5 + vmul_vmadd_w vr6, vr19, vr20, vr20, vr11, vr7 + vssrarni.h.w vr5, vr1, 12 // t20 + vssrarni.h.w vr7, vr11, 12 // t27 + + vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25 + vmul_vmadd_w vr10, vr31, vr20, vr20, vr11, vr6 + vssrarni.h.w vr25, vr1, 12 // t21a + vssrarni.h.w vr6, vr11, 12 // t26a + + vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19 + vmul_vmadd_w vr24, vr8, vr20, vr20, vr11, vr10 + vssrarni.h.w vr19, vr1, 12 // t22 + vssrarni.h.w vr10, vr11, 12 // t25 + + vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31 + vmul_vmadd_w vr2, vr26, vr20, vr20, vr11, vr8 + vssrarni.h.w vr31, vr1, 12 // t23a + vssrarni.h.w vr8, vr11, 12 // t24a + + // t31 t30a t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16 + // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3 + + vld_x8 t3, 0, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 + + vsadd.h vr1, vr11, vr30 // c[0] + vssub.h vr2, vr11, vr30 // c[31] + vsadd.h vr24, vr12, vr28 // c[1] + vssub.h vr26, vr12, vr28 // c[30] + vsadd.h vr11, vr13, vr27 // c[2] + vssub.h vr30, vr13, vr27 // c[29] + vsadd.h vr12, vr14, vr29 // c[3] + vssub.h vr28, vr14, vr29 // c[28] + vsadd.h vr13, vr15, vr7 // c[4] + vssub.h vr27, vr15, vr7 // c[27] + vsadd.h vr14, vr16, vr6 // c[5] + vssub.h vr29, vr16, vr6 // c[26] + vsadd.h vr7, vr17, vr10 // c[6] + vssub.h vr15, vr17, vr10 // c[25] + vsadd.h vr6, vr18, vr8 // c[7] + vssub.h vr16, vr18, vr8 // c[24] + + vst_x8 t3, 0, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 + + vst_x8 t3, 384, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 + + vld_x8 t3, 128, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 + + vsadd.h vr1, vr11, vr31 // c[8] + vssub.h vr2, vr11, vr31 // c[23] + vsadd.h vr24, vr12, vr19 // c[9] + vssub.h vr26, vr12, vr19 // c[22] + vsadd.h vr11, vr13, vr25 // c[10] + vssub.h vr30, vr13, vr25 // c[21] + vsadd.h vr12, vr14, vr5 // c[11] + vssub.h vr28, vr14, vr5 // c[20] + vsadd.h vr13, vr15, vr9 // c[12] + vssub.h vr27, vr15, vr9 // c[19] + vsadd.h vr14, vr16, vr4 // c[13] + vssub.h vr29, vr16, vr4 // c[18] + vsadd.h vr7, vr17, vr0 // c[14] + vssub.h vr15, vr17, vr0 // c[17] + vsadd.h vr6, vr18, vr3 // c[15] + vssub.h vr16, vr18, vr3 // c[16] + + vst_x8 t3, 128, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 + + vst_x8 t3, 256, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 +.endm // dct_8x32_tx64_new_lsx + +function inv_txfm_add_dct_dct_64x64_8bpc_lsx + bnez a3, .NO_HAS_DCONLY_64x64 + + ld.h t2, a2, 0 + vldi vr0, 0x8b5 + vreplgr2vr.w vr1, t2 + vldi vr20, 0x880 + vmul.w vr2, vr0, vr1 + st.h zero, a2, 0 + vsrari.w vr2, vr2, 8 + vld vr3, a0, 48 + vsrari.w vr2, vr2, 2 + vld vr1, a0, 16 + vmadd.w vr20, vr2, vr0 + vld vr2, a0, 32 + vssrarni.h.w vr20, vr20, 12 + vld vr0, a0, 0 + + vsllwil.hu.bu vr4, vr0, 0 + vsllwil.hu.bu vr5, vr1, 0 + vsllwil.hu.bu vr6, vr2, 0 + vsllwil.hu.bu vr7, vr3, 0 + 
vexth.hu.bu vr0, vr0 + vexth.hu.bu vr1, vr1 + vexth.hu.bu vr2, vr2 + vexth.hu.bu vr3, vr3 + vadd.h vr8, vr4, vr20 + vadd.h vr9, vr0, vr20 + vadd.h vr10, vr5, vr20 + vadd.h vr11, vr1, vr20 + vadd.h vr12, vr6, vr20 + vadd.h vr13, vr2, vr20 + vadd.h vr14, vr7, vr20 + vadd.h vr15, vr3, vr20 + vssrani.bu.h vr9, vr8, 0 + vssrani.bu.h vr11, vr10, 0 + vssrani.bu.h vr13, vr12, 0 + vssrani.bu.h vr15, vr14, 0 + vst vr9, a0, 0 + vst vr11, a0, 16 + vst vr13, a0, 32 + vst vr15, a0, 48 + +.rept 63 + add.d a0, a0, a1 + vld vr0, a0, 0 + vld vr1, a0, 16 + vld vr2, a0, 32 + vld vr3, a0, 48 + vsllwil.hu.bu vr4, vr0, 0 + vsllwil.hu.bu vr5, vr1, 0 + vsllwil.hu.bu vr6, vr2, 0 + vsllwil.hu.bu vr7, vr3, 0 + vexth.hu.bu vr0, vr0 + vexth.hu.bu vr1, vr1 + vexth.hu.bu vr2, vr2 + vexth.hu.bu vr3, vr3 + vadd.h vr8, vr4, vr20 + vadd.h vr9, vr0, vr20 + vadd.h vr10, vr5, vr20 + vadd.h vr11, vr1, vr20 + vadd.h vr12, vr6, vr20 + vadd.h vr13, vr2, vr20 + vadd.h vr14, vr7, vr20 + vadd.h vr15, vr3, vr20 + vssrani.bu.h vr9, vr8, 0 + vssrani.bu.h vr11, vr10, 0 + vssrani.bu.h vr13, vr12, 0 + vssrani.bu.h vr15, vr14, 0 + vst vr9, a0, 0 + vst vr11, a0, 16 + vst vr13, a0, 32 + vst vr15, a0, 48 +.endr + b .DCT_DCT_64X64_END +.NO_HAS_DCONLY_64x64: + + malloc_space 64*32*2+512+512 + + addi.d t7, sp, 64 + +.macro dct64x64_core1_lsx in0, in1, in2 + addi.d t2, a2, \in0 + addi.d t7, t7, \in1 + li.w t4, 64*32*2+64 + add.d t3, sp, t4 + addi.d t6, t3, 512 + add.d t5, t6, zero + + dct_8x32_tx64_new_lsx 0, 256, 128, 256 + + la.local t0, idct64_coeffs + + addi.d t2, a2, \in2 // 32 ... + // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a + vld vr0, t2, 128*0 // in1 + vld vr1, t2, 128*15 // in31 + vld vr2, t2, 128*8 // in17 + vld vr3, t2, 128*7 // in15 + dct64_step1_lsx + + addi.d t0, t0, 48 + addi.d t6, t6, 128 + // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a + vld vr0, t2, 128*3 // in7 + vld vr1, t2, 128*12 // in25 + vld vr2, t2, 128*11 // in23 + vld vr3, t2, 128*4 // in9 + dct64_step1_lsx + + addi.d t0, t0, 48 + addi.d t6, t6, 128 + // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a + vld vr0, t2, 128*2 // in5 + vld vr1, t2, 128*13 // in27 + vld vr2, t2, 128*10 // in21 + vld vr3, t2, 128*5 // in11 + dct64_step1_lsx + + addi.d t0, t0, 48 + addi.d t6, t6, 128 + // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a + vld vr0, t2, 128*1 // in3 + vld vr1, t2, 128*14 // in29 + vld vr2, t2, 128*9 // in19 + vld vr3, t2, 128*6 // in13 + dct64_step1_lsx + + la.local t0, idct_coeffs + addi.d t4, t5, 16*7 + // t32a/t39/t40a/t47/t48/t55a/t56/t63a + dct64_step2_lsx + + addi.d t5, t5, 16 + addi.d t4, t4, -16 + // t33/t38a/t41/t46a/t49a/t54/t57a/t62 + dct64_step2_lsx + + addi.d t5, t5, 16 + addi.d t4, t4, -16 + // t34a/t37/t42a/t45/t50/t53a/t58/t61a + dct64_step2_lsx + + addi.d t5, t5, 16 + addi.d t4, t4, -16 + // t35/t36a/t43/t44a/t51a/t52/t59a/t60 + dct64_step2_lsx + + li.w t4, 64*32*2+64+512 + add.d t5, t4, sp + addi.d t4, t5, 16*7 + dct64_step4_lsx transpose8x8, 2, 0, 128, 112, 128 + + addi.d t3, t3, 128 + addi.d t4, t4, -16*8 + addi.d t5, t5, -16*8 + dct64_step4_lsx transpose8x8, 2, 16, 128, 96, 128 + + addi.d t5, t5, -16*8 + addi.d t4, t4, -16*8 + addi.d t3, t3, 128 + dct64_step4_lsx transpose8x8, 2, 32, 128, 80, 128 + + addi.d t5, t5, -16*8 + addi.d t4, t4, -16*8 + addi.d t3, t3, 128 + dct64_step4_lsx transpose8x8, 2, 48, 128, 64, 128 +.endm + + dct64x64_core1_lsx 0, 0, 64 + + dct64x64_core1_lsx 16, 128*8, 64+16 + + dct64x64_core1_lsx 32, 128*8, 64+16*2 + + dct64x64_core1_lsx 48, 128*8, 64+16*3 + + vreplgr2vr.h vr31, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 
128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024, 1040, 1056, 1072, 1088, 1104, 1120, 1136, 1152, 1168, 1184, 1200, 1216, 1232, 1248, 1264, 1280, 1296, 1312, 1328, 1344, 1360, 1376, 1392, 1408, 1424, 1440, 1456, 1472, 1488, 1504, 1520, 1536, 1552, 1568, 1584, 1600, 1616, 1632, 1648, 1664, 1680, 1696, 1712, 1728, 1744, 1760, 1776, 1792, 1808, 1824, 1840, 1856, 1872, 1888, 1904, 1920, 1936, 1952, 1968, 1984, 2000, 2016, 2032 + vst vr31, a2, \i +.endr + +.macro dct64x64_core2_lsx in0, in1 + addi.d t2, sp, 64+\in0 + addi.d t7, sp, 64+\in0 + li.w t4, 64*32*2+64 + add.d t3, sp, t4 + addi.d t6, t3, 512 + add.d t5, t6, zero + + addi.d t2, t2, 1024 + addi.d t2, t2, 1024 + dct_8x32_tx64_new_lsx -2048, 512, 256-2048, 512 + + la.local t0, idct64_coeffs + + addi.d t2, sp, 64+64*2+\in0 + addi.d t4, t2, 256*7 + addi.d t4, t4, 256 + + vld vr0, t2, 256*0 // in1 + vld vr1, t4, 256*7 // in31 + vld vr2, t4, 256*0 // in17 + vld vr3, t2, 256*7 // in15 + dct64_step1_lsx + + addi.d t0, t0, 48 + addi.d t6, t6, 128 + vld vr0, t2, 256*3 // in7 + vld vr1, t4, 256*4 // in25 + vld vr2, t4, 256*3 // in23 + vld vr3, t2, 256*4 // in9 + dct64_step1_lsx + + addi.d t0, t0, 48 + addi.d t6, t6, 128 + vld vr0, t2, 256*2 // in5 + vld vr1, t4, 256*5 // in27 + vld vr2, t4, 256*2 // in21 + vld vr3, t2, 256*5 // in11 + dct64_step1_lsx + + addi.d t0, t0, 48 + addi.d t6, t6, 128 + vld vr0, t2, 256*1 // in3 + vld vr1, t4, 256*6 // in29 + vld vr2, t4, 256*1 // in19 + vld vr3, t2, 256*6 // in13 + dct64_step1_lsx + + la.local t0, idct_coeffs + addi.d t4, t5, 16*7 + // t32a/t39/t40a/t47/t48/t55a/t56/t63a + dct64_step2_lsx + + addi.d t5, t5, 16 + addi.d t4, t4, -16 + // t33/t38a/t41/t46a/t49a/t54/t57a/t62 + dct64_step2_lsx + + addi.d t5, t5, 16 + addi.d t4, t4, -16 + // t34a/t37/t42a/t45/t50/t53a/t58/t61a + dct64_step2_lsx + + addi.d t5, t5, 16 + addi.d t4, t4, -16 + // t35/t36a/t43/t44a/t51a/t52/t59a/t60 + dct64_step2_lsx + + li.w t4, 64*32*2+64+512 + add.d t5, t4, sp + addi.d t4, t5, 16*7 + addi.d a0, a0, \in1 + // 0 - 7, 56 -63 + dct64_step3_lsx + + li.w t8, 0 + mul.w t0, t8, a1 + add.d t0, a0, t0 + alsl.d t6, a1, t0, 1 + addi.d t1, t0, 0 + add.d t2, t0, a1 + dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1 + + li.w t8, 56 + mul.w t0, t8, a1 + add.d t0, a0, t0 + alsl.d t6, a1, t0, 1 + addi.d t1, t0, 0 + add.d t2, t0, a1 + dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 + + // 8 - 15, 48 - 55 + addi.d t3, t3, 128 + addi.d t4, t4, -16*8 + addi.d t5, t5, -16*8 + dct64_step3_lsx + + li.w t8, 8 + mul.w t0, t8, a1 + add.d t0, t0, a0 + alsl.d t6, a1, t0, 1 + addi.d t1, t0, 0 + add.d t2, t0, a1 + dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1 + + li.w t8, 48 + mul.w t0, t8, a1 + add.d t0, t0, a0 + alsl.d t6, a1, t0, 1 + addi.d t1, t0, 0 + add.d t2, t0, a1 + dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 + + // 16 - 23, 40 - 47 + addi.d t3, t3, 128 + addi.d t4, t4, -16*8 + addi.d t5, t5, -16*8 + dct64_step3_lsx + + li.w t8, 16 + mul.w t0, t8, a1 + add.d t0, t0, a0 + alsl.d t6, a1, t0, 1 + addi.d t1, t0, 0 + add.d t2, t0, a1 + dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1 + + li.w t8, 40 + mul.w t0, t8, a1 + add.d t0, t0, a0 + alsl.d t6, a1, t0, 1 + addi.d t1, t0, 0 + add.d t2, t0, a1 + dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, 
vr21 + + // 24 - 31, 32 - 39 + addi.d t3, t3, 128 + addi.d t4, t4, -16*8 + addi.d t5, t5, -16*8 + dct64_step3_lsx + + li.w t8, 24 + mul.w t0, t8, a1 + add.d t0, t0, a0 + alsl.d t6, a1, t0, 1 + addi.d t1, t0, 0 + add.d t2, t0, a1 + dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1 + + li.w t8, 32 + mul.w t0, t8, a1 + add.d t0, t0, a0 + alsl.d t6, a1, t0, 1 + addi.d t1, t0, 0 + add.d t2, t0, a1 + dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 +.endm + + dct64x64_core2_lsx 16*0, 0 + + dct64x64_core2_lsx 16*1, 8 + + dct64x64_core2_lsx 16*2, 8 + + dct64x64_core2_lsx 16*3, 8 + + dct64x64_core2_lsx 16*4, 8 + + dct64x64_core2_lsx 16*5, 8 + + dct64x64_core2_lsx 16*6, 8 + + dct64x64_core2_lsx 16*7, 8 + + free_space 64*32*2+512+512 +.DCT_DCT_64X64_END: +endfunc diff --git a/src/loongarch/itx.h b/src/loongarch/itx.h index 7ed3e09..3ad444f 100644 --- a/src/loongarch/itx.h +++ b/src/loongarch/itx.h @@ -103,6 +103,10 @@ decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_16x16, lsx)); decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x32, lsx)); decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x32, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x32, lsx)); + +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, lsx)); + static ALWAYS_INLINE void itx_dsp_init_loongarch(Dav1dInvTxfmDSPContext *const c, int bpc) { #if BITDEPTH == 8 const unsigned flags = dav1d_get_cpu_flags(); @@ -183,6 +187,8 @@ static ALWAYS_INLINE void itx_dsp_init_loongarch(Dav1dInvTxfmDSPContext *const c c->itxfm_add[RTX_8X32][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x32_8bpc_lsx; c->itxfm_add[TX_32X32][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_32x32_8bpc_lsx; + + c->itxfm_add[TX_64X64][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_64x64_8bpc_lsx; #endif } -- cgit v1.2.3 From e2c7a4408b78a05460cea2b98afbca9048567f14 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Mon, 15 Jan 2024 15:12:23 +0100 Subject: x86: Add high bit-depth ipred z3 AVX-512 (Ice Lake) asm --- src/x86/ipred.h | 2 +- src/x86/ipred16_avx512.asm | 659 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 655 insertions(+), 6 deletions(-) diff --git a/src/x86/ipred.h b/src/x86/ipred.h index 1815e37..f5f187e 100644 --- a/src/x86/ipred.h +++ b/src/x86/ipred.h @@ -138,13 +138,13 @@ static ALWAYS_INLINE void intra_pred_dsp_init_x86(Dav1dIntraPredDSPContext *cons init_angular_ipred_fn(HOR_PRED, ipred_h, avx512icl); init_angular_ipred_fn(VERT_PRED, ipred_v, avx512icl); init_angular_ipred_fn(Z2_PRED, ipred_z2, avx512icl); - init_angular_ipred_fn(Z3_PRED, ipred_z3, avx512icl); #endif init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx512icl); init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx512icl); init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx512icl); init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx512icl); init_angular_ipred_fn(Z1_PRED, ipred_z1, avx512icl); + init_angular_ipred_fn(Z3_PRED, ipred_z3, avx512icl); init_angular_ipred_fn(FILTER_PRED, ipred_filter, avx512icl); c->pal_pred = BF(dav1d_pal_pred, avx512icl); diff --git a/src/x86/ipred16_avx512.asm b/src/x86/ipred16_avx512.asm index 94eaa3f..8124a3b 100644 --- a/src/x86/ipred16_avx512.asm +++ b/src/x86/ipred16_avx512.asm @@ -1,5 +1,5 @@ -; Copyright © 2022, VideoLAN and dav1d authors -; Copyright © 2022, Two Orioles, LLC +; Copyright © 2022-2024, VideoLAN and dav1d authors +; Copyright © 2022-2024, Two Orioles, LLC ; All rights reserved. 
; ; Redistribution and use in source and binary forms, with or without @@ -42,12 +42,16 @@ pal_pred_perm: db 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51 db 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55 db 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59 db 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63 -pw_0to31: dw 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - dw 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 +pw_31to0: dw 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16 + dw 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +pw_1to32: dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + dw 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 z_upsample: dw 0, -1, 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6 dw 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14 z_xpos_mul: dw 1, 1, 1, 1, 2, 2, 1, 1, 3, 3, 2, 2, 4, 4, 2, 2 dw 5, 5, 3, 3, 6, 6, 3, 3, 7, 7, 4, 4, 8, 8, 4, 4 +z_ypos_mul: dw 0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 1, 1, 3, 3, 1, 1 + dw 4, 4, 2, 2, 5, 5, 2, 2, 6, 6, 3, 3, 7, 7, 3, 3 z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0 z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0 z_xpos_off1a: dw 30720, 30784, 30848, 30912, 30976, 31040, 31104, 31168 @@ -75,13 +79,25 @@ z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39 z_filter_k: dw 8, 8, 6, 6, 4, 4 dw 4, 4, 5, 5, 4, 4 dw 0, 0, 0, 0, 2, 2 +pw_15: times 2 dw 15 +pw_16: times 2 dw 16 pw_17: times 2 dw 17 +pw_24: times 2 dw 24 +pw_32: times 2 dw 32 pw_63: times 2 dw 63 +pw_64: times 2 dw 64 pw_512: times 2 dw 512 pw_31806: times 2 dw 31806 +pw_32640: times 2 dw 32640 +pw_32672: times 2 dw 32672 +pw_32704: times 2 dw 32704 +pw_32735: times 2 dw 32735 +pw_32736: times 2 dw 32736 +%define pw_2 (z_xpos_mul+4* 2) %define pw_3 (z_xpos_mul+4* 4) %define pw_7 (z_xpos_mul+4*12) +%define pw_0to31 (pw_1to32-2) %macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) @@ -98,6 +114,7 @@ JMP_TABLE ipred_smooth_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_v_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_z1_16bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z3_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE pal_pred_16bpc, avx512icl, w4, w8, w16, w32, w64 cextern smooth_weights_1d_16bpc @@ -757,7 +774,7 @@ cglobal ipred_z1_16bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx lea r3d, [angleq+216] movu ym5, [tlq] mov r3b, hb - mova m10, [base+pw_0to31] + movu m10, [base+pw_0to31] cmp r3d, 8 ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 lea r3d, [hq+7] @@ -1157,6 +1174,638 @@ cglobal ipred_z1_16bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx mov rsp, r7 RET +cglobal ipred_z3_16bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy + lea r7, [z_filter_t0] + tzcnt wd, wm + movifnidn angled, anglem + lea t0, [dr_intra_derivative+45*2-1] + movsxd wq, [base+ipred_z3_16bpc_avx512icl_table+wq*4] + sub angled, 180 + mov dyd, angled + neg dyd + xor angled, 0x400 + or dyq, ~0x7e + mova m0, [base+pw_31to0] + movzx dyd, word [t0+dyq] + lea wq, [base+ipred_z3_16bpc_avx512icl_table+wq] + movifnidn hd, hm + vpbroadcastd m14, [base+pw_31806] + vpbroadcastd m15, [base+pw_1] + jmp wq +.w4: + lea r3d, [hq+3] + xor r3d, 31 ; 32 - (h + imin(w, h)) + vpbroadcastw m7, r3d + pmaxuw m7, m0 + vpermw m6, m7, [tlq-64*1] + test angled, 0x400 ; !enable_intra_edge_filter + jnz 
.w4_main + cmp angleb, 40 + jae .w4_filter + lea r3d, [angleq-1024] + sar r3d, 7 + add r3d, hd + jg .w4_filter ; h > 8 || (h == 8 && is_sm) + call .upsample + movsldup m1, [base+z_ypos_mul] + paddw m1, m1 + jmp .w4_main2 +.w4_filter: + lea r3d, [hq+3] + call .filter32 +.w4_main: + movsldup m1, [base+z_ypos_mul] +.w4_main2: + vpbroadcastq m0, [base+pw_1to32] + vpbroadcastw m4, dyd + lea r2d, [hq+4] + shr r2d, 3 + pmullw m4, m0 ; ypos + vpbroadcastw m0, r2d + imul r2, strideq ; stride * imax(height / 8, 1) + pmullw m1, m0 + lea r3, [r2*3] + paddd m1, [base+pw_32736] {1to16} + psrlw m2, m4, 6 + psllw m4, 9 + paddsw m2, m1 ; base+0 + vpandd m4, m14 ; frac << 9 + vpermw m3, m2, m6 ; left[base+0] +.w4_loop: + paddsw m2, m15 ; base+1 + vpermw m1, m2, m6 ; left[base+1] + psubw m0, m1, m3 + pmulhrsw m0, m4 + paddw m0, m3 + movq [dstq+r2*0], xm0 + movhps [dstq+r2*1], xm0 + vextracti32x4 xm3, ym0, 1 + movq [dstq+r2*2], xm3 + movhps [dstq+r3 ], xm3 + sub hd, 8 + jl .w4_end + lea r5, [dstq+r2*4] + vextracti32x8 ym0, m0, 1 + mova m3, m1 + movq [r5+r2*0], xm0 + movhps [r5+r2*1], xm0 + vextracti32x4 xm1, ym0, 1 + movq [r5+r2*2], xm1 + movhps [r5+r3 ], xm1 + add dstq, strideq + test hd, hd + jnz .w4_loop +.w4_end: + RET +.upsample: + vinserti32x4 m6, [tlq-14], 3 + mova m3, [base+z_upsample] + vpbroadcastd m4, [base+pd_65536] + add dyd, dyd + vpermw m0, m3, m6 + paddw m3, m4 + vpermw m1, m3, m6 + paddw m3, m4 + vpermw m2, m3, m6 + paddw m3, m4 + vpermw m3, m3, m6 + vpbroadcastw m6, r9m ; pixel_max + paddw m1, m2 ; b+c + paddw m0, m3 ; a+d + psubw m0, m1, m0 + psraw m0, 3 + pxor m2, m2 + paddw m0, m1 + pmaxsw m0, m2 + pavgw m0, m2 + pminsw m6, m0 + ret +.w8: + mova m6, [tlq-64*1] + cmp hd, 32 + je .w8_h32 + mov r3d, 8 + cmp hd, 4 + cmove r3d, hd + lea r3d, [r3+hq-1] + xor r3d, 31 ; 32 - (h + imin(w, h)) + vpbroadcastw m1, r3d + vpermw m7, m1, m6 + pmaxuw m1, m0 + vpermw m6, m1, m6 + test angled, 0x400 + jnz .w8_main + lea r3d, [angleq+216] + mov r3b, hb + cmp r3d, 8 + ja .w8_filter ; is_sm || d >= 40 || h > 8 + call .upsample + movshdup m1, [base+z_ypos_mul] + paddw m1, m1 + call .w8_main_setup +.w8_upsample_loop: + vpermw m3, m2, m6 ; left[base+0] + paddw m2, m15 ; base+1 + vpermw m1, m2, m6 ; left[base+1] + psubw m0, m1, m3 + pmulhrsw m0, m4 + paddw m2, m15 ; base+2 + paddw m0, m3 + mova m3, m1 + mova [dstq+r2*0], xm0 + vextracti32x4 [dstq+r2*1], ym0, 1 + vextracti32x4 [dstq+r2*2], m0, 2 + vextracti32x4 [dstq+r3 ], m0, 3 + add dstq, strideq + sub hd, 4 + jg .w8_upsample_loop + RET +.w8_main_setup: + vbroadcasti32x4 m0, [base+pw_1to32] + vpbroadcastw m4, dyd + rorx r2d, hd, 2 + pmullw m4, m0 ; ypos + vpbroadcastw m0, r2d + imul r2, strideq ; stride * height / 4 + lea r3, [r2*3] + pmullw m1, m0 ; 0 1 2 3 + paddd m1, [base+pw_32704] {1to16} + psrlw m2, m4, 6 + psllw m4, 9 + paddsw m2, m1 ; base+0 + vpandd m4, m14 ; frac << 9 + ret +.w8_h32: + pmaxud m7, m0, [base+pw_24] {1to16} + vpermw m6, m0, m6 + vpermw m7, m7, [tlq-64*2] + test angled, 0x400 + jnz .w8_main + call .filter64 + vpbroadcastd m0, [base+pw_7] + pminuw m0, [base+pw_0to31] + vpermw m7, m0, m7 + jmp .w8_main +.w8_filter: + lea r3d, [hq+7] + call .filter32 +.w8_main: + movshdup m1, [base+z_ypos_mul] + call .w8_main_setup + mova m3, m6 + vpermt2w m3, m2, m7 ; left[base+0] +.w8_loop: + paddsw m2, m15 ; base+1 + mova m1, m6 + vpermt2w m1, m2, m7 ; left[base+1] + psubw m0, m1, m3 + pmulhrsw m0, m4 + paddw m0, m3 + mova m3, m1 + mova [dstq+r2*0], xm0 + vextracti32x4 [dstq+r2*1], ym0, 1 + vextracti32x4 [dstq+r2*2], m0, 2 + vextracti32x4 [dstq+r3 ], m0, 3 
+ add dstq, strideq + sub hd, 4 + jg .w8_loop + RET +.filter32: + vpbroadcastb ym10, r3d + vpbroadcastb ym1, angled + shr angled, 8 + vpcmpeqb k1, ym10, [base+z_filter_wh] + mova xm2, [base+z_filter_t0+angleq*8] + vpcmpgtb k1{k1}, ym1, ym2 + kmovd r5d, k1 + test r5d, r5d + jz .filter32_end + vpbroadcastw m2, [tlq] + popcnt r5d, r5d + vpbroadcastd m5, [base+z_filter_k+(r5-1)*4+12*0] + valignq m2, m6, m2, 6 + vpbroadcastd m8, [base+z_filter_k+(r5-1)*4+12*1] + valignq m4, m7, m6, 2 + vpbroadcastd m9, [base+z_filter_k+(r5-1)*4+12*2] + palignr m1, m6, m2, 14 + pmullw m5, m6 + palignr m3, m4, m6, 2 + paddw m1, m3 + palignr m2, m6, m2, 12 + pmullw m1, m8 + palignr m4, m6, 4 + paddw m2, m4 + pmullw m2, m9 + pmovzxbw m10, ym10 + pxor m6, m6 + paddw m5, m1 + pminuw m1, m10, [base+pw_0to31] + paddw m5, m2 + psrlw m5, 3 + pavgw m6, m5 + vpermw m7, m10, m6 + vpermw m6, m1, m6 +.filter32_end: + ret +.w16: + mova m6, [tlq-64*1] + cmp hd, 32 + jl .w16_h16 + pmaxud m8, m0, [base+pw_16] {1to16} + mova m7, [tlq-64*2] + vpermw m6, m0, m6 + jg .w16_h64 + vpermw m7, m8, m7 + test angled, 0x400 + jnz .w16_main + call .filter64 + vpbroadcastd m0, [base+pw_15] + vinserti32x8 m0, [base+pw_0to31], 0 + vpermw m7, m0, m7 + jmp .w16_main +.w16_h16: + lea r3d, [hq*2-1] + xor r3d, 31 ; 32 - (h + imin(w, h)) + vpbroadcastw m1, r3d + vpermw m7, m1, m6 + pmaxuw m1, m0 + vpermw m6, m1, m6 + test angled, 0x400 + jnz .w16_main + lea r3d, [hq+15] + call .filter32 +.w16_main: + vbroadcasti32x8 m0, [base+pw_1to32] + vpbroadcastw m4, dyd + rorx r2d, hd, 1 + pmullw m4, m0 ; ypos + vpbroadcastw ym1, r2d + imul r2, strideq ; stride * height / 2 + paddd m1, [base+pw_32704] {1to16} + lea r3, [r2+strideq] + psrlw m2, m4, 6 + psllw m4, 9 + paddsw m2, m1 ; base+0 + vpandd m4, m14 ; frac << 9 + mova m3, m6 + vpermt2w m3, m2, m7 ; left[base+0] +.w16_loop: + paddsw m1, m2, m15 ; base+1 + paddsw m2, m1, m15 ; base+2 + vpermi2w m1, m6, m7 ; left[base+1] + psubw m0, m1, m3 + pmulhrsw m0, m4 + paddw m0, m3 + mova m3, m6 + vpermt2w m3, m2, m7 ; left[base+2] + vextracti32x8 [dstq+strideq*0], m0, 1 + mova [dstq+r2 ], ym0 + psubw m0, m3, m1 + pmulhrsw m0, m4 + paddw m0, m1 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+r3 ], ym0 + lea dstq, [dstq+strideq*2] + sub hd, 4 + jg .w16_loop + RET +.w16_h64: + vpermw m7, m0, m7 + vpermw m8, m8, [tlq-64*3] + test angled, 0x400 + jnz .w16_h64_main + valignq m11, m8, m7, 6 + call .filter64 + vshufi32x4 m2, m8, m8, q3321 + vpbroadcastd m0, [base+pw_15] + palignr ym3, ym8, ym11, 12 + vinserti32x8 m0, [base+pw_0to31], 0 + palignr ym4, ym8, ym11, 14 + palignr ym1, ym2, ym8, 4 + paddw ym3, ym5 + palignr ym2, ym8, 2 + paddw ym8, ym4 + pavgw ym3, ym1 + paddw ym8, ym2 + paddw ym8, ym3 + psrlw ym8, 2 + vpermw m8, m0, m8 +.w16_h64_main: + vbroadcasti32x8 m0, [base+pw_1to32] + vpbroadcastw m4, dyd + pmullw m4, m0 ; ypos + vpbroadcastd ym1, [base+pw_32] + paddd m1, [base+pw_32672] {1to16} + mov r2, strideq + shl r2, 5 ; stride*32 + vpbroadcastd m9, [base+pw_32735] + lea r3, [r2+strideq] + psrlw m2, m4, 6 + psllw m4, 9 + paddsw m2, m1 ; base+0 + vpandd m4, m14 ; frac << 9 + mova m3, m7 + vpermt2w m3, m2, m6 + vpcmpgtw k1, m2, m9 + vpermw m3{k1}, m2, m8 ; left[base+0] +.w16_h64_loop: + paddsw m2, m15 ; base+1 + mova m1, m7 + vpermt2w m1, m2, m6 + vpcmpgtw k1, m2, m9 + vpermw m1{k1}, m2, m8 ; left[base+1] + psubw m0, m1, m3 + pmulhrsw m0, m4 + paddsw m2, m15 ; base+2 + paddw m0, m3 + mova m3, m7 + vpermt2w m3, m2, m6 + vpcmpgtw k1, m2, m9 + vpermw m3{k1}, m2, m8 ; left[base+2] + vextracti32x8 [dstq+strideq*0], m0, 1 + 
mova [dstq+r2 ], ym0 + psubw m0, m3, m1 + pmulhrsw m0, m4 + paddw m0, m1 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+r3 ], ym0 + lea dstq, [dstq+strideq*2] + sub hd, 4 + jg .w16_h64_loop + RET +.filter64: + vpbroadcastw m2, [tlq] + vpbroadcastd m5, [base+pw_3] + valignq m2, m6, m2, 6 + valignq m4, m7, m6, 2 + valignq m10, m7, m6, 6 + palignr m1, m6, m2, 12 + palignr m2, m6, m2, 14 + palignr m3, m4, m6, 4 + paddw m1, m5 + palignr m4, m6, 2 + paddw m6, m2 + valignq m2, m8, m7, 2 + pavgw m1, m3 + palignr m3, m7, m10, 12 + paddw m6, m4 + palignr m4, m7, m10, 14 + paddw m6, m1 + palignr m1, m2, m7, 4 + psrlw m6, 2 + palignr m2, m7, 2 + paddw m3, m5 + paddw m7, m4 + pavgw m3, m1 + paddw m7, m2 + paddw m7, m3 + psrlw m7, 2 + ret +.w32: + mova m6, [tlq-64*1] + cmp hd, 32 + jl .w32_h16 + mova m8, [tlq-64*2] + vpermw m6, m0, m6 + vpermw m7, m0, m8 + jg .w32_h64 + test angled, 0x400 + jnz .w32_main + vpbroadcastw xm8, xm8 + jmp .w32_filter +.w32_h16: + lea r3d, [hq*2-1] + xor r3d, 31 ; 32 - (h + imin(w, h)) + vpbroadcastw m1, r3d + vpermw m7, m1, m6 + pmaxuw m1, m0 + vpermw m6, m1, m6 + test angled, 0x400 + jnz .w32_main + vextracti32x4 xm8, m7, 3 +.w32_filter: + call .filter64 +.w32_main: + vpbroadcastw m4, dyd + vpbroadcastd m1, [base+pw_32704] + pmullw m4, [base+pw_1to32] ; ypos + psrlw m2, m4, 6 + psllw m4, 9 + paddsw m2, m1 ; base+0 + vpandd m4, m14 ; frac << 9 + mova m3, m6 + vpermt2w m3, m2, m7 ; left[base+0] +.w32_loop: + paddsw m1, m2, m15 ; base+1 + paddsw m2, m1, m15 ; base+2 + vpermi2w m1, m6, m7 ; left[base+1] + psubw m0, m1, m3 + pmulhrsw m0, m4 + paddw m0, m3 + mova m3, m6 + vpermt2w m3, m2, m7 ; left[base+2] + mova [dstq+strideq*0], m0 + psubw m0, m3, m1 + pmulhrsw m0, m4 + paddw m0, m1 + mova [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_loop + RET +.w32_h64: + mova m9, [tlq-64*3] + vpermw m8, m0, m9 + test angled, 0x400 + jnz .w32_h64_main + vpbroadcastw xm9, xm9 + call .filter96 +.w32_h64_main: + vpbroadcastw m4, dyd + vpbroadcastd m1, [base+pw_32672] + pmullw m4, [base+pw_1to32] ; ypos + vpbroadcastd m9, [base+pw_32735] + psrlw m2, m4, 6 + psllw m4, 9 + paddsw m2, m1 ; base+0 + vpandd m4, m14 ; frac << 9 + mova m3, m7 + vpermt2w m3, m2, m6 + vpcmpgtw k1, m2, m9 + vpermw m3{k1}, m2, m8 ; left[base+0] +.w32_h64_loop: + paddsw m2, m15 ; base+1 + mova m1, m7 + vpermt2w m1, m2, m6 + vpcmpgtw k1, m2, m9 + vpermw m1{k1}, m2, m8 ; left[base+1] + psubw m0, m1, m3 + pmulhrsw m0, m4 + paddsw m2, m15 ; base+2 + paddw m0, m3 + mova m3, m7 + vpermt2w m3, m2, m6 + vpcmpgtw k1, m2, m9 + vpermw m3{k1}, m2, m8 ; left[base+2] + mova [dstq+strideq*0], m0 + psubw m0, m3, m1 + pmulhrsw m0, m4 + paddw m0, m1 + mova [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_h64_loop + RET +.filter96: + valignq m11, m8, m7, 6 + call .filter64 + valignq m2, m9, m8, 2 + palignr m3, m8, m11, 12 + palignr m4, m8, m11, 14 + palignr m1, m2, m8, 4 + paddw m3, m5 + palignr m2, m8, 2 + paddw m8, m4 + pavgw m3, m1 + paddw m8, m2 + paddw m8, m3 + psrlw m8, 2 + ret +.w64: + mova m7, [tlq-64*1] + vpermw m6, m0, m7 + cmp hd, 32 + jl .w64_h16 + mova m8, [tlq-64*2] + vpermw m7, m0, m8 + jg .w64_h64 + test angled, 0x400 + jnz .w64_main + vpbroadcastw m8, xm8 + mova m9, m8 + call .filter96 + vshufi32x4 m9, m8, m8, q3333 + jmp .w64_h64_main +.w64_h16: + vpbroadcastw m7, xm7 + test angled, 0x400 + jnz .w64_main + mova m8, m7 + call .filter64 +.w64_main: + vpbroadcastw m11, dyd + vpbroadcastd m1, [base+pw_32704] + pmullw m10, m11, [base+pw_1to32] ; ypos + psllw m11, 5 + psrlw 
m8, m10, 6 + paddw m11, m10 + psllw m10, 9 + psrlw m9, m11, 6 + psllw m11, 9 + psubw m9, m8 + paddsw m8, m1 ; base+0 + vpandd m10, m14 ; frac << 9 + vpandd m11, m14 ; frac << 9 + mova m4, m6 + vpermt2w m4, m8, m7 ; left[base+0] ( 0..31) + paddsw m5, m8, m9 + vpermi2w m5, m6, m7 ; left[base+0] (32..63) +.w64_loop: + paddsw m8, m15 ; base+1 ( 0..31) + mova m2, m6 + vpermt2w m2, m8, m7 ; left[base+1] ( 0..31) + paddsw m3, m8, m9 ; base+1 (32..63) + vpermi2w m3, m6, m7 ; left[base+1] (32..63) + psubw m0, m2, m4 + psubw m1, m3, m5 + pmulhrsw m0, m10 + pmulhrsw m1, m11 + paddw m0, m4 + paddw m1, m5 + mova m4, m2 + mova [dstq+64*0], m0 + mova m5, m3 + mova [dstq+64*1], m1 + add dstq, strideq + dec hd + jg .w64_loop + RET +.w64_h64: + vpermw m8, m0, [tlq-64*3] + mova m13, [tlq-64*4] + vpermw m9, m0, m13 + test angled, 0x400 + jnz .w64_h64_main + valignq m12, m9, m8, 6 + call .filter96 + vpbroadcastw xm2, xm13 + valignq m2, m9, 2 + palignr m3, m9, m12, 12 + palignr m4, m9, m12, 14 + palignr m1, m2, m9, 4 + paddw m3, m5 + palignr m2, m9, 2 + paddw m9, m4 + pavgw m3, m1 + paddw m9, m2 + paddw m9, m3 + psrlw m9, 2 +.w64_h64_main: + vpbroadcastw m11, dyd + vpbroadcastd m1, [base+pw_32640] + pmullw m10, m11, [base+pw_1to32] ; ypos + psllw m11, 5 + psrlw m12, m10, 6 + paddw m11, m10 + psllw m10, 9 + psrlw m13, m11, 6 + psllw m11, 9 + psubw m13, m12 + paddsw m12, m1 ; base+0 + vpandd m10, m14 ; frac << 9 + vpandd m11, m14 ; frac << 9 + vpbroadcastd m14, [base+pw_64] + mova m4, m6 + vpermt2w m4, m12, m7 + vptestmw k1, m12, m14 + mova m0, m8 + vpermt2w m0, m12, m9 + paddsw m1, m12, m13 + mova m5, m6 + vpermt2w m5, m1, m7 + vptestmw k2, m1, m14 + vpermi2w m1, m8, m9 + vmovdqu16 m4{k1}, m0 ; left[base+0] ( 0..31) + vmovdqu16 m5{k2}, m1 ; left[base+0] (32..63) +.w64_h64_loop: + paddsw m12, m15 ; base+1 + mova m2, m6 + vpermt2w m2, m12, m7 + vptestmw k1, m12, m14 + mova m0, m8 + vpermt2w m0, m12, m9 + paddsw m1, m12, m13 + mova m3, m6 + vpermt2w m3, m1, m7 + vptestmw k2, m1, m14 + vpermi2w m1, m8, m9 + vmovdqu16 m2{k1}, m0 ; left[base+1] ( 0..31) + vmovdqu16 m3{k2}, m1 ; left[base+1] (32..63) + psubw m0, m2, m4 + psubw m1, m3, m5 + pmulhrsw m0, m10 + pmulhrsw m1, m11 + paddw m0, m4 + paddw m1, m5 + mova m4, m2 + mova [dstq+64*0], m0 + mova m5, m3 + mova [dstq+64*1], m1 + add dstq, strideq + dec hd + jg .w64_h64_loop + RET + cglobal pal_pred_16bpc, 4, 7, 7, dst, stride, pal, idx, w, h, stride3 lea r6, [pal_pred_16bpc_avx512icl_table] tzcnt wd, wm -- cgit v1.2.3 From b084160736bf9955c324a0eff0c6a24148f68b35 Mon Sep 17 00:00:00 2001 From: Matthias Dressel Date: Mon, 22 Jan 2024 16:37:25 +0100 Subject: CI: Switch to using 'testdata' suite Simplifies testing and also contains the forgotten 'testdata-multi' suite which was added later. 
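
For illustration (not part of the diff itself), a single local invocation along the lines of
the CI jobs below now exercises all of the per-bit-depth and multi-threaded test data at once,
assuming the 'testdata' parent suite groups them as described above:

    meson test -q --suite testdata --test-args "--cpumask avx2"

rather than passing --suite testdata-8 --suite testdata-10 --suite testdata-12 individually.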
--- .gitlab-ci.yml | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index cfafb61..60e8b37 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -463,11 +463,11 @@ build-debian-loongarch64: - ninja -C build - cd build - exit_code=0 - - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask 0" || exit_code=$((exit_code + $?)) - - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask sse2" || exit_code=$((exit_code + $?)) - - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask ssse3" || exit_code=$((exit_code + $?)) - - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask sse41" || exit_code=$((exit_code + $?)) - - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask avx2" || exit_code=$((exit_code + $?)) + - time meson test -q --suite testdata --test-args "--cpumask 0" || exit_code=$((exit_code + $?)) + - time meson test -q --suite testdata --test-args "--cpumask sse2" || exit_code=$((exit_code + $?)) + - time meson test -q --suite testdata --test-args "--cpumask ssse3" || exit_code=$((exit_code + $?)) + - time meson test -q --suite testdata --test-args "--cpumask sse41" || exit_code=$((exit_code + $?)) + - time meson test -q --suite testdata --test-args "--cpumask avx2" || exit_code=$((exit_code + $?)) - if [ $exit_code -ne 0 ]; then exit $exit_code; fi .test-argon: @@ -509,7 +509,7 @@ test-debian: grep -Eo '[0-9.]+' | awk '{ print "coverage:", $1 * 100 } ' - time meson test -v --suite testdata_seek-stress --test-args "--threads 2 --framedelay 1" - time meson test -v --suite testdata_seek-stress --test-args "--threads 2 --framedelay 2" - - time meson test -v --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--threads=1 --negstride" + - time meson test -v --suite testdata --test-args "--threads=1 --negstride" coverage: '/^coverage: (\d+.\d+)$/' artifacts: expose_as: 'Coverage HTML report' @@ -547,8 +547,8 @@ test-debian-avx512: -Dtestdata_tests=true -Dtrim_dsp=false - ninja -C build - - cd build && time meson test --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask avx512icl" - - time meson test --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--threads 2 --framedelay 2 --cpumask avx512icl" + - cd build && time meson test --suite testdata --test-args "--cpumask avx512icl" + - time meson test --suite testdata --test-args "--threads 2 --framedelay 2 --cpumask avx512icl" test-debian-unaligned-stack: extends: @@ -584,8 +584,8 @@ test-debian-asan: - cd build - exit_code=0 - time meson test -v --setup=sanitizer --suite checkasm || exit_code=$((exit_code + $?)) - - time meson test -v --setup=sanitizer --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask 0" || exit_code=$((exit_code + $?)) - - time meson test -v --setup=sanitizer --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask 0xff" || exit_code=$((exit_code + $?)) + - time meson test -v --setup=sanitizer --suite testdata --test-args "--cpumask 0" || exit_code=$((exit_code + $?)) + - time meson test -v --setup=sanitizer --suite testdata --test-args "--cpumask 0xff" || exit_code=$((exit_code + $?)) - if [ $exit_code -ne 0 ]; then exit $exit_code; fi test-debian-msan: @@ -641,9 +641,9 @@ test-debian-tsan: - 
ninja -C build - cd build - exit_code=0 - - time meson test -v --setup=sanitizer --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--threads 2 --framedelay 1" || exit_code=$((exit_code + $?)) - - time meson test -v --setup=sanitizer --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--threads 2 --framedelay 2" || exit_code=$((exit_code + $?)) - - time meson test -v --setup=sanitizer --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--threads 2 --framedelay 2 --negstride" || exit_code=$((exit_code + $?)) + - time meson test -v --setup=sanitizer --suite testdata --test-args "--threads 2 --framedelay 1" || exit_code=$((exit_code + $?)) + - time meson test -v --setup=sanitizer --suite testdata --test-args "--threads 2 --framedelay 2" || exit_code=$((exit_code + $?)) + - time meson test -v --setup=sanitizer --suite testdata --test-args "--threads 2 --framedelay 2 --negstride" || exit_code=$((exit_code + $?)) - time meson test -v --setup=sanitizer --suite testdata_seek-stress --test-args "--threads 2 --framedelay 1" || exit_code=$((exit_code + $?)) - time meson test -v --setup=sanitizer --suite testdata_seek-stress --test-args "--threads 2 --framedelay 2" || exit_code=$((exit_code + $?)) - time meson test -v --setup=sanitizer --suite oss-fuzz-asan --suite oss-fuzz-msan --suite oss-fuzz-ubsan || exit_code=$((exit_code + $?)) -- cgit v1.2.3 From 2c9bbb49082c6e6e7fc9917c2a9f27db97b64675 Mon Sep 17 00:00:00 2001 From: Matthias Dressel Date: Tue, 23 Jan 2024 01:57:49 +0100 Subject: meson: Add 'enable_seek_stress' option Allows to explicitly enable/disable seek-stress tests. --- .gitlab-ci.yml | 5 +++++ meson_options.txt | 5 +++++ tests/meson.build | 2 +- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 60e8b37..2cb4c37 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -497,6 +497,7 @@ test-debian: script: - meson setup build --buildtype release -Dtestdata_tests=true + -Denable_seek_stress=true -Dlogging=false -Db_coverage=true -Dtrim_dsp=false @@ -562,6 +563,7 @@ test-debian-unaligned-stack: script: - meson setup build --buildtype release -Dtestdata_tests=true + -Denable_seek_stress=true -Dlogging=false -Dstack_alignment=16 -Dtrim_dsp=false @@ -599,6 +601,7 @@ test-debian-msan: script: - meson setup build --buildtype debugoptimized -Dtestdata_tests=true + -Denable_seek_stress=true -Dlogging=false -Db_sanitize=memory -Db_lundef=false @@ -617,6 +620,7 @@ test-debian-ubsan: script: - meson setup build --buildtype debugoptimized -Dtestdata_tests=true + -Denable_seek_stress=true -Dlogging=false -Db_sanitize=undefined -Db_lundef=false @@ -635,6 +639,7 @@ test-debian-tsan: script: - meson setup build --buildtype debugoptimized -Dtestdata_tests=true + -Denable_seek_stress=true -Dlogging=false -Db_sanitize=thread -Db_lundef=false diff --git a/meson_options.txt b/meson_options.txt index 91a0f6c..889bd0a 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -25,6 +25,11 @@ option('enable_tests', value: true, description: 'Build dav1d tests') +option('enable_seek_stress', + type: 'boolean', + value: true, + description: 'Build seek_stress test tool') + option('enable_docs', type: 'boolean', value: false, diff --git a/tests/meson.build b/tests/meson.build index ef8c21e..c76a543 100644 --- a/tests/meson.build +++ b/tests/meson.build @@ -128,7 +128,7 @@ endforeach subdir('libfuzzer') # seek stress test binary, depends on dav1d cli tool -if get_option('enable_tools') +if 
(get_option('enable_tools') and get_option('enable_seek_stress')) seek_stress_sources = files('seek_stress.c') seek_stress = executable('seek_stress', seek_stress_sources, rev_target, -- cgit v1.2.3 From cdb2a1a27bb7cc0e21d849d0fbe8a716c7bfb8f7 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 5 Dec 2023 15:02:48 +0100 Subject: Avoid printing full path names in dav1d_argon.bash Only print the paths relative to the argon directory. This avoids excessive terminal line wrapping due to long path names which otherwise interferes with the '\r' usage for progress reporting. --- tests/dav1d_argon.bash | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/dav1d_argon.bash b/tests/dav1d_argon.bash index 0c35663..b0187fd 100755 --- a/tests/dav1d_argon.bash +++ b/tests/dav1d_argon.bash @@ -148,17 +148,17 @@ for i in "${!files[@]}"; do md5=$(<"${md5/%obu/md5}") || error "Error! Can't read md5 ${md5} for file ${f}" md5=${md5/ */} - printf "\033[1K\r[%3d%% %d/%d] Verifying %s" "$(((i+1)*100/${#files[@]}))" "$((i+1))" "${#files[@]}" "$f" + printf "\033[1K\r[%3d%% %d/%d] Verifying %s" "$(((i+1)*100/${#files[@]}))" "$((i+1))" "${#files[@]}" "${f#"$ARGON_DIR"/}" cmd=("$DAV1D" -i "$f" --filmgrain "$FILMGRAIN" --verify "$md5" --cpumask "$CPUMASK" --threads "$THREADS" -q) if [ "$JOBS" -gt 1 ]; then "${cmd[@]}" 2>/dev/null & p=$! pids+=("$p") - declare "file$p=$f" + declare "file$p=${f#"$ARGON_DIR"/}" block_pids else if ! "${cmd[@]}" 2>/dev/null; then - fail "$f" + fail "${f#"$ARGON_DIR"/}" fi fi done -- cgit v1.2.3 From 227c37f74a7f835f14682b0ec385d236b64862f4 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 5 Dec 2023 15:06:27 +0100 Subject: Use a constant length for progress reporting in dav1d_argon.bash --- tests/dav1d_argon.bash | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/dav1d_argon.bash b/tests/dav1d_argon.bash index b0187fd..27a8d61 100755 --- a/tests/dav1d_argon.bash +++ b/tests/dav1d_argon.bash @@ -132,7 +132,8 @@ for d in "${dirs[@]}"; do fi done -if [ ${#files[@]} -eq 0 ]; then +num_files="${#files[@]}" +if [ "$num_files" -eq 0 ]; then error "Error! No files found at ${dirs[*]}" fi @@ -148,7 +149,7 @@ for i in "${!files[@]}"; do md5=$(<"${md5/%obu/md5}") || error "Error! 
Can't read md5 ${md5} for file ${f}" md5=${md5/ */} - printf "\033[1K\r[%3d%% %d/%d] Verifying %s" "$(((i+1)*100/${#files[@]}))" "$((i+1))" "${#files[@]}" "${f#"$ARGON_DIR"/}" + printf '\033[1K\r[%3d%% %*d/%d] Verifying %s' "$(((i+1)*100/num_files))" "${#num_files}" "$((i+1))" "$num_files" "${f#"$ARGON_DIR"/}" cmd=("$DAV1D" -i "$f" --filmgrain "$FILMGRAIN" --verify "$md5" --cpumask "$CPUMASK" --threads "$THREADS" -q) if [ "$JOBS" -gt 1 ]; then "${cmd[@]}" 2>/dev/null & @@ -166,9 +167,9 @@ done wait_all_pids if [ "$failed" -ne 0 ]; then - printf "\033[1K\r%d/%d files \033[1;91mfailed\033[0m to verify" "$failed" "${#files[@]}" + printf "\033[1K\r%d/%d files \033[1;91mfailed\033[0m to verify" "$failed" "$num_files" else - printf "\033[1K\r%d files \033[1;92msuccessfully\033[0m verified" "${#files[@]}" + printf "\033[1K\r%d files \033[1;92msuccessfully\033[0m verified" "$num_files" fi printf " in %dm%ds (%s)\n" "$((SECONDS/60))" "$((SECONDS%60))" "$ver_info" -- cgit v1.2.3 From 16ed8e8b99f2fcfffe016e929d3626e15267ad3e Mon Sep 17 00:00:00 2001 From: Matthias Dressel Date: Tue, 23 Jan 2024 19:05:37 +0100 Subject: meson: Disable seek-stress tests by default --- meson_options.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meson_options.txt b/meson_options.txt index 889bd0a..c04deff 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -27,7 +27,7 @@ option('enable_tests', option('enable_seek_stress', type: 'boolean', - value: true, + value: false, description: 'Build seek_stress test tool') option('enable_docs', -- cgit v1.2.3 From 61251bc9a29049acfb58429d87d8fbb60c20ef82 Mon Sep 17 00:00:00 2001 From: "Nathan E. Egge" Date: Sun, 30 Oct 2022 16:20:08 +0000 Subject: Add initial RISC-V support For now the only CPU flag we check for is RISC-V Vector extension (RVV). 
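
As an illustrative sketch (not part of this commit), the new flag plugs into the existing
runtime CPU-mask machinery, so once RVV code paths land they can be selected or masked out
from the command line; the 'rvv' name follows the cpu_mask_tbl entry added to
dav1d_cli_parse.c below:

    dav1d --cpumask rvv -i input.ivf -o output.y4m   # allow only the RVV flag
    dav1d --cpumask 0   -i input.ivf -o output.y4m   # plain C code paths

Detection itself relies on getauxval(AT_HWCAP) and reports no flags when that interface is
unavailable.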
--- meson.build | 6 ++++++ src/cpu.c | 2 ++ src/cpu.h | 2 ++ src/meson.build | 4 ++++ src/riscv/cpu.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ src/riscv/cpu.h | 37 +++++++++++++++++++++++++++++++++++++ tools/dav1d_cli_parse.c | 4 ++++ 7 files changed, 104 insertions(+) create mode 100644 src/riscv/cpu.c create mode 100644 src/riscv/cpu.h diff --git a/meson.build b/meson.build index 0892a4f..50d2684 100644 --- a/meson.build +++ b/meson.build @@ -67,6 +67,7 @@ is_asm_enabled = (get_option('enable_asm') == true and host_machine.cpu_family() == 'aarch64' or host_machine.cpu_family().startswith('arm') or host_machine.cpu() == 'ppc64le' or + host_machine.cpu_family().startswith('riscv') or host_machine.cpu_family().startswith('loongarch'))) cdata.set10('HAVE_ASM', is_asm_enabled) @@ -234,6 +235,7 @@ endif if (host_machine.cpu_family() == 'aarch64' or host_machine.cpu_family().startswith('arm') or host_machine.cpu() == 'ppc64le' or + host_machine.cpu_family().startswith('riscv') or host_machine.cpu_family().startswith('loongarch')) if cc.has_function('getauxval', prefix : '#include ', args : test_args) cdata.set('HAVE_GETAUXVAL', 1) @@ -381,6 +383,10 @@ endif cdata.set10('ARCH_PPC64LE', host_machine.cpu() == 'ppc64le') +cdata.set10('ARCH_RISCV', host_machine.cpu_family().startswith('riscv')) +cdata.set10('ARCH_RV32', host_machine.cpu_family() == 'riscv32') +cdata.set10('ARCH_RV64', host_machine.cpu_family() == 'riscv64') + cdata.set10('ARCH_LOONGARCH', host_machine.cpu_family().startswith('loongarch')) cdata.set10('ARCH_LOONGARCH32', host_machine.cpu_family() == 'loongarch32') cdata.set10('ARCH_LOONGARCH64', host_machine.cpu_family() == 'loongarch64') diff --git a/src/cpu.c b/src/cpu.c index 5d6fc49..9bb85f1 100644 --- a/src/cpu.c +++ b/src/cpu.c @@ -60,6 +60,8 @@ COLD void dav1d_init_cpu(void) { dav1d_cpu_flags = dav1d_get_cpu_flags_loongarch(); #elif ARCH_PPC64LE dav1d_cpu_flags = dav1d_get_cpu_flags_ppc(); +#elif ARCH_RISCV + dav1d_cpu_flags = dav1d_get_cpu_flags_riscv(); #elif ARCH_X86 dav1d_cpu_flags = dav1d_get_cpu_flags_x86(); #endif diff --git a/src/cpu.h b/src/cpu.h index d42530e..3cbeb22 100644 --- a/src/cpu.h +++ b/src/cpu.h @@ -41,6 +41,8 @@ #include "src/loongarch/cpu.h" #elif ARCH_PPC64LE #include "src/ppc/cpu.h" +#elif ARCH_RISCV +#include "src/riscv/cpu.h" #elif ARCH_X86 #include "src/x86/cpu.h" #endif diff --git a/src/meson.build b/src/meson.build index f443c05..c6e98cd 100644 --- a/src/meson.build +++ b/src/meson.build @@ -235,6 +235,10 @@ if is_asm_enabled 'ppc/cdef_tmpl.c', 'ppc/looprestoration_tmpl.c', ) + elif host_machine.cpu_family().startswith('riscv') + libdav1d_sources += files( + 'riscv/cpu.c', + ) elif host_machine.cpu_family().startswith('loongarch') libdav1d_sources += files( 'loongarch/cpu.c', diff --git a/src/riscv/cpu.c b/src/riscv/cpu.c new file mode 100644 index 0000000..1637710 --- /dev/null +++ b/src/riscv/cpu.c @@ -0,0 +1,49 @@ +/* + * Copyright © 2022, VideoLAN and dav1d authors + * Copyright © 2022, Nathan Egge + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include "common/attributes.h" + +#include "src/riscv/cpu.h" + +#if defined(HAVE_GETAUXVAL) +#include + +#define HWCAP_RVV (1 << ('v' - 'a')) + +#endif + +COLD unsigned dav1d_get_cpu_flags_riscv(void) { + unsigned flags = 0; +#if defined(HAVE_GETAUXVAL) + unsigned long hw_cap = getauxval(AT_HWCAP); + flags |= (hw_cap & HWCAP_RVV) ? DAV1D_RISCV_CPU_FLAG_V : 0; +#endif + + return flags; +} diff --git a/src/riscv/cpu.h b/src/riscv/cpu.h new file mode 100644 index 0000000..8ab7f53 --- /dev/null +++ b/src/riscv/cpu.h @@ -0,0 +1,37 @@ +/* + * Copyright © 2022, VideoLAN and dav1d authors + * Copyright © 2022, Nathan Egge + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef DAV1D_SRC_RISCV_CPU_H +#define DAV1D_SRC_RISCV_CPU_H + +enum CpuFlags { + DAV1D_RISCV_CPU_FLAG_V = 1 << 0, +}; + +unsigned dav1d_get_cpu_flags_riscv(void); + +#endif /* DAV1D_SRC_RISCV_CPU_H */ diff --git a/tools/dav1d_cli_parse.c b/tools/dav1d_cli_parse.c index 5d22e26..5fdbab3 100644 --- a/tools/dav1d_cli_parse.c +++ b/tools/dav1d_cli_parse.c @@ -105,6 +105,8 @@ static const struct option long_opts[] = { #define ALLOWED_CPU_MASKS ", 'lsx' or 'lasx'" #elif ARCH_PPC64LE #define ALLOWED_CPU_MASKS " or 'vsx'" +#elif ARCH_RISCV +#define ALLOWED_CPU_MASKS " or 'rvv'" #elif ARCH_X86 #define ALLOWED_CPU_MASKS \ ", 'sse2', 'ssse3', 'sse41', 'avx2' or 'avx512icl'" @@ -223,6 +225,8 @@ static const EnumParseTable cpu_mask_tbl[] = { { "lasx", DAV1D_LOONGARCH_CPU_FLAG_LASX }, #elif ARCH_PPC64LE { "vsx", DAV1D_PPC_CPU_FLAG_VSX }, +#elif ARCH_RISCV + { "rvv", DAV1D_RISCV_CPU_FLAG_V }, #elif ARCH_X86 { "sse2", X86_CPU_MASK_SSE2 }, { "ssse3", X86_CPU_MASK_SSSE3 }, -- cgit v1.2.3 From 583c2343b70518d1ea723ec4f874f8b20fcca5e7 Mon Sep 17 00:00:00 2001 From: "Nathan E. Egge" Date: Sun, 22 Oct 2023 13:27:39 -0400 Subject: riscv: Add support for TRIM_DSP_FUNCTIONS --- src/cpu.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/cpu.h b/src/cpu.h index 3cbeb22..c9009c7 100644 --- a/src/cpu.h +++ b/src/cpu.h @@ -68,6 +68,10 @@ static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) { #if defined(__VSX__) flags |= DAV1D_PPC_CPU_FLAG_VSX; #endif +#elif ARCH_RISCV +#if defined(__riscv_v) + flags |= DAV1D_RISCV_CPU_FLAG_V; +#endif #elif ARCH_X86 #if defined(__AVX512F__) && defined(__AVX512CD__) && \ defined(__AVX512BW__) && defined(__AVX512DQ__) && \ -- cgit v1.2.3 From 7362fcf653de77ffe882a6702523d275eb320c8b Mon Sep 17 00:00:00 2001 From: "Nathan E. Egge" Date: Sun, 22 Oct 2023 15:38:36 -0400 Subject: riscv: Add support for checkasm --bench --- tests/checkasm/checkasm.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index e323319..cbdafb8 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -197,6 +197,14 @@ static inline uint64_t readtime(void) { return (((uint64_t)tbu) << 32) | (uint64_t)tbl; } #define readtime readtime +#elif ARCH_RISCV +#include +static inline uint64_t clock_gettime_nsec(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC_RAW, &ts); + return ((uint64_t)ts.tv_sec*1000000000u) + (uint64_t)ts.tv_nsec; +} +#define readtime clock_gettime_nsec #elif ARCH_LOONGARCH static inline uint64_t readtime(void) { #if ARCH_LOONGARCH64 -- cgit v1.2.3 From 43ee02a99cce1f221bdb574c9ee484180d71013f Mon Sep 17 00:00:00 2001 From: "Nathan E. 
Egge" Date: Thu, 26 Oct 2023 15:06:49 -0400 Subject: riscv64/itx: Add 4-point 8bpc RVV idtx transform inv_txfm_add_4x4_identity_identity_0_8bpc_c: 534.6 ( 1.00x) inv_txfm_add_4x4_identity_identity_0_8bpc_rvv: 72.2 ( 7.40x) inv_txfm_add_4x4_identity_identity_1_8bpc_c: 534.7 ( 1.00x) inv_txfm_add_4x4_identity_identity_1_8bpc_rvv: 72.3 ( 7.40x) --- src/itx_tmpl.c | 5 ++ src/meson.build | 5 ++ src/riscv/64/itx.S | 122 ++++++++++++++++++++++++++++++++++++++++++++++ src/riscv/asm.S | 108 ++++++++++++++++++++++++++++++++++++++++ src/riscv/itx.h | 54 ++++++++++++++++++++ tests/checkasm/checkasm.c | 2 + 6 files changed, 296 insertions(+) create mode 100644 src/riscv/64/itx.S create mode 100644 src/riscv/asm.S create mode 100644 src/riscv/itx.h diff --git a/src/itx_tmpl.c b/src/itx_tmpl.c index 9ecf8bf..8ff245a 100644 --- a/src/itx_tmpl.c +++ b/src/itx_tmpl.c @@ -185,6 +185,8 @@ static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride, #include "src/arm/itx.h" #elif ARCH_LOONGARCH64 #include "src/loongarch/itx.h" +#elif ARCH_RISCV +#include "src/riscv/itx.h" #elif ARCH_X86 #include "src/x86/itx.h" #endif @@ -262,6 +264,9 @@ COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) { #if ARCH_LOONGARCH64 itx_dsp_init_loongarch(c, bpc); #endif +#if ARCH_RISCV + itx_dsp_init_riscv(c, bpc); +#endif #if ARCH_X86 itx_dsp_init_x86(c, bpc); #endif diff --git a/src/meson.build b/src/meson.build index c6e98cd..d1ea408 100644 --- a/src/meson.build +++ b/src/meson.build @@ -239,6 +239,11 @@ if is_asm_enabled libdav1d_sources += files( 'riscv/cpu.c', ) + if host_machine.cpu_family() == 'riscv64' + libdav1d_sources += files( + 'riscv/64/itx.S', + ) + endif elif host_machine.cpu_family().startswith('loongarch') libdav1d_sources += files( 'loongarch/cpu.c', diff --git a/src/riscv/64/itx.S b/src/riscv/64/itx.S new file mode 100644 index 0000000..e7504a5 --- /dev/null +++ b/src/riscv/64/itx.S @@ -0,0 +1,122 @@ +/****************************************************************************** + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2023, Nathan Egge + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ + +#include "src/riscv/asm.S" + +function inv_txfm_add_4x4_rvv, export=1 + csrw vxrm, zero + + vsetivli zero, 4, e16, m1, ta, ma + vle16.v v0, (a2) + addi t0, a2, 8 + vle16.v v1, (t0) + addi t0, t0, 8 + vle16.v v2, (t0) + addi t0, t0, 8 + vle16.v v3, (t0) + + jalr t0, a4 + + vsseg4e16.v v0, (a2) + vle16.v v0, (a2) + addi t0, a2, 8 + vle16.v v1, (t0) + addi t0, t0, 8 + vle16.v v2, (t0) + addi t0, t0, 8 + vle16.v v3, (t0) + + jalr t0, a5 + + vssra.vi v0, v0, 4 + vssra.vi v1, v1, 4 + vssra.vi v2, v2, 4 + vssra.vi v3, v3, 4 + + vsetvli zero, zero, e8, mf2, ta, ma + vle8.v v4, (a0) + add t0, a0, a1 + vle8.v v5, (t0) + add t0, t0, a1 + vle8.v v6, (t0) + add t0, t0, a1 + vle8.v v7, (t0) + + vwaddu.wv v0, v0, v4 + vwaddu.wv v1, v1, v5 + vwaddu.wv v2, v2, v6 + vwaddu.wv v3, v3, v7 + + vsetvli t0, zero, e16, m4 + vmax.vx v0, v0, x0 + + vsetivli zero, 4, e8, mf2, ta, ma + + vnclipu.wi v4, v0, 0 + vnclipu.wi v5, v1, 0 + vnclipu.wi v6, v2, 0 + vnclipu.wi v7, v3, 0 + + vse8.v v4, (a0) + add a0, a0, a1 + vse8.v v5, (a0) + add a0, a0, a1 + vse8.v v6, (a0) + add a0, a0, a1 + vse8.v v7, (a0) + + vsetivli zero, 16, e16, m2, ta, ma + vmv.v.x v0, zero + vse16.v v0, (a2) + + ret +endfunc + +function inv_identity_e16_x4_rvv, export=1 + li t1, (5793-4096)*8 + vsmul.vx v4, v0, t1 + vsmul.vx v5, v1, t1 + vsmul.vx v6, v2, t1 + vsmul.vx v7, v3, t1 + + vsadd.vv v0, v0, v4 + vsadd.vv v1, v1, v5 + vsadd.vv v2, v2, v6 + vsadd.vv v3, v3, v7 + + jr t0 +endfunc + +.macro def_fn_4x4 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_rvv, export=1 + la a4, inv_\txfm1\()_e16_x4_rvv + la a5, inv_\txfm2\()_e16_x4_rvv + j inv_txfm_add_4x4_rvv +endfunc +.endm + +def_fn_4x4 identity, identity diff --git a/src/riscv/asm.S b/src/riscv/asm.S new file mode 100644 index 0000000..81837eb --- /dev/null +++ b/src/riscv/asm.S @@ -0,0 +1,108 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2023, Nathan Egge + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef DAV1D_SRC_RISCV_ASM_S +#define DAV1D_SRC_RISCV_ASM_S + +#include "config.h" + +.option arch,+v + +#if !defined(PIC) +#if defined(__PIC__) +#define PIC __PIC__ +#elif defined(__pic__) +#define PIC __pic__ +#endif +#endif + +#ifndef PRIVATE_PREFIX +#define PRIVATE_PREFIX dav1d_ +#endif + +#define PASTE(a,b) a ## b +#define CONCAT(a,b) PASTE(a,b) + +#ifdef PREFIX +#define EXTERN CONCAT(_,PRIVATE_PREFIX) +#else +#define EXTERN PRIVATE_PREFIX +#endif + +.macro function name, export=0 + .macro endfunc +#ifdef __ELF__ + .size \name, . - \name +#endif + .purgem endfunc + .endm + .text + .if \export + .global EXTERN\name +#ifdef __ELF__ + .type EXTERN\name, %function + .hidden EXTERN\name +#elif defined(__MACH__) + .private_extern EXTERN\name +#endif +EXTERN\name: + .else +#ifdef __ELF__ + .type \name, %function +#endif + .endif +\name: +.endm + +.macro const name, export=0, align=2 + .macro endconst +#ifdef __ELF__ + .size \name, . - \name +#endif + .purgem endconst + .endm +#if defined(_WIN32) + .section .rdata +#elif !defined(__MACH__) + .section .rodata +#else + .const_data +#endif + .align \align + .if \export + .global EXTERN\name +#ifdef __ELF__ + .hidden EXTERN\name +#elif defined(__MACH__) + .private_extern EXTERN\name +#endif +EXTERN\name: + .endif +\name: +.endm + +#endif /* DAV1D_SRC_RISCV_ASM_S */ diff --git a/src/riscv/itx.h b/src/riscv/itx.h new file mode 100644 index 0000000..3f83905 --- /dev/null +++ b/src/riscv/itx.h @@ -0,0 +1,54 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2023, Nathan Egge + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/cpu.h" +#include "src/itx.h" + +#define decl_itx2_fns(w, h, opt) \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt)) + +#define decl_itx_fns(ext) \ +decl_itx2_fns( 4, 4, ext) + +decl_itx_fns(rvv); + +static ALWAYS_INLINE void itx_dsp_init_riscv(Dav1dInvTxfmDSPContext *const c, int const bpc) { +#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \ + c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \ + BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext) + +#define assign_itx2_fn(pfx, w, h, ext) \ + assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext) + + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_RISCV_CPU_FLAG_V)) return; + +#if BITDEPTH == 8 + assign_itx2_fn( , 4, 4, rvv); +#endif +} diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index d4d51bb..48cf255 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -107,6 +107,8 @@ static const struct { #elif ARCH_LOONGARCH { "LSX", "lsx", DAV1D_LOONGARCH_CPU_FLAG_LSX }, { "LASX", "lasx", DAV1D_LOONGARCH_CPU_FLAG_LASX }, +#elif ARCH_RISCV + { "RVV", "rvv", DAV1D_RISCV_CPU_FLAG_V }, #endif { 0 } }; -- cgit v1.2.3 From ebbddd48e37f893276f137686b918b808f2d1e6a Mon Sep 17 00:00:00 2001 From: Matthias Dressel Date: Wed, 12 Apr 2023 23:24:20 +0200 Subject: CI: Add riscv64 tests --- .gitlab-ci.yml | 28 ++++++++++++++++++++++++++++ package/crossfiles/riscv64-linux.meson | 12 ++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 package/crossfiles/riscv64-linux.meson diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2cb4c37..d15de9d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -418,6 +418,18 @@ build-debian-wasm: matrix: - CROSSFILE: [wasm32, wasm64] +build-debian-riscv64: + extends: .debian-amd64-common + variables: + QEMU_CPU: rv64,v=true,vext_spec=v1.0,vlen=256,elen=64 + script: + - meson setup build --buildtype release + -Dtrim_dsp=false + --werror + --cross-file package/crossfiles/riscv64-linux.meson + - ninja -C build + - cd build && meson test -v + build-debian-loongarch64: extends: .debian-amd64-common variables: @@ -699,6 +711,22 @@ test-debian-ppc64le: - ninja -C build - cd build && time meson test -v +test-debian-riscv64: + extends: + - .debian-amd64-common + - .test-common + needs: ["build-debian-riscv64"] + variables: + QEMU_CPU: rv64,v=true,vext_spec=v1.0,vlen=256,elen=64 + script: + - meson setup build --buildtype release + -Dtestdata_tests=true + -Dlogging=false + -Dtrim_dsp=false + --cross-file package/crossfiles/riscv64-linux.meson + - ninja -C build + - cd build && time meson test -v --timeout-multiplier 2 + test-debian-armv7-clang-5: extends: - .debian-armv7-common diff --git a/package/crossfiles/riscv64-linux.meson b/package/crossfiles/riscv64-linux.meson new file mode 100644 index 0000000..e3eda5e --- /dev/null +++ b/package/crossfiles/riscv64-linux.meson @@ -0,0 +1,12 @@ +[binaries] +c = 'riscv64-linux-gnu-gcc' +cpp = 'riscv64-linux-gnu-g++' +ar = 'riscv64-linux-gnu-ar' +strip = 'riscv64-linux-gnu-strip' +exe_wrapper = 'qemu-riscv64' + +[host_machine] +system = 'linux' +cpu_family = 'riscv64' +cpu = 'riscv64' +endian = 'little' -- cgit v1.2.3 From 10082586b00d8556070a92b67c987ecc912d3367 Mon Sep 17 00:00:00 2001 From: "Nathan E. 
Egge" Date: Fri, 27 Oct 2023 09:48:39 -0400 Subject: riscv: Set .option arch by passing ext to function --- src/riscv/64/itx.S | 4 ++-- src/riscv/asm.S | 9 ++++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/riscv/64/itx.S b/src/riscv/64/itx.S index e7504a5..114a5e0 100644 --- a/src/riscv/64/itx.S +++ b/src/riscv/64/itx.S @@ -27,7 +27,7 @@ #include "src/riscv/asm.S" -function inv_txfm_add_4x4_rvv, export=1 +function inv_txfm_add_4x4_rvv, export=1, ext=v csrw vxrm, zero vsetivli zero, 4, e16, m1, ta, ma @@ -96,7 +96,7 @@ function inv_txfm_add_4x4_rvv, export=1 ret endfunc -function inv_identity_e16_x4_rvv, export=1 +function inv_identity_e16_x4_rvv, export=1, ext=v li t1, (5793-4096)*8 vsmul.vx v4, v0, t1 vsmul.vx v5, v1, t1 diff --git a/src/riscv/asm.S b/src/riscv/asm.S index 81837eb..efd614e 100644 --- a/src/riscv/asm.S +++ b/src/riscv/asm.S @@ -30,8 +30,6 @@ #include "config.h" -.option arch,+v - #if !defined(PIC) #if defined(__PIC__) #define PIC __PIC__ @@ -53,14 +51,19 @@ #define EXTERN PRIVATE_PREFIX #endif -.macro function name, export=0 +.macro function name, export=0, ext= .macro endfunc #ifdef __ELF__ .size \name, . - \name #endif + .option pop .purgem endfunc .endm .text + .option push + .ifnb \ext + .option arch, +\ext + .endif .if \export .global EXTERN\name #ifdef __ELF__ -- cgit v1.2.3 From 8ee8b9eba1712efe1e3c0cb6071c141fb690cf15 Mon Sep 17 00:00:00 2001 From: "Nathan E. Egge" Date: Sun, 29 Oct 2023 03:15:41 -0400 Subject: checkasm: Implement riscv64 checked_call() --- src/riscv/asm.S | 15 +++ tests/checkasm/checkasm.h | 11 ++ tests/checkasm/riscv/checkasm_64.S | 223 +++++++++++++++++++++++++++++++++++++ tests/meson.build | 2 + 4 files changed, 251 insertions(+) create mode 100644 tests/checkasm/riscv/checkasm_64.S diff --git a/src/riscv/asm.S b/src/riscv/asm.S index efd614e..2435170 100644 --- a/src/riscv/asm.S +++ b/src/riscv/asm.S @@ -108,4 +108,19 @@ EXTERN\name: \name: .endm +.macro thread_local name, align=3, quads=1 + .macro end_thread_local + .size \name, . - \name + .purgem end_thread_local + .endm + .section .tbss, "waT" + .align \align + .hidden \name +\name: + .rept \quads + .quad 0 + .endr + end_thread_local +.endm + #endif /* DAV1D_SRC_RISCV_ASM_S */ diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index cbdafb8..eeda5df 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -326,6 +326,17 @@ void checkasm_stack_clobber(uint64_t clobber, ...); checked_call(func_new, 0, 0, 0, 0, 0, 0, 0, __VA_ARGS__,\ 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0));\ checkasm_set_signal_handler_state(0) +#elif ARCH_RISCV +#define declare_new(ret, ...)\ + ret (*checked_call)(void *, int, int, int, int, int, int, int,\ + __VA_ARGS__, int, int, int, int, int, int, int, int,\ + int, int, int, int, int, int, int) =\ + (void *)checkasm_checked_call; +#define call_new(...)\ + (checkasm_set_signal_handler_state(1),\ + checked_call(func_new, 0, 0, 0, 0, 0, 0, 0, __VA_ARGS__,\ + 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0));\ + checkasm_set_signal_handler_state(0) #else #define declare_new(ret, ...) #define call_new(...)\ diff --git a/tests/checkasm/riscv/checkasm_64.S b/tests/checkasm/riscv/checkasm_64.S new file mode 100644 index 0000000..5d35d71 --- /dev/null +++ b/tests/checkasm/riscv/checkasm_64.S @@ -0,0 +1,223 @@ +/****************************************************************************** + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2023, Nathan Egge + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +#define PRIVATE_PREFIX checkasm_ + +#include "src/riscv/asm.S" + +// max number of args used by any asm function. +#define MAX_ARGS 15 + +// + 16 for stack canary reference +#define ARG_STACK ((8*(MAX_ARGS - 8) + 15) & ~15 + 16) + +const register_init, align=4 + .quad 0x68909d060f4a7fdd + .quad 0x924f739e310218a1 + .quad 0xb988385a8254174c + .quad 0x4c1110430bf09fd7 + .quad 0x2b310edf6a5d7ecf + .quad 0xda8112e98ddbb559 + .quad 0x6da5854aa2f84b62 + .quad 0x72b761199e9b1f38 + .quad 0x13f27aa74ae5dcdf + .quad 0x36a6c12a7380e827 + .quad 0x5c452889aefc8548 + .quad 0x6a9ea1ddb236235f + .quad 0x0449854bdfc94b1e + .quad 0x4f849b7076a156f5 + .quad 0x1baa4275e734930e + .quad 0x77df3503ba3e073d + .quad 0x6060e073705a4bf2 + .quad 0xa7b482508471e44b + .quad 0xd296a3158d6da2b9 + .quad 0x1c0ed711a93d970b + .quad 0x9359537fdd79569d + .quad 0x2b1dc95c1e232d62 + .quad 0xab06cd578e2bb5a0 + .quad 0x4100b4987a0af30f + .quad 0x2523e36f9bb1e36f + .quad 0xfb0b815930c6d25c + .quad 0x89acc810c2902fcf + .quad 0xa65854b4c2b381f1 + .quad 0x78150d69a1accedf + .quad 0x057e24868e022de1 + .quad 0x88f6e79ed4b8d362 + .quad 0x1f4a420e262c9035 +endconst + +const error_message_register +error_message_rsvd: + .asciz "unallocatable register clobbered" +error_message_sreg: + .asciz "failed to preserve callee-saved integer register" +error_message_fsreg: + .asciz "failed to preserve callee-saved floating-point register" +endconst + +thread_local saved_regs, quads=29 # 5 + 12 + 12 + +function checked_call, export=1, ext=v + /* Save the function ptr, RA, SP, unallocatable and callee-saved registers */ + la.tls.ie t0, saved_regs + add t0, tp, t0 + sd a0, (t0) + sd ra, 8(t0) + sd sp, 16(t0) + sd gp, 24(t0) + sd tp, 32(t0) +.irp n, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 + sd s\n, 40 + 16*\n(t0) +#ifdef __riscv_float_abi_double + fsd fs\n, 48 + 16*\n(t0) +#endif +.endr + + /* Check for vector extension */ + call dav1d_get_cpu_flags_riscv + and a0, a0, 1 # DAV1D_RISCV_CPU_FLAG_RVV + beqz a0, 0f + + /* Clobber vector configuration */ + vsetvli t0, zero, e32, m8, ta, ma + lla t0, register_init + ld t0, (t0) +.irp n, 0, 8, 16, 24 + vmv.v.x v0, t0 +.endr + li t0, -1 << 
31 + vsetvl zero, zero, t0 + csrwi vxrm, 3 + csrwi vxsat, 1 + +0: + /* Load the register arguments */ +.irp n, 0, 1, 2, 3, 4, 5, 6, 7 + ld a\n, 8*\n(sp) +.endr + + /* Load the stack arguments */ +.irp n, 8, 9, 10, 11, 12, 13, 14, 15 + ld t0, 8*\n(sp) + sd t0, 8*(\n - 8) - ARG_STACK(sp) +.endr + addi sp, sp, -ARG_STACK + + /* Clobber the stack space right below SP */ + lla t0, register_init + ld t1, (t0) +.rept 16 + addi sp, sp, -16 + sd t1, (sp) + sd t1, 8(sp) +.endr + addi sp, sp, 16*16 + + /* Clobber the callee-saved and temporary registers */ +.irp n, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 +.if (\n > 0 && \n < 7) + ld t\n, 16*\n(t0) +.endif + ld s\n, 8 + 8*\n(t0) +#ifdef __riscv_float_abi_double + fld ft\n, 16 + 16*\n(t0) + fld fs\n, 24 + 8*\n(t0) +#endif +.endr + + /* Call the checked function */ + la.tls.ie t0, saved_regs + add t0, tp, t0 + ld t0, (t0) + jalr t0 + + /* Check the value of callee-saved registers */ + lla t0, register_init +.irp n, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 + ld t1, 8 + 8*\n(t0) + bne t1, s\n, 2f +#ifdef __riscv_float_abi_double + ld t1, 24 + 8*\n(t0) + fmv.x.d t2, fs\n + bne t1, t2, 3f +#endif +.endr + + /* Check unallocatable register values */ + la.tls.ie t0, saved_regs + add t0, tp, t0 + ld t1, 16(t0) + addi t1, t1, -ARG_STACK + bne t1, sp, 4f + ld t1, 24(t0) + bne t1, gp, 4f + ld t1, 32(t0) + bne t1, tp, 4f + +1: + /* Restore RA, SP and callee-saved registers from thread local storage */ + la.tls.ie t0, saved_regs + add t0, tp, t0 + ld ra, 8(t0) + ld sp, 16(t0) +.irp n, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 + ld s\n, 40 + 16*\n(t0) +#ifdef __riscv_float_abi_double + fld fs\n, 48 + 16*\n(t0) +#endif +.endr + ret + +2: + lla a0, error_message_sreg +#ifdef PREFIX + call _checkasm_fail_func +#else + call checkasm_fail_func +#endif + j 1b + +#ifdef __riscv_float_abi_double +3: + lla a0, error_message_fsreg +#ifdef PREFIX + call _checkasm_fail_func +#else + call checkasm_fail_func +#endif + j 1b +#endif + +4: + lla a0, error_message_rsvd +#ifdef PREFIX + call _checkasm_fail_func +#else + call checkasm_fail_func +#endif + j 1b +endfunc diff --git a/tests/meson.build b/tests/meson.build index c76a543..11db0a5 100644 --- a/tests/meson.build +++ b/tests/meson.build @@ -69,6 +69,8 @@ if is_asm_enabled checkasm_asm_sources += files('checkasm/arm/checkasm_64.S') elif host_machine.cpu_family().startswith('arm') checkasm_asm_sources += files('checkasm/arm/checkasm_32.S') + elif host_machine.cpu_family() == 'riscv64' + checkasm_asm_sources += files('checkasm/riscv/checkasm_64.S') elif host_machine.cpu_family().startswith('x86') checkasm_asm_objs += nasm_gen.process(files('checkasm/x86/checkasm.asm')) endif -- cgit v1.2.3 From e7660b8b24dd427d4a278480cac235624fcaef47 Mon Sep 17 00:00:00 2001 From: "Nathan E. 
Egge" Date: Sun, 29 Oct 2023 04:34:15 -0400 Subject: checkasm: riscv64: Add stack canary test --- tests/checkasm/riscv/checkasm_64.S | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/checkasm/riscv/checkasm_64.S b/tests/checkasm/riscv/checkasm_64.S index 5d35d71..fbad5f5 100644 --- a/tests/checkasm/riscv/checkasm_64.S +++ b/tests/checkasm/riscv/checkasm_64.S @@ -77,6 +77,8 @@ error_message_sreg: .asciz "failed to preserve callee-saved integer register" error_message_fsreg: .asciz "failed to preserve callee-saved floating-point register" +error_message_stack: + .asciz "stack clobbered" endconst thread_local saved_regs, quads=29 # 5 + 12 + 12 @@ -125,7 +127,15 @@ function checked_call, export=1, ext=v ld t0, 8*\n(sp) sd t0, 8*(\n - 8) - ARG_STACK(sp) .endr + + /* Setup the stack canary */ + ld t0, MAX_ARGS*8(sp) addi sp, sp, -ARG_STACK + slli t0, t0, 3 + add t0, t0, sp + ld t0, (t0) + not t0, t0 + sd t0, ARG_STACK - 8(sp) /* Clobber the stack space right below SP */ lla t0, register_init @@ -178,6 +188,15 @@ function checked_call, export=1, ext=v ld t1, 32(t0) bne t1, tp, 4f + /* Check the stack canary */ + ld t0, ARG_STACK + MAX_ARGS*8(sp) + slli t0, t0, 3 + add t0, t0, sp + ld t0, (t0) + not t0, t0 + ld t1, ARG_STACK - 8(sp) + bne t0, t1, 5f + 1: /* Restore RA, SP and callee-saved registers from thread local storage */ la.tls.ie t0, saved_regs @@ -220,4 +239,13 @@ function checked_call, export=1, ext=v call checkasm_fail_func #endif j 1b + +5: + lla a0, error_message_stack +#ifdef PREFIX + call _checkasm_fail_func +#else + call checkasm_fail_func +#endif + j 1b endfunc -- cgit v1.2.3 From e67f630613445660e65abd86d80ea8292961a971 Mon Sep 17 00:00:00 2001 From: "Nathan E. Egge" Date: Sun, 29 Oct 2023 10:10:29 -0400 Subject: checkasm: riscv64: Print modified register names --- tests/checkasm/riscv/checkasm_64.S | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/checkasm/riscv/checkasm_64.S b/tests/checkasm/riscv/checkasm_64.S index fbad5f5..0d02e5f 100644 --- a/tests/checkasm/riscv/checkasm_64.S +++ b/tests/checkasm/riscv/checkasm_64.S @@ -74,9 +74,9 @@ const error_message_register error_message_rsvd: .asciz "unallocatable register clobbered" error_message_sreg: - .asciz "failed to preserve callee-saved integer register" + .asciz "callee-saved integer register s%i modified" error_message_fsreg: - .asciz "failed to preserve callee-saved floating-point register" + .asciz "callee-saved floating-point register fs%i modified" error_message_stack: .asciz "stack clobbered" endconst @@ -169,6 +169,7 @@ function checked_call, export=1, ext=v lla t0, register_init .irp n, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 ld t1, 8 + 8*\n(t0) + li a1, \n bne t1, s\n, 2f #ifdef __riscv_float_abi_double ld t1, 24 + 8*\n(t0) -- cgit v1.2.3 From 1042008eb5ebfa99c31943b3fa8cd6e41d1d1542 Mon Sep 17 00:00:00 2001 From: "Nathan E. 
Egge" Date: Tue, 31 Oct 2023 06:48:25 -0400 Subject: CI: riscv64: Use matrix test to vary rvv VLEN --- .gitlab-ci.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d15de9d..b33898a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -716,8 +716,6 @@ test-debian-riscv64: - .debian-amd64-common - .test-common needs: ["build-debian-riscv64"] - variables: - QEMU_CPU: rv64,v=true,vext_spec=v1.0,vlen=256,elen=64 script: - meson setup build --buildtype release -Dtestdata_tests=true @@ -726,6 +724,12 @@ test-debian-riscv64: --cross-file package/crossfiles/riscv64-linux.meson - ninja -C build - cd build && time meson test -v --timeout-multiplier 2 + parallel: + matrix: + - QEMU_CPU: [ "rv64,v=true,vext_spec=v1.0,vlen=128,elen=64", + "rv64,v=true,vext_spec=v1.0,vlen=256,elen=64", + "rv64,v=true,vext_spec=v1.0,vlen=512,elen=64", + "rv64,v=true,vext_spec=v1.0,vlen=1024,elen=64" ] test-debian-armv7-clang-5: extends: -- cgit v1.2.3 From a7edb029872a382f8609d80417cdaf11d8ba6334 Mon Sep 17 00:00:00 2001 From: Matthias Dressel Date: Sat, 11 Nov 2023 00:15:59 +0100 Subject: CI: Use cross-compiling libc instead of multi-arch See https://code.videolan.org/videolan/docker-images/-/merge_requests/272 for more context. --- .gitlab-ci.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b33898a..702f284 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -422,6 +422,7 @@ build-debian-riscv64: extends: .debian-amd64-common variables: QEMU_CPU: rv64,v=true,vext_spec=v1.0,vlen=256,elen=64 + QEMU_LD_PREFIX: /usr/riscv64-linux-gnu/ script: - meson setup build --buildtype release -Dtrim_dsp=false @@ -724,6 +725,8 @@ test-debian-riscv64: --cross-file package/crossfiles/riscv64-linux.meson - ninja -C build - cd build && time meson test -v --timeout-multiplier 2 + variables: + QEMU_LD_PREFIX: /usr/riscv64-linux-gnu/ parallel: matrix: - QEMU_CPU: [ "rv64,v=true,vext_spec=v1.0,vlen=128,elen=64", -- cgit v1.2.3 From 80d78c80cd6f55cd2bd84492abab4501148e0b03 Mon Sep 17 00:00:00 2001 From: "Nathan E. 
Egge" Date: Sun, 28 Jan 2024 05:21:49 -0500 Subject: riscv64/itx: Add 8-point 8bpc RVV idtx transform inv_txfm_add_8x8_identity_identity_0_8bpc_c: 1344.6 ( 1.00x) inv_txfm_add_8x8_identity_identity_0_8bpc_rvv: 144.4 ( 9.31x) inv_txfm_add_8x8_identity_identity_1_8bpc_c: 1344.4 ( 1.00x) inv_txfm_add_8x8_identity_identity_1_8bpc_rvv: 144.4 ( 9.31x) --- src/riscv/64/itx.S | 166 +++++++++++++++++++++++++++++++++++++++++++++++++++++ src/riscv/itx.h | 4 +- 2 files changed, 169 insertions(+), 1 deletion(-) diff --git a/src/riscv/64/itx.S b/src/riscv/64/itx.S index 114a5e0..8ba7ad3 100644 --- a/src/riscv/64/itx.S +++ b/src/riscv/64/itx.S @@ -120,3 +120,169 @@ endfunc .endm def_fn_4x4 identity, identity + +.macro def_fn_8x8_base variant +function inv_txfm_\variant\()add_8x8_rvv, export=1, ext=v + csrw vxrm, zero + + vsetivli zero, 8, e16, m1, ta, ma + vle16.v v0, (a2) + addi t0, a2, 16 + vle16.v v1, (t0) + addi t0, t0, 16 + vle16.v v2, (t0) + addi t0, t0, 16 + vle16.v v3, (t0) + addi t0, t0, 16 + vle16.v v4, (t0) + addi t0, t0, 16 + vle16.v v5, (t0) + addi t0, t0, 16 + vle16.v v6, (t0) + addi t0, t0, 16 + vle16.v v7, (t0) + +.ifc \variant, identity_ + // The identity vsadd.vv and downshift vssra.vi 1 cancel out +.else + jalr t0, a4 + + vssra.vi v0, v0, 1 + vssra.vi v1, v1, 1 + vssra.vi v2, v2, 1 + vssra.vi v3, v3, 1 + vssra.vi v4, v4, 1 + vssra.vi v5, v5, 1 + vssra.vi v6, v6, 1 + vssra.vi v7, v7, 1 +.endif + + vsseg8e16.v v0, (a2) + vle16.v v0, (a2) + addi t0, a2, 16 + vle16.v v1, (t0) + addi t0, t0, 16 + vle16.v v2, (t0) + addi t0, t0, 16 + vle16.v v3, (t0) + addi t0, t0, 16 + vle16.v v4, (t0) + addi t0, t0, 16 + vle16.v v5, (t0) + addi t0, t0, 16 + vle16.v v6, (t0) + addi t0, t0, 16 + vle16.v v7, (t0) + + jalr t0, a5 + + vssra.vi v0, v0, 4 + vssra.vi v1, v1, 4 + vssra.vi v2, v2, 4 + vssra.vi v3, v3, 4 + vssra.vi v4, v4, 4 + vssra.vi v5, v5, 4 + vssra.vi v6, v6, 4 + vssra.vi v7, v7, 4 + + vsetvli zero, zero, e8, mf2, ta, ma + vle8.v v8, (a0) + add t0, a0, a1 + vle8.v v9, (t0) + add t0, t0, a1 + vle8.v v10, (t0) + add t0, t0, a1 + vle8.v v11, (t0) + add t0, t0, a1 + vle8.v v12, (t0) + add t0, t0, a1 + vle8.v v13, (t0) + add t0, t0, a1 + vle8.v v14, (t0) + add t0, t0, a1 + vle8.v v15, (t0) + + vwaddu.wv v0, v0, v8 + vwaddu.wv v1, v1, v9 + vwaddu.wv v2, v2, v10 + vwaddu.wv v3, v3, v11 + vwaddu.wv v4, v4, v12 + vwaddu.wv v5, v5, v13 + vwaddu.wv v6, v6, v14 + vwaddu.wv v7, v7, v15 + + vsetvli zero, zero, e16, m1 + vmax.vx v0, v0, zero + vmax.vx v1, v1, zero + vmax.vx v2, v2, zero + vmax.vx v3, v3, zero + vmax.vx v4, v4, zero + vmax.vx v5, v5, zero + vmax.vx v6, v6, zero + vmax.vx v7, v7, zero + + vsetvli zero, zero, e8, mf2, ta, ma + + vnclipu.wi v8, v0, 0 + vnclipu.wi v9, v1, 0 + vnclipu.wi v10, v2, 0 + vnclipu.wi v11, v3, 0 + vnclipu.wi v12, v4, 0 + vnclipu.wi v13, v5, 0 + vnclipu.wi v14, v6, 0 + vnclipu.wi v15, v7, 0 + + vse8.v v8, (a0) + add a0, a0, a1 + vse8.v v9, (a0) + add a0, a0, a1 + vse8.v v10, (a0) + add a0, a0, a1 + vse8.v v11, (a0) + add a0, a0, a1 + vse8.v v12, (a0) + add a0, a0, a1 + vse8.v v13, (a0) + add a0, a0, a1 + vse8.v v14, (a0) + add a0, a0, a1 + vse8.v v15, (a0) + + li t1, 64 + vsetvli zero, t1, e16, m8, ta, ma + vmv.v.x v0, zero + vse16.v v0, (a2) + + ret +endfunc +.endm + +def_fn_8x8_base +def_fn_8x8_base identity_ + +function inv_identity_e16_x8_rvv, export=1, ext=v + vsadd.vv v0, v0, v0 + vsadd.vv v1, v1, v1 + vsadd.vv v2, v2, v2 + vsadd.vv v3, v3, v3 + vsadd.vv v4, v4, v4 + vsadd.vv v5, v5, v5 + vsadd.vv v6, v6, v6 + vsadd.vv v7, v7, v7 + + jr t0 +endfunc + +.macro 
def_fn_8x8 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_rvv, export=1 + la a5, inv_\txfm2\()_e16_x8_rvv +.ifc \txfm1, identity + j inv_txfm_identity_add_8x8_rvv +.else + la a4, inv_\txfm1\()_e16_x8_rvv + j inv_txfm_add_8x8_rvv +.endif +endfunc +.endm + +def_fn_8x8 identity, identity diff --git a/src/riscv/itx.h b/src/riscv/itx.h index 3f83905..6bdd9b7 100644 --- a/src/riscv/itx.h +++ b/src/riscv/itx.h @@ -32,7 +32,8 @@ decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt)) #define decl_itx_fns(ext) \ -decl_itx2_fns( 4, 4, ext) +decl_itx2_fns( 4, 4, ext); \ +decl_itx2_fns( 8, 8, ext) decl_itx_fns(rvv); @@ -50,5 +51,6 @@ static ALWAYS_INLINE void itx_dsp_init_riscv(Dav1dInvTxfmDSPContext *const c, in #if BITDEPTH == 8 assign_itx2_fn( , 4, 4, rvv); + assign_itx2_fn( , 8, 8, rvv); #endif } -- cgit v1.2.3 From 04d4d50bcf26025a55ea50b6943202457ea7962c Mon Sep 17 00:00:00 2001 From: "Nathan E. Egge" Date: Sun, 28 Jan 2024 08:38:32 -0500 Subject: riscv64/itx: Add 4-point 8bpc RVV dct transform inv_txfm_add_4x4_dct_dct_0_8bpc_c: 154.6 ( 1.00x) inv_txfm_add_4x4_dct_dct_0_8bpc_rvv: 121.3 ( 1.27x) inv_txfm_add_4x4_dct_dct_1_8bpc_c: 619.5 ( 1.00x) inv_txfm_add_4x4_dct_dct_1_8bpc_rvv: 121.3 ( 5.11x) --- src/riscv/64/itx.S | 38 ++++++++++++++++++++++++++++++++++++++ src/riscv/itx.h | 5 +++++ 2 files changed, 43 insertions(+) diff --git a/src/riscv/64/itx.S b/src/riscv/64/itx.S index 8ba7ad3..3c6ad5e 100644 --- a/src/riscv/64/itx.S +++ b/src/riscv/64/itx.S @@ -111,6 +111,43 @@ function inv_identity_e16_x4_rvv, export=1, ext=v jr t0 endfunc +function inv_dct_e16_x4_rvv, export=1, ext=v + li t1, 2896 + li t2, 1567 + li t3, 3784 + + vwmul.vx v8, v0, t1 + vwmul.vx v10, v0, t1 + vwmacc.vx v8, t1, v2 + neg t1, t1 + vwmacc.vx v10, t1, v2 + + vwmul.vx v12, v1, t3 + neg t3, t3 + vwmul.vx v14, v1, t2 + vwmacc.vx v12, t2, v3 + vwmacc.vx v14, t3, v3 + + li t1, 2048 + + vwadd.wx v8, v8, t1 + vwadd.wx v10, v10, t1 + vwadd.wx v12, v12, t1 + vwadd.wx v14, v14, t1 + + vnsra.wi v8, v8, 12 + vnsra.wi v10, v10, 12 + vnsra.wi v12, v12, 12 + vnsra.wi v14, v14, 12 + + vsadd.vv v0, v8, v12 + vsadd.vv v1, v10, v14 + vssub.vv v2, v10, v14 + vssub.vv v3, v8, v12 + + jr t0 +endfunc + .macro def_fn_4x4 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_rvv, export=1 la a4, inv_\txfm1\()_e16_x4_rvv @@ -119,6 +156,7 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_rvv, export=1 endfunc .endm +def_fn_4x4 dct, dct def_fn_4x4 identity, identity .macro def_fn_8x8_base variant diff --git a/src/riscv/itx.h b/src/riscv/itx.h index 6bdd9b7..821b54c 100644 --- a/src/riscv/itx.h +++ b/src/riscv/itx.h @@ -29,6 +29,7 @@ #include "src/itx.h" #define decl_itx2_fns(w, h, opt) \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \ decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt)) #define decl_itx_fns(ext) \ @@ -42,6 +43,9 @@ static ALWAYS_INLINE void itx_dsp_init_riscv(Dav1dInvTxfmDSPContext *const c, in c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \ BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext) +#define assign_itx1_fn(pfx, w, h, ext) \ + assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext) + #define assign_itx2_fn(pfx, w, h, ext) \ assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext) @@ -50,6 +54,7 @@ static ALWAYS_INLINE void itx_dsp_init_riscv(Dav1dInvTxfmDSPContext *const c, in if (!(flags & DAV1D_RISCV_CPU_FLAG_V)) return; #if BITDEPTH == 8 + assign_itx1_fn( , 4, 4, rvv); assign_itx2_fn( , 4, 4, rvv); assign_itx2_fn( , 8, 8, rvv); #endif -- cgit v1.2.3 From 
c436b8e3212384af6110a0015eb58be2a37bc6dc Mon Sep 17 00:00:00 2001 From: "Nathan E. Egge" Date: Sun, 28 Jan 2024 18:33:37 -0500 Subject: riscv64/itx: Special case 4x4 8bpc dct_dct eob = 0 inv_txfm_add_4x4_dct_dct_0_8bpc_c: 152.1 ( 1.00x) inv_txfm_add_4x4_dct_dct_0_8bpc_rvv: 46.9 ( 3.25x) inv_txfm_add_4x4_dct_dct_1_8bpc_c: 622.4 ( 1.00x) inv_txfm_add_4x4_dct_dct_1_8bpc_rvv: 120.7 ( 5.16x) --- src/riscv/64/itx.S | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/src/riscv/64/itx.S b/src/riscv/64/itx.S index 3c6ad5e..4bc535e 100644 --- a/src/riscv/64/itx.S +++ b/src/riscv/64/itx.S @@ -41,14 +41,20 @@ function inv_txfm_add_4x4_rvv, export=1, ext=v jalr t0, a4 + vmv.v.x v4, zero + vsseg4e16.v v0, (a2) vle16.v v0, (a2) + vse16.v v4, (a2) addi t0, a2, 8 vle16.v v1, (t0) + vse16.v v4, (t0) addi t0, t0, 8 vle16.v v2, (t0) + vse16.v v4, (t0) addi t0, t0, 8 vle16.v v3, (t0) + vse16.v v4, (t0) jalr t0, a5 @@ -57,6 +63,7 @@ function inv_txfm_add_4x4_rvv, export=1, ext=v vssra.vi v2, v2, 4 vssra.vi v3, v3, 4 +itx_4x4_end: vsetvli zero, zero, e8, mf2, ta, ma vle8.v v4, (a0) add t0, a0, a1 @@ -89,10 +96,6 @@ function inv_txfm_add_4x4_rvv, export=1, ext=v add a0, a0, a1 vse8.v v7, (a0) - vsetivli zero, 16, e16, m2, ta, ma - vmv.v.x v0, zero - vse16.v v0, (a2) - ret endfunc @@ -149,10 +152,29 @@ function inv_dct_e16_x4_rvv, export=1, ext=v endfunc .macro def_fn_4x4 txfm1, txfm2 -function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_rvv, export=1 +function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_rvv, export=1, ext=v +.ifc \txfm1\()_\txfm2, dct_dct + beqz a3, 1f +.endif la a4, inv_\txfm1\()_e16_x4_rvv la a5, inv_\txfm2\()_e16_x4_rvv j inv_txfm_add_4x4_rvv +.ifc \txfm1\()_\txfm2, dct_dct +1: + csrw vxrm, zero + vsetivli zero, 4, e16, m1, ta, ma + ld t2, (a2) + li t1, 2896*8 + vmv.v.x v0, t2 + vsmul.vx v0, v0, t1 + sd x0, (a2) + vsmul.vx v0, v0, t1 + vssra.vi v0, v0, 4 + vmv.v.v v1, v0 + vmv.v.v v2, v0 + vmv.v.v v3, v0 + j itx_4x4_end +.endif endfunc .endm -- cgit v1.2.3 From 15072b34be4b47a8f33608cd62fd6ae7cf2b6b1e Mon Sep 17 00:00:00 2001 From: "Nathan E. 
Egge" Date: Sun, 28 Jan 2024 18:38:26 -0500 Subject: riscv64: Add missing {decl|assign}_itx_fn() macros --- src/riscv/itx.h | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/src/riscv/itx.h b/src/riscv/itx.h index 821b54c..7a44199 100644 --- a/src/riscv/itx.h +++ b/src/riscv/itx.h @@ -32,9 +32,33 @@ decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \ decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt)) +#define decl_itx12_fns(w, h, opt) \ +decl_itx2_fns(w, h, opt); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt)) + +#define decl_itx16_fns(w, h, opt) \ +decl_itx12_fns(w, h, opt); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt)) + +#define decl_itx17_fns(w, h, opt) \ +decl_itx16_fns(w, h, opt); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt)) + #define decl_itx_fns(ext) \ -decl_itx2_fns( 4, 4, ext); \ -decl_itx2_fns( 8, 8, ext) +decl_itx17_fns( 4, 4, ext); \ +decl_itx16_fns( 8, 8, ext) decl_itx_fns(rvv); @@ -47,15 +71,39 @@ static ALWAYS_INLINE void itx_dsp_init_riscv(Dav1dInvTxfmDSPContext *const c, in assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext) #define assign_itx2_fn(pfx, w, h, ext) \ + assign_itx1_fn(pfx, w, h, ext); \ assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext) +#define assign_itx12_fn(pfx, w, h, ext) \ + assign_itx2_fn(pfx, w, h, ext); \ + assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \ + assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \ + assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \ + assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \ + assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \ + assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \ + assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext) + +#define assign_itx16_fn(pfx, w, h, ext) \ + assign_itx12_fn(pfx, w, h, ext); \ + assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \ + assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \ + assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext) + +#define assign_itx17_fn(pfx, w, h, ext) \ + assign_itx16_fn(pfx, w, h, ext); \ + assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext) + const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_RISCV_CPU_FLAG_V)) return; #if BITDEPTH == 8 - assign_itx1_fn( , 4, 4, rvv); assign_itx2_fn( , 4, 4, rvv); - assign_itx2_fn( , 8, 8, rvv); + assign_itx_fn( , 8, 8, 
identity_identity, IDTX, rvv); #endif } -- cgit v1.2.3 From 802d9257c5905b65f7ef0d68da751cbda8b934a7 Mon Sep 17 00:00:00 2001 From: "Nathan E. Egge" Date: Sun, 28 Jan 2024 18:46:50 -0500 Subject: riscv64/itx: Add 4x4 8bpc dct_identity and identity_dct inv_txfm_add_4x4_dct_identity_0_8bpc_c: 619.6 ( 1.00x) inv_txfm_add_4x4_dct_identity_0_8bpc_rvv: 95.6 ( 6.48x) inv_txfm_add_4x4_dct_identity_1_8bpc_c: 620.0 ( 1.00x) inv_txfm_add_4x4_dct_identity_1_8bpc_rvv: 95.6 ( 6.49x) inv_txfm_add_4x4_identity_dct_0_8bpc_c: 573.9 ( 1.00x) inv_txfm_add_4x4_identity_dct_0_8bpc_rvv: 98.1 ( 5.85x) inv_txfm_add_4x4_identity_dct_1_8bpc_c: 573.1 ( 1.00x) inv_txfm_add_4x4_identity_dct_1_8bpc_rvv: 98.1 ( 5.84x) --- src/riscv/64/itx.S | 2 ++ src/riscv/itx.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/riscv/64/itx.S b/src/riscv/64/itx.S index 4bc535e..7cf27a6 100644 --- a/src/riscv/64/itx.S +++ b/src/riscv/64/itx.S @@ -180,6 +180,8 @@ endfunc def_fn_4x4 dct, dct def_fn_4x4 identity, identity +def_fn_4x4 dct, identity +def_fn_4x4 identity, dct .macro def_fn_8x8_base variant function inv_txfm_\variant\()add_8x8_rvv, export=1, ext=v diff --git a/src/riscv/itx.h b/src/riscv/itx.h index 7a44199..74f7a89 100644 --- a/src/riscv/itx.h +++ b/src/riscv/itx.h @@ -104,6 +104,8 @@ static ALWAYS_INLINE void itx_dsp_init_riscv(Dav1dInvTxfmDSPContext *const c, in #if BITDEPTH == 8 assign_itx2_fn( , 4, 4, rvv); + assign_itx_fn( , 4, 4, dct_identity, H_DCT, rvv); + assign_itx_fn( , 4, 4, identity_dct, V_DCT, rvv); assign_itx_fn( , 8, 8, identity_identity, IDTX, rvv); #endif } -- cgit v1.2.3 From c07ccee5aac1b0a588e48cdc20d21073b8c9843d Mon Sep 17 00:00:00 2001 From: "Nathan E. Egge" Date: Sun, 28 Jan 2024 18:58:06 -0500 Subject: riscv64/itx: Improve inv_txfm_add_4x4_rvv performance Use a higher SEW/LMUL ratio to reduce data path costs. 
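[Editor's note (not part of the original commit message): "SEW/LMUL ratio" refers to the element-width (SEW) and register-grouping (LMUL) settings selected with vsetvli/vsetivli. A minimal illustrative sketch of the two configurations in question, assuming the same 4-element 16-bit workload as the 4x4 transform in the diff below; this is an illustration, not text taken from the patch:

    vsetivli zero, 4, e16, m1, ta, ma    # SEW/LMUL = 16/1     = 16 (previous setting)
    vsetivli zero, 4, e16, mf2, ta, ma   # SEW/LMUL = 16/(1/2) = 32 (this patch)

With mf2, the same four 16-bit elements occupy only half of one vector register and widening operations produce m1 instead of m2 results, so each instruction touches less vector state, which is presumably the data-path cost reduction that the benchmarks below measure.]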
inv_txfm_add_4x4_dct_dct_0_8bpc_c: 153.5 ( 1.00x) inv_txfm_add_4x4_dct_dct_0_8bpc_rvv: 39.4 ( 3.90x) inv_txfm_add_4x4_dct_dct_1_8bpc_c: 634.4 ( 1.00x) inv_txfm_add_4x4_dct_dct_1_8bpc_rvv: 90.6 ( 7.00x) inv_txfm_add_4x4_dct_identity_0_8bpc_c: 621.0 ( 1.00x) inv_txfm_add_4x4_dct_identity_0_8bpc_rvv: 74.2 ( 8.37x) inv_txfm_add_4x4_dct_identity_1_8bpc_c: 619.8 ( 1.00x) inv_txfm_add_4x4_dct_identity_1_8bpc_rvv: 74.2 ( 8.35x) inv_txfm_add_4x4_identity_dct_0_8bpc_c: 574.3 ( 1.00x) inv_txfm_add_4x4_identity_dct_0_8bpc_rvv: 73.6 ( 7.80x) inv_txfm_add_4x4_identity_dct_1_8bpc_c: 574.5 ( 1.00x) inv_txfm_add_4x4_identity_dct_1_8bpc_rvv: 73.6 ( 7.80x) inv_txfm_add_4x4_identity_identity_0_8bpc_c: 548.8 ( 1.00x) inv_txfm_add_4x4_identity_identity_0_8bpc_rvv: 60.5 ( 9.08x) inv_txfm_add_4x4_identity_identity_1_8bpc_c: 548.7 ( 1.00x) inv_txfm_add_4x4_identity_identity_1_8bpc_rvv: 60.5 ( 9.07x) --- src/riscv/64/itx.S | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/riscv/64/itx.S b/src/riscv/64/itx.S index 7cf27a6..5de5280 100644 --- a/src/riscv/64/itx.S +++ b/src/riscv/64/itx.S @@ -30,7 +30,7 @@ function inv_txfm_add_4x4_rvv, export=1, ext=v csrw vxrm, zero - vsetivli zero, 4, e16, m1, ta, ma + vsetivli zero, 4, e16, mf2, ta, ma vle16.v v0, (a2) addi t0, a2, 8 vle16.v v1, (t0) @@ -64,7 +64,7 @@ function inv_txfm_add_4x4_rvv, export=1, ext=v vssra.vi v3, v3, 4 itx_4x4_end: - vsetvli zero, zero, e8, mf2, ta, ma + vsetvli zero, zero, e8, mf4, ta, ma vle8.v v4, (a0) add t0, a0, a1 vle8.v v5, (t0) @@ -78,10 +78,13 @@ itx_4x4_end: vwaddu.wv v2, v2, v6 vwaddu.wv v3, v3, v7 - vsetvli t0, zero, e16, m4 - vmax.vx v0, v0, x0 + vsetvli zero, zero, e16, mf2, ta, ma + vmax.vx v0, v0, zero + vmax.vx v1, v1, zero + vmax.vx v2, v2, zero + vmax.vx v3, v3, zero - vsetivli zero, 4, e8, mf2, ta, ma + vsetvli zero, zero, e8, mf4, ta, ma vnclipu.wi v4, v0, 0 vnclipu.wi v5, v1, 0 @@ -162,7 +165,7 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_rvv, export=1, ext=v .ifc \txfm1\()_\txfm2, dct_dct 1: csrw vxrm, zero - vsetivli zero, 4, e16, m1, ta, ma + vsetivli zero, 4, e16, mf2, ta, ma ld t2, (a2) li t1, 2896*8 vmv.v.x v0, t2 -- cgit v1.2.3 From 5305eb3632e7985c55c3178d354d8d0e963c3637 Mon Sep 17 00:00:00 2001 From: "Nathan E. 
Egge" Date: Sun, 28 Jan 2024 19:17:00 -0500 Subject: riscv64/itx: Add 4-point 8bpc RVV adst transform inv_txfm_add_4x4_adst_adst_0_8bpc_c: 674.2 ( 1.00x) inv_txfm_add_4x4_adst_adst_0_8bpc_rvv: 98.5 ( 6.84x) inv_txfm_add_4x4_adst_adst_1_8bpc_c: 674.1 ( 1.00x) inv_txfm_add_4x4_adst_adst_1_8bpc_rvv: 98.5 ( 6.84x) inv_txfm_add_4x4_adst_dct_0_8bpc_c: 650.3 ( 1.00x) inv_txfm_add_4x4_adst_dct_0_8bpc_rvv: 93.1 ( 6.99x) inv_txfm_add_4x4_adst_dct_1_8bpc_c: 650.6 ( 1.00x) inv_txfm_add_4x4_adst_dct_1_8bpc_rvv: 93.1 ( 6.99x) inv_txfm_add_4x4_adst_identity_0_8bpc_c: 635.6 ( 1.00x) inv_txfm_add_4x4_adst_identity_0_8bpc_rvv: 79.9 ( 7.95x) inv_txfm_add_4x4_adst_identity_1_8bpc_c: 635.4 ( 1.00x) inv_txfm_add_4x4_adst_identity_1_8bpc_rvv: 79.9 ( 7.95x) inv_txfm_add_4x4_dct_adst_0_8bpc_c: 649.7 ( 1.00x) inv_txfm_add_4x4_dct_adst_0_8bpc_rvv: 93.7 ( 6.93x) inv_txfm_add_4x4_dct_adst_1_8bpc_c: 649.6 ( 1.00x) inv_txfm_add_4x4_dct_adst_1_8bpc_rvv: 93.7 ( 6.93x) inv_txfm_add_4x4_identity_adst_0_8bpc_c: 594.1 ( 1.00x) inv_txfm_add_4x4_identity_adst_0_8bpc_rvv: 79.8 ( 7.45x) inv_txfm_add_4x4_identity_adst_1_8bpc_c: 592.6 ( 1.00x) inv_txfm_add_4x4_identity_adst_1_8bpc_rvv: 79.8 ( 7.43x) --- src/riscv/64/itx.S | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/riscv/itx.h | 5 +++++ 2 files changed, 56 insertions(+) diff --git a/src/riscv/64/itx.S b/src/riscv/64/itx.S index 5de5280..b29edbb 100644 --- a/src/riscv/64/itx.S +++ b/src/riscv/64/itx.S @@ -154,6 +154,52 @@ function inv_dct_e16_x4_rvv, export=1, ext=v jr t0 endfunc +function inv_adst_e16_x4_rvv, export=1, ext=v + li t1, 1321 + li t2, 3803 + li t3, 2482 + + vwmul.vx v4, v0, t1 + vwmul.vx v5, v0, t3 + neg t1, t1 + vwmacc.vx v4, t2, v2 + vwmacc.vx v5, t1, v2 + neg t2, t2 + vwmacc.vx v4, t3, v3 + vwmacc.vx v5, t2, v3 + + vwsub.vv v6, v0, v2 + vwadd.wv v6, v6, v3 + + li t1, 3344 + vwmul.vx v7, v1, t1 + + vsetvli zero, zero, e32, m1, ta, ma + + vmul.vx v6, v6, t1 + + vadd.vv v8, v4, v5 + vadd.vv v4, v4, v7 + vadd.vv v5, v5, v7 + vsub.vv v7, v8, v7 + + li t1, 2048 + + vadd.vx v4, v4, t1 + vadd.vx v5, v5, t1 + vadd.vx v6, v6, t1 + vadd.vx v7, v7, t1 + + vsetvli zero, zero, e16, mf2, ta, ma + + vnsra.wi v0, v4, 12 + vnsra.wi v1, v5, 12 + vnsra.wi v2, v6, 12 + vnsra.wi v3, v7, 12 + + jr t0 +endfunc + .macro def_fn_4x4 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_rvv, export=1, ext=v .ifc \txfm1\()_\txfm2, dct_dct @@ -183,8 +229,13 @@ endfunc def_fn_4x4 dct, dct def_fn_4x4 identity, identity +def_fn_4x4 dct, adst def_fn_4x4 dct, identity +def_fn_4x4 adst, dct +def_fn_4x4 adst, adst def_fn_4x4 identity, dct +def_fn_4x4 adst, identity +def_fn_4x4 identity, adst .macro def_fn_8x8_base variant function inv_txfm_\variant\()add_8x8_rvv, export=1, ext=v diff --git a/src/riscv/itx.h b/src/riscv/itx.h index 74f7a89..0a7ffcf 100644 --- a/src/riscv/itx.h +++ b/src/riscv/itx.h @@ -104,8 +104,13 @@ static ALWAYS_INLINE void itx_dsp_init_riscv(Dav1dInvTxfmDSPContext *const c, in #if BITDEPTH == 8 assign_itx2_fn( , 4, 4, rvv); + assign_itx_fn( , 4, 4, dct_adst, ADST_DCT, rvv); assign_itx_fn( , 4, 4, dct_identity, H_DCT, rvv); + assign_itx_fn( , 4, 4, adst_dct, DCT_ADST, rvv); + assign_itx_fn( , 4, 4, adst_adst, ADST_ADST, rvv); assign_itx_fn( , 4, 4, identity_dct, V_DCT, rvv); + assign_itx_fn( , 4, 4, adst_identity, H_ADST, rvv); + assign_itx_fn( , 4, 4, identity_adst, V_ADST, rvv); assign_itx_fn( , 8, 8, identity_identity, IDTX, rvv); #endif } -- cgit v1.2.3 From 91c9e6fe5d1d5460d8cf5f5371f3b3a8b01e9f9b Mon Sep 17 00:00:00 2001 From: "Nathan E. 
Egge" Date: Sun, 28 Jan 2024 19:45:40 -0500 Subject: riscv64/itx: Convert inv_adst_e16_x4_rvv to macro --- src/riscv/64/itx.S | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/riscv/64/itx.S b/src/riscv/64/itx.S index b29edbb..6e6457c 100644 --- a/src/riscv/64/itx.S +++ b/src/riscv/64/itx.S @@ -154,7 +154,7 @@ function inv_dct_e16_x4_rvv, export=1, ext=v jr t0 endfunc -function inv_adst_e16_x4_rvv, export=1, ext=v +.macro iadst_4 o0, o1, o2, o3 li t1, 1321 li t2, 3803 li t3, 2482 @@ -192,11 +192,14 @@ function inv_adst_e16_x4_rvv, export=1, ext=v vsetvli zero, zero, e16, mf2, ta, ma - vnsra.wi v0, v4, 12 - vnsra.wi v1, v5, 12 - vnsra.wi v2, v6, 12 - vnsra.wi v3, v7, 12 + vnsra.wi \o0, v4, 12 + vnsra.wi \o1, v5, 12 + vnsra.wi \o2, v6, 12 + vnsra.wi \o3, v7, 12 +.endm +function inv_adst_e16_x4_rvv, export=1, ext=v + iadst_4 v0, v1, v2, v3 jr t0 endfunc -- cgit v1.2.3 From 9b9f22842e65bad43ee03f10943e3726e85dba03 Mon Sep 17 00:00:00 2001 From: "Nathan E. Egge" Date: Sun, 28 Jan 2024 19:53:07 -0500 Subject: riscv64/itx: Add 4-point 8bpc RVV flipadst transform inv_txfm_add_4x4_adst_flipadst_0_8bpc_c: 686.6 ( 1.00x) inv_txfm_add_4x4_adst_flipadst_0_8bpc_rvv: 104.9 ( 6.55x) inv_txfm_add_4x4_adst_flipadst_1_8bpc_c: 686.7 ( 1.00x) inv_txfm_add_4x4_adst_flipadst_1_8bpc_rvv: 104.9 ( 6.55x) inv_txfm_add_4x4_dct_flipadst_0_8bpc_c: 664.3 ( 1.00x) inv_txfm_add_4x4_dct_flipadst_0_8bpc_rvv: 94.2 ( 7.05x) inv_txfm_add_4x4_dct_flipadst_1_8bpc_c: 663.5 ( 1.00x) inv_txfm_add_4x4_dct_flipadst_1_8bpc_rvv: 94.2 ( 7.04x) inv_txfm_add_4x4_flipadst_adst_0_8bpc_c: 686.5 ( 1.00x) inv_txfm_add_4x4_flipadst_adst_0_8bpc_rvv: 101.4 ( 6.77x) inv_txfm_add_4x4_flipadst_adst_1_8bpc_c: 685.6 ( 1.00x) inv_txfm_add_4x4_flipadst_adst_1_8bpc_rvv: 101.4 ( 6.76x) inv_txfm_add_4x4_flipadst_dct_0_8bpc_c: 664.4 ( 1.00x) inv_txfm_add_4x4_flipadst_dct_0_8bpc_rvv: 93.7 ( 7.09x) inv_txfm_add_4x4_flipadst_dct_1_8bpc_c: 664.4 ( 1.00x) inv_txfm_add_4x4_flipadst_dct_1_8bpc_rvv: 93.6 ( 7.10x) inv_txfm_add_4x4_flipadst_flipadst_0_8bpc_c: 691.5 ( 1.00x) inv_txfm_add_4x4_flipadst_flipadst_0_8bpc_rvv: 102.5 ( 6.74x) inv_txfm_add_4x4_flipadst_flipadst_1_8bpc_c: 691.5 ( 1.00x) inv_txfm_add_4x4_flipadst_flipadst_1_8bpc_rvv: 102.5 ( 6.74x) inv_txfm_add_4x4_flipadst_identity_0_8bpc_c: 641.8 ( 1.00x) inv_txfm_add_4x4_flipadst_identity_0_8bpc_rvv: 80.5 ( 7.97x) inv_txfm_add_4x4_flipadst_identity_1_8bpc_c: 641.7 ( 1.00x) inv_txfm_add_4x4_flipadst_identity_1_8bpc_rvv: 80.5 ( 7.97x) inv_txfm_add_4x4_identity_flipadst_0_8bpc_c: 605.3 ( 1.00x) inv_txfm_add_4x4_identity_flipadst_0_8bpc_rvv: 80.5 ( 7.52x) inv_txfm_add_4x4_identity_flipadst_1_8bpc_c: 606.3 ( 1.00x) inv_txfm_add_4x4_identity_flipadst_1_8bpc_rvv: 80.5 ( 7.53x) --- src/riscv/64/itx.S | 12 ++++++++++++ src/riscv/itx.h | 9 +-------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/src/riscv/64/itx.S b/src/riscv/64/itx.S index 6e6457c..de710e9 100644 --- a/src/riscv/64/itx.S +++ b/src/riscv/64/itx.S @@ -203,6 +203,11 @@ function inv_adst_e16_x4_rvv, export=1, ext=v jr t0 endfunc +function inv_flipadst_e16_x4_rvv, export=1, ext=v + iadst_4 v3, v2, v1, v0 + jr t0 +endfunc + .macro def_fn_4x4 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_rvv, export=1, ext=v .ifc \txfm1\()_\txfm2, dct_dct @@ -233,12 +238,19 @@ endfunc def_fn_4x4 dct, dct def_fn_4x4 identity, identity def_fn_4x4 dct, adst +def_fn_4x4 dct, flipadst def_fn_4x4 dct, identity def_fn_4x4 adst, dct def_fn_4x4 adst, adst +def_fn_4x4 adst, flipadst +def_fn_4x4 flipadst, dct 
+def_fn_4x4 flipadst, adst +def_fn_4x4 flipadst, flipadst def_fn_4x4 identity, dct def_fn_4x4 adst, identity +def_fn_4x4 flipadst, identity def_fn_4x4 identity, adst +def_fn_4x4 identity, flipadst .macro def_fn_8x8_base variant function inv_txfm_\variant\()add_8x8_rvv, export=1, ext=v diff --git a/src/riscv/itx.h b/src/riscv/itx.h index 0a7ffcf..9d439a2 100644 --- a/src/riscv/itx.h +++ b/src/riscv/itx.h @@ -103,14 +103,7 @@ static ALWAYS_INLINE void itx_dsp_init_riscv(Dav1dInvTxfmDSPContext *const c, in if (!(flags & DAV1D_RISCV_CPU_FLAG_V)) return; #if BITDEPTH == 8 - assign_itx2_fn( , 4, 4, rvv); - assign_itx_fn( , 4, 4, dct_adst, ADST_DCT, rvv); - assign_itx_fn( , 4, 4, dct_identity, H_DCT, rvv); - assign_itx_fn( , 4, 4, adst_dct, DCT_ADST, rvv); - assign_itx_fn( , 4, 4, adst_adst, ADST_ADST, rvv); - assign_itx_fn( , 4, 4, identity_dct, V_DCT, rvv); - assign_itx_fn( , 4, 4, adst_identity, H_ADST, rvv); - assign_itx_fn( , 4, 4, identity_adst, V_ADST, rvv); + assign_itx16_fn( , 4, 4, rvv); assign_itx_fn( , 8, 8, identity_identity, IDTX, rvv); #endif } -- cgit v1.2.3 From 6ba7e5701aed91d896169118a47b90d8f44a15c7 Mon Sep 17 00:00:00 2001 From: "Nathan E. Egge" Date: Mon, 29 Jan 2024 04:52:26 -0500 Subject: riscv64/itx: Convert inv_dct_e16_x4_rvv to macro --- src/riscv/64/itx.S | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/src/riscv/64/itx.S b/src/riscv/64/itx.S index de710e9..ccc4be9 100644 --- a/src/riscv/64/itx.S +++ b/src/riscv/64/itx.S @@ -117,22 +117,22 @@ function inv_identity_e16_x4_rvv, export=1, ext=v jr t0 endfunc -function inv_dct_e16_x4_rvv, export=1, ext=v +.macro idct_4 o0, o1, o2, o3 li t1, 2896 li t2, 1567 li t3, 3784 - vwmul.vx v8, v0, t1 - vwmul.vx v10, v0, t1 - vwmacc.vx v8, t1, v2 + vwmul.vx v8, \o0, t1 + vwmul.vx v10, \o0, t1 + vwmacc.vx v8, t1, \o2 neg t1, t1 - vwmacc.vx v10, t1, v2 + vwmacc.vx v10, t1, \o2 - vwmul.vx v12, v1, t3 + vwmul.vx v12, \o1, t3 neg t3, t3 - vwmul.vx v14, v1, t2 - vwmacc.vx v12, t2, v3 - vwmacc.vx v14, t3, v3 + vwmul.vx v14, \o1, t2 + vwmacc.vx v12, t2, \o3 + vwmacc.vx v14, t3, \o3 li t1, 2048 @@ -146,13 +146,11 @@ function inv_dct_e16_x4_rvv, export=1, ext=v vnsra.wi v12, v12, 12 vnsra.wi v14, v14, 12 - vsadd.vv v0, v8, v12 - vsadd.vv v1, v10, v14 - vssub.vv v2, v10, v14 - vssub.vv v3, v8, v12 - - jr t0 -endfunc + vsadd.vv \o0, v8, v12 + vsadd.vv \o1, v10, v14 + vssub.vv \o2, v10, v14 + vssub.vv \o3, v8, v12 +.endm .macro iadst_4 o0, o1, o2, o3 li t1, 1321 @@ -198,6 +196,11 @@ endfunc vnsra.wi \o3, v7, 12 .endm +function inv_dct_e16_x4_rvv, export=1, ext=v + idct_4 v0, v1, v2, v3 + jr t0 +endfunc + function inv_adst_e16_x4_rvv, export=1, ext=v iadst_4 v0, v1, v2, v3 jr t0 -- cgit v1.2.3 From 1eaff185e02dc08eadc248e5bff4e5a8c93107d0 Mon Sep 17 00:00:00 2001 From: "Nathan E. 
Egge" Date: Mon, 29 Jan 2024 06:41:32 -0500 Subject: riscv64/itx: Add 8-point 8bpc RVV dct transform inv_txfm_add_8x8_dct_dct_0_8bpc_c: 443.8 ( 1.00x) inv_txfm_add_8x8_dct_dct_0_8bpc_rvv: 310.6 ( 1.43x) inv_txfm_add_8x8_dct_dct_1_8bpc_c: 2813.0 ( 1.00x) inv_txfm_add_8x8_dct_dct_1_8bpc_rvv: 312.3 ( 9.01x) --- src/riscv/64/itx.S | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/riscv/itx.h | 2 +- 2 files changed, 66 insertions(+), 1 deletion(-) diff --git a/src/riscv/64/itx.S b/src/riscv/64/itx.S index ccc4be9..27df860 100644 --- a/src/riscv/64/itx.S +++ b/src/riscv/64/itx.S @@ -407,6 +407,70 @@ function inv_identity_e16_x8_rvv, export=1, ext=v jr t0 endfunc +function inv_dct_e16_x8_rvv, export=1, ext=v + idct_4 v0, v2, v4, v6 + + li t1, 799 + li t2, 4017 + li t3, 3406 + li t4, 2276 + + vwmul.vx v14, v1, t2 + neg t2, t2 + vwmul.vx v8, v1, t1 + vwmacc.vx v14, t1, v7 + vwmacc.vx v8, t2, v7 + + vwmul.vx v12, v5, t4 + neg t4, t4 + vwmul.vx v10, v5, t3 + vwmacc.vx v12, t3, v3 + vwmacc.vx v10, t4, v3 + + li t1, 2048 + + vwadd.wx v8, v8, t1 + vwadd.wx v10, v10, t1 + vwadd.wx v12, v12, t1 + vwadd.wx v14, v14, t1 + + vnsra.wi v8, v8, 12 + vnsra.wi v10, v10, 12 + vnsra.wi v12, v12, 12 + vnsra.wi v14, v14, 12 + + vssub.vv v7, v14, v12 + vsadd.vv v14, v14, v12 + vssub.vv v1, v8, v10 + vsadd.vv v8, v8, v10 + + li t2, 2896 + + vwmul.vx v10, v7, t2 + vwmul.vx v12, v7, t2 + vwmacc.vx v12, t2, v1 + neg t2, t2 + vwmacc.vx v10, t2, v1 + + vwadd.wx v10, v10, t1 + vwadd.wx v12, v12, t1 + + vnsra.wi v10, v10, 12 + vnsra.wi v12, v12, 12 + + vssub.vv v7, v0, v14 + vsadd.vv v0, v0, v14 + vssub.vv v9, v2, v12 + vsadd.vv v1, v2, v12 + vssub.vv v5, v4, v10 + vsadd.vv v2, v4, v10 + vssub.vv v4, v6, v8 + vsadd.vv v3, v6, v8 + vmv.v.v v6, v9 + + jr t0 +endfunc + .macro def_fn_8x8 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_rvv, export=1 la a5, inv_\txfm2\()_e16_x8_rvv @@ -419,4 +483,5 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_rvv, export=1 endfunc .endm +def_fn_8x8 dct, dct def_fn_8x8 identity, identity diff --git a/src/riscv/itx.h b/src/riscv/itx.h index 9d439a2..151d0ee 100644 --- a/src/riscv/itx.h +++ b/src/riscv/itx.h @@ -104,6 +104,6 @@ static ALWAYS_INLINE void itx_dsp_init_riscv(Dav1dInvTxfmDSPContext *const c, in #if BITDEPTH == 8 assign_itx16_fn( , 4, 4, rvv); - assign_itx_fn( , 8, 8, identity_identity, IDTX, rvv); + assign_itx2_fn( , 8, 8, rvv); #endif } -- cgit v1.2.3 From 877486e043f8459dd937bf9cc2771631f976c017 Mon Sep 17 00:00:00 2001 From: "Nathan E. 
Egge" Date: Mon, 29 Jan 2024 09:20:25 -0500 Subject: riscv64/itx: Special case 8x8 8bpc dct_dct eob = 0 inv_txfm_add_8x8_dct_dct_0_8bpc_c: 443.1 ( 1.00x) inv_txfm_add_8x8_dct_dct_0_8bpc_rvv: 75.3 ( 5.88x) inv_txfm_add_8x8_dct_dct_1_8bpc_c: 2796.8 ( 1.00x) inv_txfm_add_8x8_dct_dct_1_8bpc_rvv: 315.6 ( 8.86x) --- src/riscv/64/itx.S | 41 ++++++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/src/riscv/64/itx.S b/src/riscv/64/itx.S index 27df860..49d2493 100644 --- a/src/riscv/64/itx.S +++ b/src/riscv/64/itx.S @@ -319,7 +319,15 @@ function inv_txfm_\variant\()add_8x8_rvv, export=1, ext=v vssra.vi v6, v6, 4 vssra.vi v7, v7, 4 - vsetvli zero, zero, e8, mf2, ta, ma + li t1, 64 + vsetvli zero, t1, e16, m8, ta, ma + vmv.v.x v8, zero + vse16.v v8, (a2) + +.ifc \variant, identity_ +itx_8x8_end: +.endif + vsetivli zero, 8, e8, mf2, ta, ma vle8.v v8, (a0) add t0, a0, a1 vle8.v v9, (t0) @@ -382,11 +390,6 @@ function inv_txfm_\variant\()add_8x8_rvv, export=1, ext=v add a0, a0, a1 vse8.v v15, (a0) - li t1, 64 - vsetvli zero, t1, e16, m8, ta, ma - vmv.v.x v0, zero - vse16.v v0, (a2) - ret endfunc .endm @@ -472,7 +475,10 @@ function inv_dct_e16_x8_rvv, export=1, ext=v endfunc .macro def_fn_8x8 txfm1, txfm2 -function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_rvv, export=1 +function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_rvv, export=1, ext=v +.ifc \txfm1\()_\txfm2, dct_dct + beqz a3, 1f +.endif la a5, inv_\txfm2\()_e16_x8_rvv .ifc \txfm1, identity j inv_txfm_identity_add_8x8_rvv @@ -480,6 +486,27 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_rvv, export=1 la a4, inv_\txfm1\()_e16_x8_rvv j inv_txfm_add_8x8_rvv .endif +.ifc \txfm1\()_\txfm2, dct_dct +1: + csrw vxrm, zero + vsetivli zero, 8, e16, m1, ta, ma + ld t2, (a2) + li t1, 2896*8 + vmv.v.x v0, t2 + vsmul.vx v0, v0, t1 + sd x0, (a2) + vssra.vi v0, v0, 1 + vsmul.vx v0, v0, t1 + vssra.vi v0, v0, 4 + vmv.v.v v1, v0 + vmv.v.v v2, v0 + vmv.v.v v3, v0 + vmv.v.v v4, v0 + vmv.v.v v5, v0 + vmv.v.v v6, v0 + vmv.v.v v7, v0 + j itx_8x8_end +.endif endfunc .endm -- cgit v1.2.3 From 3e8260d6882c3dcf7ee28daa450f10a3df5baa71 Mon Sep 17 00:00:00 2001 From: "Nathan E. 
Egge" Date: Mon, 29 Jan 2024 09:49:54 -0500 Subject: riscv64/itx: Add 8x8 8bpc dct_identity and identity_dct inv_txfm_add_8x8_dct_identity_0_8bpc_c: 2122.2 ( 1.00x) inv_txfm_add_8x8_dct_identity_0_8bpc_rvv: 239.6 ( 8.86x) inv_txfm_add_8x8_dct_identity_1_8bpc_c: 2122.6 ( 1.00x) inv_txfm_add_8x8_dct_identity_1_8bpc_rvv: 239.4 ( 8.87x) inv_txfm_add_8x8_identity_dct_0_8bpc_c: 2093.6 ( 1.00x) inv_txfm_add_8x8_identity_dct_0_8bpc_rvv: 224.2 ( 9.34x) inv_txfm_add_8x8_identity_dct_1_8bpc_c: 2092.7 ( 1.00x) inv_txfm_add_8x8_identity_dct_1_8bpc_rvv: 224.2 ( 9.34x) --- src/riscv/64/itx.S | 2 ++ src/riscv/itx.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/riscv/64/itx.S b/src/riscv/64/itx.S index 49d2493..3a564d4 100644 --- a/src/riscv/64/itx.S +++ b/src/riscv/64/itx.S @@ -512,3 +512,5 @@ endfunc def_fn_8x8 dct, dct def_fn_8x8 identity, identity +def_fn_8x8 dct, identity +def_fn_8x8 identity, dct diff --git a/src/riscv/itx.h b/src/riscv/itx.h index 151d0ee..338e54a 100644 --- a/src/riscv/itx.h +++ b/src/riscv/itx.h @@ -105,5 +105,7 @@ static ALWAYS_INLINE void itx_dsp_init_riscv(Dav1dInvTxfmDSPContext *const c, in #if BITDEPTH == 8 assign_itx16_fn( , 4, 4, rvv); assign_itx2_fn( , 8, 8, rvv); + assign_itx_fn( , 8, 8, dct_identity, H_DCT, rvv); + assign_itx_fn( , 8, 8, identity_dct, V_DCT, rvv); #endif } -- cgit v1.2.3 From 64f9fd0239730226487d56e23aae1735c27b2ab7 Mon Sep 17 00:00:00 2001 From: "Nathan E. Egge" Date: Tue, 30 Jan 2024 07:43:08 -0500 Subject: riscv64/itx: Add 8-point 8bpc RVV adst transform inv_txfm_add_8x8_adst_adst_0_8bpc_c: 3338.5 ( 1.00x) inv_txfm_add_8x8_adst_adst_0_8bpc_rvv: 400.4 ( 8.34x) inv_txfm_add_8x8_adst_adst_1_8bpc_c: 3338.1 ( 1.00x) inv_txfm_add_8x8_adst_adst_1_8bpc_rvv: 399.8 ( 8.35x) inv_txfm_add_8x8_adst_dct_0_8bpc_c: 3112.5 ( 1.00x) inv_txfm_add_8x8_adst_dct_0_8bpc_rvv: 357.2 ( 8.71x) inv_txfm_add_8x8_adst_dct_1_8bpc_c: 3111.4 ( 1.00x) inv_txfm_add_8x8_adst_dct_1_8bpc_rvv: 357.0 ( 8.71x) inv_txfm_add_8x8_adst_identity_0_8bpc_c: 2375.0 ( 1.00x) inv_txfm_add_8x8_adst_identity_0_8bpc_rvv: 281.0 ( 8.45x) inv_txfm_add_8x8_adst_identity_1_8bpc_c: 2375.6 ( 1.00x) inv_txfm_add_8x8_adst_identity_1_8bpc_rvv: 281.0 ( 8.45x) inv_txfm_add_8x8_dct_adst_0_8bpc_c: 3113.3 ( 1.00x) inv_txfm_add_8x8_dct_adst_0_8bpc_rvv: 357.2 ( 8.72x) inv_txfm_add_8x8_dct_adst_1_8bpc_c: 3112.1 ( 1.00x) inv_txfm_add_8x8_dct_adst_1_8bpc_rvv: 357.2 ( 8.71x) inv_txfm_add_8x8_identity_adst_0_8bpc_c: 2346.7 ( 1.00x) inv_txfm_add_8x8_identity_adst_0_8bpc_rvv: 265.6 ( 8.83x) inv_txfm_add_8x8_identity_adst_1_8bpc_c: 2348.3 ( 1.00x) inv_txfm_add_8x8_identity_adst_1_8bpc_rvv: 265.8 ( 8.84x) --- src/riscv/64/itx.S | 131 +++++++++++++++++++++++++++++++++++++++++++++++++++++ src/riscv/itx.h | 5 ++ 2 files changed, 136 insertions(+) diff --git a/src/riscv/64/itx.S b/src/riscv/64/itx.S index 3a564d4..5841298 100644 --- a/src/riscv/64/itx.S +++ b/src/riscv/64/itx.S @@ -474,6 +474,132 @@ function inv_dct_e16_x8_rvv, export=1, ext=v jr t0 endfunc +function inv_adst_e16_x8_rvv, export=1, ext=v + li t1, 4076 + li t2, 401 + li t3, 3612 + li t4, 1931 + li t5, 2598 + li t6, 3166 + + vwmul.vx v8, v7, t1 + neg t1, t1 + vwmul.vx v10, v7, t2 + vwmacc.vx v8, t2, v0 + vwmacc.vx v10, t1, v0 + + vwmul.vx v12, v5, t3 + neg t3, t3 + vwmul.vx v14, v5, t4 + vwmacc.vx v12, t4, v2 + vwmacc.vx v14, t3, v2 + + vwmul.vx v16, v3, t5 + neg t5, t5 + vwmul.vx v18, v3, t6 + vwmacc.vx v16, t6, v4 + vwmacc.vx v18, t5, v4 + + li t1, 2048 + li t2, 1189 + li t3, 3920 + li t4, 1567 + li t5, 3784 + li t6, 2896 + + vwmul.vx v20, v1, t2 + neg t2, 
t2 + vwmul.vx v22, v1, t3 + vwmacc.vx v20, t3, v6 + vwmacc.vx v22, t2, v6 + + vwadd.wx v8, v8, t1 + vwadd.wx v10, v10, t1 + vwadd.wx v12, v12, t1 + vwadd.wx v14, v14, t1 + vwadd.wx v16, v16, t1 + vwadd.wx v18, v18, t1 + vwadd.wx v20, v20, t1 + vwadd.wx v22, v22, t1 + + vnsra.wi v8, v8, 12 + vnsra.wi v10, v10, 12 + vnsra.wi v12, v12, 12 + vnsra.wi v14, v14, 12 + vnsra.wi v16, v16, 12 + vnsra.wi v18, v18, 12 + vnsra.wi v20, v20, 12 + vnsra.wi v22, v22, 12 + + vssub.vv v4, v8, v16 + vsadd.vv v8, v8, v16 + vsadd.vv v1, v10, v18 + vsadd.vv v2, v12, v20 + vsadd.vv v3, v14, v22 + vssub.vv v5, v10, v18 + vssub.vv v6, v12, v20 + vssub.vv v22, v14, v22 + + vsadd.vv v0, v8, v2 + vsadd.vv v7, v1, v3 + vssub.vv v2, v8, v2 + vssub.vv v3, v1, v3 + + vwmul.vx v8, v4, t5 + vwmul.vx v10, v4, t4 + vwmul.vx v12, v22, t5 + vwmul.vx v14, v22, t4 + vwmacc.vx v8, t4, v5 + neg t4, t4 + vwmacc.vx v14, t5, v6 + neg t5, t5 + vwmacc.vx v12, t4, v6 + vwmacc.vx v10, t5, v5 + + vwadd.wx v8, v8, t1 + vwadd.wx v10, v10, t1 + vwadd.wx v12, v12, t1 + vwadd.wx v14, v14, t1 + + vnsra.wi v8, v8, 12 + vnsra.wi v10, v10, 12 + vnsra.wi v12, v12, 12 + vnsra.wi v14, v14, 12 + + vsadd.vv v1, v8, v12 + vsadd.vv v6, v10, v14 + vssub.vv v8, v8, v12 + vssub.vv v9, v10, v14 + + vwmul.vx v10, v2, t6 + vwmul.vx v12, v2, t6 + vwmul.vx v14, v8, t6 + vwmul.vx v16, v8, t6 + vwmacc.vx v10, t6, v3 + vwmacc.vx v14, t6, v9 + neg t6, t6 + vwmacc.vx v12, t6, v3 + vwmacc.vx v16, t6, v9 + + vwadd.wx v10, v10, t1 + vwadd.wx v12, v12, t1 + vwadd.wx v14, v14, t1 + vwadd.wx v16, v16, t1 + + vnsra.wi v3, v10, 12 + vnsra.wi v4, v12, 12 + vnsra.wi v2, v14, 12 + vnsra.wi v5, v16, 12 + + vmv.v.x v8, zero + vssub.vv v1, v8, v1 + vssub.vv v3, v8, v3 + vssub.vv v5, v8, v5 + vssub.vv v7, v8, v7 + + jr t0 +endfunc + .macro def_fn_8x8 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_rvv, export=1, ext=v .ifc \txfm1\()_\txfm2, dct_dct @@ -512,5 +638,10 @@ endfunc def_fn_8x8 dct, dct def_fn_8x8 identity, identity +def_fn_8x8 dct, adst def_fn_8x8 dct, identity +def_fn_8x8 adst, dct +def_fn_8x8 adst, adst def_fn_8x8 identity, dct +def_fn_8x8 adst, identity +def_fn_8x8 identity, adst diff --git a/src/riscv/itx.h b/src/riscv/itx.h index 338e54a..9b791f3 100644 --- a/src/riscv/itx.h +++ b/src/riscv/itx.h @@ -105,7 +105,12 @@ static ALWAYS_INLINE void itx_dsp_init_riscv(Dav1dInvTxfmDSPContext *const c, in #if BITDEPTH == 8 assign_itx16_fn( , 4, 4, rvv); assign_itx2_fn( , 8, 8, rvv); + assign_itx_fn( , 8, 8, dct_adst, ADST_DCT, rvv); assign_itx_fn( , 8, 8, dct_identity, H_DCT, rvv); + assign_itx_fn( , 8, 8, adst_dct, DCT_ADST, rvv); + assign_itx_fn( , 8, 8, adst_adst, ADST_ADST, rvv); assign_itx_fn( , 8, 8, identity_dct, V_DCT, rvv); + assign_itx_fn( , 8, 8, adst_identity, H_ADST, rvv); + assign_itx_fn( , 8, 8, identity_adst, V_ADST, rvv); #endif } -- cgit v1.2.3 From b5747aee1e47f3d167d74d5cb6310d186a7c9e15 Mon Sep 17 00:00:00 2001 From: "Nathan E. 
Egge" Date: Tue, 30 Jan 2024 08:29:59 -0500 Subject: riscv64/itx: Convert inv_adst_e16_x8_rvv to macro --- src/riscv/64/itx.S | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/src/riscv/64/itx.S b/src/riscv/64/itx.S index 5841298..44ca29b 100644 --- a/src/riscv/64/itx.S +++ b/src/riscv/64/itx.S @@ -474,7 +474,7 @@ function inv_dct_e16_x8_rvv, export=1, ext=v jr t0 endfunc -function inv_adst_e16_x8_rvv, export=1, ext=v +.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7 li t1, 4076 li t2, 401 li t3, 3612 @@ -540,8 +540,8 @@ function inv_adst_e16_x8_rvv, export=1, ext=v vssub.vv v6, v12, v20 vssub.vv v22, v14, v22 - vsadd.vv v0, v8, v2 - vsadd.vv v7, v1, v3 + vsadd.vv \o0, v8, v2 + vsadd.vv \o7, v1, v3 vssub.vv v2, v8, v2 vssub.vv v3, v1, v3 @@ -566,8 +566,8 @@ function inv_adst_e16_x8_rvv, export=1, ext=v vnsra.wi v12, v12, 12 vnsra.wi v14, v14, 12 - vsadd.vv v1, v8, v12 - vsadd.vv v6, v10, v14 + vsadd.vv \o1, v8, v12 + vsadd.vv \o6, v10, v14 vssub.vv v8, v8, v12 vssub.vv v9, v10, v14 @@ -586,17 +586,20 @@ function inv_adst_e16_x8_rvv, export=1, ext=v vwadd.wx v14, v14, t1 vwadd.wx v16, v16, t1 - vnsra.wi v3, v10, 12 - vnsra.wi v4, v12, 12 - vnsra.wi v2, v14, 12 - vnsra.wi v5, v16, 12 + vnsra.wi \o3, v10, 12 + vnsra.wi \o4, v12, 12 + vnsra.wi \o2, v14, 12 + vnsra.wi \o5, v16, 12 vmv.v.x v8, zero - vssub.vv v1, v8, v1 - vssub.vv v3, v8, v3 - vssub.vv v5, v8, v5 - vssub.vv v7, v8, v7 + vssub.vv \o1, v8, \o1 + vssub.vv \o3, v8, \o3 + vssub.vv \o5, v8, \o5 + vssub.vv \o7, v8, \o7 +.endm +function inv_adst_e16_x8_rvv, export=1, ext=v + iadst_8 v0, v1, v2, v3, v4, v5, v6, v7 jr t0 endfunc -- cgit v1.2.3 From 219befefeb5de441b9dc1dabcee6bb83412df8c4 Mon Sep 17 00:00:00 2001 From: "Nathan E. Egge" Date: Tue, 30 Jan 2024 08:34:42 -0500 Subject: riscv64/itx: Add 8-point 8bpc RVV flipadst transform inv_txfm_add_8x8_adst_flipadst_0_8bpc_c: 3323.1 ( 1.00x) inv_txfm_add_8x8_adst_flipadst_0_8bpc_rvv: 402.1 ( 8.26x) inv_txfm_add_8x8_adst_flipadst_1_8bpc_c: 3322.8 ( 1.00x) inv_txfm_add_8x8_adst_flipadst_1_8bpc_rvv: 402.2 ( 8.26x) inv_txfm_add_8x8_dct_flipadst_0_8bpc_c: 3074.3 ( 1.00x) inv_txfm_add_8x8_dct_flipadst_0_8bpc_rvv: 359.5 ( 8.55x) inv_txfm_add_8x8_dct_flipadst_1_8bpc_c: 3074.4 ( 1.00x) inv_txfm_add_8x8_dct_flipadst_1_8bpc_rvv: 359.4 ( 8.56x) inv_txfm_add_8x8_flipadst_adst_0_8bpc_c: 3314.8 ( 1.00x) inv_txfm_add_8x8_flipadst_adst_0_8bpc_rvv: 403.3 ( 8.22x) inv_txfm_add_8x8_flipadst_adst_1_8bpc_c: 3315.3 ( 1.00x) inv_txfm_add_8x8_flipadst_adst_1_8bpc_rvv: 403.3 ( 8.22x) inv_txfm_add_8x8_flipadst_dct_0_8bpc_c: 3071.7 ( 1.00x) inv_txfm_add_8x8_flipadst_dct_0_8bpc_rvv: 359.1 ( 8.55x) inv_txfm_add_8x8_flipadst_dct_1_8bpc_c: 3072.5 ( 1.00x) inv_txfm_add_8x8_flipadst_dct_1_8bpc_rvv: 359.3 ( 8.55x) inv_txfm_add_8x8_flipadst_flipadst_0_8bpc_c: 3325.2 ( 1.00x) inv_txfm_add_8x8_flipadst_flipadst_0_8bpc_rvv: 405.2 ( 8.21x) inv_txfm_add_8x8_flipadst_flipadst_1_8bpc_c: 3325.0 ( 1.00x) inv_txfm_add_8x8_flipadst_flipadst_1_8bpc_rvv: 405.2 ( 8.21x) inv_txfm_add_8x8_flipadst_identity_0_8bpc_c: 2356.2 ( 1.00x) inv_txfm_add_8x8_flipadst_identity_0_8bpc_rvv: 283.7 ( 8.31x) inv_txfm_add_8x8_flipadst_identity_1_8bpc_c: 2356.2 ( 1.00x) inv_txfm_add_8x8_flipadst_identity_1_8bpc_rvv: 283.5 ( 8.31x) inv_txfm_add_8x8_identity_flipadst_0_8bpc_c: 2332.8 ( 1.00x) inv_txfm_add_8x8_identity_flipadst_0_8bpc_rvv: 268.0 ( 8.71x) inv_txfm_add_8x8_identity_flipadst_1_8bpc_c: 2331.5 ( 1.00x) inv_txfm_add_8x8_identity_flipadst_1_8bpc_rvv: 268.0 ( 8.70x) --- src/riscv/64/itx.S | 12 ++++++++++++ 
src/riscv/itx.h | 9 +-------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/src/riscv/64/itx.S b/src/riscv/64/itx.S index 44ca29b..f7d907e 100644 --- a/src/riscv/64/itx.S +++ b/src/riscv/64/itx.S @@ -603,6 +603,11 @@ function inv_adst_e16_x8_rvv, export=1, ext=v jr t0 endfunc +function inv_flipadst_e16_x8_rvv, export=1, ext=v + iadst_8 v7, v6, v5, v4, v3, v2, v1, v0 + jr t0 +endfunc + .macro def_fn_8x8 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_rvv, export=1, ext=v .ifc \txfm1\()_\txfm2, dct_dct @@ -642,9 +647,16 @@ endfunc def_fn_8x8 dct, dct def_fn_8x8 identity, identity def_fn_8x8 dct, adst +def_fn_8x8 dct, flipadst def_fn_8x8 dct, identity def_fn_8x8 adst, dct def_fn_8x8 adst, adst +def_fn_8x8 adst, flipadst +def_fn_8x8 flipadst, dct +def_fn_8x8 flipadst, adst +def_fn_8x8 flipadst, flipadst def_fn_8x8 identity, dct def_fn_8x8 adst, identity +def_fn_8x8 flipadst, identity def_fn_8x8 identity, adst +def_fn_8x8 identity, flipadst diff --git a/src/riscv/itx.h b/src/riscv/itx.h index 9b791f3..bed2154 100644 --- a/src/riscv/itx.h +++ b/src/riscv/itx.h @@ -104,13 +104,6 @@ static ALWAYS_INLINE void itx_dsp_init_riscv(Dav1dInvTxfmDSPContext *const c, in #if BITDEPTH == 8 assign_itx16_fn( , 4, 4, rvv); - assign_itx2_fn( , 8, 8, rvv); - assign_itx_fn( , 8, 8, dct_adst, ADST_DCT, rvv); - assign_itx_fn( , 8, 8, dct_identity, H_DCT, rvv); - assign_itx_fn( , 8, 8, adst_dct, DCT_ADST, rvv); - assign_itx_fn( , 8, 8, adst_adst, ADST_ADST, rvv); - assign_itx_fn( , 8, 8, identity_dct, V_DCT, rvv); - assign_itx_fn( , 8, 8, adst_identity, H_ADST, rvv); - assign_itx_fn( , 8, 8, identity_adst, V_ADST, rvv); + assign_itx16_fn( , 8, 8, rvv); #endif } -- cgit v1.2.3 From a6878be7e07114f5a2915ad46300700f0db55197 Mon Sep 17 00:00:00 2001 From: "Nathan E. Egge" Date: Tue, 30 Jan 2024 12:58:00 -0500 Subject: Alphabetize architecture defines and usage --- include/common/attributes.h | 2 +- meson.build | 12 ++++++------ src/meson.build | 36 ++++++++++++++++++------------------ tests/checkasm/checkasm.c | 4 ++-- 4 files changed, 27 insertions(+), 27 deletions(-) diff --git a/include/common/attributes.h b/include/common/attributes.h index d8dac04..cd058ab 100644 --- a/include/common/attributes.h +++ b/include/common/attributes.h @@ -60,7 +60,7 @@ #define ALIGN_64_VAL 64 #define ALIGN_32_VAL 32 #define ALIGN_16_VAL 16 -#elif ARCH_X86_32 || ARCH_ARM || ARCH_AARCH64 || ARCH_PPC64LE || ARCH_LOONGARCH +#elif ARCH_AARCH64 || ARCH_ARM || ARCH_LOONGARCH || ARCH_PPC64LE || ARCH_X86_32 /* ARM doesn't benefit from anything more than 16-byte alignment. 
*/ #define ALIGN_64_VAL 16 #define ALIGN_32_VAL 16 diff --git a/meson.build b/meson.build index 50d2684..30ed4ac 100644 --- a/meson.build +++ b/meson.build @@ -62,13 +62,13 @@ endforeach # ASM option is_asm_enabled = (get_option('enable_asm') == true and - (host_machine.cpu_family() == 'x86' or - (host_machine.cpu_family() == 'x86_64' and cc.get_define('__ILP32__').strip() == '') or - host_machine.cpu_family() == 'aarch64' or + (host_machine.cpu_family() == 'aarch64' or host_machine.cpu_family().startswith('arm') or host_machine.cpu() == 'ppc64le' or host_machine.cpu_family().startswith('riscv') or - host_machine.cpu_family().startswith('loongarch'))) + host_machine.cpu_family().startswith('loongarch') or + host_machine.cpu_family() == 'x86' or + (host_machine.cpu_family() == 'x86_64' and cc.get_define('__ILP32__').strip() == ''))) cdata.set10('HAVE_ASM', is_asm_enabled) if is_asm_enabled and get_option('b_sanitize') == 'memory' @@ -234,9 +234,9 @@ endif if (host_machine.cpu_family() == 'aarch64' or host_machine.cpu_family().startswith('arm') or + host_machine.cpu_family().startswith('loongarch') or host_machine.cpu() == 'ppc64le' or - host_machine.cpu_family().startswith('riscv') or - host_machine.cpu_family().startswith('loongarch')) + host_machine.cpu_family().startswith('riscv')) if cc.has_function('getauxval', prefix : '#include ', args : test_args) cdata.set('HAVE_GETAUXVAL', 1) endif diff --git a/src/meson.build b/src/meson.build index d1ea408..dc4be5f 100644 --- a/src/meson.build +++ b/src/meson.build @@ -226,24 +226,6 @@ if is_asm_enabled # Compile the ASM sources with NASM libdav1d_asm_objs = nasm_gen.process(libdav1d_sources_asm) - elif host_machine.cpu() == 'ppc64le' - arch_flags = ['-maltivec', '-mvsx'] - libdav1d_sources += files( - 'ppc/cpu.c', - ) - libdav1d_arch_tmpl_sources += files( - 'ppc/cdef_tmpl.c', - 'ppc/looprestoration_tmpl.c', - ) - elif host_machine.cpu_family().startswith('riscv') - libdav1d_sources += files( - 'riscv/cpu.c', - ) - if host_machine.cpu_family() == 'riscv64' - libdav1d_sources += files( - 'riscv/64/itx.S', - ) - endif elif host_machine.cpu_family().startswith('loongarch') libdav1d_sources += files( 'loongarch/cpu.c', @@ -262,6 +244,24 @@ if is_asm_enabled 'loongarch/itx.S', ) libdav1d_asm_objs += libdav1d_sources_asm + elif host_machine.cpu() == 'ppc64le' + arch_flags = ['-maltivec', '-mvsx'] + libdav1d_sources += files( + 'ppc/cpu.c', + ) + libdav1d_arch_tmpl_sources += files( + 'ppc/cdef_tmpl.c', + 'ppc/looprestoration_tmpl.c', + ) + elif host_machine.cpu_family().startswith('riscv') + libdav1d_sources += files( + 'riscv/cpu.c', + ) + if host_machine.cpu_family() == 'riscv64' + libdav1d_sources += files( + 'riscv/64/itx.S', + ) + endif endif endif diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 48cf255..844ae44 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -102,11 +102,11 @@ static const struct { { "AVX-512 (Ice Lake)", "avx512icl", DAV1D_X86_CPU_FLAG_AVX512ICL }, #elif ARCH_AARCH64 || ARCH_ARM { "NEON", "neon", DAV1D_ARM_CPU_FLAG_NEON }, -#elif ARCH_PPC64LE - { "VSX", "vsx", DAV1D_PPC_CPU_FLAG_VSX }, #elif ARCH_LOONGARCH { "LSX", "lsx", DAV1D_LOONGARCH_CPU_FLAG_LSX }, { "LASX", "lasx", DAV1D_LOONGARCH_CPU_FLAG_LASX }, +#elif ARCH_PPC64LE + { "VSX", "vsx", DAV1D_PPC_CPU_FLAG_VSX }, #elif ARCH_RISCV { "RVV", "rvv", DAV1D_RISCV_CPU_FLAG_V }, #endif -- cgit v1.2.3 From 6d33d1796bdf42c258fb740306cd93db636cf9d0 Mon Sep 17 00:00:00 2001 From: "Ronald S. 
Bultje" Date: Thu, 1 Feb 2024 10:34:46 -0500 Subject: Check for trailing marker/zero bits for tile data Fixes #385. --- src/decode.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/src/decode.c b/src/decode.c index 94ef17c..f04d9d2 100644 --- a/src/decode.c +++ b/src/decode.c @@ -2616,6 +2616,25 @@ static void read_restoration_info(Dav1dTaskContext *const t, } } +// modeled after the equivalent function in aomdec:decodeframe.c +static int check_trailing_bits_after_symbol_coder(const MsacContext *const msac) { + // check marker bit (single 1), followed by zeroes + const int n_bits = -(msac->cnt + 14); + assert(n_bits <= 0); // this assumes we errored out when cnt <= -15 in caller + const int n_bytes = (n_bits + 7) >> 3; + const uint8_t *p = &msac->buf_pos[n_bytes]; + const int pattern = 128 >> ((n_bits - 1) & 7); + if ((p[-1] & (2 * pattern - 1)) != pattern) + return 1; + + // check remainder zero bytes + for (; p < msac->buf_end; p++) + if (*p) + return 1; + + return 0; +} + int dav1d_decode_tile_sbrow(Dav1dTaskContext *const t) { const Dav1dFrameContext *const f = t->f; const enum BlockLevel root_bl = f->seq_hdr->sb128 ? BL_128X128 : BL_64X64; @@ -2659,9 +2678,6 @@ int dav1d_decode_tile_sbrow(Dav1dTaskContext *const t) { return 0; } - // error out on symbol decoder overread - if (ts->msac.cnt < -15) return 1; - if (f->c->n_tc > 1 && f->frame_hdr->use_ref_frame_mvs) { f->c->refmvs_dsp.load_tmvs(&f->rf, ts->tiling.row, ts->tiling.col_start >> 1, ts->tiling.col_end >> 1, @@ -2767,7 +2783,12 @@ int dav1d_decode_tile_sbrow(Dav1dTaskContext *const t) { memcpy(&f->lf.tx_lpf_right_edge[1][align_h * tile_col + (t->by >> ss_ver)], &t->l.tx_lpf_uv[(t->by & 16) >> ss_ver], sb_step >> ss_ver); - return 0; + // error out on symbol decoder overread + if (ts->msac.cnt <= -15) return 1; + + return c->strict_std_compliance && + (t->by >> f->sb_shift) + 1 >= f->frame_hdr->tiling.row_start_sb[tile_row + 1] && + check_trailing_bits_after_symbol_coder(&ts->msac); } int dav1d_decode_frame_init(Dav1dFrameContext *const f) { -- cgit v1.2.3 From 18b6ed7008d5aabf92e70d5c1c00a8702ca84849 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Thu, 1 Feb 2024 14:16:50 -0500 Subject: Verify ref frame results after decoding completion This fixes the issue where - when frame threading is active - that a reference could successfully progress to a particular sbrow and signal that, have that picked up by a frame it serves as a reference for, which therefore decodes successfully, even though the reference might fail decoding at a later stage. 
--- src/decode.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/decode.c b/src/decode.c index f04d9d2..fdf5a6b 100644 --- a/src/decode.c +++ b/src/decode.c @@ -3282,7 +3282,7 @@ error: return retval; } -void dav1d_decode_frame_exit(Dav1dFrameContext *const f, const int retval) { +void dav1d_decode_frame_exit(Dav1dFrameContext *const f, int retval) { const Dav1dContext *const c = f->c; if (f->sr_cur.p.data[0]) @@ -3293,8 +3293,16 @@ void dav1d_decode_frame_exit(Dav1dFrameContext *const f, const int retval) { (size_t)f->frame_thread.cf_sz * 128 * 128 / 2); } for (int i = 0; i < 7; i++) { - if (f->refp[i].p.frame_hdr) + if (f->refp[i].p.frame_hdr) { + if (!retval && c->n_fc > 1 && c->strict_std_compliance && + atomic_load(&f->refp[i].progress[1]) == FRAME_ERROR) + { + retval = DAV1D_ERR(EINVAL); + atomic_store(&f->task_thread.error, 1); + atomic_store(&f->sr_cur.progress[1], FRAME_ERROR); + } dav1d_thread_picture_unref(&f->refp[i]); + } dav1d_ref_dec(&f->ref_mvs_ref[i]); } @@ -3348,6 +3356,7 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) { } } dav1d_decode_frame_exit(f, res); + res = f->task_thread.retval; f->n_tile_data = 0; return res; } -- cgit v1.2.3 From 7f5d3492f6b7d8d4c15d0a2bad06193643d1aa03 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Fri, 2 Feb 2024 10:57:31 -0500 Subject: picture.c: rename picture_alloc_with_edges() to picture_alloc() The allocated picture has no edges and is not expected to have any edges, so the _with_edges() suffix was misleading. Fixes #415. --- src/picture.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/src/picture.c b/src/picture.c index f22f05f..ecfb512 100644 --- a/src/picture.c +++ b/src/picture.c @@ -111,15 +111,15 @@ void dav1d_picture_free_itut_t35(const uint8_t *const data, void *const user_dat dav1d_free(itut_t35_ctx); } -static int picture_alloc_with_edges(Dav1dContext *const c, - Dav1dPicture *const p, - const int w, const int h, - Dav1dSequenceHeader *const seq_hdr, Dav1dRef *const seq_hdr_ref, - Dav1dFrameHeader *const frame_hdr, Dav1dRef *const frame_hdr_ref, - const int bpc, - const Dav1dDataProps *const props, - Dav1dPicAllocator *const p_allocator, - void **const extra_ptr) +static int picture_alloc(Dav1dContext *const c, + Dav1dPicture *const p, + const int w, const int h, + Dav1dSequenceHeader *const seq_hdr, Dav1dRef *const seq_hdr_ref, + Dav1dFrameHeader *const frame_hdr, Dav1dRef *const frame_hdr_ref, + const int bpc, + const Dav1dDataProps *const props, + Dav1dPicAllocator *const p_allocator, + void **const extra_ptr) { if (p->data[0]) { dav1d_log(c, "Picture already allocated!\n"); @@ -194,12 +194,11 @@ int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f { Dav1dThreadPicture *const p = &f->sr_cur; - const int res = - picture_alloc_with_edges(c, &p->p, f->frame_hdr->width[1], f->frame_hdr->height, - f->seq_hdr, f->seq_hdr_ref, - f->frame_hdr, f->frame_hdr_ref, - bpc, &f->tile[0].data.m, &c->allocator, - (void **) &p->progress); + const int res = picture_alloc(c, &p->p, f->frame_hdr->width[1], f->frame_hdr->height, + f->seq_hdr, f->seq_hdr_ref, + f->frame_hdr, f->frame_hdr_ref, + bpc, &f->tile[0].data.m, &c->allocator, + (void **) &p->progress); if (res) return res; dav1d_picture_copy_props(&p->p, c->content_light, c->content_light_ref, @@ -233,11 +232,11 @@ int dav1d_picture_alloc_copy(Dav1dContext *const c, Dav1dPicture *const dst, con { Dav1dMemPoolBuffer *const buf = (Dav1dMemPoolBuffer 
*)src->ref->const_data; struct pic_ctx_context *const pic_ctx = buf->data; - const int res = picture_alloc_with_edges(c, dst, w, src->p.h, - src->seq_hdr, src->seq_hdr_ref, - src->frame_hdr, src->frame_hdr_ref, - src->p.bpc, &src->m, &pic_ctx->allocator, - NULL); + const int res = picture_alloc(c, dst, w, src->p.h, + src->seq_hdr, src->seq_hdr_ref, + src->frame_hdr, src->frame_hdr_ref, + src->p.bpc, &src->m, &pic_ctx->allocator, + NULL); if (res) return res; dav1d_picture_copy_props(dst, src->content_light, src->content_light_ref, -- cgit v1.2.3 From 864d90d9e41d999a5df8e8a74ff76c4fd1147017 Mon Sep 17 00:00:00 2001 From: James Almer Date: Fri, 2 Feb 2024 18:45:50 -0300 Subject: picture: propagate the new sequence event flag in the next picture if the current one is from a lower layer This further ensures the caller sees the DAV1D_EVENT_FLAG_NEW_SEQUENCE flag in the first output frame after a new sequence header is parsed even if the first coded frame is not meant to be output. --- src/picture.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/picture.c b/src/picture.c index ecfb512..94365bc 100644 --- a/src/picture.c +++ b/src/picture.c @@ -211,9 +211,10 @@ int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f c->itut_t35 = NULL; c->n_itut_t35 = 0; - // Don't clear these flags from c->frame_flags if the frame is not visible. + // Don't clear these flags from c->frame_flags if the frame is not going to be output. // This way they will be added to the next visible frame too. - const int flags_mask = (f->frame_hdr->show_frame || c->output_invisible_frames) + const int flags_mask = ((f->frame_hdr->show_frame || c->output_invisible_frames) && + c->max_spatial_id == f->frame_hdr->spatial_id) ? 0 : (PICTURE_FLAG_NEW_SEQUENCE | PICTURE_FLAG_NEW_OP_PARAMS_INFO); p->flags = c->frame_flags; c->frame_flags &= flags_mask; -- cgit v1.2.3 From 314423b3d9ebd1de3ac332b7d3db4ed847e36ba8 Mon Sep 17 00:00:00 2001 From: "Nathan E. Egge" Date: Mon, 5 Feb 2024 23:28:15 +0000 Subject: arm64/itx: Set x8 only once in inv_txfm_add_16x16_neon --- src/arm/64/itx.S | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/arm/64/itx.S b/src/arm/64/itx.S index b1b2f8f..f96110b 100644 --- a/src/arm/64/itx.S +++ b/src/arm/64/itx.S @@ -1426,6 +1426,7 @@ endfunc function inv_txfm_add_16x16_neon mov x15, x30 sub sp, sp, #512 + mov x8, #16*2 .irp i, 0, 8 add x6, sp, #(\i*16*2) .if \i == 8 @@ -1433,7 +1434,6 @@ function inv_txfm_add_16x16_neon b.lt 1f .endif add x7, x2, #(\i*2) - mov x8, #16*2 blr x9 .endr b 2f @@ -1449,7 +1449,6 @@ function inv_txfm_add_16x16_neon .irp i, 0, 8 add x6, x0, #(\i) add x7, sp, #(\i*2) - mov x8, #32 bl inv_txfm_add_vert_8x16_neon .endr -- cgit v1.2.3 From 08051a3b50dee91a88e2bff0391c5abd89da1c12 Mon Sep 17 00:00:00 2001 From: "Nathan E. 
Egge" Date: Tue, 6 Feb 2024 01:43:10 +0000 Subject: arm64/itx: Set x8 outside .irp loop --- src/arm/64/itx.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/arm/64/itx.S b/src/arm/64/itx.S index f96110b..53490cd 100644 --- a/src/arm/64/itx.S +++ b/src/arm/64/itx.S @@ -2460,10 +2460,10 @@ function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1 b.gt 2b 3: + mov x8, #32*2 .irp i, 0, 8, 16, 24 add x6, x0, #(\i) add x7, sp, #(\i*2) - mov x8, #32*2 bl inv_txfm_add_vert_8x16_neon .endr @@ -3204,10 +3204,10 @@ function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1 3: adr x5, inv_dct_8h_x16_neon + mov x8, #64*2 .irp i, 0, 8, 16, 24, 32, 40, 48, 56 add x6, x0, #(\i) add x7, x4, #(\i*2) - mov x8, #64*2 bl inv_txfm_add_vert_8x16_neon .endr -- cgit v1.2.3 From 2b475307dc11be9a1c3cc4358102c76a7f386a51 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 21 Nov 2023 20:47:50 +0100 Subject: Fix tile_start_off calculations for extremely large frame sizes The tile start offset, in pixels, can exceed the range of a signed int. --- src/decode.c | 13 +++++++------ src/internal.h | 2 +- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/decode.c b/src/decode.c index fdf5a6b..eed9dfb 100644 --- a/src/decode.c +++ b/src/decode.c @@ -2470,7 +2470,7 @@ static void setup_tile(Dav1dTileState *const ts, const Dav1dFrameContext *const f, const uint8_t *const data, const size_t sz, const int tile_row, const int tile_col, - const int tile_start_off) + const unsigned tile_start_off) { const int col_sb_start = f->frame_hdr->tiling.col_start_sb[tile_col]; const int col_sb128_start = col_sb_start >> !f->seq_hdr->sb128; @@ -2843,15 +2843,16 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) { const uint8_t *const size_mul = ss_size_mul[f->cur.p.layout]; const int hbd = !!f->seq_hdr->hbd; if (c->n_fc > 1) { + const unsigned sb_step4 = f->sb_step * 4; int tile_idx = 0; for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) { - int row_off = f->frame_hdr->tiling.row_start_sb[tile_row] * - f->sb_step * 4 * f->sb128w * 128; - int b_diff = (f->frame_hdr->tiling.row_start_sb[tile_row + 1] - - f->frame_hdr->tiling.row_start_sb[tile_row]) * f->sb_step * 4; + const unsigned row_off = f->frame_hdr->tiling.row_start_sb[tile_row] * + sb_step4 * f->sb128w * 128; + const unsigned b_diff = (f->frame_hdr->tiling.row_start_sb[tile_row + 1] - + f->frame_hdr->tiling.row_start_sb[tile_row]) * sb_step4; for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) { f->frame_thread.tile_start_off[tile_idx++] = row_off + b_diff * - f->frame_hdr->tiling.col_start_sb[tile_col] * f->sb_step * 4; + f->frame_hdr->tiling.col_start_sb[tile_col] * sb_step4; } } diff --git a/src/internal.h b/src/internal.h index 631c5a8..72f6560 100644 --- a/src/internal.h +++ b/src/internal.h @@ -289,7 +289,7 @@ struct Dav1dFrameContext { int prog_sz; int cbi_sz, pal_sz, pal_idx_sz, cf_sz; // start offsets per tile - int *tile_start_off; + unsigned *tile_start_off; } frame_thread; // loopfilter -- cgit v1.2.3 From 01bee1c5beb50e30a2ea003703e66df25648ab73 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Kempf Date: Tue, 13 Feb 2024 18:36:27 +0100 Subject: Update for 1.4.0 --- NEWS | 14 ++++++++++++++ meson.build | 4 ++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/NEWS b/NEWS index 54f8557..097fcce 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,17 @@ +Changes for 1.4.0 'Road Runner': +------------------------------------------------------ + +1.4.0 is a medium release of dav1d, 
focusing on new architecture support and optimizations + +- AVX-512 optimizations for z1, z2, z3 in 8bit and high-bit depth +- New architecture supported: loongarch +- Loongarch optimizations for 8bit +- New architecture supported: RISC-V +- RISC-V optimizations for itx +- Misc improvements in threading and in reducing binary size +- Fix potential integer overflow with extremely large frame sizes + + Changes for 1.3.0 'Tundra Peregrine Falcon (Calidus)': ------------------------------------------------------ diff --git a/meson.build b/meson.build index 30ed4ac..6e49852 100644 --- a/meson.build +++ b/meson.build @@ -23,7 +23,7 @@ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. project('dav1d', ['c'], - version: '1.3.0', + version: '1.4.0', default_options: ['c_std=c99', 'warning_level=2', 'buildtype=release', @@ -136,7 +136,7 @@ if host_machine.system() == 'windows' rc_data.set('API_VERSION_MAJOR', dav1d_api_version_major) rc_data.set('API_VERSION_MINOR', dav1d_api_version_minor) rc_data.set('API_VERSION_REVISION', dav1d_api_version_revision) - rc_data.set('COPYRIGHT_YEARS', '2018-2023') + rc_data.set('COPYRIGHT_YEARS', '2018-2024') else thread_dependency = dependency('threads') thread_compat_dep = [] -- cgit v1.2.3 From 97744bdc8cb68bc0923f6f608e46bb26faa410d2 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Mon, 12 Feb 2024 19:36:14 +0100 Subject: x86: Add high bit-depth ipred z2 AVX-512 (Ice Lake) asm --- src/x86/ipred.h | 1 + src/x86/ipred16_avx512.asm | 610 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 611 insertions(+) diff --git a/src/x86/ipred.h b/src/x86/ipred.h index f5f187e..57aff0f 100644 --- a/src/x86/ipred.h +++ b/src/x86/ipred.h @@ -144,6 +144,7 @@ static ALWAYS_INLINE void intra_pred_dsp_init_x86(Dav1dIntraPredDSPContext *cons init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx512icl); init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx512icl); init_angular_ipred_fn(Z1_PRED, ipred_z1, avx512icl); + init_angular_ipred_fn(Z2_PRED, ipred_z2, avx512icl); init_angular_ipred_fn(Z3_PRED, ipred_z3, avx512icl); init_angular_ipred_fn(FILTER_PRED, ipred_filter, avx512icl); diff --git a/src/x86/ipred16_avx512.asm b/src/x86/ipred16_avx512.asm index 8124a3b..6980261 100644 --- a/src/x86/ipred16_avx512.asm +++ b/src/x86/ipred16_avx512.asm @@ -79,14 +79,17 @@ z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39 z_filter_k: dw 8, 8, 6, 6, 4, 4 dw 4, 4, 5, 5, 4, 4 dw 0, 0, 0, 0, 2, 2 +pb_90: times 4 db 90 pw_15: times 2 dw 15 pw_16: times 2 dw 16 pw_17: times 2 dw 17 pw_24: times 2 dw 24 +pw_31: times 2 dw 31 pw_32: times 2 dw 32 pw_63: times 2 dw 63 pw_64: times 2 dw 64 pw_512: times 2 dw 512 +pw_2048: times 2 dw 2048 pw_31806: times 2 dw 31806 pw_32640: times 2 dw 32640 pw_32672: times 2 dw 32672 @@ -114,6 +117,7 @@ JMP_TABLE ipred_smooth_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_v_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_z1_16bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z2_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_z3_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE pal_pred_16bpc, avx512icl, w4, w8, w16, w32, w64 @@ -1174,6 +1178,612 @@ cglobal ipred_z1_16bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx mov rsp, r7 RET +cglobal ipred_z2_16bpc, 3, 9, 16, dst, stride, tl, w, h, angle, dx, _, dy + tzcnt wd, wm + movifnidn angled, anglem + lea dxq, [dr_intra_derivative-90] + movzx dyd, angleb + xor 
angled, 0x400 + mov r7, dxq + sub dxq, dyq + movifnidn hd, hm + and dyd, ~1 + vpbroadcastw m12, [tlq] + and dxq, ~1 + movzx dyd, word [r7+dyq] ; angle - 90 + lea r7, [z_filter_t0] + movzx dxd, word [dxq+270] ; 180 - angle + mova m0, [base+pw_31to0] + movsxd wq, [base+ipred_z2_16bpc_avx512icl_table+wq*4] + movu m4, [tlq+2] + neg dyd + vpermw m7, m0, [tlq-64*1] + lea wq, [base+ipred_z2_16bpc_avx512icl_table+wq] + vpbroadcastd m14, [base+pw_31806] + vpbroadcastd m15, [base+pw_1] + jmp wq +.w4: + movq xm3, [tlq] + vpbroadcastq m8, [base+pw_1to32] + test angled, 0x400 + jnz .w4_main ; !enable_intra_edge_filter + lea r3d, [hq+2] + add angled, 1022 + shl r3d, 6 + test r3d, angled + jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) + pshuflw xm0, xm4, q3321 + sub angled, 1075 ; angle - 53 + lea r3d, [hq+3] + call .upsample_above + punpcklwd xm4, xm3, xm4 + palignr xm3, xm4, xm12, 14 + jmp .w4_main +.w4_upsample_left: + call .upsample_left + movsldup m1, [base+z_xpos_mul] + paddw m1, m1 + jmp .w4_main2 +.w4_no_upsample_above: + lea r3d, [hq+3] + vpbroadcastd ym0, [base+pw_3] + sub angled, 1112 ; angle - 90 + call .filter_above2 + lea r3d, [hq+2] + add angled, 973 ; angle + 883 + palignr xm3, xm4, xm12, 14 + shl r3d, 6 + test r3d, angled + jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8) + call .filter_left16 +.w4_main: + movsldup m1, [base+z_xpos_mul] + psllw m15, 3 +.w4_main2: + vpbroadcastq m0, [base+pw_1to32] + vpbroadcastw m11, dxd + movsldup m2, [base+z_xpos_mul] + vpbroadcastw m13, dyd + vpbroadcastd m5, [tlq-2] + psllw m10, m8, 6 + valignq m5, m7, m5, 6 + pmullw m2, m11 + psubw m10, m2 ; xpos + pmullw m13, m0 ; ypos + palignr m5, m7, m5, 14 + psrlw m12, m13, 6 + psllw m13, 9 + paddw m12, m1 ; base_y + pand m13, m14 ; frac_y << 9 + psllw m11, 3 + lea r5, [strideq*3] +.w4_loop: + psrlw m1, m10, 6 ; base_x + pand m2, m14, m10 ; frac + vpermw m0, m1, m3 ; top[base_x] + vpermw m1, m1, m4 ; top[base_x+1] + vpmovw2m k1, m10 ; base_x < 0 + psllw m2, 9 + vpermw m0{k1}, m12, m5 ; left[base_y] + vpermw m1{k1}, m12, m7 ; left[base_y+1] + vmovdqu16 m2{k1}, m13 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + vextracti32x4 xm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r5 ], xm1 + sub hd, 8 + jl .w4_end + vextracti32x8 ym0, m0, 1 + psubw m10, m11 ; base_x -= dx + lea dstq, [dstq+strideq*4] + paddw m12, m15 ; base_y++ + vextracti32x4 xm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r5 ], xm1 + lea dstq, [dstq+strideq*4] + jg .w4_loop +.w4_end: + RET +.upsample_above: ; w4/w8 + mova ym9, [base+pw_1to32] + palignr xm1, xm4, xm12, 12 + paddw xm3, xm4 ; b+c + xor angled, 0x7f ; 180 - angle + paddw xm0, xm1 ; a+d + vpbroadcastw xm1, r9m ; pixel_max + vpbroadcastb xm11, r3d + psubw xm0, xm3, xm0 + vpbroadcastb xm2, angled + psraw xm0, 3 + shr angled, 8 + paddw xm3, xm0 + pxor xm0, xm0 + vpcmpeqb k2, xm11, [base+z_filter_wh] + pmaxsw xm3, xm0 + add dxd, dxd + pavgw xm3, xm0 + vpcmpgtb k2{k2}, xm2, [base+z_filter_t0+angleq*8] + pminsw xm3, xm1 + paddw m8, m8 + jmp .filter_left16b +.upsample_left: ; h4/h8 + lea r3d, [hq-1] + palignr xm2, xm7, xm12, 14 + vpbroadcastw xm0, r3d + palignr xm1, xm7, xm12, 12 + pminuw xm0, xm9 + paddw xm2, xm7 ; b+c + vpermw xm0, xm0, xm7 + add dyd, dyd + paddw xm0, xm1 ; a+d + vpbroadcastw xm1, r9m ; pixel_max + psubw xm0, xm2, xm0 + psraw xm0, 3 + paddw xm2, xm0 + pxor xm0, xm0 + pmaxsw xm2, xm0 + pavgw xm2, xm0 + 
pminsw xm2, xm1 + punpckhwd xm0, xm2, xm7 + punpcklwd xm7, xm2, xm7 + vinserti32x4 ym7, xm0, 1 + ret +.filter_above: + sub angled, 90 +.filter_above2: + vpbroadcastb ym1, r3d + vpbroadcastb ym10, angled + mov r3d, angled + shr r3d, 8 + vpcmpeqb k2, ym1, [base+z_filter_wh] + mova xm11, [base+z_filter_t0+r3*8] + vpcmpgtb k1{k2}, ym10, ym11 + mova m9, [base+pw_1to32] + kmovd r3d, k1 + test r3d, r3d + jz .filter_end + pminuw ym0, ym9 + popcnt r3d, r3d + vpbroadcastd ym6, r7m ; max_w + kxnorw k1, k1, k1 + vpbroadcastd ym5, [base+z_filter_k+(r3-1)*4+12*0] + kaddw k1, k1, k1 ; ~1 + vpbroadcastd ym13, [base+z_filter_k+(r3-1)*4+12*1] + vpermw ym2, ym0, ym4 ; +1 + pmullw ym5, ym4 + paddw ym1, ym2, ym3 + vmovdqu16 m3{k1}, [tlq-2] ; -2 + vpermw ym2, ym0, ym2 ; +2 + vpbroadcastd ym0, [base+z_filter_k+(r3-1)*4+12*2] + pmullw ym1, ym13 + movu m13, [base+pw_0to31] + paddw ym2, ym3 + packssdw ym6, ym6 + pmullw ym2, ym0 + paddw ym1, ym5 + vpcmpgtw k1, ym6, ym13 + paddw ym1, ym2 + pxor ym2, ym2 + psrlw ym1, 3 + pavgw ym4{k1}, ym1, ym2 +.filter_end: + ret +.filter_left16: + vpbroadcastd ym1, [base+pb_90] + psubb ym1, ym10 + vpcmpgtb k2{k2}, ym1, ym11 +.filter_left16b: + kmovd r3d, k2 + test r3d, r3d + jz .filter_end + lea r5d, [hq-1] + vinserti32x4 ym0, ym12, xm7, 1 + vpbroadcastw ym1, r5d + popcnt r3d, r3d + vpbroadcastd ym6, r8m ; max_h + pminuw ym9, ym1 + vpbroadcastd ym5, [base+z_filter_k+(r3-1)*4+12*0] + vpermw ym2, ym9, ym7 ; +1 + vpbroadcastd ym10, [base+z_filter_k+(r3-1)*4+12*1] + palignr ym1, ym7, ym0, 14 ; -1 + pmullw ym5, ym7 + palignr ym0, ym7, ym0, 12 ; -2 + paddw ym1, ym2 + vpermw ym2, ym9, ym2 ; +2 + vpbroadcastd ym9, [base+z_filter_k+(r3-1)*4+12*2] + pmullw ym1, ym10 + paddw ym2, ym0 + packssdw ym6, ym6 + pmullw ym2, ym9 + paddw ym1, ym5 + vpcmpgtw k1, ym6, [base+pw_0to31] + paddw ym1, ym2 + pxor ym2, ym2 + psrlw ym1, 3 + pavgw ym7{k1}, ym1, ym2 + ret +.filter_left: + cmp hd, 32 + jl .filter_left16 + vpbroadcastd m5, [base+pw_3] + pminud m0, m9, [base+pw_31] {1to16} +.filter_left32: + vpbroadcastd m6, r8m ; max_h + valignq m2, m7, m12, 6 + packssdw m6, m6 + palignr m1, m7, m2, 14 ; -1 + paddw m1, m7 + palignr m2, m7, m2, 12 ; -2 + vpcmpgtw k1, m6, m13 + paddw m2, m5 + cmp hd, 64 + je .filter_left64 + lea r3d, [hq-1] + vpbroadcastw m10, r3d + pminuw m0, m10 + vpermw m10, m0, m7 ; +1 + paddw m1, m10 + vpermw m10, m0, m10 ; +2 + pavgw m2, m10 + paddw m1, m2 + vpsrlw m7{k1}, m1, 2 + ret +.filter_left64: + valignq m10, m8, m7, 2 + vpaddd m13, [base+pw_32] {1to16} + palignr m11, m10, m7, 2 ; +1 + paddw m1, m11 + palignr m11, m10, m7, 4 ; +2 + valignq m10, m8, m7, 6 + pavgw m11, m2 + vpermw m2, m0, m8 ; 32+1 + paddw m1, m11 + vpsrlw m7{k1}, m1, 2 + palignr m1, m8, m10, 14 ; 32-1 + paddw m1, m8 + palignr m10, m8, m10, 12 ; 32-2 + paddw m1, m2 + vpermw m2, m0, m2 ; 32+2 + paddw m10, m5 + vpcmpgtw k1, m6, m13 + pavgw m2, m10 + paddw m1, m2 + vpsrlw m8{k1}, m1, 2 + ret +.w8: + mova xm3, [tlq] + vbroadcasti32x4 m8, [base+pw_1to32] + test angled, 0x400 + jnz .w8_main + lea r3d, [angleq+126] + mov r3b, hb + cmp r3d, 8 + ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm + psrldq xm0, xm4, 2 + sub angled, 53 + pshufhw xm0, xm0, q2210 + lea r3d, [hq+7] + call .upsample_above + punpcklwd xm0, xm3, xm4 + punpckhwd xm4, xm3, xm4 + vinserti32x4 ym3, ym12, xm0, 1 + vinserti32x4 ym4, ym0, xm4, 1 + palignr ym3, ym4, ym3, 14 + jmp .w8_main +.w8_upsample_left: + call .upsample_left + movshdup m1, [base+z_xpos_mul] + psllw m15, 3 + paddw m1, m1 + jmp .w8_main2 +.w8_no_upsample_above: + lea r3d, [hq+7] + 
vpbroadcastd ym0, [base+pw_7] + call .filter_above + lea r3d, [angleq-51] + mov r3b, hb + palignr xm3, xm4, xm12, 14 + cmp r3d, 8 + jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm + call .filter_left +.w8_main: + movshdup m1, [base+z_xpos_mul] + psllw m15, 2 +.w8_main2: + vbroadcasti32x4 m0, [base+pw_1to32] + vpbroadcastw m11, dxd + movshdup m2, [base+z_xpos_mul] + vpbroadcastw m13, dyd + psllw m10, m8, 6 + valignq m5, m7, m12, 6 + pmullw m2, m11 + psubw m10, m2 ; xpos + pmullw m13, m0 ; ypos + palignr m5, m7, m5, 14 + psrlw m12, m13, 6 + psllw m13, 9 + mov r2d, 1<<6 + paddw m12, m1 ; base_y + lea r3d, [dxq-(8<<6)] ; left-only threshold + pand m13, m14 ; frac_y << 9 + shl dxd, 2 + psllw m11, 2 + lea r5, [strideq*3] +.w8_loop: + psrlw m1, m10, 6 + pand m2, m14, m10 + vpermw m0, m1, m3 + vpermw m1, m1, m4 + psllw m2, 9 + sub r2d, dxd + jge .w8_toponly + vpmovw2m k1, m10 + vpermw m0{k1}, m12, m5 + vpermw m1{k1}, m12, m7 + vmovdqu16 m2{k1}, m13 +.w8_toponly: + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+r5 ], m0, 3 + sub hd, 4 + jz .w8_end + psubw m10, m11 ; base_x -= dx + lea dstq, [dstq+strideq*4] + paddw m12, m15 ; base_y++ + cmp r2d, r3d + jge .w8_loop +.w8_leftonly_loop: + vpermw m0, m12, m5 + vpermw m1, m12, m7 + psubw m1, m0 + pmulhrsw m1, m13 + paddw m12, m15 + paddw m0, m1 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+r5 ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_leftonly_loop +.w8_end: + RET +.w16: + mova ym3, [tlq] + vpermw m8, m0, [tlq-64*2] + test angled, 0x400 + jnz .w16_main + lea r3d, [hq+15] + vpbroadcastd ym0, [base+pw_15] + call .filter_above + call .filter_left + vinserti32x4 ym3, ym12, xm4, 1 + palignr ym3, ym4, ym3, 14 +.w16_main: + vbroadcasti32x8 m0, [base+pw_1to32] + vpbroadcastw m11, dxd + vpbroadcastw m13, dyd + kxnorw k2, k2, k2 + psllw m10, m0, 6 + valignq m5, m7, m12, 6 + psubw m10, m11 ; xpos + valignq m6, m8, m7, 6 + pmullw m13, m0 ; ypos + knotd k1, k2 + palignr m5, m7, m5, 14 + palignr m6, m8, m6, 14 + vpsubw m10{k1}, m11 + psrlw m12, m13, 6 + psllw m13, 9 + mov r2d, 1<<6 + vpsubw m12{k2}, m15 ; base_y + pand m13, m14 ; frac_y << 9 + lea r3d, [dxq-(16<<6)] + paddw m11, m11 + add dxd, dxd + paddw m15, m15 +.w16_loop: + psrlw m1, m10, 6 + pand m2, m14, m10 + vpermw m0, m1, m3 + vpermw m1, m1, m4 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m12, m15 ; base_y++ + paddw m0, m1 + sub r2d, dxd + jge .w16_toponly + mova m1, m5 + vpermt2w m1, m12, m6 + mova m2, m7 + vpermt2w m2, m12, m8 + vpmovw2m k1, m10 + psubw m2, m1 + pmulhrsw m2, m13 + vpaddw m0{k1}, m1, m2 +.w16_toponly: + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + sub hd, 2 + jz .w16_end + psubw m10, m11 ; base_x -= dx + lea dstq, [dstq+strideq*2] + cmp r2d, r3d + jge .w16_loop + paddw m12, m15 + vpermt2w m5, m12, m6 + mova m1, m7 + vpermt2w m1, m12, m8 + jmp .w16_leftonly_loop_start +.w16_leftonly_loop: + mova m1, m7 + vpermt2w m1, m12, m8 + vshufi32x4 m5, m1, q1032 +.w16_leftonly_loop_start: + psubw m0, m1, m5 + pmulhrsw m0, m13 + paddw m12, m15 + paddw m0, m5 + mova m5, m1 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_leftonly_loop +.w16_end: + RET +.w32: + mova m3, [tlq] + vpermw m8, m0, [tlq-64*2] + mova m9, [base+pw_1to32] + test angled, 0x400 + jnz .w32_main 
+ pminud m0, m9, [base+pw_31] {1to16} + mov r3d, ~1 + kmovd k1, r3d + vpbroadcastd m5, [base+pw_3] + vpbroadcastd m6, r6m ; max_w + vpermw m2, m0, m4 ; +1 + movu m13, [base+pw_0to31] + paddw m1, m4, m3 + vmovdqu16 m3{k1}, [tlq-2] ; -2 + packssdw m6, m6 + paddw m1, m2 + vpermw m2, m0, m2 ; +2 + paddw m3, m5 + vpcmpgtw k1, m6, m13 + pavgw m2, m3 + paddw m1, m2 + psrlw m4{k1}, m1, 2 + call .filter_left32 +.w32_main: + sub rsp, 64*2 + call .w32_main1 + add rsp, 64*2 + RET +.w32_main1: + vpbroadcastw m11, dxd + movu [rsp+64], m4 + vpbroadcastw m4, dyd + movd [rsp+60], xm12 + valignq m5, m7, m12, 6 + psllw m3, m9, 6 ; xpos + valignq m6, m8, m7, 6 + pmullw m9, m4 ; ypos + palignr m5, m7, m5, 14 + mov r2d, 33<<6 + palignr m6, m8, m6, 14 + mova m10, m3 +.w32_main2: + psllw m13, m9, 9 + sub r2d, dxd + psrlw m12, m9, 6 ; base_y + mov r8d, hd + pand m13, m14 ; frac_y << 9 +.w32_loop: + mov r3d, r2d + shr r3d, 6 + psubw m10, m11 ; base_x -= dx + movu m0, [rsp+r3*2-2] + pand m2, m10, m14 ; frac_x + movu m1, [rsp+r3*2] + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m12, m15 ; base_y++ + paddw m0, m1 + cmp r2d, 32<<6 + jge .w32_toponly + mova m1, m5 + vpermt2w m1, m12, m6 + mova m2, m7 + vpermt2w m2, m12, m8 + vpmovw2m k1, m10 + psubw m2, m1 + pmulhrsw m2, m13 + vpaddw m0{k1}, m1, m2 +.w32_toponly: + mova [dstq], m0 + dec r8d + jz .w32_end + add dstq, strideq + sub r2d, dxd + jge .w32_loop + paddw m12, m15 + mova m2, m5 + vpermt2w m2, m12, m6 +.w32_leftonly_loop: + mova m1, m7 + vpermt2w m1, m12, m8 + psubw m0, m1, m2 + pmulhrsw m0, m13 + paddw m12, m15 + paddw m0, m2 + mova m2, m1 + mova [dstq], m0 + add dstq, strideq + dec r8d + jg .w32_leftonly_loop +.w32_end: + ret +.w64: + movu m3, [tlq+66] + vpermw m8, m0, [tlq-64*2] + mova m9, [base+pw_1to32] + test angled, 0x400 + jnz .w64_main + mova m2, [tlq] ; -1 + mov r3d, ~1 + vpbroadcastd m5, [base+pw_3] + kmovd k1, r3d + movu m13, [base+pw_0to31] + vpbroadcastd m6, r6m ; max_w + pminud m0, m9, [base+pw_31] {1to16} + paddw m1, m4, m2 + vmovdqu16 m2{k1}, [tlq-2] ; -2 + packssdw m6, m6 + paddw m1, [tlq+4] ; +1 + paddw m2, m5 + vpcmpgtw k1, m6, m13 + pavgw m2, [tlq+6] ; +2 + paddw m1, m2 + vpermw m2, m0, m3 ; 32+1 + psrlw m4{k1}, m1, 2 + paddw m1, m3, [tlq+64] ; 32-1 + vpaddd m11, m13, [base+pw_32] {1to16} + paddw m1, m2 + vpermw m2, m0, m2 ; 32+2 + paddw m10, m5, [tlq+62] ; 32-2 + vpcmpgtw k1, m6, m11 + pavgw m2, m10 + paddw m1, m2 + psrlw m3{k1}, m1, 2 + call .filter_left32 +.w64_main: + sub rsp, 64*3 + movu [rsp+64*2-gprsize], m3 + mov r5, dstq + call .w32_main1 + psllw m4, 5 + mov r2d, 65<<6 + vpaddd m10, m3, [base+pw_2048] {1to16} ; xpos + lea dstq, [r5+64] + paddw m9, m4 ; ypos + call .w32_main2 + add rsp, 64*3 + RET + cglobal ipred_z3_16bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy lea r7, [z_filter_t0] tzcnt wd, wm -- cgit v1.2.3 From 64c9d16049331ee2875b25afc579a46f9413141f Mon Sep 17 00:00:00 2001 From: "Nathan E. 
Egge" Date: Sat, 10 Feb 2024 12:29:38 -0500 Subject: riscv64/itx: Add 16-point 8bpc RVV idtx transform inv_txfm_add_16x16_identity_identity_0_8bpc_c: 6933.8 ( 1.00x) inv_txfm_add_16x16_identity_identity_0_8bpc_rvv: 866.0 ( 8.01x) inv_txfm_add_16x16_identity_identity_1_8bpc_c: 6933.4 ( 1.00x) inv_txfm_add_16x16_identity_identity_1_8bpc_rvv: 866.1 ( 8.01x) inv_txfm_add_16x16_identity_identity_2_8bpc_c: 6934.2 ( 1.00x) inv_txfm_add_16x16_identity_identity_2_8bpc_rvv: 866.1 ( 8.01x) --- src/riscv/64/itx.S | 127 +++++++++++++++++++++++++++++++++++++++++++++++++++++ src/riscv/itx.h | 4 +- 2 files changed, 130 insertions(+), 1 deletion(-) diff --git a/src/riscv/64/itx.S b/src/riscv/64/itx.S index f7d907e..4d9ff7c 100644 --- a/src/riscv/64/itx.S +++ b/src/riscv/64/itx.S @@ -660,3 +660,130 @@ def_fn_8x8 adst, identity def_fn_8x8 flipadst, identity def_fn_8x8 identity, adst def_fn_8x8 identity, flipadst + +function inv_identity_e16_x16_rvv, export=1, ext=v + li t1, 2*(5793-4096)*8 +.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + vsmul.vx v16, v\i, t1 + vsadd.vv v\i, v\i, v\i + vsadd.vv v\i, v\i, v16 +.endr + jr t0 +endfunc + +function inv_txfm_horz_16x8_rvv, export=1, ext=v + vmv.v.x v16, zero +.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + vle16.v v\i, (t4) + vse16.v v16, (t4) + add t4, t4, t6 +.endr + li t1, 2*(5793-4096)*8 +.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + vsmul.vx v16, v\i, t1 + vsra.vi v16, v16, 1 + vaadd.vv v\i, v\i, v16 +.endr +.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + vsse16.v v\i, (t5), t6 + addi t5, t5, 2 +.endr + jr a7 +endfunc + +function inv_txfm_add_vert_8x16_rvv, export=1, ext=v + vsetivli zero, 8, e16, m1, ta, ma +.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + vle16.v v\i, (t4) + add t4, t4, t6 +.endr + jalr t0, a5 + +.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + vssra.vi v\i, v\i, 4 +.endr + + vsetivli zero, 8, e8, mf2, ta, ma + mv t0, t5 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + vle8.v v\i, (t0) + add t0, t0, a1 +.endr + + vwaddu.wv v0, v0, v16 + vwaddu.wv v1, v1, v17 + vwaddu.wv v2, v2, v18 + vwaddu.wv v3, v3, v19 + vwaddu.wv v4, v4, v20 + vwaddu.wv v5, v5, v21 + vwaddu.wv v6, v6, v22 + vwaddu.wv v7, v7, v23 + vwaddu.wv v8, v8, v24 + vwaddu.wv v9, v9, v25 + vwaddu.wv v10, v10, v26 + vwaddu.wv v11, v11, v27 + vwaddu.wv v12, v12, v28 + vwaddu.wv v13, v13, v29 + vwaddu.wv v14, v14, v30 + vwaddu.wv v15, v15, v31 + + vsetvli zero, zero, e16, m1 +.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + vmax.vx v\i, v\i, zero +.endr + + vsetvli zero, zero, e8, mf2, ta, ma + vnclipu.wi v16, v0, 0 + vnclipu.wi v17, v1, 0 + vnclipu.wi v18, v2, 0 + vnclipu.wi v19, v3, 0 + vnclipu.wi v20, v4, 0 + vnclipu.wi v21, v5, 0 + vnclipu.wi v22, v6, 0 + vnclipu.wi v23, v7, 0 + vnclipu.wi v24, v8, 0 + vnclipu.wi v25, v9, 0 + vnclipu.wi v26, v10, 0 + vnclipu.wi v27, v11, 0 + vnclipu.wi v28, v12, 0 + vnclipu.wi v29, v13, 0 + vnclipu.wi v30, v14, 0 + vnclipu.wi v31, v15, 0 + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + vse8.v v\i, (t5) + add t5, t5, a1 +.endr + + jr a7 +endfunc + +function inv_txfm_add_16x16_rvv, export=1, ext=v + csrw vxrm, zero + vsetivli zero, 8, e16, m1, ta, ma + addi sp, sp, -16*32 +.irp i, 0, 8 + addi t4, a2, \i*2 + addi t5, sp, \i*16*2 + li t6, 16*2 + jalr a7, a6 +.endr +.irp i, 0, 8 + addi t4, sp, \i*2 + addi t5, a0, \i + li t6, 16*2 + jal a7, inv_txfm_add_vert_8x16_rvv +.endr + addi sp, sp, 16*32 + ret 
+endfunc + +.macro def_fn_16x16 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_rvv, export=1, ext=v + la a6, inv_txfm_horz_16x8_rvv + la a5, inv_\txfm2\()_e16_x16_rvv + j inv_txfm_add_16x16_rvv +endfunc +.endm + +def_fn_16x16 identity, identity diff --git a/src/riscv/itx.h b/src/riscv/itx.h index bed2154..1abd9db 100644 --- a/src/riscv/itx.h +++ b/src/riscv/itx.h @@ -58,7 +58,8 @@ decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt)) #define decl_itx_fns(ext) \ decl_itx17_fns( 4, 4, ext); \ -decl_itx16_fns( 8, 8, ext) +decl_itx16_fns( 8, 8, ext); \ +decl_itx16_fns(16, 16, ext) decl_itx_fns(rvv); @@ -105,5 +106,6 @@ static ALWAYS_INLINE void itx_dsp_init_riscv(Dav1dInvTxfmDSPContext *const c, in #if BITDEPTH == 8 assign_itx16_fn( , 4, 4, rvv); assign_itx16_fn( , 8, 8, rvv); + assign_itx_fn( , 16, 16, identity_identity, IDTX, rvv); #endif } -- cgit v1.2.3 From c0ccc323d648d484d56880fff382e52fe66f0b49 Mon Sep 17 00:00:00 2001 From: "Nathan E. Egge" Date: Sun, 11 Feb 2024 06:51:37 -0500 Subject: riscv64/itx: Convert inv_txfm_horz_16x8_rvv to macro --- src/riscv/64/itx.S | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/src/riscv/64/itx.S b/src/riscv/64/itx.S index 4d9ff7c..70c740b 100644 --- a/src/riscv/64/itx.S +++ b/src/riscv/64/itx.S @@ -671,25 +671,37 @@ function inv_identity_e16_x16_rvv, export=1, ext=v jr t0 endfunc -function inv_txfm_horz_16x8_rvv, export=1, ext=v +.macro def_horz_16 variant +function inv_txfm_horz\variant\()_16x8_rvv, export=1, ext=v vmv.v.x v16, zero .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 vle16.v v\i, (t4) vse16.v v16, (t4) add t4, t4, t6 .endr +.ifc \variant, _identity li t1, 2*(5793-4096)*8 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 vsmul.vx v16, v\i, t1 vsra.vi v16, v16, 1 vaadd.vv v\i, v\i, v16 .endr +.else + jalr t0, a4 +.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + vssra.vi v\i, v\i, 2 +.endr +.endif .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 vsse16.v v\i, (t5), t6 addi t5, t5, 2 .endr jr a7 endfunc +.endm + +def_horz_16 +def_horz_16 _identity function inv_txfm_add_vert_8x16_rvv, export=1, ext=v vsetivli zero, 8, e16, m1, ta, ma @@ -780,7 +792,12 @@ endfunc .macro def_fn_16x16 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_rvv, export=1, ext=v +.ifc \txfm1, identity + la a6, inv_txfm_horz_identity_16x8_rvv +.else la a6, inv_txfm_horz_16x8_rvv + la a4, inv_\txfm1\()_e16_x16_rvv +.endif la a5, inv_\txfm2\()_e16_x16_rvv j inv_txfm_add_16x16_rvv endfunc -- cgit v1.2.3 From 57d5729cf8b2d35429cc35b78fad0f5a04838a14 Mon Sep 17 00:00:00 2001 From: "Nathan E. 
Egge" Date: Sun, 11 Feb 2024 07:24:01 -0500 Subject: riscv64/itx: Convert inv_dct_e16_x8_rvv to macro --- src/riscv/64/itx.S | 59 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/src/riscv/64/itx.S b/src/riscv/64/itx.S index 70c740b..80599e7 100644 --- a/src/riscv/64/itx.S +++ b/src/riscv/64/itx.S @@ -410,25 +410,25 @@ function inv_identity_e16_x8_rvv, export=1, ext=v jr t0 endfunc -function inv_dct_e16_x8_rvv, export=1, ext=v - idct_4 v0, v2, v4, v6 +.macro idct_8 o0, o1, o2, o3, o4, o5, o6, o7 + idct_4 \o0, \o2, \o4, \o6 li t1, 799 li t2, 4017 li t3, 3406 li t4, 2276 - vwmul.vx v14, v1, t2 + vwmul.vx v14, \o1, t2 neg t2, t2 - vwmul.vx v8, v1, t1 - vwmacc.vx v14, t1, v7 - vwmacc.vx v8, t2, v7 + vwmul.vx v8, \o1, t1 + vwmacc.vx v14, t1, \o7 + vwmacc.vx v8, t2, \o7 - vwmul.vx v12, v5, t4 + vwmul.vx v12, \o5, t4 neg t4, t4 - vwmul.vx v10, v5, t3 - vwmacc.vx v12, t3, v3 - vwmacc.vx v10, t4, v3 + vwmul.vx v10, \o5, t3 + vwmacc.vx v12, t3, \o3 + vwmacc.vx v10, t4, \o3 li t1, 2048 @@ -442,18 +442,18 @@ function inv_dct_e16_x8_rvv, export=1, ext=v vnsra.wi v12, v12, 12 vnsra.wi v14, v14, 12 - vssub.vv v7, v14, v12 + vssub.vv \o7, v14, v12 vsadd.vv v14, v14, v12 - vssub.vv v1, v8, v10 + vssub.vv \o1, v8, v10 vsadd.vv v8, v8, v10 li t2, 2896 - vwmul.vx v10, v7, t2 - vwmul.vx v12, v7, t2 - vwmacc.vx v12, t2, v1 + vwmul.vx v10, \o7, t2 + vwmul.vx v12, \o7, t2 + vwmacc.vx v12, t2, \o1 neg t2, t2 - vwmacc.vx v10, t2, v1 + vwmacc.vx v10, t2, \o1 vwadd.wx v10, v10, t1 vwadd.wx v12, v12, t1 @@ -461,18 +461,16 @@ function inv_dct_e16_x8_rvv, export=1, ext=v vnsra.wi v10, v10, 12 vnsra.wi v12, v12, 12 - vssub.vv v7, v0, v14 - vsadd.vv v0, v0, v14 - vssub.vv v9, v2, v12 - vsadd.vv v1, v2, v12 - vssub.vv v5, v4, v10 - vsadd.vv v2, v4, v10 - vssub.vv v4, v6, v8 - vsadd.vv v3, v6, v8 - vmv.v.v v6, v9 - - jr t0 -endfunc + vssub.vv \o7, \o0, v14 + vsadd.vv \o0, \o0, v14 + vssub.vv v9, \o2, v12 + vsadd.vv \o1, \o2, v12 + vssub.vv \o5, \o4, v10 + vsadd.vv \o2, \o4, v10 + vssub.vv \o4, \o6, v8 + vsadd.vv \o3, \o6, v8 + vmv.v.v \o6, v9 +.endm .macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7 li t1, 4076 @@ -598,6 +596,11 @@ endfunc vssub.vv \o7, v8, \o7 .endm +function inv_dct_e16_x8_rvv, export=1, ext=v + idct_8 v0, v1, v2, v3, v4, v5, v6, v7 + jr t0 +endfunc + function inv_adst_e16_x8_rvv, export=1, ext=v iadst_8 v0, v1, v2, v3, v4, v5, v6, v7 jr t0 -- cgit v1.2.3 From 9976976ec81607f3a1af92c8a8546c2fc5cad6ea Mon Sep 17 00:00:00 2001 From: "Nathan E. 
Egge" Date: Sun, 11 Feb 2024 11:48:10 -0500 Subject: riscv64/itx: Use registers above v15 in dct macros --- src/riscv/64/itx.S | 114 ++++++++++++++++++++++++++--------------------------- 1 file changed, 57 insertions(+), 57 deletions(-) diff --git a/src/riscv/64/itx.S b/src/riscv/64/itx.S index 80599e7..49d972f 100644 --- a/src/riscv/64/itx.S +++ b/src/riscv/64/itx.S @@ -122,34 +122,34 @@ endfunc li t2, 1567 li t3, 3784 - vwmul.vx v8, \o0, t1 - vwmul.vx v10, \o0, t1 - vwmacc.vx v8, t1, \o2 + vwmul.vx v16, \o0, t1 + vwmul.vx v18, \o0, t1 + vwmacc.vx v16, t1, \o2 neg t1, t1 - vwmacc.vx v10, t1, \o2 + vwmacc.vx v18, t1, \o2 - vwmul.vx v12, \o1, t3 + vwmul.vx v20, \o1, t3 neg t3, t3 - vwmul.vx v14, \o1, t2 - vwmacc.vx v12, t2, \o3 - vwmacc.vx v14, t3, \o3 + vwmul.vx v22, \o1, t2 + vwmacc.vx v20, t2, \o3 + vwmacc.vx v22, t3, \o3 li t1, 2048 - vwadd.wx v8, v8, t1 - vwadd.wx v10, v10, t1 - vwadd.wx v12, v12, t1 - vwadd.wx v14, v14, t1 + vwadd.wx v16, v16, t1 + vwadd.wx v18, v18, t1 + vwadd.wx v20, v20, t1 + vwadd.wx v22, v22, t1 - vnsra.wi v8, v8, 12 - vnsra.wi v10, v10, 12 - vnsra.wi v12, v12, 12 - vnsra.wi v14, v14, 12 + vnsra.wi v16, v16, 12 + vnsra.wi v18, v18, 12 + vnsra.wi v20, v20, 12 + vnsra.wi v22, v22, 12 - vsadd.vv \o0, v8, v12 - vsadd.vv \o1, v10, v14 - vssub.vv \o2, v10, v14 - vssub.vv \o3, v8, v12 + vsadd.vv \o0, v16, v20 + vsadd.vv \o1, v18, v22 + vssub.vv \o2, v18, v22 + vssub.vv \o3, v16, v20 .endm .macro iadst_4 o0, o1, o2, o3 @@ -418,58 +418,58 @@ endfunc li t3, 3406 li t4, 2276 - vwmul.vx v14, \o1, t2 + vwmul.vx v22, \o1, t2 neg t2, t2 - vwmul.vx v8, \o1, t1 - vwmacc.vx v14, t1, \o7 - vwmacc.vx v8, t2, \o7 + vwmul.vx v16, \o1, t1 + vwmacc.vx v22, t1, \o7 + vwmacc.vx v16, t2, \o7 - vwmul.vx v12, \o5, t4 + vwmul.vx v20, \o5, t4 neg t4, t4 - vwmul.vx v10, \o5, t3 - vwmacc.vx v12, t3, \o3 - vwmacc.vx v10, t4, \o3 + vwmul.vx v18, \o5, t3 + vwmacc.vx v20, t3, \o3 + vwmacc.vx v18, t4, \o3 li t1, 2048 - vwadd.wx v8, v8, t1 - vwadd.wx v10, v10, t1 - vwadd.wx v12, v12, t1 - vwadd.wx v14, v14, t1 + vwadd.wx v16, v16, t1 + vwadd.wx v18, v18, t1 + vwadd.wx v20, v20, t1 + vwadd.wx v22, v22, t1 - vnsra.wi v8, v8, 12 - vnsra.wi v10, v10, 12 - vnsra.wi v12, v12, 12 - vnsra.wi v14, v14, 12 + vnsra.wi v16, v16, 12 + vnsra.wi v18, v18, 12 + vnsra.wi v20, v20, 12 + vnsra.wi v22, v22, 12 - vssub.vv \o7, v14, v12 - vsadd.vv v14, v14, v12 - vssub.vv \o1, v8, v10 - vsadd.vv v8, v8, v10 + vssub.vv \o7, v22, v20 + vsadd.vv v22, v22, v20 + vssub.vv \o1, v16, v18 + vsadd.vv v16, v16, v18 li t2, 2896 - vwmul.vx v10, \o7, t2 - vwmul.vx v12, \o7, t2 - vwmacc.vx v12, t2, \o1 + vwmul.vx v18, \o7, t2 + vwmul.vx v20, \o7, t2 + vwmacc.vx v20, t2, \o1 neg t2, t2 - vwmacc.vx v10, t2, \o1 + vwmacc.vx v18, t2, \o1 - vwadd.wx v10, v10, t1 - vwadd.wx v12, v12, t1 + vwadd.wx v18, v18, t1 + vwadd.wx v20, v20, t1 - vnsra.wi v10, v10, 12 - vnsra.wi v12, v12, 12 + vnsra.wi v18, v18, 12 + vnsra.wi v20, v20, 12 - vssub.vv \o7, \o0, v14 - vsadd.vv \o0, \o0, v14 - vssub.vv v9, \o2, v12 - vsadd.vv \o1, \o2, v12 - vssub.vv \o5, \o4, v10 - vsadd.vv \o2, \o4, v10 - vssub.vv \o4, \o6, v8 - vsadd.vv \o3, \o6, v8 - vmv.v.v \o6, v9 + vssub.vv \o7, \o0, v22 + vsadd.vv \o0, \o0, v22 + vssub.vv v17, \o2, v20 + vsadd.vv \o1, \o2, v20 + vssub.vv \o5, \o4, v18 + vsadd.vv \o2, \o4, v18 + vssub.vv \o4, \o6, v16 + vsadd.vv \o3, \o6, v16 + vmv.v.v \o6, v17 .endm .macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7 -- cgit v1.2.3 From 8e82093ebb7bbb08a1f9ce24cf88ff53d0ee09a8 Mon Sep 17 00:00:00 2001 From: "Nathan E. 
Egge" Date: Sun, 11 Feb 2024 21:25:31 -0500 Subject: riscv64/itx: Add 16-point 8bpc RVV dct transform inv_txfm_add_16x16_dct_dct_0_8bpc_c: 1574.4 ( 1.00x) inv_txfm_add_16x16_dct_dct_0_8bpc_rvv: 1450.3 ( 1.09x) inv_txfm_add_16x16_dct_dct_1_8bpc_c: 13614.4 ( 1.00x) inv_txfm_add_16x16_dct_dct_1_8bpc_rvv: 1450.5 ( 9.39x) inv_txfm_add_16x16_dct_dct_2_8bpc_c: 13613.2 ( 1.00x) inv_txfm_add_16x16_dct_dct_2_8bpc_rvv: 1450.4 ( 9.39x) --- src/riscv/64/itx.S | 143 +++++++++++++++++++++++++++++++++++++++++++++++++++++ src/riscv/itx.h | 2 +- 2 files changed, 144 insertions(+), 1 deletion(-) diff --git a/src/riscv/64/itx.S b/src/riscv/64/itx.S index 49d972f..d272a1b 100644 --- a/src/riscv/64/itx.S +++ b/src/riscv/64/itx.S @@ -674,6 +674,148 @@ function inv_identity_e16_x16_rvv, export=1, ext=v jr t0 endfunc +function inv_dct_e16_x16_rvv, export=1, ext=v + idct_8 v0, v2, v4, v6, v8, v10, v12, v14 + + li t1, 401 + li t2, 4076 + li t3, 3166 + li t4, 2598 + + vwmul.vx v30, v1, t2 + neg t2, t2 + vwmul.vx v16, v1, t1 + vwmacc.vx v30, t1, v15 + vwmacc.vx v16, t2, v15 + + vwmul.vx v28, v9, t4 + neg t4, t4 + vwmul.vx v18, v9, t3 + vwmacc.vx v28, t3, v7 + vwmacc.vx v18, t4, v7 + + li t1, 1931 + li t2, 3612 + li t3, 3920 + li t4, 1189 + + vwmul.vx v26, v5, t2 + neg t2, t2 + vwmul.vx v20, v5, t1 + vwmacc.vx v26, t1, v11 + vwmacc.vx v20, t2, v11 + + vwmul.vx v24, v13, t4 + neg t4, t4 + vwmul.vx v22, v13, t3 + vwmacc.vx v24, t3, v3 + vwmacc.vx v22, t4, v3 + + li t1, 2048 + li t2, 2896 + li t3, 1567 + li t4, 3784 + + vwadd.wx v16, v16, t1 + vwadd.wx v18, v18, t1 + vwadd.wx v20, v20, t1 + vwadd.wx v22, v22, t1 + vwadd.wx v24, v24, t1 + vwadd.wx v26, v26, t1 + vwadd.wx v28, v28, t1 + vwadd.wx v30, v30, t1 + + vnsra.wi v16, v16, 12 + vnsra.wi v18, v18, 12 + vnsra.wi v20, v20, 12 + vnsra.wi v22, v22, 12 + vnsra.wi v24, v24, 12 + vnsra.wi v26, v26, 12 + vnsra.wi v28, v28, 12 + vnsra.wi v30, v30, 12 + + vssub.vv v3, v16, v18 + vsadd.vv v16, v16, v18 + vssub.vv v5, v22, v20 + vsadd.vv v22, v22, v20 + vssub.vv v11, v24, v26 + vsadd.vv v24, v24, v26 + vssub.vv v13, v30, v28 + vsadd.vv v30, v30, v28 + + vwmul.vx v28, v13, t4 + neg t4, t4 + vwmul.vx v18, v13, t3 + vwmul.vx v26, v11, t3 + vwmacc.vx v28, t3, v3 + neg t3, t3 + vwmul.vx v20, v11, t4 + vwmacc.vx v18, t4, v3 + vwmacc.vx v20, t3, v5 + vwmacc.vx v26, t4, v5 + + vwadd.wx v18, v18, t1 + vwadd.wx v20, v20, t1 + vwadd.wx v26, v26, t1 + vwadd.wx v28, v28, t1 + + vnsra.wi v18, v18, 12 + vnsra.wi v20, v20, 12 + vnsra.wi v26, v26, 12 + vnsra.wi v28, v28, 12 + + vssub.vv v5, v18, v20 + vsadd.vv v18, v18, v20 + vssub.vv v11, v28, v26 + vsadd.vv v28, v28, v26 + + vssub.vv v7, v16, v22 + vsadd.vv v16, v16, v22 + vssub.vv v9, v30, v24 + vsadd.vv v30, v30, v24 + + vwmul.vx v20, v11, t2 + vwmul.vx v22, v9, t2 + vwmul.vx v24, v9, t2 + vwmul.vx v26, v11, t2 + vwmacc.vx v24, t2, v7 + vwmacc.vx v26, t2, v5 + neg t2, t2 + vwmacc.vx v20, t2, v5 + vwmacc.vx v22, t2, v7 + + vwadd.wx v20, v20, t1 + vwadd.wx v22, v22, t1 + vwadd.wx v24, v24, t1 + vwadd.wx v26, v26, t1 + + vnsra.wi v20, v20, 12 + vnsra.wi v22, v22, 12 + vnsra.wi v24, v24, 12 + vnsra.wi v26, v26, 12 + + vssub.vv v15, v0, v30 + vsadd.vv v0, v0, v30 + vssub.vv v17, v2, v28 + vsadd.vv v1, v2, v28 + vssub.vv v13, v4, v26 + vsadd.vv v2, v4, v26 + vssub.vv v19, v6, v24 + vsadd.vv v3, v6, v24 + vssub.vv v11, v8, v22 + vsadd.vv v4, v8, v22 + vsadd.vv v5, v10, v20 + vssub.vv v10, v10, v20 + vssub.vv v9, v12, v18 + vsadd.vv v6, v12, v18 + vssub.vv v8, v14, v16 + vsadd.vv v7, v14, v16 + vmv.v.v v14, v17 + vmv.v.v v12, v19 + + jr t0 +endfunc 
+
 .macro def_horz_16 variant
 function inv_txfm_horz\variant\()_16x8_rvv, export=1, ext=v
   vmv.v.x v16, zero
@@ -806,4 +948,5 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_rvv, export=1, ext=v
 endfunc
 .endm
 
+def_fn_16x16 dct, dct
 def_fn_16x16 identity, identity
diff --git a/src/riscv/itx.h b/src/riscv/itx.h
index 1abd9db..86a9475 100644
--- a/src/riscv/itx.h
+++ b/src/riscv/itx.h
@@ -106,6 +106,6 @@ static ALWAYS_INLINE void itx_dsp_init_riscv(Dav1dInvTxfmDSPContext *const c, in
 #if BITDEPTH == 8
     assign_itx16_fn( , 4, 4, rvv);
     assign_itx16_fn( , 8, 8, rvv);
-    assign_itx_fn( , 16, 16, identity_identity, IDTX, rvv);
+    assign_itx2_fn( , 16, 16, rvv);
 #endif
 }
--
cgit v1.2.3


From cc29b2314ce4e350d08fe5714f6bf88d80cfc00b Mon Sep 17 00:00:00 2001
From: "Nathan E. Egge"
Date: Sun, 11 Feb 2024 21:48:55 -0500
Subject: riscv64/itx: Add 16x16 8bpc dct_identity and identity_dct

inv_txfm_add_16x16_dct_identity_0_8bpc_c:   10593.3 ( 1.00x)
inv_txfm_add_16x16_dct_identity_0_8bpc_rvv:  1163.3 ( 9.11x)
inv_txfm_add_16x16_dct_identity_1_8bpc_c:   10584.9 ( 1.00x)
inv_txfm_add_16x16_dct_identity_1_8bpc_rvv:  1163.3 ( 9.10x)
inv_txfm_add_16x16_dct_identity_2_8bpc_c:   10590.3 ( 1.00x)
inv_txfm_add_16x16_dct_identity_2_8bpc_rvv:  1163.6 ( 9.10x)
inv_txfm_add_16x16_identity_dct_0_8bpc_c:    9945.9 ( 1.00x)
inv_txfm_add_16x16_identity_dct_0_8bpc_rvv:  1150.2 ( 8.65x)
inv_txfm_add_16x16_identity_dct_1_8bpc_c:    9937.0 ( 1.00x)
inv_txfm_add_16x16_identity_dct_1_8bpc_rvv:  1150.3 ( 8.64x)
inv_txfm_add_16x16_identity_dct_2_8bpc_c:    9934.6 ( 1.00x)
inv_txfm_add_16x16_identity_dct_2_8bpc_rvv:  1150.4 ( 8.64x)
---
 src/riscv/64/itx.S | 2 ++
 src/riscv/itx.h    | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/src/riscv/64/itx.S b/src/riscv/64/itx.S
index d272a1b..fef6be0 100644
--- a/src/riscv/64/itx.S
+++ b/src/riscv/64/itx.S
@@ -950,3 +950,5 @@ endfunc
 
 def_fn_16x16 dct, dct
 def_fn_16x16 identity, identity
+def_fn_16x16 dct, identity
+def_fn_16x16 identity, dct
diff --git a/src/riscv/itx.h b/src/riscv/itx.h
index 86a9475..f5a268f 100644
--- a/src/riscv/itx.h
+++ b/src/riscv/itx.h
@@ -107,5 +107,7 @@ static ALWAYS_INLINE void itx_dsp_init_riscv(Dav1dInvTxfmDSPContext *const c, in
     assign_itx16_fn( , 4, 4, rvv);
     assign_itx16_fn( , 8, 8, rvv);
     assign_itx2_fn( , 16, 16, rvv);
+    assign_itx_fn( , 16, 16, dct_identity, H_DCT, rvv);
+    assign_itx_fn( , 16, 16, identity_dct, V_DCT, rvv);
 #endif
 }
--
cgit v1.2.3


From 72dba22e663d04d44c8a340b1dbc6a38b5c7980e Mon Sep 17 00:00:00 2001
From: "Nathan E. Egge"
Date: Sun, 11 Feb 2024 23:03:41 -0500
Subject: riscv64/itx: Add 4x4 8bpc RVV wht_wht transform

inv_txfm_add_4x4_wht_wht_0_8bpc_c:   265.6 ( 1.00x)
inv_txfm_add_4x4_wht_wht_0_8bpc_rvv:  66.9 ( 3.97x)
inv_txfm_add_4x4_wht_wht_1_8bpc_c:   265.5 ( 1.00x)
inv_txfm_add_4x4_wht_wht_1_8bpc_rvv:  66.9 ( 3.97x)
---
 src/riscv/64/itx.S | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 src/riscv/itx.h    |  2 +-
 2 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/src/riscv/64/itx.S b/src/riscv/64/itx.S
index fef6be0..f571c3d 100644
--- a/src/riscv/64/itx.S
+++ b/src/riscv/64/itx.S
@@ -117,6 +117,17 @@ function inv_identity_e16_x4_rvv, export=1, ext=v
   jr t0
 endfunc
 
+.macro iwht_4
+  vadd.vv v0, v0, v1
+  vsub.vv v5, v2, v3
+  vsub.vv v4, v0, v5
+  vsra.vi v4, v4, 1
+  vsub.vv v2, v4, v1
+  vsub.vv v1, v4, v3
+  vadd.vv v3, v5, v2
+  vsub.vv v0, v0, v1
+.endm
+
 .macro idct_4 o0, o1, o2, o3
   li t1, 2896
   li t2, 1567
@@ -211,6 +222,45 @@ function inv_flipadst_e16_x4_rvv, export=1, ext=v
   jr t0
 endfunc
 
+function inv_txfm_add_wht_wht_4x4_8bpc_rvv, export=1, ext=v
+  csrw vxrm, zero
+
+  vsetivli zero, 4, e16, mf2, ta, ma
+  vle16.v v0, (a2)
+  addi t0, a2, 8
+  vle16.v v1, (t0)
+  addi t0, t0, 8
+  vle16.v v2, (t0)
+  addi t0, t0, 8
+  vle16.v v3, (t0)
+
+  vsra.vi v0, v0, 2
+  vsra.vi v1, v1, 2
+  vsra.vi v2, v2, 2
+  vsra.vi v3, v3, 2
+
+  iwht_4
+
+  vmv.v.x v4, zero
+
+  vsseg4e16.v v0, (a2)
+  vle16.v v0, (a2)
+  vse16.v v4, (a2)
+  addi t0, a2, 8
+  vle16.v v1, (t0)
+  vse16.v v4, (t0)
+  addi t0, t0, 8
+  vle16.v v2, (t0)
+  vse16.v v4, (t0)
+  addi t0, t0, 8
+  vle16.v v3, (t0)
+  vse16.v v4, (t0)
+
+  iwht_4
+
+  j itx_4x4_end
+endfunc
+
 .macro def_fn_4x4 txfm1, txfm2
 function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_rvv, export=1, ext=v
 .ifc \txfm1\()_\txfm2, dct_dct
diff --git a/src/riscv/itx.h b/src/riscv/itx.h
index f5a268f..24d08ba 100644
--- a/src/riscv/itx.h
+++ b/src/riscv/itx.h
@@ -104,7 +104,7 @@ static ALWAYS_INLINE void itx_dsp_init_riscv(Dav1dInvTxfmDSPContext *const c, in
     if (!(flags & DAV1D_RISCV_CPU_FLAG_V)) return;
 
 #if BITDEPTH == 8
-    assign_itx16_fn( , 4, 4, rvv);
+    assign_itx17_fn( , 4, 4, rvv);
     assign_itx16_fn( , 8, 8, rvv);
     assign_itx2_fn( , 16, 16, rvv);
     assign_itx_fn( , 16, 16, dct_identity, H_DCT, rvv);
--
cgit v1.2.3


From 2685b40920cc85ced05e4958f92dea2cbf4fa95a Mon Sep 17 00:00:00 2001
From: "Nathan E. Egge"
Date: Mon, 12 Feb 2024 11:46:37 -0500
Subject: riscv64/itx: Add 16-point 8bpc RVV adst transform

inv_txfm_add_16x16_adst_adst_0_8bpc_c:   15364.4 ( 1.00x)
inv_txfm_add_16x16_adst_adst_0_8bpc_rvv:  1814.1 ( 8.47x)
inv_txfm_add_16x16_adst_adst_1_8bpc_c:   15363.7 ( 1.00x)
inv_txfm_add_16x16_adst_adst_1_8bpc_rvv:  1814.5 ( 8.47x)
inv_txfm_add_16x16_adst_adst_2_8bpc_c:   15368.9 ( 1.00x)
inv_txfm_add_16x16_adst_adst_2_8bpc_rvv:  1814.5 ( 8.47x)
inv_txfm_add_16x16_adst_dct_0_8bpc_c:    14560.0 ( 1.00x)
inv_txfm_add_16x16_adst_dct_0_8bpc_rvv:   1644.4 ( 8.85x)
inv_txfm_add_16x16_adst_dct_1_8bpc_c:    14578.9 ( 1.00x)
inv_txfm_add_16x16_adst_dct_1_8bpc_rvv:   1644.5 ( 8.87x)
inv_txfm_add_16x16_adst_dct_2_8bpc_c:    14575.0 ( 1.00x)
inv_txfm_add_16x16_adst_dct_2_8bpc_rvv:   1644.6 ( 8.86x)
inv_txfm_add_16x16_dct_adst_0_8bpc_c:    14550.7 ( 1.00x)
inv_txfm_add_16x16_dct_adst_0_8bpc_rvv:   1622.7 ( 8.97x)
inv_txfm_add_16x16_dct_adst_1_8bpc_c:    14556.0 ( 1.00x)
inv_txfm_add_16x16_dct_adst_1_8bpc_rvv:   1622.6 ( 8.97x)
inv_txfm_add_16x16_dct_adst_2_8bpc_c:    14543.3 ( 1.00x)
inv_txfm_add_16x16_dct_adst_2_8bpc_rvv:   1622.6 ( 8.96x)
---
 src/riscv/64/itx.S | 310 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/riscv/itx.h    |   3 +
 2 files changed, 313 insertions(+)

diff --git a/src/riscv/64/itx.S b/src/riscv/64/itx.S
index f571c3d..f992549 100644
--- a/src/riscv/64/itx.S
+++ b/src/riscv/64/itx.S
@@ -866,6 +866,313 @@ function inv_dct_e16_x16_rvv, export=1, ext=v
   jr t0
 endfunc
 
+function inv_adst_e16_x16_rvv, export=1, ext=v
+  li t1, 4091
+  li t2, 201
+  li t3, 3973
+  li t4, 995
+
+  vwmul.vx v16, v15, t1
+  neg t1, t1
+  vwmul.vx v18, v15, t2
+  vwmacc.vx v16, t2, v0
+  vwmacc.vx v18, t1, v0
+
+  vwmul.vx v20, v13, t3
+  neg t3, t3
+  vwmul.vx v22, v13, t4
+  vwmacc.vx v20, t4, v2
+  vwmacc.vx v22, t3, v2
+
+  li t1, 3703
+  li t2, 1751
+  li t3, 3290
+  li t4, 2440
+
+  vwmul.vx v24, v11, t1
+  neg t1, t1
+  vwmul.vx v26, v11, t2
+  vwmacc.vx v24, t2, v4
+  vwmacc.vx v26, t1, v4
+
+  vwmul.vx v28, v9, t3
+  neg t3, t3
+  vwmul.vx v30, v9, t4
+  vwmacc.vx v28, t4, v6
+  vwmacc.vx v30, t3, v6
+
+  li t1, 2048
+
+  vwadd.wx v16, v16, t1
+  vwadd.wx v18, v18, t1
+  vwadd.wx v20, v20, t1
+  vwadd.wx v22, v22, t1
+  vwadd.wx v24, v24, t1
+  vwadd.wx v26, v26, t1
+  vwadd.wx v28, v28, t1
+  vwadd.wx v30, v30, t1
+
+  vnsra.wi v0, v16, 12
+  vnsra.wi v18, v18, 12
+  vnsra.wi v2, v20, 12
+  vnsra.wi v22, v22, 12
+  vnsra.wi v4, v24, 12
+  vnsra.wi v26, v26, 12
+  vnsra.wi v6, v28, 12
+  vnsra.wi v30, v30, 12
+
+  li t1, 2751
+  li t2, 3035
+  li t3, 2106
+  li t4, 3513
+
+  vwmul.vx v16, v7, t1
+  neg t1, t1
+  vwmul.vx v20, v7, t2
+  vwmacc.vx v16, t2, v8
+  vwmacc.vx v20, t1, v8
+
+  vwmul.vx v24, v5, t3
+  neg t3, t3
+  vwmul.vx v28, v5, t4
+  vwmacc.vx v24, t4, v10
+  vwmacc.vx v28, t3, v10
+
+  li t1, 2048
+
+  vwadd.wx v16, v16, t1
+  vwadd.wx v20, v20, t1
+  vwadd.wx v24, v24, t1
+  vwadd.wx v28, v28, t1
+
+  vnsra.wi v16, v16, 12
+  vnsra.wi v9, v20, 12
+  vnsra.wi v24, v24, 12
+  vnsra.wi v11, v28, 12
+
+  vssub.vv v8, v0, v16
+  vsadd.vv v0, v0, v16
+  vssub.vv v10, v2, v24
+  vsadd.vv v2, v2, v24
+
+  li t1, 1380
+  li t2, 3857
+  li t3, 601
+  li t4, 4052
+
+  vwmul.vx v16, v3, t1
+  neg t1, t1
+  vwmul.vx v20, v3, t2
+  vwmacc.vx v16, t2, v12
+  vwmacc.vx v20, t1, v12
+
+  vwmul.vx v24, v1, t3
+  neg t3, t3
+  vwmul.vx v28, v1, t4
+  vwmacc.vx v24, t4, v14
+  vwmacc.vx v28, t3, v14
+
+  li t1, 2048
+
+  vwadd.wx v16, v16, t1
+  vwadd.wx v20, v20, t1
+  vwadd.wx v24, v24, t1
+  vwadd.wx v28, v28, t1
+
+  vnsra.wi v16, v16, 12
+  vnsra.wi v13, v20, 12
+  vnsra.wi v24, v24, 12
+  vnsra.wi v15, v28, 12
+
+  vssub.vv v12, v4, v16
+  vsadd.vv v16, v4, v16
+  vssub.vv v14, v6, v24
+  vsadd.vv v20, v6, v24
+
+  vsadd.vv v1, v18, v9
+  vssub.vv v9, v18, v9
+  vsadd.vv v3, v22, v11
+  vssub.vv v11, v22, v11
+  vsadd.vv v18, v26, v13
+  vssub.vv v13, v26, v13
+  vsadd.vv v22, v30, v15
+  vssub.vv v15, v30, v15
+
+  vssub.vv v4, v0, v16
+  vsadd.vv v0, v0, v16
+  vssub.vv v5, v1, v18
+  vsadd.vv v1, v1, v18
+  vssub.vv v6, v2, v20
+  vsadd.vv v2, v2, v20
+  vssub.vv v7, v3, v22
+  vsadd.vv v3, v3, v22
+
+  li t1, 799
+  li t2, 4017
+  li t3, 3406
+  li t4, 2276
+
+  vwmul.vx v16, v8, t2
+  vwmul.vx v18, v8, t1
+  vwmul.vx v20, v10, t4
+  vwmul.vx v22, v10, t3
+  vwmul.vx v24, v13, t2
+  vwmul.vx v26, v13, t1
+  vwmul.vx v28, v15, t4
+  vwmul.vx v30, v15, t3
+  vwmacc.vx v16, t1, v9
+  neg t1, t1
+  vwmacc.vx v20, t3, v11
+  neg t3, t3
+  vwmacc.vx v26, t2, v12
+  neg t2, t2
+  vwmacc.vx v30, t4, v14
+  neg t4, t4
+  vwmacc.vx v18, t2, v9
+  vwmacc.vx v22, t4, v11
+  vwmacc.vx v24, t1, v12
+  vwmacc.vx v28, t3, v14
+
+  li t1, 2048
+  li t2, 2896
+  li t3, 1567
+  li t4, 3784
+
+  vwadd.wx v16, v16, t1
+  vwadd.wx v18, v18, t1
+  vwadd.wx v20, v20, t1
+  vwadd.wx v22, v22, t1
+  vwadd.wx v24, v24, t1
+  vwadd.wx v26, v26, t1
+  vwadd.wx v28, v28, t1
+  vwadd.wx v30, v30, t1
+
+  vnsra.wi v16, v16, 12
+  vnsra.wi v18, v18, 12
+  vnsra.wi v20, v20, 12
+  vnsra.wi v22, v22, 12
+  vnsra.wi v24, v24, 12
+  vnsra.wi v26, v26, 12
+  vnsra.wi v28, v28, 12
+  vnsra.wi v30, v30, 12
+
+  vsadd.vv v8, v16, v24
+  vsadd.vv v9, v18, v26
+  vsadd.vv v10, v20, v28
+  vsadd.vv v11, v22, v30
+  vssub.vv v12, v16, v24
+  vssub.vv v13, v18, v26
+  vssub.vv v14, v20, v28
+  vssub.vv v15, v22, v30
+
+  vwmul.vx v16, v4, t4
+  vwmul.vx v18, v4, t3
+  vwmul.vx v20, v7, t4
+  vwmul.vx v22, v7, t3
+  vwmul.vx v24, v12, t4
+  vwmul.vx v26, v12, t3
+  vwmul.vx v28, v15, t4
+  vwmul.vx v30, v15, t3
+  vwmacc.vx v16, t3, v5
+  vwmacc.vx v22, t4, v6
+  vwmacc.vx v24, t3, v13
+  neg t3, t3
+  vwmacc.vx v30, t4, v14
+  neg t4, t4
+  vwmacc.vx v20, t3, v6
+  vwmacc.vx v28, t3, v14
+  vwmacc.vx v18, t4, v5
+  vwmacc.vx v26, t4, v13
+
+  vwadd.wx v16, v16, t1
+  vwadd.wx v18, v18, t1
+  vwadd.wx v20, v20, t1
+  vwadd.wx v22, v22, t1
+  vwadd.wx v24, v24, t1
+  vwadd.wx v26, v26, t1
+  vwadd.wx v28, v28, t1
+  vwadd.wx v30, v30, t1
+
+  vnsra.wi v16, v16, 12
+  vnsra.wi v18, v18, 12
+  vnsra.wi v20, v20, 12
+  vnsra.wi v22, v22, 12
+  vnsra.wi v24, v24, 12
+  vnsra.wi v26, v26, 12
+  vnsra.wi v28, v28, 12
+  vnsra.wi v30, v30, 12
+
+  vsadd.vv v14, v9, v11
+  vssub.vv v11, v9, v11
+  vssub.vv v9, v1, v3
+  vsadd.vv v15, v1, v3
+  vsadd.vv v1, v8, v10
+  vssub.vv v10, v8, v10
+  vssub.vv v8, v0, v2
+  vsadd.vv v0, v0, v2
+
+  vsadd.vv v3, v16, v20
+  vssub.vv v6, v16, v20
+  vsadd.vv v12, v18, v22
+  vssub.vv v7, v18, v22
+  vsadd.vv v2, v24, v28
+  vssub.vv v24, v24, v28
+  vsadd.vv v13, v26, v30
+  vssub.vv v26, v26, v30
+
+  neg t3, t2
+
+  vwmul.vx v28, v24, t2
+  vwmul.vx v30, v24, t2
+  vwmacc.vx v28, t2, v26
+  vwmacc.vx v30, t3, v26
+
+  vwmul.vx v24, v10, t2
+  vwmul.vx v26, v10, t2
+  vwmacc.vx v24, t2, v11
+  vwmacc.vx v26, t3, v11
+
+  vwmul.vx v20, v6, t2
+  vwmul.vx v22, v6, t2
+  vwmacc.vx v20, t2, v7
+  vwmacc.vx v22, t3, v7
+
+  vwmul.vx v16, v8, t2
+  vwmul.vx v18, v8, t2
+  vwmacc.vx v16, t2, v9
+  vwmacc.vx v18, t3, v9
+
+  vwadd.wx v16, v16, t1
+  vwadd.wx v18, v18, t1
+  vwadd.wx v20, v20, t1
+  vwadd.wx v22, v22, t1
+  vwadd.wx v24, v24, t1
+  vwadd.wx v26, v26, t1
+  vwadd.wx v28, v28, t1
+  vwadd.wx v30, v30, t1
+
+  vnsra.wi v7, v16, 12
+  vnsra.wi v8, v18, 12
+  vnsra.wi v4, v20, 12
+  vnsra.wi v11, v22, 12
+  vnsra.wi v6, v24, 12
+  vnsra.wi v9, v26, 12
+  vnsra.wi v5, v28, 12
+  vnsra.wi v10, v30, 12
+
+  vmv.v.x v16, zero
+  vssub.vv v1, v16, v1
+  vssub.vv v3, v16, v3
+  vssub.vv v5, v16, v5
+  vssub.vv v7, v16, v7
+  vssub.vv v9, v16, v9
+  vssub.vv v11, v16, v11
+  vssub.vv v13, v16, v13
+  vssub.vv v15, v16, v15
+
+  jr t0
+endfunc
+
 .macro def_horz_16 variant
 function inv_txfm_horz\variant\()_16x8_rvv, export=1, ext=v
   vmv.v.x v16, zero
@@ -1000,5 +1307,8 @@ endfunc
 
 def_fn_16x16 dct, dct
 def_fn_16x16 identity, identity
+def_fn_16x16 dct, adst
 def_fn_16x16 dct, identity
+def_fn_16x16 adst, dct
+def_fn_16x16 adst, adst
 def_fn_16x16 identity, dct
diff --git a/src/riscv/itx.h b/src/riscv/itx.h
index 24d08ba..cf738c6 100644
--- a/src/riscv/itx.h
+++ b/src/riscv/itx.h
@@ -107,7 +107,10 @@ static ALWAYS_INLINE void itx_dsp_init_riscv(Dav1dInvTxfmDSPContext *const c, in
     assign_itx17_fn( , 4, 4, rvv);
     assign_itx16_fn( , 8, 8, rvv);
     assign_itx2_fn( , 16, 16, rvv);
+    assign_itx_fn( , 16, 16, dct_adst, ADST_DCT, rvv);
     assign_itx_fn( , 16, 16, dct_identity, H_DCT, rvv);
+    assign_itx_fn( , 16, 16, adst_dct, DCT_ADST, rvv);
+    assign_itx_fn( , 16, 16, adst_adst, ADST_ADST, rvv);
     assign_itx_fn( , 16, 16, identity_dct, V_DCT, rvv);
 #endif
 }
--
cgit v1.2.3


From b981bc9c3ef4b44e4e57326b753f2d0a8cebb341 Mon Sep 17 00:00:00 2001
From: "Nathan E. Egge"
Date: Mon, 12 Feb 2024 18:36:59 -0500
Subject: riscv64/itx: Convert inv_adst_e16_x16_rvv to macro

---
 src/riscv/64/itx.S | 83 ++++++++++++++++++++++++++++++++----------------------
 1 file changed, 49 insertions(+), 34 deletions(-)

diff --git a/src/riscv/64/itx.S b/src/riscv/64/itx.S
index f992549..8e5472f 100644
--- a/src/riscv/64/itx.S
+++ b/src/riscv/64/itx.S
@@ -866,7 +866,7 @@ function inv_dct_e16_x16_rvv, export=1, ext=v
   jr t0
 endfunc
 
-function inv_adst_e16_x16_rvv, export=1, ext=v
+.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15
   li t1, 4091
   li t2, 201
   li t3, 3973
@@ -1102,23 +1102,35 @@ function inv_adst_e16_x16_rvv, export=1, ext=v
   vnsra.wi v28, v28, 12
   vnsra.wi v30, v30, 12
 
-  vsadd.vv v14, v9, v11
-  vssub.vv v11, v9, v11
-  vssub.vv v9, v1, v3
-  vsadd.vv v15, v1, v3
-  vsadd.vv v1, v8, v10
-  vssub.vv v10, v8, v10
-  vssub.vv v8, v0, v2
-  vsadd.vv v0, v0, v2
-
-  vsadd.vv v3, v16, v20
-  vssub.vv v6, v16, v20
-  vsadd.vv v12, v18, v22
-  vssub.vv v7, v18, v22
-  vsadd.vv v2, v24, v28
-  vssub.vv v24, v24, v28
-  vsadd.vv v13, v26, v30
-  vssub.vv v26, v26, v30
+.ifc \o0, v0
+  vsadd.vv \o14, v9, v11
+  vssub.vv v11, v9, v11
+  vssub.vv v9, v1, v3
+  vsadd.vv \o15, v1, v3
+  vsadd.vv \o1, v8, v10
+  vssub.vv v10, v8, v10
+  vssub.vv v8, v0, v2
+  vsadd.vv \o0, v0, v2
+.else
+  vsadd.vv \o1, v8, v10
+  vssub.vv v10, v8, v10
+  vssub.vv v8, v0, v2
+  vsadd.vv \o0, v0, v2
+  vsadd.vv v2, v9, v11
+  vssub.vv v11, v9, v11
+  vssub.vv v9, v1, v3
+  vsadd.vv \o15, v1, v3
+  vmv.v.v \o14, v2
+.endif
+
+  vsadd.vv \o3, v16, v20
+  vssub.vv v6, v16, v20
+  vsadd.vv \o12, v18, v22
+  vssub.vv v7, v18, v22
+  vsadd.vv \o2, v24, v28
+  vssub.vv v24, v24, v28
+  vsadd.vv \o13, v26, v30
+  vssub.vv v26, v26, v30
 
   neg t3, t2
 
@@ -1151,25 +1163,28 @@ function inv_adst_e16_x16_rvv, export=1, ext=v
   vwadd.wx v28, v28, t1
   vwadd.wx v30, v30, t1
 
-  vnsra.wi v7, v16, 12
-  vnsra.wi v8, v18, 12
-  vnsra.wi v4, v20, 12
-  vnsra.wi v11, v22, 12
-  vnsra.wi v6, v24, 12
-  vnsra.wi v9, v26, 12
-  vnsra.wi v5, v28, 12
-  vnsra.wi v10, v30, 12
+  vnsra.wi \o7, v16, 12
+  vnsra.wi \o8, v18, 12
+  vnsra.wi \o4, v20, 12
+  vnsra.wi \o11, v22, 12
+  vnsra.wi \o6, v24, 12
+  vnsra.wi \o9, v26, 12
+  vnsra.wi \o5, v28, 12
+  vnsra.wi \o10, v30, 12
 
   vmv.v.x v16, zero
-  vssub.vv v1, v16, v1
-  vssub.vv v3, v16, v3
-  vssub.vv v5, v16, v5
-  vssub.vv v7, v16, v7
-  vssub.vv v9, v16, v9
-  vssub.vv v11, v16, v11
-  vssub.vv v13, v16, v13
-  vssub.vv v15, v16, v15
+  vssub.vv \o1, v16, \o1
+  vssub.vv \o3, v16, \o3
+  vssub.vv \o5, v16, \o5
+  vssub.vv \o7, v16, \o7
+  vssub.vv \o9, v16, \o9
+  vssub.vv \o11, v16, \o11
+  vssub.vv \o13, v16, \o13
+  vssub.vv \o15, v16, \o15
+.endm
 
+function inv_adst_e16_x16_rvv, export=1, ext=v
+  iadst_16 v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15
   jr t0
 endfunc
--
cgit v1.2.3


From 7b15ca13752aac7f0a1c6a56e33fe64d1f7638d4 Mon Sep 17 00:00:00 2001
From: "Nathan E. Egge"
Date: Mon, 12 Feb 2024 18:43:40 -0500
Subject: riscv64/itx: Add 16-point 8bpc RVV flipadst transform

inv_txfm_add_16x16_adst_flipadst_0_8bpc_c:     15272.2 ( 1.00x)
inv_txfm_add_16x16_adst_flipadst_0_8bpc_rvv:    1824.4 ( 8.37x)
inv_txfm_add_16x16_adst_flipadst_1_8bpc_c:     15261.2 ( 1.00x)
inv_txfm_add_16x16_adst_flipadst_1_8bpc_rvv:    1824.5 ( 8.36x)
inv_txfm_add_16x16_adst_flipadst_2_8bpc_c:     15260.0 ( 1.00x)
inv_txfm_add_16x16_adst_flipadst_2_8bpc_rvv:    1824.5 ( 8.36x)
inv_txfm_add_16x16_dct_flipadst_0_8bpc_c:      14497.2 ( 1.00x)
inv_txfm_add_16x16_dct_flipadst_0_8bpc_rvv:     1637.3 ( 8.85x)
inv_txfm_add_16x16_dct_flipadst_1_8bpc_c:      14490.5 ( 1.00x)
inv_txfm_add_16x16_dct_flipadst_1_8bpc_rvv:     1637.3 ( 8.85x)
inv_txfm_add_16x16_dct_flipadst_2_8bpc_c:      14486.4 ( 1.00x)
inv_txfm_add_16x16_dct_flipadst_2_8bpc_rvv:     1637.3 ( 8.85x)
inv_txfm_add_16x16_flipadst_adst_0_8bpc_c:     15307.7 ( 1.00x)
inv_txfm_add_16x16_flipadst_adst_0_8bpc_rvv:    1808.0 ( 8.47x)
inv_txfm_add_16x16_flipadst_adst_1_8bpc_c:     15341.0 ( 1.00x)
inv_txfm_add_16x16_flipadst_adst_1_8bpc_rvv:    1808.1 ( 8.48x)
inv_txfm_add_16x16_flipadst_adst_2_8bpc_c:     15333.5 ( 1.00x)
inv_txfm_add_16x16_flipadst_adst_2_8bpc_rvv:    1808.1 ( 8.48x)
inv_txfm_add_16x16_flipadst_dct_0_8bpc_c:      14530.0 ( 1.00x)
inv_txfm_add_16x16_flipadst_dct_0_8bpc_rvv:     1636.4 ( 8.88x)
inv_txfm_add_16x16_flipadst_dct_1_8bpc_c:      14510.3 ( 1.00x)
inv_txfm_add_16x16_flipadst_dct_1_8bpc_rvv:     1636.3 ( 8.87x)
inv_txfm_add_16x16_flipadst_dct_2_8bpc_c:      14504.7 ( 1.00x)
inv_txfm_add_16x16_flipadst_dct_2_8bpc_rvv:     1636.3 ( 8.86x)
inv_txfm_add_16x16_flipadst_flipadst_0_8bpc_c:   15315.5 ( 1.00x)
inv_txfm_add_16x16_flipadst_flipadst_0_8bpc_rvv:  1823.5 ( 8.40x)
inv_txfm_add_16x16_flipadst_flipadst_1_8bpc_c:   15324.1 ( 1.00x)
inv_txfm_add_16x16_flipadst_flipadst_1_8bpc_rvv:  1823.3 ( 8.40x)
inv_txfm_add_16x16_flipadst_flipadst_2_8bpc_c:   15315.6 ( 1.00x)
inv_txfm_add_16x16_flipadst_flipadst_2_8bpc_rvv:  1823.5 ( 8.40x)
---
 src/riscv/64/itx.S | 10 ++++++++++
 src/riscv/itx.h    |  7 +------
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/src/riscv/64/itx.S b/src/riscv/64/itx.S
index 8e5472f..5677cf4 100644
--- a/src/riscv/64/itx.S
+++ b/src/riscv/64/itx.S
@@ -1188,6 +1188,11 @@ function inv_adst_e16_x16_rvv, export=1, ext=v
   jr t0
 endfunc
 
+function inv_flipadst_e16_x16_rvv, export=1, ext=v
+  iadst_16 v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0
+  jr t0
+endfunc
+
 .macro def_horz_16 variant
 function inv_txfm_horz\variant\()_16x8_rvv, export=1, ext=v
   vmv.v.x v16, zero
@@ -1323,7 +1328,12 @@ endfunc
 def_fn_16x16 dct, dct
 def_fn_16x16 identity, identity
 def_fn_16x16 dct, adst
+def_fn_16x16 dct, flipadst
 def_fn_16x16 dct, identity
 def_fn_16x16 adst, dct
 def_fn_16x16 adst, adst
+def_fn_16x16 adst, flipadst
+def_fn_16x16 flipadst, dct
+def_fn_16x16 flipadst, adst
+def_fn_16x16 flipadst, flipadst
 def_fn_16x16 identity, dct
diff --git a/src/riscv/itx.h b/src/riscv/itx.h
index cf738c6..28c5e54 100644
--- a/src/riscv/itx.h
+++ b/src/riscv/itx.h
@@ -106,11 +106,6 @@ static ALWAYS_INLINE void itx_dsp_init_riscv(Dav1dInvTxfmDSPContext *const c, in
 #if BITDEPTH == 8
     assign_itx17_fn( , 4, 4, rvv);
     assign_itx16_fn( , 8, 8, rvv);
-    assign_itx2_fn( , 16, 16, rvv);
-    assign_itx_fn( , 16, 16, dct_adst, ADST_DCT, rvv);
-    assign_itx_fn( , 16, 16, dct_identity, H_DCT, rvv);
-    assign_itx_fn( , 16, 16, adst_dct, DCT_ADST, rvv);
-    assign_itx_fn( , 16, 16, adst_adst, ADST_ADST, rvv);
-    assign_itx_fn( , 16, 16, identity_dct, V_DCT, rvv);
+    assign_itx12_fn( , 16, 16, rvv);
 #endif
 }
--
cgit v1.2.3


From bb645893f374ab694b10a79f1205a728e010f2c9 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Kempf
Date: Wed, 14 Feb 2024 19:06:02 +0100
Subject: Update NEWS

Fix typo
---
 NEWS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/NEWS b/NEWS
index 097fcce..f74af58 100644
--- a/NEWS
+++ b/NEWS
@@ -3,7 +3,7 @@ Changes for 1.4.0 'Road Runner':
 1.4.0 is a medium release of dav1d, focusing on new architecture support
 and optimizations
 
-- AVX-512 optimizations for z1, z2, z3 in 8bit and high-bit depth
+- AVX-512 optimizations for z1, z2, z3 in 8bit and high-bitdepth
 - New architecture supported: loongarch
 - Loongarch optimizations for 8bit
 - New architecture supported: RISC-V
--
cgit v1.2.3
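
The RVV itx patches above rely on two recurring scalar idioms: the widening multiply-accumulate sequences (vwmul.vx/vwmacc.vx into a 32-bit accumulator, then vwadd.wx with 2048 and vnsra.wi by 12) implement a rounded 12-bit fixed-point butterfly rotation, while the iwht_4 macro is the multiplier-free 4-point inverse Walsh-Hadamard lifting. The following is a minimal scalar C sketch of both for reference only; the helper names rot and iwht4_1d are illustrative and are not part of dav1d's sources.

    #include <stdint.h>

    /* One butterfly rotation, modelling vwmul.vx/vwmacc.vx into a widened
     * accumulator followed by "vwadd.wx acc, acc, 2048" and
     * "vnsra.wi acc, acc, 12": widen to 32 bits, multiply-accumulate,
     * round to nearest, narrow back to 16 bits. */
    static int16_t rot(int16_t a, int16_t b, int c0, int c1)
    {
        const int32_t t = (int32_t)a * c0 + (int32_t)b * c1;
        return (int16_t)((t + 2048) >> 12);
    }

    /* One pass of the 4-point inverse WHT, mirroring the iwht_4 macro:
     * adds, subtracts and one shift only, with no rounding constant. */
    static void iwht4_1d(int16_t io[4])
    {
        const int t0 = io[0] + io[1];
        const int t2 = io[2] - io[3];
        const int t4 = (t0 - t2) >> 1;
        const int t1 = t4 - io[1];
        const int t3 = t4 - io[3];
        io[0] = (int16_t)(t0 - t3);
        io[1] = (int16_t)t3;
        io[2] = (int16_t)t1;
        io[3] = (int16_t)(t2 + t1);
    }

In the vector code the same operations are applied lane-wise across a whole row of coefficients, with vsadd.vv/vssub.vv providing the saturating 16-bit adds and subtracts between stages, and inv_flipadst_e16_x16_rvv reuses the iadst_16 macro with the output registers listed in reverse order.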