diff options
Diffstat (limited to 'src/x86/ipred_avx512.asm')
-rw-r--r-- | src/x86/ipred_avx512.asm | 1701 |
1 files changed, 1697 insertions, 4 deletions
diff --git a/src/x86/ipred_avx512.asm b/src/x86/ipred_avx512.asm index 4aeb14e..de953de 100644 --- a/src/x86/ipred_avx512.asm +++ b/src/x86/ipred_avx512.asm @@ -97,16 +97,97 @@ ipred_h_shuf: db 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4 db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0 pal_unpack: db 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 pal_perm: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 +pb_63to0: db 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48 + db 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32 + db 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16 + db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +z_frac_table: db 64, 0, 62, 2, 60, 4, 58, 6, 56, 8, 54, 10, 52, 12, 50, 14 + db 48, 16, 46, 18, 44, 20, 42, 22, 40, 24, 38, 26, 36, 28, 34, 30 + db 32, 32, 30, 34, 28, 36, 26, 38, 24, 40, 22, 42, 20, 44, 18, 46 + db 16, 48, 14, 50, 12, 52, 10, 54, 8, 56, 6, 58, 4, 60, 2, 62 +z_filter_s1: db -1, -1, -1, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6 + db 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22 + db 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38 + db 46, 47, 47, 48, 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54 +z_filter_s5: db 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15, 17, 16 + db 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31, 33, 32 + db 42, 41, 43, 42, 44, 43, 45, 44, 46, 45, 47, 46, 48, 47, 49, 48 + db 58, 57, 59, 58, 60, 59, 61, 60, 62, 61, 63, 62, 64, 63, 65, 64 +z_filter_s3: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 +z_filter_s2: db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +z_filter_s4: db 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8 +z_xpos_bc: db 17, 17, 17, 17, 33, 33, 33, 33, 9, 9, 9, 9, 9, 9, 9, 9 +z_filter4_s1: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7 + db 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 +z_xpos_off1a: db 64, 65, 65, 66, 66, 67, 67, 68, 68, 69, 69, 70, 70, 
71, 71, 72 +z_xpos_off1b: db 72, 73, 73, 74, 74, 75, 75, 76, 76, 77, 77, 78, 78, 79, 79, 80 +z_xpos_off2a: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 + db 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24 + db 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40 + db 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54, 54, 55, 55, 56 +z_xpos_off2b: db 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16 + db 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32 + db 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48 + db 56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63, 64 +z_xpos_mul: dw 4, 4, 4, 4, 8, 8, 4, 4, 12, 12, 8, 8, 16, 16, 8, 8 + dw 20, 20, 12, 12, 24, 24, 12, 12, 28, 28, 16, 16, 32, 32, 16, 16 +z_ypos_off1: db 64, 65, 64, 65, 64, 65, 64, 65, 65, 66, 65, 66, 66, 67, 66, 67 + db 66, 67, 66, 67, 68, 69, 68, 69, 67, 68, 67, 68, 70, 71, 70, 71 + db 68, 69, 68, 69, 72, 73, 72, 73, 69, 70, 69, 70, 74, 75, 74, 75 + db 70, 71, 70, 71, 76, 77, 76, 77, 71, 72, 71, 72, 78, 79, 78, 79 +z_ypos_off2: db 64, 65, 64, 65, 0, 0, 0, 0, 64, 65, 64, 65, 0, 0, 0, 0 + db 65, 66, 65, 66, 1, 1, 1, 1, 65, 66, 65, 66, 1, 1, 1, 1 + db 66, 67, 66, 67, 2, 2, 2, 2, 66, 67, 66, 67, 2, 2, 2, 2 + db 67, 68, 67, 68, 3, 3, 3, 3, 67, 68, 67, 68, 3, 3, 3, 3 +z_ypos_off3: db 1, 2, 1, 2, 1, 1, 1, 1, 3, 4, 3, 4, 1, 1, 1, 1 + db 5, 6, 5, 6, 3, 3, 3, 3, 7, 8, 7, 8, 3, 3, 3, 3 + db 9, 10, 9, 10, 5, 5, 5, 5, 11, 12, 11, 12, 5, 5, 5, 5 + db 13, 14, 13, 14, 7, 7, 7, 7, 15, 16, 15, 16, 7, 7, 7, 7 +z_ypos_mul1a: dw 1, 2, 3, 4, 5, 6, 7, 8, 17, 18, 19, 20, 21, 22, 23, 24 + dw 33, 34, 35, 36, 37, 38, 39, 40, 49, 50, 51, 52, 53, 54, 55, 56 +z_ypos_mul1b: dw 9, 10, 11, 12, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32 + dw 41, 42, 43, 44, 45, 46, 47, 48, 57, 58, 59, 60, 61, 62, 63, 64 +z_ypos_mul2a: dw 1*512, 2*512, 3*512, 4*512, 5*512, 6*512, 7*512, 8*512 + dw 17*512, 18*512, 19*512, 20*512, 21*512, 22*512, 23*512, 24*512 + dw 33*512, 34*512, 
35*512, 36*512, 37*512, 38*512, 39*512, 40*512 + dw 49*512, 50*512, 51*512, 52*512, 53*512, 54*512, 55*512, 56*512 +z_ypos_mul2b: dw 9*512, 10*512, 11*512, 12*512, 13*512, 14*512, 15*512, 16*512 + dw 25*512, 26*512, 27*512, 28*512, 29*512, 30*512, 31*512, 32*512 + dw 41*512, 42*512, 43*512, 44*512, 45*512, 46*512, 47*512, 48*512 + dw 57*512, 58*512, 59*512, 60*512, 61*512, 62*512, 63*512, 64*512 +z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0 +z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0 +z3_upsample: db 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16 + db 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8 +z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39 + db 39, 39, 47, 47, 47, 79, 79, 79 +z_filter_k: db 0, 16, 0, 16, 0, 20, 0, 20, 8, 16, 8, 16 + db 32, 0, 32, 0, 24, 0, 24, 0, 16, 0, 16, 0 + db 0, 32, 0, 32, 0, 24, 0, 24, 0, 16, 0, 16 +pb_8_56_0_0: db 8, 56, 0, 0 +pb_m4_36: times 2 db -4, 36 pb_127_m127: times 2 db 127, -127 +pb_8: times 4 db 8 +pb_15: times 4 db 15 +pb_16: times 4 db 16 +pb_31: times 4 db 31 +pb_63: times 4 db 63 +pb_90: times 4 db 90 pb_128: times 4 db 128 pw_128: times 2 dw 128 pw_255: times 2 dw 255 +pw_512: times 2 dw 512 -%define pb_1 (ipred_h_shuf+24) -%define pb_2 (ipred_h_shuf+20) -%define pb_3 (ipred_h_shuf+16) -%define pd_8 (filter_taps+128) +%define pb_1 (ipred_h_shuf+24) +%define pb_2 (ipred_h_shuf+20) +%define pb_3 (ipred_h_shuf+16) +%define pb_4 (smooth_shuf +48) +%define pb_7 (ipred_h_shuf+ 0) +%define pb_9 (z_xpos_bc + 8) +%define pb_17 (z_xpos_bc + 0) +%define pb_33 (z_xpos_bc + 4) +%define pd_8 (filter_taps+128) %macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) @@ -125,10 +206,16 @@ JMP_TABLE ipred_paeth_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_v_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h_8bpc, avx512icl, w4, w8, 
w16, w32, w64 +JMP_TABLE ipred_z1_8bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z2_8bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z3_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_dc_8bpc, avx512icl, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 JMP_TABLE ipred_dc_left_8bpc, avx512icl, h4, h8, h16, h32, h64 +cextern dr_intra_derivative +cextern pb_0to63 + SECTION .text INIT_ZMM avx512icl @@ -1200,6 +1287,1612 @@ cglobal pal_pred_8bpc, 4, 7, 6, dst, stride, pal, idx, w, h, stride3 jg .w64 RET +%if WIN64 + DECLARE_REG_TMP 4 +%else + DECLARE_REG_TMP 8 +%endif + +cglobal ipred_z1_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx +%define base r7-z_filter_t0 + lea r7, [z_filter_t0] + tzcnt wd, wm + movifnidn angled, anglem + lea t0, [dr_intra_derivative] + movsxd wq, [base+ipred_z1_8bpc_avx512icl_table+wq*4] + inc tlq + mov dxd, angled + and dxd, 0x7e + add angled, 165 ; ~90 + movzx dxd, word [t0+dxq] + lea wq, [base+ipred_z1_8bpc_avx512icl_table+wq] + movifnidn hd, hm + xor angled, 0x4ff ; d = 90 - angle + mova m14, [base+z_frac_table] + vpbroadcastd m15, [base+pw_512] + jmp wq +.w4: + mova m9, [pb_0to63] + pminud m8, m9, [base+pb_7] {1to16} + vpbroadcastq m7, [tlq] + pshufb m7, m8 + cmp angleb, 40 + jae .w4_no_upsample + lea r3d, [angleq-1024] + sar r3d, 7 + add r3d, hd + jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm) + pshufb xmm0, xm7, [base+z_filter_s4] + mova xmm1, [tlq-1] + pshufb xmm1, [base+z_xpos_off2a] + vpbroadcastd xmm2, [base+pb_m4_36] + vpbroadcastq m4, [pb_0to63] + pmaddubsw xmm0, xmm2 + pmaddubsw xmm1, xmm2 + add dxd, dxd + kxnorw k1, k1, k1 + paddw xmm0, xmm1 + pmulhrsw xm0, xmm0, xm15 + packuswb xm0, xm0 + punpcklbw ym7{k1}, ym0 + jmp .w4_main2 +.w4_no_upsample: + test angled, 0x400 + jnz .w4_main ; !enable_intra_edge_filter + lea r3d, [hq+3] + vpbroadcastb xm0, r3d + vpbroadcastb xm1, angled + shr angled, 8 ; is_sm << 1 + vpcmpeqb k1, xm0, 
[base+z_filter_wh] + vpcmpgtb k1{k1}, xm1, [base+z_filter_t0+angleq*8] + kmovw r5d, k1 + test r5d, r5d + jz .w4_main + vbroadcasti32x4 ym0, [tlq-1] + pshufb ym0, [base+z_filter4_s1] + popcnt r5d, r5d ; filter_strength + pshufb ym1, ym7, [z_filter_s4] + pshufb ym7, [base+z_filter_s3] + vpbroadcastd ym11, [base+z_filter_k+(r5-1)*4+12*0] + vpbroadcastd ym12, [base+z_filter_k+(r5-1)*4+12*1] + pmaddubsw ym0, ym11 + pmaddubsw ym1, ym11 + pmaddubsw ym7, ym12 + paddw ym0, ym1 + paddw ym7, ym0 + pmulhrsw ym7, ym15 + cmp hd, 4 + je .w4_filter_end + vpbroadcastd m8, [base+pb_9] + pminub m8, m9 +.w4_filter_end: + paddb m8, m8 + vpermb m7, m8, m7 +.w4_main: + vpbroadcastq m4, [base+z_xpos_off1a] +.w4_main2: + movsldup m2, [base+z_xpos_mul] + vpbroadcastw m5, dxd + vbroadcasti32x4 m3, [base+z_xpos_bc] + lea r2, [strideq*3] + pmullw m2, m5 ; xpos + psllw m5, 5 ; dx*8 +.w4_loop: + psrlw m1, m2, 3 + pshufb m0, m2, m3 + vpermw m1, m1, m14 ; 64-frac, frac + paddsb m0, m4 ; base, base+1 + vpermb m0, m0, m7 ; top[base], top[base+1] + paddsw m2, m5 ; xpos += dx + pmaddubsw m0, m1 ; v + pmulhrsw m0, m15 + packuswb m0, m0 + vextracti32x4 xm1, ym0, 1 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+r2 ], xm1, 1 + sub hd, 8 + jl .w4_end + vextracti32x4 xm1, m0, 2 ; top[max_base_x] + lea dstq, [dstq+strideq*4] + vextracti32x4 xm0, m0, 3 + movd [dstq+strideq*0], xm1 + pextrd [dstq+strideq*1], xm1, 1 + movd [dstq+strideq*2], xm0 + pextrd [dstq+r2 ], xm0, 1 + lea dstq, [dstq+strideq*4] + jg .w4_loop +.w4_end: + RET +.w8_filter: + mova ym0, [base+z_filter_s1] + popcnt r5d, r5d + vbroadcasti32x4 ym1, [base+z_filter_s2] + vbroadcasti32x4 ym3, [base+z_filter_s3] + vbroadcasti32x4 ym4, [base+z_filter_s4] + vpermi2b ym0, ym7, ym2 ; al bl + mova ym5, [base+z_filter_s5] + pshufb ym1, ym7, ym1 ; ah bh + vpbroadcastd ym11, [base+z_filter_k+(r5-1)*4+12*0] + pshufb ym3, ym7, ym3 ; cl ch + vpbroadcastd ym12, [base+z_filter_k+(r5-1)*4+12*1] + pshufb 
ym4, ym7, ym4 ; el dl + vpbroadcastd ym13, [base+z_filter_k+(r5-1)*4+12*2] + vpermb ym5, ym5, ym7 ; eh dh + pmaddubsw ym0, ym11 + pmaddubsw ym1, ym11 + pmaddubsw ym2, ym3, ym12 + pmaddubsw ym3, ym13 + pmaddubsw ym4, ym11 + pmaddubsw ym5, ym11 + paddw ym0, ym2 + paddw ym1, ym3 + paddw ym0, ym4 + paddw ym1, ym5 + pmulhrsw ym0, ym15 + pmulhrsw ym1, ym15 + packuswb ym0, ym1 + ret +.w8: + lea r3d, [angleq+216] + mov r3b, hb + cmp r3d, 8 + ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 + lea r3d, [hq-1] + mova xm1, [base+z_filter_s4] + vpbroadcastb xm2, r3d + mova xm7, [tlq-1] + vinserti32x4 ym7, [tlq+7], 1 + vbroadcasti32x4 ym0, [base+z_xpos_off1a] + vpbroadcastd ym3, [base+pb_m4_36] + pminub xm2, xm1 + pshufb ym0, ym7, ym0 + vinserti32x4 ym1, xm2, 1 + psrldq ym7, 1 + pshufb ym1, ym7, ym1 + pmaddubsw ym0, ym3 + pmaddubsw ym1, ym3 + vbroadcasti32x4 m8, [pb_0to63] + add dxd, dxd + paddw ym0, ym1 + pmulhrsw ym0, ym15 + packuswb ym0, ym0 + punpcklbw ym7, ym0 + jmp .w8_main2 +.w8_no_upsample: + lea r3d, [hq+7] + mova m9, [pb_0to63] + vpbroadcastb ym0, r3d + and r3d, 7 + vbroadcasti32x4 m7, [tlq] + or r3d, 8 ; imin(h+7, 15) + vpbroadcastb m8, r3d + pminub m8, m9 + pshufb m7, m8 + test angled, 0x400 + jnz .w8_main + vpbroadcastb ym1, angled + shr angled, 8 + vpcmpeqb k1, ym0, [base+z_filter_wh] + mova xm0, [base+z_filter_t0+angleq*8] + vpcmpgtb k1{k1}, ym1, ym0 + kmovd r5d, k1 + test r5d, r5d + jz .w8_main + vpbroadcastd ym2, [tlq-4] + call .w8_filter + cmp hd, 8 + jle .w8_filter_end + vpbroadcastd m8, [base+pb_17] + add r3d, 2 + pminub m8, m9 +.w8_filter_end: + vpermb m7, m8, m0 +.w8_main: + vbroadcasti32x4 m8, [base+z_xpos_off1a] +.w8_main2: + movsldup m4, [base+z_xpos_mul] + vpbroadcastw m9, dxd + shl r3d, 6 + vpbroadcastd m5, [base+z_xpos_bc+8*0] + pmullw m4, m9 ; xpos + vpbroadcastd m6, [base+z_xpos_bc+8*1] + sub r3d, dxd + shl dxd, 3 + psllw m9, 5 ; dx*8 + lea r2, [strideq*3] +.w8_loop: + psrlw m3, m4, 3 + pshufb m0, m4, m5 + pshufb m1, m4, 
m6 + vpermw m3, m3, m14 + paddsb m0, m8 + paddsb m1, m8 + vpermb m0, m0, m7 + vpermb m1, m1, m7 + paddsw m4, m9 + punpcklqdq m2, m3, m3 + pmaddubsw m0, m2 + punpckhqdq m3, m3 + pmaddubsw m1, m3 + pmulhrsw m0, m15 + pmulhrsw m1, m15 + packuswb m0, m1 + vextracti32x4 xm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r2 ], xm1 + sub hd, 8 + jl .w8_end + vextracti32x8 ym0, m0, 1 + lea dstq, [dstq+strideq*4] + vextracti32x4 xm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r2 ], xm1 + jz .w8_end + lea dstq, [dstq+strideq*4] + sub r3d, dxd + jg .w8_loop + vextracti32x4 xm7, m7, 3 +.w8_end_loop: + movq [dstq+strideq*0], xm7 + movq [dstq+strideq*1], xm7 + movq [dstq+strideq*2], xm7 + movq [dstq+r2 ], xm7 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_end_loop +.w8_end: + RET +.w16_filter: + mova m0, [base+z_filter_s1] + popcnt r5d, r5d + vbroadcasti32x4 m1, [base+z_filter_s2] + vbroadcasti32x4 m3, [base+z_filter_s3] + vbroadcasti32x4 m4, [base+z_filter_s4] + vpermi2b m0, m7, m2 ; al bl + mova m5, [base+z_filter_s5] + pshufb m1, m7, m1 ; ah bh + vpbroadcastd m11, [base+z_filter_k+(r5-1)*4+12*0] + pshufb m3, m7, m3 ; cl ch + vpbroadcastd m12, [base+z_filter_k+(r5-1)*4+12*1] + pshufb m4, m7, m4 ; el dl + vpbroadcastd m13, [base+z_filter_k+(r5-1)*4+12*2] + vpermb m5, m5, m7 ; eh dh + pmaddubsw m0, m11 + pmaddubsw m1, m11 + pmaddubsw m2, m3, m12 + pmaddubsw m3, m13 + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m0, m2 + paddw m1, m3 + paddw m0, m4 + paddw m1, m5 + pmulhrsw m0, m15 + pmulhrsw m1, m15 + packuswb m0, m1 + ret +.w16: + lea r3d, [hq+15] + mova m9, [pb_0to63] + vpbroadcastb ym0, r3d + and r3d, 15 + movu ym7, [tlq] + or r3d, 16 ; imin(h+15, 31) + vpbroadcastb m8, r3d + pminub m8, m9 + vpermb m7, m8, m7 + test angled, 0x400 + jnz .w16_main + vpbroadcastb ym1, angled + shr angled, 8 + vpcmpeqb k1, ym0, [base+z_filter_wh] + mova xm0, 
[base+z_filter_t0+angleq*8] + vpcmpgtb k1{k1}, ym1, ym0 + kmovd r5d, k1 + test r5d, r5d + jz .w16_main + vpbroadcastd m2, [tlq-4] + call .w16_filter + cmp hd, 16 + jle .w16_filter_end + vpbroadcastd m8, [base+pb_33] + add r3d, 2 + pminub m8, m9 +.w16_filter_end: + vpermb m7, m8, m0 +.w16_main: + movshdup m3, [base+z_xpos_mul] + vpbroadcastw m8, dxd + shl r3d, 6 + vpbroadcastd m4, [base+z_xpos_bc] + pmullw m3, m8 ; xpos + vbroadcasti32x4 m5, [base+z_xpos_off1a] + sub r3d, dxd + shl dxd, 2 + vbroadcasti32x4 m6, [base+z_xpos_off1b] + psllw m8, 4 ; dx*4 + lea r2, [strideq*3] +.w16_loop: + pshufb m1, m3, m4 + psrlw m2, m3, 3 + paddsb m0, m1, m5 + vpermw m2, m2, m14 + paddsb m1, m6 + vpermb m0, m0, m7 + vpermb m1, m1, m7 + paddsw m3, m8 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pmulhrsw m0, m15 + pmulhrsw m1, m15 + packuswb m0, m1 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+r2 ], m0, 3 + sub hd, 4 + jz .w16_end + lea dstq, [dstq+strideq*4] + sub r3d, dxd + jg .w16_loop + vextracti32x4 xm7, m7, 3 +.w16_end_loop: + mova [dstq+strideq*0], xm7 + mova [dstq+strideq*1], xm7 + mova [dstq+strideq*2], xm7 + mova [dstq+r2 ], xm7 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16_end_loop +.w16_end: + RET +.w32_filter: + mova m0, [base+z_filter_s1] + vbroadcasti32x4 m1, [base+z_filter_s2] + vbroadcasti32x4 m3, [base+z_filter_s3] + vbroadcasti32x4 m4, [base+z_filter_s4] + vpermi2b m0, m7, m2 ; al bl + mova m5, [base+z_filter_s5] + pshufb m1, m7, m1 ; ah bh + vpbroadcastd m11, [base+z_filter_k+4*2+12*0] + pshufb m3, m7, m3 ; cl ch + vpbroadcastd m12, [base+z_filter_k+4*2+12*1] + pshufb m4, m7, m4 ; el dl + vpbroadcastd m13, [base+z_filter_k+4*2+12*2] + vpermi2b m5, m7, m8 ; eh dh + pmaddubsw m0, m11 + pmaddubsw m1, m11 + pmaddubsw m2, m3, m12 + pmaddubsw m3, m13 + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m0, m2 + paddw m1, m3 + paddw m0, m4 + paddw m1, m5 + pmulhrsw m0, m15 + pmulhrsw m1, m15 + 
packuswb m7, m0, m1 + ret +.w32: + lea r3d, [hq+31] + vpbroadcastb m9, r3d + and r3d, 31 + pminub m10, m9, [pb_0to63] + or r3d, 32 ; imin(h+31, 63) + vpermb m7, m10, [tlq] + vpbroadcastb m8, [tlq+r3] + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w32_main + vpbroadcastd m2, [tlq-4] + call .w32_filter + cmp hd, 64 + je .w32_h64_filter_end + vpermb m8, m9, m7 + vpermb m7, m10, m7 + jmp .w32_main +.w32_h64_filter_end: ; edge case for 32x64 + movd xmm0, [tlq+r3-1] + movd xmm1, [base+pb_8_56_0_0] + add r3d, 2 + pmaddubsw xmm0, xmm1 + vptestmw k1, xmm1, xmm1 ; 0x01 + pmulhrsw xm0, xmm0, xm15 + vmovdqu8 m8{k1}, m0 +.w32_main: + rorx r2d, dxd, 30 + vpbroadcastd m4, [base+z_xpos_bc] + vpbroadcastw m3, r2d + vbroadcasti32x8 m5, [base+z_xpos_off2a] + shl r3d, 6 + vbroadcasti32x8 m6, [base+z_xpos_off2b] + sub r3d, dxd + paddw m9, m3, m3 + add dxd, dxd + vinserti32x8 m3, ym9, 1 +.w32_loop: + pshufb m1, m3, m4 + psrlw m2, m3, 3 + paddsb m0, m1, m5 + vpermw m2, m2, m14 + paddsb m1, m6 + vpermi2b m0, m7, m8 + vpermi2b m1, m7, m8 + paddsw m3, m9 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pmulhrsw m0, m15 + pmulhrsw m1, m15 + packuswb m0, m1 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + sub hd, 2 + jz .w32_end + lea dstq, [dstq+strideq*2] + sub r3d, dxd + jg .w32_loop + punpckhqdq ym8, ym8 +.w32_end_loop: + mova [dstq+strideq*0], ym8 + mova [dstq+strideq*1], ym8 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_end_loop +.w32_end: + RET +.w64_filter: + vbroadcasti32x4 m3, [base+z_filter_s2] + mova m1, [base+z_filter_s1] + pshufb m0, m3 ; al bl + vpermi2b m1, m7, m2 + vbroadcasti32x4 m4, [base+z_filter_s4] + pshufb m6, m8, m4 ; el dl + pshufb m9, m7, m4 + pminub m10, m13, [base+z_filter_s5] + pshufb m2, m8, m3 ; ah bh + pshufb m3, m7, m3 + vbroadcasti32x4 m5, [base+z_filter_s3] + vpermb m10, m10, m8 ; eh dh + pshufb m11, m4 + vpbroadcastd m4, [base+z_filter_k+4*2+12*0] + pshufb m8, m5 ; cl ch + pshufb m7, m5 + vpbroadcastd m5, 
[base+z_filter_k+4*2+12*1] + REPX {pmaddubsw x, m4}, m0, m1, m6, m9, m2, m3, m10, m11 + pmaddubsw m4, m8, m5 + pmaddubsw m5, m7, m5 + paddw m0, m6 + vpbroadcastd m6, [base+z_filter_k+4*2+12*2] + paddw m1, m9 + pmaddubsw m7, m6 + pmaddubsw m8, m6 + paddw m2, m10 + paddw m3, m11 + paddw m0, m4 + paddw m1, m5 + paddw m2, m8 + paddw m3, m7 + REPX {pmulhrsw x, m15}, m0, m2, m1, m3 + packuswb m0, m2 + packuswb m7, m1, m3 + vpermb m8, m12, m0 + ret +.w64: + lea r3d, [hq-1] + movu m7, [tlq+64*0] + vpbroadcastb m13, r3d + pminub m12, m13, [pb_0to63] + or r3d, 64 + vpermb m8, m12, [tlq+64*1] + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w64_main + movu m0, [tlq+56] + vpbroadcastd m2, [tlq-4] + movu m11, [tlq+8] + call .w64_filter +.w64_main: + rorx r2d, dxd, 30 + vpbroadcastd m4, [base+z_xpos_bc] + vpbroadcastw m3, r2d + mova m5, [base+z_xpos_off2a] + shl r3d, 6 + mova m6, [base+z_xpos_off2b] + sub r3d, dxd + mova m9, m3 +.w64_loop: + pshufb m1, m3, m4 + psrlw m2, m3, 3 + paddsb m0, m1, m5 + vpermw m2, m2, m14 + paddsb m1, m6 + vpermi2b m0, m7, m8 + vpermi2b m1, m7, m8 + paddsw m3, m9 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pmulhrsw m0, m15 + pmulhrsw m1, m15 + packuswb m0, m1 + mova [dstq], m0 + dec hd + jz .w64_end + add dstq, strideq + sub r3d, dxd + jg .w64_loop + vpermb m8, m13, m8 +.w64_end_loop: + mova [dstq], m8 + add dstq, strideq + dec hd + jg .w64_end_loop +.w64_end: + RET + +cglobal ipred_z2_8bpc, 3, 9, 18, dst, stride, tl, w, h, angle, dx, _, dy + tzcnt wd, wm + movifnidn angled, anglem + lea dxq, [dr_intra_derivative-90] + movzx dyd, angleb + xor angled, 0x400 + mov r7, dxq + sub dxq, dyq + movifnidn hd, hm + and dyd, ~1 + and dxq, ~1 + movzx dyd, word [r7+dyq] ; angle - 90 + lea r7, [z_filter_t0] + movzx dxd, word [dxq+270] ; 180 - angle + movsxd wq, [base+ipred_z2_8bpc_avx512icl_table+wq*4] + mova m8, [base+pb_63to0] + neg dyd + vpermb m8, m8, [tlq-64] ; left + lea wq, [base+ipred_z2_8bpc_avx512icl_table+wq] + mova m14, [base+z_frac_table] + inc tlq 
+ vpbroadcastd m15, [base+pw_512] + neg dxd + jmp wq +.w4: + movd xm7, [tlq] + vpbroadcastq m10, [base+z_xpos_off2a] + test angled, 0x400 + jnz .w4_main ; !enable_intra_edge_filter + lea r3d, [hq+2] + add angled, 1022 + shl r3d, 6 + test r3d, angled + jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) + vpbroadcastd xm2, [base+pb_4] + sub angled, 1075 ; angle - 53 + call .upsample_above + lea r3d, [hq+3] + vpbroadcastq m10, [pb_0to63+1] + punpcklbw xm7, xm0, xm7 + call .filter_strength + jmp .w4_filter_left +.w4_upsample_left: + call .upsample_left + movsldup m16, [base+z_ypos_off3] + vpbroadcastd m9, [base+pb_16] + punpcklbw xm8, xm0, xm8 + jmp .w4_main2 +.w4_no_upsample_above: + lea r3d, [hq+3] + sub angled, 1112 ; angle - 90 + call .filter_strength + test r3d, r3d + jz .w4_no_filter_above + vpbroadcastd xm5, [base+pb_3] + call .filter_top_w16 +.w4_no_filter_above: + lea r3d, [hq+2] + add angled, 973 ; angle + 883 + shl r3d, 6 + test r3d, angled + jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8) + vpbroadcastd ym0, [base+pb_90] + psubb ym0, ym17 + vpcmpgtb k2{k2}, ym0, ym16 + kmovd r3d, k2 +.w4_filter_left: + test r3d, r3d + jz .w4_main + popcnt r3d, r3d + call .filter_left_h16 +.w4_main: + movsldup m16, [base+z_ypos_off1] + vpbroadcastd m9, [base+pb_8] +.w4_main2: + vpbroadcastq m3, [base+z_ypos_mul1a] + vpbroadcastw m0, dyd + movsldup m1, [base+z_xpos_mul] + vpbroadcastw m5, dxd + vinserti32x4 m7, [tlq-16], 3 + vinserti32x4 m8, [tlq-16], 3 + pmullw m3, m0 + vbroadcasti32x4 m2, [base+z_xpos_bc] + pmullw m1, m5 ; xpos0..3 + psllw m5, 5 ; dx*8 + psraw m4, m3, 6 + psrlw m3, 1 + packsswb m4, m4 + vpermw m3, m3, m14 ; 64-frac, frac + punpcklbw m4, m4 + lea r2, [strideq*3] + paddb m4, m16 ; base, base+1 +.w4_loop: + pshufb m16, m1, m2 + psrlw m0, m1, 3 + paddb m16, m10 + vpermw m0, m0, m14 + vpmovw2m k1, m16 ; base_x < 0 + vpermb m16, m16, m7 + pmaddubsw m16, m0 + vpermb m0, m4, m8 + pmaddubsw m16{k1}, m0, m3 + pmulhrsw m16, m15 + 
vpmovwb ym16, m16 + movd [dstq+strideq*0], xm16 + pextrd [dstq+strideq*1], xm16, 1 + pextrd [dstq+strideq*2], xm16, 2 + pextrd [dstq+r2 ], xm16, 3 + sub hd, 8 + jl .w4_end + paddsw m1, m5 + vextracti128 xm16, ym16, 1 + lea dstq, [dstq+strideq*4] + paddb m4, m9 + movd [dstq+strideq*0], xm16 + pextrd [dstq+strideq*1], xm16, 1 + pextrd [dstq+strideq*2], xm16, 2 + pextrd [dstq+r2 ], xm16, 3 + lea dstq, [dstq+strideq*4] + jg .w4_loop +.w4_end: + RET +.upsample_above: ; w4/w8 + mova xm0, [tlq-1] + xor angled, 0x7f ; 180 - angle + add dxd, dxd + jmp .upsample +.upsample_left: ; h4/h8 + palignr xm0, xm8, [tlq-16], 15 + vpbroadcastb xm2, hd + add dyd, dyd +.upsample: + pshufb xm1, xm0, [base+z_filter4_s1] + pminub xm2, [base+z_filter_s4] + vpbroadcastd xm3, [base+pb_m4_36] + pshufb xm0, xm2 + pmaddubsw xm1, xm3 + pmaddubsw xm0, xm3 + paddw xm0, xm1 + pmulhrsw xm0, xm15 + packuswb xm0, xm0 + ret +.filter_strength: + vpbroadcastb ym16, r3d + mov r3d, angled + vpbroadcastd m2, [tlq-4] + vpbroadcastb ym17, angled + shr r3d, 8 + vpcmpeqb k2, ym16, [base+z_filter_wh] + mova xm16, [base+z_filter_t0+r3*8] + vpcmpgtb k1{k2}, ym17, ym16 + mova m9, [pb_0to63] + kmovd r3d, k1 + ret +.w8: + movq xm7, [tlq] + vbroadcasti32x4 m10, [base+z_xpos_off2a] + test angled, 0x400 + jnz .w8_main + lea r3d, [angleq+126] + mov r3b, hb + cmp r3d, 8 + ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm + vpbroadcastd xm2, [base+pb_8] + sub angled, 53 ; angle - 53 + call .upsample_above + lea r3d, [hq+7] + vbroadcasti32x4 m10, [pb_0to63+1] + punpcklbw xm7, xm0, xm7 + call .filter_strength + jmp .w8_filter_left +.w8_upsample_left: + call .upsample_left + movshdup m16, [base+z_ypos_off3] + vpbroadcastd m9, [base+pb_8] + punpcklbw xm8, xm0, xm8 + jmp .w8_main2 +.w8_no_upsample_above: + lea r3d, [hq+7] + sub angled, 90 ; angle - 90 + call .filter_strength + test r3d, r3d + jz .w8_no_filter_above + vpbroadcastd xm5, [base+pb_7] + call .filter_top_w16 +.w8_no_filter_above: + lea r3d, [angleq-51] + mov 
r3b, hb + cmp r3d, 8 + jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm + vpbroadcastd ym0, [base+pb_90] + psubb ym0, ym17 + vpcmpgtb k2{k2}, ym0, ym16 + kmovd r3d, k2 +.w8_filter_left: + test r3d, r3d + jz .w8_main + cmp hd, 32 + je .w8_filter_left_h32 + popcnt r3d, r3d + call .filter_left_h16 + jmp .w8_main +.w8_filter_left_h32: + call .filter_left_h64 +.w8_main: + movshdup m16, [base+z_ypos_off2] + vpbroadcastd m9, [base+pb_4] +.w8_main2: + vbroadcasti32x4 m3, [base+z_ypos_mul1a] + vpbroadcastw m0, dyd + movshdup m1, [base+z_xpos_mul] + vpbroadcastw m5, dxd + vinserti32x4 m7, [tlq-16], 3 + vinserti32x4 m8, [tlq-16], 3 + pmullw m3, m0 + vpbroadcastd m2, [base+pb_1] + pmullw m1, m5 ; xpos0..3 + psllw m5, 4 ; dx*4 + psraw m4, m3, 6 + psrlw m3, 1 + packsswb m4, m4 + vpermw m3, m3, m14 ; 64-frac, frac + lea r3d, [dxq+(8<<6)] + paddsb m4, m16 + shl dxd, 2 + paddsb m0, m4, m2 + lea r2, [strideq*3] + punpcklbw m4, m0 ; base, base+1 +.w8_loop: + pshufb m16, m1, m2 + psrlw m0, m1, 3 + paddb m16, m10 + vpermw m0, m0, m14 + vpmovw2m k1, m16 ; base_x < 0 + vpermb m16, m16, m7 + pmaddubsw m16, m0 + vpermb m0, m4, m8 + pmaddubsw m16{k1}, m0, m3 + pmulhrsw m16, m15 + vpmovwb ym16, m16 + vextracti128 xm17, ym16, 1 + movq [dstq+strideq*0], xm16 + movhps [dstq+strideq*1], xm16 + movq [dstq+strideq*2], xm17 + movhps [dstq+r2 ], xm17 + sub hd, 4 + jz .w8_end + paddw m1, m5 + lea dstq, [dstq+strideq*4] + paddb m4, m9 + add r3d, dxd + jge .w8_loop +.w8_leftonly_loop: + vpermb m16, m4, m8 + pmaddubsw m16, m3 + paddb m4, m9 + pmulhrsw m16, m15 + vpmovwb ym16, m16 + vextracti128 xm17, ym16, 1 + movq [dstq+strideq*0], xm16 + movhps [dstq+strideq*1], xm16 + movq [dstq+strideq*2], xm17 + movhps [dstq+r2 ], xm17 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_leftonly_loop +.w8_end: + RET +.filter_top_w16: + mova xm0, [base+z_filter_s1] + popcnt r3d, r3d + pminub xm4, xm5, [base+z_filter_s4] + vpermi2b xm0, xm7, xm2 + pminub xm5, [base+z_filter_s5] + pshufb xm1, xm7, 
[base+z_filter_s2] + vpbroadcastd xm11, [base+z_filter_k+(r3-1)*4+12*0] + pshufb xm3, xm7, [base+z_filter_s3] + vpbroadcastd xm12, [base+z_filter_k+(r3-1)*4+12*1] + pshufb xm4, xm7, xm4 + vpbroadcastd xm13, [base+z_filter_k+(r3-1)*4+12*2] + pshufb xm5, xm7, xm5 + pmaddubsw xm0, xm11 + pmaddubsw xm1, xm11 + pmaddubsw xm6, xm3, xm12 + vpbroadcastd xm12, r7m ; max_width + pmaddubsw xm3, xm13 + pmaddubsw xm4, xm11 + pmaddubsw xm5, xm11 + packssdw xm12, xm12 + paddw xm0, xm6 + paddw xm1, xm3 + paddw xm0, xm4 + paddw xm1, xm5 + packsswb xm12, xm12 + pmulhrsw xm0, xm15 + pmulhrsw xm1, xm15 + vpcmpgtb k1, xm12, xm9 ; x < max_width + packuswb xm7{k1}, xm0, xm1 + ret +.filter_left_h16: + lea r5d, [hq-1] + mova xm0, [base+z_filter_s1] + vpbroadcastb xm5, r5d + vpermi2b xm0, xm8, xm2 + pminub xm4, xm5, [base+z_filter_s4] + pshufb xm1, xm8, [base+z_filter_s2] + pminub xm5, [base+z_filter_s5] + pshufb xm3, xm8, [base+z_filter_s3] + vpbroadcastd xm11, [base+z_filter_k+(r3-1)*4+12*0] + pshufb xm4, xm8, xm4 + vpbroadcastd xm12, [base+z_filter_k+(r3-1)*4+12*1] + pshufb xm5, xm8, xm5 + vpbroadcastd xm13, [base+z_filter_k+(r3-1)*4+12*2] + pmaddubsw xm0, xm11 + pmaddubsw xm1, xm11 + pmaddubsw xm6, xm3, xm12 + vpbroadcastd xm12, r8m ; max_height + pmaddubsw xm3, xm13 + pmaddubsw xm4, xm11 + pmaddubsw xm5, xm11 + packssdw xm12, xm12 + paddw xm0, xm6 + paddw xm1, xm3 + paddw xm0, xm4 + paddw xm1, xm5 + packsswb xm12, xm12 + pmulhrsw xm0, xm15 + pmulhrsw xm1, xm15 + vpcmpgtb k1, xm12, xm9 ; y < max_height + packuswb xm8{k1}, xm0, xm1 + ret +.w16: + movu xm7, [tlq] ; top + test angled, 0x400 + jnz .w16_main + lea r3d, [hq+15] + sub angled, 90 + call .filter_strength + test r3d, r3d + jz .w16_no_filter_above + vpbroadcastd xm5, [base+pb_15] + call .filter_top_w16 +.w16_no_filter_above: + cmp hd, 16 + jg .w16_filter_left_h64 + vpbroadcastd ym0, [base+pb_90] + psubb ym0, ym17 + vpcmpgtb k2{k2}, ym0, ym16 + kmovd r3d, k2 + test r3d, r3d + jz .w16_main + popcnt r3d, r3d + call .filter_left_h16 + 
jmp .w16_main +.w16_filter_left_h64: + call .filter_left_h64 +.w16_main: + vbroadcasti32x4 m6, [base+z_ypos_mul1a] ; 1.. 8 + vbroadcasti32x4 m5, [base+z_ypos_mul1b] ; 9..15 + vpbroadcastw m0, dyd + vinserti32x4 m7, [tlq-16], 3 + vpbroadcastd m2, [base+pb_1] + vpbroadcastw m12, dxd + movshdup m1, [base+z_xpos_mul] + pmullw m6, m0 + vbroadcasti32x4 m3, [base+z_xpos_off2a] + pmullw m5, m0 + vbroadcasti32x4 m4, [base+z_xpos_off2b] + pmullw m1, m12 ; xpos0 xpos1 xpos2 xpos3 + vpbroadcastd m9, [base+pb_4] + psllw m12, 4 ; dx*4 + movshdup m16, [base+z_ypos_off2] + psrlw m10, m6, 1 + psrlw m11, m5, 1 + vpermw m10, m10, m14 ; 64-frac, frac + psraw m6, 6 + vpermw m11, m11, m14 + psraw m5, 6 + mov r5d, -(16<<6) ; 15 to avoid top, +1 to avoid topleft + packsswb m6, m5 + mov r3d, 1<<6 + paddsb m6, m16 + sub r5d, dxd ; left-only threshold + paddsb m0, m6, m2 + shl dxd, 2 + punpcklbw m5, m6, m0 ; base, base+1 + lea r2, [strideq*3] + punpckhbw m6, m0 +.w16_loop: + pshufb m17, m1, m2 + psrlw m0, m1, 3 + paddb m16, m3, m17 + vpermw m0, m0, m14 + paddb m17, m4 + vpmovw2m k1, m16 + vpermb m16, m16, m7 + vpmovw2m k2, m17 + vpermb m17, m17, m7 + pmaddubsw m16, m0 + pmaddubsw m17, m0 + add r3d, dxd + jge .w16_toponly + mova m0, m8 + vpermt2b m0, m5, m7 + pmaddubsw m16{k1}, m0, m10 + mova m0, m8 + vpermt2b m0, m6, m7 + pmaddubsw m17{k2}, m0, m11 +.w16_toponly: + pmulhrsw m16, m15 + pmulhrsw m17, m15 + packuswb m16, m17 + mova [dstq+strideq*0], xm16 + vextracti128 [dstq+strideq*1], ym16, 1 + vextracti32x4 [dstq+strideq*2], m16, 2 + vextracti32x4 [dstq+r2 ], m16, 3 + sub hd, 4 + jz .w16_end + paddw m1, m12 + lea dstq, [dstq+strideq*4] + paddb m5, m9 + paddb m6, m9 + cmp r3d, r5d + jge .w16_loop +.w16_leftonly_loop: + vpermb m16, m5, m8 + vpermb m17, m6, m8 + pmaddubsw m16, m10 + pmaddubsw m17, m11 + paddb m5, m9 + paddb m6, m9 + pmulhrsw m16, m15 + pmulhrsw m17, m15 + packuswb m16, m17 + mova [dstq+strideq*0], xm16 + vextracti128 [dstq+strideq*1], ym16, 1 + vextracti32x4 [dstq+strideq*2], 
m16, 2 + vextracti32x4 [dstq+r2 ], m16, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16_leftonly_loop +.w16_end: + RET +.w32: + movu ym7, [tlq] + test angled, 0x400 + jnz .w32_main + vpbroadcastd m2, [tlq-4] + mova ym0, [base+z_filter_s1] + vbroadcasti32x4 ym1, [base+z_filter_s2] + vbroadcasti32x4 ym3, [base+z_filter_s3] + vbroadcasti32x4 ym4, [base+z_filter_s4] + vpermi2b ym0, ym7, ym2 ; al bl + vpbroadcastd ym5, [base+pb_31] + pminub ym5, [base+z_filter_s5] + pshufb ym1, ym7, ym1 ; ah bh + vpbroadcastd ym11, [base+z_filter_k+4*2+12*0] + pshufb ym3, ym7, ym3 ; cl ch + vpbroadcastd ym12, [base+z_filter_k+4*2+12*1] + pshufb ym4, ym7, ym4 ; el dl + vpbroadcastd ym13, [base+z_filter_k+4*2+12*2] + vpermb ym5, ym5, ym7 ; eh dh + pmaddubsw ym0, ym11 + pmaddubsw ym1, ym11 + pmaddubsw ym6, ym3, ym12 + vpbroadcastd ym12, r6m + pmaddubsw ym3, ym13 + pmaddubsw ym4, ym11 + pmaddubsw ym5, ym11 + mova m9, [pb_0to63] + packssdw ym12, ym12 + paddw ym0, ym6 + paddw ym1, ym3 + paddw ym0, ym4 + paddw ym1, ym5 + packsswb ym12, ym12 + pmulhrsw ym0, ym15 + pmulhrsw ym1, ym15 + vpcmpgtb k1, ym12, ym9 ; x < max_width + packuswb ym7{k1}, ym0, ym1 + cmp hd, 16 + jg .w32_filter_h64 + mov r3d, 3 + call .filter_left_h16 + jmp .w32_main +.w32_filter_h64: + call .filter_left_h64 +.w32_main: + vbroadcasti32x8 m6, [base+z_ypos_mul1a] ; 1.. 
8 + vbroadcasti32x8 m5, [base+z_ypos_mul1b] ; 9..15 + vpbroadcastw m0, dyd + vinserti32x4 m7, [tlq-16], 3 + rorx r2q, dxq, 62 ; dx << 2 + vpbroadcastd m2, [base+pb_1] + vpbroadcastw m1, r2d + pmullw m6, m0 + vbroadcasti32x8 m3, [base+z_xpos_off2a] + pmullw m5, m0 + vbroadcasti32x8 m4, [base+z_xpos_off2b] + mova ym0, ym1 + paddw m12, m1, m1 + vpbroadcastd m9, [base+pb_2] + paddw m1, m0 ; xpos1 xpos0 + mova ym0, ym2 + psrlw m10, m6, 1 + psrlw m11, m5, 1 + vpermw m10, m10, m14 ; 64-frac, frac + psraw m6, 6 + vpermw m11, m11, m14 + psraw m5, 6 + mov r5d, -(32<<6) ; 31 to avoid top, +1 to avoid topleft + packsswb m6, m5 + mov r3d, 1<<6 + paddsb m6, m0 + sub r5d, dxd ; left-only threshold + paddsb m0, m6, m2 + add dxd, dxd + punpcklbw m5, m6, m0 ; base, base+1 + punpckhbw m6, m0 +.w32_loop: + pshufb m17, m1, m2 + psrlw m0, m1, 3 + paddb m16, m3, m17 + vpermw m0, m0, m14 + paddb m17, m4 + vpmovw2m k1, m16 + vpermb m16, m16, m7 + vpmovw2m k2, m17 + vpermb m17, m17, m7 + pmaddubsw m16, m0 + pmaddubsw m17, m0 + add r3d, dxd + jge .w32_toponly + mova m0, m8 + vpermt2b m0, m5, m7 + pmaddubsw m16{k1}, m0, m10 + mova m0, m8 + vpermt2b m0, m6, m7 + pmaddubsw m17{k2}, m0, m11 +.w32_toponly: + pmulhrsw m16, m15 + pmulhrsw m17, m15 + packuswb m16, m17 + vextracti32x8 [dstq+strideq*0], m16, 1 + mova [dstq+strideq*1], ym16 + sub hd, 2 + jz .w32_end + paddw m1, m12 + lea dstq, [dstq+strideq*2] + paddb m5, m9 + paddb m6, m9 + cmp r3d, r5d + jge .w32_loop +.w32_leftonly_loop: + vpermb m16, m5, m8 + vpermb m17, m6, m8 + pmaddubsw m16, m10 + pmaddubsw m17, m11 + paddb m5, m9 + paddb m6, m9 + pmulhrsw m16, m15 + pmulhrsw m17, m15 + packuswb m16, m17 + vextracti32x8 [dstq+strideq*0], m16, 1 + mova [dstq+strideq*1], ym16 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_leftonly_loop +.w32_end: + RET +.filter_left_h64: + mova m0, [base+z_filter_s1] + lea r3d, [hq-1] + vbroadcasti32x4 m4, [base+z_filter_s4] + vpbroadcastb m5, r3d + vbroadcasti32x4 m1, [base+z_filter_s2] + vbroadcasti32x4 m3, 
[base+z_filter_s3] + vpermi2b m0, m8, m2 ; al bl + pminub m5, [base+z_filter_s5] + pshufb m1, m8, m1 ; ah bh + vpbroadcastd m11, [base+z_filter_k+4*2+12*0] + pshufb m3, m8, m3 ; cl ch + vpbroadcastd m12, [base+z_filter_k+4*2+12*1] + pshufb m4, m8, m4 ; el dl + vpbroadcastd m13, [base+z_filter_k+4*2+12*2] + vpermb m5, m5, m8 ; eh dh + pmaddubsw m0, m11 + pmaddubsw m1, m11 + pmaddubsw m6, m3, m12 + vpbroadcastd m12, r8m ; max_height + pmaddubsw m3, m13 + pmaddubsw m4, m11 + pmaddubsw m5, m11 + packssdw m12, m12 + paddw m0, m6 + paddw m1, m3 + paddw m0, m4 + paddw m1, m5 + packsswb m12, m12 + pmulhrsw m0, m15 + pmulhrsw m1, m15 + vpcmpgtb k1, m12, m9 ; y < max_height + packuswb m8{k1}, m0, m1 + ret +.w64: + movu m7, [tlq] + test angled, 0x400 + jnz .w64_main + vpbroadcastd m2, [tlq-4] + mova m0, [base+z_filter_s1] + vbroadcasti32x4 m1, [base+z_filter_s2] + vbroadcasti32x4 m3, [base+z_filter_s3] + vbroadcasti32x4 m4, [base+z_filter_s4] + vpermi2b m0, m7, m2 ; al bl + vpbroadcastd m5, [base+pb_63] + pminub m5, [base+z_filter_s5] + pshufb m1, m7, m1 ; ah bh + vpbroadcastd m11, [base+z_filter_k+4*2+12*0] + pshufb m3, m7, m3 ; cl ch + vpbroadcastd m12, [base+z_filter_k+4*2+12*1] + pshufb m4, m7, m4 ; el dl + vpbroadcastd m13, [base+z_filter_k+4*2+12*2] + vpermb m5, m5, m7 ; eh dh + pmaddubsw m0, m11 + pmaddubsw m1, m11 + pmaddubsw m6, m3, m12 + vpbroadcastd m12, r6m + pmaddubsw m3, m13 + pmaddubsw m4, m11 + pmaddubsw m5, m11 + mova m9, [pb_0to63] + packssdw m12, m12 + paddw m0, m6 + paddw m1, m3 + paddw m0, m4 + paddw m1, m5 + packsswb m12, m12 + pmulhrsw m0, m15 + pmulhrsw m1, m15 + vpcmpgtb k1, m12, m9 ; x < max_width + packuswb m7{k1}, m0, m1 + call .filter_left_h64 ; always filter the full 64 pixels for simplicity +.w64_main: + vpbroadcastw m5, dyd + vpbroadcastd m9, [tlq-4] + rorx r2q, dxq, 62 ; dx << 2 + pmullw m6, m5, [base+z_ypos_mul1a] ; can overflow, but it doesn't matter as such + pmullw m5, [base+z_ypos_mul1b] ; pixels aren't selected from the left edge + 
; Remainder of the .w64_main setup and row loops of the function that
; starts above this point (its cglobal line is not part of this span).
    vpbroadcastw    m1, r2d                         ; xpos
    mova            m3, [base+z_xpos_off2a]
    mova            m4, [base+z_xpos_off2b]
    mova            m12, m1                         ; per-row xpos increment (added to m1 each row)
    vpbroadcastd    m2, [base+pb_1]
    psrlw           m10, m6, 1
    psrlw           m11, m5, 1
    vpermw          m10, m10, m14                   ; 64-frac, frac
    psraw           m6, 6
    vpermw          m11, m11, m14
    psraw           m5, 6
    mov             r5d, -(64<<6)                   ; 63 to avoid top, +1 to avoid topleft
    packsswb        m6, m5
    mov             r3d, 1<<6
    paddsb          m0, m6, m2
    sub             r5d, dxd                        ; left-only threshold
    punpcklbw       m5, m6, m0                      ; base, base+1
    punpckhbw       m6, m0
.w64_loop:
    ; One row per iteration; r3d accumulates dxd and selects which path runs.
    pshufb          m17, m1, m2
    psrlw           m0, m1, 3
    paddb           m16, m3, m17
    vpermw          m0, m0, m14
    paddb           m17, m4
    vpmovw2m        k1, m16                         ; base_x < 0
    vpermi2b        m16, m7, m9
    vpmovw2m        k2, m17
    vpermi2b        m17, m7, m9
    pmaddubsw       m16, m0
    pmaddubsw       m17, m0
    add             r3d, dxd
    jge .w64_toponly
    ; Lanes flagged in k1/k2 are overwritten with the m8/m9-based result.
    mova            m0, m8
    vpermt2b        m0, m5, m9
    pmaddubsw       m16{k1}, m0, m10
    mova            m0, m8
    vpermt2b        m0, m6, m9
    pmaddubsw       m17{k2}, m0, m11
.w64_toponly:
    pmulhrsw        m16, m15
    pmulhrsw        m17, m15
    packuswb        m16, m17
    mova            [dstq], m16
    dec             hd
    jz .w64_end
    paddw           m1, m12
    add             dstq, strideq
    paddb           m5, m2
    paddb           m6, m2
    cmp             r3d, r5d
    jge .w64_loop
.w64_leftonly_loop:
    ; Below the threshold in r5d every pixel is taken from m8 only.
    vpermb          m16, m5, m8
    vpermb          m17, m6, m8
    pmaddubsw       m16, m10
    pmaddubsw       m17, m11
    paddb           m5, m2
    paddb           m6, m2
    pmulhrsw        m16, m15
    pmulhrsw        m17, m15
    packuswb        m16, m17
    mova            [dstq], m16
    add             dstq, strideq
    dec             hd
    jg .w64_leftonly_loop
.w64_end:
    RET

;-----------------------------------------------------------------------
; Z3_PERM2 - shared row kernel of the .w16/.w32/.w64 loops below.
; In:    m7, m8  = the two byte tables permuted by vpermt2b
;        m2, m3  = byte index vectors ("base, base+1" pairs)
;        m4, m5  = weight vectors ("64-frac, frac" pairs)
;        m6      = per-iteration index step
;        m15     = pmulhrsw multiplier (pw_512)
; Out:   m0      = packed unsigned bytes
; Clobb: m1; m2 and m3 are advanced by m6 (saturating add).
;-----------------------------------------------------------------------
%macro Z3_PERM2 0
    mova            m0, m7
    vpermt2b        m0, m2, m8
    mova            m1, m7
    vpermt2b        m1, m3, m8
    pmaddubsw       m0, m4
    pmaddubsw       m1, m5
    paddsb          m2, m6
    paddsb          m3, m6
    pmulhrsw        m0, m15
    pmulhrsw        m1, m15
    packuswb        m0, m1
%endmacro

;-----------------------------------------------------------------------
; ipred_z3_8bpc
; Named arguments: dst, stride, tl, w, h, angle, dy.
; The prologue dispatches on tzcnt(w) through
; ipred_z3_8bpc_avx512icl_table to .w4/.w8/.w16/.w32/.w64.
; Set up before the dispatch and relied on by every width path:
;   m0  = pb_63to0
;   m14 = z_frac_table
;   m15 = pw_512
;   dyd = 16-bit entry read relative to dr_intra_derivative, << 6
;   angled has bit 0x400 flipped; the paths test that bit
;   (annotated below as !enable_intra_edge_filter).
; NOTE(review): `base` and `t0` are defined outside this span;
; presumably `base` is expressed relative to r7 (= z_filter_t0) - confirm.
;-----------------------------------------------------------------------
cglobal ipred_z3_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy
    lea             r7, [z_filter_t0]
    tzcnt           wd, wm
    movifnidn       angled, anglem
    lea             t0, [dr_intra_derivative+45*2-1]
    movsxd          wq, [base+ipred_z3_8bpc_avx512icl_table+wq*4]
    sub             angled, 180
    mov             dyd, angled
    neg             dyd
    xor             angled, 0x400
    or              dyq, ~0x7e
    mova            m0, [base+pb_63to0]
    movzx           dyd, word [t0+dyq]
    lea             wq, [base+ipred_z3_8bpc_avx512icl_table+wq]
    movifnidn       hd, hm
    mova            m14, [base+z_frac_table]
    shl             dyd, 6
    vpbroadcastd    m15, [base+pw_512]
    jmp             wq

; ---- width 4 --------------------------------------------------------
.w4:
    cmp             angleb, 40
    jae .w4_no_upsample
    lea             r3d, [angleq-1024]
    sar             r3d, 7
    add             r3d, hd
    jg .w4_no_upsample                              ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
    lea             r3d, [hq+4]
    call .upsample
    movshdup        m1, [base+z_ypos_off1]
    vpbroadcastd    m6, [base+pb_16]
    jmp .w4_main2
.w4_no_upsample:
    lea             r3d, [hq+3]
    vpbroadcastb    m9, r3d
    vpxord          m1, m9, [base+pb_63] {1to16}    ; 63 - (h + 4)
    pmaxub          m1, m0
    vpermb          m7, m1, [tlq-64*1]              ; gather edge bytes below tl using indices in m1
    test            angled, 0x400                   ; !enable_intra_edge_filter
    jnz .w4_main
    ; Inline variant of the .filter_strength test: r5d != 0 requests filtering.
    vpbroadcastb    xm1, angled
    shr             angled, 8
    vpcmpeqb        k1, xm9, [base+z_filter_wh]
    vpbroadcastd    m2, [tlq-3]
    vpcmpgtb        k1{k1}, xm1, [base+z_filter_t0+angleq*8]
    kmovw           r5d, k1
    test            r5d, r5d
    jz .w4_main
    pminub          m9, [pb_0to63]
    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w8_filter
    vpermb          m7, m9, m0
.w4_main:
    movsldup        m1, [base+z_ypos_off1]
    vpbroadcastd    m6, [base+pb_8]
.w4_main2:
    vpbroadcastw    m0, dyd
    vpbroadcastq    m2, [base+z_ypos_mul2a]         ; 1..4
    pmulhuw         m2, m0                          ; ypos >> 1
    lea             r2, [strideq*3]                 ; r2 (tl) is reused as stride*3 from here on
    vpermw          m3, m2, m14                     ; 64-frac, frac
    psrlw           m2, 5
    packsswb        m2, m2
    punpcklbw       m2, m2
    paddsb          m2, m1                          ; base, base+1
.w4_loop:
    ; Produces up to eight 4-byte rows per iteration.
    vpermb          m0, m2, m7
    pmaddubsw       m0, m3
    paddsb          m2, m6
    pmulhrsw        m0, m15
    vpmovwb         ym0, m0
    movd            [dstq+strideq*0], xm0
    pextrd          [dstq+strideq*1], xm0, 1
    pextrd          [dstq+strideq*2], xm0, 2
    pextrd          [dstq+r2], xm0, 3
    sub             hd, 8
    jl .w4_end                                      ; fewer than 8 rows were left: done after the first four
    vextracti32x4   xm0, ym0, 1
    lea             dstq, [dstq+strideq*4]
    movd            [dstq+strideq*0], xm0
    pextrd          [dstq+strideq*1], xm0, 1
    pextrd          [dstq+strideq*2], xm0, 2
    pextrd          [dstq+r2], xm0, 3
    lea             dstq, [dstq+strideq*4]          ; lea keeps the flags of `sub hd, 8` intact
    jg .w4_loop
.w4_end:
    RET

; ---- helper: edge upsampling (called from .w4 and .w8) ----------------
; In:    r3d = value annotated by the callers' arithmetic as h + imin(w, h)
;        tlq, ym15 (pw_512)
; Out:   ym7 = edge bytes interleaved with the filtered bytes
;        dyd is doubled
; Clobb: ym0, ym1, ym2, r3d
.upsample:
    xor             r3d, 31                         ; 31 - (h + imin(w, h))
    vbroadcasti32x4 ym0, [base+z_xpos_off2a]
    vpbroadcastb    ym7, r3d
    pmaxub          ym7, [base+z3_upsample]
    vbroadcasti32x4 ym1, [base+z_filter_s4]
    vpermb          ym7, ym7, [tlq-31]
    vpbroadcastd    ym2, [base+pb_m4_36]
    pshufb          ym0, ym7, ym0
    psrldq          ym7, 1
    pshufb          ym1, ym7, ym1
    pmaddubsw       ym0, ym2
    pmaddubsw       ym1, ym2
    add             dyd, dyd
    paddw           ym0, ym1
    pmulhrsw        ym0, ym15
    packuswb        ym0, ym0
    punpcklbw       ym7, ym0
    ret

; ---- width 8 --------------------------------------------------------
.w8:
    lea             r3d, [angleq+216]
    mov             r3b, hb
    cmp             r3d, 8
    ja .w8_no_upsample                              ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
    lea             r3d, [hq*2]
    call .upsample
    pshufd          m1, [base+z_ypos_off1], q0000
    vpbroadcastd    m6, [base+pb_8]
    jmp .w8_main2
.w8_no_upsample:
    mov             r3d, 8
    cmp             hd, 4
    cmove           r3d, hd
    lea             r3d, [r3+hq-1]
    xor             r3d, 63                         ; 63 - (h + imin(w, h))
    vpbroadcastb    m1, wd                          ; w is the fourth named argument, i.e. the same register as r3
    pmaxub          m1, m0
    vpermb          m7, m1, [tlq-64*1]
    test            angled, 0x400                   ; !enable_intra_edge_filter
    jnz .w8_main
    lea             r3d, [hq+7]
    call .filter_strength
    test            r5d, r5d
    jz .w8_main
    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w16_filter
    vpermb          m7, m10, m0
.w8_main:
    movsldup        m1, [base+z_ypos_off2]
    vpbroadcastd    m6, [base+pb_4]
.w8_main2:
    vpbroadcastw    m0, dyd
    vbroadcasti32x4 m2, [base+z_ypos_mul2a]         ; 1..8
    pmulhuw         m2, m0                          ; ypos >> 1
    lea             r2, [strideq*3]
    vpermw          m3, m2, m14                     ; 64-frac, frac
    psrlw           m2, 5
    packsswb        m2, m2
    punpcklbw       m2, m2
    paddsb          m2, m1                          ; base, base+1
.w8_loop:
    ; Four 8-byte rows per iteration.
    vpermb          m0, m2, m7
    pmaddubsw       m0, m3
    paddsb          m2, m6
    pmulhrsw        m0, m15
    vpmovwb         ym0, m0
    vextracti32x4   xm1, ym0, 1
    movq            [dstq+strideq*0], xm0
    movhps          [dstq+strideq*1], xm0
    movq            [dstq+strideq*2], xm1
    movhps          [dstq+r2], xm1
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .w8_loop
    RET

; ---- helper: filter-strength test -------------------------------------
; .filter_strength additionally loads m2 from [tlq-3];
; .filter_strength2 expects the caller to have done so.
; In:    r3d, angled
; Out:   r5d = k1 mask (callers skip the edge filter when it is zero)
;        m9  = r3d broadcast to bytes, m10 = min(m9, pb_0to63)
; Clobb: ym1, xm0, k1; angled is shifted right by 8
.filter_strength:
    vpbroadcastd    m2, [tlq-3]
.filter_strength2:
    vpbroadcastb    m9, r3d
    vpbroadcastb    ym1, angled
    shr             angled, 8
    vpcmpeqb        k1, ym9, [base+z_filter_wh]
    mova            xm0, [base+z_filter_t0+angleq*8]
    vpcmpgtb        k1{k1}, ym1, ym0
    pminub          m10, m9, [pb_0to63]
    kmovd           r5d, k1
    ret

; ---- helper: edge load shared by .w16 and .w32 ------------------------
; In:    r3d = block width (16 or 32), hd, tlq, m0 (pb_63to0)
; Out:   m7, m8 = permuted edge bytes read below tl
; Clobb: m1, m2, r3d
.w16_load:
    cmp             r3d, hd
    cmovae          r3d, hd                         ; r3d = imin(w, h)
    add             r3d, hd
    mova            m7, [tlq-64*1]
    neg             r3d                             ; -(h + imin(w, h))
    and             r3d, 63
    vpbroadcastb    m1, r3d
    pmaxub          m2, m0, m1
    cmp             hd, 64
    je .w16_load_h64
    vpermb          m8, m1, m7
    vpermb          m7, m2, m7
    ret
.w16_load_h64:
    ; h == 64: the second table comes from the next 64 bytes further below tl.
    vpermb          m7, m0, m7
    vpermb          m8, m2, [tlq-64*2]
    ret

; ---- width 16 -------------------------------------------------------
.w16:
    mov             r3d, 16
    call .w16_load
    test            angled, 0x400                   ; !enable_intra_edge_filter
    jnz .w16_main
    vpbroadcastd    m2, [tlq-3]
    cmp             hd, 64
    je .w16_filter64
    lea             r3d, [hq+15]
    call .filter_strength2
    test            r5d, r5d
    jz .w16_main
    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w16_filter
    pminub          m10, m9, [pb_0to63]
    vpermb          m8, m9, m0
    vpermb          m7, m10, m0
    jmp .w16_main
.w16_filter64:
    vpbroadcastd    m13, [base+pb_15]
    valignq         m0, m8, m7, 7
    pminub          m12, m13, [pb_0to63]
    valignq         m11, m8, m7, 1
    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
.w16_main:
    vbroadcasti32x4 m3, [base+z_ypos_mul2a]         ; 1.. 8
    vbroadcasti32x4 m2, [base+z_ypos_mul2b]         ; 9..15
    vpbroadcastw    m0, dyd
    vpbroadcastd    m6, [base+pb_4]
    pmulhuw         m3, m0                          ; ypos >> 1
    pmulhuw         m2, m0
    movshdup        m0, [base+z_ypos_off2]
    lea             r2, [strideq*3]
    vpbroadcastd    m1, [base+pb_1]
    vpermw          m4, m3, m14                     ; 64-frac, frac
    psrlw           m3, 5
    vpermw          m5, m2, m14
    psrlw           m2, 5
    packsswb        m3, m2
    paddsb          m3, m0
    paddsb          m1, m3
    punpcklbw       m2, m3, m1                      ; base, base+1
    punpckhbw       m3, m1
.w16_loop:
    ; Four 16-byte rows per iteration, one per 128-bit lane of m0.
    Z3_PERM2
    mova            [dstq+strideq*0], xm0
    vextracti32x4   [dstq+strideq*1], ym0, 1
    vextracti32x4   [dstq+strideq*2], m0, 2
    vextracti32x4   [dstq+r2], m0, 3
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .w16_loop
    RET

; ---- width 32 -------------------------------------------------------
.w32:
    mov             r3d, 32
    call .w16_load
    test            angled, 0x400                   ; !enable_intra_edge_filter
    jnz .w32_main
    vpbroadcastd    m2, [tlq-3]
    cmp             hd, 64
    je .w32_filter64
    lea             r3d, [hq+31]
    vpbroadcastb    m9, r3d
    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w32_filter
    vpermb          m8, m9, m7
    jmp .w32_main
.w32_filter64:
    vpbroadcastd    m13, [base+pb_31]
    valignq         m0, m8, m7, 7
    pminub          m12, m13, [pb_0to63]
    valignq         m11, m8, m7, 1
    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
.w32_main:
    vbroadcasti32x8 m3, [base+z_ypos_mul2a]         ; 1.. 8
    vbroadcasti32x8 m2, [base+z_ypos_mul2b]         ; 9..15
    vpbroadcastw    m0, dyd
    vpbroadcastd    m1, [base+pb_1]
    pmulhuw         m3, m0                          ; ypos >> 1
    pmulhuw         m2, m0
    vpbroadcastd    m6, [base+pb_2]
    mova            ym0, ym1
    vpermw          m4, m3, m14                     ; 64-frac, frac
    psrlw           m3, 5
    vpermw          m5, m2, m14
    psrlw           m2, 5
    packsswb        m3, m2
    paddsb          m3, m0
    paddsb          m1, m3
    punpcklbw       m2, m3, m1                      ; base, base+1
    punpckhbw       m3, m1
.w32_loop:
    ; Two 32-byte rows per iteration: upper half of m0 first, then the lower half.
    Z3_PERM2
    vextracti32x8   [dstq+strideq*0], m0, 1
    mova            [dstq+strideq*1], ym0
    lea             dstq, [dstq+strideq*2]
    sub             hd, 2
    jg .w32_loop
    RET

; ---- width 64 -------------------------------------------------------
.w64:
    mova            m7, [tlq-64*1]
    cmp             hd, 64
    je .w64_h64
    lea             r3d, [hq*2-1]
    xor             r3d, 63                         ; -(h + imin(w, h)) & 63
    vpbroadcastb    m1, r3d
    pmaxub          m0, m1
    vpermb          m8, m1, m7
    jmp .w64_filter
.w64_h64:
    vpermb          m8, m0, [tlq-64*2]
.w64_filter:
    vpermb          m7, m0, m7
    test            angled, 0x400                   ; !enable_intra_edge_filter
    jnz .w64_main
    lea             r3d, [hq-1]
    vpbroadcastd    m2, [tlq-3]
    vpbroadcastb    m13, r3d
    valignq         m0, m8, m7, 7
    pminub          m12, m13, [pb_0to63]
    valignq         m11, m8, m7, 1
    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
.w64_main:
    vpbroadcastw    m2, dyd
    pmulhuw         m3, m2, [base+z_ypos_mul2a]
    pmulhuw         m2, [base+z_ypos_mul2b]
    vpbroadcastd    m6, [base+pb_1]
    vpermw          m4, m3, m14                     ; 64-frac, frac
    psrlw           m3, 5
    vpermw          m5, m2, m14
    psrlw           m2, 5
    packsswb        m3, m2
    paddsb          m1, m3, m6
    punpcklbw       m2, m3, m1                      ; base, base+1
    punpckhbw       m3, m1
.w64_loop:
    ; One 64-byte row per iteration.
    Z3_PERM2
    mova            [dstq], m0
    add             dstq, strideq
    dec             hd
    jg .w64_loop
    RET

; The ipred_filter code processes 4x2 blocks in the following order
; which increases parallelism compared to doing things row by row.
; Some redundant blocks are calculated for w > 4. |