/*
 *  Copyright (c) 2021 Loongson Technology Corporation Limited
 *  Contributed by Lu Wang
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vp8_rtcd.h"
#include "vp8/common/loopfilter.h"
#include "vpx_util/loongson_intrinsics.h"

/* Four-tap VP8 loop filter applied across one edge: p1 p0 | q0 q1.
 * Pixels are biased into signed range (xor 0x80) before the arithmetic
 * and biased back afterwards.  The filter value is
 *   a = clamp(clamp(p1 - q1) & hev + 3 * (q0 - p0)) & mask,
 * with clamp(a + 4) >> 3 subtracted from q0, clamp(a + 3) >> 3 added to
 * p0, and the rounded half of the q0 term applied to p1/q1 on lanes
 * where hev is not set. */
#define VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev)         \
  do {                                                        \
    __m128i p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2;  \
    const __m128i cnst4b = __lsx_vldi(4);                     \
    const __m128i cnst3b = __lsx_vldi(3);                     \
                                                              \
    p1_m = __lsx_vxori_b(p1, 0x80);                           \
    p0_m = __lsx_vxori_b(p0, 0x80);                           \
    q0_m = __lsx_vxori_b(q0, 0x80);                           \
    q1_m = __lsx_vxori_b(q1, 0x80);                           \
                                                              \
    filt = __lsx_vssub_b(p1_m, q1_m);                         \
    filt = __lsx_vand_v(filt, hev);                           \
    q0_sub_p0 = __lsx_vssub_b(q0_m, p0_m);                    \
    filt = __lsx_vsadd_b(filt, q0_sub_p0);                    \
    filt = __lsx_vsadd_b(filt, q0_sub_p0);                    \
    filt = __lsx_vsadd_b(filt, q0_sub_p0);                    \
    filt = __lsx_vand_v(filt, mask);                          \
    t1 = __lsx_vsadd_b(filt, cnst4b);                         \
    t1 = __lsx_vsra_b(t1, cnst3b);                            \
    t2 = __lsx_vsadd_b(filt, cnst3b);                         \
    t2 = __lsx_vsra_b(t2, cnst3b);                            \
    q0_m = __lsx_vssub_b(q0_m, t1);                           \
    q0 = __lsx_vxori_b(q0_m, 0x80);                           \
    p0_m = __lsx_vsadd_b(p0_m, t2);                           \
    p0 = __lsx_vxori_b(p0_m, 0x80);                           \
    filt = __lsx_vsrari_b(t1, 1);                             \
    hev = __lsx_vxori_b(hev, 0xff);                           \
    filt = __lsx_vand_v(filt, hev);                           \
    q1_m = __lsx_vssub_b(q1_m, filt);                         \
    q1 = __lsx_vxori_b(q1_m, 0x80);                           \
    p1_m = __lsx_vsadd_b(p1_m, filt);                         \
    p1 = __lsx_vxori_b(p1_m, 0x80);                           \
  } while (0)
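/* VP8_MBFILTER is the wide correction used on macroblock edges.  After a
 * hev-masked four-tap adjustment of p0/q0 (t1/t2 below), the remaining
 * filter value w = filt & ~hev is widened to 16 bits and applied as the
 * ramp from the VP8 normal-filter reference algorithm (RFC 6386):
 *
 *   u = clamp((9 * w + 63) >> 7);   q2 -= u;  p2 += u;
 *   u = clamp((18 * w + 63) >> 7);  q1 -= u;  p1 += u;
 *   u = clamp((27 * w + 63) >> 7);  q0 -= u;  p0 += u;
 *
 * temp1/temp3 gain another 9 * w on each step, so the three ramps cost two
 * vector adds each, and __lsx_vssrani_b_h performs the >> 7 together with
 * the saturating narrow back to bytes. */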
#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev)  \
  do {                                                   \
    __m128i p2_m, p1_m, p0_m, q2_m, q1_m, q0_m;          \
    __m128i u, filt, t1, t2, filt_sign, q0_sub_p0;       \
    __m128i filt_r, filt_l;                              \
    __m128i temp0, temp1, temp2, temp3;                  \
    const __m128i cnst4b = __lsx_vldi(4);                \
    const __m128i cnst3b = __lsx_vldi(3);                \
    const __m128i cnst9h = __lsx_vldi(1033);             \
    const __m128i cnst63h = __lsx_vldi(1087);            \
                                                         \
    p2_m = __lsx_vxori_b(p2, 0x80);                      \
    p1_m = __lsx_vxori_b(p1, 0x80);                      \
    p0_m = __lsx_vxori_b(p0, 0x80);                      \
    q0_m = __lsx_vxori_b(q0, 0x80);                      \
    q1_m = __lsx_vxori_b(q1, 0x80);                      \
    q2_m = __lsx_vxori_b(q2, 0x80);                      \
                                                         \
    filt = __lsx_vssub_b(p1_m, q1_m);                    \
    q0_sub_p0 = __lsx_vssub_b(q0_m, p0_m);               \
    filt = __lsx_vsadd_b(filt, q0_sub_p0);               \
    filt = __lsx_vsadd_b(filt, q0_sub_p0);               \
    filt = __lsx_vsadd_b(filt, q0_sub_p0);               \
    filt = __lsx_vand_v(filt, mask);                     \
                                                         \
    t2 = __lsx_vand_v(filt, hev);                        \
    hev = __lsx_vxori_b(hev, 0xff);                      \
    filt = __lsx_vand_v(hev, filt);                      \
    t1 = __lsx_vsadd_b(t2, cnst4b);                      \
    t1 = __lsx_vsra_b(t1, cnst3b);                       \
    t2 = __lsx_vsadd_b(t2, cnst3b);                      \
    t2 = __lsx_vsra_b(t2, cnst3b);                       \
    q0_m = __lsx_vssub_b(q0_m, t1);                      \
    p0_m = __lsx_vsadd_b(p0_m, t2);                      \
    filt_sign = __lsx_vslti_b(filt, 0);                  \
    filt_r = __lsx_vilvl_b(filt_sign, filt);             \
    filt_l = __lsx_vilvh_b(filt_sign, filt);             \
    temp0 = __lsx_vmul_h(filt_r, cnst9h);                \
    temp1 = __lsx_vadd_h(temp0, cnst63h);                \
    temp2 = __lsx_vmul_h(filt_l, cnst9h);                \
    temp3 = __lsx_vadd_h(temp2, cnst63h);                \
                                                         \
    u = __lsx_vssrani_b_h(temp3, temp1, 7);              \
    q2_m = __lsx_vssub_b(q2_m, u);                       \
    p2_m = __lsx_vsadd_b(p2_m, u);                       \
    q2 = __lsx_vxori_b(q2_m, 0x80);                      \
    p2 = __lsx_vxori_b(p2_m, 0x80);                      \
                                                         \
    temp1 = __lsx_vadd_h(temp1, temp0);                  \
    temp3 = __lsx_vadd_h(temp3, temp2);                  \
                                                         \
    u = __lsx_vssrani_b_h(temp3, temp1, 7);              \
    q1_m = __lsx_vssub_b(q1_m, u);                       \
    p1_m = __lsx_vsadd_b(p1_m, u);                       \
    q1 = __lsx_vxori_b(q1_m, 0x80);                      \
    p1 = __lsx_vxori_b(p1_m, 0x80);                      \
                                                         \
    temp1 = __lsx_vadd_h(temp1, temp0);                  \
    temp3 = __lsx_vadd_h(temp3, temp2);                  \
                                                         \
    u = __lsx_vssrani_b_h(temp3, temp1, 7);              \
    q0_m = __lsx_vssub_b(q0_m, u);                       \
    p0_m = __lsx_vsadd_b(p0_m, u);                       \
    q0 = __lsx_vxori_b(q0_m, 0x80);                      \
    p0 = __lsx_vxori_b(p0_m, 0x80);                      \
  } while (0)

/* Computes the filter on/off mask and the high-edge-variance mask.  A lane
 * is filtered (mask_out = 0xff) when every neighbouring difference is at
 * most limit_in and |p0 - q0| * 2 + |p1 - q1| / 2 is at most b_limit_in;
 * hev_out is set where |p1 - p0| or |q1 - q0| exceeds thresh_in. */
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
                     limit_in, b_limit_in, thresh_in, hev_out, mask_out,     \
                     flat_out)                                               \
  do {                                                                       \
    __m128i p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;          \
    __m128i p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;          \
                                                                             \
    p3_asub_p2_m = __lsx_vabsd_bu(p3_in, p2_in);                             \
    p2_asub_p1_m = __lsx_vabsd_bu(p2_in, p1_in);                             \
    p1_asub_p0_m = __lsx_vabsd_bu(p1_in, p0_in);                             \
    q1_asub_q0_m = __lsx_vabsd_bu(q1_in, q0_in);                             \
    q2_asub_q1_m = __lsx_vabsd_bu(q2_in, q1_in);                             \
    q3_asub_q2_m = __lsx_vabsd_bu(q3_in, q2_in);                             \
    p0_asub_q0_m = __lsx_vabsd_bu(p0_in, q0_in);                             \
    p1_asub_q1_m = __lsx_vabsd_bu(p1_in, q1_in);                             \
    flat_out = __lsx_vmax_bu(p1_asub_p0_m, q1_asub_q0_m);                    \
    hev_out = __lsx_vslt_bu(thresh_in, flat_out);                            \
    p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p0_asub_q0_m);               \
    p1_asub_q1_m = __lsx_vsrli_b(p1_asub_q1_m, 1);                           \
    p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p1_asub_q1_m);               \
    mask_out = __lsx_vslt_bu(b_limit_in, p0_asub_q0_m);                      \
    mask_out = __lsx_vmax_bu(flat_out, mask_out);                            \
    p3_asub_p2_m = __lsx_vmax_bu(p3_asub_p2_m, p2_asub_p1_m);                \
    mask_out = __lsx_vmax_bu(p3_asub_p2_m, mask_out);                        \
    q2_asub_q1_m = __lsx_vmax_bu(q2_asub_q1_m, q3_asub_q2_m);                \
    mask_out = __lsx_vmax_bu(q2_asub_q1_m, mask_out);                        \
    mask_out = __lsx_vslt_bu(limit_in, mask_out);                            \
    mask_out = __lsx_vxori_b(mask_out, 0xff);                                \
  } while (0)

/* Stores six filtered bytes of one transposed row: a 4-byte element of in0
 * at pdst and a 2-byte element of in1 at pdst + stride (stride is passed
 * as 4 by the callers). */
#define VP8_ST6x1_B(in0, in0_idx, in1, in1_idx, pdst, stride) \
  do {                                                        \
    __lsx_vstelm_w(in0, pdst, 0, in0_idx);                    \
    __lsx_vstelm_h(in1, pdst + stride, 0, in1_idx);           \
  } while (0)
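/* The *_4_dual routines filter two adjacent 8-pixel edges in one pass: each
 * __m128i covers 16 pixels along the edge and the two sets of thresholds
 * are packed into the low/high 64-bit halves with __lsx_vilvl_d.  The
 * b-edge entry points at the end of this file pass identical limits for
 * both halves, which reduces this to one 16-wide filter. */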
static void loop_filter_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch,
                                              const uint8_t *b_limit0_ptr,
                                              const uint8_t *limit0_ptr,
                                              const uint8_t *thresh0_ptr,
                                              const uint8_t *b_limit1_ptr,
                                              const uint8_t *limit1_ptr,
                                              const uint8_t *thresh1_ptr) {
  int32_t pitch_x2 = pitch << 1;
  int32_t pitch_x3 = pitch_x2 + pitch;
  int32_t pitch_x4 = pitch << 2;
  __m128i mask, hev, flat;
  __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
  __m128i p3, p2, p1, p0, q3, q2, q1, q0;

  DUP4_ARG2(__lsx_vldx, src, -pitch_x4, src, -pitch_x3, src, -pitch_x2, src,
            -pitch, p3, p2, p1, p0);
  q0 = __lsx_vld(src, 0);
  DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch_x2, q1, q2);
  q3 = __lsx_vldx(src, pitch_x3);

  thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0);
  thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0);
  thresh0 = __lsx_vilvl_d(thresh1, thresh0);

  b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0);
  b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0);
  b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0);

  limit0 = __lsx_vldrepl_b(limit0_ptr, 0);
  limit1 = __lsx_vldrepl_b(limit1_ptr, 0);
  limit0 = __lsx_vilvl_d(limit1, limit0);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
               mask, flat);
  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

  __lsx_vstx(p1, src, -pitch_x2);
  __lsx_vstx(p0, src, -pitch);
  __lsx_vst(q0, src, 0);
  __lsx_vstx(q1, src, pitch);
}

static void loop_filter_vertical_4_dual_lsx(uint8_t *src, int32_t pitch,
                                            const uint8_t *b_limit0_ptr,
                                            const uint8_t *limit0_ptr,
                                            const uint8_t *thresh0_ptr,
                                            const uint8_t *b_limit1_ptr,
                                            const uint8_t *limit1_ptr,
                                            const uint8_t *thresh1_ptr) {
  uint8_t *src_tmp0 = src - 4;
  int32_t pitch_x2 = pitch << 1;
  int32_t pitch_x3 = pitch_x2 + pitch;
  int32_t pitch_x4 = pitch << 2;
  __m128i mask, hev, flat;
  __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
  __m128i row0, row1, row2, row3, row4, row5, row6, row7;
  __m128i row8, row9, row10, row11, row12, row13, row14, row15;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;

  row0 = __lsx_vld(src_tmp0, 0);
  DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row1, row2);
  row3 = __lsx_vldx(src_tmp0, pitch_x3);
  src_tmp0 += pitch_x4;
  row4 = __lsx_vld(src_tmp0, 0);
  DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row5, row6);
  row7 = __lsx_vldx(src_tmp0, pitch_x3);
  src_tmp0 += pitch_x4;
  row8 = __lsx_vld(src_tmp0, 0);
  DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row9, row10);
  row11 = __lsx_vldx(src_tmp0, pitch_x3);
  src_tmp0 += pitch_x4;
  row12 = __lsx_vld(src_tmp0, 0);
  DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row13, row14);
  row15 = __lsx_vldx(src_tmp0, pitch_x3);
  LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
                      p1, p0, q0, q1, q2, q3);

  thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0);
  thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0);
  thresh0 = __lsx_vilvl_d(thresh1, thresh0);

  b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0);
  b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0);
  b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0);

  limit0 = __lsx_vldrepl_b(limit0_ptr, 0);
  limit1 = __lsx_vldrepl_b(limit1_ptr, 0);
  limit0 = __lsx_vilvl_d(limit1, limit0);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
               mask, flat);
  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

  DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1);
  tmp2 = __lsx_vilvl_h(tmp1, tmp0);
  tmp3 = __lsx_vilvh_h(tmp1, tmp0);
  DUP2_ARG2(__lsx_vilvh_b, p0, p1, q1, q0, tmp0, tmp1);
  tmp4 = __lsx_vilvl_h(tmp1, tmp0);
  tmp5 = __lsx_vilvh_h(tmp1, tmp0);

  src -= 2;
  __lsx_vstelm_w(tmp2, src, 0, 0);
  src += pitch;
  __lsx_vstelm_w(tmp2, src, 0, 1);
  src += pitch;
  __lsx_vstelm_w(tmp2, src, 0, 2);
  src += pitch;
  __lsx_vstelm_w(tmp2, src, 0, 3);
  src += pitch;
  __lsx_vstelm_w(tmp3, src, 0, 0);
  src += pitch;
  __lsx_vstelm_w(tmp3, src, 0, 1);
  src += pitch;
  __lsx_vstelm_w(tmp3, src, 0, 2);
  src += pitch;
  __lsx_vstelm_w(tmp3, src, 0, 3);
  src += pitch;
  __lsx_vstelm_w(tmp4, src, 0, 0);
  src += pitch;
  __lsx_vstelm_w(tmp4, src, 0, 1);
  src += pitch;
  __lsx_vstelm_w(tmp4, src, 0, 2);
  src += pitch;
  __lsx_vstelm_w(tmp4, src, 0, 3);
  src += pitch;
  __lsx_vstelm_w(tmp5, src, 0, 0);
  src += pitch;
  __lsx_vstelm_w(tmp5, src, 0, 1);
  src += pitch;
  __lsx_vstelm_w(tmp5, src, 0, 2);
  src += pitch;
  __lsx_vstelm_w(tmp5, src, 0, 3);
}
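/* Chroma edges are only eight pixels wide, so u and v are filtered
 * together: __lsx_vilvl_d packs u into the low half and v into the high
 * half of every p/q vector, and results are written back per plane with
 * element stores (element 0 for u, element 1 for v). */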
static void loop_filter_horizontal_edge_uv_lsx(uint8_t *src_u, uint8_t *src_v,
                                               int32_t pitch,
                                               const uint8_t b_limit_in,
                                               const uint8_t limit_in,
                                               const uint8_t thresh_in) {
  int32_t pitch_x2 = pitch << 1;
  int32_t pitch_x3 = pitch_x2 + pitch;
  int32_t pitch_x4 = pitch << 2;
  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
  __m128i mask, hev, flat, thresh, limit, b_limit;
  __m128i p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
  __m128i p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;

  thresh = __lsx_vreplgr2vr_b(thresh_in);
  limit = __lsx_vreplgr2vr_b(limit_in);
  b_limit = __lsx_vreplgr2vr_b(b_limit_in);

  DUP4_ARG2(__lsx_vldx, src_u, -pitch_x4, src_u, -pitch_x3, src_u, -pitch_x2,
            src_u, -pitch, p3_u, p2_u, p1_u, p0_u);
  q0_u = __lsx_vld(src_u, 0);
  DUP2_ARG2(__lsx_vldx, src_u, pitch, src_u, pitch_x2, q1_u, q2_u);
  q3_u = __lsx_vldx(src_u, pitch_x3);
  DUP4_ARG2(__lsx_vldx, src_v, -pitch_x4, src_v, -pitch_x3, src_v, -pitch_x2,
            src_v, -pitch, p3_v, p2_v, p1_v, p0_v);
  q0_v = __lsx_vld(src_v, 0);
  DUP2_ARG2(__lsx_vldx, src_v, pitch, src_v, pitch_x2, q1_v, q2_v);
  q3_v = __lsx_vldx(src_v, pitch_x3);

  /* The low 8 bytes of each p/q vector hold u pixels; the high 8 bytes
   * hold v pixels. */
  DUP4_ARG2(__lsx_vilvl_d, p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3,
            p2, p1, p0);
  DUP4_ARG2(__lsx_vilvl_d, q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0,
            q1, q2, q3);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

  __lsx_vstelm_d(q1, src_u + pitch, 0, 0);
  __lsx_vstelm_d(q0, src_u, 0, 0);
  __lsx_vstelm_d(p0, src_u - pitch, 0, 0);
  __lsx_vstelm_d(p1, src_u - pitch_x2, 0, 0);

  __lsx_vstelm_d(q1, src_v + pitch, 0, 1);
  __lsx_vstelm_d(q0, src_v, 0, 1);
  __lsx_vstelm_d(p0, src_v - pitch, 0, 1);
  __lsx_vstelm_d(p1, src_v - pitch_x2, 0, 1);
}

static void loop_filter_vertical_edge_uv_lsx(uint8_t *src_u, uint8_t *src_v,
                                             int32_t pitch,
                                             const uint8_t b_limit_in,
                                             const uint8_t limit_in,
                                             const uint8_t thresh_in) {
  uint8_t *src_u_tmp, *src_v_tmp;
  int32_t pitch_x2 = pitch << 1;
  int32_t pitch_x3 = pitch_x2 + pitch;
  int32_t pitch_x4 = pitch << 2;
  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
  __m128i mask, hev, flat, thresh, limit, b_limit;
  __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8;
  __m128i row9, row10, row11, row12, row13, row14, row15;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;

  thresh = __lsx_vreplgr2vr_b(thresh_in);
  limit = __lsx_vreplgr2vr_b(limit_in);
  b_limit = __lsx_vreplgr2vr_b(b_limit_in);

  src_u_tmp = src_u - 4;
  row0 = __lsx_vld(src_u_tmp, 0);
  DUP2_ARG2(__lsx_vldx, src_u_tmp, pitch, src_u_tmp, pitch_x2, row1, row2);
  row3 = __lsx_vldx(src_u_tmp, pitch_x3);
  src_u_tmp += pitch_x4;
  row4 = __lsx_vld(src_u_tmp, 0);
  DUP2_ARG2(__lsx_vldx, src_u_tmp, pitch, src_u_tmp, pitch_x2, row5, row6);
  row7 = __lsx_vldx(src_u_tmp, pitch_x3);

  src_v_tmp = src_v - 4;
  row8 = __lsx_vld(src_v_tmp, 0);
  DUP2_ARG2(__lsx_vldx, src_v_tmp, pitch, src_v_tmp, pitch_x2, row9, row10);
  row11 = __lsx_vldx(src_v_tmp, pitch_x3);
  src_v_tmp += pitch_x4;
  row12 = __lsx_vld(src_v_tmp, 0);
  DUP2_ARG2(__lsx_vldx, src_v_tmp, pitch, src_v_tmp, pitch_x2, row13, row14);
  row15 = __lsx_vldx(src_v_tmp, pitch_x3);

  LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
                      p1, p0, q0, q1, q2, q3);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

  DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1);
  tmp2 = __lsx_vilvl_h(tmp1, tmp0);
  tmp3 = __lsx_vilvh_h(tmp1, tmp0);
  tmp0 = __lsx_vilvh_b(p0, p1);
  tmp1 = __lsx_vilvh_b(q1, q0);
  tmp4 = __lsx_vilvl_h(tmp1, tmp0);
  tmp5 = __lsx_vilvh_h(tmp1, tmp0);

  src_u_tmp += 2;
  __lsx_vstelm_w(tmp2, src_u_tmp - pitch_x4, 0, 0);
  __lsx_vstelm_w(tmp2, src_u_tmp - pitch_x3, 0, 1);
  __lsx_vstelm_w(tmp2, src_u_tmp - pitch_x2, 0, 2);
  __lsx_vstelm_w(tmp2, src_u_tmp - pitch, 0, 3);
  __lsx_vstelm_w(tmp3, src_u_tmp, 0, 0);
  __lsx_vstelm_w(tmp3, src_u_tmp + pitch, 0, 1);
  __lsx_vstelm_w(tmp3, src_u_tmp + pitch_x2, 0, 2);
  __lsx_vstelm_w(tmp3, src_u_tmp + pitch_x3, 0, 3);

  src_v_tmp += 2;
  __lsx_vstelm_w(tmp4, src_v_tmp - pitch_x4, 0, 0);
  __lsx_vstelm_w(tmp4, src_v_tmp - pitch_x3, 0, 1);
  __lsx_vstelm_w(tmp4, src_v_tmp - pitch_x2, 0, 2);
  __lsx_vstelm_w(tmp4, src_v_tmp - pitch, 0, 3);
  __lsx_vstelm_w(tmp5, src_v_tmp, 0, 0);
  __lsx_vstelm_w(tmp5, src_v_tmp + pitch, 0, 1);
  __lsx_vstelm_w(tmp5, src_v_tmp + pitch_x2, 0, 2);
  __lsx_vstelm_w(tmp5, src_v_tmp + pitch_x3, 0, 3);
}
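/* Macroblock-edge filters.  These apply the wider VP8_MBFILTER: four
 * pixels are read on each side of the edge and three are written back on
 * each side (p2..q2), using the mblim/lim/hev_thr thresholds supplied by
 * the vp8_loop_filter_mb* entry points. */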
static inline void mbloop_filter_horizontal_edge_y_lsx(
    uint8_t *src, int32_t pitch, const uint8_t b_limit_in,
    const uint8_t limit_in, const uint8_t thresh_in) {
  uint8_t *temp_src;
  int32_t pitch_x2 = pitch << 1;
  int32_t pitch_x3 = pitch_x2 + pitch;
  int32_t pitch_x4 = pitch << 2;
  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
  __m128i mask, hev, flat, thresh, limit, b_limit;

  DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit);
  thresh = __lsx_vldrepl_b(&thresh_in, 0);

  temp_src = src - pitch_x4;
  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
            temp_src, pitch_x3, p3, p2, p1, p0);
  temp_src += pitch_x4;
  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
            temp_src, pitch_x3, q0, q1, q2, q3);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

  temp_src = src - pitch_x3;
  __lsx_vstx(p2, temp_src, 0);
  __lsx_vstx(p1, temp_src, pitch);
  __lsx_vstx(p0, temp_src, pitch_x2);
  __lsx_vstx(q0, temp_src, pitch_x3);
  temp_src += pitch_x4;
  __lsx_vstx(q1, temp_src, 0);
  __lsx_vstx(q2, temp_src, pitch);
}

static inline void mbloop_filter_horizontal_edge_uv_lsx(
    uint8_t *src_u, uint8_t *src_v, int32_t pitch, const uint8_t b_limit_in,
    const uint8_t limit_in, const uint8_t thresh_in) {
  uint8_t *temp_src;
  int32_t pitch_x2 = pitch << 1;
  int32_t pitch_x3 = pitch_x2 + pitch;
  int32_t pitch_x4 = pitch << 2;
  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
  __m128i mask, hev, flat, thresh, limit, b_limit;
  __m128i p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
  __m128i p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;

  DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit);
  thresh = __lsx_vldrepl_b(&thresh_in, 0);

  temp_src = src_u - pitch_x4;
  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
            temp_src, pitch_x3, p3_u, p2_u, p1_u, p0_u);
  temp_src += pitch_x4;
  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
            temp_src, pitch_x3, q0_u, q1_u, q2_u, q3_u);
  temp_src = src_v - pitch_x4;
  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
            temp_src, pitch_x3, p3_v, p2_v, p1_v, p0_v);
  temp_src += pitch_x4;
  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
            temp_src, pitch_x3, q0_v, q1_v, q2_v, q3_v);

  DUP4_ARG2(__lsx_vilvl_d, p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3,
            p2, p1, p0);
  DUP4_ARG2(__lsx_vilvl_d, q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0,
            q1, q2, q3);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

  src_u -= pitch_x3;
  __lsx_vstelm_d(p2, src_u, 0, 0);
  __lsx_vstelm_d(p1, src_u + pitch, 0, 0);
  __lsx_vstelm_d(p0, src_u + pitch_x2, 0, 0);
  __lsx_vstelm_d(q0, src_u + pitch_x3, 0, 0);
  src_u += pitch_x4;
  __lsx_vstelm_d(q1, src_u, 0, 0);
  src_u += pitch;
  __lsx_vstelm_d(q2, src_u, 0, 0);

  src_v -= pitch_x3;
  __lsx_vstelm_d(p2, src_v, 0, 1);
  __lsx_vstelm_d(p1, src_v + pitch, 0, 1);
  __lsx_vstelm_d(p0, src_v + pitch_x2, 0, 1);
  __lsx_vstelm_d(q0, src_v + pitch_x3, 0, 1);
  src_v += pitch_x4;
  __lsx_vstelm_d(q1, src_v, 0, 1);
  src_v += pitch;
  __lsx_vstelm_d(q2, src_v, 0, 1);
}
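/* The vertical macroblock-edge filters transpose 16 rows of 8 pixels into
 * the p3..q3 vectors, filter as above, and transpose back via byte and
 * halfword interleaves.  Each row then stores six pixels (p2..q2) starting
 * three columns left of the edge, which VP8_ST6x1_B does as one word plus
 * one halfword element store. */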
static inline void mbloop_filter_vertical_edge_y_lsx(uint8_t *src,
                                                     int32_t pitch,
                                                     const uint8_t b_limit_in,
                                                     const uint8_t limit_in,
                                                     const uint8_t thresh_in) {
  uint8_t *temp_src;
  int32_t pitch_x2 = pitch << 1;
  int32_t pitch_x3 = pitch_x2 + pitch;
  int32_t pitch_x4 = pitch << 2;
  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
  __m128i mask, hev, flat, thresh, limit, b_limit;
  __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8;
  __m128i row9, row10, row11, row12, row13, row14, row15;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit);
  thresh = __lsx_vldrepl_b(&thresh_in, 0);

  temp_src = src - 4;
  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
            temp_src, pitch_x3, row0, row1, row2, row3);
  temp_src += pitch_x4;
  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
            temp_src, pitch_x3, row4, row5, row6, row7);
  temp_src += pitch_x4;
  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
            temp_src, pitch_x3, row8, row9, row10, row11);
  temp_src += pitch_x4;
  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
            temp_src, pitch_x3, row12, row13, row14, row15);
  temp_src -= pitch_x4;

  LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
                      p1, p0, q0, q1, q2, q3);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

  DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, tmp0, tmp1);
  tmp3 = __lsx_vilvl_h(tmp1, tmp0);
  tmp4 = __lsx_vilvh_h(tmp1, tmp0);
  DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, tmp0, tmp1);
  tmp6 = __lsx_vilvl_h(tmp1, tmp0);
  tmp7 = __lsx_vilvh_h(tmp1, tmp0);
  tmp2 = __lsx_vilvl_b(q2, q1);
  tmp5 = __lsx_vilvh_b(q2, q1);

  temp_src = src - 3;
  VP8_ST6x1_B(tmp3, 0, tmp2, 0, temp_src, 4);
  temp_src += pitch;
  VP8_ST6x1_B(tmp3, 1, tmp2, 1, temp_src, 4);
  temp_src += pitch;
  VP8_ST6x1_B(tmp3, 2, tmp2, 2, temp_src, 4);
  temp_src += pitch;
  VP8_ST6x1_B(tmp3, 3, tmp2, 3, temp_src, 4);
  temp_src += pitch;
  VP8_ST6x1_B(tmp4, 0, tmp2, 4, temp_src, 4);
  temp_src += pitch;
  VP8_ST6x1_B(tmp4, 1, tmp2, 5, temp_src, 4);
  temp_src += pitch;
  VP8_ST6x1_B(tmp4, 2, tmp2, 6, temp_src, 4);
  temp_src += pitch;
  VP8_ST6x1_B(tmp4, 3, tmp2, 7, temp_src, 4);
  temp_src += pitch;
  VP8_ST6x1_B(tmp6, 0, tmp5, 0, temp_src, 4);
  temp_src += pitch;
  VP8_ST6x1_B(tmp6, 1, tmp5, 1, temp_src, 4);
  temp_src += pitch;
  VP8_ST6x1_B(tmp6, 2, tmp5, 2, temp_src, 4);
  temp_src += pitch;
  VP8_ST6x1_B(tmp6, 3, tmp5, 3, temp_src, 4);
  temp_src += pitch;
  VP8_ST6x1_B(tmp7, 0, tmp5, 4, temp_src, 4);
  temp_src += pitch;
  VP8_ST6x1_B(tmp7, 1, tmp5, 5, temp_src, 4);
  temp_src += pitch;
  VP8_ST6x1_B(tmp7, 2, tmp5, 6, temp_src, 4);
  temp_src += pitch;
  VP8_ST6x1_B(tmp7, 3, tmp5, 7, temp_src, 4);
}
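/* The chroma variant feeds eight u rows (rows 0-7) and eight v rows
 * (rows 8-15) into the same 16x8 transpose, so the low half of every p/q
 * vector holds u and the high half holds v; the six-byte row stores below
 * walk the u rows first, then the v rows. */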
static inline void mbloop_filter_vertical_edge_uv_lsx(
    uint8_t *src_u, uint8_t *src_v, int32_t pitch, const uint8_t b_limit_in,
    const uint8_t limit_in, const uint8_t thresh_in) {
  int32_t pitch_x2 = pitch << 1;
  int32_t pitch_x3 = pitch_x2 + pitch;
  int32_t pitch_x4 = pitch << 2;
  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
  __m128i mask, hev, flat, thresh, limit, b_limit;
  __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8;
  __m128i row9, row10, row11, row12, row13, row14, row15;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit);
  thresh = __lsx_vldrepl_b(&thresh_in, 0);

  src_u -= 4;
  DUP4_ARG2(__lsx_vldx, src_u, 0, src_u, pitch, src_u, pitch_x2, src_u,
            pitch_x3, row0, row1, row2, row3);
  src_u += pitch_x4;
  DUP4_ARG2(__lsx_vldx, src_u, 0, src_u, pitch, src_u, pitch_x2, src_u,
            pitch_x3, row4, row5, row6, row7);
  src_v -= 4;
  DUP4_ARG2(__lsx_vldx, src_v, 0, src_v, pitch, src_v, pitch_x2, src_v,
            pitch_x3, row8, row9, row10, row11);
  src_v += pitch_x4;
  DUP4_ARG2(__lsx_vldx, src_v, 0, src_v, pitch, src_v, pitch_x2, src_v,
            pitch_x3, row12, row13, row14, row15);

  LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
                      p1, p0, q0, q1, q2, q3);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

  DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, tmp0, tmp1);
  tmp3 = __lsx_vilvl_h(tmp1, tmp0);
  tmp4 = __lsx_vilvh_h(tmp1, tmp0);
  DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, tmp0, tmp1);
  tmp6 = __lsx_vilvl_h(tmp1, tmp0);
  tmp7 = __lsx_vilvh_h(tmp1, tmp0);
  tmp2 = __lsx_vilvl_b(q2, q1);
  tmp5 = __lsx_vilvh_b(q2, q1);

  src_u += 1 - pitch_x4;
  VP8_ST6x1_B(tmp3, 0, tmp2, 0, src_u, 4);
  src_u += pitch;
  VP8_ST6x1_B(tmp3, 1, tmp2, 1, src_u, 4);
  src_u += pitch;
  VP8_ST6x1_B(tmp3, 2, tmp2, 2, src_u, 4);
  src_u += pitch;
  VP8_ST6x1_B(tmp3, 3, tmp2, 3, src_u, 4);
  src_u += pitch;
  VP8_ST6x1_B(tmp4, 0, tmp2, 4, src_u, 4);
  src_u += pitch;
  VP8_ST6x1_B(tmp4, 1, tmp2, 5, src_u, 4);
  src_u += pitch;
  VP8_ST6x1_B(tmp4, 2, tmp2, 6, src_u, 4);
  src_u += pitch;
  VP8_ST6x1_B(tmp4, 3, tmp2, 7, src_u, 4);

  src_v += 1 - pitch_x4;
  VP8_ST6x1_B(tmp6, 0, tmp5, 0, src_v, 4);
  src_v += pitch;
  VP8_ST6x1_B(tmp6, 1, tmp5, 1, src_v, 4);
  src_v += pitch;
  VP8_ST6x1_B(tmp6, 2, tmp5, 2, src_v, 4);
  src_v += pitch;
  VP8_ST6x1_B(tmp6, 3, tmp5, 3, src_v, 4);
  src_v += pitch;
  VP8_ST6x1_B(tmp7, 0, tmp5, 4, src_v, 4);
  src_v += pitch;
  VP8_ST6x1_B(tmp7, 1, tmp5, 5, src_v, 4);
  src_v += pitch;
  VP8_ST6x1_B(tmp7, 2, tmp5, 6, src_v, 4);
  src_v += pitch;
  VP8_ST6x1_B(tmp7, 3, tmp5, 7, src_v, 4);
}
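/* Exported entry points (wired up through vp8_rtcd).  The mbh/mbv variants
 * filter the horizontal and vertical macroblock edges with the mblim
 * threshold; bh/bv filter the three interior block edges (offsets 4, 8 and
 * 12 rows or columns) with blim.  A NULL src_u skips the chroma planes. */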
void vp8_loop_filter_mbh_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v,
                             int32_t pitch_y, int32_t pitch_u_v,
                             loop_filter_info *lpf_info_ptr) {
  mbloop_filter_horizontal_edge_y_lsx(src_y, pitch_y, *lpf_info_ptr->mblim,
                                      *lpf_info_ptr->lim,
                                      *lpf_info_ptr->hev_thr);
  if (src_u) {
    mbloop_filter_horizontal_edge_uv_lsx(
        src_u, src_v, pitch_u_v, *lpf_info_ptr->mblim, *lpf_info_ptr->lim,
        *lpf_info_ptr->hev_thr);
  }
}

void vp8_loop_filter_mbv_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v,
                             int32_t pitch_y, int32_t pitch_u_v,
                             loop_filter_info *lpf_info_ptr) {
  mbloop_filter_vertical_edge_y_lsx(src_y, pitch_y, *lpf_info_ptr->mblim,
                                    *lpf_info_ptr->lim,
                                    *lpf_info_ptr->hev_thr);
  if (src_u) {
    mbloop_filter_vertical_edge_uv_lsx(src_u, src_v, pitch_u_v,
                                       *lpf_info_ptr->mblim,
                                       *lpf_info_ptr->lim,
                                       *lpf_info_ptr->hev_thr);
  }
}

void vp8_loop_filter_bh_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v,
                            int32_t pitch_y, int32_t pitch_u_v,
                            loop_filter_info *lpf_info_ptr) {
  loop_filter_horizontal_4_dual_lsx(src_y + 4 * pitch_y, pitch_y,
                                    lpf_info_ptr->blim, lpf_info_ptr->lim,
                                    lpf_info_ptr->hev_thr, lpf_info_ptr->blim,
                                    lpf_info_ptr->lim, lpf_info_ptr->hev_thr);
  loop_filter_horizontal_4_dual_lsx(src_y + 8 * pitch_y, pitch_y,
                                    lpf_info_ptr->blim, lpf_info_ptr->lim,
                                    lpf_info_ptr->hev_thr, lpf_info_ptr->blim,
                                    lpf_info_ptr->lim, lpf_info_ptr->hev_thr);
  loop_filter_horizontal_4_dual_lsx(src_y + 12 * pitch_y, pitch_y,
                                    lpf_info_ptr->blim, lpf_info_ptr->lim,
                                    lpf_info_ptr->hev_thr, lpf_info_ptr->blim,
                                    lpf_info_ptr->lim, lpf_info_ptr->hev_thr);
  if (src_u) {
    loop_filter_horizontal_edge_uv_lsx(
        src_u + (4 * pitch_u_v), src_v + (4 * pitch_u_v), pitch_u_v,
        *lpf_info_ptr->blim, *lpf_info_ptr->lim, *lpf_info_ptr->hev_thr);
  }
}

void vp8_loop_filter_bv_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v,
                            int32_t pitch_y, int32_t pitch_u_v,
                            loop_filter_info *lpf_info_ptr) {
  loop_filter_vertical_4_dual_lsx(src_y + 4, pitch_y, lpf_info_ptr->blim,
                                  lpf_info_ptr->lim, lpf_info_ptr->hev_thr,
                                  lpf_info_ptr->blim, lpf_info_ptr->lim,
                                  lpf_info_ptr->hev_thr);
  loop_filter_vertical_4_dual_lsx(src_y + 8, pitch_y, lpf_info_ptr->blim,
                                  lpf_info_ptr->lim, lpf_info_ptr->hev_thr,
                                  lpf_info_ptr->blim, lpf_info_ptr->lim,
                                  lpf_info_ptr->hev_thr);
  loop_filter_vertical_4_dual_lsx(src_y + 12, pitch_y, lpf_info_ptr->blim,
                                  lpf_info_ptr->lim, lpf_info_ptr->hev_thr,
                                  lpf_info_ptr->blim, lpf_info_ptr->lim,
                                  lpf_info_ptr->hev_thr);
  if (src_u) {
    loop_filter_vertical_edge_uv_lsx(src_u + 4, src_v + 4, pitch_u_v,
                                     *lpf_info_ptr->blim, *lpf_info_ptr->lim,
                                     *lpf_info_ptr->hev_thr);
  }
}