Diffstat (limited to 'src/loongarch/mc.S')
-rw-r--r--  src/loongarch/mc.S  4758
1 file changed, 4758 insertions, 0 deletions
diff --git a/src/loongarch/mc.S b/src/loongarch/mc.S
new file mode 100644
index 0000000..97887de
--- /dev/null
+++ b/src/loongarch/mc.S
@@ -0,0 +1,4758 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/loongarch/loongson_asm.S"
+
+/*
+static void warp_affine_8x8_c(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel *src, const ptrdiff_t src_stride,
+ const int16_t *const abcd, int mx, int my
+ HIGHBD_DECL_SUFFIX)
+*/
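+
+/* A scalar sketch (simplified from the C reference) of what the LSX/LASX
+ * routines below compute for 8 bpc (intermediate_bits = 4). The asm first
+ * backs src up by 3 rows and 3 columns; the "t" variant stores 16-bit
+ * intermediates instead of pixels, hence the final shifts of 11 and 7 used
+ * when the macro is instantiated.
+ *
+ *     int16_t mid[15][8];
+ *     for (int y = 0; y < 15; y++, mx += abcd[1], src += src_stride)
+ *         for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {
+ *             const int8_t *f = dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];
+ *             int sum = 0;
+ *             for (int k = 0; k < 8; k++) sum += f[k] * src[x + k];
+ *             mid[y][x] = (sum + 4) >> 3;
+ *         }
+ *     for (int y = 0; y < 8; y++, my += abcd[3], dst += dst_stride)
+ *         for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {
+ *             const int8_t *f = dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];
+ *             int sum = 0;
+ *             for (int k = 0; k < 8; k++) sum += f[k] * mid[y + k][x];
+ *             dst[x] = iclip_pixel((sum + 1024) >> 11); // (sum + 64) >> 7 for "t"
+ *         }
+ */
+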
+.macro FILTER_WARP_RND_P_LSX in0, in1, in2, in3, out0, out1, out2, out3
+ vbsrl.v vr2, \in0, \in1
+ vbsrl.v vr20, \in0, \in2
+    addi.w t4, \in3, 512
+    srai.w t4, t4, 10
+    addi.w t4, t4, 64
+    slli.w t4, t4, 3 // byte offset of dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)]
+    vldx vr1, t5, t4 // load the 8-tap horizontal filter
+ add.w t3, t3, t0 // tmx += abcd[0]
+
+ addi.w t4, t3, 512
+ srai.w t4, t4, 10
+ addi.w t4, t4, 64
+ slli.w t4, t4, 3
+ vldx vr29, t5, t4
+ add.w t3, t3, t0 // tmx += abcd[0]
+
+ vilvl.d vr2, vr20, vr2
+ vilvl.d vr1, vr29, vr1
+ vmulwev.h.bu.b vr3, vr2, vr1
+ vmulwod.h.bu.b vr20, vr2, vr1
+ vilvl.d vr2, vr20, vr3
+ vhaddw.w.h vr2, vr2, vr2
+ vhaddw.d.w vr2, vr2, vr2
+ vhaddw.q.d vr2, vr2, vr2
+ vilvh.d vr3, vr20, vr3
+ vhaddw.w.h vr3, vr3, vr3
+ vhaddw.d.w vr3, vr3, vr3
+ vhaddw.q.d vr3, vr3, vr3
+ vextrins.w \out0, vr2, \out1
+ vextrins.w \out2, vr3, \out3
+.endm
+
+.macro FILTER_WARP_CLIP_LSX in0, in1, in2, out0, out1
+    add.w \in0, \in0, \in1 // tmy += abcd[2] (zero for the first column)
+    addi.w t6, \in0, 512
+    srai.w t6, t6, 10
+    addi.w t6, t6, 64
+    slli.w t6, t6, 3 // byte offset of dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)]
+    fldx.d f1, t5, t6 // load the 8-tap vertical filter
+ vsllwil.h.b vr1, vr1, 0
+ vmulwev.w.h vr3, \in2, vr1
+ vmaddwod.w.h vr3, \in2, vr1
+ vhaddw.d.w vr3, vr3, vr3
+ vhaddw.q.d vr3, vr3, vr3
+ vextrins.w \out0, vr3, \out1
+.endm
+
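+// warp_sh: byte-shuffle indices that slide the per-column window of eight
+// 16-bit vertical-filter inputs down by one row (2 bytes) after each output
+// row; the trailing index pair, advanced with vextrins.h/vaddi.bu below,
+// pulls the next intermediate row in from the second set of registers.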
+const warp_sh
+.rept 2
+.byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
+.endr
+.rept 2
+.byte 18, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.endr
+endconst
+
+.macro warp_lsx t, shift
+function warp_affine_8x8\t\()_8bpc_lsx
+ addi.d sp, sp, -64
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+ fst.d f30, sp, 48
+ fst.d f31, sp, 56
+
+ la.local t4, warp_sh
+ ld.h t0, a4, 0 // abcd[0]
+ ld.h t1, a4, 2 // abcd[1]
+
+    alsl.w t2, a3, a3, 1 // t2 = 3 * src_stride
+    addi.w t3, a5, 0 // t3 = mx
+    la.local t5, dav1d_mc_warp_filter
+    sub.d a2, a2, t2 // src -= 3 * src_stride
+    addi.d a2, a2, -3 // src -= 3
+ vld vr0, a2, 0
+ vld vr30, t4, 0
+ vld vr31, t4, 32
+
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x00, vr5, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x00, vr7, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x00, vr9, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x00, vr11, 0x00
+
+    add.w a5, a5, t1 // mx += abcd[1]
+    or t3, a5, a5 // t3 = mx
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x10, vr5, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x10, vr7, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x10, vr9, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x10, vr11, 0x10
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x20, vr5, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x20, vr7, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x20, vr9, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x20, vr11, 0x20
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x30, vr5, 0x30
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x30, vr7, 0x30
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x30, vr9, 0x30
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x30, vr11, 0x30
+
+ add.w a5, t1, a5
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x00, vr13, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x00, vr15, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x00, vr17, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr18, 0x00, vr19, 0x00
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x10, vr13, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x10, vr15, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x10, vr17, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr18, 0x10, vr19, 0x10
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x20, vr13, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x20, vr15, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x20, vr17, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr18, 0x20, vr19, 0x20
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x30, vr13, 0x30
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x30, vr15, 0x30
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x30, vr17, 0x30
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr18, 0x30, vr19, 0x30
+
+ vsrarni.h.w vr12, vr4, 3
+ vsrarni.h.w vr13, vr5, 3
+ vsrarni.h.w vr14, vr6, 3
+ vsrarni.h.w vr15, vr7, 3
+ vsrarni.h.w vr16, vr8, 3
+ vsrarni.h.w vr17, vr9, 3
+ vsrarni.h.w vr18, vr10, 3
+ vsrarni.h.w vr19, vr11, 3
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x00, vr5, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x00, vr7, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x00, vr9, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x00, vr11, 0x00
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x10, vr5, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x10, vr7, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x10, vr9, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x10, vr11, 0x10
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x20, vr5, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x20, vr7, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x20, vr9, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x20, vr11, 0x20
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x30, vr5, 0x30
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x30, vr7, 0x30
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x30, vr9, 0x30
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x30, vr11, 0x30
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr21, 0x00, vr22, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr23, 0x00, vr24, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr25, 0x00, vr26, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr27, 0x00, vr28, 0x00
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr21, 0x10, vr22, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr23, 0x10, vr24, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr25, 0x10, vr26, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr27, 0x10, vr28, 0x10
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr21, 0x20, vr22, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr23, 0x20, vr24, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr25, 0x20, vr26, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr27, 0x20, vr28, 0x20
+
+ vsrarni.h.w vr21, vr4, 3
+ vsrarni.h.w vr22, vr5, 3
+ vsrarni.h.w vr23, vr6, 3
+ vsrarni.h.w vr24, vr7, 3
+ vsrarni.h.w vr25, vr8, 3
+ vsrarni.h.w vr26, vr9, 3
+ vsrarni.h.w vr27, vr10, 3
+ vsrarni.h.w vr28, vr11, 3
+
+ addi.w t2, a6, 0 // my
+ ld.h t7, a4, 4 // abcd[2]
+ ld.h t8, a4, 6 // abcd[3]
+
+.ifnb \t
+ slli.d a1, a1, 1
+.endif
+
+ FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
+ FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
+.ifnb \t
+ vssrarni.h.w vr5, vr4, \shift
+ vst vr5, a0, 0
+.else
+ vssrarni.hu.w vr5, vr4, \shift
+ vssrlni.bu.h vr5, vr5, 0
+ fst.d f5, a0, 0
+.endif
+
+ vshuf.b vr12, vr21, vr12, vr30
+ vshuf.b vr13, vr22, vr13, vr30
+ vshuf.b vr14, vr23, vr14, vr30
+ vshuf.b vr15, vr24, vr15, vr30
+ vshuf.b vr16, vr25, vr16, vr30
+ vshuf.b vr17, vr26, vr17, vr30
+ vshuf.b vr18, vr27, vr18, vr30
+ vshuf.b vr19, vr28, vr19, vr30
+ vextrins.h vr30, vr31, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
+ FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
+.ifnb \t
+ vssrarni.h.w vr5, vr4, \shift
+ vstx vr5, a0, a1
+.else
+ vssrarni.hu.w vr5, vr4, \shift
+ vssrlni.bu.h vr5, vr5, 0
+ fstx.d f5, a0, a1
+.endif
+
+ vaddi.bu vr31, vr31, 2
+ vshuf.b vr12, vr21, vr12, vr30
+ vshuf.b vr13, vr22, vr13, vr30
+ vshuf.b vr14, vr23, vr14, vr30
+ vshuf.b vr15, vr24, vr15, vr30
+ vshuf.b vr16, vr25, vr16, vr30
+ vshuf.b vr17, vr26, vr17, vr30
+ vshuf.b vr18, vr27, vr18, vr30
+ vshuf.b vr19, vr28, vr19, vr30
+ vextrins.h vr30, vr31, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
+ FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
+ alsl.d a0, a1, a0, 1
+.ifnb \t
+ vssrarni.h.w vr5, vr4, \shift
+ vst vr5, a0, 0
+.else
+ vssrarni.hu.w vr5, vr4, \shift
+ vssrlni.bu.h vr5, vr5, 0
+ fst.d f5, a0, 0
+.endif
+
+ vaddi.bu vr31, vr31, 2
+ vshuf.b vr12, vr21, vr12, vr30
+ vshuf.b vr13, vr22, vr13, vr30
+ vshuf.b vr14, vr23, vr14, vr30
+ vshuf.b vr15, vr24, vr15, vr30
+ vshuf.b vr16, vr25, vr16, vr30
+ vshuf.b vr17, vr26, vr17, vr30
+ vshuf.b vr18, vr27, vr18, vr30
+ vshuf.b vr19, vr28, vr19, vr30
+ vextrins.h vr30, vr31, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
+ FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
+.ifnb \t
+ vssrarni.h.w vr5, vr4, \shift
+ vstx vr5, a0, a1
+.else
+ vssrarni.hu.w vr5, vr4, \shift
+ vssrlni.bu.h vr5, vr5, 0
+ fstx.d f5, a0, a1
+.endif
+
+ vaddi.bu vr31, vr31, 2
+ vshuf.b vr12, vr21, vr12, vr30
+ vshuf.b vr13, vr22, vr13, vr30
+ vshuf.b vr14, vr23, vr14, vr30
+ vshuf.b vr15, vr24, vr15, vr30
+ vshuf.b vr16, vr25, vr16, vr30
+ vshuf.b vr17, vr26, vr17, vr30
+ vshuf.b vr18, vr27, vr18, vr30
+ vshuf.b vr19, vr28, vr19, vr30
+ vextrins.h vr30, vr31, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
+ FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
+ alsl.d a0, a1, a0, 1
+.ifnb \t
+ vssrarni.h.w vr5, vr4, \shift
+ vst vr5, a0, 0
+.else
+ vssrarni.hu.w vr5, vr4, \shift
+ vssrlni.bu.h vr5, vr5, 0
+ fst.d f5, a0, 0
+.endif
+
+ vaddi.bu vr31, vr31, 2
+ vshuf.b vr12, vr21, vr12, vr30
+ vshuf.b vr13, vr22, vr13, vr30
+ vshuf.b vr14, vr23, vr14, vr30
+ vshuf.b vr15, vr24, vr15, vr30
+ vshuf.b vr16, vr25, vr16, vr30
+ vshuf.b vr17, vr26, vr17, vr30
+ vshuf.b vr18, vr27, vr18, vr30
+ vshuf.b vr19, vr28, vr19, vr30
+ vextrins.h vr30, vr31, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
+ FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
+.ifnb \t
+ vssrarni.h.w vr5, vr4, \shift
+ vstx vr5, a0, a1
+.else
+ vssrarni.hu.w vr5, vr4, \shift
+ vssrlni.bu.h vr5, vr5, 0
+ fstx.d f5, a0, a1
+.endif
+
+ vaddi.bu vr31, vr31, 2
+ vshuf.b vr12, vr21, vr12, vr30
+ vshuf.b vr13, vr22, vr13, vr30
+ vshuf.b vr14, vr23, vr14, vr30
+ vshuf.b vr15, vr24, vr15, vr30
+ vshuf.b vr16, vr25, vr16, vr30
+ vshuf.b vr17, vr26, vr17, vr30
+ vshuf.b vr18, vr27, vr18, vr30
+ vshuf.b vr19, vr28, vr19, vr30
+ vextrins.h vr30, vr31, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
+ FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
+ alsl.d a0, a1, a0, 1
+.ifnb \t
+ vssrarni.h.w vr5, vr4, \shift
+ vst vr5, a0, 0
+.else
+ vssrarni.hu.w vr5, vr4, \shift
+ vssrlni.bu.h vr5, vr5, 0
+ fst.d f5, a0, 0
+.endif
+
+ vshuf.b vr12, vr21, vr12, vr30
+ vshuf.b vr13, vr22, vr13, vr30
+ vshuf.b vr14, vr23, vr14, vr30
+ vshuf.b vr15, vr24, vr15, vr30
+ vshuf.b vr16, vr25, vr16, vr30
+ vshuf.b vr17, vr26, vr17, vr30
+ vshuf.b vr18, vr27, vr18, vr30
+ vshuf.b vr19, vr28, vr19, vr30
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
+ FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
+.ifnb \t
+ vssrarni.h.w vr5, vr4, \shift
+ vstx vr5, a0, a1
+.else
+ vssrarni.hu.w vr5, vr4, \shift
+ vssrlni.bu.h vr5, vr5, 0
+ fstx.d f5, a0, a1
+.endif
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ fld.d f30, sp, 48
+ fld.d f31, sp, 56
+ addi.d sp, sp, 64
+endfunc
+.endm
+
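+// Two instantiations: the unsuffixed variant writes 8-bit pixels with a
+// final rounding shift of 11, while the "t" variant writes the 16-bit
+// intermediates used for compound prediction with a shift of 7.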
+warp_lsx , 11
+warp_lsx t, 7
+
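+// The LASX path computes four output columns per macro call by gathering
+// overlapping 8-pixel windows with xvshuf.b (see shuf0 below) instead of
+// byte-shifting the source register as the LSX path does.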
+.macro FILTER_WARP_RND_P_LASX in0, in1, in2, out0, out1, out2, out3
+ xvshuf.b xr2, \in0, \in0, \in2
+
+ addi.w t4, \in1, 512
+ srai.w t4, t4, 10
+ addi.w t4, t4, 64
+ slli.w t4, t4, 3
+ vldx vr3, t5, t4
+ add.w t3, t3, t0 // tmx += abcd[0]
+
+ addi.w t4, t3, 512
+ srai.w t4, t4, 10
+ addi.w t4, t4, 64
+ slli.w t4, t4, 3
+ vldx vr4, t5, t4
+ add.w t3, t3, t0 // tmx += abcd[0]
+
+ addi.w t4, t3, 512
+ srai.w t4, t4, 10
+ addi.w t4, t4, 64
+ slli.w t4, t4, 3
+ vldx vr5, t5, t4
+ add.w t3, t3, t0 // tmx += abcd[0]
+
+ addi.w t4, t3, 512
+ srai.w t4, t4, 10
+ addi.w t4, t4, 64
+ slli.w t4, t4, 3
+ vldx vr6, t5, t4
+ add.w t3, t3, t0 // tmx += abcd[0]
+
+ xvinsve0.d xr3, xr5, 1
+ xvinsve0.d xr3, xr4, 2
+ xvinsve0.d xr3, xr6, 3
+
+ xvmulwev.h.bu.b xr4, xr2, xr3
+ xvmulwod.h.bu.b xr5, xr2, xr3
+ xvilvl.d xr2, xr5, xr4
+ xvilvh.d xr3, xr5, xr4
+ xvhaddw.w.h xr2, xr2, xr2
+ xvhaddw.w.h xr3, xr3, xr3
+ xvhaddw.d.w xr2, xr2, xr2
+ xvhaddw.d.w xr3, xr3, xr3
+ xvhaddw.q.d xr2, xr2, xr2
+ xvhaddw.q.d xr3, xr3, xr3
+
+ xvextrins.w \out0, xr2, \out1
+ xvextrins.w \out2, xr3, \out3
+.endm
+
+.macro FILTER_WARP_CLIP_LASX in0, in1, in2, out0, out1
+ add.w \in0, \in0, \in1
+ addi.w t6, \in0, 512
+ srai.w t6, t6, 10
+ addi.w t6, t6, 64
+ slli.w t6, t6, 3
+ fldx.d f1, t5, t6
+
+ add.w t2, t2, t7
+ addi.w t6, t2, 512
+ srai.w t6, t6, 10
+ addi.w t6, t6, 64
+ slli.w t6, t6, 3
+ fldx.d f2, t5, t6
+
+ vilvl.d vr0, vr2, vr1
+ vext2xv.h.b xr0, xr0
+ xvmulwev.w.h xr3, \in2, xr0
+ xvmaddwod.w.h xr3, \in2, xr0
+ xvhaddw.d.w xr3, xr3, xr3
+ xvhaddw.q.d xr3, xr3, xr3
+ xvextrins.w \out0, xr3, \out1
+.endm
+
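+// shuf0 (and shuf0 + 4, formed with xvaddi.bu) gathers the eight
+// overlapping 8-pixel windows starting at x = 0..7 of a source row, so each
+// 256-bit shuffle feeds four 8-tap horizontal filters at once.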
+const shuf0
+.byte 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
+.byte 1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9, 10
+endconst
+
+.macro warp_lasx t, shift
+function warp_affine_8x8\t\()_8bpc_lasx
+ addi.d sp, sp, -16
+ ld.h t0, a4, 0 // abcd[0]
+ ld.h t1, a4, 2 // abcd[1]
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+
+ alsl.w t2, a3, a3, 1
+ addi.w t3, a5, 0
+ la.local t4, warp_sh
+ la.local t5, dav1d_mc_warp_filter
+ sub.d a2, a2, t2
+ addi.d a2, a2, -3
+ vld vr0, a2, 0
+ xvld xr24, t4, 0
+ xvld xr25, t4, 32
+ la.local t2, shuf0
+ xvld xr1, t2, 0
+ xvpermi.q xr0, xr0, 0x00
+ xvaddi.bu xr9, xr1, 4
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x00, xr8, 0x00
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x00, xr11, 0x00
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x10, xr8, 0x10
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x10, xr11, 0x10
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x20, xr8, 0x20
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x20, xr11, 0x20
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x30, xr8, 0x30
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x30, xr11, 0x30
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x00, xr13, 0x00
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x00, xr15, 0x00
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x10, xr13, 0x10
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x10, xr15, 0x10
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x20, xr13, 0x20
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x20, xr15, 0x20
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x30, xr13, 0x30
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x30, xr15, 0x30
+
+ xvsrarni.h.w xr12, xr7, 3
+ xvsrarni.h.w xr13, xr8, 3
+ xvsrarni.h.w xr14, xr10, 3
+ xvsrarni.h.w xr15, xr11, 3
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x00, xr8, 0x00
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x00, xr11, 0x00
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x10, xr8, 0x10
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x10, xr11, 0x10
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x20, xr8, 0x20
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x20, xr11, 0x20
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x30, xr8, 0x30
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x30, xr11, 0x30
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x00, xr17, 0x00
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x00, xr19, 0x00
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x10, xr17, 0x10
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x10, xr19, 0x10
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x20, xr17, 0x20
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x20, xr19, 0x20
+
+ xvsrarni.h.w xr16, xr7, 3
+ xvsrarni.h.w xr17, xr8, 3
+ xvsrarni.h.w xr18, xr10, 3
+ xvsrarni.h.w xr19, xr11, 3
+
+ addi.w t2, a6, 0 // my
+ ld.h t7, a4, 4 // abcd[2]
+ ld.h t8, a4, 6 // abcd[3]
+
+.ifnb \t
+ slli.d a1, a1, 1
+.endif
+
+ // y = 0
+ FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00
+ FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10
+ FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20
+ FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30
+
+ xvshuf.b xr12, xr16, xr12, xr24
+ xvshuf.b xr13, xr17, xr13, xr24
+ xvshuf.b xr14, xr18, xr14, xr24
+ xvshuf.b xr15, xr19, xr15, xr24
+ xvextrins.h xr24, xr25, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00
+ FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10
+ FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20
+ FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30
+
+.ifnb \t
+ xvssrarni.h.w xr21, xr20, \shift
+ xvpermi.q xr22, xr21, 0x01
+ vilvl.h vr23, vr22, vr21
+ vilvh.h vr21, vr22, vr21
+ vst vr23, a0, 0
+ vstx vr21, a0, a1
+.else
+ xvssrarni.hu.w xr21, xr20, \shift
+ xvssrlni.bu.h xr22, xr21, 0
+ xvpermi.q xr23, xr22, 0x01
+ vilvl.b vr21, vr23, vr22
+ fst.d f21, a0, 0
+ add.d a0, a0, a1
+ vstelm.d vr21, a0, 0, 1
+.endif
+
+ xvaddi.bu xr25, xr25, 2
+ xvshuf.b xr12, xr16, xr12, xr24
+ xvshuf.b xr13, xr17, xr13, xr24
+ xvshuf.b xr14, xr18, xr14, xr24
+ xvshuf.b xr15, xr19, xr15, xr24
+ xvextrins.h xr24, xr25, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00
+ FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10
+ FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20
+ FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30
+
+ xvaddi.bu xr25, xr25, 2
+ xvshuf.b xr12, xr16, xr12, xr24
+ xvshuf.b xr13, xr17, xr13, xr24
+ xvshuf.b xr14, xr18, xr14, xr24
+ xvshuf.b xr15, xr19, xr15, xr24
+ xvextrins.h xr24, xr25, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00
+ FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10
+ FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20
+ FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30
+
+.ifnb \t
+ xvssrarni.h.w xr21, xr20, \shift
+ alsl.d a0, a1, a0, 1
+ xvpermi.q xr22, xr21, 0x01
+ vilvl.h vr23, vr22, vr21
+ vilvh.h vr21, vr22, vr21
+ vst vr23, a0, 0
+ vstx vr21, a0, a1
+.else
+    xvssrarni.hu.w xr21, xr20, \shift
+ xvssrlni.bu.h xr22, xr21, 0
+ xvpermi.q xr23, xr22, 0x01
+ vilvl.b vr21, vr23, vr22
+ add.d a0, a0, a1
+ fst.d f21, a0, 0
+ add.d a0, a0, a1
+ vstelm.d vr21, a0, 0, 1
+.endif
+
+ xvaddi.bu xr25, xr25, 2
+ xvshuf.b xr12, xr16, xr12, xr24
+ xvshuf.b xr13, xr17, xr13, xr24
+ xvshuf.b xr14, xr18, xr14, xr24
+ xvshuf.b xr15, xr19, xr15, xr24
+ xvextrins.h xr24, xr25, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00
+ FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10
+ FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20
+ FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30
+
+ xvaddi.bu xr25, xr25, 2
+ xvshuf.b xr12, xr16, xr12, xr24
+ xvshuf.b xr13, xr17, xr13, xr24
+ xvshuf.b xr14, xr18, xr14, xr24
+ xvshuf.b xr15, xr19, xr15, xr24
+ xvextrins.h xr24, xr25, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00
+ FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10
+ FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20
+ FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30
+
+.ifnb \t
+ xvssrarni.h.w xr21, xr20, \shift
+ alsl.d a0, a1, a0, 1
+ xvpermi.q xr22, xr21, 0x01
+ vilvl.h vr23, vr22, vr21
+ vilvh.h vr21, vr22, vr21
+ vst vr23, a0, 0
+ vstx vr21, a0, a1
+.else
+    xvssrarni.hu.w xr21, xr20, \shift
+ xvssrlni.bu.h xr22, xr21, 0
+ xvpermi.q xr23, xr22, 0x01
+ vilvl.b vr21, vr23, vr22
+ add.d a0, a0, a1
+ fst.d f21, a0, 0
+ add.d a0, a0, a1
+ vstelm.d vr21, a0, 0, 1
+.endif
+
+ xvaddi.bu xr25, xr25, 2
+ xvshuf.b xr12, xr16, xr12, xr24
+ xvshuf.b xr13, xr17, xr13, xr24
+ xvshuf.b xr14, xr18, xr14, xr24
+ xvshuf.b xr15, xr19, xr15, xr24
+ xvextrins.h xr24, xr25, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00
+ FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10
+ FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20
+ FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30
+
+ xvshuf.b xr12, xr16, xr12, xr24
+ xvshuf.b xr13, xr17, xr13, xr24
+ xvshuf.b xr14, xr18, xr14, xr24
+ xvshuf.b xr15, xr19, xr15, xr24
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00
+ FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10
+ FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20
+ FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30
+
+.ifnb \t
+ xvssrarni.h.w xr21, xr20, \shift
+ alsl.d a0, a1, a0, 1
+ xvpermi.q xr22, xr21, 0x01
+ vilvl.h vr23, vr22, vr21
+ vilvh.h vr21, vr22, vr21
+ vst vr23, a0, 0
+ vstx vr21, a0, a1
+.else
+    xvssrarni.hu.w xr21, xr20, \shift
+ xvssrlni.bu.h xr22, xr21, 0
+ xvpermi.q xr23, xr22, 0x01
+ vilvl.b vr21, vr23, vr22
+ add.d a0, a0, a1
+ fst.d f21, a0, 0
+ add.d a0, a0, a1
+ vstelm.d vr21, a0, 0, 1
+.endif
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ addi.d sp, sp, 16
+endfunc
+.endm
+
+warp_lasx , 11
+warp_lasx t, 7
+
+/*
+static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride,
+ const int16_t *tmp1, const int16_t *tmp2,
+ const int w, int h,
+ const int weight HIGHBD_DECL_SUFFIX)
+*/
+
+#define bpc8_sh 5 // sh = intermediate_bits + 1
+#define bpcw8_sh 8 // sh = intermediate_bits + 4
+
+#define bpc_sh bpc8_sh
+#define bpcw_sh bpcw8_sh
+
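+/* Scalar sketch of the two averaging kernels below (8 bpc); the rounding
+ * term is 1 << (sh - 1) and is applied by the vssrarni saturating
+ * round-and-shift instructions:
+ *
+ *     avg:   dst[x] = iclip_pixel((tmp1[x] + tmp2[x] + 16) >> bpc_sh);
+ *     w_avg: dst[x] = iclip_pixel((tmp1[x] * weight
+ *                                  + tmp2[x] * (16 - weight) + 128) >> bpcw_sh);
+ */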
+function avg_8bpc_lsx
+ addi.d t8, a0, 0
+
+    clz.w t0, a4
+    li.w t1, 24
+    sub.w t0, t0, t1 // clz(w) - 24: w = 128,64,32,16,8,4 -> index 0..5
+ la.local t1, .AVG_LSX_JRTABLE
+ alsl.d t0, t0, t1, 1
+ ld.h t2, t0, 0 // The jump addresses are relative to AVG_LSX_JRTABLE
+ add.d t1, t1, t2 // Get absolute address
+ jirl $r0, t1, 0
+
+ .align 3
+.AVG_LSX_JRTABLE:
+ .hword .AVG_W128_LSX - .AVG_LSX_JRTABLE
+ .hword .AVG_W64_LSX - .AVG_LSX_JRTABLE
+ .hword .AVG_W32_LSX - .AVG_LSX_JRTABLE
+ .hword .AVG_W16_LSX - .AVG_LSX_JRTABLE
+ .hword .AVG_W8_LSX - .AVG_LSX_JRTABLE
+ .hword .AVG_W4_LSX - .AVG_LSX_JRTABLE
+
+.AVG_W4_LSX:
+ vld vr0, a2, 0
+ vld vr1, a3, 0
+ vadd.h vr2, vr0, vr1
+ vssrarni.bu.h vr3, vr2, bpc_sh
+ vstelm.w vr3, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.w vr3, a0, 0, 1
+ addi.w a5, a5, -2
+ addi.d a2, a2, 16
+ addi.d a3, a3, 16
+ add.d a0, a0, a1
+ blt zero, a5, .AVG_W4_LSX
+ b .AVG_END_LSX
+
+.AVG_W8_LSX:
+ vld vr0, a2, 0
+ vld vr2, a2, 16
+ vld vr1, a3, 0
+ vld vr3, a3, 16
+ vadd.h vr4, vr0, vr1
+ vadd.h vr5, vr2, vr3
+ vssrarni.bu.h vr5, vr4, bpc_sh
+ addi.w a5, a5, -2
+ addi.d a2, a2, 32
+ vstelm.d vr5, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.d vr5, a0, 0, 1
+ addi.d a3, a3, 32
+ add.d a0, a0, a1
+ blt zero, a5, .AVG_W8_LSX
+ b .AVG_END_LSX
+
+.AVG_W16_LSX:
+ vld vr0, a2, 0
+ vld vr2, a2, 16
+ vld vr1, a3, 0
+ vld vr3, a3, 16
+ vadd.h vr4, vr0, vr1
+ vadd.h vr5, vr2, vr3
+ vssrarni.bu.h vr5, vr4, bpc_sh
+ addi.w a5, a5, -1
+ addi.d a2, a2, 32
+ vst vr5, a0, 0
+ addi.d a3, a3, 32
+ add.d a0, a0, a1
+ blt zero, a5, .AVG_W16_LSX
+ b .AVG_END_LSX
+
+.AVG_W32_LSX:
+ vld vr0, a2, 0
+ vld vr2, a2, 16
+ vld vr4, a2, 32
+ vld vr6, a2, 48
+ vld vr1, a3, 0
+ vld vr3, a3, 16
+ vld vr5, a3, 32
+ vld vr7, a3, 48
+ vadd.h vr0, vr0, vr1
+ vadd.h vr2, vr2, vr3
+ vadd.h vr4, vr4, vr5
+ vadd.h vr6, vr6, vr7
+ vssrarni.bu.h vr2, vr0, bpc_sh
+ vssrarni.bu.h vr6, vr4, bpc_sh
+ addi.w a5, a5, -1
+ addi.d a2, a2, 64
+ vst vr2, a0, 0
+ vst vr6, a0, 16
+ addi.d a3, a3, 64
+ add.d a0, a0, a1
+ blt zero, a5, .AVG_W32_LSX
+ b .AVG_END_LSX
+
+.AVG_W64_LSX:
+.rept 4
+ vld vr0, a2, 0
+ vld vr2, a2, 16
+ vld vr1, a3, 0
+ vld vr3, a3, 16
+ vadd.h vr0, vr0, vr1
+ vadd.h vr2, vr2, vr3
+ vssrarni.bu.h vr2, vr0, bpc_sh
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ vst vr2, a0, 0
+ addi.d a0, a0, 16
+.endr
+ addi.w a5, a5, -1
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ blt zero, a5, .AVG_W64_LSX
+ b .AVG_END_LSX
+
+.AVG_W128_LSX:
+.rept 8
+ vld vr0, a2, 0
+ vld vr2, a2, 16
+ vld vr1, a3, 0
+ vld vr3, a3, 16
+ vadd.h vr0, vr0, vr1
+ vadd.h vr2, vr2, vr3
+ vssrarni.bu.h vr2, vr0, bpc_sh
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ vst vr2, a0, 0
+ addi.d a0, a0, 16
+.endr
+ addi.w a5, a5, -1
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ blt zero, a5, .AVG_W128_LSX
+.AVG_END_LSX:
+endfunc
+
+function avg_8bpc_lasx
+ clz.w t0, a4
+ li.w t1, 24
+ sub.w t0, t0, t1
+ la.local t1, .AVG_LASX_JRTABLE
+ alsl.d t0, t0, t1, 1
+ ld.h t2, t0, 0
+ add.d t1, t1, t2
+ jirl $r0, t1, 0
+
+ .align 3
+.AVG_LASX_JRTABLE:
+ .hword .AVG_W128_LASX - .AVG_LASX_JRTABLE
+ .hword .AVG_W64_LASX - .AVG_LASX_JRTABLE
+ .hword .AVG_W32_LASX - .AVG_LASX_JRTABLE
+ .hword .AVG_W16_LASX - .AVG_LASX_JRTABLE
+ .hword .AVG_W8_LASX - .AVG_LASX_JRTABLE
+ .hword .AVG_W4_LASX - .AVG_LASX_JRTABLE
+
+.AVG_W4_LASX:
+ vld vr0, a2, 0
+ vld vr1, a3, 0
+ vadd.h vr0, vr0, vr1
+ vssrarni.bu.h vr1, vr0, bpc_sh
+ vstelm.w vr1, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.w vr1, a0, 0, 1
+ addi.w a5, a5, -2
+ addi.d a2, a2, 16
+ addi.d a3, a3, 16
+ add.d a0, a0, a1
+ blt zero, a5, .AVG_W4_LASX
+ b .AVG_END_LASX
+.AVG_W8_LASX:
+ xvld xr0, a2, 0
+ xvld xr1, a3, 0
+ xvadd.h xr2, xr0, xr1
+ xvssrarni.bu.h xr1, xr2, bpc_sh
+ xvstelm.d xr1, a0, 0, 0
+ add.d a0, a0, a1
+ xvstelm.d xr1, a0, 0, 2
+ addi.w a5, a5, -2
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ add.d a0, a1, a0
+ blt zero, a5, .AVG_W8_LASX
+ b .AVG_END_LASX
+.AVG_W16_LASX:
+ xvld xr0, a2, 0
+ xvld xr2, a2, 32
+ xvld xr1, a3, 0
+ xvld xr3, a3, 32
+ xvadd.h xr4, xr0, xr1
+ xvadd.h xr5, xr2, xr3
+ xvssrarni.bu.h xr5, xr4, bpc_sh
+ xvpermi.d xr2, xr5, 0xd8
+ xvpermi.d xr3, xr5, 0x8d
+ vst vr2, a0, 0
+ vstx vr3, a0, a1
+ addi.w a5, a5, -2
+ addi.d a2, a2, 64
+ addi.d a3, a3, 64
+ alsl.d a0, a1, a0, 1
+ blt zero, a5, .AVG_W16_LASX
+ b .AVG_END_LASX
+.AVG_W32_LASX:
+ xvld xr0, a2, 0
+ xvld xr2, a2, 32
+ xvld xr1, a3, 0
+ xvld xr3, a3, 32
+ xvadd.h xr4, xr0, xr1
+ xvadd.h xr5, xr2, xr3
+ xvssrarni.bu.h xr5, xr4, bpc_sh
+ xvpermi.d xr6, xr5, 0xd8
+ xvst xr6, a0, 0
+ addi.w a5, a5, -1
+ addi.d a2, a2, 64
+ addi.d a3, a3, 64
+ add.d a0, a0, a1
+ blt zero, a5, .AVG_W32_LASX
+ b .AVG_END_LASX
+.AVG_W64_LASX:
+ xvld xr0, a2, 0
+ xvld xr2, a2, 32
+ xvld xr4, a2, 64
+ xvld xr6, a2, 96
+ xvld xr1, a3, 0
+ xvld xr3, a3, 32
+ xvld xr5, a3, 64
+ xvld xr7, a3, 96
+ xvadd.h xr0, xr0, xr1
+ xvadd.h xr2, xr2, xr3
+ xvadd.h xr4, xr4, xr5
+ xvadd.h xr6, xr6, xr7
+ xvssrarni.bu.h xr2, xr0, bpc_sh
+ xvssrarni.bu.h xr6, xr4, bpc_sh
+ xvpermi.d xr1, xr2, 0xd8
+ xvpermi.d xr3, xr6, 0xd8
+ xvst xr1, a0, 0
+ xvst xr3, a0, 32
+ addi.w a5, a5, -1
+ addi.d a2, a2, 128
+ addi.d a3, a3, 128
+ add.d a0, a0, a1
+ blt zero, a5, .AVG_W64_LASX
+ b .AVG_END_LASX
+.AVG_W128_LASX:
+ xvld xr0, a2, 0
+ xvld xr2, a2, 32
+ xvld xr4, a2, 64
+ xvld xr6, a2, 96
+ xvld xr8, a2, 128
+ xvld xr10, a2, 160
+ xvld xr12, a2, 192
+ xvld xr14, a2, 224
+ xvld xr1, a3, 0
+ xvld xr3, a3, 32
+ xvld xr5, a3, 64
+ xvld xr7, a3, 96
+ xvld xr9, a3, 128
+ xvld xr11, a3, 160
+ xvld xr13, a3, 192
+ xvld xr15, a3, 224
+ xvadd.h xr0, xr0, xr1
+ xvadd.h xr2, xr2, xr3
+ xvadd.h xr4, xr4, xr5
+ xvadd.h xr6, xr6, xr7
+ xvadd.h xr8, xr8, xr9
+ xvadd.h xr10, xr10, xr11
+ xvadd.h xr12, xr12, xr13
+ xvadd.h xr14, xr14, xr15
+ xvssrarni.bu.h xr2, xr0, bpc_sh
+ xvssrarni.bu.h xr6, xr4, bpc_sh
+ xvssrarni.bu.h xr10, xr8, bpc_sh
+ xvssrarni.bu.h xr14, xr12, bpc_sh
+ xvpermi.d xr1, xr2, 0xd8
+ xvpermi.d xr3, xr6, 0xd8
+ xvpermi.d xr5, xr10, 0xd8
+ xvpermi.d xr7, xr14, 0xd8
+ xvst xr1, a0, 0
+ xvst xr3, a0, 32
+ xvst xr5, a0, 64
+ xvst xr7, a0, 96
+ addi.w a5, a5, -1
+ addi.d a2, a2, 256
+ addi.d a3, a3, 256
+ add.d a0, a0, a1
+ blt zero, a5, .AVG_W128_LASX
+.AVG_END_LASX:
+endfunc
+
+function w_avg_8bpc_lsx
+ addi.d t8, a0, 0
+ li.w t2, 16
+ sub.w t2, t2, a6 // 16 - weight
+ vreplgr2vr.h vr21, a6
+ vreplgr2vr.h vr22, t2
+
+ clz.w t0, a4
+ li.w t1, 24
+ sub.w t0, t0, t1
+ la.local t1, .W_AVG_LSX_JRTABLE
+ alsl.d t0, t0, t1, 1
+ ld.h t2, t0, 0
+ add.d t1, t1, t2
+ jirl $r0, t1, 0
+
+ .align 3
+.W_AVG_LSX_JRTABLE:
+ .hword .W_AVG_W128_LSX - .W_AVG_LSX_JRTABLE
+ .hword .W_AVG_W64_LSX - .W_AVG_LSX_JRTABLE
+ .hword .W_AVG_W32_LSX - .W_AVG_LSX_JRTABLE
+ .hword .W_AVG_W16_LSX - .W_AVG_LSX_JRTABLE
+ .hword .W_AVG_W8_LSX - .W_AVG_LSX_JRTABLE
+ .hword .W_AVG_W4_LSX - .W_AVG_LSX_JRTABLE
+
+.W_AVG_W4_LSX:
+ vld vr0, a2, 0
+ vld vr1, a3, 0
+ vmulwev.w.h vr2, vr0, vr21
+ vmulwod.w.h vr3, vr0, vr21
+ vmaddwev.w.h vr2, vr1, vr22
+ vmaddwod.w.h vr3, vr1, vr22
+ vssrarni.hu.w vr3, vr2, bpcw_sh
+ vssrlni.bu.h vr1, vr3, 0
+ vpickod.w vr4, vr2, vr1
+ vilvl.b vr0, vr4, vr1
+ fst.s f0, a0, 0
+ add.d a0, a0, a1
+ vstelm.w vr0, a0, 0, 1
+ addi.w a5, a5, -2
+ addi.d a2, a2, 16
+ addi.d a3, a3, 16
+ add.d a0, a1, a0
+ blt zero, a5, .W_AVG_W4_LSX
+ b .W_AVG_END_LSX
+.W_AVG_W8_LSX:
+ vld vr0, a2, 0
+ vld vr1, a3, 0
+ vmulwev.w.h vr2, vr0, vr21
+ vmulwod.w.h vr3, vr0, vr21
+ vmaddwev.w.h vr2, vr1, vr22
+ vmaddwod.w.h vr3, vr1, vr22
+ vssrarni.hu.w vr3, vr2, bpcw_sh
+ vssrlni.bu.h vr1, vr3, 0
+ vpickod.w vr4, vr2, vr1
+ vilvl.b vr0, vr4, vr1
+ fst.d f0, a0, 0
+ addi.w a5, a5, -1
+ addi.d a2, a2, 16
+ addi.d a3, a3, 16
+ add.d a0, a0, a1
+ blt zero, a5, .W_AVG_W8_LSX
+ b .W_AVG_END_LSX
+.W_AVG_W16_LSX:
+ vld vr0, a2, 0
+ vld vr2, a2, 16
+ vld vr1, a3, 0
+ vld vr3, a3, 16
+ vmulwev.w.h vr4, vr0, vr21
+ vmulwod.w.h vr5, vr0, vr21
+ vmulwev.w.h vr6, vr2, vr21
+ vmulwod.w.h vr7, vr2, vr21
+ vmaddwev.w.h vr4, vr1, vr22
+ vmaddwod.w.h vr5, vr1, vr22
+ vmaddwev.w.h vr6, vr3, vr22
+ vmaddwod.w.h vr7, vr3, vr22
+ vssrarni.hu.w vr6, vr4, bpcw_sh
+ vssrarni.hu.w vr7, vr5, bpcw_sh
+ vssrlrni.bu.h vr7, vr6, 0
+ vshuf4i.w vr8, vr7, 0x4E
+ vilvl.b vr0, vr8, vr7
+ vst vr0, a0, 0
+ addi.w a5, a5, -1
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ add.d a0, a0, a1
+ blt zero, a5, .W_AVG_W16_LSX
+ b .W_AVG_END_LSX
+.W_AVG_W32_LSX:
+.rept 2
+ vld vr0, a2, 0
+ vld vr2, a2, 16
+ vld vr1, a3, 0
+ vld vr3, a3, 16
+ vmulwev.w.h vr4, vr0, vr21
+ vmulwod.w.h vr5, vr0, vr21
+ vmulwev.w.h vr6, vr2, vr21
+ vmulwod.w.h vr7, vr2, vr21
+ vmaddwev.w.h vr4, vr1, vr22
+ vmaddwod.w.h vr5, vr1, vr22
+ vmaddwev.w.h vr6, vr3, vr22
+ vmaddwod.w.h vr7, vr3, vr22
+ vssrarni.hu.w vr6, vr4, bpcw_sh
+ vssrarni.hu.w vr7, vr5, bpcw_sh
+ vssrlrni.bu.h vr7, vr6, 0
+ vshuf4i.w vr8, vr7, 0x4E
+ vilvl.b vr0, vr8, vr7
+ vst vr0, a0, 0
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a0, a0, 16
+.endr
+ addi.w a5, a5, -1
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ blt zero, a5, .W_AVG_W32_LSX
+ b .W_AVG_END_LSX
+
+.W_AVG_W64_LSX:
+.rept 4
+ vld vr0, a2, 0
+ vld vr2, a2, 16
+ vld vr1, a3, 0
+ vld vr3, a3, 16
+ vmulwev.w.h vr4, vr0, vr21
+ vmulwod.w.h vr5, vr0, vr21
+ vmulwev.w.h vr6, vr2, vr21
+ vmulwod.w.h vr7, vr2, vr21
+ vmaddwev.w.h vr4, vr1, vr22
+ vmaddwod.w.h vr5, vr1, vr22
+ vmaddwev.w.h vr6, vr3, vr22
+ vmaddwod.w.h vr7, vr3, vr22
+ vssrarni.hu.w vr6, vr4, bpcw_sh
+ vssrarni.hu.w vr7, vr5, bpcw_sh
+ vssrlrni.bu.h vr7, vr6, 0
+ vshuf4i.w vr8, vr7, 0x4E
+ vilvl.b vr0, vr8, vr7
+ vst vr0, a0, 0
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a0, a0, 16
+.endr
+ addi.w a5, a5, -1
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ blt zero, a5, .W_AVG_W64_LSX
+ b .W_AVG_END_LSX
+
+.W_AVG_W128_LSX:
+.rept 8
+ vld vr0, a2, 0
+ vld vr2, a2, 16
+ vld vr1, a3, 0
+ vld vr3, a3, 16
+ vmulwev.w.h vr4, vr0, vr21
+ vmulwod.w.h vr5, vr0, vr21
+ vmulwev.w.h vr6, vr2, vr21
+ vmulwod.w.h vr7, vr2, vr21
+ vmaddwev.w.h vr4, vr1, vr22
+ vmaddwod.w.h vr5, vr1, vr22
+ vmaddwev.w.h vr6, vr3, vr22
+ vmaddwod.w.h vr7, vr3, vr22
+ vssrarni.hu.w vr6, vr4, bpcw_sh
+ vssrarni.hu.w vr7, vr5, bpcw_sh
+ vssrlrni.bu.h vr7, vr6, 0
+ vshuf4i.w vr8, vr7, 0x4E
+ vilvl.b vr0, vr8, vr7
+ vst vr0, a0, 0
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a0, a0, 16
+.endr
+ addi.w a5, a5, -1
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ blt zero, a5, .W_AVG_W128_LSX
+.W_AVG_END_LSX:
+endfunc
+
+function w_avg_8bpc_lasx
+ addi.d t8, a0, 0
+ li.w t2, 16
+ sub.w t2, t2, a6 // 16 - weight
+ xvreplgr2vr.h xr21, a6
+ xvreplgr2vr.h xr22, t2
+
+ clz.w t0, a4
+ li.w t1, 24
+ sub.w t0, t0, t1
+ la.local t1, .W_AVG_LASX_JRTABLE
+ alsl.d t0, t0, t1, 1
+ ld.h t2, t0, 0
+ add.d t1, t1, t2
+ jirl $r0, t1, 0
+
+ .align 3
+.W_AVG_LASX_JRTABLE:
+ .hword .W_AVG_W128_LASX - .W_AVG_LASX_JRTABLE
+ .hword .W_AVG_W64_LASX - .W_AVG_LASX_JRTABLE
+ .hword .W_AVG_W32_LASX - .W_AVG_LASX_JRTABLE
+ .hword .W_AVG_W16_LASX - .W_AVG_LASX_JRTABLE
+ .hword .W_AVG_W8_LASX - .W_AVG_LASX_JRTABLE
+ .hword .W_AVG_W4_LASX - .W_AVG_LASX_JRTABLE
+
+.W_AVG_W4_LASX:
+ vld vr0, a2, 0
+ vld vr1, a3, 0
+ xvpermi.d xr2, xr0, 0xD8
+ xvpermi.d xr3, xr1, 0xD8
+ xvilvl.h xr4, xr3, xr2
+ xvmulwev.w.h xr0, xr4, xr21
+ xvmaddwod.w.h xr0, xr4, xr22
+ xvssrarni.hu.w xr1, xr0, bpcw_sh
+ xvssrlni.bu.h xr0, xr1, 0
+ fst.s f0, a0, 0
+ add.d a0, a0, a1
+ xvstelm.w xr0, a0, 0, 4
+ addi.w a5, a5, -2
+ addi.d a2, a2, 16
+ addi.d a3, a3, 16
+ add.d a0, a1, a0
+ blt zero, a5, .W_AVG_W4_LASX
+ b .W_AVG_END_LASX
+
+.W_AVG_W8_LASX:
+ xvld xr0, a2, 0
+ xvld xr1, a3, 0
+ xvmulwev.w.h xr2, xr0, xr21
+ xvmulwod.w.h xr3, xr0, xr21
+ xvmaddwev.w.h xr2, xr1, xr22
+ xvmaddwod.w.h xr3, xr1, xr22
+ xvssrarni.hu.w xr3, xr2, bpcw_sh
+ xvssrlni.bu.h xr1, xr3, 0
+ xvpickod.w xr4, xr2, xr1
+ xvilvl.b xr0, xr4, xr1
+ xvstelm.d xr0, a0, 0, 0
+ add.d a0, a0, a1
+ xvstelm.d xr0, a0, 0, 2
+ addi.w a5, a5, -2
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ add.d a0, a0, a1
+ blt zero, a5, .W_AVG_W8_LASX
+ b .W_AVG_END_LASX
+
+.W_AVG_W16_LASX:
+ xvld xr0, a2, 0
+ xvld xr1, a3, 0
+ xvmulwev.w.h xr2, xr0, xr21
+ xvmulwod.w.h xr3, xr0, xr21
+ xvmaddwev.w.h xr2, xr1, xr22
+ xvmaddwod.w.h xr3, xr1, xr22
+ xvssrarni.hu.w xr3, xr2, bpcw_sh
+ xvssrlni.bu.h xr1, xr3, 0
+ xvpickod.w xr4, xr2, xr1
+ xvilvl.b xr0, xr4, xr1
+ xvpermi.d xr1, xr0, 0xD8
+ vst vr1, a0, 0
+ addi.w a5, a5, -1
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ add.d a0, a0, a1
+ blt zero, a5, .W_AVG_W16_LASX
+    b .W_AVG_END_LASX
+
+.W_AVG_W32_LASX:
+ xvld xr0, a2, 0
+ xvld xr2, a2, 32
+ xvld xr1, a3, 0
+ xvld xr3, a3, 32
+ xvmulwev.w.h xr4, xr0, xr21
+ xvmulwod.w.h xr5, xr0, xr21
+ xvmulwev.w.h xr6, xr2, xr21
+ xvmulwod.w.h xr7, xr2, xr21
+ xvmaddwev.w.h xr4, xr1, xr22
+ xvmaddwod.w.h xr5, xr1, xr22
+ xvmaddwev.w.h xr6, xr3, xr22
+ xvmaddwod.w.h xr7, xr3, xr22
+ xvssrarni.hu.w xr6, xr4, bpcw_sh
+ xvssrarni.hu.w xr7, xr5, bpcw_sh
+ xvssrlni.bu.h xr7, xr6, 0
+ xvshuf4i.w xr8, xr7, 0x4E
+ xvilvl.b xr9, xr8, xr7
+ xvpermi.d xr0, xr9, 0xD8
+ xvst xr0, a0, 0
+ addi.w a5, a5, -1
+ addi.d a2, a2, 64
+ addi.d a3, a3, 64
+ add.d a0, a0, a1
+ blt zero, a5, .W_AVG_W32_LASX
+ b .W_AVG_END_LASX
+
+.W_AVG_W64_LASX:
+.rept 2
+ xvld xr0, a2, 0
+ xvld xr2, a2, 32
+ xvld xr1, a3, 0
+ xvld xr3, a3, 32
+ xvmulwev.w.h xr4, xr0, xr21
+ xvmulwod.w.h xr5, xr0, xr21
+ xvmulwev.w.h xr6, xr2, xr21
+ xvmulwod.w.h xr7, xr2, xr21
+ xvmaddwev.w.h xr4, xr1, xr22
+ xvmaddwod.w.h xr5, xr1, xr22
+ xvmaddwev.w.h xr6, xr3, xr22
+ xvmaddwod.w.h xr7, xr3, xr22
+ xvssrarni.hu.w xr6, xr4, bpcw_sh
+ xvssrarni.hu.w xr7, xr5, bpcw_sh
+ xvssrlni.bu.h xr7, xr6, 0
+ xvshuf4i.w xr8, xr7, 0x4E
+ xvilvl.b xr9, xr8, xr7
+ xvpermi.d xr0, xr9, 0xD8
+ xvst xr0, a0, 0
+ addi.d a2, a2, 64
+ addi.d a3, a3, 64
+ addi.d a0, a0, 32
+.endr
+ addi.w a5, a5, -1
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ blt zero, a5, .W_AVG_W64_LASX
+ b .W_AVG_END_LASX
+
+.W_AVG_W128_LASX:
+.rept 4
+ xvld xr0, a2, 0
+ xvld xr2, a2, 32
+ xvld xr1, a3, 0
+ xvld xr3, a3, 32
+ xvmulwev.w.h xr4, xr0, xr21
+ xvmulwod.w.h xr5, xr0, xr21
+ xvmulwev.w.h xr6, xr2, xr21
+ xvmulwod.w.h xr7, xr2, xr21
+ xvmaddwev.w.h xr4, xr1, xr22
+ xvmaddwod.w.h xr5, xr1, xr22
+ xvmaddwev.w.h xr6, xr3, xr22
+ xvmaddwod.w.h xr7, xr3, xr22
+ xvssrarni.hu.w xr6, xr4, bpcw_sh
+ xvssrarni.hu.w xr7, xr5, bpcw_sh
+ xvssrlni.bu.h xr7, xr6, 0
+ xvshuf4i.w xr8, xr7, 0x4E
+ xvilvl.b xr9, xr8, xr7
+ xvpermi.d xr0, xr9, 0xD8
+ xvst xr0, a0, 0
+ addi.d a2, a2, 64
+ addi.d a3, a3, 64
+ addi.d a0, a0, 32
+.endr
+
+ addi.w a5, a5, -1
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ blt zero, a5, .W_AVG_W128_LASX
+.W_AVG_END_LASX:
+endfunc
+
+#undef bpc_sh
+#undef bpcw_sh
+
+#define mask_sh 10
+/*
+static void mask_c(pixel *dst, const ptrdiff_t dst_stride,
+ const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
+ const uint8_t *mask HIGHBD_DECL_SUFFIX)
+*/
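+
+/* Scalar sketch of the kernel below (8 bpc): a per-pixel blend of the two
+ * intermediates with a 0..64 mask, rounded and clipped by the saturating
+ * shift:
+ *
+ *     dst[x] = iclip_pixel((tmp1[x] * mask[x]
+ *                           + tmp2[x] * (64 - mask[x]) + 512) >> mask_sh);
+ */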
+function mask_8bpc_lsx
+ vldi vr21, 0x440 // 64
+ vxor.v vr19, vr19, vr19
+ addi.d t8, a0, 0
+ clz.w t0, a4
+ li.w t1, 24
+ sub.w t0, t0, t1
+ la.local t1, .MASK_LSX_JRTABLE
+ alsl.d t0, t0, t1, 1
+ ld.h t2, t0, 0
+ add.d t1, t1, t2
+ jirl $r0, t1, 0
+
+ .align 3
+.MASK_LSX_JRTABLE:
+ .hword .MASK_W128_LSX - .MASK_LSX_JRTABLE
+ .hword .MASK_W64_LSX - .MASK_LSX_JRTABLE
+ .hword .MASK_W32_LSX - .MASK_LSX_JRTABLE
+ .hword .MASK_W16_LSX - .MASK_LSX_JRTABLE
+ .hword .MASK_W8_LSX - .MASK_LSX_JRTABLE
+ .hword .MASK_W4_LSX - .MASK_LSX_JRTABLE
+
+.MASK_W4_LSX:
+ vld vr0, a2, 0
+ vld vr1, a3, 0
+ fld.d f22, a6, 0
+
+ vilvl.b vr2, vr19, vr22
+ vsub.h vr3, vr21, vr2
+
+ vmulwev.w.h vr4, vr0, vr2
+ vmulwod.w.h vr5, vr0, vr2
+ vmaddwev.w.h vr4, vr1, vr3
+ vmaddwod.w.h vr5, vr1, vr3
+ vssrarni.hu.w vr5, vr4, mask_sh
+ vssrlrni.bu.h vr1, vr5, 0
+ vpickod.w vr4, vr2, vr1
+ vilvl.b vr0, vr4, vr1
+ fst.s f0, a0, 0
+ add.d a0, a0, a1
+ vstelm.w vr0, a0, 0, 1
+ addi.d a2, a2, 16
+ addi.d a3, a3, 16
+ addi.d a6, a6, 8
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ blt zero, a5, .MASK_W4_LSX
+ b .MASK_END_LSX
+.MASK_W8_LSX:
+ vld vr0, a2, 0
+ vld vr10, a2, 16
+ vld vr1, a3, 0
+ vld vr11, a3, 16
+ vld vr22, a6, 0
+
+ vilvl.b vr2, vr19, vr22
+ vilvh.b vr12, vr19, vr22
+ vsub.h vr3, vr21, vr2
+ vsub.h vr13, vr21, vr12
+
+ vmulwev.w.h vr4, vr0, vr2
+ vmulwod.w.h vr5, vr0, vr2
+ vmulwev.w.h vr14, vr10, vr12
+ vmulwod.w.h vr15, vr10, vr12
+ vmaddwev.w.h vr4, vr1, vr3
+ vmaddwod.w.h vr5, vr1, vr3
+ vmaddwev.w.h vr14, vr11, vr13
+ vmaddwod.w.h vr15, vr11, vr13
+ vssrarni.hu.w vr14, vr4, mask_sh
+ vssrarni.hu.w vr15, vr5, mask_sh
+ vssrlrni.bu.h vr15, vr14, 0
+ vshuf4i.w vr6, vr15, 0x4E
+ vilvl.b vr0, vr6, vr15
+ fst.d f0, a0, 0
+ add.d a0, a0, a1
+ vstelm.d vr0, a0, 0, 1
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a6, a6, 16
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ blt zero, a5, .MASK_W8_LSX
+ b .MASK_END_LSX
+
+.MASK_W16_LSX:
+ vld vr0, a2, 0
+ vld vr10, a2, 16
+ vld vr1, a3, 0
+ vld vr11, a3, 16
+ vld vr22, a6, 0
+
+ vilvl.b vr2, vr19, vr22
+ vilvh.b vr12, vr19, vr22
+ vsub.h vr3, vr21, vr2
+ vsub.h vr13, vr21, vr12
+
+ vmulwev.w.h vr4, vr0, vr2
+ vmulwod.w.h vr5, vr0, vr2
+ vmulwev.w.h vr14, vr10, vr12
+ vmulwod.w.h vr15, vr10, vr12
+ vmaddwev.w.h vr4, vr1, vr3
+ vmaddwod.w.h vr5, vr1, vr3
+ vmaddwev.w.h vr14, vr11, vr13
+ vmaddwod.w.h vr15, vr11, vr13
+ vssrarni.hu.w vr14, vr4, mask_sh
+ vssrarni.hu.w vr15, vr5, mask_sh
+ vssrlrni.bu.h vr15, vr14, 0
+ vshuf4i.w vr6, vr15, 0x4E
+ vilvl.b vr0, vr6, vr15
+ vst vr0, a0, 0
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a6, a6, 16
+ add.d a0, a0, a1
+ addi.w a5, a5, -1
+ blt zero, a5, .MASK_W16_LSX
+ b .MASK_END_LSX
+.MASK_W32_LSX:
+.rept 2
+ vld vr0, a2, 0
+ vld vr10, a2, 16
+ vld vr1, a3, 0
+ vld vr11, a3, 16
+ vld vr22, a6, 0
+ vilvl.b vr2, vr19, vr22
+ vilvh.b vr12, vr19, vr22
+ vsub.h vr3, vr21, vr2
+ vsub.h vr13, vr21, vr12
+ vmulwev.w.h vr4, vr0, vr2
+ vmulwod.w.h vr5, vr0, vr2
+ vmulwev.w.h vr14, vr10, vr12
+ vmulwod.w.h vr15, vr10, vr12
+ vmaddwev.w.h vr4, vr1, vr3
+ vmaddwod.w.h vr5, vr1, vr3
+ vmaddwev.w.h vr14, vr11, vr13
+ vmaddwod.w.h vr15, vr11, vr13
+ vssrarni.hu.w vr14, vr4, mask_sh
+ vssrarni.hu.w vr15, vr5, mask_sh
+ vssrlrni.bu.h vr15, vr14, 0
+ vshuf4i.w vr6, vr15, 0x4E
+ vilvl.b vr0, vr6, vr15
+ vst vr0, a0, 0
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a6, a6, 16
+ addi.d a0, a0, 16
+.endr
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ addi.w a5, a5, -1
+ blt zero, a5, .MASK_W32_LSX
+ b .MASK_END_LSX
+.MASK_W64_LSX:
+.rept 4
+ vld vr0, a2, 0
+ vld vr10, a2, 16
+ vld vr1, a3, 0
+ vld vr11, a3, 16
+ vld vr22, a6, 0
+ vilvl.b vr2, vr19, vr22
+ vilvh.b vr12, vr19, vr22
+ vsub.h vr3, vr21, vr2
+ vsub.h vr13, vr21, vr12
+ vmulwev.w.h vr4, vr0, vr2
+ vmulwod.w.h vr5, vr0, vr2
+ vmulwev.w.h vr14, vr10, vr12
+ vmulwod.w.h vr15, vr10, vr12
+ vmaddwev.w.h vr4, vr1, vr3
+ vmaddwod.w.h vr5, vr1, vr3
+ vmaddwev.w.h vr14, vr11, vr13
+ vmaddwod.w.h vr15, vr11, vr13
+ vssrarni.hu.w vr14, vr4, mask_sh
+ vssrarni.hu.w vr15, vr5, mask_sh
+ vssrlrni.bu.h vr15, vr14, 0
+ vshuf4i.w vr6, vr15, 0x4E
+ vilvl.b vr0, vr6, vr15
+ vst vr0, a0, 0
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a6, a6, 16
+ addi.d a0, a0, 16
+.endr
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ addi.w a5, a5, -1
+ blt zero, a5, .MASK_W64_LSX
+ b .MASK_END_LSX
+.MASK_W128_LSX:
+.rept 8
+ vld vr0, a2, 0
+ vld vr10, a2, 16
+ vld vr1, a3, 0
+ vld vr11, a3, 16
+ vld vr22, a6, 0
+ vilvl.b vr2, vr19, vr22
+ vilvh.b vr12, vr19, vr22
+ vsub.h vr3, vr21, vr2
+ vsub.h vr13, vr21, vr12
+ vmulwev.w.h vr4, vr0, vr2
+ vmulwod.w.h vr5, vr0, vr2
+ vmulwev.w.h vr14, vr10, vr12
+ vmulwod.w.h vr15, vr10, vr12
+ vmaddwev.w.h vr4, vr1, vr3
+ vmaddwod.w.h vr5, vr1, vr3
+ vmaddwev.w.h vr14, vr11, vr13
+ vmaddwod.w.h vr15, vr11, vr13
+ vssrarni.hu.w vr14, vr4, mask_sh
+ vssrarni.hu.w vr15, vr5, mask_sh
+ vssrlrni.bu.h vr15, vr14, 0
+ vshuf4i.w vr6, vr15, 0x4E
+ vilvl.b vr0, vr6, vr15
+ vst vr0, a0, 0
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a6, a6, 16
+ addi.d a0, a0, 16
+.endr
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ addi.w a5, a5, -1
+ blt zero, a5, .MASK_W128_LSX
+.MASK_END_LSX:
+endfunc
+
+function mask_8bpc_lasx
+ xvldi xr21, 0x440 // 64
+ xvxor.v xr19, xr19, xr19
+ addi.d t8, a0, 0
+ clz.w t0, a4
+ li.w t1, 24
+ sub.w t0, t0, t1
+ la.local t1, .MASK_LASX_JRTABLE
+ alsl.d t0, t0, t1, 1
+ ld.h t2, t0, 0
+ add.d t1, t1, t2
+ jirl $r0, t1, 0
+
+ .align 3
+.MASK_LASX_JRTABLE:
+ .hword .MASK_W128_LASX - .MASK_LASX_JRTABLE
+ .hword .MASK_W64_LASX - .MASK_LASX_JRTABLE
+ .hword .MASK_W32_LASX - .MASK_LASX_JRTABLE
+ .hword .MASK_W16_LASX - .MASK_LASX_JRTABLE
+ .hword .MASK_W8_LASX - .MASK_LASX_JRTABLE
+ .hword .MASK_W4_LASX - .MASK_LASX_JRTABLE
+
+.MASK_W4_LASX:
+ vld vr0, a2, 0
+ vld vr1, a3, 0
+ fld.d f22, a6, 0
+
+ vilvl.h vr4, vr1, vr0
+ vilvh.h vr14, vr1, vr0
+ vilvl.b vr2, vr19, vr22
+ vsub.h vr3, vr21, vr2
+ xvpermi.q xr14, xr4, 0x20
+ vilvl.h vr5, vr3, vr2
+ vilvh.h vr15, vr3, vr2
+ xvpermi.q xr15, xr5, 0x20
+ xvmulwev.w.h xr0, xr14, xr15
+ xvmaddwod.w.h xr0, xr14, xr15
+ xvssrarni.hu.w xr1, xr0, mask_sh
+ xvssrlni.bu.h xr2, xr1, 0
+ fst.s f2, a0, 0
+ add.d a0, a0, a1
+ xvstelm.w xr2, a0, 0, 4
+
+ addi.d a2, a2, 16
+ addi.d a3, a3, 16
+ addi.d a6, a6, 8
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ blt zero, a5, .MASK_W4_LASX
+ b .MASK_END_LASX
+
+.MASK_W8_LASX:
+ xvld xr0, a2, 0
+ xvld xr1, a3, 0
+ vld vr22, a6, 0
+
+ vext2xv.hu.bu xr2, xr22
+ xvsub.h xr3, xr21, xr2
+ xvmulwev.w.h xr4, xr0, xr2
+ xvmulwod.w.h xr5, xr0, xr2
+ xvmaddwev.w.h xr4, xr1, xr3
+ xvmaddwod.w.h xr5, xr1, xr3
+ xvssrarni.hu.w xr5, xr4, mask_sh
+ xvssrlni.bu.h xr1, xr5, 0
+ xvpickod.w xr4, xr2, xr1
+ xvilvl.b xr0, xr4, xr1
+ fst.d f0, a0, 0
+ add.d a0, a0, a1
+ xvstelm.d xr0, a0, 0, 2
+
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a6, a6, 16
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ blt zero, a5, .MASK_W8_LASX
+ b .MASK_END_LASX
+
+.MASK_W16_LASX:
+ xvld xr0, a2, 0
+ xvld xr1, a3, 0
+ vld vr22, a6, 0
+
+ vext2xv.hu.bu xr2, xr22
+ xvsub.h xr3, xr21, xr2
+ xvmulwev.w.h xr4, xr0, xr2
+ xvmulwod.w.h xr5, xr0, xr2
+ xvmaddwev.w.h xr4, xr1, xr3
+ xvmaddwod.w.h xr5, xr1, xr3
+ xvssrarni.hu.w xr5, xr4, mask_sh
+ xvssrlni.bu.h xr1, xr5, 0
+ xvpickod.w xr4, xr2, xr1
+ xvilvl.b xr0, xr4, xr1
+ xvpermi.d xr1, xr0, 0xD8
+ vst vr1, a0, 0
+
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a6, a6, 16
+ add.d a0, a0, a1
+ addi.w a5, a5, -1
+ blt zero, a5, .MASK_W16_LASX
+ b .MASK_END_LASX
+.MASK_W32_LASX:
+ xvld xr0, a2, 0
+ xvld xr10, a2, 32
+ xvld xr1, a3, 0
+ xvld xr11, a3, 32
+ xvld xr22, a6, 0
+ vext2xv.hu.bu xr2, xr22
+ xvpermi.q xr4, xr22, 0x01
+ vext2xv.hu.bu xr12, xr4
+ xvsub.h xr3, xr21, xr2
+ xvsub.h xr13, xr21, xr12
+
+ xvmulwev.w.h xr4, xr0, xr2
+ xvmulwod.w.h xr5, xr0, xr2
+ xvmulwev.w.h xr14, xr10, xr12
+ xvmulwod.w.h xr15, xr10, xr12
+ xvmaddwev.w.h xr4, xr1, xr3
+ xvmaddwod.w.h xr5, xr1, xr3
+ xvmaddwev.w.h xr14, xr11, xr13
+ xvmaddwod.w.h xr15, xr11, xr13
+ xvssrarni.hu.w xr14, xr4, mask_sh
+ xvssrarni.hu.w xr15, xr5, mask_sh
+ xvssrlni.bu.h xr15, xr14, 0
+ xvshuf4i.w xr6, xr15, 0x4E
+ xvilvl.b xr1, xr6, xr15
+ xvpermi.d xr0, xr1, 0xD8
+ xvst xr0, a0, 0
+
+ addi.d a2, a2, 64
+ addi.d a3, a3, 64
+ addi.d a6, a6, 32
+ add.d a0, a0, a1
+ addi.w a5, a5, -1
+ blt zero, a5, .MASK_W32_LASX
+ b .MASK_END_LASX
+
+.MASK_W64_LASX:
+.rept 2
+ xvld xr0, a2, 0
+ xvld xr10, a2, 32
+ xvld xr1, a3, 0
+ xvld xr11, a3, 32
+ xvld xr22, a6, 0
+ vext2xv.hu.bu xr2, xr22
+ xvpermi.q xr4, xr22, 0x01
+ vext2xv.hu.bu xr12, xr4
+ xvsub.h xr3, xr21, xr2
+ xvsub.h xr13, xr21, xr12
+
+ xvmulwev.w.h xr4, xr0, xr2
+ xvmulwod.w.h xr5, xr0, xr2
+ xvmulwev.w.h xr14, xr10, xr12
+ xvmulwod.w.h xr15, xr10, xr12
+ xvmaddwev.w.h xr4, xr1, xr3
+ xvmaddwod.w.h xr5, xr1, xr3
+ xvmaddwev.w.h xr14, xr11, xr13
+ xvmaddwod.w.h xr15, xr11, xr13
+ xvssrarni.hu.w xr14, xr4, mask_sh
+ xvssrarni.hu.w xr15, xr5, mask_sh
+ xvssrlni.bu.h xr15, xr14, 0
+ xvshuf4i.w xr6, xr15, 0x4E
+ xvilvl.b xr1, xr6, xr15
+ xvpermi.d xr0, xr1, 0xD8
+ xvst xr0, a0, 0
+ addi.d a2, a2, 64
+ addi.d a3, a3, 64
+ addi.d a6, a6, 32
+ addi.d a0, a0, 32
+.endr
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ addi.w a5, a5, -1
+ blt zero, a5, .MASK_W64_LASX
+ b .MASK_END_LASX
+
+.MASK_W128_LASX:
+.rept 4
+ xvld xr0, a2, 0
+ xvld xr10, a2, 32
+ xvld xr1, a3, 0
+ xvld xr11, a3, 32
+ xvld xr22, a6, 0
+ vext2xv.hu.bu xr2, xr22
+ xvpermi.q xr4, xr22, 0x01
+ vext2xv.hu.bu xr12, xr4
+ xvsub.h xr3, xr21, xr2
+ xvsub.h xr13, xr21, xr12
+
+ xvmulwev.w.h xr4, xr0, xr2
+ xvmulwod.w.h xr5, xr0, xr2
+ xvmulwev.w.h xr14, xr10, xr12
+ xvmulwod.w.h xr15, xr10, xr12
+ xvmaddwev.w.h xr4, xr1, xr3
+ xvmaddwod.w.h xr5, xr1, xr3
+ xvmaddwev.w.h xr14, xr11, xr13
+ xvmaddwod.w.h xr15, xr11, xr13
+ xvssrarni.hu.w xr14, xr4, mask_sh
+ xvssrarni.hu.w xr15, xr5, mask_sh
+ xvssrlni.bu.h xr15, xr14, 0
+ xvshuf4i.w xr6, xr15, 0x4E
+ xvilvl.b xr1, xr6, xr15
+ xvpermi.d xr0, xr1, 0xD8
+ xvst xr0, a0, 0
+
+ addi.d a2, a2, 64
+ addi.d a3, a3, 64
+ addi.d a6, a6, 32
+ addi.d a0, a0, 32
+.endr
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ addi.w a5, a5, -1
+ blt zero, a5, .MASK_W128_LASX
+.MASK_END_LASX:
+endfunc
+
+/*
+static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride,
+ const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
+ uint8_t *mask, const int sign,
+ const int ss_hor, const int ss_ver HIGHBD_DECL_SUFFIX)
+*/
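+
+/* Scalar sketch of the 4:2:0 variant implemented below (8 bpc): a blend
+ * factor m is derived per pixel from the difference of the intermediates,
+ * the pixels are blended with it, and the mask is stored at half resolution
+ * as the rounded average of each 2x2 block of m values (sign comes in a7):
+ *
+ *     m      = imin(38 + ((abs(tmp1[x] - tmp2[x]) + 8) >> 8), 64);
+ *     dst[x] = iclip_pixel((tmp1[x] * m + tmp2[x] * (64 - m) + 512) >> 10);
+ *     mask[x >> 1] = (m00 + m01 + m10 + m11 + 2 - sign) >> 2;
+ */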
+function w_mask_420_8bpc_lsx
+ addi.d sp, sp, -24
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+    vldi vr20, 0x440 // 64
+    vreplgr2vr.h vr21, a7 // sign
+    vldi vr22, 0x426 // 38
+
+ clz.w t0, a4
+ li.w t1, 24
+ sub.w t0, t0, t1
+ la.local t1, .WMASK420_LSX_JRTABLE
+ alsl.d t0, t0, t1, 1
+ ld.h t8, t0, 0
+ add.d t1, t1, t8
+ jirl $r0, t1, 0
+
+ .align 3
+.WMASK420_LSX_JRTABLE:
+ .hword .WMASK420_W128_LSX - .WMASK420_LSX_JRTABLE
+ .hword .WMASK420_W64_LSX - .WMASK420_LSX_JRTABLE
+ .hword .WMASK420_W32_LSX - .WMASK420_LSX_JRTABLE
+ .hword .WMASK420_W16_LSX - .WMASK420_LSX_JRTABLE
+ .hword .WMASK420_W8_LSX - .WMASK420_LSX_JRTABLE
+ .hword .WMASK420_W4_LSX - .WMASK420_LSX_JRTABLE
+
+.WMASK420_W4_LSX:
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+ vld vr2, a3, 0
+ vld vr3, a3, 16
+ addi.w a5, a5, -4
+
+ vabsd.h vr4, vr0, vr2
+ vabsd.h vr5, vr1, vr3
+ vaddi.hu vr4, vr4, 8
+ vaddi.hu vr5, vr5, 8
+ vsrli.h vr4, vr4, 8
+ vsrli.h vr5, vr5, 8
+ vadd.h vr4, vr4, vr22
+ vadd.h vr5, vr5, vr22
+ vmin.hu vr6, vr4, vr20
+ vmin.hu vr7, vr5, vr20
+ vsub.h vr8, vr20, vr6
+ vsub.h vr9, vr20, vr7
+ vmulwev.w.h vr4, vr6, vr0
+ vmulwod.w.h vr5, vr6, vr0
+ vmulwev.w.h vr10, vr7, vr1
+ vmulwod.w.h vr11, vr7, vr1
+ vmaddwev.w.h vr4, vr8, vr2
+ vmaddwod.w.h vr5, vr8, vr2
+ vmaddwev.w.h vr10, vr9, vr3
+ vmaddwod.w.h vr11, vr9, vr3
+ vilvl.w vr0, vr5, vr4
+ vilvh.w vr1, vr5, vr4
+ vilvl.w vr2, vr11, vr10
+ vilvh.w vr3, vr11, vr10
+ vssrarni.hu.w vr1, vr0, 10
+ vssrarni.hu.w vr3, vr2, 10
+ vssrlni.bu.h vr3, vr1, 0
+ vstelm.w vr3, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.w vr3, a0, 0, 1
+ add.d a0, a0, a1
+ vstelm.w vr3, a0, 0, 2
+ add.d a0, a0, a1
+ vstelm.w vr3, a0, 0, 3
+ add.d a0, a0, a1
+ vpickev.h vr0, vr7, vr6
+ vpickod.h vr1, vr7, vr6
+ vadd.h vr0, vr0, vr1
+ vshuf4i.h vr0, vr0, 0xd8
+ vhaddw.w.h vr2, vr0, vr0
+ vpickev.h vr2, vr2, vr2
+ vsub.h vr2, vr2, vr21
+ vaddi.hu vr2, vr2, 2
+ vssrani.bu.h vr2, vr2, 2
+ vstelm.w vr2, a6, 0, 0
+
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a6, a6, 4
+ blt zero, a5, .WMASK420_W4_LSX
+ b .END_W420
+
+.WMASK420_W8_LSX:
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+ vld vr2, a3, 0
+ vld vr3, a3, 16
+ addi.w a5, a5, -2
+
+ vabsd.h vr4, vr0, vr2
+ vabsd.h vr5, vr1, vr3
+ vaddi.hu vr4, vr4, 8
+ vaddi.hu vr5, vr5, 8
+ vsrli.h vr4, vr4, 8
+ vsrli.h vr5, vr5, 8
+ vadd.h vr4, vr4, vr22
+ vadd.h vr5, vr5, vr22
+ vmin.hu vr6, vr4, vr20
+ vmin.hu vr7, vr5, vr20
+ vsub.h vr8, vr20, vr6
+ vsub.h vr9, vr20, vr7
+ vmulwev.w.h vr4, vr6, vr0
+ vmulwod.w.h vr5, vr6, vr0
+ vmulwev.w.h vr10, vr7, vr1
+ vmulwod.w.h vr11, vr7, vr1
+ vmaddwev.w.h vr4, vr8, vr2
+ vmaddwod.w.h vr5, vr8, vr2
+ vmaddwev.w.h vr10, vr9, vr3
+ vmaddwod.w.h vr11, vr9, vr3
+ vssrarni.hu.w vr10, vr4, 10
+ vssrarni.hu.w vr11, vr5, 10
+ vssrlni.bu.h vr11, vr10, 0
+ vshuf4i.w vr0, vr11, 0x4E
+ vilvl.b vr3, vr0, vr11
+ vstelm.d vr3, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.d vr3, a0, 0, 1
+ add.d a0, a0, a1
+ vpickev.h vr0, vr7, vr6
+ vpickod.h vr1, vr7, vr6
+ vadd.h vr0, vr0, vr1
+ vilvh.d vr2, vr0, vr0
+ vadd.h vr2, vr2, vr0
+ vsub.h vr2, vr2, vr21
+ vaddi.hu vr2, vr2, 2
+ vssrani.bu.h vr2, vr2, 2
+ vstelm.w vr2, a6, 0, 0
+
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a6, a6, 4
+ blt zero, a5, .WMASK420_W8_LSX
+ b .END_W420
+
+.WMASK420_W16_LSX:
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+ alsl.d a2, a4, a2, 1
+ vld vr2, a2, 0
+ vld vr3, a2, 16
+ vld vr4, a3, 0
+ vld vr5, a3, 16
+ alsl.d a3, a4, a3, 1
+ vld vr6, a3, 0
+ vld vr7, a3, 16
+
+ vabsd.h vr8, vr0, vr4
+ vabsd.h vr9, vr1, vr5
+ vabsd.h vr10, vr2, vr6
+ vabsd.h vr11, vr3, vr7
+ vaddi.hu vr8, vr8, 8
+ vaddi.hu vr9, vr9, 8
+ vaddi.hu vr10, vr10, 8
+ vaddi.hu vr11, vr11, 8
+ vsrli.h vr8, vr8, 8
+ vsrli.h vr9, vr9, 8
+ vsrli.h vr10, vr10, 8
+ vsrli.h vr11, vr11, 8
+ vadd.h vr8, vr8, vr22
+ vadd.h vr9, vr9, vr22
+ vadd.h vr10, vr10, vr22
+ vadd.h vr11, vr11, vr22
+ vmin.hu vr12, vr8, vr20
+ vmin.hu vr13, vr9, vr20
+ vmin.hu vr14, vr10, vr20
+ vmin.hu vr15, vr11, vr20
+ vsub.h vr16, vr20, vr12
+ vsub.h vr17, vr20, vr13
+ vsub.h vr18, vr20, vr14
+ vsub.h vr19, vr20, vr15
+ vmulwev.w.h vr8, vr12, vr0
+ vmulwod.w.h vr9, vr12, vr0
+ vmulwev.w.h vr10, vr13, vr1
+ vmulwod.w.h vr11, vr13, vr1
+ vmulwev.w.h vr23, vr14, vr2
+ vmulwod.w.h vr24, vr14, vr2
+ vmulwev.w.h vr25, vr15, vr3
+ vmulwod.w.h vr26, vr15, vr3
+ vmaddwev.w.h vr8, vr16, vr4
+ vmaddwod.w.h vr9, vr16, vr4
+ vmaddwev.w.h vr10, vr17, vr5
+ vmaddwod.w.h vr11, vr17, vr5
+ vmaddwev.w.h vr23, vr18, vr6
+ vmaddwod.w.h vr24, vr18, vr6
+ vmaddwev.w.h vr25, vr19, vr7
+ vmaddwod.w.h vr26, vr19, vr7
+ vssrarni.hu.w vr10, vr8, 10
+ vssrarni.hu.w vr11, vr9, 10
+ vssrarni.hu.w vr25, vr23, 10
+ vssrarni.hu.w vr26, vr24, 10
+ vssrlni.bu.h vr11, vr10, 0
+ vssrlni.bu.h vr26, vr25, 0
+ vshuf4i.w vr0, vr11, 0x4E
+ vshuf4i.w vr1, vr26, 0x4E
+ vilvl.b vr3, vr0, vr11
+ vilvl.b vr7, vr1, vr26
+ vst vr3, a0, 0
+ vstx vr7, a0, a1
+ vpickev.h vr0, vr13, vr12
+ vpickod.h vr1, vr13, vr12
+ vpickev.h vr2, vr15, vr14
+ vpickod.h vr3, vr15, vr14
+ vadd.h vr4, vr0, vr1
+ vadd.h vr5, vr2, vr3
+ vadd.h vr4, vr4, vr5
+ vsub.h vr4, vr4, vr21
+ vssrarni.bu.h vr4, vr4, 2
+ vstelm.d vr4, a6, 0, 0
+
+ alsl.d a2, a4, a2, 1
+ alsl.d a3, a4, a3, 1
+ alsl.d a0, a1, a0, 1
+ addi.d a6, a6, 8
+ addi.w a5, a5, -2
+ blt zero, a5, .WMASK420_W16_LSX
+ b .END_W420
+
+.WMASK420_W32_LSX:
+.WMASK420_W64_LSX:
+.WMASK420_W128_LSX:
+
+.LOOP_W32_420_LSX:
+ add.d t1, a2, zero
+ add.d t2, a3, zero
+ add.d t3, a0, zero
+ add.d t4, a6, zero
+ alsl.d t5, a4, t1, 1
+ alsl.d t6, a4, t2, 1
+ or t7, a4, a4
+
+.W32_420_LSX:
+ vld vr0, t1, 0
+ vld vr1, t1, 16
+ vld vr2, t2, 0
+ vld vr3, t2, 16
+ vld vr4, t5, 0
+ vld vr5, t5, 16
+ vld vr6, t6, 0
+ vld vr7, t6, 16
+ addi.d t1, t1, 32
+ addi.d t2, t2, 32
+ addi.d t5, t5, 32
+ addi.d t6, t6, 32
+ addi.w t7, t7, -16
+ vabsd.h vr8, vr0, vr2
+ vabsd.h vr9, vr1, vr3
+ vabsd.h vr10, vr4, vr6
+ vabsd.h vr11, vr5, vr7
+ vaddi.hu vr8, vr8, 8
+ vaddi.hu vr9, vr9, 8
+ vaddi.hu vr10, vr10, 8
+ vaddi.hu vr11, vr11, 8
+ vsrli.h vr8, vr8, 8
+ vsrli.h vr9, vr9, 8
+ vsrli.h vr10, vr10, 8
+ vsrli.h vr11, vr11, 8
+ vadd.h vr8, vr8, vr22
+ vadd.h vr9, vr9, vr22
+ vadd.h vr10, vr10, vr22
+ vadd.h vr11, vr11, vr22
+ vmin.hu vr12, vr8, vr20
+ vmin.hu vr13, vr9, vr20
+ vmin.hu vr14, vr10, vr20
+ vmin.hu vr15, vr11, vr20
+ vsub.h vr16, vr20, vr12
+ vsub.h vr17, vr20, vr13
+ vsub.h vr18, vr20, vr14
+ vsub.h vr19, vr20, vr15
+ vmulwev.w.h vr8, vr12, vr0
+ vmulwod.w.h vr9, vr12, vr0
+ vmulwev.w.h vr10, vr13, vr1
+ vmulwod.w.h vr11, vr13, vr1
+ vmulwev.w.h vr23, vr14, vr4
+ vmulwod.w.h vr24, vr14, vr4
+ vmulwev.w.h vr25, vr15, vr5
+ vmulwod.w.h vr26, vr15, vr5
+ vmaddwev.w.h vr8, vr16, vr2
+ vmaddwod.w.h vr9, vr16, vr2
+ vmaddwev.w.h vr10, vr17, vr3
+ vmaddwod.w.h vr11, vr17, vr3
+ vmaddwev.w.h vr23, vr18, vr6
+ vmaddwod.w.h vr24, vr18, vr6
+ vmaddwev.w.h vr25, vr19, vr7
+ vmaddwod.w.h vr26, vr19, vr7
+ vssrarni.hu.w vr10, vr8, 10
+ vssrarni.hu.w vr11, vr9, 10
+ vssrarni.hu.w vr25, vr23, 10
+ vssrarni.hu.w vr26, vr24, 10
+ vssrlni.bu.h vr11, vr10, 0
+ vssrlni.bu.h vr26, vr25, 0
+ vshuf4i.w vr8, vr11, 0x4E
+ vshuf4i.w vr9, vr26, 0x4E
+ vilvl.b vr3, vr8, vr11
+ vilvl.b vr7, vr9, vr26
+ vst vr3, t3, 0
+ vstx vr7, a1, t3
+ addi.d t3, t3, 16
+ vpickev.h vr8, vr13, vr12
+ vpickod.h vr9, vr13, vr12
+ vpickev.h vr10, vr15, vr14
+ vpickod.h vr11, vr15, vr14
+ vadd.h vr8, vr8, vr9
+ vadd.h vr10, vr10, vr11
+ vadd.h vr12, vr8, vr10
+ vsub.h vr12, vr12, vr21
+ vssrarni.bu.h vr12, vr12, 2
+ vstelm.d vr12, t4, 0, 0
+ addi.d t4, t4, 8
+ bne t7, zero, .W32_420_LSX
+
+ alsl.d a2, a4, a2, 2
+ alsl.d a3, a4, a3, 2
+ alsl.d a0, a1, a0, 1
+ srai.w t8, a4, 1
+ add.d a6, a6, t8
+ addi.w a5, a5, -2
+ blt zero, a5, .LOOP_W32_420_LSX
+
+.END_W420:
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ addi.d sp, sp, 24
+endfunc
+
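+// w_mask_420 (LASX): a0 = dst, a1 = dst_stride, a2/a3 = the two int16
+// intermediate buffers, a4 = w, a5 = h, a6 = 4:2:0 mask output, a7 = sign.
+// xr20 = 64 (mask clamp), xr22 = 38 (mask bias), xr21 = sign per lane.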
+function w_mask_420_8bpc_lasx
+ xvldi xr20, 0x440
+ xvreplgr2vr.h xr21, a7
+ xvldi xr22, 0x426
+
+ clz.w t0, a4
+ li.w t1, 24
+ sub.w t0, t0, t1
+ la.local t1, .WMASK420_LASX_JRTABLE
+ alsl.d t0, t0, t1, 1
+ ld.h t8, t0, 0
+ add.d t1, t1, t8
+ jirl $r0, t1, 0
+
+ .align 3
+.WMASK420_LASX_JRTABLE:
+ .hword .WMASK420_W128_LASX - .WMASK420_LASX_JRTABLE
+ .hword .WMASK420_W64_LASX - .WMASK420_LASX_JRTABLE
+ .hword .WMASK420_W32_LASX - .WMASK420_LASX_JRTABLE
+ .hword .WMASK420_W16_LASX - .WMASK420_LASX_JRTABLE
+ .hword .WMASK420_W8_LASX - .WMASK420_LASX_JRTABLE
+ .hword .WMASK420_W4_LASX - .WMASK420_LASX_JRTABLE
+
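+// All width cases below share the same per-pixel math:
+//   m    = min(((|t1 - t2| + 8) >> 8) + 38, 64)
+//   dst  = sat_u8((t1 * m + t2 * (64 - m) + 512) >> 10)
+//   mask = (m00 + m01 + m10 + m11 + 2 - sign) >> 2   (one byte per 2x2 block)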
+.WMASK420_W4_LASX:
+ xvld xr0, a2, 0
+ xvld xr1, a3, 0
+ addi.w a5, a5, -4
+
+ xvabsd.h xr2, xr0, xr1
+ xvaddi.hu xr2, xr2, 8
+ xvsrli.h xr2, xr2, 8
+ xvadd.h xr2, xr2, xr22
+ xvmin.hu xr3, xr2, xr20
+ xvsub.h xr4, xr20, xr3
+ xvmulwev.w.h xr5, xr3, xr0
+ xvmulwod.w.h xr6, xr3, xr0
+ xvmaddwev.w.h xr5, xr4, xr1
+ xvmaddwod.w.h xr6, xr4, xr1
+ xvilvl.w xr7, xr6, xr5
+ xvilvh.w xr8, xr6, xr5
+ xvssrarni.hu.w xr8, xr7, 10
+ xvssrlni.bu.h xr9, xr8, 0
+ vstelm.w vr9, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.w vr9, a0, 0, 1
+ add.d a0, a0, a1
+ xvstelm.w xr9, a0, 0, 4
+ add.d a0, a0, a1
+ xvstelm.w xr9, a0, 0, 5
+ add.d a0, a0, a1
+
+ xvhaddw.w.h xr3, xr3, xr3
+ xvpermi.d xr4, xr3, 0xb1
+ xvadd.h xr3, xr3, xr4
+ xvpickev.h xr3, xr3, xr3
+ xvsub.h xr3, xr3, xr21
+ xvssrarni.bu.h xr3, xr3, 2
+ vstelm.h vr3, a6, 0, 0
+ xvstelm.h xr3, a6, 2, 8
+
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a6, a6, 4
+ blt zero, a5, .WMASK420_W4_LASX
+ b .END_W420_LASX
+
+.WMASK420_W8_LASX:
+ xvld xr0, a2, 0
+ xvld xr1, a2, 32
+ xvld xr2, a3, 0
+ xvld xr3, a3, 32
+ addi.w a5, a5, -4
+
+ xvabsd.h xr4, xr0, xr2
+ xvabsd.h xr5, xr1, xr3
+ xvaddi.hu xr4, xr4, 8
+ xvaddi.hu xr5, xr5, 8
+ xvsrli.h xr4, xr4, 8
+ xvsrli.h xr5, xr5, 8
+ xvadd.h xr4, xr4, xr22
+ xvadd.h xr5, xr5, xr22
+ xvmin.hu xr6, xr4, xr20
+ xvmin.hu xr7, xr5, xr20
+ xvsub.h xr8, xr20, xr6
+ xvsub.h xr9, xr20, xr7
+ xvmulwev.w.h xr10, xr6, xr0
+ xvmulwod.w.h xr11, xr6, xr0
+ xvmulwev.w.h xr12, xr7, xr1
+ xvmulwod.w.h xr13, xr7, xr1
+ xvmaddwev.w.h xr10, xr8, xr2
+ xvmaddwod.w.h xr11, xr8, xr2
+ xvmaddwev.w.h xr12, xr9, xr3
+ xvmaddwod.w.h xr13, xr9, xr3
+ xvssrarni.hu.w xr12, xr10, 10
+ xvssrarni.hu.w xr13, xr11, 10
+ xvssrlni.bu.h xr13, xr12, 0
+ xvshuf4i.w xr1, xr13, 0x4E
+ xvilvl.b xr17, xr1, xr13
+ vstelm.d vr17, a0, 0, 0
+ add.d a0, a0, a1
+ xvstelm.d xr17, a0, 0, 2
+ add.d a0, a0, a1
+ xvstelm.d xr17, a0, 0, 1
+ add.d a0, a0, a1
+ xvstelm.d xr17, a0, 0, 3
+ add.d a0, a0, a1
+
+ xvhaddw.w.h xr6, xr6, xr6
+ xvhaddw.w.h xr7, xr7, xr7
+ xvpickev.h xr8, xr7, xr6
+ xvpermi.q xr9, xr8, 0x01
+ vadd.h vr8, vr8, vr9
+ vsub.h vr8, vr8, vr21
+ vssrarni.bu.h vr8, vr8, 2
+ vstelm.d vr8, a6, 0, 0
+ addi.d a2, a2, 64
+ addi.d a3, a3, 64
+ addi.d a6, a6, 8
+ blt zero, a5, .WMASK420_W8_LASX
+ b .END_W420_LASX
+
+.WMASK420_W16_LASX:
+ xvld xr0, a2, 0
+ xvld xr1, a2, 32
+ xvld xr2, a3, 0
+ xvld xr3, a3, 32
+ addi.w a5, a5, -2
+
+ xvabsd.h xr4, xr0, xr2
+ xvabsd.h xr5, xr1, xr3
+ xvaddi.hu xr4, xr4, 8
+ xvaddi.hu xr5, xr5, 8
+ xvsrli.h xr4, xr4, 8
+ xvsrli.h xr5, xr5, 8
+ xvadd.h xr4, xr4, xr22
+ xvadd.h xr5, xr5, xr22
+ xvmin.hu xr4, xr4, xr20
+ xvmin.hu xr5, xr5, xr20
+ xvsub.h xr6, xr20, xr4
+ xvsub.h xr7, xr20, xr5
+ xvmulwev.w.h xr8, xr4, xr0
+ xvmulwod.w.h xr9, xr4, xr0
+ xvmulwev.w.h xr10, xr5, xr1
+ xvmulwod.w.h xr11, xr5, xr1
+ xvmaddwev.w.h xr8, xr6, xr2
+ xvmaddwod.w.h xr9, xr6, xr2
+ xvmaddwev.w.h xr10, xr7, xr3
+ xvmaddwod.w.h xr11, xr7, xr3
+ xvssrarni.hu.w xr10, xr8, 10
+ xvssrarni.hu.w xr11, xr9, 10
+ xvssrlni.bu.h xr11, xr10, 0
+ xvshuf4i.w xr8, xr11, 0x4E
+ xvilvl.b xr15, xr8, xr11
+ xvpermi.d xr16, xr15, 0xd8
+ vst vr16, a0, 0
+ add.d a0, a0, a1
+ xvpermi.q xr16, xr16, 0x01
+ vst vr16, a0, 0
+ add.d a0, a0, a1
+
+ xvhaddw.w.h xr4, xr4, xr4
+ xvhaddw.w.h xr5, xr5, xr5
+ xvadd.h xr4, xr5, xr4
+ xvpickev.h xr6, xr4, xr4
+ xvpermi.d xr7, xr6, 0x08
+ vsub.h vr7, vr7, vr21
+ vssrarni.bu.h vr7, vr7, 2
+ vstelm.d vr7, a6, 0, 0
+
+ addi.d a2, a2, 64
+ addi.d a3, a3, 64
+ addi.d a6, a6, 8
+ blt zero, a5, .WMASK420_W16_LASX
+ b .END_W420_LASX
+
+.WMASK420_W32_LASX:
+.WMASK420_W64_LASX:
+.WMASK420_W128_LASX:
+
+.LOOP_W32_420_LASX:
+ add.d t1, a2, zero
+ add.d t2, a3, zero
+ add.d t3, a0, zero
+ add.d t4, a6, zero
+ alsl.d t5, a4, t1, 1
+ alsl.d t6, a4, t2, 1
+ or t7, a4, a4
+.W32_420_LASX:
+ xvld xr0, t1, 0
+ xvld xr1, t2, 0
+ xvld xr2, t5, 0
+ xvld xr3, t6, 0
+ addi.d t1, t1, 32
+ addi.d t2, t2, 32
+ addi.d t5, t5, 32
+ addi.d t6, t6, 32
+ addi.w t7, t7, -16
+ xvabsd.h xr4, xr0, xr1
+ xvabsd.h xr5, xr2, xr3
+ xvaddi.hu xr4, xr4, 8
+ xvaddi.hu xr5, xr5, 8
+ xvsrli.h xr4, xr4, 8
+ xvsrli.h xr5, xr5, 8
+ xvadd.h xr4, xr4, xr22
+ xvadd.h xr5, xr5, xr22
+ xvmin.hu xr6, xr4, xr20
+ xvmin.hu xr7, xr5, xr20
+ xvsub.h xr8, xr20, xr6
+ xvsub.h xr9, xr20, xr7
+ xvmulwev.w.h xr10, xr6, xr0
+ xvmulwod.w.h xr11, xr6, xr0
+ xvmulwev.w.h xr12, xr7, xr2
+ xvmulwod.w.h xr13, xr7, xr2
+ xvmaddwev.w.h xr10, xr8, xr1
+ xvmaddwod.w.h xr11, xr8, xr1
+ xvmaddwev.w.h xr12, xr9, xr3
+ xvmaddwod.w.h xr13, xr9, xr3
+ xvssrarni.hu.w xr12, xr10, 10
+ xvssrarni.hu.w xr13, xr11, 10
+ xvssrlni.bu.h xr13, xr12, 0
+ xvshuf4i.w xr10, xr13, 0x4E
+ xvilvl.b xr17, xr10, xr13
+ xvpermi.d xr18, xr17, 0x08
+ xvpermi.d xr19, xr17, 0x0d
+ vst vr18, t3, 0
+ vstx vr19, t3, a1
+ addi.d t3, t3, 16
+
+ xvhaddw.w.h xr6, xr6, xr6
+ xvhaddw.w.h xr7, xr7, xr7
+ xvadd.h xr6, xr7, xr6
+ xvpickev.h xr7, xr6, xr6
+ xvpermi.d xr8, xr7, 0x08
+ vsub.h vr9, vr8, vr21
+ vssrarni.bu.h vr9, vr9, 2
+ vstelm.d vr9, t4, 0, 0
+ addi.d t4, t4, 8
+ bne t7, zero, .W32_420_LASX
+
+ alsl.d a2, a4, a2, 2
+ alsl.d a3, a4, a3, 2
+ alsl.d a0, a1, a0, 1
+ srai.w t8, a4, 1
+ add.d a6, a6, t8
+ addi.w a5, a5, -2
+ blt zero, a5, .LOOP_W32_420_LASX
+
+.END_W420_LASX:
+endfunc
+
+#undef bpc_sh
+#undef bpcw_sh
+
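+// Reduction helpers: vhaddw.d.h collapses each group of four int16 lanes of
+// \in0 into an int64 sum, vhaddw.q.w collapses four int32 lanes into one
+// int128 sum; both are used to finish vdp2 dot products.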
+.macro vhaddw.d.h in0
+ vhaddw.w.h \in0, \in0, \in0
+ vhaddw.d.w \in0, \in0, \in0
+.endm
+.macro vhaddw.q.w in0
+ vhaddw.d.w \in0, \in0, \in0
+ vhaddw.q.d \in0, \in0, \in0
+.endm
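+// PUT_H_8W: 8-tap horizontal filter of one 8-pixel row held in \in0. The row
+// is shifted to form the eight tap windows, vdp2'd against the filter in vr8,
+// reduced and repacked to eight int16 sums, then biased by vr9 (34) ahead of
+// the caller's final >>6 narrowing. Clobbers vr2..vr7, vr10.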
+.macro PUT_H_8W in0
+ vbsrl.v vr2, \in0, 1
+ vbsrl.v vr3, \in0, 2
+ vbsrl.v vr4, \in0, 3
+ vbsrl.v vr5, \in0, 4
+ vbsrl.v vr6, \in0, 5
+ vbsrl.v vr7, \in0, 6
+ vbsrl.v vr10, \in0, 7
+ vilvl.d vr2, vr2, \in0
+ vilvl.d vr3, vr4, vr3
+ vilvl.d vr4, vr6, vr5
+ vilvl.d vr5, vr10, vr7
+ vdp2.h.bu.b \in0, vr2, vr8
+ vdp2.h.bu.b vr2, vr3, vr8
+ vdp2.h.bu.b vr3, vr4, vr8
+ vdp2.h.bu.b vr4, vr5, vr8
+ vhaddw.d.h \in0
+ vhaddw.d.h vr2
+ vhaddw.d.h vr3
+ vhaddw.d.h vr4
+ vpickev.w \in0, vr2, \in0
+ vpickev.w vr2, vr4, vr3
+ vpickev.h \in0, vr2, \in0
+ vadd.h \in0, \in0, vr9
+.endm
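+// FILTER_8TAP_4W: horizontal 8-tap for four pixels; leaves the four 32-bit
+// sums packed in \in0. Clobbers vr7, vr10..vr12.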
+.macro FILTER_8TAP_4W in0
+ vbsrl.v vr10, \in0, 1
+ vbsrl.v vr11, \in0, 2
+ vbsrl.v vr12, \in0, 3
+ vilvl.d vr10, vr10, \in0
+ vilvl.d vr11, vr12, vr11
+ vdp2.h.bu.b vr7, vr10, vr8
+ vdp2.h.bu.b vr10, vr11, vr8
+ vhaddw.d.h vr7
+ vhaddw.d.h vr10
+ vpickev.w \in0, vr10, vr7
+.endm
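+// FILTER_8TAP_8W: horizontal 8-tap for eight pixels; results are narrowed to
+// int16 and pre-rounded by >>2 (the intermediate precision consumed by the
+// vertical pass). Clobbers vr10..vr17.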
+.macro FILTER_8TAP_8W in0
+ vbsrl.v vr10, \in0, 1
+ vbsrl.v vr11, \in0, 2
+ vbsrl.v vr12, \in0, 3
+ vbsrl.v vr13, \in0, 4
+ vbsrl.v vr14, \in0, 5
+ vbsrl.v vr15, \in0, 6
+ vbsrl.v vr16, \in0, 7
+ vilvl.d vr10, vr10, \in0
+ vilvl.d vr11, vr12, vr11
+ vilvl.d vr12, vr14, vr13
+ vilvl.d vr13, vr16, vr15
+ vdp2.h.bu.b vr14, vr10, vr8
+ vdp2.h.bu.b vr15, vr11, vr8
+ vdp2.h.bu.b vr16, vr12, vr8
+ vdp2.h.bu.b vr17, vr13, vr8
+ vhaddw.d.h vr14
+ vhaddw.d.h vr15
+ vhaddw.d.h vr16
+ vhaddw.d.h vr17
+ vpickev.w vr13, vr15, vr14
+ vpickev.w vr14, vr17, vr16
+ vpickev.h \in0, vr14, vr13 //x0 ... x7
+ vsrari.h \in0, \in0, 2
+.endm
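+// FILTER_8TAP_8W_CLIP_STORE: vertical 8-tap across the eight column pipelines
+// vr0..vr7 with the filter in vr9; rounds by >>10, saturates to 8-bit and
+// stores one 8-pixel row at a0, then advances a0 by the stride.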
+.macro FILTER_8TAP_8W_CLIP_STORE
+ vdp2.w.h vr12, vr0, vr9
+ vdp2.w.h vr13, vr1, vr9
+ vdp2.w.h vr14, vr2, vr9
+ vdp2.w.h vr15, vr3, vr9
+ vdp2.w.h vr16, vr4, vr9
+ vdp2.w.h vr17, vr5, vr9
+ vdp2.w.h vr18, vr6, vr9
+ vdp2.w.h vr19, vr7, vr9
+ vhaddw.q.w vr12
+ vhaddw.q.w vr13
+ vhaddw.q.w vr14
+ vhaddw.q.w vr15
+ vhaddw.q.w vr16
+ vhaddw.q.w vr17
+ vhaddw.q.w vr18
+ vhaddw.q.w vr19
+ vpackev.w vr12, vr13, vr12
+ vpackev.w vr13, vr15, vr14
+ vpackev.d vr12, vr13, vr12
+ vpackev.w vr14, vr17, vr16
+ vpackev.w vr15, vr19, vr18
+ vpackev.d vr13, vr15, vr14
+ vssrarni.hu.w vr13, vr12, 10
+ vssrani.bu.h vr13, vr13, 0
+ vstelm.d vr13, a0, 0, 0
+ add.d a0, a0, a1
+.endm
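+// VEXTRINS_Hx8 pushes the newest horizontal results (\in0 lanes 0..7) into
+// slot 7 of the column pipelines vr0..vr7; VBSRL_Vx8 then drops the oldest
+// sample by shifting every pipeline down one int16.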
+.macro VEXTRINS_Hx8 in0
+ vextrins.h vr0, \in0, 0x70
+ vextrins.h vr1, \in0, 0x71
+ vextrins.h vr2, \in0, 0x72
+ vextrins.h vr3, \in0, 0x73
+ vextrins.h vr4, \in0, 0x74
+ vextrins.h vr5, \in0, 0x75
+ vextrins.h vr6, \in0, 0x76
+ vextrins.h vr7, \in0, 0x77
+.endm
+.macro VBSRL_Vx8
+ vbsrl.v vr0, vr0, 2
+ vbsrl.v vr1, vr1, 2
+ vbsrl.v vr2, vr2, 2
+ vbsrl.v vr3, vr3, 2
+ vbsrl.v vr4, vr4, 2
+ vbsrl.v vr5, vr5, 2
+ vbsrl.v vr6, vr6, 2
+ vbsrl.v vr7, vr7, 2
+.endm
+
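+// put_8tap (LSX): a0 = dst, a1 = dst_stride, a2 = src, a3 = src_stride,
+// a4 = w, a5 = h, a6 = mx, a7 = my. Each wrapper below stores its
+// filter-type selector at sp[0] before expanding this macro.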
+.macro PUT_8TAP_8BPC_LSX lable
+ li.w t0, 4
+ la.local t6, dav1d_mc_subpel_filters
+ slli.d t2, a3, 1 //src_stride*2
+ add.d t3, t2, a3 //src_stride*3
+ slli.d t4, t2, 1 //src_stride*4
+
+ bnez a6, .l_\lable\()put_h //mx
+ bnez a7, .l_\lable\()put_v //my
+
+ clz.w t1, a4
+ li.w t5, 24
+ sub.w t1, t1, t5
+ la.local t5, .l_\lable\()put_hv0_jtable
+ alsl.d t1, t1, t5, 3
+ ld.d t6, t1, 0
+ add.d t5, t5, t6
+ jirl $r0, t5, 0
+
+ .align 3
+.l_\lable\()put_hv0_jtable:
+ .dword .l_\lable\()put_hv0_128w - .l_\lable\()put_hv0_jtable
+ .dword .l_\lable\()put_hv0_64w - .l_\lable\()put_hv0_jtable
+ .dword .l_\lable\()put_hv0_32w - .l_\lable\()put_hv0_jtable
+ .dword .l_\lable\()put_hv0_16w - .l_\lable\()put_hv0_jtable
+ .dword .l_\lable\()put_hv0_8w - .l_\lable\()put_hv0_jtable
+ .dword .l_\lable\()put_hv0_4w - .l_\lable\()put_hv0_jtable
+ .dword .l_\lable\()put_hv0_2w - .l_\lable\()put_hv0_jtable
+
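+// mx == 0 && my == 0: no filtering, copy two rows per iteration.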
+.l_\lable\()put_hv0_2w:
+ vldrepl.h vr0, a2, 0
+ add.d a2, a2, a3
+ vldrepl.h vr1, a2, 0
+ vstelm.h vr0, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.h vr1, a0, 0, 0
+ add.d a2, a2, a3
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_hv0_2w
+ b .l_\lable\()end_put_8tap
+.l_\lable\()put_hv0_4w:
+ fld.s f0, a2, 0
+ fldx.s f1, a2, a3
+ fst.s f0, a0, 0
+ fstx.s f1, a0, a1
+ alsl.d a2, a3, a2, 1
+ alsl.d a0, a1, a0, 1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_hv0_4w
+ b .l_\lable\()end_put_8tap
+.l_\lable\()put_hv0_8w:
+ fld.d f0, a2, 0
+ fldx.d f1, a2, a3
+ fst.d f0, a0, 0
+ fstx.d f1, a0, a1
+ alsl.d a2, a3, a2, 1
+ alsl.d a0, a1, a0, 1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_hv0_8w
+ b .l_\lable\()end_put_8tap
+.l_\lable\()put_hv0_16w:
+ vld vr0, a2, 0
+ vldx vr1, a2, a3
+ vst vr0, a0, 0
+ vstx vr1, a0, a1
+ alsl.d a2, a3, a2, 1
+ alsl.d a0, a1, a0, 1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_hv0_16w
+ b .l_\lable\()end_put_8tap
+.l_\lable\()put_hv0_32w:
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+ add.d a2, a2, a3
+ vld vr2, a2, 0
+ vld vr3, a2, 16
+ vst vr0, a0, 0
+ vst vr1, a0, 16
+ add.d a0, a0, a1
+ vst vr2, a0, 0
+ vst vr3, a0, 16
+ add.d a2, a2, a3
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_hv0_32w
+ b .l_\lable\()end_put_8tap
+.l_\lable\()put_hv0_64w:
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+ vld vr2, a2, 32
+ vld vr3, a2, 48
+ add.d a2, a2, a3
+ vld vr4, a2, 0
+ vld vr5, a2, 16
+ vld vr6, a2, 32
+ vld vr7, a2, 48
+ add.d a2, a2, a3
+ vst vr0, a0, 0
+ vst vr1, a0, 16
+ vst vr2, a0, 32
+ vst vr3, a0, 48
+ add.d a0, a0, a1
+ vst vr4, a0, 0
+ vst vr5, a0, 16
+ vst vr6, a0, 32
+ vst vr7, a0, 48
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_hv0_64w
+ b .l_\lable\()end_put_8tap
+.l_\lable\()put_hv0_128w:
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+ vld vr2, a2, 32
+ vld vr3, a2, 48
+ vld vr4, a2, 64
+ vld vr5, a2, 80
+ vld vr6, a2, 96
+ vld vr7, a2, 112
+ add.d a2, a2, a3
+ vld vr8, a2, 0
+ vld vr9, a2, 16
+ vld vr10, a2, 32
+ vld vr11, a2, 48
+ vld vr12, a2, 64
+ vld vr13, a2, 80
+ vld vr14, a2, 96
+ vld vr15, a2, 112
+ add.d a2, a2, a3
+ vst vr0, a0, 0
+ vst vr1, a0, 16
+ vst vr2, a0, 32
+ vst vr3, a0, 48
+ vst vr4, a0, 64
+ vst vr5, a0, 80
+ vst vr6, a0, 96
+ vst vr7, a0, 112
+ add.d a0, a0, a1
+ vst vr8, a0, 0
+ vst vr9, a0, 16
+ vst vr10, a0, 32
+ vst vr11, a0, 48
+ vst vr12, a0, 64
+ vst vr13, a0, 80
+ vst vr14, a0, 96
+ vst vr15, a0, 112
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_hv0_128w
+ b .l_\lable\()end_put_8tap
+
+.l_\lable\()put_h:
+ bnez a7, .l_\lable\()put_hv //if (fh && fv)
+ ld.d t5, sp, 0 //filter_type
+ andi t1, t5, 3
+ blt t0, a4, .l_\lable\()put_h_idx_fh
+ andi t1, t5, 1
+ addi.w t1, t1, 3
+
+.l_\lable\()put_h_idx_fh:
+ addi.w t5, zero, 120
+ mul.w t1, t1, t5
+ addi.w t5, a6, -1
+ slli.w t5, t5, 3
+ add.w t1, t1, t5
+ add.d t1, t6, t1 //fh's offset
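+ // vr8 = 8-tap horizontal filter; a2 -= 3 centers the taps, and vr9 = 34 is
+ // the combined rounding bias for the final >>6 narrowing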
+ vldrepl.d vr8, t1, 0
+ addi.d a2, a2, -3
+ li.w t1, 34
+ vreplgr2vr.h vr9, t1
+
+ clz.w t1, a4
+ li.w t5, 24
+ sub.w t1, t1, t5
+ la.local t5, .l_\lable\()put_h_jtable
+ alsl.d t1, t1, t5, 3
+ ld.d t6, t1, 0
+ add.d t5, t5, t6
+ jirl $r0, t5, 0
+
+ .align 3
+.l_\lable\()put_h_jtable:
+ .dword .l_\lable\()put_h_128w - .l_\lable\()put_h_jtable
+ .dword .l_\lable\()put_h_64w - .l_\lable\()put_h_jtable
+ .dword .l_\lable\()put_h_32w - .l_\lable\()put_h_jtable
+ .dword .l_\lable\()put_h_16w - .l_\lable\()put_h_jtable
+ .dword .l_\lable\()put_h_8w - .l_\lable\()put_h_jtable
+ .dword .l_\lable\()put_h_4w - .l_\lable\()put_h_jtable
+ .dword .l_\lable\()put_h_2w - .l_\lable\()put_h_jtable
+
+.l_\lable\()put_h_2w:
+ vld vr0, a2, 0
+ vldx vr1, a2, a3
+ add.d a2, a2, t2
+
+ vbsrl.v vr2, vr0, 1
+ vilvl.d vr0, vr2, vr0
+ vdp2.h.bu.b vr2, vr0, vr8
+ vhaddw.w.h vr0, vr2, vr2
+ vhaddw.d.w vr0, vr0, vr0
+ vbsrl.v vr2, vr1, 1
+ vilvl.d vr1, vr2, vr1
+ vdp2.h.bu.b vr2, vr1, vr8
+ vhaddw.w.h vr1, vr2, vr2
+ vhaddw.d.w vr1, vr1, vr1
+ vpickev.w vr0, vr1, vr0
+ vpickev.h vr0, vr0, vr0
+ vadd.h vr0, vr0, vr9
+ vssrani.bu.h vr0, vr0, 6
+
+ vstelm.h vr0, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.h vr0, a0, 0, 1
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_h_2w
+ b .l_\lable\()end_put_8tap
+
+.l_\lable\()put_h_4w:
+ vld vr0, a2, 0
+ vldx vr1, a2, a3
+ add.d a2, a2, t2
+
+ vbsrl.v vr2, vr0, 1
+ vbsrl.v vr3, vr0, 2
+ vbsrl.v vr4, vr0, 3
+ vilvl.d vr0, vr2, vr0 //x0 x1
+ vilvl.d vr2, vr4, vr3 //x2 x3
+ vdp2.h.bu.b vr3, vr0, vr8
+ vdp2.h.bu.b vr4, vr2, vr8
+ vhaddw.w.h vr0, vr3, vr3
+ vhaddw.d.w vr0, vr0, vr0
+ vhaddw.w.h vr2, vr4, vr4
+ vhaddw.d.w vr2, vr2, vr2
+ vpickev.w vr5, vr2, vr0
+ vbsrl.v vr2, vr1, 1
+ vbsrl.v vr3, vr1, 2
+ vbsrl.v vr4, vr1, 3
+ vilvl.d vr0, vr2, vr1 //x0 x1
+ vilvl.d vr2, vr4, vr3 //x2 x3
+ vdp2.h.bu.b vr3, vr0, vr8
+ vdp2.h.bu.b vr4, vr2, vr8
+ vhaddw.w.h vr0, vr3, vr3
+ vhaddw.d.w vr0, vr0, vr0
+ vhaddw.w.h vr2, vr4, vr4
+ vhaddw.d.w vr2, vr2, vr2
+ vpickev.w vr6, vr2, vr0
+ vpickev.h vr0, vr6, vr5
+ vadd.h vr0, vr0, vr9
+ vssrani.bu.h vr0, vr0, 6
+
+ vstelm.w vr0, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.w vr0, a0, 0, 1
+ add.d a0, a0, a1
+ addi.d a5, a5, -2
+ bnez a5, .l_\lable\()put_h_4w
+ b .l_\lable\()end_put_8tap
+
+.l_\lable\()put_h_8w:
+ vld vr0, a2, 0
+ vldx vr1, a2, a3
+ add.d a2, a2, t2
+ PUT_H_8W vr0
+ PUT_H_8W vr1
+ vssrani.bu.h vr1, vr0, 6
+ vstelm.d vr1, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.d vr1, a0, 0, 1
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_h_8w
+ b .l_\lable\()end_put_8tap
+
+.l_\lable\()put_h_16w:
+.l_\lable\()put_h_32w:
+.l_\lable\()put_h_64w:
+.l_\lable\()put_h_128w:
+ addi.d t0, a2, 0 //src
+ addi.w t5, a5, 0 //h
+ addi.d t8, a0, 0 //dst
+.l_\lable\()put_h_16w_loop:
+ vld vr0, a2, 0
+ vldx vr1, a2, a3
+ add.d a2, a2, t2
+ PUT_H_8W vr0
+ PUT_H_8W vr1
+ vssrani.bu.h vr1, vr0, 6
+ vstelm.d vr1, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.d vr1, a0, 0, 1
+ add.d a0, a0, a1
+ addi.d a5, a5, -2
+ bnez a5, .l_\lable\()put_h_16w_loop
+ addi.d a2, t0, 8
+ addi.d t0, t0, 8
+ addi.d a0, t8, 8
+ addi.d t8, t8, 8
+ addi.w a5, t5, 0
+ addi.w a4, a4, -8
+ bnez a4, .l_\lable\()put_h_16w_loop
+ b .l_\lable\()end_put_8tap
+
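+// my != 0, mx == 0: vertical-only filtering; each column keeps its eight-row
+// byte history packed in a register so one vdp2.h.bu.b produces the output.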
+.l_\lable\()put_v:
+ ld.d t1, sp, 0 //filter_type
+ srli.w t1, t1, 2
+ blt t0, a5, .l_\lable\()put_v_idx_fv
+ andi t1, t1, 1
+ addi.w t1, t1, 3
+
+.l_\lable\()put_v_idx_fv:
+ addi.w t5, zero, 120
+ mul.w t1, t1, t5
+ addi.w t5, a7, -1
+ slli.w t5, t5, 3
+ add.w t1, t1, t5
+ add.d t1, t6, t1 //fv's offset
+ vldrepl.d vr8, t1, 0
+ sub.d a2, a2, t3
+
+ clz.w t1, a4
+ li.w t5, 24
+ sub.w t1, t1, t5
+ la.local t5, .l_\lable\()put_v_jtable
+ alsl.d t1, t1, t5, 3
+ ld.d t6, t1, 0
+ add.d t5, t5, t6
+ jirl $r0, t5, 0
+
+ .align 3
+.l_\lable\()put_v_jtable:
+ .dword .l_\lable\()put_v_128w - .l_\lable\()put_v_jtable
+ .dword .l_\lable\()put_v_64w - .l_\lable\()put_v_jtable
+ .dword .l_\lable\()put_v_32w - .l_\lable\()put_v_jtable
+ .dword .l_\lable\()put_v_16w - .l_\lable\()put_v_jtable
+ .dword .l_\lable\()put_v_8w - .l_\lable\()put_v_jtable
+ .dword .l_\lable\()put_v_4w - .l_\lable\()put_v_jtable
+ .dword .l_\lable\()put_v_2w - .l_\lable\()put_v_jtable
+
+.l_\lable\()put_v_2w:
+ fld.s f0, a2, 0
+ fldx.s f1, a2, a3
+ fldx.s f2, a2, t2
+ add.d a2, a2, t3
+ fld.s f3, a2, 0
+ fldx.s f4, a2, a3
+ fldx.s f5, a2, t2
+ fldx.s f6, a2, t3
+ add.d a2, a2, t4
+ vilvl.b vr0, vr1, vr0
+ vilvl.b vr1, vr3, vr2
+ vilvl.b vr2, vr5, vr4
+ vilvl.b vr3, vr7, vr6
+ vilvl.h vr0, vr1, vr0
+ vilvl.h vr1, vr3, vr2
+ vilvl.w vr0, vr1, vr0
+
+.l_\lable\()put_v_2w_loop:
+ fld.s f7, a2, 0 //h0
+ fldx.s f10, a2, a3 //h1
+ add.d a2, a2, t2
+
+ vextrins.b vr0, vr7, 0x70
+ vextrins.b vr0, vr7, 0xf1
+ vbsrl.v vr1, vr0, 1
+ vextrins.b vr1, vr10, 0x70
+ vextrins.b vr1, vr10, 0xf1
+ vdp2.h.bu.b vr10, vr0, vr8
+ vdp2.h.bu.b vr11, vr1, vr8
+ vbsrl.v vr0, vr1, 1
+ vhaddw.d.h vr10
+ vhaddw.d.h vr11
+ vpickev.w vr10, vr11, vr10
+ vssrarni.hu.w vr10, vr10, 6
+ vssrani.bu.h vr10, vr10, 0
+
+ vstelm.h vr10, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.h vr10, a0, 0, 1
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_v_2w_loop
+ b .l_\lable\()end_put_8tap
+
+.l_\lable\()put_v_4w:
+ fld.s f0, a2, 0
+ fldx.s f1, a2, a3
+ fldx.s f2, a2, t2
+ add.d a2, a2, t3
+ fld.s f3, a2, 0
+ fldx.s f4, a2, a3
+ fldx.s f5, a2, t2
+ fldx.s f6, a2, t3
+ add.d a2, a2, t4
+
+ vilvl.b vr0, vr1, vr0
+ vilvl.b vr1, vr3, vr2
+ vilvl.b vr2, vr5, vr4
+ vilvl.b vr3, vr7, vr6
+ vilvl.h vr0, vr1, vr0
+ vilvl.h vr1, vr3, vr2
+ vilvl.w vr2, vr1, vr0
+ vilvh.w vr3, vr1, vr0
+
+.l_\lable\()put_v_4w_loop:
+ fld.s f7, a2, 0
+ fldx.s f10, a2, a3
+ add.d a2, a2, t2
+
+ vextrins.b vr2, vr7, 0x70
+ vextrins.b vr2, vr7, 0xf1 //x0x1(h0)
+ vbsrl.v vr4, vr2, 1
+ vextrins.b vr4, vr10, 0x70
+ vextrins.b vr4, vr10, 0xf1 //x0x1(h1)
+ vdp2.h.bu.b vr11, vr2, vr8
+ vdp2.h.bu.b vr12, vr4, vr8
+ vbsrl.v vr2, vr4, 1
+
+ vextrins.b vr3, vr7, 0x72
+ vextrins.b vr3, vr7, 0xf3 //x2x3(h0)
+ vbsrl.v vr4, vr3, 1
+ vextrins.b vr4, vr10, 0x72
+ vextrins.b vr4, vr10, 0xf3 //x2x3(h1)
+ vdp2.h.bu.b vr13, vr3, vr8
+ vdp2.h.bu.b vr14, vr4, vr8
+ vbsrl.v vr3, vr4, 1
+
+ vhaddw.d.h vr11
+ vhaddw.d.h vr12
+ vhaddw.d.h vr13
+ vhaddw.d.h vr14
+
+ vpickev.w vr11, vr13, vr11
+ vpickev.w vr12, vr14, vr12
+ vpickev.h vr11, vr12, vr11
+ vssrarni.bu.h vr11, vr11, 6
+ vstelm.w vr11, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.w vr11, a0, 0, 1
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_v_4w_loop
+ b .l_\lable\()end_put_8tap
+
+.l_\lable\()put_v_8w:
+.l_\lable\()put_v_16w:
+.l_\lable\()put_v_32w:
+.l_\lable\()put_v_64w:
+.l_\lable\()put_v_128w:
+ addi.d t0, a2, 0 //src
+ addi.d t5, a5, 0 //h
+ addi.d t8, a0, 0 //dst
+.l_\lable\()put_v_8w_loop0:
+ fld.d f0, a2, 0
+ fldx.d f1, a2, a3
+ fldx.d f2, a2, t2
+ add.d a2, a2, t3
+ fld.d f3, a2, 0
+ fldx.d f4, a2, a3
+ fldx.d f5, a2, t2
+ fldx.d f6, a2, t3
+ add.d a2, a2, t4
+
+ vilvl.b vr0, vr1, vr0
+ vilvl.b vr1, vr3, vr2
+ vilvl.b vr2, vr5, vr4
+ vilvl.b vr3, vr7, vr6
+ vilvl.h vr4, vr1, vr0
+ vilvh.h vr5, vr1, vr0
+ vilvl.h vr6, vr3, vr2
+ vilvh.h vr7, vr3, vr2
+ vilvl.w vr0, vr6, vr4 // x0x1
+ vilvh.w vr1, vr6, vr4 // x2x3
+ vilvl.w vr2, vr7, vr5 // x4x5
+ vilvh.w vr3, vr7, vr5 // x6x7
+.l_\lable\()put_v_8w_loop:
+ fld.d f7, a2, 0
+ fldx.d f10, a2, a3
+ add.d a2, a2, t2
+ //h0
+ vextrins.b vr0, vr7, 0x70
+ vextrins.b vr0, vr7, 0xf1
+ vextrins.b vr1, vr7, 0x72
+ vextrins.b vr1, vr7, 0xf3
+ vextrins.b vr2, vr7, 0x74
+ vextrins.b vr2, vr7, 0xf5
+ vextrins.b vr3, vr7, 0x76
+ vextrins.b vr3, vr7, 0xf7
+ vdp2.h.bu.b vr11, vr0, vr8
+ vdp2.h.bu.b vr12, vr1, vr8
+ vdp2.h.bu.b vr13, vr2, vr8
+ vdp2.h.bu.b vr14, vr3, vr8
+ vhaddw.d.h vr11
+ vhaddw.d.h vr12
+ vhaddw.d.h vr13
+ vhaddw.d.h vr14
+ vpickev.w vr11, vr12, vr11
+ vpickev.w vr12, vr14, vr13
+ vpickev.h vr11, vr12, vr11
+ vssrarni.bu.h vr11, vr11, 6
+ fst.d f11, a0, 0
+ add.d a0, a0, a1
+ //h1
+ vbsrl.v vr0, vr0, 1
+ vbsrl.v vr1, vr1, 1
+ vbsrl.v vr2, vr2, 1
+ vbsrl.v vr3, vr3, 1
+ vextrins.b vr0, vr10, 0x70
+ vextrins.b vr0, vr10, 0xf1
+ vextrins.b vr1, vr10, 0x72
+ vextrins.b vr1, vr10, 0xf3
+ vextrins.b vr2, vr10, 0x74
+ vextrins.b vr2, vr10, 0xf5
+ vextrins.b vr3, vr10, 0x76
+ vextrins.b vr3, vr10, 0xf7
+ vdp2.h.bu.b vr11, vr0, vr8
+ vdp2.h.bu.b vr12, vr1, vr8
+ vdp2.h.bu.b vr13, vr2, vr8
+ vdp2.h.bu.b vr14, vr3, vr8
+ vhaddw.d.h vr11
+ vhaddw.d.h vr12
+ vhaddw.d.h vr13
+ vhaddw.d.h vr14
+ vpickev.w vr11, vr12, vr11
+ vpickev.w vr12, vr14, vr13
+ vpickev.h vr11, vr12, vr11
+ vssrarni.bu.h vr11, vr11, 6
+ fst.d f11, a0, 0
+ add.d a0, a0, a1
+ vbsrl.v vr0, vr0, 1
+ vbsrl.v vr1, vr1, 1
+ vbsrl.v vr2, vr2, 1
+ vbsrl.v vr3, vr3, 1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_v_8w_loop
+ addi.d a2, t0, 8
+ addi.d t0, t0, 8
+ addi.d a0, t8, 8
+ addi.d t8, t8, 8
+ addi.d a5, t5, 0
+ addi.w a4, a4, -8
+ bnez a4, .l_\lable\()put_v_8w_loop0
+ b .l_\lable\()end_put_8tap
+
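+// mx != 0 && my != 0: separable two-pass filtering. The horizontal pass keeps
+// >>2 int16 intermediates; the vertical pass rounds by >>10 and clips to 8-bit.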
+.l_\lable\()put_hv:
+ ld.d t5, sp, 0 //filter_type
+ andi t1, t5, 3
+ blt t0, a4, .l_\lable\()put_hv_idx_fh
+ andi t1, t5, 1
+ addi.w t1, t1, 3
+.l_\lable\()put_hv_idx_fh:
+ addi.w t5, zero, 120
+ mul.w t1, t1, t5
+ addi.w t5, a6, -1
+ slli.w t5, t5, 3
+ add.w t1, t1, t5
+ add.d t1, t6, t1 //fh's offset
+ vldrepl.d vr8, t1, 0
+ ld.d t1, sp, 0 //filter_type
+ srli.w t1, t1, 2
+ blt t0, a5, .l_\lable\()put_hv_idx_fv
+ andi t1, t1, 1
+ addi.w t1, t1, 3
+.l_\lable\()put_hv_idx_fv:
+ addi.w t5, zero, 120
+ mul.w t1, t1, t5
+ addi.w t5, a7, -1
+ slli.w t5, t5, 3
+ add.w t1, t1, t5
+ add.d t1, t6, t1 //fv's offset
+ vldrepl.d vr9, t1, 0
+ vexth.h.b vr9, vr9
+
+ sub.d a2, a2, t3
+ addi.d a2, a2, -3
+
+ clz.w t1, a4
+ li.w t5, 24
+ sub.w t1, t1, t5
+ la.local t5, .l_\lable\()put_hv_jtable
+ alsl.d t1, t1, t5, 3
+ ld.d t6, t1, 0
+ add.d t5, t5, t6
+ jirl $r0, t5, 0
+
+ .align 3
+.l_\lable\()put_hv_jtable:
+ .dword .l_\lable\()put_hv_128w - .l_\lable\()put_hv_jtable
+ .dword .l_\lable\()put_hv_64w - .l_\lable\()put_hv_jtable
+ .dword .l_\lable\()put_hv_32w - .l_\lable\()put_hv_jtable
+ .dword .l_\lable\()put_hv_16w - .l_\lable\()put_hv_jtable
+ .dword .l_\lable\()put_hv_8w - .l_\lable\()put_hv_jtable
+ .dword .l_\lable\()put_hv_4w - .l_\lable\()put_hv_jtable
+ .dword .l_\lable\()put_hv_2w - .l_\lable\()put_hv_jtable
+
+.l_\lable\()put_hv_2w:
+ vld vr0, a2, 0
+ vldx vr1, a2, a3
+ vldx vr2, a2, t2
+ add.d a2, a2, t3
+ vld vr3, a2, 0
+ vldx vr4, a2, a3
+ vldx vr5, a2, t2
+ vldx vr6, a2, t3
+ add.d a2, a2, t4
+
+ vbsrl.v vr10, vr0, 1
+ vbsrl.v vr11, vr1, 1
+ vbsrl.v vr12, vr2, 1
+ vbsrl.v vr13, vr3, 1
+ vbsrl.v vr14, vr4, 1
+ vbsrl.v vr15, vr5, 1
+ vbsrl.v vr16, vr6, 1
+ vilvl.d vr0, vr10, vr0
+ vilvl.d vr1, vr11, vr1
+ vilvl.d vr2, vr12, vr2
+ vilvl.d vr3, vr13, vr3
+ vilvl.d vr4, vr14, vr4
+ vilvl.d vr5, vr15, vr5
+ vilvl.d vr6, vr16, vr6
+ vdp2.h.bu.b vr10, vr0, vr8
+ vdp2.h.bu.b vr11, vr1, vr8
+ vdp2.h.bu.b vr12, vr2, vr8
+ vdp2.h.bu.b vr13, vr3, vr8
+ vdp2.h.bu.b vr14, vr4, vr8
+ vdp2.h.bu.b vr15, vr5, vr8
+ vdp2.h.bu.b vr16, vr6, vr8
+ vhaddw.d.h vr10
+ vhaddw.d.h vr11
+ vhaddw.d.h vr12
+ vhaddw.d.h vr13
+ vhaddw.d.h vr14
+ vhaddw.d.h vr15
+ vhaddw.d.h vr16
+
+ vpackev.w vr10, vr11, vr10
+ vpackev.w vr12, vr13, vr12
+ vpackod.d vr11, vr12, vr10
+ vpackev.d vr10, vr12, vr10
+
+ vpackev.w vr12, vr15, vr14
+ vpackev.w vr16, vr17, vr16
+ vpackod.d vr13, vr16, vr12
+ vpackev.d vr12, vr16, vr12
+
+ vpickev.h vr10, vr12, vr10 //0 1 2 3 4 5 6 * (h0)
+ vpickev.h vr11, vr13, vr11 //8 9 10 11 12 13 14 * (h1)
+ vsrari.h vr10, vr10, 2
+ vsrari.h vr11, vr11, 2
+.l_\lable\()put_hv_2w_loop:
+ vld vr7, a2, 0
+ vldx vr12, a2, a3
+ add.d a2, a2, t2
+
+ vbsrl.v vr1, vr7, 1
+ vbsrl.v vr2, vr12, 1
+ vilvl.d vr0, vr1, vr7
+ vilvl.d vr1, vr2, vr12
+ vdp2.h.bu.b vr2, vr0, vr8
+ vdp2.h.bu.b vr3, vr1, vr8
+ vhaddw.d.h vr2
+ vhaddw.d.h vr3
+ vpickev.w vr2, vr3, vr2
+ vpickev.h vr2, vr2, vr2
+ vsrari.h vr2, vr2, 2
+ vextrins.h vr10, vr2, 0x70 //0 1 2 3 4 5 6 7
+ vextrins.h vr11, vr2, 0x71
+ vbsrl.v vr12, vr10, 2
+ vbsrl.v vr13, vr11, 2
+ vextrins.h vr12, vr2, 0x72 //1 2 3 4 5 6 7 8
+ vextrins.h vr13, vr2, 0x73
+ vdp2.w.h vr0, vr10, vr9
+ vdp2.w.h vr1, vr11, vr9
+ vdp2.w.h vr2, vr12, vr9
+ vdp2.w.h vr3, vr13, vr9
+ vhaddw.q.w vr0
+ vhaddw.q.w vr1
+ vhaddw.q.w vr2
+ vhaddw.q.w vr3
+ vpackev.w vr0, vr1, vr0
+ vpackev.w vr1, vr3, vr2
+ vpackev.d vr0, vr1, vr0
+ vssrarni.hu.w vr0, vr0, 10
+ vssrani.bu.h vr0, vr0, 0
+ vbsrl.v vr10, vr12, 2
+ vbsrl.v vr11, vr13, 2
+ vstelm.h vr0, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.h vr0, a0, 0, 1
+ add.d a0, a0, a1
+ addi.d a5, a5, -2
+ bnez a5, .l_\lable\()put_hv_2w_loop
+ b .l_\lable\()end_put_8tap
+
+.l_\lable\()put_hv_4w:
+ vld vr0, a2, 0
+ vldx vr1, a2, a3
+ vldx vr2, a2, t2
+ add.d a2, a2, t3
+ vld vr3, a2, 0
+ vldx vr4, a2, a3
+ vldx vr5, a2, t2
+ vldx vr6, a2, t3
+ add.d a2, a2, t4
+ FILTER_8TAP_4W vr0 //x0 x1 x2 x3
+ FILTER_8TAP_4W vr1
+ FILTER_8TAP_4W vr2
+ FILTER_8TAP_4W vr3
+ FILTER_8TAP_4W vr4
+ FILTER_8TAP_4W vr5
+ FILTER_8TAP_4W vr6
+ vpackev.h vr0, vr1, vr0
+ vpackev.h vr1, vr3, vr2
+ vpackev.h vr2, vr5, vr4
+ vpackev.h vr3, vr7, vr6
+ vilvl.w vr4, vr1, vr0
+ vilvh.w vr5, vr1, vr0
+ vilvl.w vr6, vr3, vr2
+ vilvh.w vr7, vr3, vr2
+ vilvl.d vr0, vr6, vr4 //0 1 2 3 4 5 6 *
+ vilvh.d vr1, vr6, vr4
+ vilvl.d vr2, vr7, vr5
+ vilvh.d vr3, vr7, vr5
+ vsrari.h vr0, vr0, 2
+ vsrari.h vr1, vr1, 2
+ vsrari.h vr2, vr2, 2
+ vsrari.h vr3, vr3, 2
+.l_\lable\()put_hv_4w_loop:
+ vld vr4, a2, 0
+ vldx vr5, a2, a3
+ add.d a2, a2, t2
+ FILTER_8TAP_4W vr4
+ FILTER_8TAP_4W vr5
+ vpickev.h vr4, vr5, vr4
+ vsrari.h vr4, vr4, 2
+ vextrins.h vr0, vr4, 0x70
+ vextrins.h vr1, vr4, 0x71
+ vextrins.h vr2, vr4, 0x72
+ vextrins.h vr3, vr4, 0x73
+ vbsrl.v vr5, vr0, 2
+ vbsrl.v vr6, vr1, 2
+ vbsrl.v vr7, vr2, 2
+ vbsrl.v vr10, vr3, 2
+ vextrins.h vr5, vr4, 0x74
+ vextrins.h vr6, vr4, 0x75
+ vextrins.h vr7, vr4, 0x76
+ vextrins.h vr10, vr4, 0x77
+ vdp2.w.h vr11, vr0, vr9
+ vdp2.w.h vr12, vr1, vr9
+ vdp2.w.h vr13, vr2, vr9
+ vdp2.w.h vr14, vr3, vr9
+ vhaddw.q.w vr11
+ vhaddw.q.w vr12
+ vhaddw.q.w vr13
+ vhaddw.q.w vr14
+ vpackev.w vr0, vr12, vr11
+ vpackev.w vr1, vr14, vr13
+ vpackev.d vr0, vr1, vr0
+ vdp2.w.h vr11, vr5, vr9
+ vdp2.w.h vr12, vr6, vr9
+ vdp2.w.h vr13, vr7, vr9
+ vdp2.w.h vr14, vr10, vr9
+ vhaddw.q.w vr11
+ vhaddw.q.w vr12
+ vhaddw.q.w vr13
+ vhaddw.q.w vr14
+ vpackev.w vr1, vr12, vr11
+ vpackev.w vr2, vr14, vr13
+ vpackev.d vr1, vr2, vr1
+ vssrarni.hu.w vr1, vr0, 10
+ vssrani.bu.h vr1, vr1, 0
+ vstelm.w vr1, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.w vr1, a0, 0, 1
+ add.d a0, a0, a1
+ vbsrl.v vr0, vr5, 2
+ vbsrl.v vr1, vr6, 2
+ vbsrl.v vr2, vr7, 2
+ vbsrl.v vr3, vr10, 2
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_hv_4w_loop
+ b .l_\lable\()end_put_8tap
+
+.l_\lable\()put_hv_8w:
+.l_\lable\()put_hv_16w:
+.l_\lable\()put_hv_32w:
+.l_\lable\()put_hv_64w:
+.l_\lable\()put_hv_128w:
+ addi.d t0, a2, 0 //src
+ addi.d t5, a5, 0 //h
+ addi.d t8, a0, 0 //dst
+.l_\lable\()put_hv_8w_loop0:
+ vld vr0, a2, 0
+ vldx vr1, a2, a3
+ vldx vr2, a2, t2
+ add.d a2, a2, t3
+ vld vr3, a2, 0
+ vldx vr4, a2, a3
+ vldx vr5, a2, t2
+ vldx vr6, a2, t3
+ add.d a2, a2, t4
+ FILTER_8TAP_8W vr0
+ FILTER_8TAP_8W vr1
+ FILTER_8TAP_8W vr2
+ FILTER_8TAP_8W vr3
+ FILTER_8TAP_8W vr4
+ FILTER_8TAP_8W vr5
+ FILTER_8TAP_8W vr6
+ LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7,\
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7,\
+ vr10,vr11,vr12,vr13,vr14,vr15,vr16,vr17
+.l_\lable\()put_hv_8w_loop:
+ vld vr20, a2, 0
+ vldx vr21, a2, a3
+ add.d a2, a2, t2
+ FILTER_8TAP_8W vr20
+ FILTER_8TAP_8W vr21
+ VEXTRINS_Hx8 vr20
+ FILTER_8TAP_8W_CLIP_STORE
+ VBSRL_Vx8
+ VEXTRINS_Hx8 vr21
+ FILTER_8TAP_8W_CLIP_STORE
+ VBSRL_Vx8
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_hv_8w_loop
+ addi.d a2, t0, 8
+ addi.d t0, t0, 8
+ addi.d a0, t8, 8
+ addi.d t8, t8, 8
+ addi.d a5, t5, 0
+ addi.w a4, a4, -8
+ bnez a4, .l_\lable\()put_hv_8w_loop0
+.l_\lable\()end_put_8tap:
+.endm
+
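+// Each put_8tap wrapper stores its filter-type selector at sp[0]; the macro
+// reads it back, the low two bits selecting the horizontal filter family and
+// filter_type >> 2 the vertical one.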
+function put_8tap_regular_8bpc_lsx
+ addi.d sp, sp, -16
+ st.d zero, sp, 0
+ PUT_8TAP_8BPC_LSX 0
+ addi.d sp, sp, 16
+endfunc
+
+function put_8tap_smooth_regular_8bpc_lsx
+ addi.d sp, sp, -16
+ li.w t0, 1
+ st.d t0, sp, 0
+ PUT_8TAP_8BPC_LSX 1
+ addi.d sp, sp, 16
+endfunc
+
+function put_8tap_sharp_regular_8bpc_lsx
+ addi.d sp, sp, -16
+ li.w t0, 2
+ st.d t0, sp, 0
+ PUT_8TAP_8BPC_LSX 2
+ addi.d sp, sp, 16
+endfunc
+
+function put_8tap_regular_smooth_8bpc_lsx
+ addi.d sp, sp, -16
+ li.w t0, 4
+ st.d t0, sp, 0
+ PUT_8TAP_8BPC_LSX 4
+ addi.d sp, sp, 16
+endfunc
+
+function put_8tap_smooth_8bpc_lsx
+ addi.d sp, sp, -16
+ li.w t0, 5
+ st.d t0, sp, 0
+ PUT_8TAP_8BPC_LSX 5
+ addi.d sp, sp, 16
+endfunc
+
+function put_8tap_sharp_smooth_8bpc_lsx
+ addi.d sp, sp, -16
+ li.w t0, 6
+ st.d t0, sp, 0
+ PUT_8TAP_8BPC_LSX 6
+ addi.d sp, sp, 16
+endfunc
+
+function put_8tap_regular_sharp_8bpc_lsx
+ addi.d sp, sp, -16
+ li.w t0, 8
+ st.d t0, sp, 0
+ PUT_8TAP_8BPC_LSX 8
+ addi.d sp, sp, 16
+endfunc
+
+function put_8tap_smooth_sharp_8bpc_lsx
+ addi.d sp, sp, -16
+ li.w t0, 9
+ st.d t0, sp, 0
+ PUT_8TAP_8BPC_LSX 9
+ addi.d sp, sp, 16
+endfunc
+
+function put_8tap_sharp_8bpc_lsx
+ addi.d sp, sp, -16
+ li.w t0, 10
+ st.d t0, sp, 0
+ PUT_8TAP_8BPC_LSX 10
+ addi.d sp, sp, 16
+endfunc
+
+const shufb1
+.byte 0,1,2,3,4,5,6,7,1,2,3,4,5,6,7,8,0,1,2,3,4,5,6,7,1,2,3,4,5,6,7,8
+endconst
+
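+// shufb1 holds the byte pattern 0..7, 1..8 in each 128-bit lane. SHUFB pairs
+// it with a 2-byte-shifted copy of the source so one xvshuf.b yields four
+// overlapping 8-byte tap windows (offsets 0..3) ready for xvdp2.h.bu.b.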
+.macro SHUFB in0, in1, tmp, out
+ xvbsrl.v \tmp, \in0, 2
+ xvpermi.q \tmp, \in0, 0x20
+ xvshuf.b \out, \tmp, \tmp, \in1
+.endm
+
+.macro HADDWDH in0
+ xvhaddw.w.h \in0, \in0, \in0
+ xvhaddw.d.w \in0, \in0, \in0
+.endm
+
+.macro HADDWQW in0
+ xvhaddw.d.w \in0, \in0, \in0
+ xvhaddw.q.d \in0, \in0, \in0
+.endm
+
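+// PREP_W16_H: 8-tap horizontal filter of 16 pixels from one 32-byte row in
+// \in0; the four 8-byte sub-windows are shuffled, dot-producted against the
+// filter in xr22, reduced and repacked into 16 int16 results rounded by >>2.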
+.macro PREP_W16_H in0
+ xvbsrl.v xr4, \in0, 4
+ xvbsrl.v xr5, \in0, 8
+ xvpermi.q xr9, \in0, 0x31
+ xvpackev.d xr5, xr9, xr5
+ xvbsrl.v xr6, xr5, 4
+ SHUFB \in0, xr23, xr9, \in0
+ SHUFB xr4, xr23, xr9, xr4
+ SHUFB xr5, xr23, xr9, xr5
+ SHUFB xr6, xr23, xr9, xr6
+ xvdp2.h.bu.b xr10, \in0, xr22
+ xvdp2.h.bu.b xr11, xr4, xr22
+ xvdp2.h.bu.b xr12, xr5, xr22
+ xvdp2.h.bu.b xr13, xr6, xr22
+ HADDWDH xr10
+ HADDWDH xr11
+ HADDWDH xr12
+ HADDWDH xr13
+ xvpickev.w xr10, xr11, xr10
+ xvpickev.w xr11, xr13, xr12
+ xvpermi.d xr10, xr10, 0xd8
+ xvpermi.d xr11, xr11, 0xd8
+ xvpickev.h xr10, xr11, xr10
+ xvpermi.d xr10, xr10, 0xd8
+ xvsrari.h \in0, xr10, 2
+.endm
+
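+// prep_8tap (LASX): a0 = tmp (int16 output), a1 = src, a2 = src_stride,
+// a3 = w, a4 = h, a5 = mx, a6 = my; a7 carries the filter-type selector set
+// by each wrapper below.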
+.macro PREP_8TAP_8BPC_LASX lable
+ li.w t0, 4
+ la.local t6, dav1d_mc_subpel_filters
+ la.local t7, shufb1
+ xvld xr23, t7, 0
+ slli.d t2, a2, 1 //src_stride*2
+ add.d t3, t2, a2 //src_stride*3
+ slli.d t4, t2, 1
+
+ bnez a5, .l_\lable\()h //mx
+ bnez a6, .l_\lable\()v
+
+ clz.w t1, a3
+ li.w t5, 24
+ sub.w t1, t1, t5
+ la.local t5, .l_\lable\()prep_hv0_jtable
+ alsl.d t1, t1, t5, 1
+ ld.h t8, t1, 0
+ add.d t5, t5, t8
+ jirl $r0, t5, 0
+
+ .align 3
+.l_\lable\()prep_hv0_jtable:
+ .hword .l_\lable\()hv0_128w - .l_\lable\()prep_hv0_jtable
+ .hword .l_\lable\()hv0_64w - .l_\lable\()prep_hv0_jtable
+ .hword .l_\lable\()hv0_32w - .l_\lable\()prep_hv0_jtable
+ .hword .l_\lable\()hv0_16w - .l_\lable\()prep_hv0_jtable
+ .hword .l_\lable\()hv0_8w - .l_\lable\()prep_hv0_jtable
+ .hword .l_\lable\()hv0_4w - .l_\lable\()prep_hv0_jtable
+
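+// mx == 0 && my == 0: no filtering, pixels are widened to int16 and scaled
+// by 16 (<<4) into the prep buffer.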
+.l_\lable\()hv0_4w:
+ fld.s f0, a1, 0
+ fldx.s f1, a1, a2
+ fldx.s f2, a1, t2
+ fldx.s f3, a1, t3
+ add.d a1, a1, t4
+ xvpackev.w xr0, xr1, xr0
+ xvpackev.w xr1, xr3, xr2
+ xvpermi.q xr0, xr1, 0x02
+ xvsllwil.hu.bu xr0, xr0, 4
+ xvst xr0, a0, 0
+ addi.d a0, a0, 32
+ addi.d a4, a4, -4
+ bnez a4, .l_\lable\()hv0_4w
+ b .l_\lable\()end_pre_8tap
+.l_\lable\()hv0_8w:
+ fld.d f0, a1, 0
+ fldx.d f1, a1, a2
+ fldx.d f2, a1, t2
+ fldx.d f3, a1, t3
+ add.d a1, a1, t4
+ xvpermi.q xr0, xr1, 0x02
+ xvpermi.q xr2, xr3, 0x02
+ xvsllwil.hu.bu xr0, xr0, 4
+ xvsllwil.hu.bu xr2, xr2, 4
+ xvst xr0, a0, 0
+ xvst xr2, a0, 32
+ addi.d a0, a0, 64
+ addi.d a4, a4, -4
+ bnez a4, .l_\lable\()hv0_8w
+ b .l_\lable\()end_pre_8tap
+.l_\lable\()hv0_16w:
+ vld vr0, a1, 0
+ vldx vr1, a1, a2
+ vldx vr2, a1, t2
+ vldx vr3, a1, t3
+ add.d a1, a1, t4
+ vext2xv.hu.bu xr0, xr0
+ vext2xv.hu.bu xr1, xr1
+ vext2xv.hu.bu xr2, xr2
+ vext2xv.hu.bu xr3, xr3
+ xvslli.h xr0, xr0, 4
+ xvslli.h xr1, xr1, 4
+ xvslli.h xr2, xr2, 4
+ xvslli.h xr3, xr3, 4
+ xvst xr0, a0, 0
+ xvst xr1, a0, 32
+ xvst xr2, a0, 64
+ xvst xr3, a0, 96
+ addi.d a0, a0, 128
+ addi.d a4, a4, -4
+ bnez a4, .l_\lable\()hv0_16w
+ b .l_\lable\()end_pre_8tap
+.l_\lable\()hv0_32w:
+ xvld xr0, a1, 0
+ xvldx xr1, a1, a2
+ xvldx xr2, a1, t2
+ xvldx xr3, a1, t3
+ add.d a1, a1, t4
+ xvpermi.d xr4, xr0, 0xD8
+ xvpermi.d xr5, xr1, 0xD8
+ xvpermi.d xr6, xr2, 0xD8
+ xvpermi.d xr7, xr3, 0xD8
+ xvpermi.d xr10, xr0, 0x32
+ xvpermi.d xr11, xr1, 0x32
+ xvpermi.d xr12, xr2, 0x32
+ xvpermi.d xr13, xr3, 0x32
+ xvsllwil.hu.bu xr0, xr4, 4
+ xvsllwil.hu.bu xr1, xr5, 4
+ xvsllwil.hu.bu xr2, xr6, 4
+ xvsllwil.hu.bu xr3, xr7, 4
+ xvsllwil.hu.bu xr4, xr10, 4
+ xvsllwil.hu.bu xr5, xr11, 4
+ xvsllwil.hu.bu xr6, xr12, 4
+ xvsllwil.hu.bu xr7, xr13, 4
+ xvst xr0, a0, 0
+ xvst xr4, a0, 32
+ xvst xr1, a0, 64
+ xvst xr5, a0, 96
+ xvst xr2, a0, 128
+ xvst xr6, a0, 160
+ xvst xr3, a0, 192
+ xvst xr7, a0, 224
+ addi.d a0, a0, 256
+ addi.d a4, a4, -4
+ bnez a4, .l_\lable\()hv0_32w
+ b .l_\lable\()end_pre_8tap
+.l_\lable\()hv0_64w:
+.l_\lable\()hv0_128w:
+ addi.d t0, a1, 0
+ addi.d t5, a4, 0
+ srli.w t7, a3, 5
+ slli.w t7, t7, 6
+ addi.d t8, a0, 0
+.l_\lable\()hv0_32_loop:
+ xvld xr0, a1, 0
+ xvldx xr1, a1, a2
+ xvldx xr2, a1, t2
+ xvldx xr3, a1, t3
+ add.d a1, a1, t4
+ xvpermi.d xr4, xr0, 0xD8
+ xvpermi.d xr5, xr1, 0xD8
+ xvpermi.d xr6, xr2, 0xD8
+ xvpermi.d xr7, xr3, 0xD8
+ xvpermi.d xr10, xr0, 0x32
+ xvpermi.d xr11, xr1, 0x32
+ xvpermi.d xr12, xr2, 0x32
+ xvpermi.d xr13, xr3, 0x32
+ xvsllwil.hu.bu xr0, xr4, 4
+ xvsllwil.hu.bu xr1, xr5, 4
+ xvsllwil.hu.bu xr2, xr6, 4
+ xvsllwil.hu.bu xr3, xr7, 4
+ xvsllwil.hu.bu xr4, xr10, 4
+ xvsllwil.hu.bu xr5, xr11, 4
+ xvsllwil.hu.bu xr6, xr12, 4
+ xvsllwil.hu.bu xr7, xr13, 4
+ xvst xr0, a0, 0
+ xvst xr4, a0, 32
+ add.d t1, a0, t7
+ xvst xr1, t1, 0
+ xvst xr5, t1, 32
+ add.d t1, t1, t7
+ xvst xr2, t1, 0
+ xvst xr6, t1, 32
+ add.d t1, t1, t7
+ xvst xr3, t1, 0
+ xvst xr7, t1, 32
+ add.d a0, t1, t7
+ addi.d a4, a4, -4
+ bnez a4, .l_\lable\()hv0_32_loop
+ addi.d a1, t0, 32
+ addi.d t0, t0, 32
+ addi.d a0, t8, 64
+ addi.d t8, t8, 64
+ addi.d a4, t5, 0
+ addi.d a3, a3, -32
+ bnez a3, .l_\lable\()hv0_32_loop
+ b .l_\lable\()end_pre_8tap
+
+.l_\lable\()h:
+ bnez a6, .l_\lable\()hv //if (fh && fv)
+
+ andi t1, a7, 3
+ blt t0, a3, .l_\lable\()h_idx_fh
+ andi t1, a7, 1
+ addi.w t1, t1, 3
+.l_\lable\()h_idx_fh:
+ addi.w t5, zero, 120
+ mul.w t1, t1, t5
+ addi.w t5, a5, -1
+ slli.w t5, t5, 3
+ add.w t1, t1, t5
+ add.d t1, t6, t1 //fh's offset
+ xvldrepl.d xr22, t1, 0
+
+ addi.d a1, a1, -3
+ clz.w t1, a3
+ li.w t5, 24
+ sub.w t1, t1, t5
+ la.local t5, .l_\lable\()prep_h_jtable
+ alsl.d t1, t1, t5, 1
+ ld.h t8, t1, 0
+ add.d t5, t5, t8
+ jirl $r0, t5, 0
+
+ .align 3
+.l_\lable\()prep_h_jtable:
+ .hword .l_\lable\()h_128w - .l_\lable\()prep_h_jtable
+ .hword .l_\lable\()h_64w - .l_\lable\()prep_h_jtable
+ .hword .l_\lable\()h_32w - .l_\lable\()prep_h_jtable
+ .hword .l_\lable\()h_16w - .l_\lable\()prep_h_jtable
+ .hword .l_\lable\()h_8w - .l_\lable\()prep_h_jtable
+ .hword .l_\lable\()h_4w - .l_\lable\()prep_h_jtable
+
+.l_\lable\()h_4w:
+ xvld xr0, a1, 0
+ xvldx xr1, a1, a2
+ xvldx xr2, a1, t2
+ xvldx xr3, a1, t3
+ add.d a1, a1, t4
+
+ SHUFB xr0, xr23, xr9, xr0
+ SHUFB xr1, xr23, xr9, xr1
+ SHUFB xr2, xr23, xr9, xr2
+ SHUFB xr3, xr23, xr9, xr3
+
+ xvdp2.h.bu.b xr10, xr0, xr22
+ xvdp2.h.bu.b xr12, xr1, xr22
+ xvdp2.h.bu.b xr14, xr2, xr22
+ xvdp2.h.bu.b xr16, xr3, xr22
+
+ HADDWDH xr10 //h0 mid0 mid1 mid2 mid3
+ HADDWDH xr12 //h1 mid4 mid5 mid6 mid7
+ HADDWDH xr14 //h2
+ HADDWDH xr16 //h3
+
+ xvpickev.w xr10, xr12, xr10
+ xvpickev.w xr14, xr16, xr14
+ xvpermi.d xr10, xr10, 0xd8
+ xvpermi.d xr14, xr14, 0xd8
+ xvpickev.h xr10, xr14, xr10
+ xvpermi.d xr10, xr10, 0xd8
+ xvsrari.h xr10, xr10, 2
+
+ xvst xr10, a0, 0
+ addi.d a0, a0, 32
+ addi.w a4, a4, -4
+ bnez a4, .l_\lable\()h_4w
+ b .l_\lable\()end_pre_8tap
+
+.l_\lable\()h_8w:
+ xvld xr0, a1, 0
+ xvldx xr2, a1, a2
+ xvldx xr4, a1, t2
+ xvldx xr6, a1, t3
+ add.d a1, a1, t4
+
+ xvbsrl.v xr1, xr0, 4
+ xvbsrl.v xr3, xr2, 4
+ xvbsrl.v xr5, xr4, 4
+ xvbsrl.v xr7, xr6, 4
+
+ SHUFB xr0, xr23, xr9, xr10
+ SHUFB xr1, xr23, xr9, xr11
+ SHUFB xr2, xr23, xr9, xr12
+ SHUFB xr3, xr23, xr9, xr13
+ SHUFB xr4, xr23, xr9, xr14
+ SHUFB xr5, xr23, xr9, xr15
+ SHUFB xr6, xr23, xr9, xr16
+ SHUFB xr7, xr23, xr9, xr17
+
+ xvdp2.h.bu.b xr0, xr10, xr22
+ xvdp2.h.bu.b xr1, xr11, xr22
+ xvdp2.h.bu.b xr2, xr12, xr22
+ xvdp2.h.bu.b xr3, xr13, xr22
+ xvdp2.h.bu.b xr4, xr14, xr22
+ xvdp2.h.bu.b xr5, xr15, xr22
+ xvdp2.h.bu.b xr6, xr16, xr22
+ xvdp2.h.bu.b xr7, xr17, xr22
+
+ HADDWDH xr0
+ HADDWDH xr1
+ HADDWDH xr2
+ HADDWDH xr3
+ HADDWDH xr4
+ HADDWDH xr5
+ HADDWDH xr6
+ HADDWDH xr7
+
+ xvpickev.w xr0, xr1, xr0
+ xvpickev.w xr2, xr3, xr2
+ xvpermi.d xr0, xr0, 0xd8
+ xvpermi.d xr2, xr2, 0xd8
+ xvpickev.h xr0, xr2, xr0
+ xvpermi.d xr0, xr0, 0xd8
+ xvsrari.h xr0, xr0, 2
+
+ xvpickev.w xr4, xr5, xr4
+ xvpickev.w xr6, xr7, xr6
+ xvpermi.d xr4, xr4, 0xd8
+ xvpermi.d xr6, xr6, 0xd8
+ xvpickev.h xr4, xr6, xr4
+ xvpermi.d xr4, xr4, 0xd8
+ xvsrari.h xr4, xr4, 2
+
+ xvst xr0, a0, 0
+ xvst xr4, a0, 32
+ addi.d a0, a0, 64
+ addi.d a4, a4, -4
+ bnez a4, .l_\lable\()h_8w
+ b .l_\lable\()end_pre_8tap
+
+.l_\lable\()h_16w:
+ xvld xr0, a1, 0
+ xvldx xr1, a1, a2
+ xvldx xr2, a1, t2
+ xvldx xr3, a1, t3
+ add.d a1, a1, t4
+
+ PREP_W16_H xr0
+ PREP_W16_H xr1
+ PREP_W16_H xr2
+ PREP_W16_H xr3
+
+ xvst xr0, a0, 0
+ xvst xr1, a0, 32
+ xvst xr2, a0, 64
+ xvst xr3, a0, 96
+
+ addi.d a0, a0, 128
+ addi.w a4, a4, -4
+ bnez a4, .l_\lable\()h_16w
+ b .l_\lable\()end_pre_8tap
+
+.l_\lable\()h_32w:
+.l_\lable\()h_64w:
+.l_\lable\()h_128w:
+ addi.d t0, a1, 0 //src
+ addi.d t5, a4, 0 //h
+ srli.w t7, a3, 4 //w
+ slli.w t7, t7, 5 //store offset
+ addi.d t8, a0, 0 //dst
+.l_\lable\()h_16_loop:
+ xvld xr0, a1, 0
+ xvldx xr1, a1, a2
+ xvldx xr2, a1, t2
+ xvldx xr3, a1, t3
+ add.d a1, a1, t4
+
+ PREP_W16_H xr0
+ PREP_W16_H xr1
+ PREP_W16_H xr2
+ PREP_W16_H xr3
+
+ xvst xr0, a0, 0
+ xvstx xr1, a0, t7
+ slli.w t1, t7, 1
+ xvstx xr2, a0, t1
+ add.w t1, t1, t7
+ xvstx xr3, a0, t1
+ slli.w t1, t7, 2
+ add.d a0, a0, t1
+ addi.d a4, a4, -4
+ bnez a4, .l_\lable\()h_16_loop
+
+ addi.d a1, t0, 16
+ addi.d t0, t0, 16
+ addi.d a0, t8, 32
+ addi.d t8, t8, 32
+ addi.d a4, t5, 0
+ addi.d a3, a3, -16
+ bnez a3, .l_\lable\()h_16_loop
+ b .l_\lable\()end_pre_8tap
+.l_\lable\()hv:
+ andi t1, a7, 3
+ blt t0, a3, .l_\lable\()hv_idx_fh
+ andi t1, a7, 1
+ addi.w t1, t1, 3
+.l_\lable\()hv_idx_fh:
+ addi.w t5, zero, 120
+ mul.w t1, t1, t5
+ addi.w t5, a5, -1
+ slli.w t5, t5, 3
+ add.w t1, t1, t5
+ add.d t1, t6, t1 //fh's offset
+ xvldrepl.d xr22, t1, 0
+ srli.w a7, a7, 2
+ blt t0, a4, .l_\lable\()hv_idx_fv
+ andi a7, a7, 1
+ addi.w a7, a7, 3
+.l_\lable\()hv_idx_fv:
+ addi.w t5, zero, 120
+ mul.w a7, a7, t5
+ addi.w t5, a6, -1
+ slli.w t5, t5, 3
+ add.w a7, a7, t5
+ add.d a7, t6, a7 //fv's offset
+ xvldrepl.d xr8, a7, 0
+ xvsllwil.h.b xr8, xr8, 0
+
+ sub.d a1, a1, t3
+ addi.d a1, a1, -3
+ beq a3, t0, .l_\lable\()hv_4w
+ b .l_\lable\()hv_8w
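+// w == 4: seven priming rows are filtered horizontally and transposed into
+// the column pipelines xr18/xr19; each loop iteration filters four new rows
+// and runs the vertical 8-tap on the updated pipelines.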
+.l_\lable\()hv_4w:
+ xvld xr0, a1, 0
+ xvldx xr1, a1, a2
+ xvldx xr2, a1, t2
+ xvldx xr3, a1, t3
+ add.d a1, a1, t4
+ xvld xr4, a1, 0
+ xvldx xr5, a1, a2
+ xvldx xr6, a1, t2
+
+ SHUFB xr0, xr23, xr9, xr0
+ SHUFB xr1, xr23, xr9, xr1
+ SHUFB xr2, xr23, xr9, xr2
+ SHUFB xr3, xr23, xr9, xr3
+
+ SHUFB xr4, xr23, xr9, xr4
+ SHUFB xr5, xr23, xr9, xr5
+ SHUFB xr6, xr23, xr9, xr6
+
+ xvdp2.h.bu.b xr10, xr0, xr22
+ xvdp2.h.bu.b xr11, xr1, xr22
+ xvdp2.h.bu.b xr12, xr2, xr22
+ xvdp2.h.bu.b xr13, xr3, xr22
+
+ xvdp2.h.bu.b xr14, xr4, xr22
+ xvdp2.h.bu.b xr15, xr5, xr22
+ xvdp2.h.bu.b xr16, xr6, xr22
+
+ HADDWDH xr10 //h0 mid0 mid1 mid2 mid3
+ HADDWDH xr11 //h1 mid4 mid5 mid6 mid7
+ HADDWDH xr12 //h2
+ HADDWDH xr13 //h3
+
+ xvpackev.w xr10, xr11, xr10
+ xvpackev.w xr12, xr13, xr12
+ xvpackev.d xr11, xr12, xr10
+ xvpackod.d xr10, xr12, xr10
+ xvpickev.h xr11, xr10, xr11
+ xvsrari.h xr11, xr11, 2
+
+ HADDWDH xr14 //h4
+ HADDWDH xr15 //h5
+ HADDWDH xr16 //h6
+
+ xvpackev.w xr14, xr15, xr14
+ xvpackev.w xr16, xr17, xr16
+ xvpackev.d xr17, xr16, xr14
+ xvpackod.d xr14, xr16, xr14
+ xvpickev.h xr13, xr14, xr17
+ xvsrari.h xr13, xr13, 2
+
+ xvpackev.d xr18, xr13, xr11 //0 4 8 12 16 20 24 * 2 6 10 14 18 22 26 *
+ xvpackod.d xr19, xr13, xr11 //1 5 9 13 17 21 25 * 3 7 11 15 19 23 27 *
+.l_\lable\()hv_w4_loop:
+ xvldx xr0, a1, t3
+ add.d a1, a1, t4
+ xvld xr1, a1, 0
+ xvldx xr2, a1, a2
+ xvldx xr3, a1, t2
+
+ SHUFB xr0, xr23, xr9, xr0
+ SHUFB xr1, xr23, xr9, xr1
+ SHUFB xr2, xr23, xr9, xr2
+ SHUFB xr3, xr23, xr9, xr3
+
+ xvdp2.h.bu.b xr10, xr0, xr22
+ xvdp2.h.bu.b xr12, xr1, xr22
+ xvdp2.h.bu.b xr14, xr2, xr22
+ xvdp2.h.bu.b xr16, xr3, xr22
+
+ HADDWDH xr10 //h0 mid0 mid1 mid2 mid3
+ HADDWDH xr12 //h1 mid4 mid5 mid6 mid7
+ HADDWDH xr14 //h2
+ HADDWDH xr16 //h3
+
+ xvpackev.w xr10, xr12, xr10
+ xvpackev.w xr14, xr16, xr14
+ xvpackev.d xr12, xr14, xr10
+ xvpackod.d xr10, xr14, xr10
+ xvpickev.h xr12, xr10, xr12
+ xvsrari.h xr12, xr12, 2
+
+ xvextrins.h xr18, xr12, 0x70 //0 4 8 12 16 20 24 0(x0) 2 6 10 14 18 22 26 2(x2)
+ xvextrins.h xr19, xr12, 0x74 //1 5 9 13 17 21 25 0(x1) 3 7 11 15 19 23 27 2(x3)
+
+ xvdp2.w.h xr0, xr18, xr8
+ xvdp2.w.h xr2, xr19, xr8
+ HADDWQW xr0
+ HADDWQW xr2
+ xvpackev.w xr0, xr2, xr0
+
+ xvbsrl.v xr18, xr18, 2
+ xvbsrl.v xr19, xr19, 2
+ xvextrins.h xr18, xr12, 0x71
+ xvextrins.h xr19, xr12, 0x75
+ xvdp2.w.h xr2, xr18, xr8
+ xvdp2.w.h xr4, xr19, xr8
+ HADDWQW xr2
+ HADDWQW xr4
+ xvpackev.w xr2, xr4, xr2
+
+ xvbsrl.v xr18, xr18, 2
+ xvbsrl.v xr19, xr19, 2
+ xvextrins.h xr18, xr12, 0x72
+ xvextrins.h xr19, xr12, 0x76
+ xvdp2.w.h xr4, xr18, xr8
+ xvdp2.w.h xr9, xr19, xr8
+ HADDWQW xr4
+ HADDWQW xr9
+ xvpackev.w xr4, xr9, xr4
+
+ xvbsrl.v xr18, xr18, 2
+ xvbsrl.v xr19, xr19, 2
+ xvextrins.h xr18, xr12, 0x73
+ xvextrins.h xr19, xr12, 0x77
+ xvdp2.w.h xr9, xr18, xr8
+ xvdp2.w.h xr11, xr19, xr8
+ HADDWQW xr9
+ HADDWQW xr11
+ xvpackev.w xr9, xr11, xr9
+
+ xvpackev.d xr0, xr2, xr0
+ xvpackev.d xr4, xr9, xr4
+ xvsrari.w xr0, xr0, 6
+ xvsrari.w xr4, xr4, 6
+ xvpermi.d xr0, xr0, 0xd8
+ xvpermi.d xr4, xr4, 0xd8
+ xvpickev.h xr0, xr4, xr0
+ xvpermi.d xr0, xr0, 0xd8
+ xvst xr0, a0, 0
+ addi.d a0, a0, 32
+
+ xvbsrl.v xr18, xr18, 2
+ xvbsrl.v xr19, xr19, 2
+
+ addi.d a4, a4, -4
+ bnez a4, .l_\lable\()hv_w4_loop
+ b .l_\lable\()end_pre_8tap
+
+.l_\lable\()hv_8w:
+ addi.d t0, a1, 0
+ addi.d t5, a4, 0
+ srli.w t7, a3, 3
+ slli.w t7, t7, 4 // store offset
+ addi.d t8, a0, 0
+.l_\lable\()hv_8w_loop0:
+ xvld xr0, a1, 0
+ xvldx xr2, a1, a2
+ xvldx xr4, a1, t2
+ xvldx xr6, a1, t3
+
+ add.d a1, a1, t4
+ xvld xr10, a1, 0
+ xvldx xr11, a1, a2
+ xvldx xr12, a1, t2
+
+ xvbsrl.v xr1, xr0, 4
+ xvbsrl.v xr3, xr2, 4
+ xvbsrl.v xr5, xr4, 4
+ xvbsrl.v xr7, xr6, 4
+
+ SHUFB xr0, xr23, xr9, xr13
+ SHUFB xr1, xr23, xr9, xr14
+ SHUFB xr2, xr23, xr9, xr15
+ SHUFB xr3, xr23, xr9, xr16
+ SHUFB xr4, xr23, xr9, xr17
+ SHUFB xr5, xr23, xr9, xr18
+ SHUFB xr6, xr23, xr9, xr19
+ SHUFB xr7, xr23, xr9, xr20
+
+ xvdp2.h.bu.b xr0, xr13, xr22
+ xvdp2.h.bu.b xr1, xr14, xr22
+ xvdp2.h.bu.b xr2, xr15, xr22
+ xvdp2.h.bu.b xr3, xr16, xr22
+ xvdp2.h.bu.b xr4, xr17, xr22
+ xvdp2.h.bu.b xr5, xr18, xr22
+ xvdp2.h.bu.b xr6, xr19, xr22
+ xvdp2.h.bu.b xr7, xr20, xr22
+
+ HADDWDH xr0
+ HADDWDH xr1
+ HADDWDH xr2
+ HADDWDH xr3
+ HADDWDH xr4
+ HADDWDH xr5
+ HADDWDH xr6
+ HADDWDH xr7
+
+ xvpackev.w xr0, xr2, xr0
+ xvpackev.w xr2, xr6, xr4
+ xvpackev.d xr16, xr2, xr0
+ xvpackod.d xr0, xr2, xr0
+ xvpickev.h xr0, xr0, xr16
+ xvsrari.h xr0, xr0, 2 // 0 8 16 24 1 9 17 25 2 10 18 26 3 11 19 27
+
+ xvpackev.w xr1, xr3, xr1
+ xvpackev.w xr3, xr7, xr5
+ xvpackev.d xr16, xr3, xr1
+ xvpackod.d xr1, xr3, xr1
+ xvpickev.h xr1, xr1, xr16
+ xvsrari.h xr1, xr1, 2 // 4 12 20 28 5 13 21 29 6 14 22 30 7 15 23 31
+
+ xvbsrl.v xr13, xr10, 4
+ xvbsrl.v xr14, xr11, 4
+ xvbsrl.v xr15, xr12, 4
+
+ SHUFB xr10, xr23, xr9, xr10
+ SHUFB xr13, xr23, xr9, xr13
+ SHUFB xr11, xr23, xr9, xr11
+ SHUFB xr14, xr23, xr9, xr14
+ SHUFB xr12, xr23, xr9, xr12
+ SHUFB xr15, xr23, xr9, xr15
+
+ xvdp2.h.bu.b xr4, xr10, xr22
+ xvdp2.h.bu.b xr5, xr13, xr22
+ xvdp2.h.bu.b xr6, xr11, xr22
+ xvdp2.h.bu.b xr7, xr14, xr22
+ xvdp2.h.bu.b xr9, xr12, xr22
+ xvdp2.h.bu.b xr10, xr15, xr22
+
+ HADDWDH xr4
+ HADDWDH xr5
+ HADDWDH xr6
+ HADDWDH xr7
+ HADDWDH xr9
+ HADDWDH xr10
+
+ xvpackev.w xr4, xr6, xr4
+ xvpackev.w xr9, xr12, xr9
+ xvpackev.d xr16, xr9, xr4
+ xvpackod.d xr11, xr9, xr4
+ xvpickev.h xr2, xr11, xr16
+ xvsrari.h xr2, xr2, 2 // 32 40 48 * 33 41 49 * 34 42 50 * 35 43 51 *
+
+ xvpackev.w xr5, xr7, xr5
+ xvpackev.w xr10, xr12, xr10
+ xvpackev.d xr16, xr10, xr5
+ xvpackod.d xr11, xr10, xr5
+ xvpickev.h xr3, xr11, xr16
+ xvsrari.h xr3, xr3, 2 // 36 44 52 * 37 45 53 * 38 46 54 * 39 47 55 *
+
+ xvpackev.d xr18, xr2, xr0 // 0 8 16 24 32 40 48 * 2 10 18 26 34 42 50 *
+ xvpackod.d xr19, xr2, xr0 // 1 9 17 25 33 41 49 * 3 11 19 27 35 43 51 *
+ xvpackev.d xr20, xr3, xr1 // 4 12 20 28 36 44 52 * 6 14 22 30 38 46 54 *
+ xvpackod.d xr21, xr3, xr1 // 5 13 21 29 37 45 53 * 7 15 23 31 39 47 55 *
+
+.l_\lable\()hv_8w_loop:
+ xvldx xr0, a1, t3
+ add.d a1, a1, t4
+ xvld xr2, a1, 0
+ xvldx xr4, a1, a2
+ xvldx xr6, a1, t2
+
+ xvbsrl.v xr1, xr0, 4
+ xvbsrl.v xr3, xr2, 4
+ xvbsrl.v xr5, xr4, 4
+ xvbsrl.v xr7, xr6, 4
+
+ SHUFB xr0, xr23, xr9, xr0
+ SHUFB xr1, xr23, xr9, xr1
+ SHUFB xr2, xr23, xr9, xr2
+ SHUFB xr3, xr23, xr9, xr3
+ SHUFB xr4, xr23, xr9, xr4
+ SHUFB xr5, xr23, xr9, xr5
+ SHUFB xr6, xr23, xr9, xr6
+ SHUFB xr7, xr23, xr9, xr7
+
+ xvdp2.h.bu.b xr10, xr0, xr22
+ xvdp2.h.bu.b xr11, xr1, xr22
+ xvdp2.h.bu.b xr12, xr2, xr22
+ xvdp2.h.bu.b xr13, xr3, xr22
+ xvdp2.h.bu.b xr14, xr4, xr22
+ xvdp2.h.bu.b xr15, xr5, xr22
+ xvdp2.h.bu.b xr16, xr6, xr22
+ xvdp2.h.bu.b xr17, xr7, xr22
+
+ HADDWDH xr10
+ HADDWDH xr11
+ HADDWDH xr12
+ HADDWDH xr13
+ HADDWDH xr14
+ HADDWDH xr15
+ HADDWDH xr16
+ HADDWDH xr17
+
+ xvpackev.w xr0, xr12, xr10
+ xvpackev.w xr2, xr16, xr14
+ xvpackev.d xr9, xr2, xr0
+ xvpackod.d xr0, xr2, xr0
+ xvpickev.h xr0, xr0, xr9
+ xvsrari.h xr0, xr0, 2 // 56 64 72 80 57 65 73 81 58 66 74 82 59 67 75 83
+
+ xvpackev.w xr1, xr13, xr11
+ xvpackev.w xr3, xr17, xr15
+ xvpackev.d xr9, xr3, xr1
+ xvpackod.d xr1, xr3, xr1
+ xvpickev.h xr1, xr1, xr9
+ xvsrari.h xr1, xr1, 2 // 60 68 76 84 61 69 77 85 62 70 78 86 63 71 79 87
+
+ xvextrins.h xr18, xr0, 0x70 // 0 8 16 24 32 40 48 (56) 2 10 18 26 34 42 50 (58)
+ xvextrins.h xr19, xr0, 0x74 // 1 9 17 25 33 41 49 (57) 3 11 19 27 35 43 51 (59)
+ xvextrins.h xr20, xr1, 0x70
+ xvextrins.h xr21, xr1, 0x74
+
+ //h - 1
+ xvdp2.w.h xr10, xr18, xr8
+ xvdp2.w.h xr11, xr19, xr8
+ xvdp2.w.h xr12, xr20, xr8
+ xvdp2.w.h xr13, xr21, xr8
+
+ HADDWQW xr10
+ HADDWQW xr11
+ HADDWQW xr12
+ HADDWQW xr13
+
+ xvpackev.w xr2, xr11, xr10 //0 1 * * 2 3 * *
+ xvpackev.w xr3, xr13, xr12 //4 5 * * 6 7 * *
+ xvpackev.d xr2, xr3, xr2 //0 1 4 5 2 3 6 7
+ //h - 2
+ xvbsrl.v xr4, xr18, 2
+ xvbsrl.v xr5, xr19, 2
+ xvbsrl.v xr6, xr20, 2
+ xvbsrl.v xr7, xr21, 2
+ xvextrins.h xr4, xr0, 0x71
+ xvextrins.h xr5, xr0, 0x75
+ xvextrins.h xr6, xr1, 0x71
+ xvextrins.h xr7, xr1, 0x75
+
+ xvdp2.w.h xr10, xr4, xr8
+ xvdp2.w.h xr11, xr5, xr8
+ xvdp2.w.h xr12, xr6, xr8
+ xvdp2.w.h xr13, xr7, xr8
+
+ HADDWQW xr10
+ HADDWQW xr11
+ HADDWQW xr12
+ HADDWQW xr13
+
+ xvpackev.w xr14, xr11, xr10
+ xvpackev.w xr15, xr13, xr12
+ xvpackev.d xr14, xr15, xr14 //8 9 12 13 10 11 14 15
+ //h - 3
+ xvbsrl.v xr4, xr4, 2
+ xvbsrl.v xr5, xr5, 2
+ xvbsrl.v xr6, xr6, 2
+ xvbsrl.v xr7, xr7, 2
+ xvextrins.h xr4, xr0, 0x72
+ xvextrins.h xr5, xr0, 0x76
+ xvextrins.h xr6, xr1, 0x72
+ xvextrins.h xr7, xr1, 0x76
+
+ xvdp2.w.h xr10, xr4, xr8
+ xvdp2.w.h xr11, xr5, xr8
+ xvdp2.w.h xr12, xr6, xr8
+ xvdp2.w.h xr13, xr7, xr8
+
+ HADDWQW xr10
+ HADDWQW xr11
+ HADDWQW xr12
+ HADDWQW xr13
+
+ xvpackev.w xr15, xr11, xr10
+ xvpackev.w xr16, xr13, xr12
+ xvpackev.d xr15, xr16, xr15 //16 17 20 21 18 19 22 23
+ //h - 4
+ xvbsrl.v xr4, xr4, 2
+ xvbsrl.v xr5, xr5, 2
+ xvbsrl.v xr6, xr6, 2
+ xvbsrl.v xr7, xr7, 2
+ xvextrins.h xr4, xr0, 0x73
+ xvextrins.h xr5, xr0, 0x77
+ xvextrins.h xr6, xr1, 0x73
+ xvextrins.h xr7, xr1, 0x77
+
+ xvdp2.w.h xr10, xr4, xr8
+ xvdp2.w.h xr11, xr5, xr8
+ xvdp2.w.h xr12, xr6, xr8
+ xvdp2.w.h xr13, xr7, xr8
+
+ HADDWQW xr10
+ HADDWQW xr11
+ HADDWQW xr12
+ HADDWQW xr13
+
+ xvpackev.w xr16, xr11, xr10
+ xvpackev.w xr17, xr13, xr12
+ xvpackev.d xr16, xr17, xr16 //24 25 28 29 26 27 30 31
+
+ xvsrari.w xr2, xr2, 6
+ xvsrari.w xr14, xr14, 6
+ xvsrari.w xr15, xr15, 6
+ xvsrari.w xr16, xr16, 6
+
+ xvpermi.d xr2, xr2, 0xd8
+ xvpermi.d xr14, xr14, 0xd8
+ xvpermi.d xr15, xr15, 0xd8
+ xvpermi.d xr16, xr16, 0xd8
+ xvpickev.h xr2, xr14, xr2
+ xvpickev.h xr3, xr16, xr15
+ xvpermi.d xr2, xr2, 0xd8
+ xvpermi.d xr3, xr3, 0xd8
+
+ xvpermi.q xr10, xr2, 0x31
+ xvpermi.q xr11, xr3, 0x31
+
+ vst vr2, a0, 0
+ vstx vr10, a0, t7 //32
+ slli.w t1, t7, 1 //64
+ vstx vr3, a0, t1
+ add.w t1, t1, t7 //96
+ vstx vr11, a0, t1
+ slli.w t1, t7, 2 //128
+ add.d a0, a0, t1
+
+ xvbsrl.v xr18, xr4, 2
+ xvbsrl.v xr19, xr5, 2
+ xvbsrl.v xr20, xr6, 2
+ xvbsrl.v xr21, xr7, 2
+
+ addi.d a4, a4, -4
+ bnez a4, .l_\lable\()hv_8w_loop
+
+ addi.d a1, t0, 8
+ addi.d t0, t0, 8
+ addi.d a0, t8, 16
+ addi.d t8, t8, 16
+ addi.d a4, t5, 0
+ addi.d a3, a3, -8
+ bnez a3, .l_\lable\()hv_8w_loop0
+ b .l_\lable\()end_pre_8tap
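+// my != 0, mx == 0: vertical-only prep; per-column byte histories stay packed
+// in xr0 and are refreshed with four new rows per iteration.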
+.l_\lable\()v:
+
+ srli.w a7, a7, 2
+ blt t0, a4, .l_\lable\()v_idx_fv
+ andi a7, a7, 1
+ addi.w a7, a7, 3
+.l_\lable\()v_idx_fv:
+ addi.w t5, zero, 120
+ mul.w a7, a7, t5
+ addi.w t5, a6, -1
+ slli.w t5, t5, 3
+ add.w a7, a7, t5
+ add.d a7, t6, a7 //fv's offset
+ xvldrepl.d xr8, a7, 0
+
+ sub.d a1, a1, t3
+ beq a3, t0, .l_\lable\()v_4w
+ blt t0, a3, .l_\lable\()v_8w
+.l_\lable\()v_4w:
+ fld.s f0, a1, 0
+ fldx.s f1, a1, a2
+ fldx.s f2, a1, t2
+ add.d a1, a1, t3
+ fld.s f3, a1, 0
+ fldx.s f4, a1, a2
+ fldx.s f5, a1, t2
+ fldx.s f6, a1, t3
+
+ xvilvl.b xr0, xr1, xr0 // 0 1 8 9 16 17 24 25
+ xvilvl.b xr1, xr3, xr2 // 2 3 10 11 18 19 26 27
+ xvilvl.b xr2, xr5, xr4 // 4 5 12 13 20 21 28 29
+ xvilvl.b xr3, xr7, xr6 // 6 7 14 15 22 23 30 31
+ xvilvl.h xr0, xr1, xr0 // 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27
+ xvilvl.h xr1, xr3, xr2 // 4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31
+ xvilvl.w xr2, xr1, xr0
+ xvilvh.w xr0, xr1, xr0
+ xvpermi.q xr0, xr2, 0x20
+
+.l_\lable\()v_4w_loop:
+ add.d a1, a1, t4
+ fld.s f7, a1, 0 //h0
+ fldx.s f10, a1, a2 //h1
+ fldx.s f11, a1, t2 //h2
+ fldx.s f12, a1, t3 //h3
+
+ xvbsrl.v xr9, xr7, 2
+ xvpermi.q xr9, xr7, 0x20
+ xvextrins.b xr0, xr9, 0x70
+ xvextrins.b xr0, xr9, 0xf1
+
+ xvbsrl.v xr1, xr0, 1
+ xvbsrl.v xr7, xr10, 2
+ xvpermi.q xr7, xr10, 0x20
+ xvextrins.b xr1, xr7, 0x70
+ xvextrins.b xr1, xr7, 0xf1
+
+ xvbsrl.v xr2, xr1, 1
+ xvbsrl.v xr7, xr11, 2
+ xvpermi.q xr7, xr11, 0x20
+ xvextrins.b xr2, xr7, 0x70
+ xvextrins.b xr2, xr7, 0xf1
+
+ xvbsrl.v xr3, xr2, 1
+ xvbsrl.v xr7, xr12, 2
+ xvpermi.q xr7, xr12, 0x20
+ xvextrins.b xr3, xr7, 0x70
+ xvextrins.b xr3, xr7, 0xf1
+ xvbsrl.v xr4, xr3, 1
+
+ xvdp2.h.bu.b xr10, xr0, xr8
+ xvdp2.h.bu.b xr11, xr1, xr8
+ xvdp2.h.bu.b xr12, xr2, xr8
+ xvdp2.h.bu.b xr13, xr3, xr8
+ HADDWDH xr10
+ HADDWDH xr11
+ HADDWDH xr12
+ HADDWDH xr13
+ xvpickev.w xr10, xr11, xr10
+ xvpickev.w xr11, xr13, xr12
+ xvpermi.d xr10, xr10, 0xd8
+ xvpermi.d xr11, xr11, 0xd8
+ xvpickev.h xr10, xr11, xr10
+ xvpermi.d xr10, xr10, 0xd8
+ xvsrari.h xr10, xr10, 2
+
+ xvaddi.bu xr0, xr4, 0
+
+ xvst xr10, a0, 0
+ addi.d a0, a0, 32
+ addi.w a4, a4, -4
+ bnez a4, .l_\lable\()v_4w_loop
+ b .l_\lable\()end_pre_8tap
+
+.l_\lable\()v_8w:
+ addi.d t0, a1, 0
+ addi.d t5, a4, 0
+ srli.w t7, a3, 2
+ slli.w t7, t7, 3
+ addi.d t8, a0, 0
+.l_\lable\()v_8w_loop0:
+ fld.s f0, a1, 0
+ fldx.s f1, a1, a2
+ fldx.s f2, a1, t2
+ add.d a1, a1, t3
+ fld.s f3, a1, 0
+ fldx.s f4, a1, a2
+ fldx.s f5, a1, t2
+ fldx.s f6, a1, t3
+
+ xvilvl.b xr0, xr1, xr0 // 0 1 8 9 16 17 24 25
+ xvilvl.b xr1, xr3, xr2 // 2 3 10 11 18 19 26 27
+ xvilvl.b xr2, xr5, xr4 // 4 5 12 13 20 21 28 29
+ xvilvl.b xr3, xr7, xr6 // 6 7 14 15 22 23 30 31
+ xvilvl.h xr0, xr1, xr0 // 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27
+ xvilvl.h xr1, xr3, xr2 // 4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31
+ xvilvl.w xr2, xr1, xr0
+ xvilvh.w xr0, xr1, xr0
+ xvpermi.q xr0, xr2, 0x20
+
+.l_\lable\()v_8w_loop:
+ add.d a1, a1, t4
+ fld.s f7, a1, 0 //h0
+ fldx.s f10, a1, a2 //h1
+ fldx.s f11, a1, t2 //h2
+ fldx.s f12, a1, t3 //h3
+
+ xvbsrl.v xr9, xr7, 2
+ xvpermi.q xr9, xr7, 0x20
+ xvextrins.b xr0, xr9, 0x70
+ xvextrins.b xr0, xr9, 0xf1
+
+ xvbsrl.v xr1, xr0, 1
+ xvbsrl.v xr7, xr10, 2
+ xvpermi.q xr7, xr10, 0x20
+ xvextrins.b xr1, xr7, 0x70
+ xvextrins.b xr1, xr7, 0xf1
+
+ xvbsrl.v xr2, xr1, 1
+ xvbsrl.v xr7, xr11, 2
+ xvpermi.q xr7, xr11, 0x20
+ xvextrins.b xr2, xr7, 0x70
+ xvextrins.b xr2, xr7, 0xf1
+
+ xvbsrl.v xr3, xr2, 1
+ xvbsrl.v xr7, xr12, 2
+ xvpermi.q xr7, xr12, 0x20
+ xvextrins.b xr3, xr7, 0x70
+ xvextrins.b xr3, xr7, 0xf1
+ xvbsrl.v xr4, xr3, 1
+
+ xvdp2.h.bu.b xr10, xr0, xr8
+ xvdp2.h.bu.b xr11, xr1, xr8
+ xvdp2.h.bu.b xr12, xr2, xr8
+ xvdp2.h.bu.b xr13, xr3, xr8
+ HADDWDH xr10
+ HADDWDH xr11
+ HADDWDH xr12
+ HADDWDH xr13
+ xvpickev.w xr10, xr11, xr10
+ xvpickev.w xr11, xr13, xr12
+ xvpermi.d xr10, xr10, 0xd8
+ xvpermi.d xr11, xr11, 0xd8
+ xvpickev.h xr10, xr11, xr10
+ xvpermi.d xr10, xr10, 0xd8
+ xvsrari.h xr10, xr10, 2
+
+ xvaddi.bu xr0, xr4, 0
+
+ xvstelm.d xr10, a0, 0, 0
+ add.d a0, a0, t7
+ xvstelm.d xr10, a0, 0, 1
+ add.d a0, a0, t7
+ xvstelm.d xr10, a0, 0, 2
+ add.d a0, a0, t7
+ xvstelm.d xr10, a0, 0, 3
+ add.d a0, a0, t7
+ addi.w a4, a4, -4
+ bnez a4, .l_\lable\()v_8w_loop
+
+ addi.d a1, t0, 4
+ addi.d t0, t0, 4
+ addi.d a0, t8, 8
+ addi.d t8, t8, 8
+ addi.d a4, t5, 0
+ addi.d a3, a3, -4
+ bnez a3, .l_\lable\()v_8w_loop0
+
+.l_\lable\()end_pre_8tap:
+.endm
+
+function prep_8tap_regular_8bpc_lasx
+ addi.w a7, zero, 0
+ PREP_8TAP_8BPC_LASX 0
+endfunc
+
+function prep_8tap_smooth_regular_8bpc_lasx
+ addi.w a7, zero, 1
+ PREP_8TAP_8BPC_LASX 1
+endfunc
+
+function prep_8tap_sharp_regular_8bpc_lasx
+ addi.w a7, zero, 2
+ PREP_8TAP_8BPC_LASX 2
+endfunc
+
+function prep_8tap_regular_smooth_8bpc_lasx
+ addi.w a7, zero, 4
+ PREP_8TAP_8BPC_LASX 4
+endfunc
+
+function prep_8tap_smooth_8bpc_lasx
+ addi.w a7, zero, 5
+ PREP_8TAP_8BPC_LASX 5
+endfunc
+
+function prep_8tap_sharp_smooth_8bpc_lasx
+ addi.w a7, zero, 6
+ PREP_8TAP_8BPC_LASX 6
+endfunc
+
+function prep_8tap_regular_sharp_8bpc_lasx
+ addi.w a7, zero, 8
+ PREP_8TAP_8BPC_LASX 8
+endfunc
+
+function prep_8tap_smooth_sharp_8bpc_lasx
+ addi.w a7, zero, 9
+ PREP_8TAP_8BPC_LASX 9
+endfunc
+
+function prep_8tap_sharp_8bpc_lasx
+ addi.w a7, zero, 10
+ PREP_8TAP_8BPC_LASX 10
+endfunc