diff options
Diffstat (limited to 'src/loongarch/loopfilter.S')
-rw-r--r-- | src/loongarch/loopfilter.S | 1108 |
1 file changed, 1108 insertions, 0 deletions
diff --git a/src/loongarch/loopfilter.S b/src/loongarch/loopfilter.S new file mode 100644 index 0000000..e71d5a7 --- /dev/null +++ b/src/loongarch/loopfilter.S @@ -0,0 +1,1108 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * Copyright © 2023, Loongson Technology Corporation Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/loongarch/loongson_asm.S" + +.macro FILTER_W4 DIR, TYPE +.ifc \DIR, h + addi.d t5, a0, -2 + fld.s f6, t5, 0 //p1 p0 q0 q1 + fldx.s f7, t5, a1 + alsl.d t5, a1, t5, 1 + fld.s f8, t5, 0 + fldx.s f9, t5, a1 + + vilvl.b vr6, vr7, vr6 + vilvl.b vr7, vr9, vr8 + vilvl.h vr6, vr7, vr6 //p1p1p1p1 + vbsrl.v vr7, vr6, 4 //p0p0p0p0 + vbsrl.v vr8, vr7, 4 //q0q0q0q0 + vbsrl.v vr9, vr8, 4 //q1q1q1q1 +.else + sub.d t5, a0, a1 + fld.s f7, t5, 0 + sub.d t5, t5, a1 + fld.s f6, t5, 0 + fld.s f8, a0, 0 + fldx.s f9, a0, a1 +.endif + + vabsd.bu vr10, vr6, vr7 // (p1 - p0) + vabsd.bu vr11, vr9, vr8 // (q1 - q0) + vabsd.bu vr12, vr7, vr8 // (p0 - q0) + vabsd.bu vr13, vr6, vr9 // (p1 - q1) + + vmax.bu vr14, vr10, vr11 + vsle.bu vr15, vr14, vr4 //abs(p1 - p0) <= I && abs(q1 - q0) <= I + vsadd.bu vr16, vr12, vr12 + vsrli.b vr17, vr13, 1 + vsadd.bu vr16, vr16, vr17 //abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) + vsle.bu vr16, vr16, vr3 + vand.v vr20, vr15, vr16 //fm + + vpickve2gr.wu t5, vr20, 0 + beqz t5, .END_FILTER_\DIR\()\TYPE\()_W4 + + vslt.bu vr16, vr2, vr14 //hev + + vsllwil.h.b vr30, vr20, 0 //expand fm to w + vsllwil.w.h vr30, vr30, 0 + + vsllwil.hu.bu vr17, vr6, 0 + vsllwil.hu.bu vr18, vr9, 0 + vsub.h vr17, vr17, vr18 + vssrarni.b.h vr17, vr17, 0 //f = iclip_diff(p1 - q1) + + vand.v vr17, vr17, vr16 + vsllwil.h.b vr18, vr17, 0 + + vsllwil.hu.bu vr10, vr8, 0 + vsllwil.hu.bu vr11, vr7, 0 + vsub.h vr10, vr10, vr11 + + vsadd.h vr11, vr10, vr10 + vsadd.h vr10, vr10, vr11 //3 * (q0 - p0) + vsadd.h vr10, vr10, vr18 //f = iclip_diff(3 * (q0 - p0) + f); + vssrani.b.h vr10, vr10, 0 + vsllwil.h.b vr10, vr10, 0 + + vaddi.hu vr11, vr10, 4 + vaddi.hu vr12, vr10, 3 + li.w t5, 127 + vreplgr2vr.h vr13, t5 + vmin.h vr11, vr11, vr13 + vmin.h vr12, vr12, vr13 + vsrai.h vr11, vr11, 3 //f1 + vsrai.h vr12, vr12, 3 //f2 + + vsllwil.hu.bu vr13, vr7, 0 //p0 + vsllwil.hu.bu vr14, vr8, 0 //q0 + vsadd.h vr13, vr13, vr12 + vssub.h vr14, vr14, vr11 + vssrani.bu.h vr13, vr13, 0 //dst-1 + 
vssrani.bu.h vr14, vr14, 0 //dst+0 + + vsrari.h vr15, vr11, 1 //f + vsllwil.hu.bu vr18, vr6, 0 //p1 + vsllwil.hu.bu vr19, vr9, 0 //q1 + vsadd.h vr18, vr18, vr15 + vssub.h vr19, vr19, vr15 + vssrani.bu.h vr18, vr18, 0 //dst-2 + vssrani.bu.h vr19, vr19, 0 //dst+1 + vbitsel.v vr26, vr18, vr6, vr16 + vbitsel.v vr29, vr19, vr9, vr16 + + vbitsel.v vr6, vr6, vr26, vr20 + vbitsel.v vr7, vr7, vr13, vr20 + vbitsel.v vr8, vr8, vr14, vr20 + vbitsel.v vr9, vr9, vr29, vr20 + +.ifc \DIR, h + vilvl.b vr6, vr7, vr6 + vilvl.b vr9, vr9, vr8 + vilvl.h vr6, vr9, vr6 + + addi.d t5, a0, -2 + vstelm.w vr6, t5, 0, 0 + add.d t5, t5, a1 + vstelm.w vr6, t5, 0, 1 + add.d t5, t5, a1 + vstelm.w vr6, t5, 0, 2 + add.d t5, t5, a1 + vstelm.w vr6, t5, 0, 3 +.else + fst.s f8, a0, 0 + fstx.s f9, a0, a1 + sub.d t5, a0, a1 + fst.s f7, t5, 0 + sub.d t5, t5, a1 + fst.s f6, t5, 0 +.endif +.END_FILTER_\DIR\()\TYPE\()_W4: +.endm + +.macro FILTER_W6 DIR, TYPE +.ifc \DIR, h + addi.d t5, a0, -3 + fld.d f6, t5, 0 //p2 p1 p0 q0 q1 q2 + fldx.d f7, t5, a1 + alsl.d t5, a1, t5, 1 + fld.d f8, t5, 0 + fldx.d f9, t5, a1 + + vilvl.b vr6, vr7, vr6 + vilvl.b vr7, vr9, vr8 + vilvh.h vr10, vr7, vr6 + vilvl.h vr6, vr7, vr6 + + vbsrl.v vr7, vr6, 4 //p1 + vbsrl.v vr8, vr7, 4 //p0 + vbsrl.v vr9, vr8, 4 //q0 + vbsrl.v vr11, vr10, 4 //q2 +.else + alsl.d t5, a1, a1, 1 + sub.d t5, a0, t5 + fld.d f6, t5, 0 + fldx.d f7, t5, a1 + alsl.d t5, a1, t5, 1 + fld.d f8, t5, 0 + fldx.d f9, t5, a1 + alsl.d t5, a1, t5, 1 + fld.d f10, t5, 0 + fldx.d f11, t5, a1 +.endif + + vabsd.bu vr12, vr7, vr8 //abs(p1-p0) + vabsd.bu vr13, vr10, vr9 //abs(q1-q0) + vmax.bu vr14, vr12, vr13 + vslt.bu vr2, vr2, vr14 //hev + vabsd.bu vr12, vr6, vr7 //abs(p2-p1) + vmax.bu vr12, vr12, vr14 + vabsd.bu vr13, vr11, vr10 //abs(q2-q1) + vmax.bu vr12, vr12, vr13 + vsle.bu vr0, vr12, vr4 // <=I + + vabsd.bu vr13, vr8, vr9 //abs(p0-q0) + vsadd.bu vr13, vr13, vr13 + vabsd.bu vr15, vr7, vr10 + vsrli.b vr15, vr15, 1 + vsadd.bu vr13, vr13, vr15 + vsle.bu vr13, vr13, vr3 //abs(p0 
- q0) * 2 + (abs(p1 - q1) >> 1) <= E + vand.v vr0, vr0, vr13 //fm + + vpickve2gr.wu t5, vr0, 0 + beqz t5, .END_FILTER_\DIR\()\TYPE\()_W6 + + vabsd.bu vr12, vr6, vr8 //abs(p2-p0) + vabsd.bu vr13, vr11, vr9 //abs(q2-q0) + vmax.bu vr12, vr12, vr14 + vmax.bu vr12, vr12, vr13 + vxor.v vr13, vr13, vr13 + vaddi.bu vr13, vr13, 1 + vsle.bu vr1, vr12, vr13 //flat8in + + //6789 10 11 --expand to h + vsllwil.hu.bu vr12, vr6, 0 + vsllwil.hu.bu vr13, vr7, 0 + vsllwil.hu.bu vr14, vr8, 0 + vsllwil.hu.bu vr15, vr9, 0 + vsllwil.hu.bu vr16, vr10, 0 + vsllwil.hu.bu vr17, vr11, 0 + + //dst-2 + vsadd.hu vr18, vr12, vr12 + vsadd.hu vr18, vr18, vr12 + vsadd.hu vr18, vr18, vr13 + vsadd.hu vr18, vr18, vr13 + vsadd.hu vr18, vr18, vr14 + vsadd.hu vr18, vr18, vr14 + vsadd.hu vr18, vr18, vr15 + + //dst-1 + vsadd.hu vr19, vr18, vr15 + vsadd.hu vr19, vr19, vr16 + vssub.hu vr19, vr19, vr12 + vssub.hu vr19, vr19, vr12 + + //dst+0 + vsadd.hu vr20, vr19, vr17 + vsadd.hu vr20, vr20, vr16 + vssub.hu vr20, vr20, vr12 + vssub.hu vr20, vr20, vr13 + + //dst+1 + vsadd.hu vr21, vr20, vr17 + vsadd.hu vr21, vr21, vr17 + vssub.hu vr21, vr21, vr13 + vssub.hu vr21, vr21, vr14 + + vsrari.h vr18, vr18, 3 + vsrari.h vr19, vr19, 3 + vsrari.h vr20, vr20, 3 + vsrari.h vr21, vr21, 3 + + vsub.h vr22, vr13, vr16 + vssrani.b.h vr22, vr22, 0 + vand.v vr22, vr22, vr2 + vsllwil.h.b vr22, vr22, 0 //f = iclip_diff(p1 - q1); + + vsub.h vr23, vr15, vr14 + vsadd.h vr24, vr23, vr23 + vsadd.h vr23, vr23, vr24 + vsadd.h vr23, vr23, vr22 + vssrani.b.h vr23, vr23, 0 + vsllwil.h.b vr23, vr23, 0 //f = iclip_diff(3 * (q0 - p0) + f); + + vaddi.hu vr24, vr23, 4 + vaddi.hu vr25, vr23, 3 + li.w t5, 127 + vreplgr2vr.h vr3, t5 + vmin.h vr24, vr24, vr3 + vmin.h vr25, vr25, vr3 + vsrai.h vr24, vr24, 3 //f1 + vsrai.h vr25, vr25, 3 //f2 + + vsadd.h vr26, vr14, vr25 //dst-1 + vssub.h vr27, vr15, vr24 //dst+0 + + vsrari.h vr24, vr24, 1 + vsadd.h vr28, vr13, vr24 + vssub.h vr29, vr16, vr24 + vsllwil.h.b vr2, vr2, 0 + vbitsel.v vr28, vr28, vr13, vr2 
//dst-2 + vbitsel.v vr29, vr29, vr16, vr2 //dst+1 + + //flat8in + vsllwil.h.b vr1, vr1, 0 + vbitsel.v vr18, vr28, vr18, vr1 + vbitsel.v vr19, vr26, vr19, vr1 + vbitsel.v vr20, vr27, vr20, vr1 + vbitsel.v vr21, vr29, vr21, vr1 + + vssrani.bu.h vr18, vr18, 0 + vssrani.bu.h vr19, vr19, 0 + vssrani.bu.h vr20, vr20, 0 + vssrani.bu.h vr21, vr21, 0 + + vbitsel.v vr7, vr7, vr18, vr0 //p1 + vbitsel.v vr8, vr8, vr19, vr0 //p0 + vbitsel.v vr9, vr9, vr20, vr0 //q0 + vbitsel.v vr10, vr10, vr21, vr0 //q1 + +.ifc \DIR, h + vilvl.b vr7, vr8, vr7 + vilvl.b vr9, vr10, vr9 + vilvl.h vr7, vr9, vr7 + + addi.d t5, a0, -2 + vstelm.w vr7, t5, 0, 0 + add.d t5, t5, a1 + vstelm.w vr7, t5, 0, 1 + add.d t5, t5, a1 + vstelm.w vr7, t5, 0, 2 + add.d t5, t5, a1 + vstelm.w vr7, t5, 0, 3 +.else + fst.s f9, a0, 0 + fstx.s f10, a0, a1 + sub.d t5, a0, a1 + fst.s f8, t5, 0 + sub.d t5, t5, a1 + fst.s f7, t5, 0 +.endif +.END_FILTER_\DIR\()\TYPE\()_W6: +.endm + +.macro FILTER_W8 DIR, TYPE +.ifc \DIR, h + addi.d t5, a0, -4 + fld.d f6, t5, 0 //p3 p2 p1 p0 q0 q1 q2 q3 + fldx.d f7, t5, a1 + alsl.d t5, a1, t5, 1 + fld.d f8, t5, 0 + fldx.d f9, t5, a1 + + vilvl.b vr6, vr7, vr6 + vilvl.b vr7, vr9, vr8 + vilvh.h vr10, vr7, vr6 //q0 + vilvl.h vr6, vr7, vr6 //p3 + vbsrl.v vr7, vr6, 4 //p2 + vbsrl.v vr8, vr6, 8 //p1 + vbsrl.v vr9, vr6, 12 //p0 + vbsrl.v vr11, vr10, 4 //q1 + vbsrl.v vr12, vr10, 8 //q2 + vbsrl.v vr13, vr10, 12 //q3 +.else + fld.s f10, a0, 0 + fldx.s f11, a0, a1 + add.d t5, a0, a1 + fldx.s f12, t5, a1 + add.d t5, t5, a1 + fldx.s f13, t5, a1 + sub.d t5, a0, a1 + fld.s f9, t5, 0 + sub.d t5, t5, a1 + fld.s f8, t5, 0 + sub.d t5, t5, a1 + fld.s f7, t5, 0 + sub.d t5, t5, a1 + fld.s f6, t5, 0 +.endif + + vabsd.bu vr14, vr8, vr9 //p1-p0 + vabsd.bu vr15, vr11, vr10 //q1-q0 + vabsd.bu vr16, vr9, vr10 //p0-q0 + vabsd.bu vr17, vr8, vr11 //p1-q1 + vabsd.bu vr18, vr7, vr8 //p2-p1 + vabsd.bu vr19, vr12, vr11 //q2-q1 + vabsd.bu vr20, vr6, vr7 //p3-p2 + vabsd.bu vr21, vr13, vr12 //q3-q2 + + vmax.bu vr22, vr14, vr15 + 
vsle.bu vr23, vr22, vr4 //abs(p1 - p0) <= I && abs(q1 - q0) <= I + vsadd.bu vr16, vr16, vr16 + vsrli.b vr17, vr17, 1 + vsadd.bu vr16, vr16, vr17 + vsle.bu vr16, vr16, vr3 //abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E + vand.v vr16, vr16, vr23 //fm + + vpickve2gr.wu t5, vr16, 0 + beqz t5, .END_FILTER_\DIR\()\TYPE\()_W8 + + vmax.bu vr23, vr18, vr19 + vmax.bu vr23, vr23, vr20 + vmax.bu vr23, vr23, vr21 + vsle.bu vr23, vr23, vr4 + vand.v vr16, vr16, vr23 //fm + + vabsd.bu vr17, vr7, vr9 //abs(p2-p0) + vabsd.bu vr18, vr12, vr10 //abs(q2-q0) + vmax.bu vr17, vr17, vr14 + vmax.bu vr17, vr17, vr15 + vmax.bu vr17, vr17, vr18 + vabsd.bu vr18, vr6, vr9 //abs(p3 - p0) + vabsd.bu vr19, vr13, vr10 //abs(q3 - q0) + vmax.bu vr17, vr17, vr18 + vmax.bu vr17, vr17, vr19 + + vxor.v vr5, vr5, vr5 + vaddi.bu vr5, vr5, 1 //F + vsle.bu vr17, vr17, vr5 //flat8in + + vsllwil.hu.bu vr0, vr6, 0 //p3 + vsllwil.hu.bu vr1, vr7, 0 //p2 + vsllwil.hu.bu vr27, vr8, 0 //p1 + vsllwil.hu.bu vr3, vr9, 0 //p0 + vsllwil.hu.bu vr4, vr10, 0 //q0 + vsllwil.hu.bu vr5, vr11, 0 //q1 + vsllwil.hu.bu vr14, vr12, 0 //q2 + vsllwil.hu.bu vr15, vr13, 0 //q3 + + vsadd.hu vr18, vr0, vr0 //p3+p3 + vsadd.hu vr19, vr15, vr15 //q3+q3 + vsadd.hu vr20, vr0, vr1 //p3+p2 + vsadd.hu vr21, vr1, vr27 //p2+p1 + vsadd.hu vr28, vr27, vr3 //p1+p0 + vsadd.hu vr23, vr3, vr4 //p0+q0 + vsadd.hu vr24, vr4, vr5 //q0+q1 + vsadd.hu vr25, vr5, vr14 //q1+q2 + vsadd.hu vr26, vr14, vr15 //q2+q3 + + // dst-3 + vsadd.hu vr29, vr18, vr20 + vsadd.hu vr29, vr29, vr21 + vsadd.hu vr29, vr29, vr23 + + // dst-2 + vsadd.hu vr30, vr18, vr21 + vsadd.hu vr30, vr30, vr28 + vsadd.hu vr30, vr30, vr24 + + // dst-1 + vsadd.hu vr31, vr20, vr28 + vsadd.hu vr31, vr31, vr23 + vsadd.hu vr31, vr31, vr25 + + // dst+0 + vsadd.hu vr18, vr21, vr23 + vsadd.hu vr18, vr18, vr24 + vsadd.hu vr18, vr18, vr26 + + //dst+1 + vsadd.hu vr20, vr28, vr24 + vsadd.hu vr20, vr20, vr25 + vsadd.hu vr20, vr20, vr19 + + //dst+2 + vsadd.hu vr21, vr23, vr25 + vsadd.hu vr21, vr21, vr26 + vsadd.hu 
vr21, vr21, vr19 + + vssrarni.bu.h vr23, vr29, 3 + vssrarni.bu.h vr24, vr30, 3 + vssrarni.bu.h vr25, vr31, 3 + vssrarni.bu.h vr19, vr18, 3 + vssrarni.bu.h vr20, vr20, 3 + vssrarni.bu.h vr21, vr21, 3 + + // !flat8in + vslt.bu vr2, vr2, vr22 //hev + + vsub.h vr30, vr27, vr5 //p1-q1 + vssrani.b.h vr30, vr30, 0 + vand.v vr30, vr30, vr2 + vsllwil.h.b vr30, vr30, 0 + + vsub.h vr31, vr4, vr3 + vsadd.h vr0, vr31, vr31 + vsadd.h vr31, vr31, vr0 + vsadd.h vr31, vr31, vr30 + vssrani.b.h vr31, vr31, 0 + vsllwil.h.b vr31, vr31, 0 //f = iclip_diff(3 * (q0 - p0) + f); + + vaddi.hu vr14, vr31, 4 + vaddi.hu vr15, vr31, 3 + li.w t5, 127 + vreplgr2vr.h vr18, t5 + vmin.h vr14, vr14, vr18 + vmin.h vr15, vr15, vr18 + vsrai.h vr14, vr14, 3 //f1 + vsrai.h vr15, vr15, 3 //f2 + + vsadd.h vr3, vr3, vr15 + vssub.h vr4, vr4, vr14 + vssrani.bu.h vr3, vr3, 0 //dst-1 + vssrani.bu.h vr4, vr4, 0 //dst+0 + + vsrari.h vr14, vr14, 1 + vsadd.h vr18, vr27, vr14 + vssub.h vr26, vr5, vr14 + vssrani.bu.h vr18, vr18, 0 //dst-2 + vssrani.bu.h vr26, vr26, 0 //dst+1 + + vbitsel.v vr27, vr18, vr8, vr2 //dst-2 + vbitsel.v vr28, vr26, vr11, vr2 //dst+1 + + vbitsel.v vr23, vr7, vr23, vr17 //dst-3 (p2) + vbitsel.v vr24, vr27, vr24, vr17 //dst-2 + vbitsel.v vr25, vr3, vr25, vr17 //dst-1 + vbitsel.v vr19, vr4, vr19, vr17 //dst+0 + vbitsel.v vr20, vr28, vr20, vr17 //dst+1 + vbitsel.v vr21, vr12, vr21, vr17 //dst+2 + + vbitsel.v vr7, vr7, vr23, vr16 //-3 + vbitsel.v vr8, vr8, vr24, vr16 //-2 + vbitsel.v vr9, vr9, vr25, vr16 //-1 + vbitsel.v vr10, vr10, vr19, vr16 //+0 + vbitsel.v vr11, vr11, vr20, vr16 //+1 + vbitsel.v vr12, vr12, vr21, vr16 //+2 + +.ifc \DIR, h + vilvl.b vr6, vr7, vr6 + vilvl.b vr8, vr9, vr8 + vilvl.b vr10, vr11, vr10 + vilvl.b vr12, vr13, vr12 + vilvl.h vr6, vr8, vr6 //p3p2p1p0 -- -- -- + vilvl.h vr10, vr12, vr10 //q0q1q2q3 -- -- -- + vilvl.w vr0, vr10, vr6 //p3p2p1p0q0q1q2q3 -- + vilvh.w vr1, vr10, vr6 //-- + + addi.d t5, a0, -4 + vstelm.d vr0, t5, 0, 0 + add.d t5, t5, a1 + vstelm.d vr0, t5, 0, 1 + 
add.d t5, t5, a1 + vstelm.d vr1, t5, 0, 0 + add.d t5, t5, a1 + vstelm.d vr1, t5, 0, 1 +.else + alsl.d t5, a1, a1, 1 + sub.d t5, a0, t5 + fst.s f7, t5, 0 + fstx.s f8, t5, a1 + add.d t5, t5, a1 + fstx.s f9, t5, a1 + + fst.s f10, a0, 0 + add.d t5, a0, a1 + fst.s f11, t5, 0 + fstx.s f12, t5, a1 +.endif +.END_FILTER_\DIR\()\TYPE\()_W8: +.endm + +.macro FILTER_W16 DIR, TYPE +.ifc \DIR, h + addi.d t5, a0, -7 + vld vr6, t5, 0 //p6p5p4p3p2p1p0q0 q1q2q3q4q5q6 + vldx vr7, t5, a1 + add.d t5, t5, a1 + vldx vr8, t5, a1 + add.d t5, t5, a1 + vldx vr9, t5, a1 + + vilvl.b vr10, vr7, vr6 + vilvh.b vr11, vr7, vr6 + vilvl.b vr12, vr9, vr8 + vilvh.b vr13, vr9, vr8 + vilvl.h vr6, vr12, vr10 + vilvh.h vr10, vr12, vr10 //p2--- + vilvl.h vr15, vr13, vr11 //q1--- + vilvh.h vr19, vr13, vr11 + + vbsrl.v vr7, vr6, 4 //p5--- + vbsrl.v vr8, vr6, 8 //p4--- + vbsrl.v vr9, vr6, 12 //p3--- + vbsrl.v vr12, vr10, 4 //p1--- + vbsrl.v vr13, vr10, 8 //p0--- + vbsrl.v vr14, vr10, 12 //q0--- + vbsrl.v vr16, vr15, 4 //q2--- + vbsrl.v vr17, vr15, 8 //q3--- + vbsrl.v vr18, vr15, 12 //q4--- + vbsrl.v vr20, vr19, 4 //q6--- +.else + slli.d t5, a1, 3 + sub.d t5, a0, t5 + fldx.s f6, t5, a1 //p6 + alsl.d t5, a1, t5, 1 + fld.s f7, t5, 0 //p5 + fldx.s f8, t5, a1 //p4 + alsl.d t5, a1, t5, 1 + fld.s f9, t5, 0 //p3 + fldx.s f10, t5, a1 //p2 + alsl.d t5, a1, t5, 1 + fld.s f12, t5, 0 //p1 + fldx.s f13, t5, a1 //p0 + alsl.d t5, a1, t5, 1 + fld.s f14, t5, 0 //q0 + fldx.s f15, t5, a1 //q1 + alsl.d t5, a1, t5, 1 + fld.s f16, t5, 0 //q2 + fldx.s f17, t5, a1 //q3 + alsl.d t5, a1, t5, 1 + fld.s f18, t5, 0 //q4 + fldx.s f19, t5, a1 //q5 + add.d t5, t5, a1 + fldx.s f20, t5, a1 //q6 + + //temp store + addi.d sp, sp, -96 + fst.d f7, sp, 0 + fst.d f8, sp, 8 + fst.d f9, sp, 16 + fst.d f10, sp, 24 + fst.d f12, sp, 32 + fst.d f13, sp, 40 + fst.d f14, sp, 48 + fst.d f15, sp, 56 + fst.d f16, sp, 64 + fst.d f17, sp, 72 + fst.d f18, sp, 80 + fst.d f19, sp, 88 +.endif + + vabsd.bu vr21, vr12, vr13 //abs(p1-p0) + vabsd.bu vr22, vr15, vr14 
//abs(q1-q0) + vmax.bu vr0, vr21, vr22 + vslt.bu vr2, vr2, vr0 //hev + vabsd.bu vr1, vr10, vr12 //abs(p2-p1) + vmax.bu vr0, vr0, vr1 + vabsd.bu vr1, vr16, vr15 //abs(q2-q1) + vmax.bu vr0, vr0, vr1 + vabsd.bu vr1, vr9, vr10 //abs(p3-p2) + vmax.bu vr0, vr0, vr1 + vabsd.bu vr1, vr17, vr16 //abs(q3-q2) + vmax.bu vr0, vr0, vr1 + vsle.bu vr0, vr0, vr4 //vr4 released I + vabsd.bu vr1, vr13, vr14 //abs(p0-q0) + vsadd.bu vr1, vr1, vr1 + vabsd.bu vr4, vr12, vr15 //abs(p1-q1) + vsrli.b vr4, vr4, 1 + vsadd.bu vr1, vr1, vr4 //abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) + vsle.bu vr1, vr1, vr3 //vr3 released E + vand.v vr0, vr0, vr1 //fm + + vpickve2gr.wu t5, vr0, 0 + beqz t5, .END_FILTER_\DIR\()\TYPE\()_W16 + + vabsd.bu vr1, vr6, vr13 //abs(p6-p0) + vabsd.bu vr4, vr7, vr13 //abs(p5-p0) + vmax.bu vr1, vr1, vr4 + vabsd.bu vr4, vr8, vr13 //abs(p4-p0) + vmax.bu vr1, vr1, vr4 + vabsd.bu vr4, vr18, vr14 //abs(q4-q0) + vmax.bu vr1, vr1, vr4 + vabsd.bu vr4, vr19, vr14 //abs(q5-q0) + vmax.bu vr1, vr1, vr4 + vabsd.bu vr4, vr20, vr14 + vmax.bu vr1, vr1, vr4 + vxor.v vr5, vr5, vr5 + vaddi.bu vr5, vr5, 1 //F + vsle.bu vr1, vr1, vr5 //flat8out + + vabsd.bu vr3, vr10, vr13 //abs(p2-p0) + vmax.bu vr3, vr3, vr21 + vmax.bu vr3, vr3, vr22 + vabsd.bu vr4, vr16, vr14 //abs(q2-q0) + vmax.bu vr3, vr3, vr4 + vabsd.bu vr4, vr9, vr13 //abs(p3-p0) + vmax.bu vr3, vr3, vr4 + vabsd.bu vr4, vr17, vr14 //abs(q3-q0) + vmax.bu vr3, vr3, vr4 + vsle.bu vr3, vr3, vr5 //flatin released vr5 + + vsllwil.hu.bu vr6, vr6, 0 //p6 + vsllwil.hu.bu vr7, vr7, 0 //p5 + vsllwil.hu.bu vr8, vr8, 0 //p4 + vsllwil.hu.bu vr9, vr9, 0 //p3 + vsllwil.hu.bu vr10, vr10, 0 //p2 + vsllwil.hu.bu vr12, vr12, 0 //p1 + vsllwil.hu.bu vr13, vr13, 0 //p0 + vsllwil.hu.bu vr14, vr14, 0 //q0 + vsllwil.hu.bu vr15, vr15, 0 //q1 + vsllwil.hu.bu vr16, vr16, 0 //q2 + vsllwil.hu.bu vr17, vr17, 0 //q3 + vsllwil.hu.bu vr18, vr18, 0 //q4 + vsllwil.hu.bu vr19, vr19, 0 //q5 + vsllwil.hu.bu vr20, vr20, 0 //q6 + + //dst-6 + vslli.w vr21, vr6, 3 + vssub.hu vr21, 
vr21, vr6 + vsadd.hu vr21, vr21, vr7 + vsadd.hu vr21, vr21, vr7 + vsadd.hu vr21, vr21, vr8 + vsadd.hu vr21, vr21, vr8 + vsadd.hu vr21, vr21, vr9 + vsadd.hu vr21, vr21, vr10 + vsadd.hu vr21, vr21, vr12 + vsadd.hu vr21, vr21, vr13 + vsadd.hu vr21, vr21, vr14 + + //dst-5 + vsadd.hu vr22, vr21, vr15 + vsadd.hu vr22, vr22, vr9 + vssub.hu vr22, vr22, vr6 + vssub.hu vr22, vr22, vr6 + + //dst-4 + vsadd.hu vr23, vr22, vr16 + vsadd.hu vr23, vr23, vr10 + vssub.hu vr23, vr23, vr7 + vssub.hu vr23, vr23, vr6 + + //dst-3 + vsadd.hu vr24, vr23, vr12 + vsadd.hu vr24, vr24, vr17 + vssub.hu vr24, vr24, vr6 + vssub.hu vr24, vr24, vr8 + + //dst-2 + vsadd.hu vr25, vr24, vr18 + vsadd.hu vr25, vr25, vr13 + vssub.hu vr25, vr25, vr6 + vssub.hu vr25, vr25, vr9 + + //dst-1 + vsadd.hu vr26, vr25, vr19 + vsadd.hu vr26, vr26, vr14 + vssub.hu vr26, vr26, vr6 + vssub.hu vr26, vr26, vr10 + + //dst+0 + vsadd.hu vr27, vr26, vr20 + vsadd.hu vr27, vr27, vr15 + vssub.hu vr27, vr27, vr6 + vssub.hu vr27, vr27, vr12 + + //dst+1 + vsadd.hu vr28, vr27, vr20 + vsadd.hu vr28, vr28, vr16 + vssub.hu vr28, vr28, vr7 + vssub.hu vr28, vr28, vr13 + + //dst+2 + vsadd.hu vr29, vr28, vr20 + vsadd.hu vr29, vr29, vr17 + vssub.hu vr29, vr29, vr8 + vssub.hu vr29, vr29, vr14 + + //dst+3 + vsadd.hu vr30, vr29, vr20 + vsadd.hu vr30, vr30, vr18 + vssub.hu vr30, vr30, vr9 + vssub.hu vr30, vr30, vr15 + + //dst+4 + vsadd.hu vr31, vr30, vr20 + vsadd.hu vr31, vr31, vr19 + vssub.hu vr31, vr31, vr10 + vssub.hu vr31, vr31, vr16 + + //dst+5 + vsadd.hu vr11, vr31, vr20 + vsadd.hu vr11, vr11, vr20 + vssub.hu vr11, vr11, vr12 + vssub.hu vr11, vr11, vr17 + + vsrari.h vr21, vr21, 4 + vsrari.h vr22, vr22, 4 + vsrari.h vr23, vr23, 4 + vsrari.h vr24, vr24, 4 + vsrari.h vr25, vr25, 4 + vsrari.h vr26, vr26, 4 + vsrari.h vr27, vr27, 4 + vsrari.h vr28, vr28, 4 + vsrari.h vr29, vr29, 4 + vsrari.h vr30, vr30, 4 + vsrari.h vr31, vr31, 4 + vsrari.h vr11, vr11, 4 + + vand.v vr1, vr1, vr3 + vsllwil.h.b vr1, vr1, 0 //expand to h + //(flat8out & flat8in) 
+ vbitsel.v vr21, vr7, vr21, vr1 //dst-6 + vbitsel.v vr22, vr8, vr22, vr1 //dst-5 + vbitsel.v vr23, vr9, vr23, vr1 //dst-4 + vbitsel.v vr30, vr17, vr30, vr1 //dst+3 + vbitsel.v vr31, vr18, vr31, vr1 //dst+4 + vbitsel.v vr11, vr19, vr11, vr1 //dst+5 + + //flat8in + //dst-3 + vslli.h vr4, vr9, 1 + vsadd.hu vr4, vr4, vr9 //p3*3 + vsadd.hu vr4, vr4, vr10 + vsadd.hu vr4, vr4, vr10 + vsadd.hu vr4, vr4, vr12 + vsadd.hu vr4, vr4, vr13 + vsadd.hu vr4, vr4, vr14 + + //dst-2 + vsadd.hu vr5, vr4, vr12 + vsadd.hu vr5, vr5, vr15 + vssub.hu vr5, vr5, vr9 + vssub.hu vr5, vr5, vr10 + + //dst-1 + vsadd.hu vr18, vr5, vr13 + vsadd.hu vr18, vr18, vr16 + vssub.hu vr18, vr18, vr9 + vssub.hu vr18, vr18, vr12 + + //dst+0 + vsadd.hu vr7, vr18, vr14 + vsadd.hu vr7, vr7, vr17 + vssub.hu vr7, vr7, vr9 + vssub.hu vr7, vr7, vr13 + + //dst+1 + vsadd.hu vr8, vr7, vr15 + vsadd.hu vr8, vr8, vr17 + vssub.hu vr8, vr8, vr10 + vssub.hu vr8, vr8, vr14 + + //dst+2 + vsadd.hu vr9, vr8, vr16 + vsadd.hu vr9, vr9, vr17 + vssub.hu vr9, vr9, vr12 + vssub.hu vr9, vr9, vr15 + + vsrari.h vr4, vr4, 3 + vsrari.h vr5, vr5, 3 + vsrari.h vr18, vr18, 3 + vsrari.h vr7, vr7, 3 + vsrari.h vr8, vr8, 3 + vsrari.h vr9, vr9, 3 + + //flat8out & flat8in + vbitsel.v vr24, vr4, vr24, vr1 //dst-3 + vbitsel.v vr25, vr5, vr25, vr1 //dst-2 + vbitsel.v vr26, vr18, vr26, vr1 //dst-1 + vbitsel.v vr27, vr7, vr27, vr1 //dst+0 + vbitsel.v vr28, vr8, vr28, vr1 //dst+1 + vbitsel.v vr29, vr9, vr29, vr1 //dst+2 + + //!flat8in + vsub.h vr17, vr12, vr15 //p1-q1 + vsllwil.h.b vr2, vr2, 0 + vand.v vr17, vr17, vr2 //&hev + vssrani.b.h vr17, vr17, 0 + vsllwil.h.b vr17, vr17, 0 + + vsub.h vr7, vr14, vr13 + vsadd.h vr8, vr7, vr7 + vsadd.h vr7, vr7, vr8 + vsadd.h vr7, vr7, vr17 + vssrani.b.h vr7, vr7, 0 + vsllwil.h.b vr17, vr7, 0 //f = iclip_diff(3 * (q0 - p0) + f); + + vaddi.hu vr7, vr17, 4 + vaddi.hu vr8, vr17, 3 + li.w t5, 127 + vreplgr2vr.h vr9, t5 + vmin.h vr7, vr7, vr9 + vmin.h vr8, vr8, vr9 + vsrai.h vr7, vr7, 3 //f1 + vsrai.h vr8, vr8, 3 //f2 + 
+ vsadd.h vr4, vr13, vr8 //dst-1 + vssub.h vr5, vr14, vr7 //dst+0 + + vsrari.h vr7, vr7, 1 + vsadd.h vr17, vr12, vr7 + vssub.h vr7, vr15, vr7 + vbitsel.v vr17, vr17, vr12, vr2 //dst-2 + vbitsel.v vr7, vr7, vr15, vr2 //dst+1 + + //flat8in or !flat8in + vsllwil.h.b vr3, vr3, 0 + vbitsel.v vr24, vr10, vr24, vr3 //dst-3 + vbitsel.v vr25, vr17, vr25, vr3 //dst-2 + vbitsel.v vr26, vr4, vr26, vr3 //dst-1 + vbitsel.v vr27, vr5, vr27, vr3 //dst+0 + vbitsel.v vr28, vr7, vr28, vr3 //dst+1 + vbitsel.v vr29, vr16, vr29, vr3 //dst+2 + +.ifc \DIR, h + //dst-6,dst-2,dst-5,dst-1 + vssrani.bu.h vr25, vr21, 0 + vssrani.bu.h vr26, vr22, 0 + vpermi.w vr25, vr25, 0xd8 + vpermi.w vr26, vr26, 0xd8 + vilvl.b vr6, vr26, vr25 //65656565 21212121 + + //dst-4,dst+0,dst-3,dst+1 + vssrani.bu.h vr27, vr23, 0 + vssrani.bu.h vr28, vr24, 0 + vpermi.w vr27, vr27, 0xd8 + vpermi.w vr28, vr28, 0xd8 + vilvl.b vr26, vr28, vr27 //43434343 01010101 + + vilvl.h vr21, vr26, vr6 //6543 -- -- -- + vilvh.h vr22, vr26, vr6 //2101 -- -- -- + vilvl.w vr20, vr22, vr21 //65432101 -- + vilvh.w vr22, vr22, vr21 //65432101 -- + vreplvei.d vr21, vr20, 1 + vreplvei.d vr23, vr22, 1 + + //dst+2,dst+4,dst+3,dst+5 + vssrani.bu.h vr31, vr29, 0 + vssrani.bu.h vr11, vr30, 0 + vpermi.w vr31, vr31, 0xd8 + vpermi.w vr11, vr11, 0xd8 + vilvl.b vr11, vr11, vr31 //23232323 45454545 + vshuf4i.w vr11, vr11, 0xd8 + vshuf4i.h vr11, vr11, 0xd8 //2345 -- -- -- + + vextrins.w vr20, vr11, 0x20 + vextrins.w vr21, vr11, 0x21 + vextrins.w vr22, vr11, 0x22 + vextrins.w vr23, vr11, 0x23 + + addi.d t5, a0, -6 + vld vr6, t5, 0 //p6p5p4p3p2p1p0q0 q1q2q3q4q5q6 + vldx vr7, t5, a1 + add.d t5, t5, a1 + vldx vr8, t5, a1 + add.d t5, t5, a1 + vldx vr9, t5, a1 + + //expand fm to 128 + vreplvei.b vr10, vr0, 0 + vreplvei.b vr11, vr0, 1 + vreplvei.b vr12, vr0, 2 + vreplvei.b vr13, vr0, 3 + + vbitsel.v vr20, vr6, vr20, vr10 + vbitsel.v vr21, vr7, vr21, vr11 + vbitsel.v vr22, vr8, vr22, vr12 + vbitsel.v vr23, vr9, vr23, vr13 + + addi.d t5, a0, -6 + vstelm.d vr20, 
t5, 0, 0 + vstelm.w vr20, t5, 8, 2 + add.d t5, t5, a1 + vstelm.d vr21, t5, 0, 0 + vstelm.w vr21, t5, 8, 2 + add.d t5, t5, a1 + vstelm.d vr22, t5, 0, 0 + vstelm.w vr22, t5, 8, 2 + add.d t5, t5, a1 + vstelm.d vr23, t5, 0, 0 + vstelm.w vr23, t5, 8, 2 +.else + //reload + fld.d f7, sp, 0 + fld.d f8, sp, 8 + fld.d f9, sp, 16 + fld.d f10, sp, 24 + fld.d f12, sp, 32 + fld.d f13, sp, 40 + fld.d f14, sp, 48 + fld.d f15, sp, 56 + fld.d f16, sp, 64 + fld.d f17, sp, 72 + fld.d f18, sp, 80 + fld.d f19, sp, 88 + + vssrarni.bu.h vr21, vr21, 0 + vssrarni.bu.h vr22, vr22, 0 + vssrarni.bu.h vr23, vr23, 0 + vssrarni.bu.h vr24, vr24, 0 + vssrarni.bu.h vr25, vr25, 0 + vssrarni.bu.h vr26, vr26, 0 + vssrarni.bu.h vr27, vr27, 0 + vssrarni.bu.h vr28, vr28, 0 + vssrarni.bu.h vr29, vr29, 0 + vssrarni.bu.h vr30, vr30, 0 + vssrarni.bu.h vr31, vr31, 0 + vssrarni.bu.h vr11, vr11, 0 + + vbitsel.v vr7, vr7, vr21, vr0 //p5 + vbitsel.v vr8, vr8, vr22, vr0 //p4 + vbitsel.v vr9, vr9, vr23, vr0 //p3 + vbitsel.v vr10, vr10, vr24, vr0 //p2 + vbitsel.v vr12, vr12, vr25, vr0 //p1 + vbitsel.v vr13, vr13, vr26, vr0 //p0 + vbitsel.v vr14, vr14, vr27, vr0 //q0 + vbitsel.v vr15, vr15, vr28, vr0 //q1 + vbitsel.v vr16, vr16, vr29, vr0 //q2 + vbitsel.v vr17, vr17, vr30, vr0 //q3 + vbitsel.v vr18, vr18, vr31, vr0 //q4 + vbitsel.v vr19, vr19, vr11, vr0 //q5 + + fst.s f14, a0, 0 + fstx.s f15, a0, a1 + alsl.d t5, a1, a0, 1 + fst.s f16, t5, 0 + fstx.s f17, t5, a1 + alsl.d t5, a1, t5, 1 + fst.s f18, t5, 0 + fstx.s f19, t5, a1 + + slli.w t5, a1, 2 + alsl.d t5, a1, t5, 1 + sub.d t5, a0, t5 + fst.s f7, t5, 0 + fstx.s f8, t5, a1 + alsl.d t5, a1, t5, 1 + fst.s f9, t5, 0 + fstx.s f10, t5, a1 + alsl.d t5, a1, t5, 1 + fst.s f12, t5, 0 + fstx.s f13, t5, a1 +.endif +.END_FILTER_\DIR\()\TYPE\()_W16: +.ifc \DIR, v + addi.d sp, sp, 96 +.endif +.endm + +.macro PUSH_REG + addi.d sp, sp, -64 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + fst.d 
f31, sp, 56 +.endm +.macro POP_REG + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + fld.d f30, sp, 48 + fld.d f31, sp, 56 + addi.d sp, sp, 64 +.endm + +.macro LPF_FUNC DIR, TYPE +function lpf_\DIR\()_sb_\TYPE\()_8bpc_lsx + PUSH_REG + vld vr0, a2, 0 //vmask + vpickve2gr.wu t0, vr0, 0 + vpickve2gr.wu t1, vr0, 1 + vpickve2gr.wu t2, vr0, 2 + li.w t3, 1 //y + or t0, t0, t1 +.ifc \TYPE, y + or t0, t0, t2 //vm +.endif + addi.w t8, t3, -1 + andn t8, t0, t8 + beqz t0, .\DIR\()\TYPE\()_END +.\DIR\()\TYPE\()_LOOP: + and t4, t0, t3 //vm & y + beqz t4, .\DIR\()\TYPE\()_LOOP_NEXT + vldrepl.b vr1, a3, 0 //l[0][0] +.ifc \DIR, h + addi.d t5, a3, -4 +.else + slli.d t5, a4, 2 + sub.d t5, a3, t5 +.endif + vldrepl.b vr2, t5, 0 //l[-1][0] + vseqi.b vr3, vr1, 0 + vbitsel.v vr1, vr1, vr2, vr3 //L + vpickve2gr.b t5, vr1, 0 + beqz t5, .\DIR\()\TYPE\()_LOOP_NEXT + vsrai.b vr2, vr1, 4 //H + add.d t6, a5, t5 + vldrepl.b vr3, t6, 0 //E + addi.d t6, t6, 64 + vldrepl.b vr4, t6, 0 //I +.ifc \TYPE, y + and t5, t2, t3 + bnez t5, .FILTER_\DIR\()\TYPE\()_16 +.endif + and t5, t1, t3 +.ifc \TYPE, y + bnez t5, .FILTER_\DIR\()\TYPE\()_8 +.else + bnez t5, .FILTER_\DIR\()\TYPE\()_6 +.endif + FILTER_W4 \DIR, \TYPE + b .\DIR\()\TYPE\()_LOOP_NEXT +.ifc \TYPE, uv +.FILTER_\DIR\()\TYPE\()_6: + FILTER_W6 \DIR, \TYPE +.endif +.ifc \TYPE, y +.FILTER_\DIR\()\TYPE\()_8: + FILTER_W8 \DIR, \TYPE + b .\DIR\()\TYPE\()_LOOP_NEXT +.FILTER_\DIR\()\TYPE\()_16: + FILTER_W16 \DIR, \TYPE +.endif +.\DIR\()\TYPE\()_LOOP_NEXT: + slli.w t3, t3, 1 +.ifc \DIR, h + alsl.d a0, a1, a0, 2 + slli.w t8, a4, 2 + add.d a3, a3, t8 +.else + addi.d a0, a0, 4 + addi.d a3, a3, 4 +.endif + addi.w t8, t3, -1 + andn t8, t0, t8 + bnez t8, .\DIR\()\TYPE\()_LOOP +.\DIR\()\TYPE\()_END: + POP_REG +endfunc +.endm + +LPF_FUNC h, y +LPF_FUNC v, y +LPF_FUNC h, uv +LPF_FUNC v, uv |