diff options
Diffstat (limited to 'files/source/row_lasx.cc')
-rw-r--r-- | files/source/row_lasx.cc | 370 |
1 files changed, 222 insertions, 148 deletions
diff --git a/files/source/row_lasx.cc b/files/source/row_lasx.cc index 7dd18f40..1082ad80 100644 --- a/files/source/row_lasx.cc +++ b/files/source/row_lasx.cc @@ -775,40 +775,6 @@ void UYVYToUV422Row_LASX(const uint8_t* src_uyvy, } } -void ARGBToYRow_LASX(const uint8_t* src_argb0, uint8_t* dst_y, int width) { - int x; - int len = width / 32; - __m256i src0, src1, src2, src3, vec0, vec1, vec2, vec3; - __m256i tmp0, tmp1, dst0; - __m256i const_19 = __lasx_xvldi(0x19); - __m256i const_42 = __lasx_xvldi(0x42); - __m256i const_81 = __lasx_xvldi(0x81); - __m256i const_1080 = {0x1080108010801080, 0x1080108010801080, - 0x1080108010801080, 0x1080108010801080}; - __m256i control = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002, - 0x0000000700000003}; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lasx_xvld, src_argb0, 0, src_argb0, 32, src_argb0, 64, - src_argb0, 96, src0, src1, src2, src3); - vec0 = __lasx_xvpickev_b(src1, src0); - vec1 = __lasx_xvpickev_b(src3, src2); - vec2 = __lasx_xvpickod_b(src1, src0); - vec3 = __lasx_xvpickod_b(src3, src2); - tmp0 = __lasx_xvmaddwev_h_bu(const_1080, vec0, const_19); - tmp1 = __lasx_xvmaddwev_h_bu(const_1080, vec1, const_19); - tmp0 = __lasx_xvmaddwev_h_bu(tmp0, vec2, const_81); - tmp1 = __lasx_xvmaddwev_h_bu(tmp1, vec3, const_81); - tmp0 = __lasx_xvmaddwod_h_bu(tmp0, vec0, const_42); - tmp1 = __lasx_xvmaddwod_h_bu(tmp1, vec1, const_42); - dst0 = __lasx_xvssrani_b_h(tmp1, tmp0, 8); - dst0 = __lasx_xvperm_w(dst0, control); - __lasx_xvst(dst0, dst_y, 0); - src_argb0 += 128; - dst_y += 32; - } -} - void ARGBToUVRow_LASX(const uint8_t* src_argb0, int src_stride_argb, uint8_t* dst_u, @@ -1216,7 +1182,7 @@ void ARGBAttenuateRow_LASX(const uint8_t* src_argb, void ARGBToRGB565DitherRow_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width) { int x; int len = width / 16; @@ -1811,48 +1777,6 @@ void RGB565ToUVRow_LASX(const uint8_t* src_rgb565, } } -void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { - int x; - int len = width / 32; - __m256i src0, src1, src2; - __m256i tmp0, tmp1, tmp2, tmp3; - __m256i reg0, reg1, reg2, dst0; - __m256i const_129 = __lasx_xvldi(129); - __m256i const_br = {0x4219421942194219, 0x4219421942194219, - 0x4219421942194219, 0x4219421942194219}; - __m256i const_1080 = {0x1080108010801080, 0x1080108010801080, - 0x1080108010801080, 0x1080108010801080}; - __m256i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C, 0x0B09080605030200, - 0x17151412110F0E0C}; - __m256i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604, 0x0301001E1D1B1A18, - 0x0F0D0C0A09070604}; - __m256i shuff2 = {0x000A000700040001, 0x001600130010000D, 0x000A000700040001, - 0x001600130010000D}; - __m256i shuff3 = {0x0002001F001C0019, 0x000E000B00080005, 0x0002001F001C0019, - 0x000E000B00080005}; - - for (x = 0; x < len; x++) { - reg0 = __lasx_xvld(src_rgb24, 0); - reg1 = __lasx_xvld(src_rgb24, 32); - reg2 = __lasx_xvld(src_rgb24, 64); - src0 = __lasx_xvpermi_q(reg1, reg0, 0x30); - src1 = __lasx_xvpermi_q(reg2, reg0, 0x21); - src2 = __lasx_xvpermi_q(reg2, reg1, 0x30); - tmp0 = __lasx_xvshuf_b(src1, src0, shuff0); - tmp1 = __lasx_xvshuf_b(src1, src2, shuff1); - tmp2 = __lasx_xvshuf_b(src1, src0, shuff2); - tmp3 = __lasx_xvshuf_b(src1, src2, shuff3); - reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmp2, const_129); - reg1 = __lasx_xvmaddwev_h_bu(const_1080, tmp3, const_129); - reg0 = __lasx_xvdp2add_h_bu(reg0, const_br, tmp0); - reg1 = __lasx_xvdp2add_h_bu(reg1, const_br, tmp1); - dst0 = __lasx_xvpickod_b(reg1, reg0); - __lasx_xvst(dst0, dst_y, 0); - dst_y += 32; - src_rgb24 += 96; - } -} - void RGB24ToUVRow_LASX(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_u, @@ -1916,48 +1840,6 @@ void RGB24ToUVRow_LASX(const uint8_t* src_rgb24, } } -void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width) { - int x; - int len = width / 32; - __m256i src0, src1, src2; - __m256i tmp0, tmp1, tmp2, tmp3; - __m256i reg0, reg1, reg2, dst0; - __m256i const_129 = __lasx_xvldi(129); - __m256i const_br = {0x1942194219421942, 0x1942194219421942, - 0x1942194219421942, 0x1942194219421942}; - __m256i const_1080 = {0x1080108010801080, 0x1080108010801080, - 0x1080108010801080, 0x1080108010801080}; - __m256i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C, 0x0B09080605030200, - 0x17151412110F0E0C}; - __m256i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604, 0x0301001E1D1B1A18, - 0x0F0D0C0A09070604}; - __m256i shuff2 = {0x000A000700040001, 0x001600130010000D, 0x000A000700040001, - 0x001600130010000D}; - __m256i shuff3 = {0x0002001F001C0019, 0x000E000B00080005, 0x0002001F001C0019, - 0x000E000B00080005}; - - for (x = 0; x < len; x++) { - reg0 = __lasx_xvld(src_raw, 0); - reg1 = __lasx_xvld(src_raw, 32); - reg2 = __lasx_xvld(src_raw, 64); - src0 = __lasx_xvpermi_q(reg1, reg0, 0x30); - src1 = __lasx_xvpermi_q(reg2, reg0, 0x21); - src2 = __lasx_xvpermi_q(reg2, reg1, 0x30); - tmp0 = __lasx_xvshuf_b(src1, src0, shuff0); - tmp1 = __lasx_xvshuf_b(src1, src2, shuff1); - tmp2 = __lasx_xvshuf_b(src1, src0, shuff2); - tmp3 = __lasx_xvshuf_b(src1, src2, shuff3); - reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmp2, const_129); - reg1 = __lasx_xvmaddwev_h_bu(const_1080, tmp3, const_129); - reg0 = __lasx_xvdp2add_h_bu(reg0, const_br, tmp0); - reg1 = __lasx_xvdp2add_h_bu(reg1, const_br, tmp1); - dst0 = __lasx_xvpickod_b(reg1, reg0); - __lasx_xvst(dst0, dst_y, 0); - dst_y += 32; - src_raw += 96; - } -} - void RAWToUVRow_LASX(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u, @@ -2118,36 +2000,228 @@ void NV21ToARGBRow_LASX(const uint8_t* src_y, } } -void ARGBToYJRow_LASX(const uint8_t* src_argb, uint8_t* dst_y, int width) { - int x; - int len = width / 32; - __m256i src0, src1, src2, src3, dst0; - __m256i tmp0, tmp1, tmp2, tmp3; - __m256i reg0, reg1; - __m256i const_128 = __lasx_xvldi(0x480); - __m256i const_150 = __lasx_xvldi(0x96); - __m256i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D, - 0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D}; - __m256i shuff = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002, - 0x0000000700000003}; +struct RgbConstants { + uint8_t kRGBToY[4]; + uint16_t kAddY; + uint16_t pad; +}; + +// RGB to JPeg coefficients +// B * 0.1140 coefficient = 29 +// G * 0.5870 coefficient = 150 +// R * 0.2990 coefficient = 77 +// Add 0.5 = 0x80 +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, + 128, + 0}; + +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; + +// RGB to BT.601 coefficients +// B * 0.1016 coefficient = 25 +// G * 0.5078 coefficient = 129 +// R * 0.2578 coefficient = 66 +// Add 16.5 = 0x1080 + +static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, + 0x1080, + 0}; + +static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, + 0x1080, + 0}; + +// ARGB expects first 3 values to contain RGB and 4th value is ignored. +static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7}; + asm volatile( + "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants + "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants + "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants + "xvldrepl.h $xr3, %3, 4 \n\t" // load rgbconstants + "xvld $xr20, %4, 0 \n\t" // load shuff + "1: \n\t" + "xvld $xr4, %0, 0 \n\t" + "xvld $xr5, %0, 32 \n\t" + "xvld $xr6, %0, 64 \n\t" + "xvld $xr7, %0, 96 \n\t" // load 32 pixels of + // ARGB + "xvor.v $xr12, $xr3, $xr3 \n\t" + "xvor.v $xr13, $xr3, $xr3 \n\t" + "addi.d %2, %2, -32 \n\t" // 32 processed per + // loop. + "xvpickev.b $xr8, $xr5, $xr4 \n\t" // BR + "xvpickev.b $xr10, $xr7, $xr6 \n\t" + "xvpickod.b $xr9, $xr5, $xr4 \n\t" // GA + "xvpickod.b $xr11, $xr7, $xr6 \n\t" + "xvmaddwev.h.bu $xr12, $xr8, $xr0 \n\t" // B + "xvmaddwev.h.bu $xr13, $xr10, $xr0 \n\t" + "xvmaddwev.h.bu $xr12, $xr9, $xr1 \n\t" // G + "xvmaddwev.h.bu $xr13, $xr11, $xr1 \n\t" + "xvmaddwod.h.bu $xr12, $xr8, $xr2 \n\t" // R + "xvmaddwod.h.bu $xr13, $xr10, $xr2 \n\t" + "addi.d %0, %0, 128 \n\t" + "xvpickod.b $xr10, $xr13, $xr12 \n\t" + "xvperm.w $xr11, $xr10, $xr20 \n\t" + "xvst $xr11, %1, 0 \n\t" + "addi.d %1, %1, 32 \n\t" + "bnez %2, 1b \n\t" + : "+&r"(src_argb), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 + : "r"(rgbconstants), "r"(shuff) + : "memory"); +} - for (x = 0; x < len; x++) { - DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, - 96, src0, src1, src2, src3); - tmp0 = __lasx_xvpickev_b(src1, src0); - tmp1 = __lasx_xvpickod_b(src1, src0); - tmp2 = __lasx_xvpickev_b(src3, src2); - tmp3 = __lasx_xvpickod_b(src3, src2); - reg0 = __lasx_xvmaddwev_h_bu(const_128, tmp1, const_150); - reg1 = __lasx_xvmaddwev_h_bu(const_128, tmp3, const_150); - reg0 = __lasx_xvdp2add_h_bu(reg0, const_br, tmp0); - reg1 = __lasx_xvdp2add_h_bu(reg1, const_br, tmp2); - dst0 = __lasx_xvpickod_b(reg1, reg0); - dst0 = __lasx_xvperm_w(dst0, shuff); - __lasx_xvst(dst0, dst_y, 0); - dst_y += 32; - src_argb += 128; - } +void ARGBToYRow_LASX(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_LASX(src_argb, dst_y, width, &kRgb24I601Constants); +} + +void ARGBToYJRow_LASX(const uint8_t* src_argb, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_LASX(src_argb, dst_yj, width, &kRgb24JPEGConstants); +} + +void ABGRToYRow_LASX(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_LASX(src_abgr, dst_y, width, &kRawI601Constants); +} + +void ABGRToYJRow_LASX(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_LASX(src_abgr, dst_yj, width, &kRawJPEGConstants); +} + +// RGBA expects first value to be A and ignored, then 3 values to contain RGB. +// Same code as ARGB, except the LD4 +static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7}; + asm volatile( + "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants + "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants + "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants + "xvldrepl.h $xr3, %3, 4 \n\t" // load rgbconstants + "xvld $xr20, %4, 0 \n\t" // load shuff + "1: \n\t" + "xvld $xr4, %0, 0 \n\t" + "xvld $xr5, %0, 32 \n\t" + "xvld $xr6, %0, 64 \n\t" + "xvld $xr7, %0, 96 \n\t" // load 32 pixels of + // RGBA + "xvor.v $xr12, $xr3, $xr3 \n\t" + "xvor.v $xr13, $xr3, $xr3 \n\t" + "addi.d %2, %2, -32 \n\t" // 32 processed per + // loop. + "xvpickev.b $xr8, $xr5, $xr4 \n\t" // AG + "xvpickev.b $xr10, $xr7, $xr6 \n\t" + "xvpickod.b $xr9, $xr5, $xr4 \n\t" // BR + "xvpickod.b $xr11, $xr7, $xr6 \n\t" + "xvmaddwev.h.bu $xr12, $xr9, $xr0 \n\t" // B + "xvmaddwev.h.bu $xr13, $xr11, $xr0 \n\t" + "xvmaddwod.h.bu $xr12, $xr8, $xr1 \n\t" // G + "xvmaddwod.h.bu $xr13, $xr10, $xr1 \n\t" + "xvmaddwod.h.bu $xr12, $xr9, $xr2 \n\t" // R + "xvmaddwod.h.bu $xr13, $xr11, $xr2 \n\t" + "addi.d %0, %0, 128 \n\t" + "xvpickod.b $xr10, $xr13, $xr12 \n\t" + "xvperm.w $xr11, $xr10, $xr20 \n\t" + "xvst $xr11, %1, 0 \n\t" + "addi.d %1, %1, 32 \n\t" + "bnez %2, 1b \n\t" + : "+&r"(src_rgba), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 + : "r"(rgbconstants), "r"(shuff) + : "memory"); +} + +void RGBAToYRow_LASX(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_LASX(src_rgba, dst_y, width, &kRgb24I601Constants); +} + +void RGBAToYJRow_LASX(const uint8_t* src_rgba, uint8_t* dst_yj, int width) { + RGBAToYMatrixRow_LASX(src_rgba, dst_yj, width, &kRgb24JPEGConstants); +} + +void BGRAToYRow_LASX(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_LASX(src_bgra, dst_y, width, &kRawI601Constants); +} + +static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + int8_t shuff[128] = { + 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, + 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, + 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15, + 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15, + 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0, + 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0, + 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0, + 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0}; + asm volatile( + "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants + "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants + "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants + "xvldrepl.h $xr3, %3, 4 \n\t" // load rgbconstants + "xvld $xr4, %4, 0 \n\t" // load shuff + "xvld $xr5, %4, 32 \n\t" + "xvld $xr6, %4, 64 \n\t" + "xvld $xr7, %4, 96 \n\t" + "1: \n\t" + "xvld $xr8, %0, 0 \n\t" + "xvld $xr9, %0, 32 \n\t" + "xvld $xr10, %0, 64 \n\t" // load 32 pixels of + // RGB + "xvor.v $xr12, $xr3, $xr3 \n\t" + "xvor.v $xr13, $xr3, $xr3 \n\t" + "xvor.v $xr11, $xr9, $xr9 \n\t" + "addi.d %2, %2, -32 \n\t" // 32 processed per + // loop. + "xvpermi.q $xr9, $xr8, 0x30 \n\t" // src0 + "xvpermi.q $xr8, $xr10, 0x03 \n\t" // src1 + "xvpermi.q $xr10, $xr11, 0x30 \n\t" // src2 + "xvshuf.b $xr14, $xr8, $xr9, $xr4 \n\t" + "xvshuf.b $xr15, $xr8, $xr10, $xr5 \n\t" + "xvshuf.b $xr16, $xr8, $xr9, $xr6 \n\t" + "xvshuf.b $xr17, $xr8, $xr10, $xr7 \n\t" + "xvmaddwev.h.bu $xr12, $xr16, $xr1 \n\t" // G + "xvmaddwev.h.bu $xr13, $xr17, $xr1 \n\t" + "xvmaddwev.h.bu $xr12, $xr14, $xr0 \n\t" // B + "xvmaddwev.h.bu $xr13, $xr15, $xr0 \n\t" + "xvmaddwod.h.bu $xr12, $xr14, $xr2 \n\t" // R + "xvmaddwod.h.bu $xr13, $xr15, $xr2 \n\t" + "addi.d %0, %0, 96 \n\t" + "xvpickod.b $xr10, $xr13, $xr12 \n\t" + "xvst $xr10, %1, 0 \n\t" + "addi.d %1, %1, 32 \n\t" + "bnez %2, 1b \n\t" + : "+&r"(src_rgba), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 + : "r"(rgbconstants), // %3 + "r"(shuff) // %4 + : "memory"); +} + +void RGB24ToYJRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_LASX(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); +} + +void RAWToYJRow_LASX(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_LASX(src_raw, dst_yj, width, &kRawJPEGConstants); +} + +void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + RGBToYMatrixRow_LASX(src_rgb24, dst_y, width, &kRgb24I601Constants); +} + +void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width) { + RGBToYMatrixRow_LASX(src_raw, dst_y, width, &kRawI601Constants); } void ARGBToUVJRow_LASX(const uint8_t* src_argb, |