diff options
Diffstat (limited to 'source/row_rvv.cc')
-rw-r--r-- | source/row_rvv.cc | 497 |
1 files changed, 429 insertions, 68 deletions
diff --git a/source/row_rvv.cc b/source/row_rvv.cc index 27e91a3b..c875be2f 100644 --- a/source/row_rvv.cc +++ b/source/row_rvv.cc @@ -17,7 +17,9 @@ #include "libyuv/row.h" -#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) +// This module is for clang rvv. GCC hasn't supported segment load & store. +#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) && \ + defined(__clang__) #include <assert.h> #include <riscv_vector.h> @@ -29,48 +31,48 @@ extern "C" { // Fill YUV -> RGB conversion constants into vectors // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). -#define YUVTORGB_SETUP(vl, yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \ - { \ - asm volatile("csrwi vxrm, 0"); \ - ub = yuvconst->kUVCoeff[0]; \ - vr = yuvconst->kUVCoeff[1]; \ - ug = yuvconst->kUVCoeff[2]; \ - vg = yuvconst->kUVCoeff[3]; \ - yg = yuvconst->kRGBCoeffBias[0]; \ - bb = yuvconst->kRGBCoeffBias[1] + 32; \ - bg = yuvconst->kRGBCoeffBias[2] - 32; \ - br = yuvconst->kRGBCoeffBias[3] + 32; \ +#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \ + { \ + asm volatile("csrwi vxrm, 0"); \ + ub = yuvconst->kUVCoeff[0]; \ + vr = yuvconst->kUVCoeff[1]; \ + ug = yuvconst->kUVCoeff[2]; \ + vg = yuvconst->kUVCoeff[3]; \ + yg = yuvconst->kRGBCoeffBias[0]; \ + bb = yuvconst->kRGBCoeffBias[1] + 32; \ + bg = yuvconst->kRGBCoeffBias[2] - 32; \ + br = yuvconst->kRGBCoeffBias[3] + 32; \ } -// Read [VLEN/8] Y, [VLEN/(8 * 2)] U and [VLEN/(8 * 2)] V from 422 -#define READYUV422(vl, v_u, v_v, v_y_16) \ - { \ - vuint8m1_t v_tmp0, v_tmp1; \ - vuint8m2_t v_y; \ - vuint16m2_t v_u_16, v_v_16; \ - vl = __riscv_vsetvl_e8m1((w + 1) / 2); \ - v_tmp0 = __riscv_vle8_v_u8m1(src_u, vl); \ - v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \ - v_tmp1 = __riscv_vle8_v_u8m1(src_v, vl); \ - v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \ - v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \ - v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \ - v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \ - v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \ - vl = __riscv_vsetvl_e8m2(w); \ - v_y = __riscv_vle8_v_u8m2(src_y, vl); \ - v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ +// Read [2*VLEN/8] Y, [VLEN/8] U and [VLEN/8] V from 422 +#define READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16) \ + { \ + vuint8m1_t v_tmp0, v_tmp1; \ + vuint8m2_t v_y; \ + vuint16m2_t v_u_16, v_v_16; \ + vl = __riscv_vsetvl_e8m1((w + 1) / 2); \ + v_tmp0 = __riscv_vle8_v_u8m1(src_u, vl); \ + v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \ + v_tmp1 = __riscv_vle8_v_u8m1(src_v, vl); \ + v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \ + v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \ + v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \ + v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \ + v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \ + vl = __riscv_vsetvl_e8m2(w); \ + v_y = __riscv_vle8_v_u8m2(src_y, vl); \ + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ } -// Read [VLEN/8] Y, [VLEN/8] U, and [VLEN/8] V from 444 -#define READYUV444(vl, v_u, v_v, v_y_16) \ - { \ - vuint8m2_t v_y; \ - vl = __riscv_vsetvl_e8m2(w); \ - v_y = __riscv_vle8_v_u8m2(src_y, vl); \ - v_u = __riscv_vle8_v_u8m2(src_u, vl); \ - v_v = __riscv_vle8_v_u8m2(src_v, vl); \ - v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ +// Read [2*VLEN/8] Y, [2*VLEN/8] U, and [2*VLEN/8] V from 444 +#define READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16) \ + { \ + vuint8m2_t v_y; \ + vl = __riscv_vsetvl_e8m2(w); \ + v_y = __riscv_vle8_v_u8m2(src_y, vl); \ + v_u = __riscv_vle8_v_u8m2(src_u, vl); \ + v_v = __riscv_vle8_v_u8m2(src_v, vl); \ + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ } // Convert from YUV to fixed point RGB @@ -101,6 +103,45 @@ extern "C" { v_r = __riscv_vnclipu_wx_u8m2(v_r_16, 6, vl); \ } +// Read [2*VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_uv +#define READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16) \ + { \ + vuint8m1_t v_tmp0, v_tmp1; \ + vuint8m2_t v_y; \ + vuint16m2_t v_u_16, v_v_16; \ + vl = __riscv_vsetvl_e8m1((w + 1) / 2); \ + __riscv_vlseg2e8_v_u8m1(&v_tmp0, &v_tmp1, src_uv, vl); \ + v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \ + v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \ + v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \ + v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \ + v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \ + v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \ + vl = __riscv_vsetvl_e8m2(w); \ + v_y = __riscv_vle8_v_u8m2(src_y, vl); \ + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ + } + +// Read 2*[VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_vu +#define READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16) \ + { \ + vuint8m1_t v_tmp0, v_tmp1; \ + vuint8m2_t v_y; \ + vuint16m2_t v_u_16, v_v_16; \ + vl = __riscv_vsetvl_e8m1((w + 1) / 2); \ + __riscv_vlseg2e8_v_u8m1(&v_tmp0, &v_tmp1, src_vu, vl); \ + v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \ + v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \ + v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \ + v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \ + v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \ + v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \ + vl = __riscv_vsetvl_e8m2(w); \ + v_y = __riscv_vle8_v_u8m2(src_y, vl); \ + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ + } + +#ifdef HAS_ARGBTOAR64ROW_RVV void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { size_t avl = (size_t)4 * width; do { @@ -116,7 +157,9 @@ void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { dst_ar64 += vl; } while (avl > 0); } +#endif +#ifdef HAS_ARGBTOAB64ROW_RVV void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { size_t avl = (size_t)width; do { @@ -138,7 +181,9 @@ void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { dst_ab64 += 4 * vl; } while (avl > 0); } +#endif +#ifdef HAS_AR64TOARGBROW_RVV void AR64ToARGBRow_RVV(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { size_t avl = (size_t)4 * width; do { @@ -153,7 +198,9 @@ void AR64ToARGBRow_RVV(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { dst_argb += vl; } while (avl > 0); } +#endif +#ifdef HAS_AB64TOARGBROW_RVV void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { size_t avl = (size_t)width; do { @@ -171,7 +218,9 @@ void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { dst_argb += 4 * vl; } while (avl > 0); } +#endif +#ifdef HAS_RAWTOARGBROW_RVV void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) { size_t w = (size_t)width; size_t vl = __riscv_vsetvl_e8m2(w); @@ -186,7 +235,9 @@ void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) { vl = __riscv_vsetvl_e8m2(w); } while (w > 0); } +#endif +#ifdef HAS_RAWTORGBAROW_RVV void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { size_t w = (size_t)width; size_t vl = __riscv_vsetvl_e8m2(w); @@ -201,7 +252,9 @@ void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { vl = __riscv_vsetvl_e8m2(w); } while (w > 0); } +#endif +#ifdef HAS_RAWTORGB24ROW_RVV void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { size_t w = (size_t)width; do { @@ -214,7 +267,9 @@ void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { dst_rgb24 += vl * 3; } while (w > 0); } +#endif +#ifdef HAS_ARGBTORAWROW_RVV void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) { size_t w = (size_t)width; do { @@ -227,7 +282,9 @@ void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) { dst_raw += vl * 3; } while (w > 0); } +#endif +#ifdef HAS_ARGBTORGB24ROW_RVV void ARGBToRGB24Row_RVV(const uint8_t* src_argb, uint8_t* dst_rgb24, int width) { @@ -242,7 +299,9 @@ void ARGBToRGB24Row_RVV(const uint8_t* src_argb, dst_rgb24 += vl * 3; } while (w > 0); } +#endif +#ifdef HAS_RGB24TOARGBROW_RVV void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { @@ -259,24 +318,26 @@ void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, vl = __riscv_vsetvl_e8m2(w); } while (w > 0); } +#endif +#ifdef HAS_I444TOARGBROW_RVV void I444ToARGBRow_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - size_t vl; size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); uint8_t ub, vr, ug, vg; int16_t yg, bb, bg, br; vuint8m2_t v_u, v_v; vuint8m2_t v_b, v_g, v_r, v_a; vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); v_a = __riscv_vmv_v_x_u8m2(255u, vl); do { - READYUV444(vl, v_u, v_v, v_y_16); + READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, v_b_16, v_r_16); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); @@ -288,7 +349,9 @@ void I444ToARGBRow_RVV(const uint8_t* src_y, dst_argb += vl * 4; } while (w > 0); } +#endif +#ifdef HAS_I444ALPHATOARGBROW_RVV void I444AlphaToARGBRow_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -303,9 +366,9 @@ void I444AlphaToARGBRow_RVV(const uint8_t* src_y, vuint8m2_t v_u, v_v; vuint8m2_t v_b, v_g, v_r, v_a; vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); do { - READYUV444(vl, v_u, v_v, v_y_16); + READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); v_a = __riscv_vle8_v_u8m2(src_a, vl); YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, v_b_16, v_r_16); @@ -319,7 +382,9 @@ void I444AlphaToARGBRow_RVV(const uint8_t* src_y, dst_argb += vl * 4; } while (w > 0); } +#endif +#ifdef HAS_I444TORGB24ROW_RVV void I444ToRGB24Row_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -333,9 +398,9 @@ void I444ToRGB24Row_RVV(const uint8_t* src_y, vuint8m2_t v_u, v_v; vuint8m2_t v_b, v_g, v_r; vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); do { - READYUV444(vl, v_u, v_v, v_y_16); + READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, v_b_16, v_r_16); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); @@ -347,24 +412,26 @@ void I444ToRGB24Row_RVV(const uint8_t* src_y, dst_rgb24 += vl * 3; } while (w > 0); } +#endif +#ifdef HAS_I422TOARGBROW_RVV void I422ToARGBRow_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - size_t vl; size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); uint8_t ub, vr, ug, vg; int16_t yg, bb, bg, br; vuint8m2_t v_u, v_v; vuint8m2_t v_b, v_g, v_r, v_a; vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); v_a = __riscv_vmv_v_x_u8m2(255u, vl); do { - READYUV422(vl, v_u, v_v, v_y_16); + READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, v_b_16, v_r_16); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); @@ -376,7 +443,9 @@ void I422ToARGBRow_RVV(const uint8_t* src_y, dst_argb += vl * 4; } while (w > 0); } +#endif +#ifdef HAS_I422ALPHATOARGBROW_RVV void I422AlphaToARGBRow_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -391,9 +460,9 @@ void I422AlphaToARGBRow_RVV(const uint8_t* src_y, vuint8m2_t v_u, v_v; vuint8m2_t v_b, v_g, v_r, v_a; vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); do { - READYUV422(vl, v_u, v_v, v_y_16); + READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); v_a = __riscv_vle8_v_u8m2(src_a, vl); YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, v_b_16, v_r_16); @@ -407,24 +476,26 @@ void I422AlphaToARGBRow_RVV(const uint8_t* src_y, dst_argb += vl * 4; } while (w > 0); } +#endif +#ifdef HAS_I422TORGBAROW_RVV void I422ToRGBARow_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width) { - size_t vl; size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); uint8_t ub, vr, ug, vg; int16_t yg, bb, bg, br; vuint8m2_t v_u, v_v; vuint8m2_t v_b, v_g, v_r, v_a; vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); v_a = __riscv_vmv_v_x_u8m2(255u, vl); do { - READYUV422(vl, v_u, v_v, v_y_16); + READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, v_b_16, v_r_16); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); @@ -436,7 +507,9 @@ void I422ToRGBARow_RVV(const uint8_t* src_y, dst_rgba += vl * 4; } while (w > 0); } +#endif +#ifdef HAS_I422TORGB24ROW_RVV void I422ToRGB24Row_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -450,9 +523,9 @@ void I422ToRGB24Row_RVV(const uint8_t* src_y, vuint8m2_t v_u, v_v; vuint8m2_t v_b, v_g, v_r; vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); do { - READYUV422(vl, v_u, v_v, v_y_16); + READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, v_b_16, v_r_16); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); @@ -464,7 +537,9 @@ void I422ToRGB24Row_RVV(const uint8_t* src_y, dst_rgb24 += vl * 3; } while (w > 0); } +#endif +#ifdef HAS_I400TOARGBROW_RVV void I400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, @@ -503,7 +578,9 @@ void I400ToARGBRow_RVV(const uint8_t* src_y, dst_argb += vl * 4; } while (w > 0); } +#endif +#ifdef HAS_J400TOARGBROW_RVV void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) { size_t w = (size_t)width; size_t vl = __riscv_vsetvl_e8m2(w); @@ -518,7 +595,9 @@ void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) { vl = __riscv_vsetvl_e8m2(w); } while (w > 0); } +#endif +#ifdef HAS_COPYROW_RVV void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int width) { size_t w = (size_t)width; do { @@ -530,8 +609,125 @@ void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int width) { dst += vl; } while (w > 0); } +#endif + +#ifdef HAS_NV12TOARGBROW_RVV +void NV12ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_uv += vl; + dst_argb += vl * 4; + } while (w > 0); +} +#endif + +#ifdef HAS_NV12TORGB24ROW_RVV +void NV12ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); + w -= vl; + src_y += vl; + src_uv += vl; + dst_rgb24 += vl * 3; + } while (w > 0); +} +#endif + +#ifdef HAS_NV21TOARGBROW_RVV +void NV21ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_vu += vl; + dst_argb += vl * 4; + } while (w > 0); +} +#endif + +#ifdef HAS_NV21TORGB24ROW_RVV +void NV21ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); + w -= vl; + src_y += vl; + src_vu += vl; + dst_rgb24 += vl * 3; + } while (w > 0); +} +#endif // Bilinear filter [VLEN/8]x2 -> [VLEN/8]x1 + +#ifdef HAS_INTERPOLATEROW_RVV void InterpolateRow_RVV(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, @@ -554,13 +750,16 @@ void InterpolateRow_RVV(uint8_t* dst_ptr, } while (dst_w > 0); return; } + // To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) is set to round-to-nearest-up(0). + asm volatile("csrwi vxrm, 0"); // Blend 50 / 50. if (y1_fraction == 128) { do { size_t vl = __riscv_vsetvl_e8m8(dst_w); vuint8m8_t row0 = __riscv_vle8_v_u8m8(src_ptr, vl); vuint8m8_t row1 = __riscv_vle8_v_u8m8(src_ptr1, vl); - // Averaging add + // Use round-to-nearest-up mode for averaging add vuint8m8_t row_out = __riscv_vaaddu_vv_u8m8(row0, row1, vl); __riscv_vse8_v_u8m8(dst_ptr, row_out, vl); dst_w -= vl; @@ -571,15 +770,13 @@ void InterpolateRow_RVV(uint8_t* dst_ptr, return; } // General purpose row blend. - // To match behavior on other platforms, vxrm (fixed-point rounding mode - // register) is set to round-to-nearest-up(0). - asm volatile("csrwi vxrm, 0"); do { size_t vl = __riscv_vsetvl_e8m4(dst_w); vuint8m4_t row0 = __riscv_vle8_v_u8m4(src_ptr, vl); vuint16m8_t acc = __riscv_vwmulu_vx_u16m8(row0, y0_fraction, vl); vuint8m4_t row1 = __riscv_vle8_v_u8m4(src_ptr1, vl); acc = __riscv_vwmaccu_vx_u16m8(acc, y1_fraction, row1, vl); + // Use round-to-nearest-up mode for vnclip __riscv_vse8_v_u8m4(dst_ptr, __riscv_vnclipu_wx_u8m4(acc, 8, vl), vl); dst_w -= vl; src_ptr += vl; @@ -587,7 +784,9 @@ void InterpolateRow_RVV(uint8_t* dst_ptr, dst_ptr += vl; } while (dst_w > 0); } +#endif +#ifdef HAS_SPLITRGBROW_RVV void SplitRGBRow_RVV(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, @@ -608,7 +807,9 @@ void SplitRGBRow_RVV(const uint8_t* src_rgb, src_rgb += vl * 3; } while (w > 0); } +#endif +#ifdef HAS_MERGERGBROW_RVV void MergeRGBRow_RVV(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, @@ -628,7 +829,9 @@ void MergeRGBRow_RVV(const uint8_t* src_r, dst_rgb += vl * 3; } while (w > 0); } +#endif +#ifdef HAS_SPLITARGBROW_RVV void SplitARGBRow_RVV(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, @@ -652,7 +855,9 @@ void SplitARGBRow_RVV(const uint8_t* src_argb, src_argb += vl * 4; } while (w > 0); } +#endif +#ifdef HAS_MERGEARGBROW_RVV void MergeARGBRow_RVV(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, @@ -675,7 +880,9 @@ void MergeARGBRow_RVV(const uint8_t* src_r, dst_argb += vl * 4; } while (w > 0); } +#endif +#ifdef HAS_SPLITXRGBROW_RVV void SplitXRGBRow_RVV(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, @@ -696,7 +903,9 @@ void SplitXRGBRow_RVV(const uint8_t* src_argb, src_argb += vl * 4; } while (w > 0); } +#endif +#ifdef HAS_MERGEXRGBROW_RVV void MergeXRGBRow_RVV(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, @@ -719,7 +928,9 @@ void MergeXRGBRow_RVV(const uint8_t* src_r, vl = __riscv_vsetvl_e8m2(w); } while (w > 0); } +#endif +#ifdef HAS_SPLITUVROW_RVV void SplitUVRow_RVV(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, @@ -737,7 +948,9 @@ void SplitUVRow_RVV(const uint8_t* src_uv, src_uv += 2 * vl; } while (w > 0); } +#endif +#ifdef HAS_MERGEUVROW_RVV void MergeUVRow_RVV(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, @@ -755,6 +968,7 @@ void MergeUVRow_RVV(const uint8_t* src_u, dst_uv += 2 * vl; } while (w > 0); } +#endif struct RgbConstants { uint8_t kRGBToY[4]; @@ -787,7 +1001,8 @@ static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080, 0}; -// ARGB expects first 3 values to contain RGB and 4th value is ignored. +// ARGB expects first 3 values to contain RGB and 4th value is ignored +#ifdef HAS_ARGBTOYMATRIXROW_RVV void ARGBToYMatrixRow_RVV(const uint8_t* src_argb, uint8_t* dst_y, int width, @@ -817,24 +1032,34 @@ void ARGBToYMatrixRow_RVV(const uint8_t* src_argb, dst_y += vl; } while (w > 0); } +#endif +#ifdef HAS_ARGBTOYROW_RVV void ARGBToYRow_RVV(const uint8_t* src_argb, uint8_t* dst_y, int width) { ARGBToYMatrixRow_RVV(src_argb, dst_y, width, &kRgb24I601Constants); } +#endif +#ifdef HAS_ARGBTOYJROW_RVV void ARGBToYJRow_RVV(const uint8_t* src_argb, uint8_t* dst_yj, int width) { ARGBToYMatrixRow_RVV(src_argb, dst_yj, width, &kRgb24JPEGConstants); } +#endif +#ifdef HAS_ABGRTOYROW_RVV void ABGRToYRow_RVV(const uint8_t* src_abgr, uint8_t* dst_y, int width) { ARGBToYMatrixRow_RVV(src_abgr, dst_y, width, &kRawI601Constants); } +#endif +#ifdef HAS_ABGRTOYJROW_RVV void ABGRToYJRow_RVV(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { ARGBToYMatrixRow_RVV(src_abgr, dst_yj, width, &kRawJPEGConstants); } +#endif // RGBA expects first value to be A and ignored, then 3 values to contain RGB. +#ifdef HAS_RGBATOYMATRIXROW_RVV void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width, @@ -864,19 +1089,27 @@ void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba, dst_y += vl; } while (w > 0); } +#endif +#ifdef HAS_RGBATOYROW_RVV void RGBAToYRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width) { RGBAToYMatrixRow_RVV(src_rgba, dst_y, width, &kRgb24I601Constants); } +#endif +#ifdef HAS_RGBATOYJROW_RVV void RGBAToYJRow_RVV(const uint8_t* src_rgba, uint8_t* dst_yj, int width) { RGBAToYMatrixRow_RVV(src_rgba, dst_yj, width, &kRgb24JPEGConstants); } +#endif +#ifdef HAS_BGRATOYROW_RVV void BGRAToYRow_RVV(const uint8_t* src_bgra, uint8_t* dst_y, int width) { RGBAToYMatrixRow_RVV(src_bgra, dst_y, width, &kRawI601Constants); } +#endif +#ifdef HAS_RGBTOYMATRIXROW_RVV void RGBToYMatrixRow_RVV(const uint8_t* src_rgb, uint8_t* dst_y, int width, @@ -906,51 +1139,179 @@ void RGBToYMatrixRow_RVV(const uint8_t* src_rgb, dst_y += vl; } while (w > 0); } +#endif +#ifdef HAS_RGB24TOYJROW_RVV void RGB24ToYJRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { RGBToYMatrixRow_RVV(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); } +#endif +#ifdef HAS_RAWTOYJROW_RVV void RAWToYJRow_RVV(const uint8_t* src_raw, uint8_t* dst_yj, int width) { RGBToYMatrixRow_RVV(src_raw, dst_yj, width, &kRawJPEGConstants); } +#endif +#ifdef HAS_RGB24TOYROW_RVV void RGB24ToYRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { RGBToYMatrixRow_RVV(src_rgb24, dst_y, width, &kRgb24I601Constants); } +#endif +#ifdef HAS_RAWTOYROW_RVV void RAWToYRow_RVV(const uint8_t* src_raw, uint8_t* dst_y, int width) { RGBToYMatrixRow_RVV(src_raw, dst_y, width, &kRawI601Constants); } +#endif + +// Blend src_argb over src_argb1 and store to dst_argb. +// dst_argb may be src_argb or src_argb1. +// src_argb: RGB values have already been pre-multiplied by the a. +#ifdef HAS_ARGBBLENDROW_RVV +void ARGBBlendRow_RVV(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvlmax_e8m2(); + // clamp255((((256 - a) * b) >> 8) + f) + // = b * (256 - a) / 256 + f + // = b - (b * a / 256) + f + vuint8m2_t v_255 = __riscv_vmv_v_x_u8m2(255, vl); + do { + vuint8m2_t v_src0_b, v_src0_g, v_src0_r, v_src0_a; + vuint8m2_t v_src1_b, v_src1_g, v_src1_r, v_src1_a; + vuint8m2_t v_tmp_b, v_tmp_g, v_tmp_r; + vuint8m2_t v_dst_b, v_dst_g, v_dst_r; + vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_src0_b, &v_src0_g, &v_src0_r, &v_src0_a, + src_argb, vl); + __riscv_vlseg4e8_v_u8m2(&v_src1_b, &v_src1_g, &v_src1_r, &v_src1_a, + src_argb1, vl); + + v_tmp_b = __riscv_vmulhu_vv_u8m2(v_src1_b, v_src0_a, vl); + v_tmp_g = __riscv_vmulhu_vv_u8m2(v_src1_g, v_src0_a, vl); + v_tmp_r = __riscv_vmulhu_vv_u8m2(v_src1_r, v_src0_a, vl); + + v_dst_b = __riscv_vsub_vv_u8m2(v_src1_b, v_tmp_b, vl); + v_dst_g = __riscv_vsub_vv_u8m2(v_src1_g, v_tmp_g, vl); + v_dst_r = __riscv_vsub_vv_u8m2(v_src1_r, v_tmp_r, vl); + + v_dst_b = __riscv_vsaddu_vv_u8m2(v_dst_b, v_src0_b, vl); + v_dst_g = __riscv_vsaddu_vv_u8m2(v_dst_g, v_src0_g, vl); + v_dst_r = __riscv_vsaddu_vv_u8m2(v_dst_r, v_src0_r, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_dst_b, v_dst_g, v_dst_r, v_255, vl); + + w -= vl; + src_argb += 4 * vl; + src_argb1 += 4 * vl; + dst_argb += 4 * vl; + } while (w > 0); +} +#endif + +#ifdef HAS_BLENDPLANEROW_RVV +void BlendPlaneRow_RVV(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { + size_t w = (size_t)width; + do { + vuint16m8_t v_dst_u16; + vuint8m4_t v_dst; + size_t vl = __riscv_vsetvl_e8m4(w); + vuint8m4_t v_src0 = __riscv_vle8_v_u8m4(src0, vl); + vuint8m4_t v_src1 = __riscv_vle8_v_u8m4(src1, vl); + vuint8m4_t v_alpha = __riscv_vle8_v_u8m4(alpha, vl); + vuint8m4_t v_255_minus_alpha = __riscv_vrsub_vx_u8m4(v_alpha, 255u, vl); + + // (a * foreground) + (1-a) * background + v_dst_u16 = __riscv_vwmulu_vv_u16m8(v_alpha, v_src0, vl); + v_dst_u16 = + __riscv_vwmaccu_vv_u16m8(v_dst_u16, v_255_minus_alpha, v_src1, vl); + v_dst_u16 = __riscv_vadd_vx_u16m8(v_dst_u16, 255u, vl); + v_dst = __riscv_vnsrl_wx_u8m4(v_dst_u16, 8, vl); + + __riscv_vse8_v_u8m4(dst, v_dst, vl); + w -= vl; + src0 += vl; + src1 += vl; + alpha += vl; + dst += vl; + } while (w > 0); +} +#endif +// Attenuate: (f * a + 255) >> 8 +#ifdef HAS_ARGBATTENUATEROW_RVV void ARGBAttenuateRow_RVV(const uint8_t* src_argb, uint8_t* dst_argb, int width) { size_t w = (size_t)width; - // To match behavior on other platforms, vxrm (fixed-point rounding mode - // register) is set to round-to-nearest-up(0). - asm volatile("csrwi vxrm, 0"); do { vuint8m2_t v_b, v_g, v_r, v_a; vuint16m4_t v_ba_16, v_ga_16, v_ra_16; size_t vl = __riscv_vsetvl_e8m2(w); __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + // f * a v_ba_16 = __riscv_vwmulu_vv_u16m4(v_b, v_a, vl); v_ga_16 = __riscv_vwmulu_vv_u16m4(v_g, v_a, vl); v_ra_16 = __riscv_vwmulu_vv_u16m4(v_r, v_a, vl); - v_b = __riscv_vnclipu_wx_u8m2(v_ba_16, 8, vl); - v_g = __riscv_vnclipu_wx_u8m2(v_ga_16, 8, vl); - v_r = __riscv_vnclipu_wx_u8m2(v_ra_16, 8, vl); + // f * a + 255 + v_ba_16 = __riscv_vadd_vx_u16m4(v_ba_16, 255u, vl); + v_ga_16 = __riscv_vadd_vx_u16m4(v_ga_16, 255u, vl); + v_ra_16 = __riscv_vadd_vx_u16m4(v_ra_16, 255u, vl); + // (f * a + 255) >> 8 + v_b = __riscv_vnsrl_wx_u8m2(v_ba_16, 8, vl); + v_g = __riscv_vnsrl_wx_u8m2(v_ga_16, 8, vl); + v_r = __riscv_vnsrl_wx_u8m2(v_ra_16, 8, vl); __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); w -= vl; src_argb += vl * 4; dst_argb += vl * 4; } while (w > 0); } +#endif + +#ifdef HAS_ARGBEXTRACTALPHAROW_RVV +void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_b, v_g, v_r, v_a; + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vse8_v_u8m2(dst_a, v_a, vl); + w -= vl; + src_argb += vl * 4; + dst_a += vl; + } while (w > 0); +} +#endif + +#ifdef HAS_ARGBCOPYYTOALPHAROW_RVV +void ARGBCopyYToAlphaRow_RVV(const uint8_t* src, uint8_t* dst, int width) { + size_t w = (size_t)width; + const ptrdiff_t dst_stride = 4; + dst += 3; + do { + size_t vl = __riscv_vsetvl_e8m8(w); + vuint8m8_t v_a = __riscv_vle8_v_u8m8(src, vl); + __riscv_vsse8_v_u8m8(dst, dst_stride, v_a, vl); + w -= vl; + src += vl; + dst += vl * dst_stride; + } while (w > 0); +} +#endif #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) +#endif // !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) && + // defined(__clang__) |