diff options
author | Lu Wang <wanglu@loongson.cn> | 2023-05-08 21:13:25 +0800 |
---|---|---|
committer | libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com> | 2023-05-10 00:25:48 +0000 |
commit | 1d940cc570212c8979d81e78738296fe39f9df43 (patch) | |
tree | 7bcaeb72858520f5bf99655359cd8e9cc723d1db | |
parent | b372510c5699abdde5d50b60e89daa5b71b7792c (diff) | |
download | libyuv-1d940cc570212c8979d81e78738296fe39f9df43.tar.gz |
Optimize the following functions with LSX.
MirrorRow_LSX, MirrorUVRow_LSX, ARGBMirrorRow_LSX,
I422ToYUY2Row_LSX, I422ToUYVYRow_LSX, I422ToARGBRow_LSX,
I422ToRGBARow_LSX, I422AlphaToARGBRow_LSX, I422ToRGB24Row_LSX,
I422ToRGB565Row_LSX, I422ToARGB4444Row_LSX, I422ToARGB1555Row_LSX,
YUY2ToYRow_LSX, YUY2ToUVRow_LSX, YUY2ToUV422Row_LSX
Bug: libyuv:913
Change-Id: I46cec605001d7ddd73846eed6d0a77f936b6dc53
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4515191
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
-rw-r--r-- | include/libyuv/row.h | 152 | ||||
-rw-r--r-- | source/convert.cc | 10 | ||||
-rw-r--r-- | source/convert_argb.cc | 104 | ||||
-rw-r--r-- | source/convert_from.cc | 24 | ||||
-rw-r--r-- | source/convert_from_argb.cc | 16 | ||||
-rw-r--r-- | source/planar_functions.cc | 42 | ||||
-rw-r--r-- | source/rotate.cc | 8 | ||||
-rw-r--r-- | source/rotate_argb.cc | 8 | ||||
-rw-r--r-- | source/row_any.cc | 35 | ||||
-rw-r--r-- | source/row_lsx.cc | 559 | ||||
-rw-r--r-- | source/scale_argb.cc | 8 |
11 files changed, 966 insertions, 0 deletions
diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 6e973bcf..9a9d1b38 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -657,13 +657,24 @@ extern "C" { #define HAS_ARGBSETROW_LSX #define HAS_ARGBTOUVJROW_LSX #define HAS_ARGBTOYJROW_LSX +#define HAS_ARGBMIRRORROW_LSX #define HAS_BGRATOUVROW_LSX #define HAS_BGRATOYROW_LSX #define HAS_I400TOARGBROW_LSX #define HAS_I444TOARGBROW_LSX #define HAS_INTERPOLATEROW_LSX +#define HAS_I422ALPHATOARGBROW_LSX +#define HAS_I422TOARGB1555ROW_LSX +#define HAS_I422TOARGB4444ROW_LSX +#define HAS_I422TORGB24ROW_LSX +#define HAS_I422TORGB565ROW_LSX +#define HAS_I422TORGBAROW_LSX +#define HAS_I422TOUYVYROW_LSX +#define HAS_I422TOYUY2ROW_LSX #define HAS_J400TOARGBROW_LSX #define HAS_MERGEUVROW_LSX +#define HAS_MIRRORROW_LSX +#define HAS_MIRRORUVROW_LSX #define HAS_MIRRORSPLITUVROW_LSX #define HAS_NV12TOARGBROW_LSX #define HAS_NV12TORGB565ROW_LSX @@ -687,6 +698,9 @@ extern "C" { #define HAS_SPLITUVROW_LSX #define HAS_UYVYTOARGBROW_LSX #define HAS_YUY2TOARGBROW_LSX +#define HAS_YUY2TOUVROW_LSX +#define HAS_YUY2TOUV422ROW_LSX +#define HAS_YUY2TOYROW_LSX #define HAS_ARGBTOYROW_LSX #define HAS_ABGRTOYJROW_LSX #define HAS_RGBATOYJROW_LSX @@ -694,6 +708,10 @@ extern "C" { #define HAS_RAWTOYJROW_LSX #endif +#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx) +#define HAS_I422TOARGBROW_LSX +#endif + #if !defined(LIBYUV_DISABLE_LASX) && defined(__loongarch_asx) #define HAS_ARGB1555TOARGBROW_LASX #define HAS_ARGB1555TOUVROW_LASX @@ -1060,6 +1078,12 @@ void I422ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I422ToARGBRow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGBRow_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1072,6 +1096,12 @@ void I422ToRGBARow_MSA(const uint8_t* src_y, uint8_t* dst_argb, const 
struct YuvConstants* yuvconstants, int width); +void I422ToRGBARow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I422ToRGBARow_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1085,6 +1115,13 @@ void I422AlphaToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I422AlphaToARGBRow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I422AlphaToARGBRow_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1098,6 +1135,12 @@ void I422ToRGB24Row_MSA(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I422ToRGB24Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I422ToRGB24Row_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1110,6 +1153,12 @@ void I422ToRGB565Row_MSA(const uint8_t* src_y, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); +void I422ToRGB565Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width); void I422ToRGB565Row_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1122,6 +1171,12 @@ void I422ToARGB4444Row_MSA(const uint8_t* src_y, uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width); +void I422ToARGB4444Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGB4444Row_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1134,6 
+1189,12 @@ void I422ToARGB1555Row_MSA(const uint8_t* src_y, uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width); +void I422ToARGB1555Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGB1555Row_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1958,6 +2019,7 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width); +void MirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -1965,17 +2027,20 @@ void MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void MirrorRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width); +void MirrorUVRow_LSX(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_LASX(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width); void 
MirrorUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void MirrorUVRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorSplitUVRow_SSSE3(const uint8_t* src, @@ -2005,6 +2070,7 @@ void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width); +void ARGBMirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_Any_AVX2(const uint8_t* src_ptr, @@ -2017,6 +2083,9 @@ void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBMirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBMirrorRow_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void ARGBMirrorRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -4731,6 +4800,12 @@ void I422ToARGBRow_Any_MSA(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I422ToARGBRow_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGBRow_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4743,6 +4818,12 @@ void I422ToRGBARow_Any_MSA(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int 
width); +void I422ToRGBARow_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToRGBARow_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4756,6 +4837,13 @@ void I422AlphaToARGBRow_Any_MSA(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I422AlphaToARGBRow_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422AlphaToARGBRow_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4769,6 +4857,12 @@ void I422ToRGB24Row_Any_MSA(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I422ToRGB24Row_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToRGB24Row_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4781,6 +4875,12 @@ void I422ToRGB565Row_Any_MSA(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I422ToRGB565Row_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToRGB565Row_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4793,6 +4893,12 @@ void I422ToARGB4444Row_Any_MSA(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I422ToARGB4444Row_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGB4444Row_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4805,6 
+4911,12 @@ void I422ToARGB1555Row_Any_MSA(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I422ToARGB1555Row_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGB1555Row_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4917,12 +5029,18 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, uint8_t* dst_v, int width); void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width); +void YUY2ToYRow_LSX(const uint8_t* src_yuy2, uint8_t* dst_y, int width); void YUY2ToYRow_LASX(const uint8_t* src_yuy2, uint8_t* dst_y, int width); void YUY2ToUVRow_MSA(const uint8_t* src_yuy2, int src_stride_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToUVRow_LSX(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void YUY2ToUVRow_LASX(const uint8_t* src_yuy2, int src_stride_yuy2, uint8_t* dst_u, @@ -4932,6 +5050,10 @@ void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToUV422Row_LSX(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void YUY2ToUV422Row_LASX(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, @@ -4993,12 +5115,18 @@ void YUY2ToUV422Row_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_v, int width); void YUY2ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void YUY2ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void YUY2ToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void YUY2ToUVRow_Any_MSA(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToUVRow_Any_LSX(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void YUY2ToUVRow_Any_LASX(const uint8_t* src_ptr, int src_stride_ptr, 
uint8_t* dst_u, @@ -5008,6 +5136,10 @@ void YUY2ToUV422Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToUV422Row_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void YUY2ToUV422Row_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -5243,6 +5375,11 @@ void I422ToYUY2Row_MSA(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_yuy2, int width); +void I422ToYUY2Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width); void I422ToYUY2Row_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -5253,6 +5390,11 @@ void I422ToUYVYRow_MSA(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_uyvy, int width); +void I422ToUYVYRow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width); void I422ToUYVYRow_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -5263,6 +5405,11 @@ void I422ToYUY2Row_Any_MSA(const uint8_t* y_buf, const uint8_t* v_buf, uint8_t* dst_ptr, int width); +void I422ToYUY2Row_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); void I422ToYUY2Row_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -5273,6 +5420,11 @@ void I422ToUYVYRow_Any_MSA(const uint8_t* y_buf, const uint8_t* v_buf, uint8_t* dst_ptr, int width); +void I422ToUYVYRow_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); void I422ToUYVYRow_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, diff --git a/source/convert.cc b/source/convert.cc index 0bcfbf20..ad679c59 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -1457,6 +1457,16 @@ int YUY2ToI420(const uint8_t* src_yuy2, } } #endif +#if defined(HAS_YUY2TOYROW_LSX) && defined(HAS_YUY2TOUVROW_LSX) + if 
(TestCpuFlag(kCpuHasLSX)) { + YUY2ToYRow = YUY2ToYRow_Any_LSX; + YUY2ToUVRow = YUY2ToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_LSX; + YUY2ToUVRow = YUY2ToUVRow_LSX; + } + } +#endif #if defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { YUY2ToYRow = YUY2ToYRow_Any_LASX; diff --git a/source/convert_argb.cc b/source/convert_argb.cc index c797a756..691208fd 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -120,6 +120,14 @@ int I420ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToARGBRow = I422ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_LSX; + } + } +#endif #if defined(HAS_I422TOARGBROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToARGBRow = I422ToARGBRow_Any_LASX; @@ -361,6 +369,14 @@ int I422ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToARGBRow = I422ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_LSX; + } + } +#endif #if defined(HAS_I422TOARGBROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToARGBRow = I422ToARGBRow_Any_LASX; @@ -2007,6 +2023,14 @@ int I420AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422ALPHATOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_LSX; + } + } +#endif #if defined(HAS_I422ALPHATOARGBROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LASX; @@ -2132,6 +2156,14 @@ int I422AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422ALPHATOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_LSX; + } + } +#endif #if 
defined(HAS_I422ALPHATOARGBROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LASX; @@ -4463,6 +4495,14 @@ int I422ToRGBAMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGBAROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToRGBARow = I422ToRGBARow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToRGBARow = I422ToRGBARow_LSX; + } + } +#endif #if defined(HAS_I422TORGBAROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToRGBARow = I422ToRGBARow_Any_LASX; @@ -4678,6 +4718,14 @@ int I420ToRGBAMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGBAROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToRGBARow = I422ToRGBARow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToRGBARow = I422ToRGBARow_LSX; + } + } +#endif #if defined(HAS_I422TORGBAROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToRGBARow = I422ToRGBARow_Any_LASX; @@ -4795,6 +4843,14 @@ int I420ToRGB24Matrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGB24ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToRGB24Row = I422ToRGB24Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_LSX; + } + } +#endif #if defined(HAS_I422TORGB24ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToRGB24Row = I422ToRGB24Row_Any_LASX; @@ -4984,6 +5040,14 @@ int I422ToRGB24Matrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGB24ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToRGB24Row = I422ToRGB24Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_LSX; + } + } +#endif #if defined(HAS_I422TORGB24ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToRGB24Row = I422ToRGB24Row_Any_LASX; @@ -5098,6 +5162,14 @@ int I420ToARGB1555(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGB1555ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToARGB1555Row = I422ToARGB1555Row_LSX; + } + } +#endif #if defined(HAS_I422TOARGB1555ROW_LASX) if 
(TestCpuFlag(kCpuHasLASX)) { I422ToARGB1555Row = I422ToARGB1555Row_Any_LASX; @@ -5179,6 +5251,14 @@ int I420ToARGB4444(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGB4444ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToARGB4444Row = I422ToARGB4444Row_LSX; + } + } +#endif #if defined(HAS_I422TOARGB4444ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToARGB4444Row = I422ToARGB4444Row_Any_LASX; @@ -5261,6 +5341,14 @@ int I420ToRGB565Matrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGB565ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToRGB565Row = I422ToRGB565Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToRGB565Row = I422ToRGB565Row_LSX; + } + } +#endif #if defined(HAS_I422TORGB565ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToRGB565Row = I422ToRGB565Row_Any_LASX; @@ -5393,6 +5481,14 @@ int I422ToRGB565Matrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGB565ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToRGB565Row = I422ToRGB565Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToRGB565Row = I422ToRGB565Row_LSX; + } + } +#endif #if defined(HAS_I422TORGB565ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToRGB565Row = I422ToRGB565Row_Any_LASX; @@ -5508,6 +5604,14 @@ int I420ToRGB565Dither(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToARGBRow = I422ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_LSX; + } + } +#endif #if defined(HAS_I422TOARGBROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToARGBRow = I422ToARGBRow_Any_LASX; diff --git a/source/convert_from.cc b/source/convert_from.cc index 8bd07e4c..4102d610 100644 --- a/source/convert_from.cc +++ b/source/convert_from.cc @@ -446,6 +446,14 @@ int I420ToYUY2(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOYUY2ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToYUY2Row = I422ToYUY2Row_Any_LSX; + if 
(IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_LSX; + } + } +#endif #if defined(HAS_I422TOYUY2ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToYUY2Row = I422ToYUY2Row_Any_LASX; @@ -533,6 +541,14 @@ int I422ToUYVY(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOUYVYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToUYVYRow = I422ToUYVYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_LSX; + } + } +#endif #if defined(HAS_I422TOUYVYROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToUYVYRow = I422ToUYVYRow_Any_LASX; @@ -608,6 +624,14 @@ int I420ToUYVY(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOUYVYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToUYVYRow = I422ToUYVYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_LSX; + } + } +#endif #if defined(HAS_I422TOUYVYROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToUYVYRow = I422ToUYVYRow_Any_LASX; diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index 47ee3437..7e6d8647 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -1117,6 +1117,14 @@ int ARGBToYUY2(const uint8_t* src_argb, } } #endif +#if defined(HAS_I422TOYUY2ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToYUY2Row = I422ToYUY2Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_LSX; + } + } +#endif #if defined(HAS_I422TOYUY2ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToYUY2Row = I422ToYUY2Row_Any_LASX; @@ -1288,6 +1296,14 @@ int ARGBToUYVY(const uint8_t* src_argb, } } #endif +#if defined(HAS_I422TOUYVYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToUYVYRow = I422ToUYVYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_LSX; + } + } +#endif #if defined(HAS_I422TOUYVYROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToUYVYRow = I422ToUYVYRow_Any_LASX; diff --git a/source/planar_functions.cc b/source/planar_functions.cc index c6f9d5c7..b0dc2f43 100644 --- a/source/planar_functions.cc +++ 
b/source/planar_functions.cc @@ -2113,6 +2113,16 @@ int YUY2ToI422(const uint8_t* src_yuy2, } } #endif +#if defined(HAS_YUY2TOYROW_LSX) && defined(HAS_YUY2TOUV422ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + YUY2ToYRow = YUY2ToYRow_Any_LSX; + YUY2ToUV422Row = YUY2ToUV422Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_LSX; + YUY2ToUV422Row = YUY2ToUV422Row_LSX; + } + } +#endif #if defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUV422ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { YUY2ToYRow = YUY2ToYRow_Any_LASX; @@ -2414,6 +2424,14 @@ void MirrorPlane(const uint8_t* src_y, } } #endif +#if defined(HAS_MIRRORROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + MirrorRow = MirrorRow_Any_LSX; + if (IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_LSX; + } + } +#endif #if defined(HAS_MIRRORROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { MirrorRow = MirrorRow_Any_LASX; @@ -2480,6 +2498,14 @@ void MirrorUVPlane(const uint8_t* src_uv, } } #endif +#if defined(HAS_MIRRORUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + MirrorUVRow = MirrorUVRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + MirrorUVRow = MirrorUVRow_LSX; + } + } +#endif #if defined(HAS_MIRRORUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { MirrorUVRow = MirrorUVRow_Any_LASX; @@ -2652,6 +2678,14 @@ int ARGBMirror(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBMIRRORROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBMirrorRow = ARGBMirrorRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBMirrorRow = ARGBMirrorRow_LSX; + } + } +#endif #if defined(HAS_ARGBMIRRORROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBMirrorRow = ARGBMirrorRow_Any_LASX; @@ -5314,6 +5348,14 @@ int YUY2ToNV12(const uint8_t* src_yuy2, } } #endif +#if defined(HAS_YUY2TOYROW_LSX) && defined(HAS_YUY2TOUV422ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + YUY2ToYRow = YUY2ToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_LSX; + } + } +#endif #if defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUV422ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) 
{ YUY2ToYRow = YUY2ToYRow_Any_LASX; diff --git a/source/rotate.cc b/source/rotate.cc index b1b4458e..6797ff02 100644 --- a/source/rotate.cc +++ b/source/rotate.cc @@ -178,6 +178,14 @@ void RotatePlane180(const uint8_t* src, } } #endif +#if defined(HAS_MIRRORROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + MirrorRow = MirrorRow_Any_LSX; + if (IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_LSX; + } + } +#endif #if defined(HAS_MIRRORROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { MirrorRow = MirrorRow_Any_LASX; diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc index 28226210..9667f34c 100644 --- a/source/rotate_argb.cc +++ b/source/rotate_argb.cc @@ -156,6 +156,14 @@ static int ARGBRotate180(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBMIRRORROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBMirrorRow = ARGBMirrorRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBMirrorRow = ARGBMirrorRow_LSX; + } + } +#endif #if defined(HAS_ARGBMIRRORROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBMirrorRow = ARGBMirrorRow_Any_LASX; diff --git a/source/row_any.cc b/source/row_any.cc index 37aa1fba..27b12a7a 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -113,6 +113,9 @@ ANY41C(I444AlphaToARGBRow_Any_MSA, I444AlphaToARGBRow_MSA, 0, 0, 4, 7) #ifdef HAS_I422ALPHATOARGBROW_MSA ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7) #endif +#ifdef HAS_I422ALPHATOARGBROW_LSX +ANY41C(I422AlphaToARGBRow_Any_LSX, I422AlphaToARGBRow_LSX, 1, 0, 4, 15) +#endif #ifdef HAS_I422ALPHATOARGBROW_LASX ANY41C(I422AlphaToARGBRow_Any_LASX, I422AlphaToARGBRow_LASX, 1, 0, 4, 15) #endif @@ -284,6 +287,9 @@ ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15) #ifdef HAS_I422TOYUY2ROW_MSA ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31) #endif +#ifdef HAS_I422TOYUY2ROW_LSX +ANY31(I422ToYUY2Row_Any_LSX, I422ToYUY2Row_LSX, 1, 1, 4, 15) +#endif #ifdef HAS_I422TOYUY2ROW_LASX ANY31(I422ToYUY2Row_Any_LASX, I422ToYUY2Row_LASX, 1, 1, 4, 31) #endif @@ -293,6 +299,9 
@@ ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15) #ifdef HAS_I422TOUYVYROW_MSA ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31) #endif +#ifdef HAS_I422TOUYVYROW_LSX +ANY31(I422ToUYVYRow_Any_LSX, I422ToUYVYRow_LSX, 1, 1, 4, 15) +#endif #ifdef HAS_I422TOUYVYROW_LASX ANY31(I422ToUYVYRow_Any_LASX, I422ToUYVYRow_LASX, 1, 1, 4, 31) #endif @@ -408,6 +417,14 @@ ANY31C(I422ToARGB4444Row_Any_MSA, I422ToARGB4444Row_MSA, 1, 0, 2, 7) ANY31C(I422ToARGB1555Row_Any_MSA, I422ToARGB1555Row_MSA, 1, 0, 2, 7) ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7) #endif +#ifdef HAS_I422TOARGBROW_LSX +ANY31C(I422ToARGBRow_Any_LSX, I422ToARGBRow_LSX, 1, 0, 4, 15) +ANY31C(I422ToRGBARow_Any_LSX, I422ToRGBARow_LSX, 1, 0, 4, 15) +ANY31C(I422ToRGB24Row_Any_LSX, I422ToRGB24Row_LSX, 1, 0, 3, 15) +ANY31C(I422ToRGB565Row_Any_LSX, I422ToRGB565Row_LSX, 1, 0, 2, 15) +ANY31C(I422ToARGB4444Row_Any_LSX, I422ToARGB4444Row_LSX, 1, 0, 2, 15) +ANY31C(I422ToARGB1555Row_Any_LSX, I422ToARGB1555Row_LSX, 1, 0, 2, 15) +#endif #ifdef HAS_I422TOARGBROW_LASX ANY31C(I422ToARGBRow_Any_LASX, I422ToARGBRow_LASX, 1, 0, 4, 31) ANY31C(I422ToRGBARow_Any_LASX, I422ToRGBARow_LASX, 1, 0, 4, 31) @@ -1204,6 +1221,9 @@ ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15) #ifdef HAS_YUY2TOYROW_MSA ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31) #endif +#ifdef HAS_YUY2TOYROW_LSX +ANY11(YUY2ToYRow_Any_LSX, YUY2ToYRow_LSX, 1, 4, 1, 15) +#endif #ifdef HAS_YUY2TOYROW_LASX ANY11(YUY2ToYRow_Any_LASX, YUY2ToYRow_LASX, 1, 4, 1, 31) #endif @@ -1842,6 +1862,9 @@ ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 31) #ifdef HAS_MIRRORROW_MSA ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63) #endif +#ifdef HAS_MIRRORROW_LSX +ANY11M(MirrorRow_Any_LSX, MirrorRow_LSX, 1, 31) +#endif #ifdef HAS_MIRRORROW_LASX ANY11M(MirrorRow_Any_LASX, MirrorRow_LASX, 1, 63) #endif @@ -1857,6 +1880,9 @@ ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31) #ifdef HAS_MIRRORUVROW_MSA ANY11M(MirrorUVRow_Any_MSA, 
MirrorUVRow_MSA, 2, 7) #endif +#ifdef HAS_MIRRORUVROW_LSX +ANY11M(MirrorUVRow_Any_LSX, MirrorUVRow_LSX, 2, 7) +#endif #ifdef HAS_MIRRORUVROW_LASX ANY11M(MirrorUVRow_Any_LASX, MirrorUVRow_LASX, 2, 15) #endif @@ -1872,6 +1898,9 @@ ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 7) #ifdef HAS_ARGBMIRRORROW_MSA ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15) #endif +#ifdef HAS_ARGBMIRRORROW_LSX +ANY11M(ARGBMirrorRow_Any_LSX, ARGBMirrorRow_LSX, 4, 7) +#endif #ifdef HAS_ARGBMIRRORROW_LASX ANY11M(ARGBMirrorRow_Any_LASX, ARGBMirrorRow_LASX, 4, 15) #endif @@ -1970,6 +1999,9 @@ ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15) ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31) ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31) #endif +#ifdef HAS_YUY2TOUV422ROW_LSX +ANY12(YUY2ToUV422Row_Any_LSX, YUY2ToUV422Row_LSX, 1, 4, 1, 15) +#endif #ifdef HAS_YUY2TOUV422ROW_LASX ANY12(ARGBToUV444Row_Any_LASX, ARGBToUV444Row_LASX, 0, 4, 0, 31) ANY12(YUY2ToUV422Row_Any_LASX, YUY2ToUV422Row_LASX, 1, 4, 1, 31) @@ -2251,6 +2283,9 @@ ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15) #ifdef HAS_YUY2TOUVROW_MSA ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31) #endif +#ifdef HAS_YUY2TOUVROW_LSX +ANY12S(YUY2ToUVRow_Any_LSX, YUY2ToUVRow_LSX, 1, 4, 15) +#endif #ifdef HAS_YUY2TOUVROW_LASX ANY12S(YUY2ToUVRow_Any_LASX, YUY2ToUVRow_LASX, 1, 4, 31) #endif diff --git a/source/row_lsx.cc b/source/row_lsx.cc index 9c1e16f2..48baafb7 100644 --- a/source/row_lsx.cc +++ b/source/row_lsx.cc @@ -31,6 +31,91 @@ extern "C" { yb = __lsx_vreplgr2vr_w(yuvconst->kYBiasToRgb[0]); \ } +// Load 32 YUV422 pixel data +#define READYUV422_D(psrc_y, psrc_u, psrc_v, out_y, uv_l, uv_h) \ + { \ + __m128i temp0, temp1; \ + \ + DUP2_ARG2(__lsx_vld, psrc_y, 0, psrc_u, 0, out_y, temp0); \ + temp1 = __lsx_vld(psrc_v, 0); \ + temp0 = __lsx_vsub_b(temp0, const_80); \ + temp1 = __lsx_vsub_b(temp1, const_80); \ + temp0 = __lsx_vsllwil_h_b(temp0, 0); \ + temp1 = 
__lsx_vsllwil_h_b(temp1, 0); \ + uv_l = __lsx_vilvl_h(temp0, temp1); \ + uv_h = __lsx_vilvh_h(temp0, temp1); \ + } + +// Load 16 YUV422 pixel data +#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, uv) \ + { \ + __m128i temp0, temp1; \ + \ + out_y = __lsx_vld(psrc_y, 0); \ + temp0 = __lsx_vldrepl_d(psrc_u, 0); \ + temp1 = __lsx_vldrepl_d(psrc_v, 0); \ + uv = __lsx_vilvl_b(temp0, temp1); \ + uv = __lsx_vsub_b(uv, const_80); \ + uv = __lsx_vsllwil_h_b(uv, 0); \ + } + +// Convert 16 pixels of YUV420 to RGB. +#define YUVTORGB_D(in_y, in_uvl, in_uvh, ubvr, ugvg, yg, yb, b_l, \ + b_h, g_l,g_h, r_l, r_h) \ + { \ + __m128i u_l, u_h, v_l, v_h; \ + __m128i yl_ev, yl_od, yh_ev, yh_od; \ + __m128i temp0, temp1, temp2, temp3; \ + \ + temp0 = __lsx_vilvl_b(in_y, in_y); \ + temp1 = __lsx_vilvh_b(in_y, in_y); \ + yl_ev = __lsx_vmulwev_w_hu_h(temp0, yg); \ + yl_od = __lsx_vmulwod_w_hu_h(temp0, yg); \ + yh_ev = __lsx_vmulwev_w_hu_h(temp1, yg); \ + yh_od = __lsx_vmulwod_w_hu_h(temp1, yg); \ + DUP4_ARG2(__lsx_vsrai_w, yl_ev, 16, yl_od, 16, yh_ev, 16, yh_od, 16, \ + yl_ev, yl_od, yh_ev, yh_od); \ + yl_ev = __lsx_vadd_w(yl_ev, yb); \ + yl_od = __lsx_vadd_w(yl_od, yb); \ + yh_ev = __lsx_vadd_w(yh_ev, yb); \ + yh_od = __lsx_vadd_w(yh_od, yb); \ + v_l = __lsx_vmulwev_w_h(in_uvl, ubvr); \ + u_l = __lsx_vmulwod_w_h(in_uvl, ubvr); \ + v_h = __lsx_vmulwev_w_h(in_uvh, ubvr); \ + u_h = __lsx_vmulwod_w_h(in_uvh, ubvr); \ + temp0 = __lsx_vadd_w(yl_ev, u_l); \ + temp1 = __lsx_vadd_w(yl_od, u_l); \ + temp2 = __lsx_vadd_w(yh_ev, u_h); \ + temp3 = __lsx_vadd_w(yh_od, u_h); \ + DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \ + temp1, temp2, temp3); \ + DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \ + temp2, temp3); \ + b_l = __lsx_vpackev_h(temp1, temp0); \ + b_h = __lsx_vpackev_h(temp3, temp2); \ + temp0 = __lsx_vadd_w(yl_ev, v_l); \ + temp1 = __lsx_vadd_w(yl_od, v_l); \ + temp2 = __lsx_vadd_w(yh_ev, v_h); \ + temp3 = __lsx_vadd_w(yh_od, v_h); \ + 
DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \ + temp1, temp2, temp3); \ + DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \ + temp2, temp3); \ + r_l = __lsx_vpackev_h(temp1, temp0); \ + r_h = __lsx_vpackev_h(temp3, temp2); \ + DUP2_ARG2(__lsx_vdp2_w_h, in_uvl, ugvg, in_uvh, ugvg, u_l, u_h); \ + temp0 = __lsx_vsub_w(yl_ev, u_l); \ + temp1 = __lsx_vsub_w(yl_od, u_l); \ + temp2 = __lsx_vsub_w(yh_ev, u_h); \ + temp3 = __lsx_vsub_w(yh_od, u_h); \ + DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \ + temp1, temp2, temp3); \ + DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \ + temp2, temp3); \ + g_l = __lsx_vpackev_h(temp1, temp0); \ + g_h = __lsx_vpackev_h(temp3, temp2); \ + } + // Convert 8 pixels of YUV420 to RGB. #define YUVTORGB(in_y, in_vu, vrub, vgug, yg, yb, out_b, out_g, out_r) \ { \ @@ -118,6 +203,26 @@ extern "C" { out_g = __lsx_vpackev_h(tmp1, tmp0); \ } +// Pack and Store 16 ARGB values. +#define STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, pdst_argb) \ + { \ + \ + __m128i temp0, temp1, temp2, temp3; \ + temp0 = __lsx_vpackev_b(g_l, b_l); \ + temp1 = __lsx_vpackev_b(a_l, r_l); \ + temp2 = __lsx_vpackev_b(g_h, b_h); \ + temp3 = __lsx_vpackev_b(a_h, r_h); \ + r_l = __lsx_vilvl_h(temp1, temp0); \ + r_h = __lsx_vilvh_h(temp1, temp0); \ + g_l = __lsx_vilvl_h(temp3, temp2); \ + g_h = __lsx_vilvh_h(temp3, temp2); \ + __lsx_vst(r_l, pdst_argb, 0); \ + __lsx_vst(r_h, pdst_argb, 16); \ + __lsx_vst(g_l, pdst_argb, 32); \ + __lsx_vst(g_h, pdst_argb, 48); \ + pdst_argb += 64; \ + } + // Pack and Store 8 ARGB values. 
#define STOREARGB(in_a, in_r, in_g, in_b, pdst_argb) \
  {                                                  \
  /* NOTE(review): STOREARGB's body is unchanged diff context elided from
   * this view; the fragment below is the tail of an earlier, unrelated
   * definition (leading context of the next diff hunk). */

    _dst0 = __lsx_vpickod_b(_reg1, _reg0); \
  }

// Mirror a row of bytes: dst[i] = src[width - 1 - i], 32 bytes per
// iteration (two vectors, each byte-reversed via shuffler, stored swapped).
// NOTE(review): no scalar tail -- assumes width is a multiple of 32, with
// remainders handled by the _Any_ wrapper; confirm at call sites.
void MirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width) {
  int x;
  int len = width / 32;
  __m128i src0, src1;
  // Byte indices 15..0: reverses a 16-byte vector.
  __m128i shuffler = {0x08090A0B0C0D0E0F, 0x0001020304050607};
  src += width - 32;  // start at the last 32-byte group
  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
    DUP2_ARG3(__lsx_vshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0,
              src1);
    __lsx_vst(src1, dst, 0);
    __lsx_vst(src0, dst, 16);
    dst += 32;
    src -= 32;
  }
}

// Mirror a row of interleaved U/V pairs: pair order is reversed but each
// 2-byte pair keeps its internal U,V order (halfword shuffle).  width is
// in pairs; 8 pairs (16 bytes) per iteration.
void MirrorUVRow_LSX(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  int x;
  int len = width / 8;
  __m128i src, dst;
  // Halfword indices 7..0: reverses the 8 U/V pairs in a vector.
  __m128i shuffler = {0x0004000500060007, 0x0000000100020003};

  src_uv += (width - 8) << 1;  // last group; <<1 because 2 bytes per pair
  for (x = 0; x < len; x++) {
    src = __lsx_vld(src_uv, 0);
    dst = __lsx_vshuf_h(shuffler, src, src);
    __lsx_vst(dst, dst_uv, 0);
    src_uv -= 16;
    dst_uv += 16;
  }
}

// Mirror a row of 4-byte ARGB pixels: pixel order is reversed, byte order
// within each pixel preserved.  width is in pixels; 8 pixels (32 bytes)
// per iteration.
void ARGBMirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width) {
  int x;
  int len = width / 8;
  __m128i src0, src1;
  // Reverses the four 4-byte pixels within a 16-byte vector.
  __m128i shuffler = {0x0B0A09080F0E0D0C, 0x0302010007060504};

  src += (width * 4) - 32;  // last 8-pixel group
  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
    DUP2_ARG3(__lsx_vshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0,
              src1);
    __lsx_vst(src1, dst, 0);
    __lsx_vst(src0, dst, 16);
    dst += 32;
    src -= 32;
  }
}

// Pack planar I422 into YUY2 (Y0,U0,Y1,V0,...): 16 Y + 8 U + 8 V in,
// 32 bytes out per iteration.
void I422ToYUY2Row_LSX(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
                       uint8_t* dst_yuy2,
                       int width) {
  int x;
  int len = width / 16;
  __m128i src_u0, src_v0, src_y0, vec_uv0;
  __m128i vec_yuy2_0, vec_yuy2_1;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src_u, 0, src_v, 0, src_u0, src_v0);
    src_y0 = __lsx_vld(src_y, 0);
    vec_uv0 = __lsx_vilvl_b(src_v0, src_u0);   // U,V interleaved
    vec_yuy2_0 = __lsx_vilvl_b(vec_uv0, src_y0);  // Y leads each pair
    vec_yuy2_1 = __lsx_vilvh_b(vec_uv0, src_y0);
    __lsx_vst(vec_yuy2_0, dst_yuy2, 0);
    __lsx_vst(vec_yuy2_1, dst_yuy2, 16);
    src_u += 8;
    src_v += 8;
    src_y += 16;
    dst_yuy2 += 32;
  }
}

// Pack planar I422 into UYVY (U0,Y0,V0,Y1,...): same as YUY2 but chroma
// byte leads each pair.
void I422ToUYVYRow_LSX(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
                       uint8_t* dst_uyvy,
                       int width) {
  int x;
  int len = width / 16;
  __m128i src_u0, src_v0, src_y0, vec_uv0;
  __m128i vec_uyvy0, vec_uyvy1;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src_u, 0, src_v, 0, src_u0, src_v0);
    src_y0 = __lsx_vld(src_y, 0);
    vec_uv0 = __lsx_vilvl_b(src_v0, src_u0);
    vec_uyvy0 = __lsx_vilvl_b(src_y0, vec_uv0);  // chroma leads each pair
    vec_uyvy1 = __lsx_vilvh_b(src_y0, vec_uv0);
    __lsx_vst(vec_uyvy0, dst_uyvy, 0);
    __lsx_vst(vec_uyvy1, dst_uyvy, 16);
    src_u += 8;
    src_v += 8;
    src_y += 16;
    dst_uyvy += 32;
  }
}

// Convert a row of I422 to ARGB with opaque alpha, 16 pixels per
// iteration, using the coefficients in yuvconstants.
void I422ToARGBRow_LSX(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  int len = width / 16;
  __m128i vec_yb, vec_yg, vec_ub, vec_ug, vec_vr, vec_vg;
  __m128i vec_ubvr, vec_ugvg;
  __m128i alpha = __lsx_vldi(0xFF);     // opaque alpha for every pixel
  __m128i const_80 = __lsx_vldi(0x80);  // chroma bias used by READYUV422_D

  YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
  // Pre-interleave coefficient pairs to match the interleaved U/V lanes.
  vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
  vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);

  for (x = 0; x < len; x++) {
    __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;

    READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
    YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
               g_h, r_l, r_h);
    STOREARGB_D(alpha, alpha, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb);
    src_y += 16;
    src_u += 8;
    src_v += 8;
  }
}

// Convert a row of I422 to RGBA: same math as I422ToARGBRow_LSX but the
// channels are rotated into STOREARGB_D so alpha lands in the first byte
// of each pixel.
void I422ToRGBARow_LSX(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  int len = width / 16;
  __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
  __m128i vec_ubvr, vec_ugvg;
  __m128i alpha = __lsx_vldi(0xFF);
  __m128i const_80 = __lsx_vldi(0x80);

  YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
  vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
  vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);

  for (x = 0; x < len; x++) {
    __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;

    READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
    YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
               g_h, r_l, r_h);
    // Arguments shifted one slot: (a<-r, r<-g, g<-b, b<-alpha).
    STOREARGB_D(r_l, r_h, g_l, g_h, b_l, b_h, alpha, alpha, dst_argb);
    src_y += 16;
    src_u += 8;
    src_v += 8;
  }
}

// Convert a row of I422 plus a per-pixel alpha plane to ARGB.  Main loop
// handles 16 pixels; a residual path converts one final 8-pixel group.
// NOTE(review): for res in 1..15 the tail emits at most one 8-pixel group,
// so res > 8 would leave pixels unwritten -- presumably the _Any_ wrapper
// only calls this with res handled elsewhere; confirm.
void I422AlphaToARGBRow_LSX(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            const uint8_t* src_a,
                            uint8_t* dst_argb,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  int x;
  int len = width / 16;
  int res = width & 15;
  __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
  __m128i vec_ubvr, vec_ugvg;
  __m128i zero = __lsx_vldi(0);
  __m128i const_80 = __lsx_vldi(0x80);

  YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
  vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
  vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);

  for (x = 0; x < len; x++) {
    __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h, a_l, a_h;

    // Widen 16 alpha bytes to 16-bit lanes (y reused as a scratch load).
    y = __lsx_vld(src_a, 0);
    a_l = __lsx_vilvl_b(zero, y);
    a_h = __lsx_vilvh_b(zero, y);
    READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
    YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
               g_h, r_l, r_h);
    STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb);
    src_y += 16;
    src_u += 8;
    src_v += 8;
    src_a += 16;
  }
  if (res) {
    // Residual 8-pixel group via the single-register macros.
    __m128i y, uv, r, g, b, a;
    a = __lsx_vld(src_a, 0);
    a = __lsx_vsllwil_hu_bu(a, 0);  // zero-extend 8 alpha bytes
    READYUV422(src_y, src_u, src_v, y, uv);
    YUVTORGB(y, uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b, g, r);
    STOREARGB(a, r, g, b, dst_argb);
  }
}

// Convert a row of I422 to packed 24-bit RGB24 (B,G,R per pixel):
// 16 pixels in, 48 bytes out per iteration.  The two shufflers splice the
// R lane bytes into the packed B,G pairs to form the 3-byte triplets.
void I422ToRGB24Row_LSX(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int32_t width) {
  int x;
  int len = width / 16;
  __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
  __m128i vec_ubvr, vec_ugvg;
  __m128i const_80 = __lsx_vldi(0x80);
  __m128i shuffler0 = {0x0504120302100100, 0x0A18090816070614};
  __m128i shuffler1 = {0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B};

  YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
  vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
  vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);

  for (x = 0; x < len; x++) {
    __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
    __m128i temp0, temp1, temp2, temp3;

    READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
    YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
               g_h, r_l, r_h);
    temp0 = __lsx_vpackev_b(g_l, b_l);  // B,G byte pairs, low 8 pixels
    temp1 = __lsx_vpackev_b(g_h, b_h);  // B,G byte pairs, high 8 pixels
    DUP4_ARG3(__lsx_vshuf_b, r_l, temp0, shuffler1, r_h, temp1, shuffler1,
              r_l, temp0, shuffler0, r_h, temp1, shuffler0, temp2, temp3, temp0,
              temp1);

    // Stitch the three 16-byte stores covering 16 BGR triplets.
    b_l = __lsx_vilvl_d(temp1, temp2);
    b_h = __lsx_vilvh_d(temp3, temp1);
    __lsx_vst(temp0, dst_argb, 0);
    __lsx_vst(b_l, dst_argb, 16);
    __lsx_vst(b_h, dst_argb, 32);
    dst_argb += 48;
    src_y += 16;
    src_u += 8;
    src_v += 8;
  }
}

// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R.
// Convert a row of I422 to RGB565 (16 bpp, R:5 G:6 B:5), 16 pixels per
// iteration: each channel is truncated to its field width, then fields are
// ORed together as r<<11 | g<<5 | b.
void I422ToRGB565Row_LSX(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_rgb565,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  int x;
  int len = width / 16;
  __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
  __m128i vec_ubvr, vec_ugvg;
  __m128i const_80 = __lsx_vldi(0x80);  // chroma bias used by READYUV422_D

  YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
  // Pre-interleave coefficient pairs to match the interleaved U/V lanes.
  vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
  vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);

  for (x = 0; x < len; x++) {
    __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;

    READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
    YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
               g_h, r_l, r_h);
    // Drop the low bits of each 8-bit channel: B->5, G->6, R->5 bits.
    b_l = __lsx_vsrli_h(b_l, 3);
    b_h = __lsx_vsrli_h(b_h, 3);
    g_l = __lsx_vsrli_h(g_l, 2);
    g_h = __lsx_vsrli_h(g_h, 2);
    r_l = __lsx_vsrli_h(r_l, 3);
    r_h = __lsx_vsrli_h(r_h, 3);
    // Position the fields and combine: rrrrrggg gggbbbbb.
    r_l = __lsx_vslli_h(r_l, 11);
    r_h = __lsx_vslli_h(r_h, 11);
    g_l = __lsx_vslli_h(g_l, 5);
    g_h = __lsx_vslli_h(g_h, 5);
    r_l = __lsx_vor_v(r_l, g_l);
    r_l = __lsx_vor_v(r_l, b_l);
    r_h = __lsx_vor_v(r_h, g_h);
    r_h = __lsx_vor_v(r_h, b_h);
    __lsx_vst(r_l, dst_rgb565, 0);
    __lsx_vst(r_h, dst_rgb565, 16);
    dst_rgb565 += 32;
    src_y += 16;
    src_u += 8;
    src_v += 8;
  }
}

// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G.
// Convert a row of I422 to ARGB4444 (16 bpp, 4 bits per channel, alpha
// forced to 0xF), 16 pixels per iteration.
void I422ToARGB4444Row_LSX(const uint8_t* src_y,
                           const uint8_t* src_u,
                           const uint8_t* src_v,
                           uint8_t* dst_argb4444,
                           const struct YuvConstants* yuvconstants,
                           int width) {
  int x;
  int len = width / 16;
  __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
  __m128i vec_ubvr, vec_ugvg;
  __m128i const_80 = __lsx_vldi(0x80);
  __m128i alpha = {0xF000F000F000F000, 0xF000F000F000F000};  // A field = 0xF
  __m128i mask = {0x00F000F000F000F0, 0x00F000F000F000F0};   // keeps G's high nibble

  YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
  vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
  vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);

  for (x = 0; x < len; x++) {
    __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;

    READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
    YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
               g_h, r_l, r_h);
    // Reduce each channel to 4 bits and assemble A|R|G|B nibbles.
    b_l = __lsx_vsrli_h(b_l, 4);
    b_h = __lsx_vsrli_h(b_h, 4);
    r_l = __lsx_vsrli_h(r_l, 4);
    r_h = __lsx_vsrli_h(r_h, 4);
    g_l = __lsx_vand_v(g_l, mask);
    g_h = __lsx_vand_v(g_h, mask);
    r_l = __lsx_vslli_h(r_l, 8);
    r_h = __lsx_vslli_h(r_h, 8);
    r_l = __lsx_vor_v(r_l, alpha);
    r_h = __lsx_vor_v(r_h, alpha);
    r_l = __lsx_vor_v(r_l, g_l);
    r_h = __lsx_vor_v(r_h, g_h);
    r_l = __lsx_vor_v(r_l, b_l);
    r_h = __lsx_vor_v(r_h, b_h);
    __lsx_vst(r_l, dst_argb4444, 0);
    __lsx_vst(r_h, dst_argb4444, 16);
    dst_argb4444 += 32;
    src_y += 16;
    src_u += 8;
    src_v += 8;
  }
}

// Convert a row of I422 to ARGB1555 (16 bpp, 5 bits per channel, alpha bit
// forced to 1), 16 pixels per iteration.
void I422ToARGB1555Row_LSX(const uint8_t* src_y,
                           const uint8_t* src_u,
                           const uint8_t* src_v,
                           uint8_t* dst_argb1555,
                           const struct YuvConstants* yuvconstants,
                           int width) {
  int x;
  int len = width / 16;
  __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
  __m128i vec_ubvr, vec_ugvg;
  __m128i const_80 = __lsx_vldi(0x80);
  __m128i alpha = {0x8000800080008000, 0x8000800080008000};  // A bit set

  YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
  vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
  vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);

  for (x = 0; x < len; x++) {
    __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;

    READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
    YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
               g_h, r_l, r_h);
    // Reduce each channel to 5 bits and place: a<<15 | r<<10 | g<<5 | b.
    b_l = __lsx_vsrli_h(b_l, 3);
    b_h = __lsx_vsrli_h(b_h, 3);
    g_l = __lsx_vsrli_h(g_l, 3);

    g_h = __lsx_vsrli_h(g_h, 3);
    g_l = __lsx_vslli_h(g_l, 5);
    g_h = __lsx_vslli_h(g_h, 5);
    r_l = __lsx_vsrli_h(r_l, 3);
    r_h = __lsx_vsrli_h(r_h, 3);
    r_l = __lsx_vslli_h(r_l, 10);
    r_h = __lsx_vslli_h(r_h, 10);
    r_l = __lsx_vor_v(r_l, alpha);
    r_h = __lsx_vor_v(r_h, alpha);
    r_l = __lsx_vor_v(r_l, g_l);
    r_h = __lsx_vor_v(r_h, g_h);
    r_l = __lsx_vor_v(r_l, b_l);
    r_h = __lsx_vor_v(r_h, b_h);
    __lsx_vst(r_l, dst_argb1555, 0);
    __lsx_vst(r_h, dst_argb1555, 16);
    dst_argb1555 += 32;
    src_y += 16;
    src_u += 8;
    src_v += 8;
  }
}

// Extract the luma plane from YUY2 (Y,U,Y,V,...): the even bytes of 32
// input bytes yield 16 Y bytes per iteration.
void YUY2ToYRow_LSX(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  int x;
  int len = width / 16;
  __m128i src0, src1, dst0;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src0, src1);
    dst0 = __lsx_vpickev_b(src1, src0);  // even bytes = Y samples
    __lsx_vst(dst0, dst_y, 0);
    src_yuy2 += 32;
    dst_y += 16;
  }
}

// Extract U and V planes from YUY2, averaging chroma over two rows
// (this row and src_yuy2 + src_stride_yuy2): 16 pixels -> 8 U + 8 V bytes
// per iteration.
void YUY2ToUVRow_LSX(const uint8_t* src_yuy2,
                     int src_stride_yuy2,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2;
  int x;
  int len = width / 16;
  __m128i src0, src1, src2, src3;
  __m128i tmp0, dst0, dst1;

  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src_yuy2_next, 0,
              src_yuy2_next, 16, src0, src1, src2, src3);
    src0 = __lsx_vpickod_b(src1, src0);  // odd bytes = U,V of row 0
    src1 = __lsx_vpickod_b(src3, src2);  // odd bytes = U,V of row 1
    tmp0 = __lsx_vavgr_bu(src1, src0);   // rounded vertical average
    dst0 = __lsx_vpickev_b(tmp0, tmp0);  // even = U
    dst1 = __lsx_vpickod_b(tmp0, tmp0);  // odd = V
    __lsx_vstelm_d(dst0, dst_u, 0, 0);
    __lsx_vstelm_d(dst1, dst_v, 0, 0);
    src_yuy2 += 32;
    src_yuy2_next += 32;
    dst_u += 8;
    dst_v += 8;
  }
}

// Extract U and V planes from a single YUY2 row (no vertical averaging):
// 16 pixels -> 8 U + 8 V bytes per iteration.
void YUY2ToUV422Row_LSX(const uint8_t* src_yuy2,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
  int x;
  int len = width / 16;
  __m128i src0, src1, tmp0, dst0, dst1;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src0, src1);
    tmp0 = __lsx_vpickod_b(src1, src0);  // odd bytes = interleaved U,V
    dst0 = __lsx_vpickev_b(tmp0, tmp0);  // U
    dst1 = __lsx_vpickod_b(tmp0, tmp0);  // V
    __lsx_vstelm_d(dst0, dst_u, 0, 0);
    __lsx_vstelm_d(dst1, dst_v, 0, 0);
    src_yuy2 += 32;
    dst_u += 8;
    dst_v += 8;
  }
}

/* NOTE(review): unchanged diff context -- head of the next row function;
 * its body lies outside this view. */
void ARGB4444ToARGBRow_LSX(const uint8_t* src_argb4444,
                           uint8_t* dst_argb,
                           int width) {

/* ---- second file in this diff: source/scale_argb.cc ----
 * Hunk inside ScaleYUVToARGBBilinearUp(): adds LSX runtime dispatch for
 * I422ToARGBRow, selecting the aligned kernel when src_width % 16 == 0.
 * Surrounding function body is elided diff context. */
#if defined(HAS_I422TOARGBROW_LSX)
  if (TestCpuFlag(kCpuHasLSX)) {
    I422ToARGBRow = I422ToARGBRow_Any_LSX;
    if (IS_ALIGNED(src_width, 16)) {
      I422ToARGBRow = I422ToARGBRow_LSX;
    }
  }
#endif
#if defined(HAS_I422TOARGBROW_LASX)
  if (TestCpuFlag(kCpuHasLASX)) {
    I422ToARGBRow = I422ToARGBRow_Any_LASX;