author      Bruce Lai <bruce.lai@sifive.com>    2023-07-04 22:05:46 -0700
committer   libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com>    2023-07-27 02:59:47 +0000
commit      c60ac4025cd88ec9139cc01d6021fa81a8d2c085 (patch)
tree        b3953e1ae0847255c46c46ec5e59d2736a3dc9e1
parent      10de943a12737be193e4f51df9e37b555ea09710 (diff)
download    libyuv-c60ac4025cd88ec9139cc01d6021fa81a8d2c085.tar.gz
[RVV] Enable ScaleRowDown38_RVV & ScaleRowDown38_{2,3}_Box_RVV
* Run on SiFive internal FPGA:
Test Case                      Speedup
I420ScaleDownBy3by8_None       4.2
I420ScaleDownBy3by8_Linear     1.7
I420ScaleDownBy3by8_Bilinear   1.7
I420ScaleDownBy3by8_Box        1.7
I444ScaleDownBy3by8_None       4.2
I444ScaleDownBy3by8_Linear     1.8
I444ScaleDownBy3by8_Bilinear   1.8
I444ScaleDownBy3by8_Box        1.8
Change-Id: Ic2e98de2494d9e7b25f5db115a7f21c618eaefed
Signed-off-by: Bruce Lai <bruce.lai@sifive.com>
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4711857
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
-rw-r--r--   include/libyuv/scale_row.h |  14
-rw-r--r--   source/scale.cc            |  11
-rw-r--r--   source/scale_rvv.cc        | 148
3 files changed, 172 insertions(+), 1 deletion(-)
diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h
index 98829172..d825a8db 100644
--- a/include/libyuv/scale_row.h
+++ b/include/libyuv/scale_row.h
@@ -181,6 +181,7 @@ extern "C" {
 #define HAS_SCALEARGBROWDOWNEVEN_RVV
 #define HAS_SCALEROWDOWN2_RVV
 #define HAS_SCALEROWDOWN34_RVV
+#define HAS_SCALEROWDOWN38_RVV
 #define HAS_SCALEROWDOWN4_RVV
 #define HAS_SCALEROWUP2_LINEAR_RVV
 #define HAS_SCALEROWUP2_BILINEAR_RVV
@@ -1847,6 +1848,19 @@ void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width);
+void ScaleRowDown38_RVV(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width);
+void ScaleRowDown38_3_Box_RVV(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown38_2_Box_RVV(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+
 void ScaleRowUp2_Linear_RVV(const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             int dst_width);
diff --git a/source/scale.cc b/source/scale.cc
index 0693362a..43d973af 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -711,6 +711,17 @@ static void ScalePlaneDown38(int src_width,
     }
   }
 #endif
+#if defined(HAS_SCALEROWDOWN38_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    if (!filtering) {
+      ScaleRowDown38_3 = ScaleRowDown38_RVV;
+      ScaleRowDown38_2 = ScaleRowDown38_RVV;
+    } else {
+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_RVV;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_RVV;
+    }
+  }
+#endif
 
   for (y = 0; y < dst_height - 2; y += 3) {
     ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
diff --git a/source/scale_rvv.cc b/source/scale_rvv.cc
index 98118831..c7a51d51 100644
--- a/source/scale_rvv.cc
+++ b/source/scale_rvv.cc
@@ -21,8 +21,8 @@
 // This module is for clang rvv. GCC hasn't supported segment load & store.
 #if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) && \
     defined(__clang__)
+#include <assert.h>
 #include <riscv_vector.h>
-
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
@@ -463,6 +463,152 @@ void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr,
   } while (w > 0);
 }
 
+void ScaleRowDown38_RVV(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width) {
+  size_t w = (size_t)dst_width / 3u;
+  (void)src_stride;
+  assert(dst_width % 3 == 0);
+  do {
+    vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7;
+    size_t vl = __riscv_vsetvl_e8m1(w);
+    __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6,
+                            &v_s7, src_ptr, vl);
+    __riscv_vsseg3e8_v_u8m1(dst_ptr, v_s0, v_s3, v_s6, vl);
+    w -= vl;
+    src_ptr += 8 * vl;
+    dst_ptr += 3 * vl;
+  } while (w > 0);
+}
+
+void ScaleRowDown38_2_Box_RVV(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width) {
+  size_t w = (size_t)dst_width / 3u;
+  const uint16_t coeff_a = (65536u / 6u);
+  const uint16_t coeff_b = (65536u / 4u);
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  do {
+    vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7;
+    vuint8m1_t v_t0, v_t1, v_t2, v_t3, v_t4, v_t5, v_t6, v_t7;
+    vuint16m2_t v_e0, v_e1, v_e2, v_e;
+    vuint16m2_t v_f0, v_f1, v_f2, v_f;
+    vuint16m2_t v_g0, v_g1, v_g;
+    vuint8m1_t v_dst_e, v_dst_f, v_dst_g;
+    size_t vl = __riscv_vsetvl_e8m1(w);
+    // s: e00, e10, e20, f00, f10, f20, g00, g10
+    // t: e01, e11, e21, f01, f11, f21, g01, g11
+    __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6,
+                            &v_s7, src_ptr, vl);
+    __riscv_vlseg8e8_v_u8m1(&v_t0, &v_t1, &v_t2, &v_t3, &v_t4, &v_t5, &v_t6,
+                            &v_t7, src_ptr + src_stride, vl);
+    // Calculate sum of [e00, e21] to v_e
+    // Calculate sum of [f00, f21] to v_f
+    // Calculate sum of [g00, g11] to v_g
+    v_e0 = __riscv_vwaddu_vv_u16m2(v_s0, v_t0, vl);
+    v_e1 = __riscv_vwaddu_vv_u16m2(v_s1, v_t1, vl);
+    v_e2 = __riscv_vwaddu_vv_u16m2(v_s2, v_t2, vl);
+    v_f0 = __riscv_vwaddu_vv_u16m2(v_s3, v_t3, vl);
+    v_f1 = __riscv_vwaddu_vv_u16m2(v_s4, v_t4, vl);
+    v_f2 = __riscv_vwaddu_vv_u16m2(v_s5, v_t5, vl);
+    v_g0 = __riscv_vwaddu_vv_u16m2(v_s6, v_t6, vl);
+    v_g1 = __riscv_vwaddu_vv_u16m2(v_s7, v_t7, vl);
+
+    v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e1, vl);
+    v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f1, vl);
+    v_e = __riscv_vadd_vv_u16m2(v_e0, v_e2, vl);
+    v_f = __riscv_vadd_vv_u16m2(v_f0, v_f2, vl);
+    v_g = __riscv_vadd_vv_u16m2(v_g0, v_g1, vl);
+
+    // Average in 16-bit fixed-point
+    v_e = __riscv_vmulhu_vx_u16m2(v_e, coeff_a, vl);
+    v_f = __riscv_vmulhu_vx_u16m2(v_f, coeff_a, vl);
+    v_g = __riscv_vmulhu_vx_u16m2(v_g, coeff_b, vl);
+
+    v_dst_e = __riscv_vnsrl_wx_u8m1(v_e, 0, vl);
+    v_dst_f = __riscv_vnsrl_wx_u8m1(v_f, 0, vl);
+    v_dst_g = __riscv_vnsrl_wx_u8m1(v_g, 0, vl);
+
+    __riscv_vsseg3e8_v_u8m1(dst_ptr, v_dst_e, v_dst_f, v_dst_g, vl);
+    w -= vl;
+    src_ptr += 8 * vl;
+    dst_ptr += 3 * vl;
+  } while (w > 0);
+}
+
+void ScaleRowDown38_3_Box_RVV(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width) {
+  size_t w = (size_t)dst_width / 3u;
+  const uint16_t coeff_a = (65536u / 9u);
+  const uint16_t coeff_b = (65536u / 6u);
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  do {
+    vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7;
+    vuint8m1_t v_t0, v_t1, v_t2, v_t3, v_t4, v_t5, v_t6, v_t7;
+    vuint8m1_t v_u0, v_u1, v_u2, v_u3, v_u4, v_u5, v_u6, v_u7;
+    vuint16m2_t v_e0, v_e1, v_e2, v_e3, v_e4, v_e;
+    vuint16m2_t v_f0, v_f1, v_f2, v_f3, v_f4, v_f;
+    vuint16m2_t v_g0, v_g1, v_g2, v_g;
+    vuint8m1_t v_dst_e, v_dst_f, v_dst_g;
+    size_t vl = __riscv_vsetvl_e8m1(w);
+    // s: e00, e10, e20, f00, f10, f20, g00, g10
+    // t: e01, e11, e21, f01, f11, f21, g01, g11
+    // u: e02, e12, e22, f02, f12, f22, g02, g12
+    __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6,
+                            &v_s7, src_ptr, vl);
+    __riscv_vlseg8e8_v_u8m1(&v_t0, &v_t1, &v_t2, &v_t3, &v_t4, &v_t5, &v_t6,
+                            &v_t7, src_ptr + src_stride, vl);
+    __riscv_vlseg8e8_v_u8m1(&v_u0, &v_u1, &v_u2, &v_u3, &v_u4, &v_u5, &v_u6,
+                            &v_u7, src_ptr + 2 * src_stride, vl);
+    // Calculate sum of [e00, e22]
+    v_e0 = __riscv_vwaddu_vv_u16m2(v_s0, v_t0, vl);
+    v_e1 = __riscv_vwaddu_vv_u16m2(v_s1, v_t1, vl);
+    v_e2 = __riscv_vwaddu_vv_u16m2(v_s2, v_t2, vl);
+    v_e3 = __riscv_vwaddu_vv_u16m2(v_u0, v_u1, vl);
+    v_e4 = __riscv_vwaddu_vx_u16m2(v_u2, 0, vl);
+
+    v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e1, vl);
+    v_e2 = __riscv_vadd_vv_u16m2(v_e2, v_e3, vl);
+    v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e4, vl);
+    v_e = __riscv_vadd_vv_u16m2(v_e0, v_e2, vl);
+    // Calculate sum of [f00, f22]
+    v_f0 = __riscv_vwaddu_vv_u16m2(v_s3, v_t3, vl);
+    v_f1 = __riscv_vwaddu_vv_u16m2(v_s4, v_t4, vl);
+    v_f2 = __riscv_vwaddu_vv_u16m2(v_s5, v_t5, vl);
+    v_f3 = __riscv_vwaddu_vv_u16m2(v_u3, v_u4, vl);
+    v_f4 = __riscv_vwaddu_vx_u16m2(v_u5, 0, vl);
+
+    v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f1, vl);
+    v_f2 = __riscv_vadd_vv_u16m2(v_f2, v_f3, vl);
+    v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f4, vl);
+    v_f = __riscv_vadd_vv_u16m2(v_f0, v_f2, vl);
+    // Calculate sum of [g00, g12]
+    v_g0 = __riscv_vwaddu_vv_u16m2(v_s6, v_t6, vl);
+    v_g1 = __riscv_vwaddu_vv_u16m2(v_s7, v_t7, vl);
+    v_g2 = __riscv_vwaddu_vv_u16m2(v_u6, v_u7, vl);
+
+    v_g = __riscv_vadd_vv_u16m2(v_g0, v_g1, vl);
+    v_g = __riscv_vadd_vv_u16m2(v_g, v_g2, vl);
+
+    // Average in 16-bit fixed-point
+    v_e = __riscv_vmulhu_vx_u16m2(v_e, coeff_a, vl);
+    v_f = __riscv_vmulhu_vx_u16m2(v_f, coeff_a, vl);
+    v_g = __riscv_vmulhu_vx_u16m2(v_g, coeff_b, vl);
+
+    v_dst_e = __riscv_vnsrl_wx_u8m1(v_e, 0, vl);
+    v_dst_f = __riscv_vnsrl_wx_u8m1(v_f, 0, vl);
+    v_dst_g = __riscv_vnsrl_wx_u8m1(v_g, 0, vl);
+    __riscv_vsseg3e8_v_u8m1(dst_ptr, v_dst_e, v_dst_f, v_dst_g, vl);
+    w -= vl;
+    src_ptr += 8 * vl;
+    dst_ptr += 3 * vl;
+  } while (w > 0);
+}
+
 // ScaleUVRowUp2_(Bi)linear_RVV function is equal to other platforms'
 // ScaleRowUp2_(Bi)linear_Any_XXX. We process entire row in this function. Other
 // platforms only implement non-edge part of image and process edge with scalar.
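
Side note (not part of the commit): the Box kernels above avoid a per-pixel divide by scaling each 16-bit box sum with a precomputed reciprocal, floor(65536 / N) (the coeff_a / coeff_b constants), keeping the upper half of the product with __riscv_vmulhu_vx_u16m2 and narrowing it with __riscv_vnsrl_wx_u8m1. The scalar C sketch below models only that arithmetic; the helper name and sample values are illustrative and do not appear in libyuv.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Scalar model of the fixed-point averaging used by the Box kernels:
 * multiplying a 16-bit sum by floor(65536 / n) and keeping the upper 16 bits
 * of the 32-bit product approximates sum / n without a division.
 * The 3-row box uses n = 9 and 6; the 2-row box uses n = 6 and 4. */
static uint8_t average_u16_fixed_point(uint16_t sum, unsigned n) {
  uint16_t coeff = (uint16_t)(65536u / n);    /* same form as coeff_a / coeff_b */
  uint32_t product = (uint32_t)sum * coeff;   /* full 32-bit product */
  uint16_t high = (uint16_t)(product >> 16);  /* upper half, as __riscv_vmulhu returns */
  return (uint8_t)high;                       /* low byte, as __riscv_vnsrl_wx_u8m1(..., 0, vl) keeps */
}

int main(void) {
  /* A 3x3 box of 8-bit pixels sums to at most 9 * 255 = 2295, which fits in 16 bits. */
  uint16_t sum = 9 * 200;
  assert(average_u16_fixed_point(sum, 9) == 199);  /* exact average is 200; truncation biases down by at most 1 */
  printf("%u\n", (unsigned)average_u16_fixed_point(sum, 9));
  return 0;
}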