author	Bruce Lai <bruce.lai@sifive.com>	2023-07-04 22:05:46 -0700
committer	libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com>	2023-07-27 02:59:47 +0000
commit	c60ac4025cd88ec9139cc01d6021fa81a8d2c085 (patch)
tree	b3953e1ae0847255c46c46ec5e59d2736a3dc9e1
parent	10de943a12737be193e4f51df9e37b555ea09710 (diff)
download	libyuv-c60ac4025cd88ec9139cc01d6021fa81a8d2c085.tar.gz
[RVV] Enable ScaleRowDown38_RVV & ScaleRowDown38_{2,3}_Box_RVV
* Run on SiFive internal FPGA:

Test Case                      Speedup
I420ScaleDownBy3by8_None       4.2
I420ScaleDownBy3by8_Linear     1.7
I420ScaleDownBy3by8_Bilinear   1.7
I420ScaleDownBy3by8_Box        1.7
I444ScaleDownBy3by8_None       4.2
I444ScaleDownBy3by8_Linear     1.8
I444ScaleDownBy3by8_Bilinear   1.8
I444ScaleDownBy3by8_Box        1.8

Change-Id: Ic2e98de2494d9e7b25f5db115a7f21c618eaefed
Signed-off-by: Bruce Lai <bruce.lai@sifive.com>
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4711857
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
-rw-r--r--	include/libyuv/scale_row.h	14
-rw-r--r--	source/scale.cc	11
-rw-r--r--	source/scale_rvv.cc	148
3 files changed, 172 insertions(+), 1 deletion(-)
diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h
index 98829172..d825a8db 100644
--- a/include/libyuv/scale_row.h
+++ b/include/libyuv/scale_row.h
@@ -181,6 +181,7 @@ extern "C" {
#define HAS_SCALEARGBROWDOWNEVEN_RVV
#define HAS_SCALEROWDOWN2_RVV
#define HAS_SCALEROWDOWN34_RVV
+#define HAS_SCALEROWDOWN38_RVV
#define HAS_SCALEROWDOWN4_RVV
#define HAS_SCALEROWUP2_LINEAR_RVV
#define HAS_SCALEROWUP2_BILINEAR_RVV
@@ -1847,6 +1848,19 @@ void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
+void ScaleRowDown38_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown38_3_Box_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_2_Box_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+
void ScaleRowUp2_Linear_RVV(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
diff --git a/source/scale.cc b/source/scale.cc
index 0693362a..43d973af 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -711,6 +711,17 @@ static void ScalePlaneDown38(int src_width,
}
}
#endif
+#if defined(HAS_SCALEROWDOWN38_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
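+    // Point-sample (3 of every 8 pixels) when no filtering is requested;
+    // otherwise box-average across two or three source rows.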
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_RVV;
+ ScaleRowDown38_2 = ScaleRowDown38_RVV;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_RVV;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_RVV;
+ }
+ }
+#endif
for (y = 0; y < dst_height - 2; y += 3) {
ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
diff --git a/source/scale_rvv.cc b/source/scale_rvv.cc
index 98118831..c7a51d51 100644
--- a/source/scale_rvv.cc
+++ b/source/scale_rvv.cc
@@ -21,8 +21,8 @@
// This module is for clang RVV. GCC does not yet support segment load & store.
#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) && \
defined(__clang__)
+#include <assert.h>
#include <riscv_vector.h>
-
#ifdef __cplusplus
namespace libyuv {
extern "C" {
@@ -463,6 +463,152 @@ void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr,
} while (w > 0);
}
+void ScaleRowDown38_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ size_t w = (size_t)dst_width / 3u;
+ (void)src_stride;
+ assert(dst_width % 3 == 0);
+ do {
+ vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7;
+ size_t vl = __riscv_vsetvl_e8m1(w);
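+    // Load groups of 8 source pixels and keep pixels 0, 3 and 6 of each
+    // group: 3 output pixels per 8 input pixels.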
+ __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6,
+ &v_s7, src_ptr, vl);
+ __riscv_vsseg3e8_v_u8m1(dst_ptr, v_s0, v_s3, v_s6, vl);
+ w -= vl;
+ src_ptr += 8 * vl;
+ dst_ptr += 3 * vl;
+ } while (w > 0);
+}
+
+void ScaleRowDown38_2_Box_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ size_t w = (size_t)dst_width / 3u;
+ const uint16_t coeff_a = (65536u / 6u);
+ const uint16_t coeff_b = (65536u / 4u);
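+  // vmulhu keeps the upper 16 bits of the 16x16-bit product, so multiplying
+  // by 65536/N approximates division by N: coeff_a divides the 6-pixel (3x2)
+  // e/f boxes, coeff_b the 4-pixel (2x2) g box.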
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ do {
+ vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7;
+ vuint8m1_t v_t0, v_t1, v_t2, v_t3, v_t4, v_t5, v_t6, v_t7;
+ vuint16m2_t v_e0, v_e1, v_e2, v_e;
+ vuint16m2_t v_f0, v_f1, v_f2, v_f;
+ vuint16m2_t v_g0, v_g1, v_g;
+ vuint8m1_t v_dst_e, v_dst_f, v_dst_g;
+ size_t vl = __riscv_vsetvl_e8m1(w);
+ // s: e00, e10, e20, f00, f10, f20, g00, g10
+ // t: e01, e11, e21, f01, f11, f21, g01, g11
+ __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6,
+ &v_s7, src_ptr, vl);
+ __riscv_vlseg8e8_v_u8m1(&v_t0, &v_t1, &v_t2, &v_t3, &v_t4, &v_t5, &v_t6,
+ &v_t7, src_ptr + src_stride, vl);
+ // Calculate sum of [e00, e21] to v_e
+ // Calculate sum of [f00, f21] to v_f
+ // Calculate sum of [g00, g11] to v_g
+ v_e0 = __riscv_vwaddu_vv_u16m2(v_s0, v_t0, vl);
+ v_e1 = __riscv_vwaddu_vv_u16m2(v_s1, v_t1, vl);
+ v_e2 = __riscv_vwaddu_vv_u16m2(v_s2, v_t2, vl);
+ v_f0 = __riscv_vwaddu_vv_u16m2(v_s3, v_t3, vl);
+ v_f1 = __riscv_vwaddu_vv_u16m2(v_s4, v_t4, vl);
+ v_f2 = __riscv_vwaddu_vv_u16m2(v_s5, v_t5, vl);
+ v_g0 = __riscv_vwaddu_vv_u16m2(v_s6, v_t6, vl);
+ v_g1 = __riscv_vwaddu_vv_u16m2(v_s7, v_t7, vl);
+
+ v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e1, vl);
+ v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f1, vl);
+ v_e = __riscv_vadd_vv_u16m2(v_e0, v_e2, vl);
+ v_f = __riscv_vadd_vv_u16m2(v_f0, v_f2, vl);
+ v_g = __riscv_vadd_vv_u16m2(v_g0, v_g1, vl);
+
+ // Average in 16-bit fixed-point
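+    // e.g. a full 3x2 box sums to 6 * 255 = 1530, and
+    // (1530 * 10922) >> 16 = 254, within one LSB of the exact 255.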
+ v_e = __riscv_vmulhu_vx_u16m2(v_e, coeff_a, vl);
+ v_f = __riscv_vmulhu_vx_u16m2(v_f, coeff_a, vl);
+ v_g = __riscv_vmulhu_vx_u16m2(v_g, coeff_b, vl);
+
+ v_dst_e = __riscv_vnsrl_wx_u8m1(v_e, 0, vl);
+ v_dst_f = __riscv_vnsrl_wx_u8m1(v_f, 0, vl);
+ v_dst_g = __riscv_vnsrl_wx_u8m1(v_g, 0, vl);
+
+ __riscv_vsseg3e8_v_u8m1(dst_ptr, v_dst_e, v_dst_f, v_dst_g, vl);
+ w -= vl;
+ src_ptr += 8 * vl;
+ dst_ptr += 3 * vl;
+ } while (w > 0);
+}
+
+void ScaleRowDown38_3_Box_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ size_t w = (size_t)dst_width / 3u;
+ const uint16_t coeff_a = (65536u / 9u);
+ const uint16_t coeff_b = (65536u / 6u);
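+  // Same fixed-point trick as the two-row version: coeff_a divides the
+  // 9-pixel (3x3) e/f boxes, coeff_b the 6-pixel (2x3) g box.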
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ do {
+ vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7;
+ vuint8m1_t v_t0, v_t1, v_t2, v_t3, v_t4, v_t5, v_t6, v_t7;
+ vuint8m1_t v_u0, v_u1, v_u2, v_u3, v_u4, v_u5, v_u6, v_u7;
+ vuint16m2_t v_e0, v_e1, v_e2, v_e3, v_e4, v_e;
+ vuint16m2_t v_f0, v_f1, v_f2, v_f3, v_f4, v_f;
+ vuint16m2_t v_g0, v_g1, v_g2, v_g;
+ vuint8m1_t v_dst_e, v_dst_f, v_dst_g;
+ size_t vl = __riscv_vsetvl_e8m1(w);
+ // s: e00, e10, e20, f00, f10, f20, g00, g10
+ // t: e01, e11, e21, f01, f11, f21, g01, g11
+ // u: e02, e12, e22, f02, f12, f22, g02, g12
+ __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6,
+ &v_s7, src_ptr, vl);
+ __riscv_vlseg8e8_v_u8m1(&v_t0, &v_t1, &v_t2, &v_t3, &v_t4, &v_t5, &v_t6,
+ &v_t7, src_ptr + src_stride, vl);
+ __riscv_vlseg8e8_v_u8m1(&v_u0, &v_u1, &v_u2, &v_u3, &v_u4, &v_u5, &v_u6,
+ &v_u7, src_ptr + 2 * src_stride, vl);
+ // Calculate sum of [e00, e22]
+ v_e0 = __riscv_vwaddu_vv_u16m2(v_s0, v_t0, vl);
+ v_e1 = __riscv_vwaddu_vv_u16m2(v_s1, v_t1, vl);
+ v_e2 = __riscv_vwaddu_vv_u16m2(v_s2, v_t2, vl);
+ v_e3 = __riscv_vwaddu_vv_u16m2(v_u0, v_u1, vl);
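+    // vwaddu.vx with 0 simply widens the lone e22 term from u8 to u16.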
+ v_e4 = __riscv_vwaddu_vx_u16m2(v_u2, 0, vl);
+
+ v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e1, vl);
+ v_e2 = __riscv_vadd_vv_u16m2(v_e2, v_e3, vl);
+ v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e4, vl);
+ v_e = __riscv_vadd_vv_u16m2(v_e0, v_e2, vl);
+ // Calculate sum of [f00, f22]
+ v_f0 = __riscv_vwaddu_vv_u16m2(v_s3, v_t3, vl);
+ v_f1 = __riscv_vwaddu_vv_u16m2(v_s4, v_t4, vl);
+ v_f2 = __riscv_vwaddu_vv_u16m2(v_s5, v_t5, vl);
+ v_f3 = __riscv_vwaddu_vv_u16m2(v_u3, v_u4, vl);
+ v_f4 = __riscv_vwaddu_vx_u16m2(v_u5, 0, vl);
+
+ v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f1, vl);
+ v_f2 = __riscv_vadd_vv_u16m2(v_f2, v_f3, vl);
+ v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f4, vl);
+ v_f = __riscv_vadd_vv_u16m2(v_f0, v_f2, vl);
+ // Calculate sum of [g00, g12]
+ v_g0 = __riscv_vwaddu_vv_u16m2(v_s6, v_t6, vl);
+ v_g1 = __riscv_vwaddu_vv_u16m2(v_s7, v_t7, vl);
+ v_g2 = __riscv_vwaddu_vv_u16m2(v_u6, v_u7, vl);
+
+ v_g = __riscv_vadd_vv_u16m2(v_g0, v_g1, vl);
+ v_g = __riscv_vadd_vv_u16m2(v_g, v_g2, vl);
+
+ // Average in 16-bit fixed-point
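+    // e.g. a full 3x3 box: (9 * 255) * 7281 >> 16 = 254.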
+ v_e = __riscv_vmulhu_vx_u16m2(v_e, coeff_a, vl);
+ v_f = __riscv_vmulhu_vx_u16m2(v_f, coeff_a, vl);
+ v_g = __riscv_vmulhu_vx_u16m2(v_g, coeff_b, vl);
+
+ v_dst_e = __riscv_vnsrl_wx_u8m1(v_e, 0, vl);
+ v_dst_f = __riscv_vnsrl_wx_u8m1(v_f, 0, vl);
+ v_dst_g = __riscv_vnsrl_wx_u8m1(v_g, 0, vl);
+ __riscv_vsseg3e8_v_u8m1(dst_ptr, v_dst_e, v_dst_f, v_dst_g, vl);
+ w -= vl;
+ src_ptr += 8 * vl;
+ dst_ptr += 3 * vl;
+ } while (w > 0);
+}
+
// The ScaleUVRowUp2_(Bi)linear_RVV functions are equivalent to other
// platforms' ScaleRowUp2_(Bi)linear_Any_XXX. We process the entire row in
// these functions; other platforms implement only the non-edge part of the
// image and handle the edge with scalar code.