From 5d694bec38c39b03f9eb4339fb0e21f32dbba9bd Mon Sep 17 00:00:00 2001
From: George Steed
Date: Tue, 12 Mar 2024 22:41:52 +0000
Subject: [AArch64] Replace UQSHRN{,2} pair by UZP2 in YUVTORGB

The existing Neon code makes use of a pair of UQSHRN and UQSHRN2 instructions to extract the top half of a widened multiply result. These instructions would ordinarily saturate; however, saturation can never happen in this case: since we are shifting by 16 to get the top half of each element, the top bits remain as-is.

We could move this to using a slightly simpler non-saturating shift; however, in this case it is simpler and faster to just use UZP2 to extract the top half of each 32-bit lane directly.

Reduction in runtime for selected kernels:

Kernel                  | Cortex-A55 | Cortex-A76 | Cortex-X2
I400ToARGBRow_NEON      | -9.4% | -14.9% | -13.9%
I422AlphaToARGBRow_NEON | -7.9% | -11.4% | -11.5%
I422ToARGB1555Row_NEON  | -7.3% | -17.2% | -14.7%
I422ToARGB4444Row_NEON  | -7.6% | -17.9% | -13.7%
I422ToARGBRow_NEON      | -8.2% | -9.8% | -11.9%
I422ToRGB24Row_NEON     | -8.0% | -13.3% | -12.8%
I422ToRGB565Row_NEON    | -7.5% | -15.1% | -14.6%
I422ToRGBARow_NEON      | -8.3% | -13.1% | -12.2%
I444AlphaToARGBRow_NEON | -8.3% | -7.6% | -12.7%
I444ToARGBRow_NEON      | -8.6% | -3.5% | -13.5%
I444ToRGB24Row_NEON     | -8.5% | -7.8% | -13.4%
NV12ToARGBRow_NEON      | -8.8% | -1.4% | -12.0%
NV12ToRGB24Row_NEON     | -8.5% | -11.5% | -12.3%
NV12ToRGB565Row_NEON    | -7.9% | -15.0% | -15.7%
NV21ToARGBRow_NEON      | -8.7% | -1.6% | -12.3%
NV21ToRGB24Row_NEON     | -8.4% | -11.5% | -12.0%
UYVYToARGBRow_NEON      | -8.8% | -8.9% | -11.9%
YUY2ToARGBRow_NEON      | -8.7% | -10.8% | -13.3%

Bug: libyuv:976
Change-Id: I6c505fe722e5f91f93718b85fe881ad056d8602d
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5366653
Reviewed-by: Frank Barchard
Commit-Queue: Frank Barchard
---
 source/row_neon64.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index d37b8dad..ac22a198 100644
--- 
a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -101,8 +101,7 @@ static const uvec8 kNV21InterleavedTable = {1, 1, 5, 5, 9, 9, 13, 13, "umull v6.8h, v1.8b, v30.8b \n" \ "umull v0.4s, v0.4h, v24.4h \n" \ "umlal2 v6.8h, v1.16b, v31.16b \n" /* DG */ \ - "uqshrn v0.4h, v0.4s, #16 \n" \ - "uqshrn2 v0.8h, v3.4s, #16 \n" /* Y */ \ + "uzp2 v0.8h, v0.8h, v3.8h \n" /* Y */ \ "umull v4.8h, v1.8b, v28.8b \n" /* DB */ \ "umull2 v5.8h, v1.16b, v29.16b \n" /* DR */ \ "add v17.8h, v0.8h, v26.8h \n" /* G */ \ -- cgit v1.2.3