diff options
author | Frank Barchard <fbarchard@google.com> | 2022-06-08 11:26:19 -0700 |
---|---|---|
committer | libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com> | 2022-06-08 19:40:30 +0000 |
commit | baef41447887e1a17897a4cb6ccc854ef3a9d652 (patch) | |
tree | da602eeef472aec5fc2d2e1c1add2e1dfc90f85a /source/row_neon.cc | |
parent | d011314f14738e0751dcb269c1d989c4dcbaad7b (diff) | |
download | libyuv-baef41447887e1a17897a4cb6ccc854ef3a9d652.tar.gz |
Convert16To8Row_NEON: use shift without rounding
Fixes chromium PaintCanvasVideoRendererTest.HighBitDepth
sqdmulh was creating a 9-bit value with rounding, which was then shifted right by 1 with no rounding. The rounding had an off-by-one impact in some tests.
Pixel 3
C I010ToI420_Opt (749 ms)
Was sqdmulh I010ToI420_Opt (370 ms)
Now ushl I010ToI420_Opt (324 ms)
Pixel 4
C I010ToI420_Opt (581 ms)
Was sqdmulh I010ToI420_Opt (240 ms)
Now ushl I010ToI420_Opt (231 ms)
Bug: b/216321733, b/233233302
Change-Id: I26f673bb411401d1e4a8126bf22d61c649223e9b
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3694143
Reviewed-by: Justin Green <greenjustin@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Diffstat (limited to 'source/row_neon.cc')
-rw-r--r-- | source/row_neon.cc | 92 |
1 files changed, 76 insertions, 16 deletions
diff --git a/source/row_neon.cc b/source/row_neon.cc index 8ba71d07..1d912b85 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -10,8 +10,6 @@ #include "libyuv/row.h" -#include <stdio.h> - #ifdef __cplusplus namespace libyuv { extern "C" { @@ -21,6 +19,8 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ !defined(__aarch64__) +// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved. + // q0: Y uint16x8_t // d2: U uint8x8_t // d3: V uint8x8_t @@ -2715,6 +2715,66 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"); } +// Bilinear filter 8x2 -> 8x1 +void InterpolateRow_16_NEON(uint16_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint16_t* src_ptr1 = src_ptr + src_stride; + + asm volatile( + "cmp %4, #0 \n" + "beq 100f \n" + "cmp %4, #128 \n" + "beq 50f \n" + + "vdup.16 d17, %4 \n" + "vdup.16 d16, %5 \n" + // General purpose row blend. + "1: \n" + "vld1.16 {q0}, [%1]! \n" + "vld1.16 {q1}, [%2]! \n" + "subs %3, %3, #8 \n" + "vmull.u16 q2, d0, d16 \n" + "vmull.u16 q3, d1, d16 \n" + "vmlal.u16 q2, d2, d17 \n" + "vmlal.u16 q3, d3, d17 \n" + "vrshrn.u32 d0, q2, #8 \n" + "vrshrn.u32 d1, q3, #8 \n" + "vst1.16 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + "vld1.16 {q0}, [%1]! \n" + "vld1.16 {q1}, [%2]! \n" + "subs %3, %3, #8 \n" + "vrhadd.u16 q0, q1 \n" + "vst1.16 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "vld1.16 {q0}, [%1]! \n" + "subs %3, %3, #8 \n" + "vst1.16 {q0}, [%0]! 
\n" + "bgt 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_ptr1), // %2 + "+r"(dst_width) // %3 + : "r"(y1_fraction), // %4 + "r"(y0_fraction) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3", "q8"); +} + // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr void ARGBBlendRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, @@ -3649,31 +3709,31 @@ void DivideRow_16_NEON(const uint16_t* src_y, } // Use scale to convert lsb formats to msb, depending how many bits there are: -// 32768 = 9 bits -// 16384 = 10 bits -// 4096 = 12 bits -// 256 = 16 bits +// 32768 = 9 bits = shr 1 +// 16384 = 10 bits = shr 2 +// 4096 = 12 bits = shr 4 +// 256 = 16 bits = shr 8 void Convert16To8Row_NEON(const uint16_t* src_y, uint8_t* dst_y, int scale, int width) { + int shift = 15 - __builtin_clz(scale); // Negative for shl will shift right asm volatile( - "vdup.16 q2, %2 \n" + "vdup.16 q2, %3 \n" "1: \n" "vld1.16 {q0}, [%0]! \n" "vld1.16 {q1}, [%0]! \n" - "vqdmulh.s16 q0, q0, q2 \n" - "vqdmulh.s16 q1, q1, q2 \n" - "vqshrn.u16 d0, q0, #1 \n" - "vqshrn.u16 d1, q1, #1 \n" - "vst1.16 {q0}, [%1]! \n" - "subs %3, %3, #16 \n" // 16 src pixels per loop + "vshl.u16 q0, q0, q2 \n" + "vshl.u16 q1, q1, q2 \n" + "vqmovn.u16 d0, q0 \n" + "vqmovn.u16 d1, q1 \n" + "subs %2, %2, #16 \n" // 16 src pixels per loop + "vst1.8 {q0}, [%1]! \n" "bgt 1b \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 - "+r"(scale), // %2 - "+r"(width) // %3 - : + "+r"(width) // %2 + : "r"(shift) // %3 : "cc", "memory", "q0", "q1", "q2"); } |