Diffstat (limited to 'files/source/row_neon.cc')
-rw-r--r-- | files/source/row_neon.cc | 268
1 file changed, 237 insertions, 31 deletions
diff --git a/files/source/row_neon.cc b/files/source/row_neon.cc
index 804ff839..4ed13638 100644
--- a/files/source/row_neon.cc
+++ b/files/source/row_neon.cc
@@ -89,12 +89,14 @@ extern "C" {
       "vsli.u16 d2, d2, #8 \n" \
       "vsri.u16 d3, d3, #8 \n"
 
+// TODO: Use single register for kUVCoeff and multiply by lane
 #define YUVTORGB_SETUP \
+  "vld1.16 {d31}, [%[kRGBCoeffBias]] \n" \
   "vld4.8 {d26[], d27[], d28[], d29[]}, [%[kUVCoeff]] \n" \
-  "vld1.16 {d31[]}, [%[kRGBCoeffBias]]! \n" \
-  "vld1.16 {d20[], d21[]}, [%[kRGBCoeffBias]]! \n" \
-  "vld1.16 {d22[], d23[]}, [%[kRGBCoeffBias]]! \n" \
-  "vld1.16 {d24[], d25[]}, [%[kRGBCoeffBias]] \n"
+  "vdup.u16 q10, d31[1] \n" \
+  "vdup.u16 q11, d31[2] \n" \
+  "vdup.u16 q12, d31[3] \n" \
+  "vdup.u16 d31, d31[0] \n"
 
 // q0: B uint16x8_t
 // q1: G uint16x8_t
@@ -156,6 +158,29 @@ void I444ToARGBRow_NEON(const uint8_t* src_y,
       : "cc", "memory", YUVTORGB_REGS, "d6");
 }
 
+void I444ToRGB24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  asm volatile(
+      YUVTORGB_SETUP
+      "1: \n" READYUV444 YUVTORGB
+      RGBTORGB8
+      "subs %[width], %[width], #8 \n"
+      "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n"
+      "bgt 1b \n"
+      : [src_y] "+r"(src_y),  // %[src_y]
+        [src_u] "+r"(src_u),  // %[src_u]
+        [src_v] "+r"(src_v),  // %[src_v]
+        [dst_rgb24] "+r"(dst_rgb24),  // %[dst_rgb24]
+        [width] "+r"(width)  // %[width]
+      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),  // %[kUVCoeff]
+        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
+      : "cc", "memory", YUVTORGB_REGS);
+}
+
 void I422ToARGBRow_NEON(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
@@ -586,10 +611,10 @@ void DetileRow_NEON(const uint8_t* src,
                     int width) {
   asm volatile(
       "1: \n"
-      "vld1.16 {q0}, [%0], %3 \n"  // load 16 bytes
+      "vld1.8 {q0}, [%0], %3 \n"  // load 16 bytes
       "subs %2, %2, #16 \n"  // 16 processed per loop
-      "pld [%0, 1792] \n"
-      "vst1.16 {q0}, [%1]! \n"  // store 16 bytes
+      "pld [%0, #1792] \n"
+      "vst1.8 {q0}, [%1]! \n"  // store 16 bytes
       "bgt 1b \n"
       : "+r"(src),    // %0
         "+r"(dst),    // %1
@@ -599,6 +624,26 @@ void DetileRow_NEON(const uint8_t* src,
   );
 }
 
+// Reads 16 Y's of 16 bits each from a tile row and writes out 16 Y's.
+void DetileRow_16_NEON(const uint16_t* src,
+                       ptrdiff_t src_tile_stride,
+                       uint16_t* dst,
+                       int width) {
+  asm volatile(
+      "1: \n"
+      "vld1.16 {q0, q1}, [%0], %3 \n"  // load 16 pixels
+      "subs %2, %2, #16 \n"  // 16 processed per loop
+      "pld [%0, #3584] \n"
+      "vst1.16 {q0, q1}, [%1]! \n"  // store 16 pixels
+      "bgt 1b \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      : "r"(src_tile_stride * 2)  // %3
+      : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+
 // Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V.
 void DetileSplitUVRow_NEON(const uint8_t* src_uv,
                            ptrdiff_t src_tile_stride,
@@ -609,7 +654,7 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv,
       "1: \n"
       "vld2.8 {d0, d1}, [%0], %4 \n"
       "subs %3, %3, #16 \n"
-      "pld [%0, 1792] \n"
+      "pld [%0, #1792] \n"
      "vst1.8 {d0}, [%1]! \n"
       "vst1.8 {d1}, [%2]! \n"
       "bgt 1b \n"
@@ -622,6 +667,101 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv,
   );
 }
 
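The detile kernels above copy one 16-pixel tile row at a time, stepping the source pointer by the tile stride between tiles. As a scalar reference, the access pattern of the DetileRow_16_NEON added above is roughly the following sketch (illustrative, not code from this patch); note the NEON version passes src_tile_stride * 2 as the post-increment because that operand counts bytes while src is a uint16_t pointer:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Sketch: copy one row out of a tiled buffer, 16 uint16_t pixels per
       tile row. Assumes width is a multiple of 16, as the NEON loop does. */
    static void DetileRow16Sketch(const uint16_t* src,
                                  ptrdiff_t src_tile_stride, /* in elements */
                                  uint16_t* dst,
                                  int width) {
      for (int x = 0; x < width; x += 16) {
        memcpy(dst, src, 16 * sizeof(uint16_t)); /* one tile row, 32 bytes */
        dst += 16;
        src += src_tile_stride; /* jump to the same row of the next tile */
      }
    }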
+#if LIBYUV_USE_ST2
+// Read 16 Y, 8 UV, and write 8 YUYV.
+void DetileToYUY2_NEON(const uint8_t* src_y,
+                       ptrdiff_t src_y_tile_stride,
+                       const uint8_t* src_uv,
+                       ptrdiff_t src_uv_tile_stride,
+                       uint8_t* dst_yuy2,
+                       int width) {
+  asm volatile(
+      "1: \n"
+      "vld1.8 {q0}, [%0], %4 \n"  // Load 16 Y
+      "pld [%0, #1792] \n"
+      "vld1.8 {q1}, [%1], %5 \n"  // Load 8 UV
+      "pld [%1, #1792] \n"
+      "subs %3, %3, #16 \n"
+      "vst2.8 {q0, q1}, [%2]! \n"
+      "bgt 1b \n"
+      : "+r"(src_y),             // %0
+        "+r"(src_uv),            // %1
+        "+r"(dst_yuy2),          // %2
+        "+r"(width)              // %3
+      : "r"(src_y_tile_stride),  // %4
+        "r"(src_uv_tile_stride)  // %5
+      : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber list
+  );
+}
+#else
+// Read 16 Y, 8 UV, and write 8 YUYV.
+void DetileToYUY2_NEON(const uint8_t* src_y,
+                       ptrdiff_t src_y_tile_stride,
+                       const uint8_t* src_uv,
+                       ptrdiff_t src_uv_tile_stride,
+                       uint8_t* dst_yuy2,
+                       int width) {
+  asm volatile(
+      "1: \n"
+      "vld1.8 {q0}, [%0], %4 \n"  // Load 16 Y
+      "vld1.8 {q1}, [%1], %5 \n"  // Load 8 UV
+      "subs %3, %3, #16 \n"
+      "pld [%0, #1792] \n"
+      "vzip.8 q0, q1 \n"
+      "pld [%1, #1792] \n"
+      "vst1.8 {q0, q1}, [%2]! \n"
+      "bgt 1b \n"
+      : "+r"(src_y),             // %0
+        "+r"(src_uv),            // %1
+        "+r"(dst_yuy2),          // %2
+        "+r"(width)              // %3
+      : "r"(src_y_tile_stride),  // %4
+        "r"(src_uv_tile_stride)  // %5
+      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber list
+  );
+}
+#endif
+
+void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) {
+  asm volatile(
+      "1: \n"
+      "vld1.8 {q14}, [%0]! \n"  // Load lower bits.
+      "vld1.8 {q9}, [%0]! \n"   // Load upper bits row by row.
+      "vld1.8 {q11}, [%0]! \n"
+      "vld1.8 {q13}, [%0]! \n"
+      "vld1.8 {q15}, [%0]! \n"
+      "vshl.u8 q8, q14, #6 \n"  // Shift lower bit data appropriately.
+      "vshl.u8 q10, q14, #4 \n"
+      "vshl.u8 q12, q14, #2 \n"
+      "vzip.u8 q8, q9 \n"  // Interleave upper and lower bits.
+      "vzip.u8 q10, q11 \n"
+      "vzip.u8 q12, q13 \n"
+      "vzip.u8 q14, q15 \n"
+      "vsri.u16 q8, q8, #10 \n"  // Copy upper 6 bits into lower 6 bits
+                                 // for better accuracy in conversions.
+      "vsri.u16 q9, q9, #10 \n"
+      "vsri.u16 q10, q10, #10 \n"
+      "vsri.u16 q11, q11, #10 \n"
+      "vsri.u16 q12, q12, #10 \n"
+      "vsri.u16 q13, q13, #10 \n"
+      "vsri.u16 q14, q14, #10 \n"
+      "vsri.u16 q15, q15, #10 \n"
+      "vstmia %1!, {q8-q15} \n"  // Store pixel block (64 pixels).
+      "subs %2, %2, #80 \n"
+      "bgt 1b \n"
+      : "+r"(src),  // %0
+        "+r"(dst),  // %1
+        "+r"(size)  // %2
+      :
+      : "cc", "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+}
+
 // Reads 16 U's and V's and writes out 16 pairs of UV.
 void MergeUVRow_NEON(const uint8_t* src_u,
                      const uint8_t* src_v,
@@ -664,7 +804,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
         "+r"(dst_b),  // %3
         "+r"(width)   // %4
       :               // Input registers
-      : "cc", "memory", "d0", "d1", "d2"  // Clobber List
+      : "cc", "memory", "q0", "q1", "q2"  // Clobber List
   );
 }
 
@@ -1505,6 +1645,29 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
   );
 }
 
+void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2,
+                        int stride_yuy2,
+                        uint8_t* dst_uv,
+                        int width) {
+  asm volatile(
+      "add %1, %0, %1 \n"  // stride + src_yuy2
+      "1: \n"
+      "vld2.8 {q0, q1}, [%0]! \n"  // load 16 pixels of YUY2.
+      "subs %3, %3, #16 \n"  // 16 pixels = 8 UVs.
+      "vld2.8 {q2, q3}, [%1]! \n"  // load next row YUY2.
+      "vrhadd.u8 q4, q1, q3 \n"  // average rows of UV
+      "vst1.8 {q4}, [%2]! \n"  // store 8 UV.
+      "bgt 1b \n"
+      : "+r"(src_yuy2),     // %0
+        "+r"(stride_yuy2),  // %1
+        "+r"(dst_uv),       // %2
+        "+r"(width)         // %3
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4"  // Clobber List
+  );
+}
+
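The vrhadd.u8 in YUY2ToNVUVRow_NEON above is a rounding halving add: it vertically averages the chroma bytes of two adjacent YUY2 rows. A scalar sketch of the per-byte math (the function name and loop structure are illustrative, not from this patch):

    #include <stdint.h>

    /* Sketch: average the UV bytes of two YUY2 rows into one interleaved
       UV row (NV12-style). YUY2 is Y0 U Y1 V, so chroma sits at odd byte
       offsets. The +1 rounds the average, matching vrhadd.u8. */
    static void Yuy2ToNvuvSketch(const uint8_t* row0,
                                 const uint8_t* row1,
                                 uint8_t* dst_uv,
                                 int width) {
      for (int i = 0; i < width / 2; i++) { /* one UV pair per 2 pixels */
        dst_uv[2 * i + 0] = (row0[4 * i + 1] + row1[4 * i + 1] + 1) >> 1; /* U */
        dst_uv[2 * i + 1] = (row0[4 * i + 3] + row1[4 * i + 3] + 1) >> 1; /* V */
      }
    }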
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
 void ARGBShuffleRow_NEON(const uint8_t* src_argb,
                          uint8_t* dst_argb,
@@ -1590,7 +1753,7 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
 
 void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
                                 uint8_t* dst_rgb,
-                                const uint32_t dither4,
+                                uint32_t dither4,
                                 int width) {
   asm volatile(
       "vdup.32 d7, %2 \n"  // dither4
@@ -1762,7 +1925,7 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
   );
 }
 
-// TODO(fbarchard): Subsample match C code.
+// TODO(fbarchard): Subsample match Intel code.
 void ARGBToUVJRow_NEON(const uint8_t* src_argb,
                        int src_stride_argb,
                        uint8_t* dst_u,
@@ -1808,6 +1971,51 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
   );
 }
 
+void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
+                       int src_stride_abgr,
+                       uint8_t* dst_uj,
+                       uint8_t* dst_vj,
+                       int width) {
+  asm volatile(
+      "add %1, %0, %1 \n"  // src_stride + src_abgr
+      "vmov.s16 q10, #127 / 2 \n"  // UB / VR 0.500 coefficient
+      "vmov.s16 q11, #84 / 2 \n"   // UG -0.33126 coefficient
+      "vmov.s16 q12, #43 / 2 \n"   // UR -0.16874 coefficient
+      "vmov.s16 q13, #20 / 2 \n"   // VB -0.08131 coefficient
+      "vmov.s16 q14, #107 / 2 \n"  // VG -0.41869 coefficient
+      "vmov.u16 q15, #0x8080 \n"   // 128.5
+      "1: \n"
+      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 ABGR pixels.
+      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 ABGR pixels.
+      "vpaddl.u8 q0, q0 \n"  // R 16 bytes -> 8 shorts.
+      "vpaddl.u8 q1, q1 \n"  // G 16 bytes -> 8 shorts.
+      "vpaddl.u8 q2, q2 \n"  // B 16 bytes -> 8 shorts.
+      "vld4.8 {d8, d10, d12, d14}, [%1]! \n"  // load 8 more ABGR pixels.
+      "vld4.8 {d9, d11, d13, d15}, [%1]! \n"  // load last 8 ABGR pixels.
+      "vpadal.u8 q0, q4 \n"  // R 16 bytes -> 8 shorts.
+      "vpadal.u8 q1, q5 \n"  // G 16 bytes -> 8 shorts.
+      "vpadal.u8 q2, q6 \n"  // B 16 bytes -> 8 shorts.
+
+      "vrshr.u16 q0, q0, #1 \n"  // 2x average
+      "vrshr.u16 q1, q1, #1 \n"
+      "vrshr.u16 q2, q2, #1 \n"
+
+      "subs %4, %4, #16 \n"  // 16 processed per loop.
+      RGBTOUV(q2, q1, q0)
+      "vst1.8 {d0}, [%2]! \n"  // store 8 pixels U.
+      "vst1.8 {d1}, [%3]! \n"  // store 8 pixels V.
+      "bgt 1b \n"
+      : "+r"(src_abgr),         // %0
+        "+r"(src_stride_abgr),  // %1
+        "+r"(dst_uj),           // %2
+        "+r"(dst_vj),           // %3
+        "+r"(width)             // %4
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+        "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
 // TODO(fbarchard): Subsample match C code.
 void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
                         int src_stride_rgb24,
@@ -2567,6 +2775,10 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
   ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants);
 }
 
+void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
+  ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kRawJPEGConstants);
+}
+
 // RGBA expects first value to be A and ignored, then 3 values to contain RGB.
 // Same code as ARGB, except the LD4
 void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
@@ -3633,7 +3845,7 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv,
         "+r"(dst_v),  // %2
         "+r"(width)   // %3
       : "r"(shift)    // %4
-      : "cc", "memory", "q0", "q1", "q2", "q3", "q4");
+      : "cc", "memory", "q0", "q1", "q2");
 }
 
 void MergeUVRow_16_NEON(const uint16_t* src_u,
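The hunk below rewrites DivideRow_16_NEON from a shift-based loop to a Q16 fixed-point multiply: each 16-bit sample is widened with vmull.u16 and the top half of the 32-bit product is taken with vshrn.u32 #16. In scalar form that is simply (a sketch with illustrative names, not code from this patch):

    #include <stdint.h>

    /* Sketch: dst = (src * scale) >> 16, where scale is a Q16 reciprocal
       (e.g. 65536 / divisor). vmull.u16 + vshrn.u32 #16 compute the same
       thing eight lanes at a time. */
    static void DivideRow16Sketch(const uint16_t* src_y,
                                  uint16_t* dst_y,
                                  int scale,
                                  int width) {
      for (int i = 0; i < width; i++) {
        dst_y[i] = (uint16_t)(((uint32_t)src_y[i] * (uint32_t)scale) >> 16);
      }
    }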
\n" - "vmovl.u16 q3, d2 \n" - "vmovl.u16 q1, d3 \n" - "vmovl.u16 q4, d4 \n" - "vmovl.u16 q2, d5 \n" - "vshl.u32 q3, q3, q0 \n" - "vshl.u32 q4, q4, q0 \n" - "vshl.u32 q1, q1, q0 \n" - "vshl.u32 q2, q2, q0 \n" - "vmovn.u32 d2, q3 \n" - "vmovn.u32 d3, q1 \n" - "vmovn.u32 d4, q4 \n" - "vmovn.u32 d5, q2 \n" - "vst1.16 {q1}, [%1]! \n" - "vst1.16 {q2}, [%1]! \n" + "vdup.16 d8, %3 \n" + "1: \n" + "vld1.16 {q2, q3}, [%0]! \n" + "vmull.u16 q0, d4, d8 \n" + "vmull.u16 q1, d5, d8 \n" + "vmull.u16 q2, d6, d8 \n" + "vmull.u16 q3, d7, d8 \n" + "vshrn.u32 d0, q0, #16 \n" + "vshrn.u32 d1, q1, #16 \n" + "vshrn.u32 d2, q2, #16 \n" + "vshrn.u32 d3, q3, #16 \n" + "vst1.16 {q0, q1}, [%1]! \n" // store 16 pixels "subs %2, %2, #16 \n" // 16 src pixels per loop "bgt 1b \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : "r"(scale) // %3 - : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); + : "cc", "memory", "q0", "q1", "q2", "q3", "d8"); } // Use scale to convert lsb formats to msb, depending how many bits there are: |