diff options
author | ashok.bhat@gmail.com <ashok.bhat@gmail.com> | 2014-09-26 09:16:48 +0000 |
---|---|---|
committer | ashok.bhat@gmail.com <ashok.bhat@gmail.com> | 2014-09-26 09:16:48 +0000 |
commit | c8a34d2e5b18ac42e37e97841a37b875f567a0b9 (patch) | |
tree | 6f19fc081c8b2e8b0a60bc684a1bc21e74ea7fcf | |
parent | 093fd4842f0c1040545daecc4b590261a7df662c (diff) | |
download | libyuv-c8a34d2e5b18ac42e37e97841a37b875f567a0b9.tar.gz |
Row AArch64 Neon implementation - Part 9
BUG=319
TESTED=libyuv_unittest
R=fbarchard@google.com
Change-Id: Id3af83a6efbd70b4a808a8442c3badbef749c0cc
Signed-off-by: Ashok Bhat <ashok.bhat@arm.com>
Review URL: https://webrtc-codereview.appspot.com/23769004
git-svn-id: http://libyuv.googlecode.com/svn/trunk@1092 16f28f9a-4ce2-e073-06de-1de4eb20be90
-rw-r--r-- | include/libyuv/row.h | 6 | ||||
-rw-r--r-- | source/row_neon64.cc | 70 |
2 files changed, 33 insertions, 43 deletions
diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 7e52c18..74ac0f0 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -305,9 +305,9 @@ extern "C" { #define HAS_ARGBSHUFFLEROW_NEON #define HAS_I422TOYUY2ROW_NEON #define HAS_I422TOUYVYROW_NEON -// #define HAS_ARGBTORGB565ROW_NEON -// #define HAS_ARGBTOARGB1555ROW_NEON -// #define HAS_ARGBTOARGB4444ROW_NEON +#define HAS_ARGBTORGB565ROW_NEON +#define HAS_ARGBTOARGB1555ROW_NEON +#define HAS_ARGBTOARGB4444ROW_NEON #define HAS_ARGBTOYROW_NEON #define HAS_ARGBTOYJROW_NEON #define HAS_ARGBTOUV444ROW_NEON diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 09775c4..dece8c5 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -414,16 +414,11 @@ void I422ToRAWRow_NEON(const uint8* src_y, #endif // HAS_I422TORAWROW_NEON #define ARGBTORGB565 \ - "vshr.u8 d20, d20, #3 \n" /* B */ \ - "vshr.u8 d21, d21, #2 \n" /* G */ \ - "vshr.u8 d22, d22, #3 \n" /* R */ \ - "vmovl.u8 q8, d20 \n" /* B */ \ - "vmovl.u8 q9, d21 \n" /* G */ \ - "vmovl.u8 q10, d22 \n" /* R */ \ - "vshl.u16 q9, q9, #5 \n" /* G */ \ - "vshl.u16 q10, q10, #11 \n" /* R */ \ - "vorr q0, q8, q9 \n" /* BG */ \ - "vorr q0, q0, q10 \n" /* BGR */ + "shll v0.8h, v22.8b, #8 \n" /* R */ \ + "shll v20.8h, v20.8b, #8 \n" /* B */ \ + "shll v21.8h, v21.8b, #8 \n" /* G */ \ + "sri v0.8h, v21.8h, #5 \n" /* RG */ \ + "sri v0.8h, v20.8h, #11 \n" /* RGB */ #ifdef HAS_I422TORGB565ROW_NEON void I422ToRGB565Row_NEON(const uint8* src_y, @@ -462,19 +457,13 @@ void I422ToRGB565Row_NEON(const uint8* src_y, #endif // HAS_I422TORGB565ROW_NEON #define ARGBTOARGB1555 \ - "vshr.u8 q10, q10, #3 \n" /* B */ \ - "vshr.u8 d22, d22, #3 \n" /* R */ \ - "vshr.u8 d23, d23, #7 \n" /* A */ \ - "vmovl.u8 q8, d20 \n" /* B */ \ - "vmovl.u8 q9, d21 \n" /* G */ \ - "vmovl.u8 q10, d22 \n" /* R */ \ - "vmovl.u8 q11, d23 \n" /* A */ \ - "vshl.u16 q9, q9, #5 \n" /* G */ \ - "vshl.u16 q10, q10, #10 \n" /* R */ \ - "vshl.u16 q11, q11, #15 \n" /* A */ \ - "vorr q0, q8, q9 
\n" /* BG */ \ - "vorr q1, q10, q11 \n" /* RA */ \ - "vorr q0, q0, q1 \n" /* BGRA */ + "shll v0.8h, v23.8b, #8 \n" /* A */ \ + "shll v22.8h, v22.8b, #8 \n" /* R */ \ + "shll v20.8h, v20.8b, #8 \n" /* B */ \ + "shll v21.8h, v21.8b, #8 \n" /* G */ \ + "sri v0.8h, v22.8h, #1 \n" /* AR */ \ + "sri v0.8h, v21.8h, #6 \n" /* ARG */ \ + "sri v0.8h, v20.8h, #11 \n" /* ARGB */ #ifdef HAS_I422TOARGB1555ROW_NEON void I422ToARGB1555Row_NEON(const uint8* src_y, @@ -514,13 +503,14 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, #endif // HAS_I422TOARGB1555ROW_NEON #define ARGBTOARGB4444 \ - "vshr.u8 d20, d20, #4 \n" /* B */ \ - "vbic.32 d21, d21, d4 \n" /* G */ \ - "vshr.u8 d22, d22, #4 \n" /* R */ \ - "vbic.32 d23, d23, d4 \n" /* A */ \ - "vorr d0, d20, d21 \n" /* BG */ \ - "vorr d1, d22, d23 \n" /* RA */ \ - "vzip.u8 d0, d1 \n" /* BGRA */ + /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \ + "ushr v20.8b, v20.8b, #4 \n" /* B */ \ + "bic v21.8b, v21.8b, v4.8b \n" /* G */ \ + "ushr v22.8b, v22.8b, #4 \n" /* R */ \ + "bic v23.8b, v23.8b, v4.8b \n" /* A */ \ + "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \ + "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \ + "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */ #ifdef HAS_I422TOARGB4444ROW_NEON void I422ToARGB4444Row_NEON(const uint8* src_y, @@ -1516,17 +1506,17 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "ld4 {v20.8b-v23.8b}, [%0], #32 \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTORGB565 MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
"b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb565), // %1 "+r"(pix) // %2 : - : "cc", "memory", "q0", "q8", "q9", "q10", "q11" + : "cc", "memory", "v0", "v20", "v21", "v22", "v23" ); } #endif // HAS_ARGBTORGB565ROW_NEON @@ -1538,17 +1528,17 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "ld4 {v20.8b-v23.8b}, [%0], #32 \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTOARGB1555 MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555. + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555. "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb1555), // %1 "+r"(pix) // %2 : - : "cc", "memory", "q0", "q8", "q9", "q10", "q11" + : "cc", "memory", "v0", "v20", "v21", "v22", "v23" ); } #endif // HAS_ARGBTOARGB1555ROW_NEON @@ -1557,21 +1547,21 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, int pix) { asm volatile ( - "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. + "movi v4.16b, #0x0f \n" // bits to clear with bic. ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "ld4 {v20.8b-v23.8b}, [%0], #32 \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTOARGB4444 MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444. + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444. "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb4444), // %1 "+r"(pix) // %2 : - : "cc", "memory", "q0", "q8", "q9", "q10", "q11" + : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23" ); } #endif // HAS_ARGBTOARGB4444ROW_NEON |