diff options
author | mtklein <mtklein@chromium.org> | 2015-08-06 11:18:50 -0700 |
---|---|---|
committer | The Android Automerger <android-build@google.com> | 2015-10-27 15:22:05 -0700 |
commit | f39144db54af553a33774f581da605e484b55e27 (patch) | |
tree | 74b07db46912e0a18dedee68eed3983d1c0215d3 | |
parent | 48c55fe972572c0331ecf6b3d9e8f1a49108bd7e (diff) | |
download | skia-android-cts-6.0_r24.tar.gz |
Purge non-NEON ARM code DO NOT MERGEandroid-cts-6.0_r9android-cts-6.0_r8android-cts-6.0_r7android-cts-6.0_r6android-cts-6.0_r5android-cts-6.0_r4android-cts-6.0_r32android-cts-6.0_r31android-cts-6.0_r30android-cts-6.0_r3android-cts-6.0_r29android-cts-6.0_r28android-cts-6.0_r27android-cts-6.0_r26android-cts-6.0_r25android-cts-6.0_r24android-cts-6.0_r23android-cts-6.0_r22android-cts-6.0_r21android-cts-6.0_r20android-cts-6.0_r19android-cts-6.0_r18android-cts-6.0_r17android-cts-6.0_r16android-cts-6.0_r15android-cts-6.0_r14android-cts-6.0_r13android-cts-6.0_r12android-6.0.0_r7marshmallow-releasemarshmallow-cts-release
As I begin to wade in here, it's nice to remove as much code as possible.
BUG=skia:4117
Review URL: https://codereview.chromium.org/1277953002
=======================================================================
Cherry-pick of https://skia.googlesource.com/skia/+/e683e810
Speculative fix for b/23648740. Remove the function that crashes.
BUG:23648740
Change-Id: Ie4d681976bc1e39ed4f78d63d30259c9e35aea07
-rw-r--r-- | src/opts/SkBitmapProcState_opts_arm.cpp | 203 | ||||
-rw-r--r-- | src/opts/SkBlitRow_opts_arm.cpp | 364 | ||||
-rw-r--r-- | src/opts/SkBlitRow_opts_arm_neon.cpp | 150 |
3 files changed, 6 insertions, 711 deletions
diff --git a/src/opts/SkBitmapProcState_opts_arm.cpp b/src/opts/SkBitmapProcState_opts_arm.cpp index e6799dea19..e3726e7274 100644 --- a/src/opts/SkBitmapProcState_opts_arm.cpp +++ b/src/opts/SkBitmapProcState_opts_arm.cpp @@ -16,208 +16,7 @@ #include "SkConvolver.h" -#if !defined(SK_CPU_ARM64) && SK_ARM_ARCH >= 6 && !defined(SK_CPU_BENDIAN) -void SI8_D16_nofilter_DX_arm( - const SkBitmapProcState& s, - const uint32_t* SK_RESTRICT xy, - int count, - uint16_t* SK_RESTRICT colors) SK_ATTRIBUTE_OPTIMIZE_O1; - -void SI8_D16_nofilter_DX_arm(const SkBitmapProcState& s, - const uint32_t* SK_RESTRICT xy, - int count, uint16_t* SK_RESTRICT colors) { - SkASSERT(count > 0 && colors != NULL); - SkASSERT(s.fInvType <= (SkMatrix::kTranslate_Mask | SkMatrix::kScale_Mask)); - SkASSERT(kNone_SkFilterQuality == s.fFilterLevel); - - const uint16_t* SK_RESTRICT table = s.fBitmap->getColorTable()->read16BitCache(); - const uint8_t* SK_RESTRICT srcAddr = (const uint8_t*)s.fBitmap->getPixels(); - - // buffer is y32, x16, x16, x16, x16, x16 - // bump srcAddr to the proper row, since we're told Y never changes - SkASSERT((unsigned)xy[0] < (unsigned)s.fBitmap->height()); - srcAddr = (const uint8_t*)((const char*)srcAddr + - xy[0] * s.fBitmap->rowBytes()); - - uint8_t src; - - if (1 == s.fBitmap->width()) { - src = srcAddr[0]; - uint16_t dstValue = table[src]; - sk_memset16(colors, dstValue, count); - } else { - int i; - int count8 = count >> 3; - const uint16_t* SK_RESTRICT xx = (const uint16_t*)(xy + 1); - - asm volatile ( - "cmp %[count8], #0 \n\t" // compare loop counter with 0 - "beq 2f \n\t" // if loop counter == 0, exit - "1: \n\t" - "ldmia %[xx]!, {r5, r7, r9, r11} \n\t" // load ptrs to pixels 0-7 - "subs %[count8], %[count8], #1 \n\t" // decrement loop counter - "uxth r4, r5 \n\t" // extract ptr 0 - "mov r5, r5, lsr #16 \n\t" // extract ptr 1 - "uxth r6, r7 \n\t" // extract ptr 2 - "mov r7, r7, lsr #16 \n\t" // extract ptr 3 - "ldrb r4, [%[srcAddr], r4] \n\t" // load pixel 0 from image - "uxth r8, r9 \n\t" // extract ptr 4 - "ldrb r5, [%[srcAddr], r5] \n\t" // load pixel 1 from image - "mov r9, r9, lsr #16 \n\t" // extract ptr 5 - "ldrb r6, [%[srcAddr], r6] \n\t" // load pixel 2 from image - "uxth r10, r11 \n\t" // extract ptr 6 - "ldrb r7, [%[srcAddr], r7] \n\t" // load pixel 3 from image - "mov r11, r11, lsr #16 \n\t" // extract ptr 7 - "ldrb r8, [%[srcAddr], r8] \n\t" // load pixel 4 from image - "add r4, r4, r4 \n\t" // double pixel 0 for RGB565 lookup - "ldrb r9, [%[srcAddr], r9] \n\t" // load pixel 5 from image - "add r5, r5, r5 \n\t" // double pixel 1 for RGB565 lookup - "ldrb r10, [%[srcAddr], r10] \n\t" // load pixel 6 from image - "add r6, r6, r6 \n\t" // double pixel 2 for RGB565 lookup - "ldrb r11, [%[srcAddr], r11] \n\t" // load pixel 7 from image - "add r7, r7, r7 \n\t" // double pixel 3 for RGB565 lookup - "ldrh r4, [%[table], r4] \n\t" // load pixel 0 RGB565 from colmap - "add r8, r8, r8 \n\t" // double pixel 4 for RGB565 lookup - "ldrh r5, [%[table], r5] \n\t" // load pixel 1 RGB565 from colmap - "add r9, r9, r9 \n\t" // double pixel 5 for RGB565 lookup - "ldrh r6, [%[table], r6] \n\t" // load pixel 2 RGB565 from colmap - "add r10, r10, r10 \n\t" // double pixel 6 for RGB565 lookup - "ldrh r7, [%[table], r7] \n\t" // load pixel 3 RGB565 from colmap - "add r11, r11, r11 \n\t" // double pixel 7 for RGB565 lookup - "ldrh r8, [%[table], r8] \n\t" // load pixel 4 RGB565 from colmap - "ldrh r9, [%[table], r9] \n\t" // load pixel 5 RGB565 from colmap - "ldrh r10, [%[table], r10] \n\t" // load pixel 6 RGB565 from colmap - "ldrh r11, [%[table], r11] \n\t" // load pixel 7 RGB565 from colmap - "pkhbt r5, r4, r5, lsl #16 \n\t" // pack pixels 0 and 1 - "pkhbt r6, r6, r7, lsl #16 \n\t" // pack pixels 2 and 3 - "pkhbt r8, r8, r9, lsl #16 \n\t" // pack pixels 4 and 5 - "pkhbt r10, r10, r11, lsl #16 \n\t" // pack pixels 6 and 7 - "stmia %[colors]!, {r5, r6, r8, r10} \n\t" // store last 8 pixels - "bgt 1b \n\t" // loop if counter > 0 - "2: \n\t" - : [xx] "+r" (xx), [count8] "+r" (count8), [colors] "+r" (colors) - : [table] "r" (table), [srcAddr] "r" (srcAddr) - : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" - ); - - for (i = (count & 7); i > 0; --i) { - src = srcAddr[*xx++]; *colors++ = table[src]; - } - } -} - -void SI8_opaque_D32_nofilter_DX_arm( - const SkBitmapProcState& s, - const uint32_t* SK_RESTRICT xy, - int count, - SkPMColor* SK_RESTRICT colors) SK_ATTRIBUTE_OPTIMIZE_O1; - -void SI8_opaque_D32_nofilter_DX_arm(const SkBitmapProcState& s, - const uint32_t* SK_RESTRICT xy, - int count, SkPMColor* SK_RESTRICT colors) { - SkASSERT(count > 0 && colors != NULL); - SkASSERT(s.fInvType <= (SkMatrix::kTranslate_Mask | SkMatrix::kScale_Mask)); - SkASSERT(kNone_SkFilterQuality == s.fFilterLevel); - - const SkPMColor* SK_RESTRICT table = s.fBitmap->getColorTable()->readColors(); - const uint8_t* SK_RESTRICT srcAddr = (const uint8_t*)s.fBitmap->getPixels(); - - // buffer is y32, x16, x16, x16, x16, x16 - // bump srcAddr to the proper row, since we're told Y never changes - SkASSERT((unsigned)xy[0] < (unsigned)s.fBitmap->height()); - srcAddr = (const uint8_t*)((const char*)srcAddr + xy[0] * s.fBitmap->rowBytes()); - - if (1 == s.fBitmap->width()) { - uint8_t src = srcAddr[0]; - SkPMColor dstValue = table[src]; - sk_memset32(colors, dstValue, count); - } else { - const uint16_t* xx = (const uint16_t*)(xy + 1); - - asm volatile ( - "subs %[count], %[count], #8 \n\t" // decrement count by 8, set flags - "blt 2f \n\t" // if count < 0, branch to singles - "1: \n\t" // eights loop - "ldmia %[xx]!, {r5, r7, r9, r11} \n\t" // load ptrs to pixels 0-7 - "uxth r4, r5 \n\t" // extract ptr 0 - "mov r5, r5, lsr #16 \n\t" // extract ptr 1 - "uxth r6, r7 \n\t" // extract ptr 2 - "mov r7, r7, lsr #16 \n\t" // extract ptr 3 - "ldrb r4, [%[srcAddr], r4] \n\t" // load pixel 0 from image - "uxth r8, r9 \n\t" // extract ptr 4 - "ldrb r5, [%[srcAddr], r5] \n\t" // load pixel 1 from image - "mov r9, r9, lsr #16 \n\t" // extract ptr 5 - "ldrb r6, [%[srcAddr], r6] \n\t" // load pixel 2 from image - "uxth r10, r11 \n\t" // extract ptr 6 - "ldrb r7, [%[srcAddr], r7] \n\t" // load pixel 3 from image - "mov r11, r11, lsr #16 \n\t" // extract ptr 7 - "ldrb r8, [%[srcAddr], r8] \n\t" // load pixel 4 from image - "ldrb r9, [%[srcAddr], r9] \n\t" // load pixel 5 from image - "ldrb r10, [%[srcAddr], r10] \n\t" // load pixel 6 from image - "ldrb r11, [%[srcAddr], r11] \n\t" // load pixel 7 from image - "ldr r4, [%[table], r4, lsl #2] \n\t" // load pixel 0 SkPMColor from colmap - "ldr r5, [%[table], r5, lsl #2] \n\t" // load pixel 1 SkPMColor from colmap - "ldr r6, [%[table], r6, lsl #2] \n\t" // load pixel 2 SkPMColor from colmap - "ldr r7, [%[table], r7, lsl #2] \n\t" // load pixel 3 SkPMColor from colmap - "ldr r8, [%[table], r8, lsl #2] \n\t" // load pixel 4 SkPMColor from colmap - "ldr r9, [%[table], r9, lsl #2] \n\t" // load pixel 5 SkPMColor from colmap - "ldr r10, [%[table], r10, lsl #2] \n\t" // load pixel 6 SkPMColor from colmap - "ldr r11, [%[table], r11, lsl #2] \n\t" // load pixel 7 SkPMColor from colmap - "subs %[count], %[count], #8 \n\t" // decrement loop counter - "stmia %[colors]!, {r4-r11} \n\t" // store 8 pixels - "bge 1b \n\t" // loop if counter >= 0 - "2: \n\t" - "adds %[count], %[count], #8 \n\t" // fix up counter, set flags - "beq 4f \n\t" // if count == 0, branch to exit - "3: \n\t" // singles loop - "ldrh r4, [%[xx]], #2 \n\t" // load pixel ptr - "subs %[count], %[count], #1 \n\t" // decrement loop counter - "ldrb r5, [%[srcAddr], r4] \n\t" // load pixel from image - "ldr r6, [%[table], r5, lsl #2] \n\t" // load SkPMColor from colmap - "str r6, [%[colors]], #4 \n\t" // store pixel, update ptr - "bne 3b \n\t" // loop if counter != 0 - "4: \n\t" // exit - : [xx] "+r" (xx), [count] "+r" (count), [colors] "+r" (colors) - : [table] "r" (table), [srcAddr] "r" (srcAddr) - : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" - ); - } -} -#endif // !defined(SK_CPU_ARM64) && SK_ARM_ARCH >= 6 && !defined(SK_CPU_BENDIAN) - -/////////////////////////////////////////////////////////////////////////////// - -/* If we replace a sampleproc, then we null-out the associated shaderproc, - otherwise the shader won't even look at the matrix/sampler - */ -void SkBitmapProcState::platformProcs() { -#if !defined(SK_CPU_ARM64) && SK_ARM_ARCH >= 6 && !defined(SK_CPU_BENDIAN) - bool isOpaque = 256 == fAlphaScale; - bool justDx = false; - - if (fInvType <= (SkMatrix::kTranslate_Mask | SkMatrix::kScale_Mask)) { - justDx = true; - } - - switch (fBitmap->colorType()) { - case kIndex_8_SkColorType: - if (justDx && kNone_SkFilterQuality == fFilterLevel) { -#if 0 /* crashing on android device */ - fSampleProc16 = SI8_D16_nofilter_DX_arm; - fShaderProc16 = NULL; -#endif - if (isOpaque) { - // this one is only very slighty faster than the C version - fSampleProc32 = SI8_opaque_D32_nofilter_DX_arm; - fShaderProc32 = NULL; - } - } - break; - default: - break; - } -#endif -} +void SkBitmapProcState::platformProcs() { } /////////////////////////////////////////////////////////////////////////////// diff --git a/src/opts/SkBlitRow_opts_arm.cpp b/src/opts/SkBlitRow_opts_arm.cpp index 9dd4b3b43d..81f31fd26a 100644 --- a/src/opts/SkBlitRow_opts_arm.cpp +++ b/src/opts/SkBlitRow_opts_arm.cpp @@ -6,378 +6,22 @@ */ #include "SkBlitRow.h" -#include "SkColorPriv.h" -#include "SkDither.h" -#include "SkMathPriv.h" -#include "SkUtils.h" #include "SkUtilsArm.h" -// Define USE_NEON_CODE to indicate that we need to build NEON routines -#define USE_NEON_CODE (!SK_ARM_NEON_IS_NONE) - -// Define USE_ARM_CODE to indicate that we need to build ARM routines -#define USE_ARM_CODE (!SK_ARM_NEON_IS_ALWAYS) - -#if USE_NEON_CODE - #include "SkBlitRow_opts_arm_neon.h" -#endif - -#if USE_ARM_CODE - -static void S32A_D565_Opaque(uint16_t* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, int count, - U8CPU alpha, int /*x*/, int /*y*/) { - SkASSERT(255 == alpha); - - asm volatile ( - "1: \n\t" - "ldr r3, [%[src]], #4 \n\t" - "cmp r3, #0xff000000 \n\t" - "blo 2f \n\t" - "and r4, r3, #0x0000f8 \n\t" - "and r5, r3, #0x00fc00 \n\t" - "and r6, r3, #0xf80000 \n\t" -#ifdef SK_ARM_HAS_EDSP - "pld [r1, #32] \n\t" -#endif - "lsl r3, r4, #8 \n\t" - "orr r3, r3, r5, lsr #5 \n\t" - "orr r3, r3, r6, lsr #19 \n\t" - "subs %[count], %[count], #1 \n\t" - "strh r3, [%[dst]], #2 \n\t" - "bne 1b \n\t" - "b 4f \n\t" - "2: \n\t" - "lsrs r7, r3, #24 \n\t" - "beq 3f \n\t" - "ldrh r4, [%[dst]] \n\t" - "rsb r7, r7, #255 \n\t" - "and r6, r4, #0x001f \n\t" -#if SK_ARM_ARCH <= 6 - "lsl r5, r4, #21 \n\t" - "lsr r5, r5, #26 \n\t" -#else - "ubfx r5, r4, #5, #6 \n\t" -#endif -#ifdef SK_ARM_HAS_EDSP - "pld [r0, #16] \n\t" -#endif - "lsr r4, r4, #11 \n\t" -#ifdef SK_ARM_HAS_EDSP - "smulbb r6, r6, r7 \n\t" - "smulbb r5, r5, r7 \n\t" - "smulbb r4, r4, r7 \n\t" -#else - "mul r6, r6, r7 \n\t" - "mul r5, r5, r7 \n\t" - "mul r4, r4, r7 \n\t" -#endif -#if SK_ARM_ARCH >= 6 - "uxtb r7, r3, ROR #16 \n\t" - "uxtb ip, r3, ROR #8 \n\t" -#else - "mov ip, #0xff \n\t" - "and r7, ip, r3, ROR #16 \n\t" - "and ip, ip, r3, ROR #8 \n\t" -#endif - "and r3, r3, #0xff \n\t" - "add r6, r6, #16 \n\t" - "add r5, r5, #32 \n\t" - "add r4, r4, #16 \n\t" - "add r6, r6, r6, lsr #5 \n\t" - "add r5, r5, r5, lsr #6 \n\t" - "add r4, r4, r4, lsr #5 \n\t" - "add r6, r7, r6, lsr #5 \n\t" - "add r5, ip, r5, lsr #6 \n\t" - "add r4, r3, r4, lsr #5 \n\t" - "lsr r6, r6, #3 \n\t" - "and r5, r5, #0xfc \n\t" - "and r4, r4, #0xf8 \n\t" - "orr r6, r6, r5, lsl #3 \n\t" - "orr r4, r6, r4, lsl #8 \n\t" - "strh r4, [%[dst]], #2 \n\t" -#ifdef SK_ARM_HAS_EDSP - "pld [r1, #32] \n\t" -#endif - "subs %[count], %[count], #1 \n\t" - "bne 1b \n\t" - "b 4f \n\t" - "3: \n\t" - "subs %[count], %[count], #1 \n\t" - "add %[dst], %[dst], #2 \n\t" - "bne 1b \n\t" - "4: \n\t" - : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count) - : - : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "ip" - ); -} - -static void S32A_Opaque_BlitRow32_arm(SkPMColor* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, - int count, U8CPU alpha) { - - SkASSERT(255 == alpha); - - asm volatile ( - "cmp %[count], #0 \n\t" /* comparing count with 0 */ - "beq 3f \n\t" /* if zero exit */ - - "mov ip, #0xff \n\t" /* load the 0xff mask in ip */ - "orr ip, ip, ip, lsl #16 \n\t" /* convert it to 0xff00ff in ip */ - - "cmp %[count], #2 \n\t" /* compare count with 2 */ - "blt 2f \n\t" /* if less than 2 -> single loop */ - - /* Double Loop */ - "1: \n\t" /* <double loop> */ - "ldm %[src]!, {r5,r6} \n\t" /* load the src(s) at r5-r6 */ - "ldm %[dst], {r7,r8} \n\t" /* loading dst(s) into r7-r8 */ - "lsr r4, r5, #24 \n\t" /* extracting the alpha from source and storing it to r4 */ - - /* ----------- */ - "and r9, ip, r7 \n\t" /* r9 = br masked by ip */ - "rsb r4, r4, #256 \n\t" /* subtracting the alpha from 256 -> r4=scale */ - "and r10, ip, r7, lsr #8 \n\t" /* r10 = ag masked by ip */ - - "mul r9, r9, r4 \n\t" /* br = br * scale */ - "mul r10, r10, r4 \n\t" /* ag = ag * scale */ - "and r9, ip, r9, lsr #8 \n\t" /* lsr br by 8 and mask it */ - - "and r10, r10, ip, lsl #8 \n\t" /* mask ag with reverse mask */ - "lsr r4, r6, #24 \n\t" /* extracting the alpha from source and storing it to r4 */ - "orr r7, r9, r10 \n\t" /* br | ag*/ - - "add r7, r5, r7 \n\t" /* dst = src + calc dest(r7) */ - "rsb r4, r4, #256 \n\t" /* subtracting the alpha from 255 -> r4=scale */ - - /* ----------- */ - "and r9, ip, r8 \n\t" /* r9 = br masked by ip */ - - "and r10, ip, r8, lsr #8 \n\t" /* r10 = ag masked by ip */ - "mul r9, r9, r4 \n\t" /* br = br * scale */ - "sub %[count], %[count], #2 \n\t" - "mul r10, r10, r4 \n\t" /* ag = ag * scale */ - - "and r9, ip, r9, lsr #8 \n\t" /* lsr br by 8 and mask it */ - "and r10, r10, ip, lsl #8 \n\t" /* mask ag with reverse mask */ - "cmp %[count], #1 \n\t" /* comparing count with 1 */ - "orr r8, r9, r10 \n\t" /* br | ag */ - - "add r8, r6, r8 \n\t" /* dst = src + calc dest(r8) */ - - /* ----------------- */ - "stm %[dst]!, {r7,r8} \n\t" /* *dst = r7, increment dst by two (each times 4) */ - /* ----------------- */ - - "bgt 1b \n\t" /* if greater than 1 -> reloop */ - "blt 3f \n\t" /* if less than 1 -> exit */ - - /* Single Loop */ - "2: \n\t" /* <single loop> */ - "ldr r5, [%[src]], #4 \n\t" /* load the src pointer into r5 r5=src */ - "ldr r7, [%[dst]] \n\t" /* loading dst into r7 */ - "lsr r4, r5, #24 \n\t" /* extracting the alpha from source and storing it to r4 */ - - /* ----------- */ - "and r9, ip, r7 \n\t" /* r9 = br masked by ip */ - "rsb r4, r4, #256 \n\t" /* subtracting the alpha from 256 -> r4=scale */ - - "and r10, ip, r7, lsr #8 \n\t" /* r10 = ag masked by ip */ - "mul r9, r9, r4 \n\t" /* br = br * scale */ - "mul r10, r10, r4 \n\t" /* ag = ag * scale */ - "and r9, ip, r9, lsr #8 \n\t" /* lsr br by 8 and mask it */ - - "and r10, r10, ip, lsl #8 \n\t" /* mask ag */ - "orr r7, r9, r10 \n\t" /* br | ag */ - - "add r7, r5, r7 \n\t" /* *dst = src + calc dest(r7) */ - - /* ----------------- */ - "str r7, [%[dst]], #4 \n\t" /* *dst = r7, increment dst by one (times 4) */ - /* ----------------- */ - - "3: \n\t" /* <exit> */ - : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count) - : - : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "ip", "memory" - ); -} - -/* - * ARM asm version of S32A_Blend_BlitRow32 - */ -void S32A_Blend_BlitRow32_arm(SkPMColor* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, - int count, U8CPU alpha) { - asm volatile ( - "cmp %[count], #0 \n\t" /* comparing count with 0 */ - "beq 3f \n\t" /* if zero exit */ - - "mov r12, #0xff \n\t" /* load the 0xff mask in r12 */ - "orr r12, r12, r12, lsl #16 \n\t" /* convert it to 0xff00ff in r12 */ - - /* src1,2_scale */ - "add %[alpha], %[alpha], #1 \n\t" /* loading %[alpha]=src_scale=alpha+1 */ - - "cmp %[count], #2 \n\t" /* comparing count with 2 */ - "blt 2f \n\t" /* if less than 2 -> single loop */ - - /* Double Loop */ - "1: \n\t" /* <double loop> */ - "ldm %[src]!, {r5, r6} \n\t" /* loading src pointers into r5 and r6 */ - "ldm %[dst], {r7, r8} \n\t" /* loading dst pointers into r7 and r8 */ - - /* dst1_scale and dst2_scale*/ - "lsr r9, r5, #24 \n\t" /* src >> 24 */ - "lsr r10, r6, #24 \n\t" /* src >> 24 */ -#ifdef SK_ARM_HAS_EDSP - "smulbb r9, r9, %[alpha] \n\t" /* r9 = SkMulS16 r9 with src_scale */ - "smulbb r10, r10, %[alpha] \n\t" /* r10 = SkMulS16 r10 with src_scale */ -#else - "mul r9, r9, %[alpha] \n\t" /* r9 = SkMulS16 r9 with src_scale */ - "mul r10, r10, %[alpha] \n\t" /* r10 = SkMulS16 r10 with src_scale */ -#endif - "lsr r9, r9, #8 \n\t" /* r9 >> 8 */ - "lsr r10, r10, #8 \n\t" /* r10 >> 8 */ - "rsb r9, r9, #256 \n\t" /* dst1_scale = r9 = 255 - r9 + 1 */ - "rsb r10, r10, #256 \n\t" /* dst2_scale = r10 = 255 - r10 + 1 */ - - /* ---------------------- */ - - /* src1, src1_scale */ - "and r11, r12, r5, lsr #8 \n\t" /* ag = r11 = r5 masked by r12 lsr by #8 */ - "and r4, r12, r5 \n\t" /* rb = r4 = r5 masked by r12 */ - "mul r11, r11, %[alpha] \n\t" /* ag = r11 times src_scale */ - "mul r4, r4, %[alpha] \n\t" /* rb = r4 times src_scale */ - "and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */ - "and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */ - "orr r5, r11, r4 \n\t" /* r5 = (src1, src_scale) */ - - /* dst1, dst1_scale */ - "and r11, r12, r7, lsr #8 \n\t" /* ag = r11 = r7 masked by r12 lsr by #8 */ - "and r4, r12, r7 \n\t" /* rb = r4 = r7 masked by r12 */ - "mul r11, r11, r9 \n\t" /* ag = r11 times dst_scale (r9) */ - "mul r4, r4, r9 \n\t" /* rb = r4 times dst_scale (r9) */ - "and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */ - "and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */ - "orr r9, r11, r4 \n\t" /* r9 = (dst1, dst_scale) */ - - /* ---------------------- */ - "add r9, r5, r9 \n\t" /* *dst = src plus dst both scaled */ - /* ---------------------- */ - - /* ====================== */ - - /* src2, src2_scale */ - "and r11, r12, r6, lsr #8 \n\t" /* ag = r11 = r6 masked by r12 lsr by #8 */ - "and r4, r12, r6 \n\t" /* rb = r4 = r6 masked by r12 */ - "mul r11, r11, %[alpha] \n\t" /* ag = r11 times src_scale */ - "mul r4, r4, %[alpha] \n\t" /* rb = r4 times src_scale */ - "and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */ - "and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */ - "orr r6, r11, r4 \n\t" /* r6 = (src2, src_scale) */ - - /* dst2, dst2_scale */ - "and r11, r12, r8, lsr #8 \n\t" /* ag = r11 = r8 masked by r12 lsr by #8 */ - "and r4, r12, r8 \n\t" /* rb = r4 = r8 masked by r12 */ - "mul r11, r11, r10 \n\t" /* ag = r11 times dst_scale (r10) */ - "mul r4, r4, r10 \n\t" /* rb = r4 times dst_scale (r6) */ - "and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */ - "and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */ - "orr r10, r11, r4 \n\t" /* r10 = (dst2, dst_scale) */ - - "sub %[count], %[count], #2 \n\t" /* decrease count by 2 */ - /* ---------------------- */ - "add r10, r6, r10 \n\t" /* *dst = src plus dst both scaled */ - /* ---------------------- */ - "cmp %[count], #1 \n\t" /* compare count with 1 */ - /* ----------------- */ - "stm %[dst]!, {r9, r10} \n\t" /* copy r9 and r10 to r7 and r8 respectively */ - /* ----------------- */ - - "bgt 1b \n\t" /* if %[count] greater than 1 reloop */ - "blt 3f \n\t" /* if %[count] less than 1 exit */ - /* else get into the single loop */ - /* Single Loop */ - "2: \n\t" /* <single loop> */ - "ldr r5, [%[src]], #4 \n\t" /* loading src pointer into r5: r5=src */ - "ldr r7, [%[dst]] \n\t" /* loading dst pointer into r7: r7=dst */ - - "lsr r6, r5, #24 \n\t" /* src >> 24 */ - "and r8, r12, r5, lsr #8 \n\t" /* ag = r8 = r5 masked by r12 lsr by #8 */ -#ifdef SK_ARM_HAS_EDSP - "smulbb r6, r6, %[alpha] \n\t" /* r6 = SkMulS16 with src_scale */ -#else - "mul r6, r6, %[alpha] \n\t" /* r6 = SkMulS16 with src_scale */ -#endif - "and r9, r12, r5 \n\t" /* rb = r9 = r5 masked by r12 */ - "lsr r6, r6, #8 \n\t" /* r6 >> 8 */ - "mul r8, r8, %[alpha] \n\t" /* ag = r8 times scale */ - "rsb r6, r6, #256 \n\t" /* r6 = 255 - r6 + 1 */ - - /* src, src_scale */ - "mul r9, r9, %[alpha] \n\t" /* rb = r9 times scale */ - "and r8, r8, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */ - "and r9, r12, r9, lsr #8 \n\t" /* rb masked by mask (r12) */ - "orr r10, r8, r9 \n\t" /* r10 = (scr, src_scale) */ - - /* dst, dst_scale */ - "and r8, r12, r7, lsr #8 \n\t" /* ag = r8 = r7 masked by r12 lsr by #8 */ - "and r9, r12, r7 \n\t" /* rb = r9 = r7 masked by r12 */ - "mul r8, r8, r6 \n\t" /* ag = r8 times scale (r6) */ - "mul r9, r9, r6 \n\t" /* rb = r9 times scale (r6) */ - "and r8, r8, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */ - "and r9, r12, r9, lsr #8 \n\t" /* rb masked by mask (r12) */ - "orr r7, r8, r9 \n\t" /* r7 = (dst, dst_scale) */ - - "add r10, r7, r10 \n\t" /* *dst = src plus dst both scaled */ - - /* ----------------- */ - "str r10, [%[dst]], #4 \n\t" /* *dst = r10, postincrement dst by one (times 4) */ - /* ----------------- */ - - "3: \n\t" /* <exit> */ - : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count), [alpha] "+r" (alpha) - : - : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "memory" - ); - -} - -/////////////////////////////////////////////////////////////////////////////// +#include "SkBlitRow_opts_arm_neon.h" static const SkBlitRow::Proc16 sk_blitrow_platform_565_procs_arm[] = { - // no dither - // NOTE: For the functions below, we don't have a special version - // that assumes that each source pixel is opaque. But our S32A is - // still faster than the default, so use it. - S32A_D565_Opaque, // S32_D565_Opaque - NULL, // S32_D565_Blend - S32A_D565_Opaque, // S32A_D565_Opaque - NULL, // S32A_D565_Blend - - // dither - NULL, // S32_D565_Opaque_Dither - NULL, // S32_D565_Blend_Dither - NULL, // S32A_D565_Opaque_Dither - NULL, // S32A_D565_Blend_Dither + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; static const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm[] = { - NULL, // Color32A_D565, - NULL, // Color32A_D565_Dither + NULL, NULL, }; static const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm[] = { - NULL, // S32_Opaque, - NULL, // S32_Blend, - S32A_Opaque_BlitRow32_arm, // S32A_Opaque, - S32A_Blend_BlitRow32_arm // S32A_Blend + NULL, NULL, NULL, NULL, }; -#endif // USE_ARM_CODE - SkBlitRow::Proc16 SkBlitRow::PlatformFactory565(unsigned flags) { return SK_ARM_NEON_WRAP(sk_blitrow_platform_565_procs_arm)[flags]; } diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp index 4a6514af7f..27ed891363 100644 --- a/src/opts/SkBlitRow_opts_arm_neon.cpp +++ b/src/opts/SkBlitRow_opts_arm_neon.cpp @@ -635,8 +635,7 @@ void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, vdst = vld1q_u16(dst); #ifdef SK_CPU_ARM64 vsrc = sk_vld4_u8_arm64_4(src); -#else -#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) +#elif (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) asm ( "vld4.u8 %h[vsrc], [%[src]]!" : [vsrc] "=w" (vsrc), [src] "+&r" (src) @@ -659,7 +658,6 @@ void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, vsrc.val[2] = d2; vsrc.val[3] = d3; #endif -#endif // #ifdef SK_CPU_ARM64 // deinterleave dst @@ -1311,37 +1309,6 @@ void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, /////////////////////////////////////////////////////////////////////////////// -#undef DEBUG_OPAQUE_DITHER - -#if defined(DEBUG_OPAQUE_DITHER) -static void showme8(char *str, void *p, int len) -{ - static char buf[256]; - char tbuf[32]; - int i; - char *pc = (char*) p; - sprintf(buf,"%8s:", str); - for(i=0;i<len;i++) { - sprintf(tbuf, " %02x", pc[i]); - strcat(buf, tbuf); - } - SkDebugf("%s\n", buf); -} -static void showme16(char *str, void *p, int len) -{ - static char buf[256]; - char tbuf[32]; - int i; - uint16_t *pc = (uint16_t*) p; - sprintf(buf,"%8s:", str); - len = (len / sizeof(uint16_t)); /* passed as bytes */ - for(i=0;i<len;i++) { - sprintf(tbuf, " %04x", pc[i]); - strcat(buf, tbuf); - } - SkDebugf("%s\n", buf); -} -#endif #endif // #ifdef SK_CPU_ARM32 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, @@ -1353,17 +1320,6 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, if (count >= UNROLL) { -#if defined(DEBUG_OPAQUE_DITHER) - uint16_t tmpbuf[UNROLL]; - int td[UNROLL]; - int tdv[UNROLL]; - int ta[UNROLL]; - int tap[UNROLL]; - uint16_t in_dst[UNROLL]; - int offset = 0; - int noisy = 0; -#endif - uint8x8_t dbase; const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; dbase = vld1_u8(dstart); @@ -1374,52 +1330,6 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, uint16x8_t dst8, scale8, alpha8; uint16x8_t dst_r, dst_g, dst_b; -#if defined(DEBUG_OPAQUE_DITHER) - // calculate 8 elements worth into a temp buffer - { - int my_y = y; - int my_x = x; - SkPMColor* my_src = (SkPMColor*)src; - uint16_t* my_dst = dst; - int i; - - DITHER_565_SCAN(my_y); - for(i = 0; i < UNROLL; i++) { - SkPMColor c = *my_src++; - SkPMColorAssert(c); - if (c) { - unsigned a = SkGetPackedA32(c); - - int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a)); - tdv[i] = DITHER_VALUE(my_x); - ta[i] = a; - tap[i] = SkAlpha255To256(a); - td[i] = d; - - unsigned sr = SkGetPackedR32(c); - unsigned sg = SkGetPackedG32(c); - unsigned sb = SkGetPackedB32(c); - sr = SkDITHER_R32_FOR_565(sr, d); - sg = SkDITHER_G32_FOR_565(sg, d); - sb = SkDITHER_B32_FOR_565(sb, d); - - uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); - uint32_t dst_expanded = SkExpand_rgb_16(*my_dst); - dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); - // now src and dst expanded are in g:11 r:10 x:1 b:10 - tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); - td[i] = d; - } else { - tmpbuf[i] = *my_dst; - ta[i] = tdv[i] = td[i] = 0xbeef; - } - in_dst[i] = *my_dst; - my_dst += 1; - DITHER_INC_X(my_x); - } - } -#endif - #ifdef SK_CPU_ARM64 vsrc = sk_vld4_u8_arm64_4(src); #else @@ -1489,43 +1399,6 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, vst1q_u16(dst, dst8); -#if defined(DEBUG_OPAQUE_DITHER) - // verify my 8 elements match the temp buffer - { - int i, bad=0; - static int invocation; - - for (i = 0; i < UNROLL; i++) { - if (tmpbuf[i] != dst[i]) { - bad=1; - } - } - if (bad) { - SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n", - invocation, offset); - SkDebugf(" alpha 0x%x\n", alpha); - for (i = 0; i < UNROLL; i++) - SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n", - i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[i], - in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]); - - showme16("alpha8", &alpha8, sizeof(alpha8)); - showme16("scale8", &scale8, sizeof(scale8)); - showme8("d", &d, sizeof(d)); - showme16("dst8", &dst8, sizeof(dst8)); - showme16("dst_b", &dst_b, sizeof(dst_b)); - showme16("dst_g", &dst_g, sizeof(dst_g)); - showme16("dst_r", &dst_r, sizeof(dst_r)); - showme8("sb", &sb, sizeof(sb)); - showme8("sg", &sg, sizeof(sg)); - showme8("sr", &sr, sizeof(sr)); - - return; - } - offset += UNROLL; - invocation++; - } -#endif dst += UNROLL; count -= UNROLL; // skip x += UNROLL, since it's unchanged mod-4 @@ -1569,8 +1442,6 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, /////////////////////////////////////////////////////////////////////////////// -#undef DEBUG_S32_OPAQUE_DITHER - void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src, int count, U8CPU alpha, int x, int y) { @@ -1637,25 +1508,6 @@ void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, // store it vst1q_u16(dst, dst8); -#if defined(DEBUG_S32_OPAQUE_DITHER) - // always good to know if we generated good results - { - int i, myx = x, myy = y; - DITHER_565_SCAN(myy); - for (i=0;i<UNROLL;i++) { - // the '!' in the asm block above post-incremented src by the 8 pixels it reads. - SkPMColor c = src[i-8]; - unsigned dither = DITHER_VALUE(myx); - uint16_t val = SkDitherRGB32To565(c, dither); - if (val != dst[i]) { - SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n", - c, dither, val, dst[i], dstart[i]); - } - DITHER_INC_X(myx); - } - } -#endif - dst += UNROLL; // we don't need to increment src as the asm above has already done it count -= UNROLL; |