#include <stdint.h>
#include <stddef.h>
#include <arm_neon.h>

// RGB565 layout: 5-bit red (high bits), 6-bit green, 5-bit blue (low bits).
#define SK_B16_BITS 5
#define SK_G16_BITS 6
#define SK_R16_BITS 5

#define SK_R16_SHIFT (SK_B16_BITS + SK_G16_BITS)
#define SK_G16_SHIFT (SK_B16_BITS)

#define SK_G16_MASK ((1 << SK_G16_BITS) - 1)
#define SK_B16_MASK ((1 << SK_B16_BITS) - 1)
#define SK_G16_MASK_IN_PLACE (SK_G16_MASK << SK_G16_SHIFT)

// Spread an RGB565 pixel into 32 bits: green moves to the upper halfword,
// red and blue stay in the lower halfword, so a multiply by a 5-bit scale
// cannot carry between fields.
static inline uint32_t SkExpand_rgb_16(unsigned c) {
    return ((c & SK_G16_MASK_IN_PLACE) << 16) | (c & ~SK_G16_MASK_IN_PLACE);
}

// Inverse of SkExpand_rgb_16: fold green back into its RGB565 position.
static inline unsigned SkCompact_rgb_16(uint32_t c) {
    return ((c >> 16) & SK_G16_MASK_IN_PLACE) | (c & ~SK_G16_MASK_IN_PLACE);
}

// Scalar vertical blit: blends a column of `height` pixels (height > 0),
// stepping by deviceRB bytes per row.
void SkRGB16_Opaque_Blitter_blitV_c(uint16_t* device, size_t deviceRB, int height, uint8_t alpha) {
    unsigned scale5 = alpha >> 3;       // coverage reduced to 0..31
    uint32_t src32 = 256 * scale5;      // fixed source term, pre-multiplied by the coverage scale
    scale5 = 32 - scale5;               // remaining weight for the destination
    do {
        uint32_t dst32 = SkExpand_rgb_16(*device) * scale5;
        *device = SkCompact_rgb_16((src32 + dst32) >> 5);
        device = (uint16_t*)((char*)device + deviceRB);
    } while (--height != 0);
}

// Load/store one 16-bit lane and advance by one row (deviceRB bytes).
#define LOAD_LANE_16(reg, n) \
    reg = vld1q_lane_u16(device, reg, n); \
    device = (uint16_t*)((char*)device + deviceRB);

#define STORE_LANE_16(reg, n) \
    vst1_lane_u16(dst, reg, n); \
    dst = (uint16_t*)((char*)dst + deviceRB);

// NEON vertical blit: gathers 8 rows into one vector, blends them, then
// scatters the results back; a scalar loop handles the remaining rows.
void SkRGB16_Opaque_Blitter_blitV_neon(uint16_t* device, size_t deviceRB, int height, uint8_t alpha) {
    unsigned scale = alpha >> 3;
    uint32_t src32 = 256 * scale;
    scale = 32 - scale;
    if (height >= 8) {
        uint16_t* dst = device;

        // prepare constants
        uint16x8_t vdev = vdupq_n_u16(0);
        uint16x8_t vmaskq_g16 = vdupq_n_u16(SK_G16_MASK_IN_PLACE);
        uint16x8_t vmaskq_ng16 = vdupq_n_u16(~SK_G16_MASK_IN_PLACE);
        uint32x4_t vsrc32 = vdupq_n_u32(src32);
        uint32x4_t vscale5 = vdupq_n_u32((uint32_t)scale);

        while (height >= 8) {
            LOAD_LANE_16(vdev, 0)
            LOAD_LANE_16(vdev, 1)
            LOAD_LANE_16(vdev, 2)
            LOAD_LANE_16(vdev, 3)
            LOAD_LANE_16(vdev, 4)
            LOAD_LANE_16(vdev, 5)
            LOAD_LANE_16(vdev, 6)
            LOAD_LANE_16(vdev, 7)

            // Expand_rgb_16: interleave (red|blue) and green halves into 32-bit lanes
            uint16x8x2_t vdst = vzipq_u16((vdev & vmaskq_ng16), (vdev & vmaskq_g16));
            uint32x4_t vdst32_lo = vmulq_u32(vreinterpretq_u32_u16(vdst.val[0]), vscale5);
            uint32x4_t vdst32_hi = vmulq_u32(vreinterpretq_u32_u16(vdst.val[1]), vscale5);

            // Compact_rgb_16: add the source term, shift, and recombine the fields
            vdst32_lo = vaddq_u32(vdst32_lo, vsrc32);
            vdst32_hi = vaddq_u32(vdst32_hi, vsrc32);
            vdst32_lo = vshrq_n_u32(vdst32_lo, 5);
            vdst32_hi = vshrq_n_u32(vdst32_hi, 5);
            uint16x4_t vtmp_lo = vmovn_u32(vdst32_lo) & vget_low_u16(vmaskq_ng16);
            uint16x4_t vtmp_hi = vshrn_n_u32(vdst32_lo, 16) & vget_low_u16(vmaskq_g16);
            uint16x4_t vdst16_lo = vorr_u16(vtmp_lo, vtmp_hi);
            vtmp_lo = vmovn_u32(vdst32_hi) & vget_low_u16(vmaskq_ng16);
            vtmp_hi = vshrn_n_u32(vdst32_hi, 16) & vget_low_u16(vmaskq_g16);
            uint16x4_t vdst16_hi = vorr_u16(vtmp_lo, vtmp_hi);

            STORE_LANE_16(vdst16_lo, 0)
            STORE_LANE_16(vdst16_lo, 1)
            STORE_LANE_16(vdst16_lo, 2)
            STORE_LANE_16(vdst16_lo, 3)
            STORE_LANE_16(vdst16_hi, 0)
            STORE_LANE_16(vdst16_hi, 1)
            STORE_LANE_16(vdst16_hi, 2)
            STORE_LANE_16(vdst16_hi, 3)
            height -= 8;
        }
    }
    while (height != 0) {
        uint32_t dst32 = SkExpand_rgb_16(*device) * scale;
        *device = SkCompact_rgb_16((src32 + dst32) >> 5);
        device = (uint16_t*)((char*)device + deviceRB);
        height--;
    }
}

#undef LOAD_LANE_16
#undef STORE_LANE_16

// Scalar horizontal blit: blends `width` consecutive pixels (width > 0).
void SkRGB16_Opaque_Blitter_blitH_c(uint16_t* device, int width, uint8_t alpha) {
    unsigned scale5 = alpha >> 3;
    uint32_t src32 = 256 * scale5;
    scale5 = 32 - scale5;
    do {
        uint32_t dst32 = SkExpand_rgb_16(*device) * scale5;
        *device++ = SkCompact_rgb_16((src32 + dst32) >> 5);
    } while (--width != 0);
}
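/*
 * Added illustration, not part of the original blitter code: the
 * expand/compact pair lets a whole RGB565 pixel be weighted with a single
 * 32-bit multiply.  SkExpand_rgb_16 parks the 6-bit green field in the upper
 * halfword while red and blue stay in the lower halfword, so multiplying by
 * a 0..32 scale cannot carry from one field into the next; after adding the
 * pre-scaled source term and shifting right by 5, SkCompact_rgb_16 folds
 * green back into place.  The helper below (a made-up name, unused by the
 * blitters) just spells out that round trip for a single pixel.
 */
static inline uint16_t blend_one_rgb565_sketch(uint16_t dst, uint32_t expanded_src,
                                               unsigned scale5 /* 0..32 */) {
    uint32_t src32 = expanded_src * scale5;                   // source, pre-scaled
    uint32_t dst32 = SkExpand_rgb_16(dst) * (32 - scale5);    // destination, scaled
    return (uint16_t)SkCompact_rgb_16((src32 + dst32) >> 5);  // back to RGB565
}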
// NEON horizontal blit: processes 8 contiguous pixels per iteration with full
// vector loads/stores; a scalar loop handles the remaining pixels.
void SkRGB16_Opaque_Blitter_blitH_neon(uint16_t* device, int width, uint8_t alpha) {
    unsigned scale = alpha >> 3;
    uint32_t src32 = 256 * scale;
    scale = 32 - scale;
    if (width >= 8) {
        // prepare constants
        uint16x8_t vdev = vdupq_n_u16(0);
        uint16x8_t vmaskq_g16 = vdupq_n_u16(SK_G16_MASK_IN_PLACE);
        uint16x8_t vmaskq_ng16 = vdupq_n_u16(~SK_G16_MASK_IN_PLACE);
        uint32x4_t vsrc32 = vdupq_n_u32(src32);
        uint32x4_t vscale5 = vdupq_n_u32((uint32_t)scale);

        while (width >= 8) {
            vdev = vld1q_u16(device);

            // Expand_rgb_16: interleave (red|blue) and green halves into 32-bit lanes
            uint16x8x2_t vdst = vzipq_u16((vdev & vmaskq_ng16), (vdev & vmaskq_g16));
            uint32x4_t vdst32_lo = vmulq_u32(vreinterpretq_u32_u16(vdst.val[0]), vscale5);
            uint32x4_t vdst32_hi = vmulq_u32(vreinterpretq_u32_u16(vdst.val[1]), vscale5);

            // Compact_rgb_16: add the source term, shift, and recombine the fields
            vdst32_lo = vaddq_u32(vdst32_lo, vsrc32);
            vdst32_hi = vaddq_u32(vdst32_hi, vsrc32);
            vdst32_lo = vshrq_n_u32(vdst32_lo, 5);
            vdst32_hi = vshrq_n_u32(vdst32_hi, 5);
            uint16x4_t vtmp_lo = vmovn_u32(vdst32_lo) & vget_low_u16(vmaskq_ng16);
            uint16x4_t vtmp_hi = vshrn_n_u32(vdst32_lo, 16) & vget_low_u16(vmaskq_g16);
            uint16x4_t vdst16_lo = vorr_u16(vtmp_lo, vtmp_hi);
            vtmp_lo = vmovn_u32(vdst32_hi) & vget_low_u16(vmaskq_ng16);
            vtmp_hi = vshrn_n_u32(vdst32_hi, 16) & vget_low_u16(vmaskq_g16);
            uint16x4_t vdst16_hi = vorr_u16(vtmp_lo, vtmp_hi);

            vst1q_u16(device, vcombine_u16(vdst16_lo, vdst16_hi));
            device += 8;
            width -= 8;
        }
    }
    while (width != 0) {
        uint32_t dst32 = SkExpand_rgb_16(*device) * scale;
        *device++ = SkCompact_rgb_16((src32 + dst32) >> 5);
        width--;
    }
}
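/*
 * Illustrative usage sketch, not from the original source: fills a small
 * RGB565 scanline, blends it with the scalar and NEON blitH paths, and
 * checks that both produce identical pixels.  The BLIT_DEMO guard, buffer
 * length, test pattern, and alpha value are all arbitrary choices for this
 * demo; build with -DBLIT_DEMO on a target that provides <arm_neon.h>.
 */
#ifdef BLIT_DEMO
#include <stdio.h>
#include <string.h>

int main(void) {
    enum { N = 19 };                        /* odd length exercises the scalar tail */
    uint16_t row_c[N], row_neon[N];
    for (int i = 0; i < N; i++) {
        row_c[i] = (uint16_t)(0x1234 + i * 0x0101);   /* arbitrary RGB565 pattern */
    }
    memcpy(row_neon, row_c, sizeof(row_c));

    uint8_t alpha = 0x80;                   /* roughly half coverage */
    SkRGB16_Opaque_Blitter_blitH_c(row_c, N, alpha);
    SkRGB16_Opaque_Blitter_blitH_neon(row_neon, N, alpha);

    if (memcmp(row_c, row_neon, sizeof(row_c)) != 0) {
        printf("mismatch between scalar and NEON blitH\n");
        return 1;
    }
    printf("scalar and NEON blitH agree\n");
    return 0;
}
#endif /* BLIT_DEMO */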