ARM Skia NEON patches - 11 - Blitter_RGB16

Blitter_RGB16: fixes and improvements - fix alpha calculation: it was still using the old version of SkAlpha255To256. 11 more tests pass in gm. - clean a lot the code: the existing code was "a bit" messy with a lot of duplicated hardcoded constants, got rid of all this. - improve speed a little: part of it as a side-effect of the change in the way alpha is calculated but also by grouping loads and stores. One "issue" was present and still remains: the NEON code doesn't give the same result as the black blitter on black. It accounts for dozens of mismatches in gm. Is this considered "not too bad"? Would you be interested in a NEON version of the black blitter? The current comments seem to indicate that the black blitter is here only to give a performance boost when NEON is not presents so I didn't write a NEON version. BUG= R=djsollen@google.com, tomhudson@google.com, reed@google.com Author: kevin.petit.arm@gmail.com Review URL: https://chromiumcodereview.appspot.com/18666005 git-svn-id: http://skia.googlecode.com/svn/trunk/src@10635 2bbb7eff-a529-9590-31e7-b0007b416f81
author: commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> 2013-08-08 10:51:45 +0000
committer: Ben Murdoch <benm@google.com> 2013-08-12 09:13:03 +0100
commit: de821746af44fb21710fb85b899213f9bb652543 (patch)
tree: 39995e4280db2cbf2f42191a1879a8a328335b5c
parent: 4ca6e60afcc520251576a6d502d96fcbcb4c3233 (diff)
download: src-de821746af44fb21710fb85b899213f9bb652543.tar.gz
1 files changed, 45 insertions, 55 deletions
diff --git a/core/SkBlitter_RGB16.cpp b/core/SkBlitter_RGB16.cpp
index cb572325..256cbc69 100644
--- a/core/SkBlitter_RGB16.cpp
+++ b/core/SkBlitter_RGB16.cpp
@@ -390,63 +390,53 @@ void SkRGB16_Opaque_Blitter::blitMask(const SkMask& mask,
     do {
         int w = width;
         if (w >= UNROLL) {
-            uint32x4_t color;        /* can use same one */
-            uint32x4_t dev_lo, dev_hi;
-            uint32x4_t t1;
-            uint32x4_t wn1, wn2;
-            uint16x4_t odev_lo, odev_hi;
-            uint16x4_t alpha_lo, alpha_hi;
-            uint16x8_t  alpha_full;
-
+            uint32x4_t color, dev_lo, dev_hi;
+            uint32x4_t wn1, wn2, tmp;
+            uint32x4_t vmask_g16, vmask_ng16;
+            uint16x8_t valpha, vdev;
+            uint16x4_t odev_lo, odev_hi, valpha_lo, valpha_hi;
+
+            // prepare constants
+            vmask_g16 = vdupq_n_u32(SK_G16_MASK_IN_PLACE);
+            vmask_ng16 = vdupq_n_u32(~SK_G16_MASK_IN_PLACE);
             color = vdupq_n_u32(expanded32);
 
             do {
-                /* alpha is 8x8, widen and split to get pair of 16x4's */
-                alpha_full = vmovl_u8(vld1_u8(alpha));
-                alpha_full = vaddq_u16(alpha_full, vshrq_n_u16(alpha_full,7));
-                alpha_full = vshrq_n_u16(alpha_full, 3);
-                alpha_lo = vget_low_u16(alpha_full);
-                alpha_hi = vget_high_u16(alpha_full);
-
-                dev_lo = vmovl_u16(vld1_u16(device));
-                dev_hi = vmovl_u16(vld1_u16(device+4));
-
-                /* unpack in 32 bits */
-                dev_lo = vorrq_u32(
-                                   vandq_u32(dev_lo, vdupq_n_u32(0x0000F81F)),
-                                   vshlq_n_u32(vandq_u32(dev_lo,
-                                                         vdupq_n_u32(0x000007E0)),
-                                               16)
-                                   );
-                dev_hi = vorrq_u32(
-                                   vandq_u32(dev_hi, vdupq_n_u32(0x0000F81F)),
-                                   vshlq_n_u32(vandq_u32(dev_hi,
-                                                         vdupq_n_u32(0x000007E0)),
-                                               16)
-                                   );
-
-                /* blend the two */
-                t1 = vmulq_u32(vsubq_u32(color, dev_lo), vmovl_u16(alpha_lo));
-                t1 = vshrq_n_u32(t1, 5);
-                dev_lo = vaddq_u32(dev_lo, t1);
-
-                t1 = vmulq_u32(vsubq_u32(color, dev_hi), vmovl_u16(alpha_hi));
-                t1 = vshrq_n_u32(t1, 5);
-                dev_hi = vaddq_u32(dev_hi, t1);
-
-                /* re-compact and store */
-                wn1 = vandq_u32(dev_lo, vdupq_n_u32(0x0000F81F)),
-                wn2 = vshrq_n_u32(dev_lo, 16);
-                wn2 = vandq_u32(wn2, vdupq_n_u32(0x000007E0));
-                odev_lo = vmovn_u32(vorrq_u32(wn1, wn2));
-
-                wn1 = vandq_u32(dev_hi, vdupq_n_u32(0x0000F81F)),
-                wn2 = vshrq_n_u32(dev_hi, 16);
-                wn2 = vandq_u32(wn2, vdupq_n_u32(0x000007E0));
-                odev_hi = vmovn_u32(vorrq_u32(wn1, wn2));
-
-                vst1_u16(device, odev_lo);
-                vst1_u16(device+4, odev_hi);
+                // alpha is 8x8, widen and split to get a pair of 16x4
+                valpha = vaddw_u8(vdupq_n_u16(1), vld1_u8(alpha));
+                valpha = vshrq_n_u16(valpha, 3);
+                valpha_lo = vget_low_u16(valpha);
+                valpha_hi = vget_high_u16(valpha);
+
+                // load pixels
+                vdev = vld1q_u16(device);
+                dev_lo = vmovl_u16(vget_low_u16(vdev));
+                dev_hi = vmovl_u16(vget_high_u16(vdev));
+
+                // unpack them in 32 bits
+                dev_lo = (dev_lo & vmask_ng16) | vshlq_n_u32(dev_lo & vmask_g16, 16);
+                dev_hi = (dev_hi & vmask_ng16) | vshlq_n_u32(dev_hi & vmask_g16, 16);
+
+                // blend with color
+                tmp = (color - dev_lo) * vmovl_u16(valpha_lo);
+                tmp = vshrq_n_u32(tmp, 5);
+                dev_lo += tmp;
+
+                tmp = vmulq_u32(color - dev_hi, vmovl_u16(valpha_hi));
+                tmp = vshrq_n_u32(tmp, 5);
+                dev_hi += tmp;
+
+                // re-compact
+                wn1 = dev_lo & vmask_ng16;
+                wn2 = vshrq_n_u32(dev_lo, 16) & vmask_g16;
+                odev_lo = vmovn_u32(wn1 | wn2);
+
+                wn1 = dev_hi & vmask_ng16;
+                wn2 = vshrq_n_u32(dev_hi, 16) & vmask_g16;
+                odev_hi = vmovn_u32(wn1 | wn2);
+
+                // store
+                vst1q_u16(device, vcombine_u16(odev_lo, odev_hi));
 
                 device += UNROLL;
                 alpha += UNROLL;
@@ -454,7 +444,7 @@ void SkRGB16_Opaque_Blitter::blitMask(const SkMask& mask,
             } while (w >= UNROLL);
         }
 
-        /* residuals (which is everything if we have no neon) */
+        // residuals
         while (w > 0) {
             *device = blend_compact(expanded32, SkExpand_rgb_16(*device),
                                     SkAlpha255To256(*alpha++) >> 3);
author	commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81>	2013-08-08 10:51:45 +0000
committer	Ben Murdoch <benm@google.com>	2013-08-12 09:13:03 +0100
commit	de821746af44fb21710fb85b899213f9bb652543 (patch)
tree	39995e4280db2cbf2f42191a1879a8a328335b5c
parent	4ca6e60afcc520251576a6d502d96fcbcb4c3233 (diff)
download	src-de821746af44fb21710fb85b899213f9bb652543.tar.gz