Merge "Sync-patch with libwebp ver 0.4.2"

author: Vikas Arora <vikasa@google.com> 2015-01-29 17:29:29 +0000
committer: Android (Google) Code Review <android-gerrit@google.com> 2015-01-29 17:29:30 +0000
commit: 9dbaf404293aff5b35993a9d6c7a3b45aeba1c52 (patch)
tree: d7ca3ab2344e6eaae0c5af176fc998b317952c23 /src/dsp
parent: b01fe55d0ee2435cf881e68989b599563ae543e8 (diff)
parent: 8c098653157979e397d3954fc2ea0ee43bae6ab2 (diff)
download: webp-9dbaf404293aff5b35993a9d6c7a3b45aeba1c52.tar.gz
7 files changed, 186 insertions, 37 deletions
diff --git a/src/dsp/alpha_processing.c b/src/dsp/alpha_processing.c
index 09deacfb..d0f7a6cc 100644
--- a/src/dsp/alpha_processing.c
+++ b/src/dsp/alpha_processing.c
@@ -284,15 +284,46 @@ static void ApplyAlphaMultiply_16b(uint8_t* rgba4444,
 #endif
 }
 
+static int ExtractAlpha(const uint8_t* argb, int argb_stride,
+                        int width, int height,
+                        uint8_t* alpha, int alpha_stride) {
+  uint8_t alpha_mask = 0xff;
+  int i, j;
+
+  for (j = 0; j < height; ++j) {
+    for (i = 0; i < width; ++i) {
+      const uint8_t alpha_value = argb[4 * i];
+      alpha[i] = alpha_value;
+      alpha_mask &= alpha_value;
+    }
+    argb += argb_stride;
+    alpha += alpha_stride;
+  }
+  return (alpha_mask == 0xff);
+}
+
 void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
 void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
+int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
 
 //------------------------------------------------------------------------------
 // Init function
 
+extern void WebPInitAlphaProcessingSSE2(void);
+
 void WebPInitAlphaProcessing(void) {
   WebPMultARGBRow = MultARGBRow;
   WebPMultRow = MultRow;
   WebPApplyAlphaMultiply = ApplyAlphaMultiply;
   WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b;
+  WebPExtractAlpha = ExtractAlpha;
+
+  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      WebPInitAlphaProcessingSSE2();
+    }
+#endif
+  }
 }
diff --git a/src/dsp/alpha_processing_sse2.c b/src/dsp/alpha_processing_sse2.c
new file mode 100644
index 00000000..3d0a9b57
--- /dev/null
+++ b/src/dsp/alpha_processing_sse2.c
@@ -0,0 +1,77 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Utilities for processing transparent channel.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+#include <emmintrin.h>
+
+//------------------------------------------------------------------------------
+
+static int ExtractAlpha(const uint8_t* argb, int argb_stride,
+                        int width, int height,
+                        uint8_t* alpha, int alpha_stride) {
+  // alpha_and stores an 'and' operation of all the alpha[] values. The final
+  // value is not 0xff if any of the alpha[] is not equal to 0xff.
+  uint32_t alpha_and = 0xff;
+  int i, j;
+  const __m128i a_mask = _mm_set1_epi32(0xffu);  // to preserve alpha
+  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
+  __m128i all_alphas = all_0xff;
+
+  // We must be able to access 3 extra bytes after the last written byte
+  // 'src[4 * width - 4]', because we don't know if alpha is the first or the
+  // last byte of the quadruplet.
+  const int limit = (width - 1) & ~7;
+
+  for (j = 0; j < height; ++j) {
+    const __m128i* src = (const __m128i*)argb;
+    for (i = 0; i < limit; i += 8) {
+      // load 32 argb bytes
+      const __m128i a0 = _mm_loadu_si128(src + 0);
+      const __m128i a1 = _mm_loadu_si128(src + 1);
+      const __m128i b0 = _mm_and_si128(a0, a_mask);
+      const __m128i b1 = _mm_and_si128(a1, a_mask);
+      const __m128i c0 = _mm_packs_epi32(b0, b1);
+      const __m128i d0 = _mm_packus_epi16(c0, c0);
+      // store
+      _mm_storel_epi64((__m128i*)&alpha[i], d0);
+      // accumulate eight alpha 'and' in parallel
+      all_alphas = _mm_and_si128(all_alphas, d0);
+      src += 2;
+    }
+    for (; i < width; ++i) {
+      const uint32_t alpha_value = argb[4 * i];
+      alpha[i] = alpha_value;
+      alpha_and &= alpha_value;
+    }
+    argb += argb_stride;
+    alpha += alpha_stride;
+  }
+  // Combine the eight alpha 'and' into a 8-bit mask.
+  alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
+  return (alpha_and == 0xff);
+}
+
+#endif   // WEBP_USE_SSE2
+
+//------------------------------------------------------------------------------
+// Init function
+
+extern void WebPInitAlphaProcessingSSE2(void);
+
+void WebPInitAlphaProcessingSSE2(void) {
+#if defined(WEBP_USE_SSE2)
+  WebPExtractAlpha = ExtractAlpha;
+#endif
+}
diff --git a/src/dsp/cpu.c b/src/dsp/cpu.c
index 581b5e30..70ba2ab0 100644
--- a/src/dsp/cpu.c
+++ b/src/dsp/cpu.c
@@ -57,7 +57,7 @@ static WEBP_INLINE uint64_t xgetbv(void) {
 }
 #elif defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219  // >= VS2010 SP1
 #define xgetbv() _xgetbv(0)
-#elif defined(_M_IX86)
+#elif defined(_MSC_VER) && defined(_M_IX86)
 static WEBP_INLINE uint64_t xgetbv(void) {
   uint32_t eax_, edx_;
   __asm {
diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h
index 8208da53..3b31ae08 100644
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@@ -32,9 +32,19 @@ extern "C" {
 # define LOCAL_GCC_PREREQ(maj, min) \
     (LOCAL_GCC_VERSION >= (((maj) << 8) | (min)))
 #else
+# define LOCAL_GCC_VERSION 0
 # define LOCAL_GCC_PREREQ(maj, min) 0
 #endif
 
+#ifdef __clang__
+# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
+# define LOCAL_CLANG_PREREQ(maj, min) \
+    (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
+#else
+# define LOCAL_CLANG_VERSION 0
+# define LOCAL_CLANG_PREREQ(maj, min) 0
+#endif  // __clang__
+
 #if defined(_MSC_VER) && _MSC_VER > 1310 && \
     (defined(_M_X64) || defined(_M_IX86))
 #define WEBP_MSC_SSE2  // Visual C++ SSE2 targets
@@ -62,6 +72,9 @@ extern "C" {
 
 #if defined(__mips__) && !defined(__mips64) && (__mips_isa_rev < 6)
 #define WEBP_USE_MIPS32
+#if (__mips_isa_rev >= 2)
+#define WEBP_USE_MIPS32_R2
+#endif
 #endif
 
 typedef enum {
@@ -244,6 +257,13 @@ extern void (*WebPApplyAlphaMultiply)(
 extern void (*WebPApplyAlphaMultiply4444)(
     uint8_t* rgba4444, int w, int h, int stride);
 
+// Extract the alpha values from 32b values in argb[] and pack them into alpha[]
+// (this is the opposite of WebPDispatchAlpha).
+// Returns true if there's only trivial 0xff alpha values.
+extern int (*WebPExtractAlpha)(const uint8_t* argb, int argb_stride,
+                               int width, int height,
+                               uint8_t* alpha, int alpha_stride);
+
 // Pre-Multiply operation transforms x into x * A / 255  (where x=Y,R,G or B).
 // Un-Multiply operation transforms x into x * 255 / A.
 
diff --git a/src/dsp/enc_neon.c b/src/dsp/enc_neon.c
index 1e712c52..42041f73 100644
--- a/src/dsp/enc_neon.c
+++ b/src/dsp/enc_neon.c
@@ -253,7 +253,7 @@ static void ITransform(const uint8_t* ref,
 
 // Load all 4x4 pixels into a single uint8x16_t variable.
 static uint8x16_t Load4x4(const uint8_t* src) {
-  uint32x4_t out = { 0, 0, 0, 0 };
+  uint32x4_t out = vdupq_n_u32(0);
   out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
   out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
   out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
@@ -929,7 +929,7 @@ static int SumToInt(uint32x4_t sum) {
 }
 
 static int SSE16x16(const uint8_t* a, const uint8_t* b) {
-  uint32x4_t sum = { 0, 0, 0, 0 };
+  uint32x4_t sum = vdupq_n_u32(0);
   int y;
   for (y = 0; y < 16; ++y) {
     AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
@@ -938,7 +938,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) {
 }
 
 static int SSE16x8(const uint8_t* a, const uint8_t* b) {
-  uint32x4_t sum = { 0, 0, 0, 0 };
+  uint32x4_t sum = vdupq_n_u32(0);
   int y;
   for (y = 0; y < 8; ++y) {
     AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
@@ -947,7 +947,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) {
 }
 
 static int SSE8x8(const uint8_t* a, const uint8_t* b) {
-  uint32x4_t sum = { 0, 0, 0, 0 };
+  uint32x4_t sum = vdupq_n_u32(0);
   int y;
   for (y = 0; y < 8; ++y) {
     const uint8x8_t a0 = vld1_u8(a + y * BPS);
@@ -970,9 +970,8 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
 
 //------------------------------------------------------------------------------
 
-// Compilation with gcc-4.6.x is problematic for now and vtbl? are unavailable
-// in iOS/arm64 builds. Disable this function in those cases.
-#if !(defined(WORK_AROUND_GCC) || defined(__aarch64__))
+// Compilation with gcc-4.6.x is problematic for now.
+#if !defined(WORK_AROUND_GCC)
 
 static int16x8_t Quantize(int16_t* const in,
                           const VP8Matrix* const mtx, int offset) {
@@ -1002,27 +1001,44 @@ static int16x8_t Quantize(int16_t* const in,
 }
 
 static const uint8_t kShuffles[4][8] = {
- { 0,   1,  2,  3,  8,  9, 16, 17 },
- { 10, 11,  4,  5,  6,  7, 12, 13 },
- { 18, 19, 24, 25, 26, 27, 20, 21 },
- { 14, 15, 22, 23, 28, 29, 30, 31 }
+  { 0,   1,  2,  3,  8,  9, 16, 17 },
+  { 10, 11,  4,  5,  6,  7, 12, 13 },
+  { 18, 19, 24, 25, 26, 27, 20, 21 },
+  { 14, 15, 22, 23, 28, 29, 30, 31 }
 };
 
 static int QuantizeBlock(int16_t in[16], int16_t out[16],
                          const VP8Matrix* const mtx) {
   const int16x8_t out0 = Quantize(in, mtx, 0);
   const int16x8_t out1 = Quantize(in, mtx, 8);
+  uint8x8x4_t shuffles;
+  // vtbl4_u8 is marked unavailable for iOS arm64, use wider versions there.
+#if defined(__APPLE__) && defined(__aarch64__)
+  uint8x16x2_t all_out;
+  INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
+  INIT_VECTOR4(shuffles,
+               vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),
+               vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),
+               vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),
+               vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));
+#else
   uint8x8x4_t all_out;
   INIT_VECTOR4(all_out,
                vreinterpret_u8_s16(vget_low_s16(out0)),
                vreinterpret_u8_s16(vget_high_s16(out0)),
                vreinterpret_u8_s16(vget_low_s16(out1)),
                vreinterpret_u8_s16(vget_high_s16(out1)));
+  INIT_VECTOR4(shuffles,
+               vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
+               vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
+               vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
+               vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
+#endif
   // Zigzag reordering
-  vst1_u8((uint8_t*)(out +  0), vtbl4_u8(all_out, vld1_u8(kShuffles[0])));
-  vst1_u8((uint8_t*)(out +  4), vtbl4_u8(all_out, vld1_u8(kShuffles[1])));
-  vst1_u8((uint8_t*)(out +  8), vtbl4_u8(all_out, vld1_u8(kShuffles[2])));
-  vst1_u8((uint8_t*)(out + 12), vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
+  vst1_u8((uint8_t*)(out +  0), shuffles.val[0]);
+  vst1_u8((uint8_t*)(out +  4), shuffles.val[1]);
+  vst1_u8((uint8_t*)(out +  8), shuffles.val[2]);
+  vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
   // test zeros
   if (*(uint64_t*)(out +  0) != 0) return 1;
   if (*(uint64_t*)(out +  4) != 0) return 1;
@@ -1031,7 +1047,7 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
   return 0;
 }
 
-#endif   // !WORK_AROUND_GCC && !__aarch64__
+#endif   // !WORK_AROUND_GCC
 
 #endif   // WEBP_USE_NEON
 
@@ -1054,7 +1070,7 @@ void VP8EncDspInitNEON(void) {
   VP8SSE16x8 = SSE16x8;
   VP8SSE8x8 = SSE8x8;
   VP8SSE4x4 = SSE4x4;
-#if !(defined(WORK_AROUND_GCC) || defined(__aarch64__))
+#if !defined(WORK_AROUND_GCC)
   VP8EncQuantizeBlock = QuantizeBlock;
 #endif
 #endif   // WEBP_USE_NEON
diff --git a/src/dsp/lossless.c b/src/dsp/lossless.c
index 84e20784..a1bf3584 100644
--- a/src/dsp/lossless.c
+++ b/src/dsp/lossless.c
@@ -450,12 +450,21 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
   return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
 }
 
-static WEBP_INLINE int Sub3(int a, int b, int c) {
+// gcc-4.9 on ARM generates incorrect code in Select() when Sub3() is inlined.
+#if defined(__arm__) && LOCAL_GCC_VERSION == 0x409
+# define LOCAL_INLINE __attribute__ ((noinline))
+#else
+# define LOCAL_INLINE WEBP_INLINE
+#endif
+
+static LOCAL_INLINE int Sub3(int a, int b, int c) {
   const int pb = b - c;
   const int pa = a - c;
   return abs(pb) - abs(pa);
 }
 
+#undef LOCAL_INLINE
+
 static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
   const int pa_minus_pb =
       Sub3((a >> 24)       , (b >> 24)       , (c >> 24)       ) +
@@ -1169,7 +1178,7 @@ static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
       data += remaining_width;
     }
     ++y;
-    if ((y & mask) == 0) pred_row += tiles_per_row;;
+    if ((y & mask) == 0) pred_row += tiles_per_row;
   }
 }
 
diff --git a/src/dsp/lossless.h b/src/dsp/lossless.h
index 03dfe223..08be9375 100644
--- a/src/dsp/lossless.h
+++ b/src/dsp/lossless.h
@@ -56,24 +56,20 @@ extern VP8LConvertFunc VP8LConvertBGRAToRGB565;
 extern VP8LConvertFunc VP8LConvertBGRAToBGR;
 
 // Expose some C-only fallback functions
-extern void VP8LTransformColor_C(const VP8LMultipliers* const m,
+void VP8LTransformColor_C(const VP8LMultipliers* const m,
+                          uint32_t* data, int num_pixels);
+void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
                                  uint32_t* data, int num_pixels);
-extern void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
-                                        uint32_t* data, int num_pixels);
-
-extern void VP8LConvertBGRAToRGB_C(const uint32_t* src,
-                                   int num_pixels, uint8_t* dst);
-extern void VP8LConvertBGRAToRGBA_C(const uint32_t* src,
-                                    int num_pixels, uint8_t* dst);
-extern void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
-                                        int num_pixels, uint8_t* dst);
-extern void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
-                                      int num_pixels, uint8_t* dst);
-extern void VP8LConvertBGRAToBGR_C(const uint32_t* src,
-                                   int num_pixels, uint8_t* dst);
-extern void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data,
-                                              int num_pixels);
-extern void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels);
+
+void VP8LConvertBGRAToRGB_C(const uint32_t* src, int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToRGBA_C(const uint32_t* src, int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
+                                 int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
+                               int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToBGR_C(const uint32_t* src, int num_pixels, uint8_t* dst);
+void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels);
+void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels);
 
 // Must be called before calling any of the above methods.
 void VP8LDspInit(void);
author	Vikas Arora <vikasa@google.com>	2015-01-29 17:29:29 +0000
committer	Android (Google) Code Review <android-gerrit@google.com>	2015-01-29 17:29:30 +0000
commit	9dbaf404293aff5b35993a9d6c7a3b45aeba1c52 (patch)
tree	d7ca3ab2344e6eaae0c5af176fc998b317952c23 /src/dsp
parent	b01fe55d0ee2435cf881e68989b599563ae543e8 (diff)
parent	8c098653157979e397d3954fc2ea0ee43bae6ab2 (diff)
download	webp-9dbaf404293aff5b35993a9d6c7a3b45aeba1c52.tar.gz