diff options
author | Vikas Arora <vikasa@google.com> | 2015-01-29 17:29:29 +0000 |
---|---|---|
committer | Android (Google) Code Review <android-gerrit@google.com> | 2015-01-29 17:29:30 +0000 |
commit | 9dbaf404293aff5b35993a9d6c7a3b45aeba1c52 (patch) | |
tree | d7ca3ab2344e6eaae0c5af176fc998b317952c23 /src/dsp | |
parent | b01fe55d0ee2435cf881e68989b599563ae543e8 (diff) | |
parent | 8c098653157979e397d3954fc2ea0ee43bae6ab2 (diff) | |
download | webp-9dbaf404293aff5b35993a9d6c7a3b45aeba1c52.tar.gz |
Merge "Sync-patch with libwebp ver 0.4.2"
Diffstat (limited to 'src/dsp')
-rw-r--r-- | src/dsp/alpha_processing.c | 31 | ||||
-rw-r--r-- | src/dsp/alpha_processing_sse2.c | 77 | ||||
-rw-r--r-- | src/dsp/cpu.c | 2 | ||||
-rw-r--r-- | src/dsp/dsp.h | 20 | ||||
-rw-r--r-- | src/dsp/enc_neon.c | 50 | ||||
-rw-r--r-- | src/dsp/lossless.c | 13 | ||||
-rw-r--r-- | src/dsp/lossless.h | 30 |
7 files changed, 186 insertions, 37 deletions
diff --git a/src/dsp/alpha_processing.c b/src/dsp/alpha_processing.c index 09deacfb..d0f7a6cc 100644 --- a/src/dsp/alpha_processing.c +++ b/src/dsp/alpha_processing.c @@ -284,15 +284,46 @@ static void ApplyAlphaMultiply_16b(uint8_t* rgba4444, #endif } +static int ExtractAlpha(const uint8_t* argb, int argb_stride, + int width, int height, + uint8_t* alpha, int alpha_stride) { + uint8_t alpha_mask = 0xff; + int i, j; + + for (j = 0; j < height; ++j) { + for (i = 0; i < width; ++i) { + const uint8_t alpha_value = argb[4 * i]; + alpha[i] = alpha_value; + alpha_mask &= alpha_value; + } + argb += argb_stride; + alpha += alpha_stride; + } + return (alpha_mask == 0xff); +} + void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int); void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int); +int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int); //------------------------------------------------------------------------------ // Init function +extern void WebPInitAlphaProcessingSSE2(void); + void WebPInitAlphaProcessing(void) { WebPMultARGBRow = MultARGBRow; WebPMultRow = MultRow; WebPApplyAlphaMultiply = ApplyAlphaMultiply; WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b; + WebPExtractAlpha = ExtractAlpha; + + // If defined, use CPUInfo() to overwrite some pointers with faster versions. + if (VP8GetCPUInfo != NULL) { +#if defined(WEBP_USE_SSE2) + if (VP8GetCPUInfo(kSSE2)) { + WebPInitAlphaProcessingSSE2(); + } +#endif + } } diff --git a/src/dsp/alpha_processing_sse2.c b/src/dsp/alpha_processing_sse2.c new file mode 100644 index 00000000..3d0a9b57 --- /dev/null +++ b/src/dsp/alpha_processing_sse2.c @@ -0,0 +1,77 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// Utilities for processing transparent channel. +// +// Author: Skal (pascal.massimino@gmail.com) + +#include "./dsp.h" + +#if defined(WEBP_USE_SSE2) +#include <emmintrin.h> + +//------------------------------------------------------------------------------ + +static int ExtractAlpha(const uint8_t* argb, int argb_stride, + int width, int height, + uint8_t* alpha, int alpha_stride) { + // alpha_and stores an 'and' operation of all the alpha[] values. The final + // value is not 0xff if any of the alpha[] is not equal to 0xff. + uint32_t alpha_and = 0xff; + int i, j; + const __m128i a_mask = _mm_set1_epi32(0xffu); // to preserve alpha + const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u); + __m128i all_alphas = all_0xff; + + // We must be able to access 3 extra bytes after the last written byte + // 'src[4 * width - 4]', because we don't know if alpha is the first or the + // last byte of the quadruplet. + const int limit = (width - 1) & ~7; + + for (j = 0; j < height; ++j) { + const __m128i* src = (const __m128i*)argb; + for (i = 0; i < limit; i += 8) { + // load 32 argb bytes + const __m128i a0 = _mm_loadu_si128(src + 0); + const __m128i a1 = _mm_loadu_si128(src + 1); + const __m128i b0 = _mm_and_si128(a0, a_mask); + const __m128i b1 = _mm_and_si128(a1, a_mask); + const __m128i c0 = _mm_packs_epi32(b0, b1); + const __m128i d0 = _mm_packus_epi16(c0, c0); + // store + _mm_storel_epi64((__m128i*)&alpha[i], d0); + // accumulate eight alpha 'and' in parallel + all_alphas = _mm_and_si128(all_alphas, d0); + src += 2; + } + for (; i < width; ++i) { + const uint32_t alpha_value = argb[4 * i]; + alpha[i] = alpha_value; + alpha_and &= alpha_value; + } + argb += argb_stride; + alpha += alpha_stride; + } + // Combine the eight alpha 'and' into a 8-bit mask. + alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff)); + return (alpha_and == 0xff); +} + +#endif // WEBP_USE_SSE2 + +//------------------------------------------------------------------------------ +// Init function + +extern void WebPInitAlphaProcessingSSE2(void); + +void WebPInitAlphaProcessingSSE2(void) { +#if defined(WEBP_USE_SSE2) + WebPExtractAlpha = ExtractAlpha; +#endif +} diff --git a/src/dsp/cpu.c b/src/dsp/cpu.c index 581b5e30..70ba2ab0 100644 --- a/src/dsp/cpu.c +++ b/src/dsp/cpu.c @@ -57,7 +57,7 @@ static WEBP_INLINE uint64_t xgetbv(void) { } #elif defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219 // >= VS2010 SP1 #define xgetbv() _xgetbv(0) -#elif defined(_M_IX86) +#elif defined(_MSC_VER) && defined(_M_IX86) static WEBP_INLINE uint64_t xgetbv(void) { uint32_t eax_, edx_; __asm { diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h index 8208da53..3b31ae08 100644 --- a/src/dsp/dsp.h +++ b/src/dsp/dsp.h @@ -32,9 +32,19 @@ extern "C" { # define LOCAL_GCC_PREREQ(maj, min) \ (LOCAL_GCC_VERSION >= (((maj) << 8) | (min))) #else +# define LOCAL_GCC_VERSION 0 # define LOCAL_GCC_PREREQ(maj, min) 0 #endif +#ifdef __clang__ +# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__) +# define LOCAL_CLANG_PREREQ(maj, min) \ + (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min))) +#else +# define LOCAL_CLANG_VERSION 0 +# define LOCAL_CLANG_PREREQ(maj, min) 0 +#endif // __clang__ + #if defined(_MSC_VER) && _MSC_VER > 1310 && \ (defined(_M_X64) || defined(_M_IX86)) #define WEBP_MSC_SSE2 // Visual C++ SSE2 targets @@ -62,6 +72,9 @@ extern "C" { #if defined(__mips__) && !defined(__mips64) && (__mips_isa_rev < 6) #define WEBP_USE_MIPS32 +#if (__mips_isa_rev >= 2) +#define WEBP_USE_MIPS32_R2 +#endif #endif typedef enum { @@ -244,6 +257,13 @@ extern void (*WebPApplyAlphaMultiply)( extern void (*WebPApplyAlphaMultiply4444)( uint8_t* rgba4444, int w, int h, int stride); +// Extract the alpha values from 32b values in argb[] and pack them into alpha[] +// (this is the opposite of WebPDispatchAlpha). +// Returns true if there's only trivial 0xff alpha values. +extern int (*WebPExtractAlpha)(const uint8_t* argb, int argb_stride, + int width, int height, + uint8_t* alpha, int alpha_stride); + // Pre-Multiply operation transforms x into x * A / 255 (where x=Y,R,G or B). // Un-Multiply operation transforms x into x * 255 / A. diff --git a/src/dsp/enc_neon.c b/src/dsp/enc_neon.c index 1e712c52..42041f73 100644 --- a/src/dsp/enc_neon.c +++ b/src/dsp/enc_neon.c @@ -253,7 +253,7 @@ static void ITransform(const uint8_t* ref, // Load all 4x4 pixels into a single uint8x16_t variable. static uint8x16_t Load4x4(const uint8_t* src) { - uint32x4_t out = { 0, 0, 0, 0 }; + uint32x4_t out = vdupq_n_u32(0); out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0); out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1); out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2); @@ -929,7 +929,7 @@ static int SumToInt(uint32x4_t sum) { } static int SSE16x16(const uint8_t* a, const uint8_t* b) { - uint32x4_t sum = { 0, 0, 0, 0 }; + uint32x4_t sum = vdupq_n_u32(0); int y; for (y = 0; y < 16; ++y) { AccumulateSSE16(a + y * BPS, b + y * BPS, &sum); @@ -938,7 +938,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) { } static int SSE16x8(const uint8_t* a, const uint8_t* b) { - uint32x4_t sum = { 0, 0, 0, 0 }; + uint32x4_t sum = vdupq_n_u32(0); int y; for (y = 0; y < 8; ++y) { AccumulateSSE16(a + y * BPS, b + y * BPS, &sum); @@ -947,7 +947,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) { } static int SSE8x8(const uint8_t* a, const uint8_t* b) { - uint32x4_t sum = { 0, 0, 0, 0 }; + uint32x4_t sum = vdupq_n_u32(0); int y; for (y = 0; y < 8; ++y) { const uint8x8_t a0 = vld1_u8(a + y * BPS); @@ -970,9 +970,8 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) { //------------------------------------------------------------------------------ -// Compilation with gcc-4.6.x is problematic for now and vtbl? are unavailable -// in iOS/arm64 builds. Disable this function in those cases. -#if !(defined(WORK_AROUND_GCC) || defined(__aarch64__)) +// Compilation with gcc-4.6.x is problematic for now. +#if !defined(WORK_AROUND_GCC) static int16x8_t Quantize(int16_t* const in, const VP8Matrix* const mtx, int offset) { @@ -1002,27 +1001,44 @@ static int16x8_t Quantize(int16_t* const in, } static const uint8_t kShuffles[4][8] = { - { 0, 1, 2, 3, 8, 9, 16, 17 }, - { 10, 11, 4, 5, 6, 7, 12, 13 }, - { 18, 19, 24, 25, 26, 27, 20, 21 }, - { 14, 15, 22, 23, 28, 29, 30, 31 } + { 0, 1, 2, 3, 8, 9, 16, 17 }, + { 10, 11, 4, 5, 6, 7, 12, 13 }, + { 18, 19, 24, 25, 26, 27, 20, 21 }, + { 14, 15, 22, 23, 28, 29, 30, 31 } }; static int QuantizeBlock(int16_t in[16], int16_t out[16], const VP8Matrix* const mtx) { const int16x8_t out0 = Quantize(in, mtx, 0); const int16x8_t out1 = Quantize(in, mtx, 8); + uint8x8x4_t shuffles; + // vtbl4_u8 is marked unavailable for iOS arm64, use wider versions there. +#if defined(__APPLE__) && defined(__aarch64__) + uint8x16x2_t all_out; + INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1)); + INIT_VECTOR4(shuffles, + vtbl2q_u8(all_out, vld1_u8(kShuffles[0])), + vtbl2q_u8(all_out, vld1_u8(kShuffles[1])), + vtbl2q_u8(all_out, vld1_u8(kShuffles[2])), + vtbl2q_u8(all_out, vld1_u8(kShuffles[3]))); +#else uint8x8x4_t all_out; INIT_VECTOR4(all_out, vreinterpret_u8_s16(vget_low_s16(out0)), vreinterpret_u8_s16(vget_high_s16(out0)), vreinterpret_u8_s16(vget_low_s16(out1)), vreinterpret_u8_s16(vget_high_s16(out1))); + INIT_VECTOR4(shuffles, + vtbl4_u8(all_out, vld1_u8(kShuffles[0])), + vtbl4_u8(all_out, vld1_u8(kShuffles[1])), + vtbl4_u8(all_out, vld1_u8(kShuffles[2])), + vtbl4_u8(all_out, vld1_u8(kShuffles[3]))); +#endif // Zigzag reordering - vst1_u8((uint8_t*)(out + 0), vtbl4_u8(all_out, vld1_u8(kShuffles[0]))); - vst1_u8((uint8_t*)(out + 4), vtbl4_u8(all_out, vld1_u8(kShuffles[1]))); - vst1_u8((uint8_t*)(out + 8), vtbl4_u8(all_out, vld1_u8(kShuffles[2]))); - vst1_u8((uint8_t*)(out + 12), vtbl4_u8(all_out, vld1_u8(kShuffles[3]))); + vst1_u8((uint8_t*)(out + 0), shuffles.val[0]); + vst1_u8((uint8_t*)(out + 4), shuffles.val[1]); + vst1_u8((uint8_t*)(out + 8), shuffles.val[2]); + vst1_u8((uint8_t*)(out + 12), shuffles.val[3]); // test zeros if (*(uint64_t*)(out + 0) != 0) return 1; if (*(uint64_t*)(out + 4) != 0) return 1; @@ -1031,7 +1047,7 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16], return 0; } -#endif // !WORK_AROUND_GCC && !__aarch64__ +#endif // !WORK_AROUND_GCC #endif // WEBP_USE_NEON @@ -1054,7 +1070,7 @@ void VP8EncDspInitNEON(void) { VP8SSE16x8 = SSE16x8; VP8SSE8x8 = SSE8x8; VP8SSE4x4 = SSE4x4; -#if !(defined(WORK_AROUND_GCC) || defined(__aarch64__)) +#if !defined(WORK_AROUND_GCC) VP8EncQuantizeBlock = QuantizeBlock; #endif #endif // WEBP_USE_NEON diff --git a/src/dsp/lossless.c b/src/dsp/lossless.c index 84e20784..a1bf3584 100644 --- a/src/dsp/lossless.c +++ b/src/dsp/lossless.c @@ -450,12 +450,21 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1, return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b; } -static WEBP_INLINE int Sub3(int a, int b, int c) { +// gcc-4.9 on ARM generates incorrect code in Select() when Sub3() is inlined. +#if defined(__arm__) && LOCAL_GCC_VERSION == 0x409 +# define LOCAL_INLINE __attribute__ ((noinline)) +#else +# define LOCAL_INLINE WEBP_INLINE +#endif + +static LOCAL_INLINE int Sub3(int a, int b, int c) { const int pb = b - c; const int pa = a - c; return abs(pb) - abs(pa); } +#undef LOCAL_INLINE + static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) { const int pa_minus_pb = Sub3((a >> 24) , (b >> 24) , (c >> 24) ) + @@ -1169,7 +1178,7 @@ static void ColorSpaceInverseTransform(const VP8LTransform* const transform, data += remaining_width; } ++y; - if ((y & mask) == 0) pred_row += tiles_per_row;; + if ((y & mask) == 0) pred_row += tiles_per_row; } } diff --git a/src/dsp/lossless.h b/src/dsp/lossless.h index 03dfe223..08be9375 100644 --- a/src/dsp/lossless.h +++ b/src/dsp/lossless.h @@ -56,24 +56,20 @@ extern VP8LConvertFunc VP8LConvertBGRAToRGB565; extern VP8LConvertFunc VP8LConvertBGRAToBGR; // Expose some C-only fallback functions -extern void VP8LTransformColor_C(const VP8LMultipliers* const m, +void VP8LTransformColor_C(const VP8LMultipliers* const m, + uint32_t* data, int num_pixels); +void VP8LTransformColorInverse_C(const VP8LMultipliers* const m, uint32_t* data, int num_pixels); -extern void VP8LTransformColorInverse_C(const VP8LMultipliers* const m, - uint32_t* data, int num_pixels); - -extern void VP8LConvertBGRAToRGB_C(const uint32_t* src, - int num_pixels, uint8_t* dst); -extern void VP8LConvertBGRAToRGBA_C(const uint32_t* src, - int num_pixels, uint8_t* dst); -extern void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src, - int num_pixels, uint8_t* dst); -extern void VP8LConvertBGRAToRGB565_C(const uint32_t* src, - int num_pixels, uint8_t* dst); -extern void VP8LConvertBGRAToBGR_C(const uint32_t* src, - int num_pixels, uint8_t* dst); -extern void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, - int num_pixels); -extern void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels); + +void VP8LConvertBGRAToRGB_C(const uint32_t* src, int num_pixels, uint8_t* dst); +void VP8LConvertBGRAToRGBA_C(const uint32_t* src, int num_pixels, uint8_t* dst); +void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src, + int num_pixels, uint8_t* dst); +void VP8LConvertBGRAToRGB565_C(const uint32_t* src, + int num_pixels, uint8_t* dst); +void VP8LConvertBGRAToBGR_C(const uint32_t* src, int num_pixels, uint8_t* dst); +void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels); +void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels); // Must be called before calling any of the above methods. void VP8LDspInit(void); |