author     Vikas Arora <vikasa@google.com>                            2015-01-29 17:29:29 +0000
committer  Android (Google) Code Review <android-gerrit@google.com>  2015-01-29 17:29:30 +0000
commit     9dbaf404293aff5b35993a9d6c7a3b45aeba1c52 (patch)
tree       d7ca3ab2344e6eaae0c5af176fc998b317952c23
parent     b01fe55d0ee2435cf881e68989b599563ae543e8 (diff)
parent     8c098653157979e397d3954fc2ea0ee43bae6ab2 (diff)
download   webp-9dbaf404293aff5b35993a9d6c7a3b45aeba1c52.tar.gz
Merge "Sync-patch with libwebp ver 0.4.2"
-rw-r--r--  ChangeLog                         |    1
-rw-r--r--  NEWS                              |    1
-rw-r--r--  README                            |    2
-rw-r--r--  README.android                    |    2
-rw-r--r--  include/webp/decode.h             |   10
-rw-r--r--  include/webp/encode.h             |   18
-rw-r--r--  src/Android.mk                    |    2
-rw-r--r--  src/dec/frame.c                   |    2
-rw-r--r--  src/dec/idec.c                    |    6
-rw-r--r--  src/dec/vp8i.h                    |    2
-rw-r--r--  src/dec/vp8l.c                    |    9
-rw-r--r--  src/demux/demux.c                 |    2
-rw-r--r--  src/dsp/alpha_processing.c        |   31
-rw-r--r--  src/dsp/alpha_processing_sse2.c   |   77
-rw-r--r--  src/dsp/cpu.c                     |    2
-rw-r--r--  src/dsp/dsp.h                     |   20
-rw-r--r--  src/dsp/enc_neon.c                |   50
-rw-r--r--  src/dsp/lossless.c                |   13
-rw-r--r--  src/dsp/lossless.h                |   30
-rw-r--r--  src/enc/alpha.c                   |   71
-rw-r--r--  src/enc/analysis.c                |    6
-rw-r--r--  src/enc/config.c                  |    4
-rw-r--r--  src/enc/cost.h                    |    4
-rw-r--r--  src/enc/frame.c                   |    2
-rw-r--r--  src/enc/picture_csp.c             |  903
-rw-r--r--  src/enc/vp8enci.h                 |   10
-rw-r--r--  src/enc/webpenc.c                 |   28
-rw-r--r--  src/utils/bit_reader.c            |   57
-rw-r--r--  src/utils/bit_reader.h            |   17
-rw-r--r--  src/utils/bit_writer.c            |   10
-rw-r--r--  src/utils/endian_inl.h            |   28
-rw-r--r--  src/utils/quant_levels_dec.c      |    8
-rw-r--r--  src/utils/utils.c                 |    6
33 files changed, 1185 insertions, 249 deletions
diff --git a/ChangeLog b/ChangeLog
index 4f5955dd..7a71aa08 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -9,3 +9,4 @@
- 9/13: Fix memleak in WebPIDelete() (change#Id4faef1b)
- 1/14: Release version 0.4.0-rc1 (change#I22be12d8)
- 7/14: Release version 0.4.1-rc1 (change#I5346984d2)
+- 1/15: Release version 0.4.2 (change#I32a22786f)
diff --git a/NEWS b/NEWS
index e2acf8d2..1c437b7f 100644
--- a/NEWS
+++ b/NEWS
@@ -6,3 +6,4 @@
(#I737451d7f, #Ia300385a & #I9566a8e2).
- 1/14: release version 0.4.0-rc1 (change#I22be12d8).
- 7/14: release version 0.4.1-rc1 (change#I5346984d2).
+- 1/15: release version 0.4.2 (change#I32a22786f).
diff --git a/README b/README
index e4867e88..13f32ea1 100644
--- a/README
+++ b/README
@@ -4,7 +4,7 @@
\__\__/\____/\_____/__/ ____ ___
/ _/ / \ \ / _ \/ _/
/ \_/ / / \ \ __/ \__
- \____/____/\_____/_____/____/v0.4.1
+ \____/____/\_____/_____/____/v0.4.2
Description:
============
diff --git a/README.android b/README.android
index a68e0a0d..81e7ac7d 100644
--- a/README.android
+++ b/README.android
@@ -46,6 +46,8 @@ Local modifications:
- ~10% faster lossless decode
- ~5-10% faster lossless encode (-m 3/4)
- AArch64 (arm64) & MIPS support/optimizations.
+- Sync-patch with libwebp ver 0.4.2 (change#I32a22786f).
+ - Cherry-picked alpha-decoding bug fix.
The Android.mk file creates WebP Decoder and Encoder static libraries which
can be added to any application by adding to LOCAL_STATIC_LIBRARIES
diff --git a/include/webp/decode.h b/include/webp/decode.h
index 36c27c37..8d3f7be9 100644
--- a/include/webp/decode.h
+++ b/include/webp/decode.h
@@ -444,16 +444,20 @@ struct WebPDecoderOptions {
int dithering_strength; // dithering strength (0=Off, 100=full)
#if WEBP_DECODER_ABI_VERSION > 0x0203
int flip; // flip output vertically
+#endif
+#if WEBP_DECODER_ABI_VERSION > 0x0204
int alpha_dithering_strength; // alpha dithering strength in [0..100]
#endif
// Unused for now:
int force_rotation; // forced rotation (to be applied _last_)
int no_enhancement; // if true, discard enhancement layer
-#if WEBP_DECODER_ABI_VERSION > 0x0203
- uint32_t pad[3]; // padding for later use
-#else
+#if WEBP_DECODER_ABI_VERSION < 0x0203
uint32_t pad[5]; // padding for later use
+#elif WEBP_DECODER_ABI_VERSION < 0x0204
+ uint32_t pad[4]; // padding for later use
+#else
+ uint32_t pad[3]; // padding for later use
#endif
};
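For context, a minimal decoding sketch that exercises the new field, assuming the standard WebPDecoderConfig API declared in this same header (illustration only, not part of the patch):

    #include "webp/decode.h"

    // Decode to RGBA, enabling full-strength alpha dithering when the ABI has it.
    static int DecodeRGBAWithAlphaDither(const uint8_t* data, size_t data_size,
                                         WebPDecoderConfig* const config) {
      if (!WebPInitDecoderConfig(config)) return 0;    // also checks ABI version
    #if WEBP_DECODER_ABI_VERSION > 0x0204
      config->options.alpha_dithering_strength = 100;  // in [0..100]
    #endif
      config->output.colorspace = MODE_RGBA;
      return WebPDecode(data, data_size, config) == VP8_STATUS_OK;
    }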
diff --git a/include/webp/encode.h b/include/webp/encode.h
index dd600568..3c263748 100644
--- a/include/webp/encode.h
+++ b/include/webp/encode.h
@@ -231,14 +231,14 @@ struct WebPMemoryWriter {
// The following must be called first before any use.
WEBP_EXTERN(void) WebPMemoryWriterInit(WebPMemoryWriter* writer);
-#if WEBP_ENCODER_ABI_VERSION > 0x0202
+#if WEBP_ENCODER_ABI_VERSION > 0x0203
// The following must be called to deallocate writer->mem memory. The 'writer'
// object itself is not deallocated.
WEBP_EXTERN(void) WebPMemoryWriterClear(WebPMemoryWriter* writer);
#endif
// The custom writer to be used with WebPMemoryWriter as custom_ptr. Upon
// completion, writer.mem and writer.size will hold the coded data.
-#if WEBP_ENCODER_ABI_VERSION > 0x0202
+#if WEBP_ENCODER_ABI_VERSION > 0x0203
// writer.mem must be freed by calling WebPMemoryWriterClear.
#else
// writer.mem must be freed by calling 'free(writer.mem)'.
@@ -446,13 +446,14 @@ WEBP_EXTERN(int) WebPPictureImportBGRA(
WEBP_EXTERN(int) WebPPictureImportBGRX(
WebPPicture* picture, const uint8_t* bgrx, int bgrx_stride);
-// Converts picture->argb data to the YUVA format specified by 'colorspace'.
+// Converts picture->argb data to the YUV420A format. The 'colorspace'
+// parameter is deprecated and should be equal to WEBP_YUV420.
// Upon return, picture->use_argb is set to false. The presence of real
// non-opaque transparent values is detected, and 'colorspace' will be
// adjusted accordingly. Note that this method is lossy.
// Returns false in case of error.
WEBP_EXTERN(int) WebPPictureARGBToYUVA(WebPPicture* picture,
- WebPEncCSP colorspace);
+ WebPEncCSP /*colorspace = WEBP_YUV420*/);
// Same as WebPPictureARGBToYUVA(), but the conversion is done using
// pseudo-random dithering with a strength 'dithering' between
@@ -461,6 +462,15 @@ WEBP_EXTERN(int) WebPPictureARGBToYUVA(WebPPicture* picture,
WEBP_EXTERN(int) WebPPictureARGBToYUVADithered(
WebPPicture* picture, WebPEncCSP colorspace, float dithering);
+#if WEBP_ENCODER_ABI_VERSION > 0x0204
+// Performs 'smart' RGBA->YUVA420 downsampling and colorspace conversion.
+// Downsampling is handled with extra care in case of color clipping. This
+// method is roughly 2x slower than WebPPictureARGBToYUVA() but produces better
+// YUV representation.
+// Returns false in case of error.
+WEBP_EXTERN(int) WebPPictureSmartARGBToYUVA(WebPPicture* picture);
+#endif
+
// Converts picture->yuv to picture->argb and sets picture->use_argb to true.
// The input format must be YUV_420 or YUV_420A.
// Note that the use of this method is discouraged if one has access to the
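For reference, a minimal call-site sketch for the new entry point (illustration only; the fallback path uses the pre-existing conversion with its now-deprecated 'colorspace' argument):

    #include "webp/encode.h"

    // Convert picture->argb to YUV420(A), preferring the new 'smart'
    // clipping-aware downsampler when the ABI provides it.
    static int ToYUVA(WebPPicture* const pic) {
    #if WEBP_ENCODER_ABI_VERSION > 0x0204
      return WebPPictureSmartARGBToYUVA(pic);   // ~2x slower, better chroma
    #else
      return WebPPictureARGBToYUVA(pic, WEBP_YUV420);
    #endif
    }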
diff --git a/src/Android.mk b/src/Android.mk
index 90303cae..027a17b4 100644
--- a/src/Android.mk
+++ b/src/Android.mk
@@ -38,6 +38,7 @@ LOCAL_SRC_FILES := \
enc/vp8l.c \
enc/webpenc.c \
dsp/alpha_processing.c \
+ dsp/alpha_processing_sse2.c \
dsp/cpu.c \
dsp/cpu-features.c \
dsp/enc.c \
@@ -89,6 +90,7 @@ LOCAL_SRC_FILES := \
dec/vp8l.c \
dec/webp.c \
dsp/alpha_processing.c \
+ dsp/alpha_processing_sse2.c \
dsp/cpu.c \
dsp/cpu-features.c \
dsp/dec.c \
diff --git a/src/dec/frame.c b/src/dec/frame.c
index f7a0d1d8..2359acc5 100644
--- a/src/dec/frame.c
+++ b/src/dec/frame.c
@@ -177,7 +177,7 @@ void VP8InitDithering(const WebPDecoderOptions* const options,
dec->dither_ = 1;
}
}
-#if WEBP_DECODER_ABI_VERSION > 0x0203
+#if WEBP_DECODER_ABI_VERSION > 0x0204
// potentially allow alpha dithering
dec->alpha_dithering_ = options->alpha_dithering_strength;
if (dec->alpha_dithering_ > 100) {
diff --git a/src/dec/idec.c b/src/dec/idec.c
index 7bab1eab..5d8bb0c2 100644
--- a/src/dec/idec.c
+++ b/src/dec/idec.c
@@ -529,6 +529,12 @@ static VP8StatusCode DecodeVP8LData(WebPIDecoder* const idec) {
}
if (!VP8LDecodeImage(dec)) {
+ // The decoding is called after all the data-bytes are aggregated. Change
+ // the error to VP8_STATUS_BITSTREAM_ERROR in case the lossless decoder
+ // fails to decode all the pixels (VP8_STATUS_SUSPENDED).
+ if (dec->status_ == VP8_STATUS_SUSPENDED) {
+ dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
+ }
return ErrorStatusLossless(idec, dec->status_);
}
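The rationale, sketched against the public incremental API (WebPIAppend() is the real entry point; the surrounding loop and helper names are hypothetical):

    VP8StatusCode status = VP8_STATUS_SUSPENDED;
    while (status == VP8_STATUS_SUSPENDED && HaveMoreInput()) {
      // While input is still arriving, SUSPENDED just means "feed more bytes".
      status = WebPIAppend(idec, NextChunk(), NextChunkSize());
    }
    // Once every byte has been appended, a lossless stream that still cannot
    // produce all its pixels is truncated or corrupt, hence the remap to
    // VP8_STATUS_BITSTREAM_ERROR above.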
diff --git a/src/dec/vp8i.h b/src/dec/vp8i.h
index 7cc1840f..29701be7 100644
--- a/src/dec/vp8i.h
+++ b/src/dec/vp8i.h
@@ -31,7 +31,7 @@ extern "C" {
// version numbers
#define DEC_MAJ_VERSION 0
#define DEC_MIN_VERSION 4
-#define DEC_REV_VERSION 1
+#define DEC_REV_VERSION 2
// intra prediction modes
enum { B_DC_PRED = 0, // 4x4 modes
diff --git a/src/dec/vp8l.c b/src/dec/vp8l.c
index 81cf99fc..e2780e5b 100644
--- a/src/dec/vp8l.c
+++ b/src/dec/vp8l.c
@@ -234,6 +234,7 @@ static int ReadHuffmanCodeLengths(
End:
VP8LHuffmanTreeFree(&tree);
+ if (!ok) dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
return ok;
}
@@ -801,6 +802,7 @@ static int DecodeAlphaData(VP8LDecoder* const dec, uint8_t* const data,
ok = 0;
goto End;
}
+ assert(br->eos_ == VP8LIsEndOfStream(br));
ok = !br->error_;
if (!ok) goto End;
}
@@ -898,7 +900,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
process_func(dec, row);
}
}
- if (src < src_last) {
+ if (src < src_end) {
if (col & mask) htree_group = GetHtreeGroupForPos(hdr, col, row);
if (color_cache != NULL) {
while (last_cached < src) {
@@ -918,6 +920,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
ok = 0;
goto End;
}
+ assert(br->eos_ == VP8LIsEndOfStream(br));
ok = !br->error_;
if (!ok) goto End;
}
@@ -1354,6 +1357,10 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {
// Sanity checks.
if (dec == NULL) return 0;
+ dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
+ assert(dec->hdr_.htree_groups_ != NULL);
+ assert(dec->hdr_.num_htree_groups_ > 0);
+
io = dec->io_;
assert(io != NULL);
params = (WebPDecParams*)io->opaque;
diff --git a/src/demux/demux.c b/src/demux/demux.c
index 870c47b3..5aabdd25 100644
--- a/src/demux/demux.c
+++ b/src/demux/demux.c
@@ -25,7 +25,7 @@
#define DMUX_MAJ_VERSION 0
#define DMUX_MIN_VERSION 2
-#define DMUX_REV_VERSION 1
+#define DMUX_REV_VERSION 2
typedef struct {
size_t start_; // start location of the data
diff --git a/src/dsp/alpha_processing.c b/src/dsp/alpha_processing.c
index 09deacfb..d0f7a6cc 100644
--- a/src/dsp/alpha_processing.c
+++ b/src/dsp/alpha_processing.c
@@ -284,15 +284,46 @@ static void ApplyAlphaMultiply_16b(uint8_t* rgba4444,
#endif
}
+static int ExtractAlpha(const uint8_t* argb, int argb_stride,
+ int width, int height,
+ uint8_t* alpha, int alpha_stride) {
+ uint8_t alpha_mask = 0xff;
+ int i, j;
+
+ for (j = 0; j < height; ++j) {
+ for (i = 0; i < width; ++i) {
+ const uint8_t alpha_value = argb[4 * i];
+ alpha[i] = alpha_value;
+ alpha_mask &= alpha_value;
+ }
+ argb += argb_stride;
+ alpha += alpha_stride;
+ }
+ return (alpha_mask == 0xff);
+}
+
void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
+int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
//------------------------------------------------------------------------------
// Init function
+extern void WebPInitAlphaProcessingSSE2(void);
+
void WebPInitAlphaProcessing(void) {
WebPMultARGBRow = MultARGBRow;
WebPMultRow = MultRow;
WebPApplyAlphaMultiply = ApplyAlphaMultiply;
WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b;
+ WebPExtractAlpha = ExtractAlpha;
+
+ // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+ if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+ if (VP8GetCPUInfo(kSSE2)) {
+ WebPInitAlphaProcessingSSE2();
+ }
+#endif
+ }
}
diff --git a/src/dsp/alpha_processing_sse2.c b/src/dsp/alpha_processing_sse2.c
new file mode 100644
index 00000000..3d0a9b57
--- /dev/null
+++ b/src/dsp/alpha_processing_sse2.c
@@ -0,0 +1,77 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Utilities for processing transparent channel.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+#include <emmintrin.h>
+
+//------------------------------------------------------------------------------
+
+static int ExtractAlpha(const uint8_t* argb, int argb_stride,
+ int width, int height,
+ uint8_t* alpha, int alpha_stride) {
+ // alpha_and stores an 'and' operation of all the alpha[] values. The final
+ // value is not 0xff if any of the alpha[] is not equal to 0xff.
+ uint32_t alpha_and = 0xff;
+ int i, j;
+ const __m128i a_mask = _mm_set1_epi32(0xffu); // to preserve alpha
+ const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
+ __m128i all_alphas = all_0xff;
+
+ // We must be able to access 3 extra bytes after the last written byte
+ // 'src[4 * width - 4]', because we don't know if alpha is the first or the
+ // last byte of the quadruplet.
+ const int limit = (width - 1) & ~7;
+
+ for (j = 0; j < height; ++j) {
+ const __m128i* src = (const __m128i*)argb;
+ for (i = 0; i < limit; i += 8) {
+ // load 32 argb bytes
+ const __m128i a0 = _mm_loadu_si128(src + 0);
+ const __m128i a1 = _mm_loadu_si128(src + 1);
+ const __m128i b0 = _mm_and_si128(a0, a_mask);
+ const __m128i b1 = _mm_and_si128(a1, a_mask);
+ const __m128i c0 = _mm_packs_epi32(b0, b1);
+ const __m128i d0 = _mm_packus_epi16(c0, c0);
+ // store
+ _mm_storel_epi64((__m128i*)&alpha[i], d0);
+ // accumulate eight alpha 'and' in parallel
+ all_alphas = _mm_and_si128(all_alphas, d0);
+ src += 2;
+ }
+ for (; i < width; ++i) {
+ const uint32_t alpha_value = argb[4 * i];
+ alpha[i] = alpha_value;
+ alpha_and &= alpha_value;
+ }
+ argb += argb_stride;
+ alpha += alpha_stride;
+ }
+ // Combine the eight alpha 'and' into an 8-bit mask.
+ alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
+ return (alpha_and == 0xff);
+}
+
+#endif // WEBP_USE_SSE2
+
+//------------------------------------------------------------------------------
+// Init function
+
+extern void WebPInitAlphaProcessingSSE2(void);
+
+void WebPInitAlphaProcessingSSE2(void) {
+#if defined(WEBP_USE_SSE2)
+ WebPExtractAlpha = ExtractAlpha;
+#endif
+}
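A scalar restatement of the final mask step in ExtractAlpha() above (illustration only): _mm_cmpeq_epi8() yields 0xff for each byte equal to its counterpart in all_0xff, and _mm_movemask_epi8() gathers each byte's top bit, so the low 8 bits of the mask record which accumulated alpha bytes equal 0xff:

    // Scalar equivalent of the cmpeq/movemask combination for 8 alpha bytes.
    static int OpacityBits(const uint8_t all_alphas[8]) {
      int mask = 0, k;
      for (k = 0; k < 8; ++k) {
        if (all_alphas[k] == 0xff) mask |= 1 << k;
      }
      return mask;   // low 8 bits all set iff every accumulated byte is opaque
    }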
diff --git a/src/dsp/cpu.c b/src/dsp/cpu.c
index 581b5e30..70ba2ab0 100644
--- a/src/dsp/cpu.c
+++ b/src/dsp/cpu.c
@@ -57,7 +57,7 @@ static WEBP_INLINE uint64_t xgetbv(void) {
}
#elif defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219 // >= VS2010 SP1
#define xgetbv() _xgetbv(0)
-#elif defined(_M_IX86)
+#elif defined(_MSC_VER) && defined(_M_IX86)
static WEBP_INLINE uint64_t xgetbv(void) {
uint32_t eax_, edx_;
__asm {
diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h
index 8208da53..3b31ae08 100644
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@@ -32,9 +32,19 @@ extern "C" {
# define LOCAL_GCC_PREREQ(maj, min) \
(LOCAL_GCC_VERSION >= (((maj) << 8) | (min)))
#else
+# define LOCAL_GCC_VERSION 0
# define LOCAL_GCC_PREREQ(maj, min) 0
#endif
+#ifdef __clang__
+# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
+# define LOCAL_CLANG_PREREQ(maj, min) \
+ (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
+#else
+# define LOCAL_CLANG_VERSION 0
+# define LOCAL_CLANG_PREREQ(maj, min) 0
+#endif // __clang__
+
#if defined(_MSC_VER) && _MSC_VER > 1310 && \
(defined(_M_X64) || defined(_M_IX86))
#define WEBP_MSC_SSE2 // Visual C++ SSE2 targets
@@ -62,6 +72,9 @@ extern "C" {
#if defined(__mips__) && !defined(__mips64) && (__mips_isa_rev < 6)
#define WEBP_USE_MIPS32
+#if (__mips_isa_rev >= 2)
+#define WEBP_USE_MIPS32_R2
+#endif
#endif
typedef enum {
@@ -244,6 +257,13 @@ extern void (*WebPApplyAlphaMultiply)(
extern void (*WebPApplyAlphaMultiply4444)(
uint8_t* rgba4444, int w, int h, int stride);
+// Extract the alpha values from 32b values in argb[] and pack them into alpha[]
+// (this is the opposite of WebPDispatchAlpha).
+// Returns true if there's only trivial 0xff alpha values.
+extern int (*WebPExtractAlpha)(const uint8_t* argb, int argb_stride,
+ int width, int height,
+ uint8_t* alpha, int alpha_stride);
+
// Pre-Multiply operation transforms x into x * A / 255 (where x=Y,R,G or B).
// Un-Multiply operation transforms x into x * 255 / A.
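A minimal call-site sketch for the new hook (function and buffer names hypothetical, illustration only):

    // Extract the alpha plane of a width x height ARGB buffer; 'argb' must
    // point at the first alpha byte, since the hook reads every 4th byte.
    static int SplitAlpha(const uint8_t* argb, int width, int height,
                          uint8_t* alpha_plane) {
      WebPInitAlphaProcessing();   // installs the C version, then SSE2 if any
      return WebPExtractAlpha(argb, 4 * width, width, height,
                              alpha_plane, width);   // 1 iff fully opaque
    }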
diff --git a/src/dsp/enc_neon.c b/src/dsp/enc_neon.c
index 1e712c52..42041f73 100644
--- a/src/dsp/enc_neon.c
+++ b/src/dsp/enc_neon.c
@@ -253,7 +253,7 @@ static void ITransform(const uint8_t* ref,
// Load all 4x4 pixels into a single uint8x16_t variable.
static uint8x16_t Load4x4(const uint8_t* src) {
- uint32x4_t out = { 0, 0, 0, 0 };
+ uint32x4_t out = vdupq_n_u32(0);
out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
@@ -929,7 +929,7 @@ static int SumToInt(uint32x4_t sum) {
}
static int SSE16x16(const uint8_t* a, const uint8_t* b) {
- uint32x4_t sum = { 0, 0, 0, 0 };
+ uint32x4_t sum = vdupq_n_u32(0);
int y;
for (y = 0; y < 16; ++y) {
AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
@@ -938,7 +938,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) {
}
static int SSE16x8(const uint8_t* a, const uint8_t* b) {
- uint32x4_t sum = { 0, 0, 0, 0 };
+ uint32x4_t sum = vdupq_n_u32(0);
int y;
for (y = 0; y < 8; ++y) {
AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
@@ -947,7 +947,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) {
}
static int SSE8x8(const uint8_t* a, const uint8_t* b) {
- uint32x4_t sum = { 0, 0, 0, 0 };
+ uint32x4_t sum = vdupq_n_u32(0);
int y;
for (y = 0; y < 8; ++y) {
const uint8x8_t a0 = vld1_u8(a + y * BPS);
@@ -970,9 +970,8 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
//------------------------------------------------------------------------------
-// Compilation with gcc-4.6.x is problematic for now and vtbl? are unavailable
-// in iOS/arm64 builds. Disable this function in those cases.
-#if !(defined(WORK_AROUND_GCC) || defined(__aarch64__))
+// Compilation with gcc-4.6.x is problematic for now.
+#if !defined(WORK_AROUND_GCC)
static int16x8_t Quantize(int16_t* const in,
const VP8Matrix* const mtx, int offset) {
@@ -1002,27 +1001,44 @@ static int16x8_t Quantize(int16_t* const in,
}
static const uint8_t kShuffles[4][8] = {
- { 0, 1, 2, 3, 8, 9, 16, 17 },
- { 10, 11, 4, 5, 6, 7, 12, 13 },
- { 18, 19, 24, 25, 26, 27, 20, 21 },
- { 14, 15, 22, 23, 28, 29, 30, 31 }
+ { 0, 1, 2, 3, 8, 9, 16, 17 },
+ { 10, 11, 4, 5, 6, 7, 12, 13 },
+ { 18, 19, 24, 25, 26, 27, 20, 21 },
+ { 14, 15, 22, 23, 28, 29, 30, 31 }
};
static int QuantizeBlock(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
const int16x8_t out0 = Quantize(in, mtx, 0);
const int16x8_t out1 = Quantize(in, mtx, 8);
+ uint8x8x4_t shuffles;
+ // vtbl4_u8 is marked unavailable for iOS arm64, use wider versions there.
+#if defined(__APPLE__) && defined(__aarch64__)
+ uint8x16x2_t all_out;
+ INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
+ INIT_VECTOR4(shuffles,
+ vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),
+ vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),
+ vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),
+ vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));
+#else
uint8x8x4_t all_out;
INIT_VECTOR4(all_out,
vreinterpret_u8_s16(vget_low_s16(out0)),
vreinterpret_u8_s16(vget_high_s16(out0)),
vreinterpret_u8_s16(vget_low_s16(out1)),
vreinterpret_u8_s16(vget_high_s16(out1)));
+ INIT_VECTOR4(shuffles,
+ vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
+ vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
+ vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
+ vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
+#endif
// Zigzag reordering
- vst1_u8((uint8_t*)(out + 0), vtbl4_u8(all_out, vld1_u8(kShuffles[0])));
- vst1_u8((uint8_t*)(out + 4), vtbl4_u8(all_out, vld1_u8(kShuffles[1])));
- vst1_u8((uint8_t*)(out + 8), vtbl4_u8(all_out, vld1_u8(kShuffles[2])));
- vst1_u8((uint8_t*)(out + 12), vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
+ vst1_u8((uint8_t*)(out + 0), shuffles.val[0]);
+ vst1_u8((uint8_t*)(out + 4), shuffles.val[1]);
+ vst1_u8((uint8_t*)(out + 8), shuffles.val[2]);
+ vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
// test zeros
if (*(uint64_t*)(out + 0) != 0) return 1;
if (*(uint64_t*)(out + 4) != 0) return 1;
@@ -1031,7 +1047,7 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
return 0;
}
-#endif // !WORK_AROUND_GCC && !__aarch64__
+#endif // !WORK_AROUND_GCC
#endif // WEBP_USE_NEON
@@ -1054,7 +1070,7 @@ void VP8EncDspInitNEON(void) {
VP8SSE16x8 = SSE16x8;
VP8SSE8x8 = SSE8x8;
VP8SSE4x4 = SSE4x4;
-#if !(defined(WORK_AROUND_GCC) || defined(__aarch64__))
+#if !defined(WORK_AROUND_GCC)
VP8EncQuantizeBlock = QuantizeBlock;
#endif
#endif // WEBP_USE_NEON
diff --git a/src/dsp/lossless.c b/src/dsp/lossless.c
index 84e20784..a1bf3584 100644
--- a/src/dsp/lossless.c
+++ b/src/dsp/lossless.c
@@ -450,12 +450,21 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
}
-static WEBP_INLINE int Sub3(int a, int b, int c) {
+// gcc-4.9 on ARM generates incorrect code in Select() when Sub3() is inlined.
+#if defined(__arm__) && LOCAL_GCC_VERSION == 0x409
+# define LOCAL_INLINE __attribute__ ((noinline))
+#else
+# define LOCAL_INLINE WEBP_INLINE
+#endif
+
+static LOCAL_INLINE int Sub3(int a, int b, int c) {
const int pb = b - c;
const int pa = a - c;
return abs(pb) - abs(pa);
}
+#undef LOCAL_INLINE
+
static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
const int pa_minus_pb =
Sub3((a >> 24) , (b >> 24) , (c >> 24) ) +
@@ -1169,7 +1178,7 @@ static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
data += remaining_width;
}
++y;
- if ((y & mask) == 0) pred_row += tiles_per_row;;
+ if ((y & mask) == 0) pred_row += tiles_per_row;
}
}
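For reference, the Select() predictor that this workaround protects picks between its first two arguments based on per-channel closeness to the third:

$$
\mathrm{Select}(a, b, c) =
\begin{cases}
a & \text{if } \sum_{ch} \lvert b_{ch} - c_{ch} \rvert \le \sum_{ch} \lvert a_{ch} - c_{ch} \rvert \\
b & \text{otherwise,}
\end{cases}
$$

with $ch$ ranging over the four 8-bit channels. A miscompiled, inlined Sub3() flips this comparison, which is why the gcc-4.9/ARM build forces it out of line.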
diff --git a/src/dsp/lossless.h b/src/dsp/lossless.h
index 03dfe223..08be9375 100644
--- a/src/dsp/lossless.h
+++ b/src/dsp/lossless.h
@@ -56,24 +56,20 @@ extern VP8LConvertFunc VP8LConvertBGRAToRGB565;
extern VP8LConvertFunc VP8LConvertBGRAToBGR;
// Expose some C-only fallback functions
-extern void VP8LTransformColor_C(const VP8LMultipliers* const m,
+void VP8LTransformColor_C(const VP8LMultipliers* const m,
+ uint32_t* data, int num_pixels);
+void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
uint32_t* data, int num_pixels);
-extern void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
- uint32_t* data, int num_pixels);
-
-extern void VP8LConvertBGRAToRGB_C(const uint32_t* src,
- int num_pixels, uint8_t* dst);
-extern void VP8LConvertBGRAToRGBA_C(const uint32_t* src,
- int num_pixels, uint8_t* dst);
-extern void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
- int num_pixels, uint8_t* dst);
-extern void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
- int num_pixels, uint8_t* dst);
-extern void VP8LConvertBGRAToBGR_C(const uint32_t* src,
- int num_pixels, uint8_t* dst);
-extern void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data,
- int num_pixels);
-extern void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels);
+
+void VP8LConvertBGRAToRGB_C(const uint32_t* src, int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToRGBA_C(const uint32_t* src, int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
+ int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
+ int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToBGR_C(const uint32_t* src, int num_pixels, uint8_t* dst);
+void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels);
+void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels);
// Must be called before calling any of the above methods.
void VP8LDspInit(void);
diff --git a/src/enc/alpha.c b/src/enc/alpha.c
index ae4bf8ab..d624baa3 100644
--- a/src/enc/alpha.c
+++ b/src/enc/alpha.c
@@ -47,12 +47,11 @@
static int EncodeLossless(const uint8_t* const data, int width, int height,
int effort_level, // in [0..6] range
- VP8BitWriter* const bw,
+ VP8LBitWriter* const bw,
WebPAuxStats* const stats) {
int ok = 0;
WebPConfig config;
WebPPicture picture;
- VP8LBitWriter tmp_bw;
WebPPictureInit(&picture);
picture.width = width;
@@ -84,16 +83,15 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
config.quality = 8.f * effort_level;
assert(config.quality >= 0 && config.quality <= 100.f);
- ok = VP8LBitWriterInit(&tmp_bw, (width * height) >> 3);
- ok = ok && (VP8LEncodeStream(&config, &picture, &tmp_bw) == VP8_ENC_OK);
+ ok = (VP8LEncodeStream(&config, &picture, bw) == VP8_ENC_OK);
WebPPictureFree(&picture);
- if (ok) {
- const uint8_t* const buffer = VP8LBitWriterFinish(&tmp_bw);
- const size_t buffer_size = VP8LBitWriterNumBytes(&tmp_bw);
- VP8BitWriterAppend(bw, buffer, buffer_size);
+ ok = ok && !bw->error_;
+ if (!ok) {
+ VP8LBitWriterDestroy(bw);
+ return 0;
}
- VP8LBitWriterDestroy(&tmp_bw);
- return ok && !bw->error_;
+ return 1;
+
}
// -----------------------------------------------------------------------------
@@ -115,8 +113,10 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
const uint8_t* alpha_src;
WebPFilterFunc filter_func;
uint8_t header;
- size_t expected_size;
const size_t data_size = width * height;
+ const uint8_t* output = NULL;
+ size_t output_size = 0;
+ VP8LBitWriter tmp_bw;
assert((uint64_t)data_size == (uint64_t)width * height); // as per spec
assert(filter >= 0 && filter < WEBP_FILTER_LAST);
@@ -125,15 +125,6 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
assert(sizeof(header) == ALPHA_HEADER_LEN);
// TODO(skal): have a common function and #define's to validate alpha params.
- expected_size =
- (method == ALPHA_NO_COMPRESSION) ? (ALPHA_HEADER_LEN + data_size)
- : (data_size >> 5);
- header = method | (filter << 2);
- if (reduce_levels) header |= ALPHA_PREPROCESSED_LEVELS << 4;
-
- VP8BitWriterInit(&result->bw, expected_size);
- VP8BitWriterAppend(&result->bw, &header, ALPHA_HEADER_LEN);
-
filter_func = WebPFilters[filter];
if (filter_func != NULL) {
filter_func(data, width, height, width, tmp_alpha);
@@ -142,14 +133,42 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
alpha_src = data;
}
+ if (method != ALPHA_NO_COMPRESSION) {
+ ok = VP8LBitWriterInit(&tmp_bw, data_size >> 3);
+ ok = ok && EncodeLossless(alpha_src, width, height, effort_level,
+ &tmp_bw, &result->stats);
+ if (ok) {
+ output = VP8LBitWriterFinish(&tmp_bw);
+ output_size = VP8LBitWriterNumBytes(&tmp_bw);
+ if (output_size > data_size) {
+ // compressed size is larger than source! Revert to uncompressed mode.
+ method = ALPHA_NO_COMPRESSION;
+ VP8LBitWriterDestroy(&tmp_bw);
+ }
+ } else {
+ VP8LBitWriterDestroy(&tmp_bw);
+ return 0;
+ }
+ }
+
if (method == ALPHA_NO_COMPRESSION) {
- ok = VP8BitWriterAppend(&result->bw, alpha_src, width * height);
- ok = ok && !result->bw.error_;
- } else {
- ok = EncodeLossless(alpha_src, width, height, effort_level,
- &result->bw, &result->stats);
- VP8BitWriterFinish(&result->bw);
+ output = alpha_src;
+ output_size = data_size;
+ ok = 1;
+ }
+
+ // Emit final result.
+ header = method | (filter << 2);
+ if (reduce_levels) header |= ALPHA_PREPROCESSED_LEVELS << 4;
+
+ VP8BitWriterInit(&result->bw, ALPHA_HEADER_LEN + output_size);
+ ok = ok && VP8BitWriterAppend(&result->bw, &header, ALPHA_HEADER_LEN);
+ ok = ok && VP8BitWriterAppend(&result->bw, output, output_size);
+
+ if (method != ALPHA_NO_COMPRESSION) {
+ VP8LBitWriterDestroy(&tmp_bw);
}
+ ok = ok && !result->bw.error_;
result->score = VP8BitWriterSize(&result->bw);
return ok;
}
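The one-byte header assembled above packs the coding decisions as follows (sketch consistent with the code; ALPHA_HEADER_LEN is 1, and method 1 means lossless per the WebP container spec):

    // bits 0..1: compression method (0 = none, 1 = lossless)
    // bits 2..3: pre-filter (WEBP_FILTER_*)
    // bits 4..5: pre-processing (ALPHA_PREPROCESSED_LEVELS when reduce_levels)
    static uint8_t MakeAlphaHeader(int method, int filter, int reduce_levels) {
      uint8_t header = (uint8_t)(method | (filter << 2));
      if (reduce_levels) header |= ALPHA_PREPROCESSED_LEVELS << 4;
      return header;
    }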
diff --git a/src/enc/analysis.c b/src/enc/analysis.c
index 934d0912..e019465b 100644
--- a/src/enc/analysis.c
+++ b/src/enc/analysis.c
@@ -141,7 +141,11 @@ static void MergeHistograms(const VP8Histogram* const in,
static void AssignSegments(VP8Encoder* const enc,
const int alphas[MAX_ALPHA + 1]) {
- const int nb = enc->segment_hdr_.num_segments_;
+ // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
+ // explicit check is needed to avoid spurious warning about 'n + 1' exceeding
+ // array bounds of 'centers' with some compilers (noticed with gcc-4.9).
+ const int nb = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS) ?
+ enc->segment_hdr_.num_segments_ : NUM_MB_SEGMENTS;
int centers[NUM_MB_SEGMENTS];
int weighted_average = 0;
int map[MAX_ALPHA + 1];
diff --git a/src/enc/config.c b/src/enc/config.c
index 8a2eef08..58c03e48 100644
--- a/src/enc/config.c
+++ b/src/enc/config.c
@@ -111,7 +111,11 @@ int WebPValidateConfig(const WebPConfig* config) {
return 0;
if (config->show_compressed < 0 || config->show_compressed > 1)
return 0;
+#if WEBP_ENCODER_ABI_VERSION > 0x0204
+ if (config->preprocessing < 0 || config->preprocessing > 7)
+#else
if (config->preprocessing < 0 || config->preprocessing > 3)
+#endif
return 0;
if (config->partitions < 0 || config->partitions > 3)
return 0;
diff --git a/src/enc/cost.h b/src/enc/cost.h
index 5d107569..4e558952 100644
--- a/src/enc/cost.h
+++ b/src/enc/cost.h
@@ -42,7 +42,7 @@ typedef void (*VP8SetResidualCoeffsFunc)(const int16_t* const coeffs,
VP8Residual* const res);
extern VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
-extern void VP8SetResidualCoeffsInit(void); // must be called first
+void VP8SetResidualCoeffsInit(void); // must be called first
int VP8RecordCoeffs(int ctx, const VP8Residual* const res);
@@ -59,7 +59,7 @@ static WEBP_INLINE int VP8BitCost(int bit, uint8_t proba) {
typedef int (*VP8GetResidualCostFunc)(int ctx0, const VP8Residual* const res);
extern VP8GetResidualCostFunc VP8GetResidualCost;
-extern void VP8GetResidualCostInit(void); // must be called first
+void VP8GetResidualCostInit(void); // must be called first
// Level cost calculations
extern const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2];
diff --git a/src/enc/frame.c b/src/enc/frame.c
index ff3cd653..6fd20bb7 100644
--- a/src/enc/frame.c
+++ b/src/enc/frame.c
@@ -508,7 +508,7 @@ static void StoreSideInfo(const VP8EncIterator* const it) {
}
case 7: *info = mb->alpha_; break;
default: *info = 0; break;
- };
+ }
}
#if SEGMENT_VISU // visualize segments and prediction modes
SetBlock(it->yuv_out_ + Y_OFF, mb->segment_ * 64, 16);
diff --git a/src/enc/picture_csp.c b/src/enc/picture_csp.c
index 7964f257..7875f625 100644
--- a/src/enc/picture_csp.c
+++ b/src/enc/picture_csp.c
@@ -17,11 +17,15 @@
#include "./vp8enci.h"
#include "../utils/random.h"
+#include "../utils/utils.h"
#include "../dsp/yuv.h"
// Uncomment to disable gamma-compression during RGB->U/V averaging
#define USE_GAMMA_COMPRESSION
+// If defined, use table to compute x / alpha.
+#define USE_INVERSE_ALPHA_TABLE
+
static const union {
uint32_t argb;
uint8_t bytes[4];
@@ -70,26 +74,12 @@ int WebPPictureHasTransparency(const WebPPicture* picture) {
}
//------------------------------------------------------------------------------
-// RGB -> YUV conversion
-
-static int RGBToY(int r, int g, int b, VP8Random* const rg) {
- return VP8RGBToY(r, g, b, VP8RandomBits(rg, YUV_FIX));
-}
-
-static int RGBToU(int r, int g, int b, VP8Random* const rg) {
- return VP8RGBToU(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
-}
-
-static int RGBToV(int r, int g, int b, VP8Random* const rg) {
- return VP8RGBToV(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
-}
-
-//------------------------------------------------------------------------------
+// Code for gamma correction
#if defined(USE_GAMMA_COMPRESSION)
// gamma-compensates loss of resolution during chroma subsampling
-#define kGamma 0.80
+#define kGamma 0.80 // for now we use a different gamma value than kGammaF
#define kGammaFix 12 // fixed-point precision for linear values
#define kGammaScale ((1 << kGammaFix) - 1)
#define kGammaTabFix 7 // fixed-point fractional bits precision
@@ -104,14 +94,14 @@ static int kGammaTablesOk = 0;
static void InitGammaTables(void) {
if (!kGammaTablesOk) {
int v;
- const double scale = 1. / kGammaScale;
+ const double scale = (double)(1 << kGammaTabFix) / kGammaScale;
+ const double norm = 1. / 255.;
for (v = 0; v <= 255; ++v) {
kGammaToLinearTab[v] =
- (uint16_t)(pow(v / 255., kGamma) * kGammaScale + .5);
+ (uint16_t)(pow(norm * v, kGamma) * kGammaScale + .5);
}
for (v = 0; v <= kGammaTabSize; ++v) {
- const double x = scale * (v << kGammaTabFix);
- kLinearToGammaTab[v] = (int)(pow(x, 1. / kGamma) * 255. + .5);
+ kLinearToGammaTab[v] = (int)(255. * pow(scale * v, 1. / kGamma) + .5);
}
kGammaTablesOk = 1;
}
@@ -121,16 +111,21 @@ static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) {
return kGammaToLinearTab[v];
}
-// Convert a linear value 'v' to YUV_FIX+2 fixed-point precision
-// U/V value, suitable for RGBToU/V calls.
-static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
- const int v = base_value << shift; // final uplifted value
+static WEBP_INLINE int Interpolate(int v) {
const int tab_pos = v >> (kGammaTabFix + 2); // integer part
const int x = v & ((kGammaTabScale << 2) - 1); // fractional part
const int v0 = kLinearToGammaTab[tab_pos];
const int v1 = kLinearToGammaTab[tab_pos + 1];
const int y = v1 * x + v0 * ((kGammaTabScale << 2) - x); // interpolate
- return (y + kGammaTabRounder) >> kGammaTabFix; // descale
+ assert(tab_pos + 1 < kGammaTabSize + 1);
+ return y;
+}
+
+// Convert a linear value 'v' to YUV_FIX+2 fixed-point precision
+// U/V value, suitable for RGBToU/V calls.
+static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
+ const int y = Interpolate(base_value << shift); // final uplifted value
+ return (y + kGammaTabRounder) >> kGammaTabFix; // descale
}
#else
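Restating the fixed-point scheme above (kGamma = 0.80, kGammaFix = 12, kGammaTabFix = 7, kGammaScale = 4095 as defined earlier; kGammaTabScale is assumed to be 1 << kGammaTabFix):

$$
T_{lin}[v] = \Big\lfloor \big(\tfrac{v}{255}\big)^{0.8} \cdot 4095 + \tfrac{1}{2} \Big\rfloor,
\qquad
T_{gam}[i] = \Big\lfloor 255 \cdot \big(\tfrac{2^{7} i}{4095}\big)^{1/0.8} + \tfrac{1}{2} \Big\rfloor,
$$

and Interpolate(), with $t = v \gg (\mathrm{kGammaTabFix} + 2)$ and fractional part $x$, computes $y = T_{gam}[t+1]\,x + T_{gam}[t]\,(4\,\mathrm{kGammaTabScale} - x)$. The blend weights sum to $2^{\mathrm{kGammaTabFix}+2}$, so after the final $\gg \mathrm{kGammaTabFix}$ the result keeps a factor of 4, i.e. the YUV_FIX+2 precision the comment mentions.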
@@ -144,28 +139,700 @@ static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
#endif // USE_GAMMA_COMPRESSION
//------------------------------------------------------------------------------
+// RGB -> YUV conversion
+
+static int RGBToY(int r, int g, int b, VP8Random* const rg) {
+ return (rg == NULL) ? VP8RGBToY(r, g, b, YUV_HALF)
+ : VP8RGBToY(r, g, b, VP8RandomBits(rg, YUV_FIX));
+}
+
+static int RGBToU(int r, int g, int b, VP8Random* const rg) {
+ return (rg == NULL) ? VP8RGBToU(r, g, b, YUV_HALF << 2)
+ : VP8RGBToU(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
+}
+
+static int RGBToV(int r, int g, int b, VP8Random* const rg) {
+ return (rg == NULL) ? VP8RGBToV(r, g, b, YUV_HALF << 2)
+ : VP8RGBToV(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
+}
+
+//------------------------------------------------------------------------------
+// Smart RGB->YUV conversion
+
+static const int kNumIterations = 6;
+static const int kMinDimensionIterativeConversion = 4;
+
+// We use a-priori a different precision for storing RGB and Y/W components
+// We could use YFIX=0 and only uint8_t for fixed_y_t, but it produces some
+// banding sometimes. Better use extra precision.
+// TODO(skal): cleanup once TFIX/YFIX values are fixed.
+
+typedef int16_t fixed_t; // signed type with extra TFIX precision for UV
+typedef uint16_t fixed_y_t; // unsigned type with extra YFIX precision for W
+#define TFIX 6 // fixed-point precision of RGB
+#define YFIX 2 // fixed point precision for Y/W
+
+#define THALF ((1 << TFIX) >> 1)
+#define MAX_Y_T ((256 << YFIX) - 1)
+#define TROUNDER (1 << (YUV_FIX + TFIX - 1))
+
+#if defined(USE_GAMMA_COMPRESSION)
+
+// float variant of gamma-correction
+// We use tables of different size and precision, along with a 'real-world'
+// Gamma value close to ~2.
+#define kGammaF 2.2
+static float kGammaToLinearTabF[MAX_Y_T + 1]; // size scales with Y_FIX
+static float kLinearToGammaTabF[kGammaTabSize + 2];
+static int kGammaTablesFOk = 0;
+
+static void InitGammaTablesF(void) {
+ if (!kGammaTablesFOk) {
+ int v;
+ const double norm = 1. / MAX_Y_T;
+ const double scale = 1. / kGammaTabSize;
+ for (v = 0; v <= MAX_Y_T; ++v) {
+ kGammaToLinearTabF[v] = (float)pow(norm * v, kGammaF);
+ }
+ for (v = 0; v <= kGammaTabSize; ++v) {
+ kLinearToGammaTabF[v] = (float)(MAX_Y_T * pow(scale * v, 1. / kGammaF));
+ }
+ // to prevent small rounding errors to cause read-overflow:
+ kLinearToGammaTabF[kGammaTabSize + 1] = kLinearToGammaTabF[kGammaTabSize];
+ kGammaTablesFOk = 1;
+ }
+}
+
+static WEBP_INLINE float GammaToLinearF(int v) {
+ return kGammaToLinearTabF[v];
+}
+
+static WEBP_INLINE float LinearToGammaF(float value) {
+ const float v = value * kGammaTabSize;
+ const int tab_pos = (int)v;
+ const float x = v - (float)tab_pos; // fractional part
+ const float v0 = kLinearToGammaTabF[tab_pos + 0];
+ const float v1 = kLinearToGammaTabF[tab_pos + 1];
+ const float y = v1 * x + v0 * (1.f - x); // interpolate
+ return y;
+}
+
+#else
+
+static void InitGammaTablesF(void) {}
+static WEBP_INLINE float GammaToLinearF(int v) {
+ const float norm = 1.f / MAX_Y_T;
+ return norm * v;
+}
+static WEBP_INLINE float LinearToGammaF(float value) {
+ return MAX_Y_T * value;
+}
+
+#endif // USE_GAMMA_COMPRESSION
+
+//------------------------------------------------------------------------------
+
+// precision: YFIX -> TFIX
+static WEBP_INLINE int FixedYToW(int v) {
+#if TFIX == YFIX
+ return v;
+#elif TFIX >= YFIX
+ return v << (TFIX - YFIX);
+#else
+ return v >> (YFIX - TFIX);
+#endif
+}
+
+static WEBP_INLINE int FixedWToY(int v) {
+#if TFIX == YFIX
+ return v;
+#elif YFIX >= TFIX
+ return v << (YFIX - TFIX);
+#else
+ return v >> (TFIX - YFIX);
+#endif
+}
+
+static uint8_t clip_8b(fixed_t v) {
+ return (!(v & ~0xff)) ? (uint8_t)v : (v < 0) ? 0u : 255u;
+}
+
+static fixed_y_t clip_y(int y) {
+ return (!(y & ~MAX_Y_T)) ? (fixed_y_t)y : (y < 0) ? 0 : MAX_Y_T;
+}
+
+// precision: TFIX -> YFIX
+static fixed_y_t clip_fixed_t(fixed_t v) {
+ const int y = FixedWToY(v);
+ const fixed_y_t w = clip_y(y);
+ return w;
+}
+
+//------------------------------------------------------------------------------
-#define SUM4(ptr) LinearToGamma( \
- GammaToLinear((ptr)[0]) + \
- GammaToLinear((ptr)[step]) + \
- GammaToLinear((ptr)[rgb_stride]) + \
- GammaToLinear((ptr)[rgb_stride + step]), 0) \
+static int RGBToGray(int r, int g, int b) {
+ const int luma = 19595 * r + 38470 * g + 7471 * b + YUV_HALF;
+ return (luma >> YUV_FIX);
+}
+
+static float RGBToGrayF(float r, float g, float b) {
+ return 0.299f * r + 0.587f * g + 0.114f * b;
+}
-#define SUM2H(ptr) \
- LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[step]), 1)
-#define SUM2V(ptr) \
+static float ScaleDown(int a, int b, int c, int d) {
+ const float A = GammaToLinearF(a);
+ const float B = GammaToLinearF(b);
+ const float C = GammaToLinearF(c);
+ const float D = GammaToLinearF(d);
+ return LinearToGammaF(0.25f * (A + B + C + D));
+}
+
+static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int len) {
+ while (len-- > 0) {
+ const float R = GammaToLinearF(src[0]);
+ const float G = GammaToLinearF(src[1]);
+ const float B = GammaToLinearF(src[2]);
+ const float Y = RGBToGrayF(R, G, B);
+ *dst++ = (fixed_y_t)(LinearToGammaF(Y) + .5);
+ src += 3;
+ }
+}
+
+static WEBP_INLINE void UpdateChroma(const fixed_y_t* src1,
+ const fixed_y_t* src2,
+ fixed_t* dst, fixed_y_t* tmp, int len) {
+ while (len-- > 0) {
+ const float r = ScaleDown(src1[0], src1[3], src2[0], src2[3]);
+ const float g = ScaleDown(src1[1], src1[4], src2[1], src2[4]);
+ const float b = ScaleDown(src1[2], src1[5], src2[2], src2[5]);
+ const float W = RGBToGrayF(r, g, b);
+ dst[0] = (fixed_t)FixedYToW((int)(r - W));
+ dst[1] = (fixed_t)FixedYToW((int)(g - W));
+ dst[2] = (fixed_t)FixedYToW((int)(b - W));
+ dst += 3;
+ src1 += 6;
+ src2 += 6;
+ if (tmp != NULL) {
+ tmp[0] = tmp[1] = clip_y((int)(W + .5));
+ tmp += 2;
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE int Filter(const fixed_t* const A, const fixed_t* const B,
+ int rightwise) {
+ int v;
+ if (!rightwise) {
+ v = (A[0] * 9 + A[-3] * 3 + B[0] * 3 + B[-3]);
+ } else {
+ v = (A[0] * 9 + A[+3] * 3 + B[0] * 3 + B[+3]);
+ }
+ return (v + 8) >> 4;
+}
+
+static WEBP_INLINE int Filter2(int A, int B) { return (A * 3 + B + 2) >> 2; }
+
+//------------------------------------------------------------------------------
+
+// 8bit -> YFIX
+static WEBP_INLINE fixed_y_t UpLift(uint8_t a) {
+ return ((fixed_y_t)a << YFIX) | (1 << (YFIX - 1));
+}
+
+static void ImportOneRow(const uint8_t* const r_ptr,
+ const uint8_t* const g_ptr,
+ const uint8_t* const b_ptr,
+ int step,
+ int pic_width,
+ fixed_y_t* const dst) {
+ int i;
+ for (i = 0; i < pic_width; ++i) {
+ const int off = i * step;
+ dst[3 * i + 0] = UpLift(r_ptr[off]);
+ dst[3 * i + 1] = UpLift(g_ptr[off]);
+ dst[3 * i + 2] = UpLift(b_ptr[off]);
+ }
+ if (pic_width & 1) { // replicate rightmost pixel
+ memcpy(dst + 3 * pic_width, dst + 3 * (pic_width - 1), 3 * sizeof(*dst));
+ }
+}
+
+static void InterpolateTwoRows(const fixed_y_t* const best_y,
+ const fixed_t* const prev_uv,
+ const fixed_t* const cur_uv,
+ const fixed_t* const next_uv,
+ int w,
+ fixed_y_t* const out1,
+ fixed_y_t* const out2) {
+ int i, k;
+ { // special boundary case for i==0
+ const int W0 = FixedYToW(best_y[0]);
+ const int W1 = FixedYToW(best_y[w]);
+ for (k = 0; k <= 2; ++k) {
+ out1[k] = clip_fixed_t(Filter2(cur_uv[k], prev_uv[k]) + W0);
+ out2[k] = clip_fixed_t(Filter2(cur_uv[k], next_uv[k]) + W1);
+ }
+ }
+ for (i = 1; i < w - 1; ++i) {
+ const int W0 = FixedYToW(best_y[i + 0]);
+ const int W1 = FixedYToW(best_y[i + w]);
+ const int off = 3 * (i >> 1);
+ for (k = 0; k <= 2; ++k) {
+ const int tmp0 = Filter(cur_uv + off + k, prev_uv + off + k, i & 1);
+ const int tmp1 = Filter(cur_uv + off + k, next_uv + off + k, i & 1);
+ out1[3 * i + k] = clip_fixed_t(tmp0 + W0);
+ out2[3 * i + k] = clip_fixed_t(tmp1 + W1);
+ }
+ }
+ { // special boundary case for i == w - 1
+ const int W0 = FixedYToW(best_y[i + 0]);
+ const int W1 = FixedYToW(best_y[i + w]);
+ const int off = 3 * (i >> 1);
+ for (k = 0; k <= 2; ++k) {
+ out1[3 * i + k] =
+ clip_fixed_t(Filter2(cur_uv[off + k], prev_uv[off + k]) + W0);
+ out2[3 * i + k] =
+ clip_fixed_t(Filter2(cur_uv[off + k], next_uv[off + k]) + W1);
+ }
+ }
+}
+
+static WEBP_INLINE uint8_t ConvertRGBToY(int r, int g, int b) {
+ const int luma = 16839 * r + 33059 * g + 6420 * b + TROUNDER;
+ return clip_8b(16 + (luma >> (YUV_FIX + TFIX)));
+}
+
+static WEBP_INLINE uint8_t ConvertRGBToU(int r, int g, int b) {
+ const int u = -9719 * r - 19081 * g + 28800 * b + TROUNDER;
+ return clip_8b(128 + (u >> (YUV_FIX + TFIX)));
+}
+
+static WEBP_INLINE uint8_t ConvertRGBToV(int r, int g, int b) {
+ const int v = +28800 * r - 24116 * g - 4684 * b + TROUNDER;
+ return clip_8b(128 + (v >> (YUV_FIX + TFIX)));
+}
+
+static int ConvertWRGBToYUV(const fixed_y_t* const best_y,
+ const fixed_t* const best_uv,
+ WebPPicture* const picture) {
+ int i, j;
+ const int w = (picture->width + 1) & ~1;
+ const int h = (picture->height + 1) & ~1;
+ const int uv_w = w >> 1;
+ const int uv_h = h >> 1;
+ for (j = 0; j < picture->height; ++j) {
+ for (i = 0; i < picture->width; ++i) {
+ const int off = 3 * ((i >> 1) + (j >> 1) * uv_w);
+ const int off2 = i + j * picture->y_stride;
+ const int W = FixedYToW(best_y[i + j * w]);
+ const int r = best_uv[off + 0] + W;
+ const int g = best_uv[off + 1] + W;
+ const int b = best_uv[off + 2] + W;
+ picture->y[off2] = ConvertRGBToY(r, g, b);
+ }
+ }
+ for (j = 0; j < uv_h; ++j) {
+ uint8_t* const dst_u = picture->u + j * picture->uv_stride;
+ uint8_t* const dst_v = picture->v + j * picture->uv_stride;
+ for (i = 0; i < uv_w; ++i) {
+ const int off = 3 * (i + j * uv_w);
+ const int r = best_uv[off + 0];
+ const int g = best_uv[off + 1];
+ const int b = best_uv[off + 2];
+ dst_u[i] = ConvertRGBToU(r, g, b);
+ dst_v[i] = ConvertRGBToV(r, g, b);
+ }
+ }
+ return 1;
+}
+
+//------------------------------------------------------------------------------
+// Main function
+
+#define SAFE_ALLOC(W, H, T) ((T*)WebPSafeMalloc((W) * (H), sizeof(T)))
+
+static int PreprocessARGB(const uint8_t* const r_ptr,
+ const uint8_t* const g_ptr,
+ const uint8_t* const b_ptr,
+ int step, int rgb_stride,
+ WebPPicture* const picture) {
+ // we expand the right/bottom border if needed
+ const int w = (picture->width + 1) & ~1;
+ const int h = (picture->height + 1) & ~1;
+ const int uv_w = w >> 1;
+ const int uv_h = h >> 1;
+ int i, j, iter;
+
+ // TODO(skal): allocate one big memory chunk. But for now, it's easier
+ // for valgrind debugging to have several chunks.
+ fixed_y_t* const tmp_buffer = SAFE_ALLOC(w * 3, 2, fixed_y_t); // scratch
+ fixed_y_t* const best_y = SAFE_ALLOC(w, h, fixed_y_t);
+ fixed_y_t* const target_y = SAFE_ALLOC(w, h, fixed_y_t);
+ fixed_y_t* const best_rgb_y = SAFE_ALLOC(w, 2, fixed_y_t);
+ fixed_t* const best_uv = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
+ fixed_t* const target_uv = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
+ fixed_t* const best_rgb_uv = SAFE_ALLOC(uv_w * 3, 1, fixed_t);
+ int ok;
+
+ if (best_y == NULL || best_uv == NULL ||
+ target_y == NULL || target_uv == NULL ||
+ best_rgb_y == NULL || best_rgb_uv == NULL ||
+ tmp_buffer == NULL) {
+ ok = WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
+ goto End;
+ }
+ assert(picture->width >= kMinDimensionIterativeConversion);
+ assert(picture->height >= kMinDimensionIterativeConversion);
+
+ // Import RGB samples to W/RGB representation.
+ for (j = 0; j < picture->height; j += 2) {
+ const int is_last_row = (j == picture->height - 1);
+ fixed_y_t* const src1 = tmp_buffer;
+ fixed_y_t* const src2 = tmp_buffer + 3 * w;
+ const int off1 = j * rgb_stride;
+ const int off2 = off1 + rgb_stride;
+ const int uv_off = (j >> 1) * 3 * uv_w;
+ fixed_y_t* const dst_y = best_y + j * w;
+
+ // prepare two rows of input
+ ImportOneRow(r_ptr + off1, g_ptr + off1, b_ptr + off1,
+ step, picture->width, src1);
+ if (!is_last_row) {
+ ImportOneRow(r_ptr + off2, g_ptr + off2, b_ptr + off2,
+ step, picture->width, src2);
+ } else {
+ memcpy(src2, src1, 3 * w * sizeof(*src2));
+ }
+ UpdateW(src1, target_y + (j + 0) * w, w);
+ UpdateW(src2, target_y + (j + 1) * w, w);
+ UpdateChroma(src1, src2, target_uv + uv_off, dst_y, uv_w);
+ memcpy(best_uv + uv_off, target_uv + uv_off, 3 * uv_w * sizeof(*best_uv));
+ memcpy(dst_y + w, dst_y, w * sizeof(*dst_y));
+ }
+
+ // Iterate and resolve clipping conflicts.
+ for (iter = 0; iter < kNumIterations; ++iter) {
+ int k;
+ const fixed_t* cur_uv = best_uv;
+ const fixed_t* prev_uv = best_uv;
+ for (j = 0; j < h; j += 2) {
+ fixed_y_t* const src1 = tmp_buffer;
+ fixed_y_t* const src2 = tmp_buffer + 3 * w;
+
+ {
+ const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0);
+ InterpolateTwoRows(best_y + j * w, prev_uv, cur_uv, next_uv,
+ w, src1, src2);
+ prev_uv = cur_uv;
+ cur_uv = next_uv;
+ }
+
+ UpdateW(src1, best_rgb_y + 0 * w, w);
+ UpdateW(src2, best_rgb_y + 1 * w, w);
+ UpdateChroma(src1, src2, best_rgb_uv, NULL, uv_w);
+
+ // update two rows of Y and one row of RGB
+ for (i = 0; i < 2 * w; ++i) {
+ const int off = i + j * w;
+ const int diff_y = target_y[off] - best_rgb_y[i];
+ const int new_y = (int)best_y[off] + diff_y;
+ best_y[off] = clip_y(new_y);
+ }
+ for (i = 0; i < uv_w; ++i) {
+ const int off = 3 * (i + (j >> 1) * uv_w);
+ int W;
+ for (k = 0; k <= 2; ++k) {
+ const int diff_uv = (int)target_uv[off + k] - best_rgb_uv[3 * i + k];
+ best_uv[off + k] += diff_uv;
+ }
+ W = RGBToGray(best_uv[off + 0], best_uv[off + 1], best_uv[off + 2]);
+ for (k = 0; k <= 2; ++k) {
+ best_uv[off + k] -= W;
+ }
+ }
+ }
+ // TODO(skal): add early-termination criterion
+ }
+
+ // final reconstruction
+ ok = ConvertWRGBToYUV(best_y, best_uv, picture);
+
+ End:
+ WebPSafeFree(best_y);
+ WebPSafeFree(best_uv);
+ WebPSafeFree(target_y);
+ WebPSafeFree(target_uv);
+ WebPSafeFree(best_rgb_y);
+ WebPSafeFree(best_rgb_uv);
+ WebPSafeFree(tmp_buffer);
+ return ok;
+}
+#undef SAFE_ALLOC
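The correction step inside the iteration loop above, written out (W is the gray level of the updated chroma triple, as computed by RGBToGray()):

$$
Y \leftarrow \mathrm{clip}\big(Y + (Y^{target} - Y^{recon})\big),
\qquad
C_k \leftarrow C_k + (C_k^{target} - C_k^{recon}) - W, \quad k \in \{R, G, B\},
$$

where the "recon" values are recomputed each pass from the current estimate via InterpolateTwoRows(), UpdateW() and UpdateChroma().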
+
+//------------------------------------------------------------------------------
+// "Fast" regular RGB->YUV
+
+#define SUM4(ptr, step) LinearToGamma( \
+ GammaToLinear((ptr)[0]) + \
+ GammaToLinear((ptr)[(step)]) + \
+ GammaToLinear((ptr)[rgb_stride]) + \
+ GammaToLinear((ptr)[rgb_stride + (step)]), 0) \
+
+#define SUM2(ptr) \
LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[rgb_stride]), 1)
-#define SUM1(ptr) \
- LinearToGamma(GammaToLinear((ptr)[0]), 2)
-#define RGB_TO_UV(x, y, SUM) { \
- const int src = (2 * (step * (x) + (y) * rgb_stride)); \
- const int dst = (x) + (y) * picture->uv_stride; \
- const int r = SUM(r_ptr + src); \
- const int g = SUM(g_ptr + src); \
- const int b = SUM(b_ptr + src); \
- picture->u[dst] = RGBToU(r, g, b, &rg); \
- picture->v[dst] = RGBToV(r, g, b, &rg); \
+#define SUM2ALPHA(ptr) ((ptr)[0] + (ptr)[rgb_stride])
+#define SUM4ALPHA(ptr) (SUM2ALPHA(ptr) + SUM2ALPHA((ptr) + 4))
+
+#if defined(USE_INVERSE_ALPHA_TABLE)
+
+static const int kAlphaFix = 19;
+// Following table is (1 << kAlphaFix) / a. The (v * kInvAlpha[a]) >> kAlphaFix
+// formula is then equal to v / a in most (99.6%) cases. Note that this table
+// and constant are adjusted very tightly to fit 32b arithmetic.
+// In particular, they use the fact that the operands for 'v / a' are actually
+// derived as v = (a0.p0 + a1.p1 + a2.p2 + a3.p3) and a = a0 + a1 + a2 + a3
+// with ai in [0..255] and pi in [0..1<<kGammaFix). The constraint to avoid
+// overflow is: kGammaFix + kAlphaFix <= 31.
+static const uint32_t kInvAlpha[4 * 0xff + 1] = {
+ 0, /* alpha = 0 */
+ 524288, 262144, 174762, 131072, 104857, 87381, 74898, 65536,
+ 58254, 52428, 47662, 43690, 40329, 37449, 34952, 32768,
+ 30840, 29127, 27594, 26214, 24966, 23831, 22795, 21845,
+ 20971, 20164, 19418, 18724, 18078, 17476, 16912, 16384,
+ 15887, 15420, 14979, 14563, 14169, 13797, 13443, 13107,
+ 12787, 12483, 12192, 11915, 11650, 11397, 11155, 10922,
+ 10699, 10485, 10280, 10082, 9892, 9709, 9532, 9362,
+ 9198, 9039, 8886, 8738, 8594, 8456, 8322, 8192,
+ 8065, 7943, 7825, 7710, 7598, 7489, 7384, 7281,
+ 7182, 7084, 6990, 6898, 6808, 6721, 6636, 6553,
+ 6472, 6393, 6316, 6241, 6168, 6096, 6026, 5957,
+ 5890, 5825, 5761, 5698, 5637, 5577, 5518, 5461,
+ 5405, 5349, 5295, 5242, 5190, 5140, 5090, 5041,
+ 4993, 4946, 4899, 4854, 4809, 4766, 4723, 4681,
+ 4639, 4599, 4559, 4519, 4481, 4443, 4405, 4369,
+ 4332, 4297, 4262, 4228, 4194, 4161, 4128, 4096,
+ 4064, 4032, 4002, 3971, 3942, 3912, 3883, 3855,
+ 3826, 3799, 3771, 3744, 3718, 3692, 3666, 3640,
+ 3615, 3591, 3566, 3542, 3518, 3495, 3472, 3449,
+ 3426, 3404, 3382, 3360, 3339, 3318, 3297, 3276,
+ 3256, 3236, 3216, 3196, 3177, 3158, 3139, 3120,
+ 3102, 3084, 3066, 3048, 3030, 3013, 2995, 2978,
+ 2962, 2945, 2928, 2912, 2896, 2880, 2864, 2849,
+ 2833, 2818, 2803, 2788, 2774, 2759, 2744, 2730,
+ 2716, 2702, 2688, 2674, 2661, 2647, 2634, 2621,
+ 2608, 2595, 2582, 2570, 2557, 2545, 2532, 2520,
+ 2508, 2496, 2484, 2473, 2461, 2449, 2438, 2427,
+ 2416, 2404, 2394, 2383, 2372, 2361, 2351, 2340,
+ 2330, 2319, 2309, 2299, 2289, 2279, 2269, 2259,
+ 2250, 2240, 2231, 2221, 2212, 2202, 2193, 2184,
+ 2175, 2166, 2157, 2148, 2139, 2131, 2122, 2114,
+ 2105, 2097, 2088, 2080, 2072, 2064, 2056, 2048,
+ 2040, 2032, 2024, 2016, 2008, 2001, 1993, 1985,
+ 1978, 1971, 1963, 1956, 1949, 1941, 1934, 1927,
+ 1920, 1913, 1906, 1899, 1892, 1885, 1879, 1872,
+ 1865, 1859, 1852, 1846, 1839, 1833, 1826, 1820,
+ 1814, 1807, 1801, 1795, 1789, 1783, 1777, 1771,
+ 1765, 1759, 1753, 1747, 1741, 1736, 1730, 1724,
+ 1718, 1713, 1707, 1702, 1696, 1691, 1685, 1680,
+ 1675, 1669, 1664, 1659, 1653, 1648, 1643, 1638,
+ 1633, 1628, 1623, 1618, 1613, 1608, 1603, 1598,
+ 1593, 1588, 1583, 1579, 1574, 1569, 1565, 1560,
+ 1555, 1551, 1546, 1542, 1537, 1533, 1528, 1524,
+ 1519, 1515, 1510, 1506, 1502, 1497, 1493, 1489,
+ 1485, 1481, 1476, 1472, 1468, 1464, 1460, 1456,
+ 1452, 1448, 1444, 1440, 1436, 1432, 1428, 1424,
+ 1420, 1416, 1413, 1409, 1405, 1401, 1398, 1394,
+ 1390, 1387, 1383, 1379, 1376, 1372, 1368, 1365,
+ 1361, 1358, 1354, 1351, 1347, 1344, 1340, 1337,
+ 1334, 1330, 1327, 1323, 1320, 1317, 1314, 1310,
+ 1307, 1304, 1300, 1297, 1294, 1291, 1288, 1285,
+ 1281, 1278, 1275, 1272, 1269, 1266, 1263, 1260,
+ 1257, 1254, 1251, 1248, 1245, 1242, 1239, 1236,
+ 1233, 1230, 1227, 1224, 1222, 1219, 1216, 1213,
+ 1210, 1208, 1205, 1202, 1199, 1197, 1194, 1191,
+ 1188, 1186, 1183, 1180, 1178, 1175, 1172, 1170,
+ 1167, 1165, 1162, 1159, 1157, 1154, 1152, 1149,
+ 1147, 1144, 1142, 1139, 1137, 1134, 1132, 1129,
+ 1127, 1125, 1122, 1120, 1117, 1115, 1113, 1110,
+ 1108, 1106, 1103, 1101, 1099, 1096, 1094, 1092,
+ 1089, 1087, 1085, 1083, 1081, 1078, 1076, 1074,
+ 1072, 1069, 1067, 1065, 1063, 1061, 1059, 1057,
+ 1054, 1052, 1050, 1048, 1046, 1044, 1042, 1040,
+ 1038, 1036, 1034, 1032, 1030, 1028, 1026, 1024,
+ 1022, 1020, 1018, 1016, 1014, 1012, 1010, 1008,
+ 1006, 1004, 1002, 1000, 998, 996, 994, 992,
+ 991, 989, 987, 985, 983, 981, 979, 978,
+ 976, 974, 972, 970, 969, 967, 965, 963,
+ 961, 960, 958, 956, 954, 953, 951, 949,
+ 948, 946, 944, 942, 941, 939, 937, 936,
+ 934, 932, 931, 929, 927, 926, 924, 923,
+ 921, 919, 918, 916, 914, 913, 911, 910,
+ 908, 907, 905, 903, 902, 900, 899, 897,
+ 896, 894, 893, 891, 890, 888, 887, 885,
+ 884, 882, 881, 879, 878, 876, 875, 873,
+ 872, 870, 869, 868, 866, 865, 863, 862,
+ 860, 859, 858, 856, 855, 853, 852, 851,
+ 849, 848, 846, 845, 844, 842, 841, 840,
+ 838, 837, 836, 834, 833, 832, 830, 829,
+ 828, 826, 825, 824, 823, 821, 820, 819,
+ 817, 816, 815, 814, 812, 811, 810, 809,
+ 807, 806, 805, 804, 802, 801, 800, 799,
+ 798, 796, 795, 794, 793, 791, 790, 789,
+ 788, 787, 786, 784, 783, 782, 781, 780,
+ 779, 777, 776, 775, 774, 773, 772, 771,
+ 769, 768, 767, 766, 765, 764, 763, 762,
+ 760, 759, 758, 757, 756, 755, 754, 753,
+ 752, 751, 750, 748, 747, 746, 745, 744,
+ 743, 742, 741, 740, 739, 738, 737, 736,
+ 735, 734, 733, 732, 731, 730, 729, 728,
+ 727, 726, 725, 724, 723, 722, 721, 720,
+ 719, 718, 717, 716, 715, 714, 713, 712,
+ 711, 710, 709, 708, 707, 706, 705, 704,
+ 703, 702, 701, 700, 699, 699, 698, 697,
+ 696, 695, 694, 693, 692, 691, 690, 689,
+ 688, 688, 687, 686, 685, 684, 683, 682,
+ 681, 680, 680, 679, 678, 677, 676, 675,
+ 674, 673, 673, 672, 671, 670, 669, 668,
+ 667, 667, 666, 665, 664, 663, 662, 661,
+ 661, 660, 659, 658, 657, 657, 656, 655,
+ 654, 653, 652, 652, 651, 650, 649, 648,
+ 648, 647, 646, 645, 644, 644, 643, 642,
+ 641, 640, 640, 639, 638, 637, 637, 636,
+ 635, 634, 633, 633, 632, 631, 630, 630,
+ 629, 628, 627, 627, 626, 625, 624, 624,
+ 623, 622, 621, 621, 620, 619, 618, 618,
+ 617, 616, 616, 615, 614, 613, 613, 612,
+ 611, 611, 610, 609, 608, 608, 607, 606,
+ 606, 605, 604, 604, 603, 602, 601, 601,
+ 600, 599, 599, 598, 597, 597, 596, 595,
+ 595, 594, 593, 593, 592, 591, 591, 590,
+ 589, 589, 588, 587, 587, 586, 585, 585,
+ 584, 583, 583, 582, 581, 581, 580, 579,
+ 579, 578, 578, 577, 576, 576, 575, 574,
+ 574, 573, 572, 572, 571, 571, 570, 569,
+ 569, 568, 568, 567, 566, 566, 565, 564,
+ 564, 563, 563, 562, 561, 561, 560, 560,
+ 559, 558, 558, 557, 557, 556, 555, 555,
+ 554, 554, 553, 553, 552, 551, 551, 550,
+ 550, 549, 548, 548, 547, 547, 546, 546,
+ 545, 544, 544, 543, 543, 542, 542, 541,
+ 541, 540, 539, 539, 538, 538, 537, 537,
+ 536, 536, 535, 534, 534, 533, 533, 532,
+ 532, 531, 531, 530, 530, 529, 529, 528,
+ 527, 527, 526, 526, 525, 525, 524, 524,
+ 523, 523, 522, 522, 521, 521, 520, 520,
+ 519, 519, 518, 518, 517, 517, 516, 516,
+ 515, 515, 514, 514
+};
+
+// Note that LinearToGamma() expects the values to be premultiplied by 4,
+// so we incorporate this factor 4 inside the DIVIDE_BY_ALPHA macro directly.
+#define DIVIDE_BY_ALPHA(sum, a) (((sum) * kInvAlpha[(a)]) >> (kAlphaFix - 2))
+
+#else
+
+#define DIVIDE_BY_ALPHA(sum, a) (4 * (sum) / (a))
+
+#endif // USE_INVERSE_ALPHA_TABLE
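A small self-check sketch of the stated overflow constraint (kGammaFix = 12 as defined earlier; illustration only):

    #include <assert.h>
    #include <stdint.h>

    // Verify that 'sum * kInvAlpha[a]' stays below 2^32 over the documented
    // operand range: sum <= a * ((1 << kGammaFix) - 1), with a in [1..4*0xff].
    static void CheckInvAlphaRange(void) {
      uint32_t a;
      for (a = 1; a <= 4 * 0xff; ++a) {
        const uint64_t max_sum = (uint64_t)a * ((1 << 12) - 1);  // kGammaFix=12
        assert(max_sum * kInvAlpha[a] < ((uint64_t)1 << 32));
      }
    }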
+
+static WEBP_INLINE int LinearToGammaWeighted(const uint8_t* src,
+ const uint8_t* a_ptr,
+ uint32_t total_a, int step,
+ int rgb_stride) {
+ const uint32_t sum =
+ a_ptr[0] * GammaToLinear(src[0]) +
+ a_ptr[step] * GammaToLinear(src[step]) +
+ a_ptr[rgb_stride] * GammaToLinear(src[rgb_stride]) +
+ a_ptr[rgb_stride + step] * GammaToLinear(src[rgb_stride + step]);
+ assert(total_a > 0 && total_a <= 4 * 0xff);
+#if defined(USE_INVERSE_ALPHA_TABLE)
+ assert((uint64_t)sum * kInvAlpha[total_a] < ((uint64_t)1 << 32));
+#endif
+ return LinearToGamma(DIVIDE_BY_ALPHA(sum, total_a), 0);
+}
+
+static WEBP_INLINE void ConvertRowToY(const uint8_t* const r_ptr,
+ const uint8_t* const g_ptr,
+ const uint8_t* const b_ptr,
+ int step,
+ uint8_t* const dst_y,
+ int width,
+ VP8Random* const rg) {
+ int i, j;
+ for (i = 0, j = 0; i < width; ++i, j += step) {
+ dst_y[i] = RGBToY(r_ptr[j], g_ptr[j], b_ptr[j], rg);
+ }
+}
+
+static WEBP_INLINE void ConvertRowsToUVWithAlpha(const uint8_t* const r_ptr,
+ const uint8_t* const g_ptr,
+ const uint8_t* const b_ptr,
+ const uint8_t* const a_ptr,
+ int rgb_stride,
+ uint8_t* const dst_u,
+ uint8_t* const dst_v,
+ int width,
+ VP8Random* const rg) {
+ int i, j;
+ // we loop over 2x2 blocks and produce one U/V value for each.
+ for (i = 0, j = 0; i < (width >> 1); ++i, j += 2 * sizeof(uint32_t)) {
+ const uint32_t a = SUM4ALPHA(a_ptr + j);
+ int r, g, b;
+ if (a == 4 * 0xff || a == 0) {
+ r = SUM4(r_ptr + j, 4);
+ g = SUM4(g_ptr + j, 4);
+ b = SUM4(b_ptr + j, 4);
+ } else {
+ r = LinearToGammaWeighted(r_ptr + j, a_ptr + j, a, 4, rgb_stride);
+ g = LinearToGammaWeighted(g_ptr + j, a_ptr + j, a, 4, rgb_stride);
+ b = LinearToGammaWeighted(b_ptr + j, a_ptr + j, a, 4, rgb_stride);
+ }
+ dst_u[i] = RGBToU(r, g, b, rg);
+ dst_v[i] = RGBToV(r, g, b, rg);
+ }
+ if (width & 1) {
+ const uint32_t a = 2u * SUM2ALPHA(a_ptr + j);
+ int r, g, b;
+ if (a == 4 * 0xff || a == 0) {
+ r = SUM2(r_ptr + j);
+ g = SUM2(g_ptr + j);
+ b = SUM2(b_ptr + j);
+ } else {
+ r = LinearToGammaWeighted(r_ptr + j, a_ptr + j, a, 0, rgb_stride);
+ g = LinearToGammaWeighted(g_ptr + j, a_ptr + j, a, 0, rgb_stride);
+ b = LinearToGammaWeighted(b_ptr + j, a_ptr + j, a, 0, rgb_stride);
+ }
+ dst_u[i] = RGBToU(r, g, b, rg);
+ dst_v[i] = RGBToV(r, g, b, rg);
+ }
+}
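Two details worth noting in the loop above: the a == 4 * 0xff and a == 0 cases take the unweighted SUM4 path, since weighting is a no-op for a fully opaque block and a division by zero for a fully transparent one; and the odd-width tail doubles the column sum (2u * SUM2ALPHA) while passing step = 0, so the lone column is counted twice and total_a stays in the same [1, 4 * 0xff] range served by the single kInvAlpha table.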
+
+static WEBP_INLINE void ConvertRowsToUV(const uint8_t* const r_ptr,
+ const uint8_t* const g_ptr,
+ const uint8_t* const b_ptr,
+ int step, int rgb_stride,
+ uint8_t* const dst_u,
+ uint8_t* const dst_v,
+ int width,
+ VP8Random* const rg) {
+ int i, j;
+ for (i = 0, j = 0; i < (width >> 1); ++i, j += 2 * step) {
+ const int r = SUM4(r_ptr + j, step);
+ const int g = SUM4(g_ptr + j, step);
+ const int b = SUM4(b_ptr + j, step);
+ dst_u[i] = RGBToU(r, g, b, rg);
+ dst_v[i] = RGBToV(r, g, b, rg);
+ }
+ if (width & 1) {
+ const int r = SUM2(r_ptr + j);
+ const int g = SUM2(g_ptr + j);
+ const int b = SUM2(b_ptr + j);
+ dst_u[i] = RGBToU(r, g, b, rg);
+ dst_v[i] = RGBToV(r, g, b, rg);
+ }
}
static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
@@ -175,59 +842,99 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
int step, // bytes per pixel
int rgb_stride, // bytes per scanline
float dithering,
+ int use_iterative_conversion,
WebPPicture* const picture) {
- int x, y;
+ int y;
const int width = picture->width;
const int height = picture->height;
const int has_alpha = CheckNonOpaque(a_ptr, width, height, step, rgb_stride);
- VP8Random rg;
- if (has_alpha) {
- picture->colorspace |= WEBP_CSP_ALPHA_BIT;
- } else {
- picture->colorspace &= WEBP_CSP_UV_MASK;
- }
+ picture->colorspace = has_alpha ? WEBP_YUV420A : WEBP_YUV420;
picture->use_argb = 0;
- if (!WebPPictureAllocYUVA(picture, width, height)) return 0;
-
- VP8InitRandom(&rg, dithering);
- InitGammaTables();
+ // disable smart conversion if the source is too small (it would be overkill).
+ if (width < kMinDimensionIterativeConversion ||
+ height < kMinDimensionIterativeConversion) {
+ use_iterative_conversion = 0;
+ }
- // Import luma plane
- for (y = 0; y < height; ++y) {
- uint8_t* const dst = &picture->y[y * picture->y_stride];
- for (x = 0; x < width; ++x) {
- const int offset = step * x + y * rgb_stride;
- dst[x] = RGBToY(r_ptr[offset], g_ptr[offset], b_ptr[offset], &rg);
- }
+ if (!WebPPictureAllocYUVA(picture, width, height)) {
+ return 0;
+ }
+ if (has_alpha) {
+ WebPInitAlphaProcessing();
+ assert(step == 4);
+#if defined(USE_INVERSE_ALPHA_TABLE)
+ assert(kAlphaFix + kGammaFix <= 31);
+#endif
}
- // Downsample U/V plane
- for (y = 0; y < (height >> 1); ++y) {
- for (x = 0; x < (width >> 1); ++x) {
- RGB_TO_UV(x, y, SUM4);
- }
- if (width & 1) {
- RGB_TO_UV(x, y, SUM2V);
+ if (use_iterative_conversion) {
+ InitGammaTablesF();
+ if (!PreprocessARGB(r_ptr, g_ptr, b_ptr, step, rgb_stride, picture)) {
+ return 0;
}
- }
- if (height & 1) {
- for (x = 0; x < (width >> 1); ++x) {
- RGB_TO_UV(x, y, SUM2H);
+ if (has_alpha) {
+ WebPExtractAlpha(a_ptr, rgb_stride, width, height,
+ picture->a, picture->a_stride);
}
- if (width & 1) {
- RGB_TO_UV(x, y, SUM1);
+ } else {
+ uint8_t* dst_y = picture->y;
+ uint8_t* dst_u = picture->u;
+ uint8_t* dst_v = picture->v;
+ uint8_t* dst_a = picture->a;
+
+ VP8Random base_rg;
+ VP8Random* rg = NULL;
+ if (dithering > 0.) {
+ VP8InitRandom(&base_rg, dithering);
+ rg = &base_rg;
}
- }
- if (has_alpha) {
- assert(step >= 4);
- assert(picture->a != NULL);
- for (y = 0; y < height; ++y) {
- for (x = 0; x < width; ++x) {
- picture->a[x + y * picture->a_stride] =
- a_ptr[step * x + y * rgb_stride];
+ InitGammaTables();
+
+ // Downsample Y/U/V planes, two rows at a time
+ for (y = 0; y < (height >> 1); ++y) {
+ int rows_have_alpha = has_alpha;
+ const int off1 = (2 * y + 0) * rgb_stride;
+ const int off2 = (2 * y + 1) * rgb_stride;
+ ConvertRowToY(r_ptr + off1, g_ptr + off1, b_ptr + off1, step,
+ dst_y, width, rg);
+ ConvertRowToY(r_ptr + off2, g_ptr + off2, b_ptr + off2, step,
+ dst_y + picture->y_stride, width, rg);
+ dst_y += 2 * picture->y_stride;
+ if (has_alpha) {
+ rows_have_alpha &= !WebPExtractAlpha(a_ptr + off1, rgb_stride,
+ width, 2,
+ dst_a, picture->a_stride);
+ dst_a += 2 * picture->a_stride;
+ }
+ if (!rows_have_alpha) {
+ ConvertRowsToUV(r_ptr + off1, g_ptr + off1, b_ptr + off1,
+ step, rgb_stride, dst_u, dst_v, width, rg);
+ } else {
+ ConvertRowsToUVWithAlpha(r_ptr + off1, g_ptr + off1, b_ptr + off1,
+ a_ptr + off1, rgb_stride,
+ dst_u, dst_v, width, rg);
+ }
+ dst_u += picture->uv_stride;
+ dst_v += picture->uv_stride;
+ }
+ if (height & 1) { // extra last row
+ const int off = 2 * y * rgb_stride;
+ int row_has_alpha = has_alpha;
+ ConvertRowToY(r_ptr + off, g_ptr + off, b_ptr + off, step,
+ dst_y, width, rg);
+ if (row_has_alpha) {
+ row_has_alpha &= !WebPExtractAlpha(a_ptr + off, 0, width, 1, dst_a, 0);
+ }
+ if (!row_has_alpha) {
+ ConvertRowsToUV(r_ptr + off, g_ptr + off, b_ptr + off,
+ step, 0, dst_u, dst_v, width, rg);
+ } else {
+ ConvertRowsToUVWithAlpha(r_ptr + off, g_ptr + off, b_ptr + off,
+ a_ptr + off, 0,
+ dst_u, dst_v, width, rg);
}
}
}
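The restructured import walks two source rows per iteration so the 2x2 chroma averaging sees a whole block at once; alpha rows are routed through WebPExtractAlpha(), whose return value reports whether the extracted rows were entirely 0xff, letting the loop fall back to the cheaper ConvertRowsToUV() whenever a row pair carries no actual transparency. Dithering is likewise skipped outright (rg stays NULL) when the amplitude is zero.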
@@ -235,19 +942,20 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
}
#undef SUM4
-#undef SUM2V
-#undef SUM2H
-#undef SUM1
-#undef RGB_TO_UV
+#undef SUM2
+#undef SUM4ALPHA
+#undef SUM2ALPHA
//------------------------------------------------------------------------------
// call for ARGB->YUVA conversion
-int WebPPictureARGBToYUVADithered(WebPPicture* picture, WebPEncCSP colorspace,
- float dithering) {
+static int PictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace,
+ float dithering, int use_iterative_conversion) {
if (picture == NULL) return 0;
if (picture->argb == NULL) {
return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
+ } else if ((colorspace & WEBP_CSP_UV_MASK) != WEBP_YUV420) {
+ return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
} else {
const uint8_t* const argb = (const uint8_t*)picture->argb;
const uint8_t* const r = ALPHA_IS_LAST ? argb + 2 : argb + 1;
@@ -255,15 +963,26 @@ int WebPPictureARGBToYUVADithered(WebPPicture* picture, WebPEncCSP colorspace,
const uint8_t* const b = ALPHA_IS_LAST ? argb + 0 : argb + 3;
const uint8_t* const a = ALPHA_IS_LAST ? argb + 3 : argb + 0;
- picture->colorspace = colorspace;
+ picture->colorspace = WEBP_YUV420;
return ImportYUVAFromRGBA(r, g, b, a, 4, 4 * picture->argb_stride,
- dithering, picture);
+ dithering, use_iterative_conversion, picture);
}
}
+int WebPPictureARGBToYUVADithered(WebPPicture* picture, WebPEncCSP colorspace,
+ float dithering) {
+ return PictureARGBToYUVA(picture, colorspace, dithering, 0);
+}
+
int WebPPictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace) {
- return WebPPictureARGBToYUVADithered(picture, colorspace, 0.f);
+ return PictureARGBToYUVA(picture, colorspace, 0.f, 0);
+}
+
+#if WEBP_ENCODER_ABI_VERSION > 0x0204
+int WebPPictureSmartARGBToYUVA(WebPPicture* picture) {
+ return PictureARGBToYUVA(picture, WEBP_YUV420, 0.f, 1);
}
+#endif
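For reference, a hedged usage sketch of the new entry point (ConvertSmart is an illustrative helper, not part of the patch; it assumes a build where WEBP_ENCODER_ABI_VERSION > 0x0204 so the symbol exists):

#include "webp/encode.h"

/* Illustrative helper: convert an RGBA buffer to YUV420(A) via the new
 * iterative converter instead of the plain averaging path. */
static int ConvertSmart(const uint8_t* const rgba, int width, int height) {
  WebPPicture pic;
  if (!WebPPictureInit(&pic)) return 0;
  pic.width = width;
  pic.height = height;
  pic.use_argb = 1;                        /* import into the ARGB plane */
  if (!WebPPictureImportRGBA(&pic, rgba, width * 4) ||
      !WebPPictureSmartARGBToYUVA(&pic)) {
    WebPPictureFree(&pic);
    return 0;
  }
  /* pic.y / pic.u / pic.v (and pic.a if present) now hold the planes. */
  WebPPictureFree(&pic);
  return 1;
}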
//------------------------------------------------------------------------------
// call for YUVA -> ARGB conversion
@@ -343,7 +1062,7 @@ static int Import(WebPPicture* const picture,
if (!picture->use_argb) {
return ImportYUVAFromRGBA(r_ptr, g_ptr, b_ptr, a_ptr, step, rgb_stride,
- 0.f /* no dithering */, picture);
+ 0.f /* no dithering */, 0, picture);
}
if (!WebPPictureAlloc(picture)) return 0;
diff --git a/src/enc/vp8enci.h b/src/enc/vp8enci.h
index b543172e..dbc4b66f 100644
--- a/src/enc/vp8enci.h
+++ b/src/enc/vp8enci.h
@@ -30,7 +30,7 @@ extern "C" {
// version numbers
#define ENC_MAJ_VERSION 0
#define ENC_MIN_VERSION 4
-#define ENC_REV_VERSION 1
+#define ENC_REV_VERSION 2
// intra prediction modes
enum { B_DC_PRED = 0, // 4x4 modes
@@ -457,10 +457,10 @@ struct VP8Encoder {
VP8MBInfo* mb_info_; // contextual macroblock infos (mb_w_ + 1)
uint8_t* preds_; // predictions modes: (4*mb_w+1) * (4*mb_h+1)
uint32_t* nz_; // non-zero bit context: mb_w+1
- uint8_t *y_top_; // top luma samples.
- uint8_t *uv_top_; // top u/v samples.
+ uint8_t* y_top_; // top luma samples.
+ uint8_t* uv_top_; // top u/v samples.
// U and V are packed into 16 bytes (8 U + 8 V)
- LFStats *lf_stats_; // autofilter stats (if NULL, autofilter is off)
+ LFStats* lf_stats_; // autofilter stats (if NULL, autofilter is off)
};
//------------------------------------------------------------------------------
@@ -571,7 +571,7 @@ int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height);
//------------------------------------------------------------------------------
-#if WEBP_ENCODER_ABI_VERSION <= 0x0202
+#if WEBP_ENCODER_ABI_VERSION <= 0x0203
void WebPMemoryWriterClear(WebPMemoryWriter* writer);
#endif
diff --git a/src/enc/webpenc.c b/src/enc/webpenc.c
index fe8a358f..0cb83f12 100644
--- a/src/enc/webpenc.c
+++ b/src/enc/webpenc.c
@@ -328,16 +328,24 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
VP8Encoder* enc = NULL;
if (pic->y == NULL || pic->u == NULL || pic->v == NULL) {
// Make sure we have YUVA samples.
- float dithering = 0.f;
- if (config->preprocessing & 2) {
- const float x = config->quality / 100.f;
- const float x2 = x * x;
- // slowly decreasing from max dithering at low quality (q->0)
- // to 0.5 dithering amplitude at high quality (q->100)
- dithering = 1.0f + (0.5f - 1.0f) * x2 * x2;
- }
- if (!WebPPictureARGBToYUVADithered(pic, WEBP_YUV420, dithering)) {
- return 0;
+ if (config->preprocessing & 4) {
+#if WEBP_ENCODER_ABI_VERSION > 0x0204
+ if (!WebPPictureSmartARGBToYUVA(pic)) {
+ return 0;
+ }
+#endif
+ } else {
+ float dithering = 0.f;
+ if (config->preprocessing & 2) {
+ const float x = config->quality / 100.f;
+ const float x2 = x * x;
+ // slowly decreasing from max dithering at low quality (q->0)
+ // to 0.5 dithering amplitude at high quality (q->100)
+ dithering = 1.0f + (0.5f - 1.0f) * x2 * x2;
+ }
+ if (!WebPPictureARGBToYUVADithered(pic, WEBP_YUV420, dithering)) {
+ return 0;
+ }
}
}
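The dispatch above keys on bit 2 of config->preprocessing (value 4) for the smart converter, while bit 1 (value 2) still selects pseudo-random dithering, whose amplitude is 1 - 0.5 * x^4 with x = quality / 100: 1.0 at q = 0, roughly 0.97 at q = 50, and 0.5 at q = 100. A small illustrative helper (hypothetical name) showing how a caller would opt in:

#include "webp/encode.h"

/* Hypothetical helper: request the smart ARGB->YUVA conversion. The flag
 * only matters when pic->y/u/v are still NULL, i.e. for ARGB input. */
static int EncodeWithSmartConversion(WebPConfig* const config,
                                     WebPPicture* const pic) {
  config->preprocessing |= 4;   /* bit 2: smart RGB->YUV conversion */
  /* (config->preprocessing |= 2 would pick dithered conversion instead.) */
  return WebPEncode(config, pic);
}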
diff --git a/src/utils/bit_reader.c b/src/utils/bit_reader.c
index 55b08cc1..bbddd42c 100644
--- a/src/utils/bit_reader.c
+++ b/src/utils/bit_reader.c
@@ -105,9 +105,7 @@ int32_t VP8GetSignedValue(VP8BitReader* const br, int bits) {
//------------------------------------------------------------------------------
// VP8LBitReader
-#define LBITS 64 // Number of bits prefetched.
-#define WBITS 32 // Minimum number of bytes needed after VP8LFillBitWindow.
-#define LOG8_WBITS 4 // Number of bytes needed to store WBITS bits.
+#define VP8L_LOG8_WBITS 4 // Number of bytes needed to store VP8L_WBITS bits.
#if !defined(WEBP_FORCE_ALIGNED) && \
(defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || \
@@ -151,16 +149,6 @@ void VP8LInitBitReader(VP8LBitReader* const br, const uint8_t* const start,
br->buf_ = start;
}
-// Special version that assumes br->pos_ <= br_len_.
-static int IsEndOfStreamSpecial(const VP8LBitReader* const br) {
- assert(br->pos_ <= br->len_);
- return br->pos_ == br->len_ && br->bit_pos_ >= LBITS;
-}
-
-static int IsEndOfStream(const VP8LBitReader* const br) {
- return (br->pos_ > br->len_) || IsEndOfStreamSpecial(br);
-}
-
void VP8LBitReaderSetBuffer(VP8LBitReader* const br,
const uint8_t* const buf, size_t len) {
assert(br != NULL);
@@ -168,38 +156,39 @@ void VP8LBitReaderSetBuffer(VP8LBitReader* const br,
assert(len < 0xfffffff8u); // can't happen with a RIFF chunk.
br->buf_ = buf;
br->len_ = len;
- br->eos_ = IsEndOfStream(br);
+ // pos_ > len_ should be considered a param error.
+ br->error_ = (br->pos_ > br->len_);
+ br->eos_ = br->error_ || VP8LIsEndOfStream(br);
}
-// If not at EOS, reload up to LBITS byte-by-byte
+// If not at EOS, reload up to VP8L_LBITS bits, one byte at a time.
static void ShiftBytes(VP8LBitReader* const br) {
while (br->bit_pos_ >= 8 && br->pos_ < br->len_) {
br->val_ >>= 8;
- br->val_ |= ((vp8l_val_t)br->buf_[br->pos_]) << (LBITS - 8);
+ br->val_ |= ((vp8l_val_t)br->buf_[br->pos_]) << (VP8L_LBITS - 8);
++br->pos_;
br->bit_pos_ -= 8;
}
+ br->eos_ = VP8LIsEndOfStream(br);
}
-void VP8LFillBitWindow(VP8LBitReader* const br) {
- if (br->bit_pos_ >= WBITS) {
- // TODO(jzern): given the fixed read size it may be possible to force
- // alignment in this block.
+void VP8LDoFillBitWindow(VP8LBitReader* const br) {
+ assert(br->bit_pos_ >= VP8L_WBITS);
+ // TODO(jzern): given the fixed read size it may be possible to force
+ // alignment in this block.
#if defined(VP8L_USE_UNALIGNED_LOAD)
- if (br->pos_ + sizeof(br->val_) < br->len_) {
- br->val_ >>= WBITS;
- br->bit_pos_ -= WBITS;
- // The expression below needs a little-endian arch to work correctly.
- // This gives a large speedup for decoding speed.
- br->val_ |= (vp8l_val_t)*(const uint32_t*)(br->buf_ + br->pos_) <<
- (LBITS - WBITS);
- br->pos_ += LOG8_WBITS;
- return;
- }
-#endif
- ShiftBytes(br); // Slow path.
- br->eos_ = IsEndOfStreamSpecial(br);
+ if (br->pos_ + sizeof(br->val_) < br->len_) {
+ br->val_ >>= VP8L_WBITS;
+ br->bit_pos_ -= VP8L_WBITS;
+ // The expression below needs a little-endian arch to work correctly.
+ // This gives a large speedup for decoding speed.
+ br->val_ |= (vp8l_val_t)*(const uint32_t*)(br->buf_ + br->pos_) <<
+ (VP8L_LBITS - VP8L_WBITS);
+ br->pos_ += VP8L_LOG8_WBITS;
+ return;
}
+#endif
+ ShiftBytes(br); // Slow path.
}
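The VP8L_USE_UNALIGNED_LOAD fast path depends on byte order: the 32-bit load is OR'd into the top half of the 64-bit window, and only on little-endian targets do the loaded bytes land in the same order the bit reader consumes them; hence the comment in the code, and hence ShiftBytes() remaining the portable byte-at-a-time fallback.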
uint32_t VP8LReadBits(VP8LBitReader* const br, int n_bits) {
@@ -210,8 +199,6 @@ uint32_t VP8LReadBits(VP8LBitReader* const br, int n_bits) {
(uint32_t)(br->val_ >> br->bit_pos_) & kBitMask[n_bits];
const int new_bits = br->bit_pos_ + n_bits;
br->bit_pos_ = new_bits;
- // If this read is going to cross the read buffer, set the eos flag.
- br->eos_ = IsEndOfStreamSpecial(br);
ShiftBytes(br);
return val;
} else {
diff --git a/src/utils/bit_reader.h b/src/utils/bit_reader.h
index 2c1e0872..a6ae85e5 100644
--- a/src/utils/bit_reader.h
+++ b/src/utils/bit_reader.h
@@ -107,6 +107,9 @@ int32_t VP8GetSignedValue(VP8BitReader* const br, int num_bits);
// maximum number of bits (inclusive) the bit-reader can handle:
#define VP8L_MAX_NUM_BIT_READ 24
+#define VP8L_LBITS 64 // Number of bits prefetched.
+#define VP8L_WBITS 32 // Minimum number of bits ready after VP8LFillBitWindow.
+
typedef uint64_t vp8l_val_t; // right now, this bit-reader can only use 64bit.
typedef struct {
@@ -138,14 +141,26 @@ static WEBP_INLINE uint32_t VP8LPrefetchBits(VP8LBitReader* const br) {
return (uint32_t)(br->val_ >> br->bit_pos_);
}
+// Returns true if there was an attempt at reading bits past the end of
+// the buffer. Doesn't set br->eos_ flag.
+static WEBP_INLINE int VP8LIsEndOfStream(const VP8LBitReader* const br) {
+ assert(br->pos_ <= br->len_);
+ return (br->pos_ == br->len_) && (br->bit_pos_ > VP8L_LBITS);
+}
+
// For jumping over a number of bits in the bit stream when accessed with
// VP8LPrefetchBits and VP8LFillBitWindow.
static WEBP_INLINE void VP8LSetBitPos(VP8LBitReader* const br, int val) {
br->bit_pos_ = val;
+ br->eos_ = VP8LIsEndOfStream(br);
}
// Advances the read buffer by 4 bytes to make room for reading next 32 bits.
-void VP8LFillBitWindow(VP8LBitReader* const br);
+// Speed critical; the infrequent slow path can stay non-inlined.
+extern void VP8LDoFillBitWindow(VP8LBitReader* const br);
+static WEBP_INLINE void VP8LFillBitWindow(VP8LBitReader* const br) {
+ if (br->bit_pos_ >= VP8L_WBITS) VP8LDoFillBitWindow(br);
+}
#ifdef __cplusplus
} // extern "C"
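The header half of this change is the classic inline-wrapper split: the cheap bit_pos_ test stays inlined in every caller, and only the rare refill pays a function call. A sketch of the same idiom with hypothetical names (Reader, DoRefill, and MaybeRefill are not libwebp symbols):

#include <stdint.h>

/* Hypothetical reader with the same shape as VP8LBitReader's fast path. */
typedef struct {
  uint64_t val;   /* prefetched bits */
  int bit_pos;    /* bits already consumed from val */
} Reader;

/* Out-of-line slow path: called rarely, so the call cost is amortized. */
static void DoRefill(Reader* const r) {
  /* ...reload r->val from the byte stream here... */
  r->bit_pos -= 32;
}

/* Inlined hot path: in the common case this compiles to a compare and a
 * skipped branch, mirroring VP8LFillBitWindow()/VP8LDoFillBitWindow(). */
static inline void MaybeRefill(Reader* const r) {
  if (r->bit_pos >= 32) DoRefill(r);
}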
diff --git a/src/utils/bit_writer.c b/src/utils/bit_writer.c
index 23031f60..9875ca66 100644
--- a/src/utils/bit_writer.c
+++ b/src/utils/bit_writer.c
@@ -52,7 +52,7 @@ static int BitWriterResize(VP8BitWriter* const bw, size_t extra_size) {
return 1;
}
-static void kFlush(VP8BitWriter* const bw) {
+static void Flush(VP8BitWriter* const bw) {
const int s = 8 + bw->nb_bits_;
const int32_t bits = bw->value_ >> s;
assert(bw->nb_bits_ >= 0);
@@ -118,7 +118,7 @@ int VP8PutBit(VP8BitWriter* const bw, int bit, int prob) {
bw->range_ = kNewRange[bw->range_];
bw->value_ <<= shift;
bw->nb_bits_ += shift;
- if (bw->nb_bits_ > 0) kFlush(bw);
+ if (bw->nb_bits_ > 0) Flush(bw);
}
return bit;
}
@@ -135,7 +135,7 @@ int VP8PutBitUniform(VP8BitWriter* const bw, int bit) {
bw->range_ = kNewRange[bw->range_];
bw->value_ <<= 1;
bw->nb_bits_ += 1;
- if (bw->nb_bits_ > 0) kFlush(bw);
+ if (bw->nb_bits_ > 0) Flush(bw);
}
return bit;
}
@@ -173,14 +173,14 @@ int VP8BitWriterInit(VP8BitWriter* const bw, size_t expected_size) {
uint8_t* VP8BitWriterFinish(VP8BitWriter* const bw) {
VP8PutValue(bw, 0, 9 - bw->nb_bits_);
bw->nb_bits_ = 0; // pad with zeroes
- kFlush(bw);
+ Flush(bw);
return bw->buf_;
}
int VP8BitWriterAppend(VP8BitWriter* const bw,
const uint8_t* data, size_t size) {
assert(data != NULL);
- if (bw->nb_bits_ != -8) return 0; // kFlush() must have been called
+ if (bw->nb_bits_ != -8) return 0; // Flush() must have been called
if (!BitWriterResize(bw, size)) return 0;
memcpy(bw->buf_ + bw->pos_, data, size);
bw->pos_ += size;
diff --git a/src/utils/endian_inl.h b/src/utils/endian_inl.h
index 4c6b4fe4..f362a6e8 100644
--- a/src/utils/endian_inl.h
+++ b/src/utils/endian_inl.h
@@ -16,6 +16,7 @@
#include "webp/config.h"
#endif
+#include "../dsp/dsp.h"
#include "webp/types.h"
// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
@@ -34,25 +35,13 @@
#endif
#if !defined(HAVE_CONFIG_H)
-#ifdef __GNUC__
-# define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__)
-#else
-# define LOCAL_GCC_VERSION 0
-#endif // __GNUC__
-
-#ifdef __clang__
-# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
-#else
-# define LOCAL_CLANG_VERSION 0
-#endif // __clang__
-
// clang-3.3 and gcc-4.3 have builtin functions for swap32/swap64
-#if LOCAL_GCC_VERSION >= 0x403 || LOCAL_CLANG_VERSION >= 0x303
+#if LOCAL_GCC_PREREQ(4,3) || LOCAL_CLANG_PREREQ(3,3)
#define HAVE_BUILTIN_BSWAP32
#define HAVE_BUILTIN_BSWAP64
#endif
// clang-3.3 and gcc-4.8 have a builtin function for swap16
-#if LOCAL_GCC_VERSION >= 0x408 || LOCAL_CLANG_VERSION >= 0x303
+#if LOCAL_GCC_PREREQ(4,8) || LOCAL_CLANG_PREREQ(3,3)
#define HAVE_BUILTIN_BSWAP16
#endif
#endif // !HAVE_CONFIG_H
@@ -69,7 +58,16 @@ static WEBP_INLINE uint16_t BSwap16(uint16_t x) {
}
static WEBP_INLINE uint32_t BSwap32(uint32_t x) {
-#if defined(HAVE_BUILTIN_BSWAP32)
+#if defined(WEBP_USE_MIPS32_R2)
+ uint32_t ret;
+ __asm__ volatile (
+ "wsbh %[ret], %[x] \n\t"
+ "rotr %[ret], %[ret], 16 \n\t"
+ : [ret]"=r"(ret)
+ : [x]"r"(x)
+ );
+ return ret;
+#elif defined(HAVE_BUILTIN_BSWAP32)
return __builtin_bswap32(x);
#elif defined(__i386__) || defined(__x86_64__)
uint32_t swapped_bytes;
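Reading the MIPS32r2 pair: wsbh swaps the two bytes inside each 16-bit half, and rotr by 16 then exchanges the halves, which together reverse all four bytes. The same two steps in portable C, as a reference sketch (BSwap32Portable is illustrative, not part of the patch):

#include <stdint.h>

/* Same two steps as the wsbh + rotr sequence, in portable C. */
static uint32_t BSwap32Portable(uint32_t x) {
  const uint32_t wsbh = ((x & 0x00ff00ffu) << 8) | ((x >> 8) & 0x00ff00ffu);
  return (wsbh >> 16) | (wsbh << 16);   /* rotr by 16 */
}
/* e.g. BSwap32Portable(0x01234567) == 0x67452301 */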
diff --git a/src/utils/quant_levels_dec.c b/src/utils/quant_levels_dec.c
index c599e40a..5b8b8b49 100644
--- a/src/utils/quant_levels_dec.c
+++ b/src/utils/quant_levels_dec.c
@@ -32,10 +32,10 @@
#define DSIZE 4 // dithering size (must be a power of two)
// cf. http://en.wikipedia.org/wiki/Ordered_dithering
static const uint8_t kOrderedDither[DSIZE][DSIZE] = {
- { 0, 8, 2, 10 }, // coefficients are in DFIX fixed-point precision
- { 12, 4, 14, 6 },
- { 3, 11, 1, 9 },
- { 15, 7, 13, 5 }
+ { 0, 8, 2, 10 }, // coefficients are in DFIX fixed-point precision
+ { 12, 4, 14, 6 },
+ { 3, 11, 1, 9 },
+ { 15, 7, 13, 5 }
};
#else
diff --git a/src/utils/utils.c b/src/utils/utils.c
index 4a86886e..8ff7f12f 100644
--- a/src/utils/utils.c
+++ b/src/utils/utils.c
@@ -155,9 +155,9 @@ static void SubMem(void* ptr) {
}
#else
-#define Increment(v) do {} while(0)
-#define AddMem(p, s) do {} while(0)
-#define SubMem(p) do {} while(0)
+#define Increment(v) do {} while (0)
+#define AddMem(p, s) do {} while (0)
+#define SubMem(p) do {} while (0)
#endif
// Returns 0 in case of overflow of nmemb * size.