8 files changed, 157 insertions, 135 deletions
diff --git a/source/libvpx/vp8/common/arm/neon/loopfilter_neon.c b/source/libvpx/vp8/common/arm/neon/loopfilter_neon.c
index e103476..9d6807a 100644
--- a/source/libvpx/vp8/common/arm/neon/loopfilter_neon.c
+++ b/source/libvpx/vp8/common/arm/neon/loopfilter_neon.c
@@ -10,6 +10,7 @@
 
 #include <arm_neon.h>
 #include "./vpx_config.h"
+#include "vpx_ports/arm.h"
 
 static INLINE void vp8_loop_filter_neon(
         uint8x16_t qblimit,  // flimit
@@ -251,38 +252,56 @@ void vp8_loop_filter_horizontal_edge_uv_neon(
     return;
 }
 
-#if (__GNUC__ == 4 && (__GNUC_MINOR__ == 6))
-#warning Using GCC 4.6 is not recommended
-// Some versions of gcc4.6 do not correctly process vst4_lane_u8. When built
-// with any gcc4.6, use the C code.
-extern void vp8_loop_filter_vertical_edge_c(unsigned char *s, int p,
-                                            const unsigned char *blimit,
-                                            const unsigned char *limit,
-                                            const unsigned char *thresh,
-                                            int count);
-
-void vp8_loop_filter_vertical_edge_y_neon(
-        unsigned char *src,
-        int pitch,
-        unsigned char blimit,
-        unsigned char limit,
-        unsigned char thresh) {
-  vp8_loop_filter_vertical_edge_c(src, pitch, &blimit, &limit, &thresh, 2);
-}
-
-void vp8_loop_filter_vertical_edge_uv_neon(
-        unsigned char *u,
-        int pitch,
-        unsigned char blimit,
-        unsigned char limit,
-        unsigned char thresh,
-        unsigned char *v) {
-  vp8_loop_filter_vertical_edge_c(u, pitch, &blimit, &limit, &thresh, 1);
-  vp8_loop_filter_vertical_edge_c(v, pitch, &blimit, &limit, &thresh, 1);
-}
-#else
 static INLINE void write_4x8(unsigned char *dst, int pitch,
                              const uint8x8x4_t result) {
+#ifdef VPX_INCOMPATIBLE_GCC
+    /*
+     * uint8x8x4_t result
+    00 01 02 03 | 04 05 06 07
+    10 11 12 13 | 14 15 16 17
+    20 21 22 23 | 24 25 26 27
+    30 31 32 33 | 34 35 36 37
+    ---
+    * after vtrn_u16
+    00 01 20 21 | 04 05 24 25
+    02 03 22 23 | 06 07 26 27
+    10 11 30 31 | 14 15 34 35
+    12 13 32 33 | 16 17 36 37
+    ---
+    * after vtrn_u8
+    00 10 20 30 | 04 14 24 34
+    01 11 21 31 | 05 15 25 35
+    02 12 22 32 | 06 16 26 36
+    03 13 23 33 | 07 17 27 37
+    */
+    const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]),
+                                          vreinterpret_u16_u8(result.val[2]));
+    const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]),
+                                          vreinterpret_u16_u8(result.val[3]));
+    const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
+                                       vreinterpret_u8_u16(r13_u16.val[0]));
+    const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
+                                       vreinterpret_u8_u16(r13_u16.val[1]));
+    const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]);
+    const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]);
+    const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]);
+    const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]);
+    vst1_lane_u32((uint32_t *)dst, x_0_4, 0);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_1_5, 0);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_2_6, 0);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_3_7, 0);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_0_4, 1);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_1_5, 1);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_2_6, 1);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_3_7, 1);
+#else
     vst4_lane_u8(dst, result, 0);
     dst += pitch;
     vst4_lane_u8(dst, result, 1);
@@ -298,6 +317,7 @@ static INLINE void write_4x8(unsigned char *dst, int pitch,
     vst4_lane_u8(dst, result, 6);
     dst += pitch;
     vst4_lane_u8(dst, result, 7);
+#endif  // VPX_INCOMPATIBLE_GCC
 }
 
 void vp8_loop_filter_vertical_edge_y_neon(
@@ -528,4 +548,3 @@ void vp8_loop_filter_vertical_edge_uv_neon(
     vd = v - 2;
     write_4x8(vd, pitch, q4ResultH);
 }
-#endif  // (__GNUC__ == 4 && (__GNUC_MINOR__ == 6))
diff --git a/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c b/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
index d5178bb..e1c8609 100644
--- a/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
+++ b/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
@@ -10,45 +10,9 @@
 
 #include <arm_neon.h>
 #include "./vpx_config.h"
+#include "vpx_ports/arm.h"
 
-#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
-static INLINE void write_2x8(unsigned char *dst, int pitch,
-                             const uint8x8x2_t result,
-                             const uint8x8x2_t result2) {
-  vst2_lane_u8(dst, result, 0);
-  dst += pitch;
-  vst2_lane_u8(dst, result, 1);
-  dst += pitch;
-  vst2_lane_u8(dst, result, 2);
-  dst += pitch;
-  vst2_lane_u8(dst, result, 3);
-  dst += pitch;
-  vst2_lane_u8(dst, result, 4);
-  dst += pitch;
-  vst2_lane_u8(dst, result, 5);
-  dst += pitch;
-  vst2_lane_u8(dst, result, 6);
-  dst += pitch;
-  vst2_lane_u8(dst, result, 7);
-  dst += pitch;
-
-  vst2_lane_u8(dst, result2, 0);
-  dst += pitch;
-  vst2_lane_u8(dst, result2, 1);
-  dst += pitch;
-  vst2_lane_u8(dst, result2, 2);
-  dst += pitch;
-  vst2_lane_u8(dst, result2, 3);
-  dst += pitch;
-  vst2_lane_u8(dst, result2, 4);
-  dst += pitch;
-  vst2_lane_u8(dst, result2, 5);
-  dst += pitch;
-  vst2_lane_u8(dst, result2, 6);
-  dst += pitch;
-  vst2_lane_u8(dst, result2, 7);
-}
-#else
+#ifdef VPX_INCOMPATIBLE_GCC
 static INLINE void write_2x4(unsigned char *dst, int pitch,
                              const uint8x8x2_t result) {
     /*
@@ -88,30 +52,47 @@ static INLINE void write_2x8(unsigned char *dst, int pitch,
   dst += pitch * 8;
   write_2x4(dst, pitch, result2);
 }
-#endif
-
+#else
+static INLINE void write_2x8(unsigned char *dst, int pitch,
+                             const uint8x8x2_t result,
+                             const uint8x8x2_t result2) {
+  vst2_lane_u8(dst, result, 0);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 1);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 2);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 3);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 4);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 5);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 6);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 7);
+  dst += pitch;
 
-#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
-static INLINE
-uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
-    x = vld4_lane_u8(src, x, 0);
-    src += pitch;
-    x = vld4_lane_u8(src, x, 1);
-    src += pitch;
-    x = vld4_lane_u8(src, x, 2);
-    src += pitch;
-    x = vld4_lane_u8(src, x, 3);
-    src += pitch;
-    x = vld4_lane_u8(src, x, 4);
-    src += pitch;
-    x = vld4_lane_u8(src, x, 5);
-    src += pitch;
-    x = vld4_lane_u8(src, x, 6);
-    src += pitch;
-    x = vld4_lane_u8(src, x, 7);
-    return x;
+  vst2_lane_u8(dst, result2, 0);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 1);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 2);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 3);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 4);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 5);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 6);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 7);
 }
-#else
+#endif  // VPX_INCOMPATIBLE_GCC
+
+
+#ifdef VPX_INCOMPATIBLE_GCC
 static INLINE
 uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
     const uint8x8_t a = vld1_u8(src);
@@ -169,7 +150,27 @@ uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
 
     return x;
 }
-#endif
+#else
+static INLINE
+uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
+    x = vld4_lane_u8(src, x, 0);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 1);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 2);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 3);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 4);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 5);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 6);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 7);
+    return x;
+}
+#endif  // VPX_INCOMPATIBLE_GCC
 
 static INLINE void vp8_loop_filter_simple_vertical_edge_neon(
         unsigned char *s,
diff --git a/source/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c b/source/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c
index ffa3d91..5ad9465 100644
--- a/source/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c
+++ b/source/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c
@@ -9,11 +9,9 @@
  */
 
 #include <arm_neon.h>
+#include "vpx_ports/arm.h"
 
-#if (__GNUC__ == 4 && (__GNUC_MINOR__ == 6))
-#warning Using GCC 4.6 is not recommended
-// Some versions of gcc4.6 do not correctly process this function. When built
-// with any gcc4.6, use the C code.
+#ifdef VPX_INCOMPATIBLE_GCC
 #include "./vp8_rtcd.h"
 void vp8_short_walsh4x4_neon(
         int16_t *input,
@@ -128,4 +126,4 @@ void vp8_short_walsh4x4_neon(
     vst1q_s16(output + 8, q1s16);
     return;
 }
-#endif  // (__GNUC__ == 4 && (__GNUC_MINOR__ == 6))
+#endif  // VPX_INCOMPATIBLE_GCC
diff --git a/source/libvpx/vp8/encoder/denoising.c b/source/libvpx/vp8/encoder/denoising.c
index 12f9734..75b2a3b 100644
--- a/source/libvpx/vp8/encoder/denoising.c
+++ b/source/libvpx/vp8/encoder/denoising.c
@@ -390,9 +390,9 @@ void vp8_denoiser_set_parameters(VP8_DENOISER *denoiser, int mode) {
     denoiser->denoise_pars.scale_motion_thresh = 16;
     denoiser->denoise_pars.scale_increase_filter = 1;
     denoiser->denoise_pars.denoise_mv_bias = 60;
-    denoiser->denoise_pars.pickmode_mv_bias = 60;
-    denoiser->denoise_pars.qp_thresh = 100;
-    denoiser->denoise_pars.consec_zerolast = 10;
+    denoiser->denoise_pars.pickmode_mv_bias = 75;
+    denoiser->denoise_pars.qp_thresh = 85;
+    denoiser->denoise_pars.consec_zerolast = 15;
     denoiser->denoise_pars.spatial_blur = 20;
   }
 }
@@ -453,17 +453,17 @@ int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
     // Bitrate thresholds and noise metric (nmse) thresholds for switching to
     // aggressive mode.
     // TODO(marpan): Adjust thresholds, including effect on resolution.
-    denoiser->bitrate_threshold = 200000;  // (bits/sec).
+    denoiser->bitrate_threshold = 300000;  // (bits/sec).
     denoiser->threshold_aggressive_mode = 35;
-    if (width * height > 640 * 480) {
-      denoiser->bitrate_threshold = 500000;
-      denoiser->threshold_aggressive_mode = 100;
+    if (width * height > 1280 * 720) {
+      denoiser->bitrate_threshold = 2000000;
+      denoiser->threshold_aggressive_mode = 1400;
     } else if (width * height > 960 * 540) {
       denoiser->bitrate_threshold = 800000;
       denoiser->threshold_aggressive_mode = 150;
-    } else if (width * height > 1280 * 720) {
-      denoiser->bitrate_threshold = 2000000;
-      denoiser->threshold_aggressive_mode = 1400;
+    } else if (width * height > 640 * 480) {
+      denoiser->bitrate_threshold = 500000;
+      denoiser->threshold_aggressive_mode = 100;
     }
     return 0;
 }
diff --git a/source/libvpx/vp8/encoder/denoising.h b/source/libvpx/vp8/encoder/denoising.h
index fb7930b..6c1f9e2 100644
--- a/source/libvpx/vp8/encoder/denoising.h
+++ b/source/libvpx/vp8/encoder/denoising.h
@@ -27,6 +27,8 @@ extern "C" {
 #define SUM_DIFF_FROM_AVG_THRESH_UV (8 * 8 * 8)
 #define MOTION_MAGNITUDE_THRESHOLD_UV (8*3)
 
+#define MAX_GF_ARF_DENOISE_RANGE (16)
+
 enum vp8_denoiser_decision
 {
   COPY_BLOCK,
diff --git a/source/libvpx/vp8/encoder/mcomp.c b/source/libvpx/vp8/encoder/mcomp.c
index 54abe76..545f2c8 100644
--- a/source/libvpx/vp8/encoder/mcomp.c
+++ b/source/libvpx/vp8/encoder/mcomp.c
@@ -393,8 +393,8 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
 #endif
 
     /* central mv */
-    bestmv->as_mv.row <<= 3;
-    bestmv->as_mv.col <<= 3;
+    bestmv->as_mv.row *= 8;
+    bestmv->as_mv.col *= 8;
     startmv = *bestmv;
 
     /* calculate central point error */
diff --git a/source/libvpx/vp8/encoder/pickinter.c b/source/libvpx/vp8/encoder/pickinter.c
index 43f8957..9d5556d 100644
--- a/source/libvpx/vp8/encoder/pickinter.c
+++ b/source/libvpx/vp8/encoder/pickinter.c
@@ -516,9 +516,8 @@ static int evaluate_inter_mode(unsigned int* sse, int rate2, int* distortion2,
     // Adjust rd for ZEROMV and LAST, if LAST is the closest reference frame.
     if (this_mode == ZEROMV &&
         x->e_mbd.mode_info_context->mbmi.ref_frame == LAST_FRAME &&
-        (denoise_aggressive || cpi->closest_reference_frame == LAST_FRAME))
-    {
-        this_rd = ((int64_t)this_rd) * rd_adj / 100;
+        (denoise_aggressive || cpi->closest_reference_frame == LAST_FRAME)) {
+      this_rd = ((int64_t)this_rd) * rd_adj / 100;
     }
 
     check_for_encode_breakout(*sse, x);
@@ -1083,7 +1082,14 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
         {
 
             /* Store for later use by denoiser. */
-            if (this_mode == ZEROMV && sse < zero_mv_sse )
+            // Dont' denoise with GOLDEN OR ALTREF is they are old reference
+            // frames (greater than MAX_GF_ARF_DENOISE_RANGE frames in past).
+            int skip_old_reference = ((this_ref_frame != LAST_FRAME) &&
+                (cpi->common.current_video_frame -
+                 cpi->current_ref_frames[this_ref_frame] >
+                 MAX_GF_ARF_DENOISE_RANGE)) ? 1 : 0;
+            if (this_mode == ZEROMV && sse < zero_mv_sse &&
+                !skip_old_reference)
             {
                 zero_mv_sse = sse;
                 x->best_zeromv_reference_frame =
@@ -1092,7 +1098,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
 
             /* Store the best NEWMV in x for later use in the denoiser. */
             if (x->e_mbd.mode_info_context->mbmi.mode == NEWMV &&
-                    sse < best_sse)
+                sse < best_sse && !skip_old_reference)
             {
                 best_sse = sse;
                 x->best_sse_inter_mode = NEWMV;
diff --git a/source/libvpx/vp8/vp8_dx_iface.c b/source/libvpx/vp8/vp8_dx_iface.c
index 3ab8ed0..5aa274d 100644
--- a/source/libvpx/vp8/vp8_dx_iface.c
+++ b/source/libvpx/vp8/vp8_dx_iface.c
@@ -112,22 +112,19 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
      * structure. More memory may be required at the time the stream
      * information becomes known.
      */
-    if (!ctx->priv)
-    {
-        vp8_init_ctx(ctx);
-        priv = (vpx_codec_alg_priv_t *)ctx->priv;
-
-        /* initialize number of fragments to zero */
-        priv->fragments.count = 0;
-        /* is input fragments enabled? */
-        priv->fragments.enabled =
-            (priv->base.init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS);
-
-        /*post processing level initialized to do nothing */
-    }
-    else
-    {
-        priv = (vpx_codec_alg_priv_t *)ctx->priv;
+    if (!ctx->priv) {
+      vp8_init_ctx(ctx);
+      priv = (vpx_codec_alg_priv_t *)ctx->priv;
+
+      /* initialize number of fragments to zero */
+      priv->fragments.count = 0;
+      /* is input fragments enabled? */
+      priv->fragments.enabled =
+          (priv->base.init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS);
+
+      /*post processing level initialized to do nothing */
+    } else {
+      priv = (vpx_codec_alg_priv_t *)ctx->priv;
     }
 
     priv->yv12_frame_buffers.use_frame_threads =
@@ -138,11 +135,10 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
 
     if (priv->yv12_frame_buffers.use_frame_threads &&
         ((ctx->priv->init_flags & VPX_CODEC_USE_ERROR_CONCEALMENT) ||
-         (ctx->priv->init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS)))
-    {
-        /* row-based threading, error concealment, and input fragments will
-         * not be supported when using frame-based threading */
-        res = VPX_CODEC_INVALID_PARAM;
+         (ctx->priv->init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS))) {
+      /* row-based threading, error concealment, and input fragments will
+       * not be supported when using frame-based threading */
+      res = VPX_CODEC_INVALID_PARAM;
     }
 
     return res;