13 files changed, 325 insertions, 321 deletions
diff --git a/third_party/libaom/source/libaom/aom_dsp/arm/intrapred_neon.c b/third_party/libaom/source/libaom/aom_dsp/arm/intrapred_neon.c
index 6d41708ee0..945e7e48ee 100644
--- a/third_party/libaom/source/libaom/aom_dsp/arm/intrapred_neon.c
+++ b/third_party/libaom/source/libaom/aom_dsp/arm/intrapred_neon.c
@@ -11,8 +11,6 @@
 
 #include <arm_neon.h>
 
-#include "common/tools_common.h"
-
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
 
diff --git a/third_party/libaom/source/libaom/aom_dsp/butteraugli.c b/third_party/libaom/source/libaom/aom_dsp/butteraugli.c
index 7ce2324c06..038efcd313 100644
--- a/third_party/libaom/source/libaom/aom_dsp/butteraugli.c
+++ b/third_party/libaom/source/libaom/aom_dsp/butteraugli.c
@@ -18,37 +18,71 @@
 
 int aom_calc_butteraugli(const YV12_BUFFER_CONFIG *source,
                          const YV12_BUFFER_CONFIG *distorted, int bit_depth,
-                         float *dist_map) {
+                         aom_matrix_coefficients_t matrix_coefficients,
+                         aom_color_range_t color_range, float *dist_map) {
   (void)bit_depth;
   assert(bit_depth == 8);
-  assert(source->y_width == source->uv_width * 2);
   const int width = source->y_crop_width;
   const int height = source->y_crop_height;
+  const int ss_x = source->subsampling_x;
+  const int ss_y = source->subsampling_y;
 
-  size_t buffer_size = width * height * 3;
-  uint8_t *src_rgb = (uint8_t *)aom_malloc(buffer_size);
-  uint8_t *distorted_rgb = (uint8_t *)aom_malloc(buffer_size);
-  if (!src_rgb || !distorted_rgb) {
-    aom_free(src_rgb);
-    aom_free(distorted_rgb);
+  const struct YuvConstants *yuv_constants;
+  if (matrix_coefficients == AOM_CICP_MC_BT_709) {
+    if (color_range == AOM_CR_FULL_RANGE) return 0;
+    yuv_constants = &kYuvH709Constants;
+  } else {
+    yuv_constants = color_range == AOM_CR_FULL_RANGE ? &kYuvJPEGConstants
+                                                     : &kYuvI601Constants;
+  }
+
+  const size_t stride_argb = width * 4;
+  const size_t buffer_size = height * stride_argb;
+  uint8_t *src_argb = (uint8_t *)aom_malloc(buffer_size);
+  uint8_t *distorted_argb = (uint8_t *)aom_malloc(buffer_size);
+  if (!src_argb || !distorted_argb) {
+    aom_free(src_argb);
+    aom_free(distorted_argb);
     return 0;
   }
 
-  I420ToRGB24Matrix(source->y_buffer, source->y_stride, source->u_buffer,
-                    source->uv_stride, source->v_buffer, source->uv_stride,
-                    src_rgb, width * 3, &kYuvH709Constants, width, height);
-  I420ToRGB24Matrix(distorted->y_buffer, distorted->y_stride,
-                    distorted->u_buffer, distorted->uv_stride,
-                    distorted->v_buffer, distorted->uv_stride, distorted_rgb,
-                    width * 3, &kYuvH709Constants, width, height);
+  if (ss_x == 1 && ss_y == 1) {
+    I420ToARGBMatrix(source->y_buffer, source->y_stride, source->u_buffer,
+                     source->uv_stride, source->v_buffer, source->uv_stride,
+                     src_argb, stride_argb, yuv_constants, width, height);
+    I420ToARGBMatrix(distorted->y_buffer, distorted->y_stride,
+                     distorted->u_buffer, distorted->uv_stride,
+                     distorted->v_buffer, distorted->uv_stride, distorted_argb,
+                     stride_argb, yuv_constants, width, height);
+  } else if (ss_x == 1 && ss_y == 0) {
+    I422ToARGBMatrix(source->y_buffer, source->y_stride, source->u_buffer,
+                     source->uv_stride, source->v_buffer, source->uv_stride,
+                     src_argb, stride_argb, yuv_constants, width, height);
+    I422ToARGBMatrix(distorted->y_buffer, distorted->y_stride,
+                     distorted->u_buffer, distorted->uv_stride,
+                     distorted->v_buffer, distorted->uv_stride, distorted_argb,
+                     stride_argb, yuv_constants, width, height);
+  } else if (ss_x == 0 && ss_y == 0) {
+    I444ToARGBMatrix(source->y_buffer, source->y_stride, source->u_buffer,
+                     source->uv_stride, source->v_buffer, source->uv_stride,
+                     src_argb, stride_argb, yuv_constants, width, height);
+    I444ToARGBMatrix(distorted->y_buffer, distorted->y_stride,
+                     distorted->u_buffer, distorted->uv_stride,
+                     distorted->v_buffer, distorted->uv_stride, distorted_argb,
+                     stride_argb, yuv_constants, width, height);
+  } else {
+    aom_free(src_argb);
+    aom_free(distorted_argb);
+    return 0;
+  }
 
-  JxlPixelFormat pixel_format = { 3, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0 };
+  JxlPixelFormat pixel_format = { 4, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0 };
   JxlButteraugliApi *api = JxlButteraugliApiCreate(NULL);
   JxlButteraugliApiSetHFAsymmetry(api, 0.8f);
 
   JxlButteraugliResult *result = JxlButteraugliCompute(
-      api, width, height, &pixel_format, src_rgb, buffer_size, &pixel_format,
-      distorted_rgb, buffer_size);
+      api, width, height, &pixel_format, src_argb, buffer_size, &pixel_format,
+      distorted_argb, buffer_size);
 
   const float *distmap = NULL;
   uint32_t row_stride;
@@ -56,8 +90,8 @@ int aom_calc_butteraugli(const YV12_BUFFER_CONFIG *source,
   if (distmap == NULL) {
     JxlButteraugliApiDestroy(api);
     JxlButteraugliResultDestroy(result);
-    aom_free(src_rgb);
-    aom_free(distorted_rgb);
+    aom_free(src_argb);
+    aom_free(distorted_argb);
     return 0;
   }
 
@@ -69,7 +103,7 @@ int aom_calc_butteraugli(const YV12_BUFFER_CONFIG *source,
 
   JxlButteraugliApiDestroy(api);
   JxlButteraugliResultDestroy(result);
-  aom_free(src_rgb);
-  aom_free(distorted_rgb);
+  aom_free(src_argb);
+  aom_free(distorted_argb);
   return 1;
 }
diff --git a/third_party/libaom/source/libaom/aom_dsp/butteraugli.h b/third_party/libaom/source/libaom/aom_dsp/butteraugli.h
index 06402aa3e4..5304092ccb 100644
--- a/third_party/libaom/source/libaom/aom_dsp/butteraugli.h
+++ b/third_party/libaom/source/libaom/aom_dsp/butteraugli.h
@@ -14,8 +14,10 @@
 
 #include "aom_scale/yv12config.h"
 
+// Returns a boolean that indicates success/failure.
 int aom_calc_butteraugli(const YV12_BUFFER_CONFIG *source,
                          const YV12_BUFFER_CONFIG *distorted, int bit_depth,
-                         float *dist_map);
+                         aom_matrix_coefficients_t matrix_coefficients,
+                         aom_color_range_t color_range, float *dist_map);
 
 #endif  // AOM_AOM_DSP_BUTTERAUGLI_H_
diff --git a/third_party/libaom/source/libaom/aom_dsp/fastssim.c b/third_party/libaom/source/libaom/aom_dsp/fastssim.c
index 3804519b31..89712c5f40 100644
--- a/third_party/libaom/source/libaom/aom_dsp/fastssim.c
+++ b/third_party/libaom/source/libaom/aom_dsp/fastssim.c
@@ -31,6 +31,7 @@ typedef struct fs_ctx fs_ctx;
 #define SSIM_C1_12 (4095 * 4095 * 0.01 * 0.01)
 #define SSIM_C2_10 (1023 * 1023 * 0.03 * 0.03)
 #define SSIM_C2_12 (4095 * 4095 * 0.03 * 0.03)
+#define MAX_SSIM_DB 100.0
 
 #define FS_MINI(_a, _b) ((_a) < (_b) ? (_a) : (_b))
 #define FS_MAXI(_a, _b) ((_a) > (_b) ? (_a) : (_b))
diff --git a/third_party/libaom/source/libaom/aom_dsp/grain_table.c b/third_party/libaom/source/libaom/aom_dsp/grain_table.c
index e03f04d5da..b22752abd9 100644
--- a/third_party/libaom/source/libaom/aom_dsp/grain_table.c
+++ b/third_party/libaom/source/libaom/aom_dsp/grain_table.c
@@ -202,7 +202,7 @@ int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp,
                                 int64_t end_time, int erase,
                                 aom_film_grain_t *grain) {
   aom_film_grain_table_entry_t *entry = t->head;
-  aom_film_grain_table_entry_t *prev_entry = 0;
+  aom_film_grain_table_entry_t *prev_entry = NULL;
   uint16_t random_seed = grain ? grain->random_seed : 0;
   if (grain) memset(grain, 0, sizeof(*grain));
 
@@ -241,10 +241,10 @@ int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp,
         entry->end_time = time_stamp;
         if (t->tail == entry) t->tail = new_entry;
       }
-      // If segments aren't aligned, delete from the beggining of subsequent
+      // If segments aren't aligned, delete from the beginning of subsequent
       // segments
       if (end_time > entry_end_time) {
-        aom_film_grain_table_lookup(t, entry->end_time, end_time, 1, 0);
+        aom_film_grain_table_lookup(t, entry_end_time, end_time, 1, 0);
       }
       return 1;
     }
@@ -275,12 +275,12 @@ aom_codec_err_t aom_film_grain_table_read(
     return error_info->error_code;
   }
 
-  aom_film_grain_table_entry_t *prev_entry = 0;
+  aom_film_grain_table_entry_t *prev_entry = NULL;
   while (!feof(file)) {
     aom_film_grain_table_entry_t *entry = aom_malloc(sizeof(*entry));
     memset(entry, 0, sizeof(*entry));
     grain_table_entry_read(file, error_info, entry);
-    entry->next = 0;
+    entry->next = NULL;
 
     if (prev_entry) prev_entry->next = entry;
     if (!t->head) t->head = entry;
diff --git a/third_party/libaom/source/libaom/aom_dsp/noise_model.c b/third_party/libaom/source/libaom/aom_dsp/noise_model.c
index f56fdd5860..19c660e911 100644
--- a/third_party/libaom/source/libaom/aom_dsp/noise_model.c
+++ b/third_party/libaom/source/libaom/aom_dsp/noise_model.c
@@ -214,6 +214,7 @@ static void set_chroma_coefficient_fallback_soln(aom_equation_system_t *eqns) {
 
 int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points) {
   if (!lut) return 0;
+  if (num_points <= 0) return 0;
   lut->num_points = 0;
   lut->points = (double(*)[2])aom_malloc(num_points * sizeof(*lut->points));
   if (!lut->points) return 0;
@@ -1152,12 +1153,24 @@ int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model,
 
   // Convert the scaling functions to 8 bit values
   aom_noise_strength_lut_t scaling_points[3];
-  aom_noise_strength_solver_fit_piecewise(
-      &noise_model->combined_state[0].strength_solver, 14, scaling_points + 0);
-  aom_noise_strength_solver_fit_piecewise(
-      &noise_model->combined_state[1].strength_solver, 10, scaling_points + 1);
-  aom_noise_strength_solver_fit_piecewise(
-      &noise_model->combined_state[2].strength_solver, 10, scaling_points + 2);
+  if (!aom_noise_strength_solver_fit_piecewise(
+          &noise_model->combined_state[0].strength_solver, 14,
+          scaling_points + 0)) {
+    return 0;
+  }
+  if (!aom_noise_strength_solver_fit_piecewise(
+          &noise_model->combined_state[1].strength_solver, 10,
+          scaling_points + 1)) {
+    aom_noise_strength_lut_free(scaling_points + 0);
+    return 0;
+  }
+  if (!aom_noise_strength_solver_fit_piecewise(
+          &noise_model->combined_state[2].strength_solver, 10,
+          scaling_points + 2)) {
+    aom_noise_strength_lut_free(scaling_points + 0);
+    aom_noise_strength_lut_free(scaling_points + 1);
+    return 0;
+  }
 
   // Both the domain and the range of the scaling functions in the film_grain
   // are normalized to 8-bit (e.g., they are implicitly scaled during grain
diff --git a/third_party/libaom/source/libaom/aom_dsp/psnrhvs.c b/third_party/libaom/source/libaom/aom_dsp/psnrhvs.c
index 69a1d99bf2..25f075aa2f 100644
--- a/third_party/libaom/source/libaom/aom_dsp/psnrhvs.c
+++ b/third_party/libaom/source/libaom/aom_dsp/psnrhvs.c
@@ -34,6 +34,7 @@ static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
       *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3;
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
                                int xstride) {
   int i, j;
@@ -43,6 +44,7 @@ static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
     for (j = 0; j < 8; j++)
       *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3;
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 /* Normalized inverse quantization matrix for 8x8 DCT at the point of
  * transparency. This is not the JPEG based matrix from the paper,
@@ -210,6 +212,7 @@ static double calc_psnrhvs(const unsigned char *src, int _systride,
         }
       }
       s_gvar = 1.f / (36 - n + 1) * s_gmean / 36.f;
+#if CONFIG_AV1_HIGHBITDEPTH
       if (!buf_is_hbd) {
         od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
         od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
@@ -217,6 +220,10 @@ static double calc_psnrhvs(const unsigned char *src, int _systride,
         hbd_od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
         hbd_od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
       }
+#else
+      od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
+      od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
+#endif  // CONFIG_AV1_HIGHBITDEPTH
       for (i = 0; i < 8; i++)
         for (j = (i == 0); j < 8; j++)
           s_mask += dct_s_coef[i * 8 + j] * dct_s_coef[i * 8 + j] * mask[i][j];
diff --git a/third_party/libaom/source/libaom/aom_dsp/ssim.c b/third_party/libaom/source/libaom/aom_dsp/ssim.c
index 357da99ae4..c5334fd2c5 100644
--- a/third_party/libaom/source/libaom/aom_dsp/ssim.c
+++ b/third_party/libaom/source/libaom/aom_dsp/ssim.c
@@ -18,6 +18,7 @@
 #include "aom_ports/mem.h"
 #include "aom_ports/system_state.h"
 
+#if CONFIG_INTERNAL_STATS
 void aom_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
                             uint32_t *sum_s, uint32_t *sum_r,
                             uint32_t *sum_sq_s, uint32_t *sum_sq_r,
@@ -33,6 +34,7 @@ void aom_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
     }
   }
 }
+#endif  // CONFIG_INTERNAL_STATS
 
 void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
                           uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s,
@@ -49,24 +51,6 @@ void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
   }
 }
 
-#if CONFIG_AV1_HIGHBITDEPTH
-void aom_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r,
-                                 int rp, uint32_t *sum_s, uint32_t *sum_r,
-                                 uint32_t *sum_sq_s, uint32_t *sum_sq_r,
-                                 uint32_t *sum_sxr) {
-  int i, j;
-  for (i = 0; i < 8; i++, s += sp, r += rp) {
-    for (j = 0; j < 8; j++) {
-      *sum_s += s[j];
-      *sum_r += r[j];
-      *sum_sq_s += s[j] * s[j];
-      *sum_sq_r += r[j] * r[j];
-      *sum_sxr += s[j] * r[j];
-    }
-  }
-}
-#endif
-
 static const int64_t cc1 = 26634;        // (64^2*(.01*255)^2
 static const int64_t cc2 = 239708;       // (64^2*(.03*255)^2
 static const int64_t cc1_10 = 428658;    // (64^2*(.01*1023)^2
@@ -78,7 +62,7 @@ static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s,
                          uint32_t sum_sq_r, uint32_t sum_sxr, int count,
                          uint32_t bd) {
   double ssim_n, ssim_d;
-  int64_t c1, c2;
+  int64_t c1 = 0, c2 = 0;
   if (bd == 8) {
     // scale the constants by number of pixels
     c1 = (cc1 * count * count) >> 12;
@@ -90,8 +74,9 @@ static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s,
     c1 = (cc1_12 * count * count) >> 12;
     c2 = (cc2_12 * count * count) >> 12;
   } else {
-    c1 = c2 = 0;
     assert(0);
+    // Return similarity as zero for unsupported bit-depth values.
+    return 0;
   }
 
   ssim_n = (2.0 * sum_s * sum_r + c1) *
@@ -111,21 +96,11 @@ static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) {
   return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8);
 }
 
-static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r,
-                              int rp, uint32_t bd, uint32_t shift) {
-  uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
-  aom_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
-                            &sum_sxr);
-  return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift),
-                    sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd);
-}
-
 // We are using a 8x8 moving window with starting location of each 8x8 window
 // on the 4x4 pixel grid. Such arrangement allows the windows to overlap
 // block boundaries to penalize blocking artifacts.
-static double aom_ssim2(const uint8_t *img1, const uint8_t *img2,
-                        int stride_img1, int stride_img2, int width,
-                        int height) {
+double aom_ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1,
+                 int stride_img2, int width, int height) {
   int i, j;
   int samples = 0;
   double ssim_total = 0;
@@ -143,31 +118,10 @@ static double aom_ssim2(const uint8_t *img1, const uint8_t *img2,
   return ssim_total;
 }
 
-static double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
-                               int stride_img1, int stride_img2, int width,
-                               int height, uint32_t bd, uint32_t shift) {
-  int i, j;
-  int samples = 0;
-  double ssim_total = 0;
-
-  // sample point start with each 4x4 location
-  for (i = 0; i <= height - 8;
-       i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
-    for (j = 0; j <= width - 8; j += 4) {
-      double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1,
-                                 CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd,
-                                 shift);
-      ssim_total += v;
-      samples++;
-    }
-  }
-  ssim_total /= samples;
-  return ssim_total;
-}
-
-void aom_calc_ssim(const YV12_BUFFER_CONFIG *source,
-                   const YV12_BUFFER_CONFIG *dest, double *weight,
-                   double *fast_ssim) {
+#if CONFIG_INTERNAL_STATS
+void aom_lowbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+                         const YV12_BUFFER_CONFIG *dest, double *weight,
+                         double *fast_ssim) {
   double abc[3];
   for (int i = 0; i < 3; ++i) {
     const int is_uv = i > 0;
@@ -421,7 +375,57 @@ double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
   m->dssim = dssim_total;
   return inconsistency_total;
 }
+#endif  // CONFIG_INTERNAL_STATS
 
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r,
+                                 int rp, uint32_t *sum_s, uint32_t *sum_r,
+                                 uint32_t *sum_sq_s, uint32_t *sum_sq_r,
+                                 uint32_t *sum_sxr) {
+  int i, j;  // Adds the 8x8 block statistics to the caller-provided sums.
+  for (i = 0; i < 8; i++, s += sp, r += rp) {  // sp/rp: strides in samples
+    for (j = 0; j < 8; j++) {
+      *sum_s += s[j];
+      *sum_r += r[j];
+      *sum_sq_s += (uint32_t)s[j] * s[j];  // unsigned: no int overflow (UB)
+      *sum_sq_r += (uint32_t)r[j] * r[j];
+      *sum_sxr += (uint32_t)s[j] * r[j];
+    }
+  }
+}
+
+static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r,
+                              int rp, uint32_t bd, uint32_t shift) {
+  uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+  aom_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+                            &sum_sxr);  // gather the 8x8 window sums
+  return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift),
+                    sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd);
+}
+
+double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
+                        int stride_img1, int stride_img2, int width, int height,
+                        uint32_t bd, uint32_t shift) {
+  int i, j;
+  int samples = 0;  // number of 8x8 windows evaluated
+  double ssim_total = 0;
+
+  // Sample points start at each 4x4 grid location.
+  for (i = 0; i <= height - 8;
+       i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
+    for (j = 0; j <= width - 8; j += 4) {
+      double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1,
+                                 CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd,
+                                 shift);
+      ssim_total += v;
+      samples++;
+    }
+  }
+  ssim_total /= samples;  // samples is 0 when width < 8 or height < 8
+  return ssim_total;  // mean of the per-window values
+}
+
+#if CONFIG_INTERNAL_STATS
 void aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
                           const YV12_BUFFER_CONFIG *dest, double *weight,
                           uint32_t bd, uint32_t in_bd, double *fast_ssim) {
@@ -455,3 +459,25 @@ void aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
     fast_ssim[1] = abc[0] * .8 + .1 * (abc[1] + abc[2]);
   }
 }
+#endif  // CONFIG_INTERNAL_STATS
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+#if CONFIG_INTERNAL_STATS
+void aom_calc_ssim(const YV12_BUFFER_CONFIG *orig,
+                   const YV12_BUFFER_CONFIG *recon, const uint32_t bit_depth,
+                   const uint32_t in_bit_depth, int is_hbd, double *weight,
+                   double *frame_ssim2) {
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (is_hbd) {  // nonzero is_hbd: dispatch to the highbd routine
+    aom_highbd_calc_ssim(orig, recon, weight, bit_depth, in_bit_depth,
+                         frame_ssim2);
+    return;
+  }
+#else
+  (void)bit_depth;  // only read by the highbd branch above
+  (void)in_bit_depth;
+  (void)is_hbd;
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+  aom_lowbd_calc_ssim(orig, recon, weight, frame_ssim2);  // lowbd path
+}
+#endif  // CONFIG_INTERNAL_STATS
diff --git a/third_party/libaom/source/libaom/aom_dsp/ssim.h b/third_party/libaom/source/libaom/aom_dsp/ssim.h
index d635ef5bbe..fb92556a8c 100644
--- a/third_party/libaom/source/libaom/aom_dsp/ssim.h
+++ b/third_party/libaom/source/libaom/aom_dsp/ssim.h
@@ -12,14 +12,13 @@
 #ifndef AOM_AOM_DSP_SSIM_H_
 #define AOM_AOM_DSP_SSIM_H_
 
-#define MAX_SSIM_DB 100.0;
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 #include "config/aom_config.h"
 
+#if CONFIG_INTERNAL_STATS
 #include "aom_scale/yv12config.h"
 
 // metrics used for calculating ssim, ssim2, dssim, and ssimc
@@ -68,18 +67,35 @@ double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
                             int img2_pitch, int width, int height, Ssimv *sv2,
                             Metrics *m, int do_inconsistency);
 
-void aom_calc_ssim(const YV12_BUFFER_CONFIG *source,
-                   const YV12_BUFFER_CONFIG *dest, double *weight,
-                   double *fast_ssim);
+void aom_lowbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+                         const YV12_BUFFER_CONFIG *dest, double *weight,
+                         double *fast_ssim);
 
 double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source,
                          const YV12_BUFFER_CONFIG *dest, double *ssim_y,
                          double *ssim_u, double *ssim_v, uint32_t bd,
                          uint32_t in_bd);
 
+#if CONFIG_AV1_HIGHBITDEPTH
 void aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
                           const YV12_BUFFER_CONFIG *dest, double *weight,
                           uint32_t bd, uint32_t in_bd, double *fast_ssim);
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+void aom_calc_ssim(const YV12_BUFFER_CONFIG *orig,
+                   const YV12_BUFFER_CONFIG *recon, const uint32_t bit_depth,
+                   const uint32_t in_bit_depth, int is_hbd, double *weight,
+                   double *frame_ssim2);
+#endif  // CONFIG_INTERNAL_STATS
+
+double aom_ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1,
+                 int stride_img2, int width, int height);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
+                        int stride_img1, int stride_img2, int width, int height,
+                        uint32_t bd, uint32_t shift);
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/third_party/libaom/source/libaom/aom_dsp/vmaf.c b/third_party/libaom/source/libaom/aom_dsp/vmaf.c
index 41653430c1..219e278303 100644
--- a/third_party/libaom/source/libaom/aom_dsp/vmaf.c
+++ b/third_party/libaom/source/libaom/aom_dsp/vmaf.c
@@ -12,9 +12,6 @@
 #include "aom_dsp/vmaf.h"
 
 #include <assert.h>
-#if !CONFIG_USE_VMAF_RC
-#include <libvmaf.h>
-#endif
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -24,10 +21,7 @@
 #include <unistd.h>
 #endif
 
-#if CONFIG_USE_VMAF_RC
-#include <libvmaf/libvmaf.rc.h>
-#endif
-
+#include <libvmaf/libvmaf.h>
 #include "aom_dsp/blend.h"
 #include "aom_ports/system_state.h"
 
@@ -36,162 +30,18 @@ static void vmaf_fatal_error(const char *message) {
   exit(EXIT_FAILURE);
 }
 
-#if !CONFIG_USE_VMAF_RC
-typedef struct FrameData {
-  const YV12_BUFFER_CONFIG *source;
-  const YV12_BUFFER_CONFIG *distorted;
-  int frame_set;
-  int bit_depth;
-} FrameData;
-
-// A callback function used to pass data to VMAF.
-// Returns 0 after reading a frame.
-// Returns 2 when there is no more frame to read.
-static int read_frame(float *ref_data, float *main_data, float *temp_data,
-                      int stride, void *user_data) {
-  FrameData *frames = (FrameData *)user_data;
-
-  if (!frames->frame_set) {
-    const int width = frames->source->y_width;
-    const int height = frames->source->y_height;
-    assert(width == frames->distorted->y_width);
-    assert(height == frames->distorted->y_height);
-
-    if (frames->source->flags & YV12_FLAG_HIGHBITDEPTH) {
-      const float scale_factor = 1.0f / (float)(1 << (frames->bit_depth - 8));
-      uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(frames->source->y_buffer);
-      uint16_t *main_ptr = CONVERT_TO_SHORTPTR(frames->distorted->y_buffer);
-
-      for (int row = 0; row < height; ++row) {
-        for (int col = 0; col < width; ++col) {
-          ref_data[col] = scale_factor * (float)ref_ptr[col];
-        }
-        ref_ptr += frames->source->y_stride;
-        ref_data += stride / sizeof(*ref_data);
-      }
-
-      for (int row = 0; row < height; ++row) {
-        for (int col = 0; col < width; ++col) {
-          main_data[col] = scale_factor * (float)main_ptr[col];
-        }
-        main_ptr += frames->distorted->y_stride;
-        main_data += stride / sizeof(*main_data);
-      }
-    } else {
-      uint8_t *ref_ptr = frames->source->y_buffer;
-      uint8_t *main_ptr = frames->distorted->y_buffer;
-
-      for (int row = 0; row < height; ++row) {
-        for (int col = 0; col < width; ++col) {
-          ref_data[col] = (float)ref_ptr[col];
-        }
-        ref_ptr += frames->source->y_stride;
-        ref_data += stride / sizeof(*ref_data);
-      }
-
-      for (int row = 0; row < height; ++row) {
-        for (int col = 0; col < width; ++col) {
-          main_data[col] = (float)main_ptr[col];
-        }
-        main_ptr += frames->distorted->y_stride;
-        main_data += stride / sizeof(*main_data);
-      }
-    }
-    frames->frame_set = 1;
-    return 0;
-  }
-
-  (void)temp_data;
-  return 2;
-}
-
-void aom_calc_vmaf(const char *model_path, const YV12_BUFFER_CONFIG *source,
-                   const YV12_BUFFER_CONFIG *distorted, const int bit_depth,
-                   double *const vmaf) {
-  aom_clear_system_state();
-  const int width = source->y_width;
-  const int height = source->y_height;
-  FrameData frames = { source, distorted, 0, bit_depth };
-  char *fmt = bit_depth == 10 ? "yuv420p10le" : "yuv420p";
-  double vmaf_score;
-  const int ret =
-      compute_vmaf(&vmaf_score, fmt, width, height, read_frame,
-                   /*user_data=*/&frames, (char *)model_path,
-                   /*log_path=*/NULL, /*log_fmt=*/NULL, /*disable_clip=*/1,
-                   /*disable_avx=*/0, /*enable_transform=*/0,
-                   /*phone_model=*/0, /*do_psnr=*/0, /*do_ssim=*/0,
-                   /*do_ms_ssim=*/0, /*pool_method=*/NULL, /*n_thread=*/0,
-                   /*n_subsample=*/1, /*enable_conf_interval=*/0);
-  if (ret) vmaf_fatal_error("Failed to compute VMAF scores.");
-
-  aom_clear_system_state();
-  *vmaf = vmaf_score;
-}
-
-void aom_calc_vmaf_multi_frame(void *user_data, const char *model_path,
-                               int (*rd_frm)(float *ref_data, float *main_data,
-                                             float *temp_data, int stride_byte,
-                                             void *user_data),
-                               int frame_width, int frame_height, int bit_depth,
-                               double *vmaf) {
-  aom_clear_system_state();
-
-  char *fmt = bit_depth == 10 ? "yuv420p10le" : "yuv420p";
-  int log_path_length = snprintf(NULL, 0, "vmaf_scores_%d.xml", getpid()) + 1;
-  char *log_path = malloc(log_path_length);
-  snprintf(log_path, log_path_length, "vmaf_scores_%d.xml", getpid());
-  double vmaf_score;
-  const int ret =
-      compute_vmaf(&vmaf_score, fmt, frame_width, frame_height, rd_frm,
-                   /*user_data=*/user_data, (char *)model_path,
-                   /*log_path=*/log_path, /*log_fmt=*/NULL, /*disable_clip=*/0,
-                   /*disable_avx=*/0, /*enable_transform=*/0,
-                   /*phone_model=*/0, /*do_psnr=*/0, /*do_ssim=*/0,
-                   /*do_ms_ssim=*/0, /*pool_method=*/NULL, /*n_thread=*/0,
-                   /*n_subsample=*/1, /*enable_conf_interval=*/0);
-  FILE *vmaf_log = fopen(log_path, "r");
-  free(log_path);
-  log_path = NULL;
-  if (vmaf_log == NULL || ret) {
-    vmaf_fatal_error("Failed to compute VMAF scores.");
-  }
-
-  int frame_index = 0;
-  char buf[512];
-  while (fgets(buf, 511, vmaf_log) != NULL) {
-    if (memcmp(buf, "\t\t<frame ", 9) == 0) {
-      char *p = strstr(buf, "vmaf=");
-      if (p != NULL && p[5] == '"') {
-        char *p2 = strstr(&p[6], "\"");
-        *p2 = '\0';
-        const double score = atof(&p[6]);
-        if (score < 0.0 || score > 100.0) {
-          vmaf_fatal_error("Failed to compute VMAF scores.");
-        }
-        vmaf[frame_index++] = score;
-      }
-    }
-  }
-  fclose(vmaf_log);
-
-  aom_clear_system_state();
-}
-#endif
-
-#if CONFIG_USE_VMAF_RC
-void aom_init_vmaf_model_rc(VmafModel **vmaf_model, const char *model_path) {
+void aom_init_vmaf_model(VmafModel **vmaf_model, const char *model_path) {
   if (*vmaf_model != NULL) return;
   VmafModelConfig model_cfg;
   model_cfg.flags = VMAF_MODEL_FLAG_DISABLE_CLIP;
   model_cfg.name = "vmaf";
-  model_cfg.path = (char *)model_path;
 
-  if (vmaf_model_load_from_path(vmaf_model, &model_cfg)) {
+  if (vmaf_model_load_from_path(vmaf_model, &model_cfg, model_path)) {
     vmaf_fatal_error("Failed to load VMAF model.");
   }
 }
 
-void aom_close_vmaf_model_rc(VmafModel *vmaf_model) {
+void aom_close_vmaf_model(VmafModel *vmaf_model) {
   vmaf_model_destroy(vmaf_model);
 }
 
@@ -221,8 +71,9 @@ static void copy_picture(const int bit_depth, const YV12_BUFFER_CONFIG *src,
   }
 }
 
-void aom_init_vmaf_context_rc(VmafContext **vmaf_context, VmafModel *vmaf_model,
-                              bool cal_vmaf_neg) {
+void aom_init_vmaf_context(VmafContext **vmaf_context, VmafModel *vmaf_model,
+                           bool cal_vmaf_neg) {
+  // TODO(sdeng): make them CLI arguments.
   VmafConfiguration cfg;
   cfg.log_level = VMAF_LOG_LEVEL_NONE;
   cfg.n_threads = 0;
@@ -233,41 +84,53 @@ void aom_init_vmaf_context_rc(VmafContext **vmaf_context, VmafModel *vmaf_model,
     vmaf_fatal_error("Failed to init VMAF context.");
   }
 
-  if (vmaf_use_features_from_model(*vmaf_context, vmaf_model)) {
-    vmaf_fatal_error("Failed to load feature extractors from VMAF model.");
-  }
-
   if (cal_vmaf_neg) {
     VmafFeatureDictionary *vif_feature = NULL;
-    vmaf_feature_dictionary_set(&vif_feature, "vif_enhn_gain_limit", "1.0");
-    if (vmaf_use_feature(*vmaf_context, "float_vif", vif_feature)) {
+    if (vmaf_feature_dictionary_set(&vif_feature, "vif_enhn_gain_limit",
+                                    "1.0")) {
+      vmaf_fatal_error("Failed to set vif_enhn_gain_limit.");
+    }
+    if (vmaf_model_feature_overload(vmaf_model, "float_vif", vif_feature)) {
       vmaf_fatal_error("Failed to use feature float_vif.");
     }
 
     VmafFeatureDictionary *adm_feature = NULL;
-    vmaf_feature_dictionary_set(&adm_feature, "adm_enhn_gain_limit", "1.0");
-    if (vmaf_use_feature(*vmaf_context, "float_adm", adm_feature)) {
+    if (vmaf_feature_dictionary_set(&adm_feature, "adm_enhn_gain_limit",
+                                    "1.0")) {
+      vmaf_fatal_error("Failed to set adm_enhn_gain_limit.");
+    }
+    if (vmaf_model_feature_overload(vmaf_model, "adm", adm_feature)) {
       vmaf_fatal_error("Failed to use feature float_adm.");
     }
   }
 
   VmafFeatureDictionary *motion_force_zero = NULL;
-  vmaf_feature_dictionary_set(&motion_force_zero, "motion_force_zero", "true");
-  if (vmaf_use_feature(*vmaf_context, "float_motion", motion_force_zero)) {
+  if (vmaf_feature_dictionary_set(&motion_force_zero, "motion_force_zero",
+                                  "1")) {
+    vmaf_fatal_error("Failed to set motion_force_zero.");
+  }
+  if (vmaf_model_feature_overload(vmaf_model, "float_motion",
+                                  motion_force_zero)) {
     vmaf_fatal_error("Failed to use feature float_motion.");
   }
+
+  if (vmaf_use_features_from_model(*vmaf_context, vmaf_model)) {
+    vmaf_fatal_error("Failed to load feature extractors from VMAF model.");
+  }
 }
 
-void aom_close_vmaf_context_rc(VmafContext *vmaf_context) {
+void aom_close_vmaf_context(VmafContext *vmaf_context) {
   if (vmaf_close(vmaf_context)) {
     vmaf_fatal_error("Failed to close VMAF context.");
   }
 }
 
-void aom_calc_vmaf_at_index_rc(VmafContext *vmaf_context, VmafModel *vmaf_model,
-                               const YV12_BUFFER_CONFIG *source,
-                               const YV12_BUFFER_CONFIG *distorted,
-                               int bit_depth, int frame_index, double *vmaf) {
+void aom_calc_vmaf(VmafModel *vmaf_model, const YV12_BUFFER_CONFIG *source,
+                   const YV12_BUFFER_CONFIG *distorted, int bit_depth,
+                   bool cal_vmaf_neg, double *vmaf) {
+  VmafContext *vmaf_context;
+  aom_init_vmaf_context(&vmaf_context, vmaf_model, cal_vmaf_neg);
+  const int frame_index = 0;
   VmafPicture ref, dist;
   if (vmaf_picture_alloc(&ref, VMAF_PIX_FMT_YUV420P, bit_depth, source->y_width,
                          source->y_height) ||
@@ -282,10 +145,50 @@ void aom_calc_vmaf_at_index_rc(VmafContext *vmaf_context, VmafModel *vmaf_model,
     vmaf_fatal_error("Failed to read VMAF pictures.");
   }
 
+  if (vmaf_read_pictures(vmaf_context, NULL, NULL, 0)) {
+    vmaf_fatal_error("Failed to flush context.");
+  }
+
   vmaf_picture_unref(&ref);
   vmaf_picture_unref(&dist);
 
   vmaf_score_at_index(vmaf_context, vmaf_model, vmaf, frame_index);
+  aom_close_vmaf_context(vmaf_context);
 }
 
-#endif  // CONFIG_USE_VMAF_RC
+void aom_read_vmaf_image(VmafContext *vmaf_context,
+                         const YV12_BUFFER_CONFIG *source,
+                         const YV12_BUFFER_CONFIG *distorted, int bit_depth,
+                         int frame_index) {
+  VmafPicture ref, dist;
+  if (vmaf_picture_alloc(&ref, VMAF_PIX_FMT_YUV420P, bit_depth, source->y_width,
+                         source->y_height) ||
+      vmaf_picture_alloc(&dist, VMAF_PIX_FMT_YUV420P, bit_depth,
+                         source->y_width, source->y_height)) {
+    vmaf_fatal_error("Failed to alloc VMAF pictures.");
+  }
+  copy_picture(bit_depth, source, &ref);
+  copy_picture(bit_depth, distorted, &dist);
+  if (vmaf_read_pictures(vmaf_context, &ref, &dist,
+                         /*picture index=*/frame_index)) {
+    vmaf_fatal_error("Failed to read VMAF pictures.");
+  }
+
+  vmaf_picture_unref(&ref);
+  vmaf_picture_unref(&dist);
+}
+
+double aom_calc_vmaf_at_index(VmafContext *vmaf_context, VmafModel *vmaf_model,
+                              int frame_index) {
+  double vmaf;
+  if (vmaf_score_at_index(vmaf_context, vmaf_model, &vmaf, frame_index)) {
+    vmaf_fatal_error("Failed to calc VMAF scores.");
+  }
+  return vmaf;
+}
+
+void aom_flush_vmaf_context(VmafContext *vmaf_context) {
+  if (vmaf_read_pictures(vmaf_context, NULL, NULL, 0)) {
+    vmaf_fatal_error("Failed to flush context.");
+  }
+}
diff --git a/third_party/libaom/source/libaom/aom_dsp/vmaf.h b/third_party/libaom/source/libaom/aom_dsp/vmaf.h
index d9da223e29..3ba8c8d565 100644
--- a/third_party/libaom/source/libaom/aom_dsp/vmaf.h
+++ b/third_party/libaom/source/libaom/aom_dsp/vmaf.h
@@ -15,33 +15,28 @@
 #include <stdbool.h>
 #include "aom_scale/yv12config.h"
 
-#if CONFIG_USE_VMAF_RC
 typedef struct VmafContext VmafContext;
 typedef struct VmafModel VmafModel;
-#endif
-
-#if CONFIG_USE_VMAF_RC
-void aom_init_vmaf_context_rc(VmafContext **vmaf_context, VmafModel *vmaf_model,
-                              bool cal_vmaf_neg);
-void aom_close_vmaf_context_rc(VmafContext *vmaf_context);
-
-void aom_init_vmaf_model_rc(VmafModel **vmaf_model, const char *model_path);
-void aom_close_vmaf_model_rc(VmafModel *vmaf_model);
-
-void aom_calc_vmaf_at_index_rc(VmafContext *vmaf_context, VmafModel *vmaf_model,
-                               const YV12_BUFFER_CONFIG *source,
-                               const YV12_BUFFER_CONFIG *distorted,
-                               int bit_depth, int frame_index, double *vmaf);
-#else
-void aom_calc_vmaf(const char *model_path, const YV12_BUFFER_CONFIG *source,
+
+void aom_init_vmaf_context(VmafContext **vmaf_context, VmafModel *vmaf_model,
+                           bool cal_vmaf_neg);
+void aom_close_vmaf_context(VmafContext *vmaf_context);
+
+void aom_init_vmaf_model(VmafModel **vmaf_model, const char *model_path);
+void aom_close_vmaf_model(VmafModel *vmaf_model);
+
+void aom_calc_vmaf(VmafModel *vmaf_model, const YV12_BUFFER_CONFIG *source,
                    const YV12_BUFFER_CONFIG *distorted, int bit_depth,
-                   double *vmaf);
-
-void aom_calc_vmaf_multi_frame(
-    void *user_data, const char *model_path,
-    int (*read_frame)(float *ref_data, float *main_data, float *temp_data,
-                      int stride_byte, void *user_data),
-    int frame_width, int frame_height, int bit_depth, double *vmaf);
-#endif  // CONFIG_USE_VMAF_RC
+                   bool cal_vmaf_neg, double *vmaf);
+
+void aom_read_vmaf_image(VmafContext *vmaf_context,
+                         const YV12_BUFFER_CONFIG *source,
+                         const YV12_BUFFER_CONFIG *distorted, int bit_depth,
+                         int frame_index);
+
+double aom_calc_vmaf_at_index(VmafContext *vmaf_context, VmafModel *vmaf_model,
+                              int frame_index);
+
+void aom_flush_vmaf_context(VmafContext *vmaf_context);
 
 #endif  // AOM_AOM_DSP_VMAF_H_
diff --git a/third_party/libaom/source/libaom/aom_dsp/x86/highbd_sad_sse2.asm b/third_party/libaom/source/libaom/aom_dsp/x86/highbd_sad_sse2.asm
index 58f1ac964e..a2510d5e7f 100644
--- a/third_party/libaom/source/libaom/aom_dsp/x86/highbd_sad_sse2.asm
+++ b/third_party/libaom/source/libaom/aom_dsp/x86/highbd_sad_sse2.asm
@@ -20,20 +20,21 @@ SECTION .text
 ; Arg 2: Height
 ; Arg 3: Number of general purpose registers: 5 for 32-bit build, 6 for 64-bit
 ; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
-%macro HIGH_SAD_FN 4
+; Arg 5: Number of xmm registers. 8xh needs 8, others only need 7
+%macro HIGH_SAD_FN 4-5 7
 %if %4 == 0
 %if %3 == 5
-cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
+cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, n_rows
 %else ; %3 == 7
-cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
+cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, \
                             src_stride3, ref_stride3, n_rows
 %endif ; %3 == 5/7
 %elif %4 == 1 ; avg
 %if %3 == 5
-cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
+cglobal highbd_sad%1x%2_avg, 5, 1 + %3, %5, src, src_stride, ref, ref_stride, \
                                     second_pred, n_rows
 %else ; %3 == 7
-cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
+cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, %5, src, src_stride, \
                                               ref, ref_stride, \
                                               second_pred, \
                                               src_stride3, ref_stride3
@@ -356,7 +357,7 @@ HIGH_SAD16XN  8, 2 ; highbd_sad_skip_16x8_sse2
 ; unsigned int aom_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
 %macro HIGH_SAD8XN 1-2 0
-  HIGH_SAD_FN 8, %1, 7, %2
+  HIGH_SAD_FN 8, %1, 7, %2, 8
 %if %2 == 2  ; skip rows, so divide number of rows by 2
   mov              n_rowsd, %1/8
 %else
@@ -377,22 +378,30 @@ HIGH_SAD16XN  8, 2 ; highbd_sad_skip_16x8_sse2
   pavgw                 m4, [second_predq+mmsize*3]
   lea         second_predq, [second_predq+mmsize*4]
 %endif
-  mova                  m5, [srcq]
-  psubusw               m5, m1
-  psubusw               m1, [srcq]
+  mova                  m7, m1
+  movu                  m5, [srcq]
+  psubusw               m1, m5
+  psubusw               m5, m7
   por                   m1, m5
-  mova                  m5, [srcq+src_strideq*2]
-  psubusw               m5, m2
-  psubusw               m2, [srcq+src_strideq*2]
+
+  mova                  m7, m2
+  movu                  m5, [srcq+src_strideq*2]
+  psubusw               m2, m5
+  psubusw               m5, m7
   por                   m2, m5
-  mova                  m5, [srcq+src_strideq*4]
-  psubusw               m5, m3
-  psubusw               m3, [srcq+src_strideq*4]
+
+  mova                  m7, m3
+  movu                  m5, [srcq+src_strideq*4]
+  psubusw               m3, m5
+  psubusw               m5, m7
   por                   m3, m5
-  mova                  m5, [srcq+src_stride3q*2]
-  psubusw               m5, m4
-  psubusw               m4, [srcq+src_stride3q*2]
+
+  mova                  m7, m4
+  movu                  m5, [srcq+src_stride3q*2]
+  psubusw               m4, m5
+  psubusw               m5, m7
   por                   m4, m5
+
   paddw                 m1, m2
   paddw                 m3, m4
   movhlps               m2, m1
diff --git a/third_party/libaom/source/libaom/aom_dsp/x86/variance_impl_avx2.c b/third_party/libaom/source/libaom/aom_dsp/x86/variance_impl_avx2.c
index f779270ae3..163e4cc566 100644
--- a/third_party/libaom/source/libaom/aom_dsp/x86/variance_impl_avx2.c
+++ b/third_party/libaom/source/libaom/aom_dsp/x86/variance_impl_avx2.c
@@ -616,7 +616,7 @@ unsigned int aom_sub_pixel_avg_variance32xh_avx2(
         src += src_stride;
         dst += dst_stride;
       }
-    } else if (y_offset == 8) {
+    } else if (y_offset == 4) {
       __m256i src_next_reg;
       for (i = 0; i < height; i++) {
         LOAD_SRC_DST
@@ -652,8 +652,8 @@ unsigned int aom_sub_pixel_avg_variance32xh_avx2(
         dst += dst_stride;
       }
     }
-    // x_offset = 8  and y_offset = 0
-  } else if (x_offset == 8) {
+    // x_offset = 4  and y_offset = 0
+  } else if (x_offset == 4) {
     if (y_offset == 0) {
       __m256i src_next_reg;
       for (i = 0; i < height; i++) {
@@ -668,8 +668,8 @@ unsigned int aom_sub_pixel_avg_variance32xh_avx2(
         src += src_stride;
         dst += dst_stride;
       }
-      // x_offset = 8  and y_offset = 8
-    } else if (y_offset == 8) {
+      // x_offset = 4  and y_offset = 4
+    } else if (y_offset == 4) {
       __m256i src_next_reg, src_avg;
       // load source and another source starting from the next
       // following byte
@@ -691,7 +691,7 @@ unsigned int aom_sub_pixel_avg_variance32xh_avx2(
         CALC_SUM_SSE_INSIDE_LOOP
         dst += dst_stride;
       }
-      // x_offset = 8  and y_offset = bilin interpolation
+      // x_offset = 4  and y_offset = bilin interpolation
     } else {
       __m256i filter, pw8, src_next_reg, src_avg;
       y_offset <<= 5;
@@ -741,8 +741,8 @@ unsigned int aom_sub_pixel_avg_variance32xh_avx2(
         src += src_stride;
         dst += dst_stride;
       }
-      // x_offset = bilin interpolation and y_offset = 8
-    } else if (y_offset == 8) {
+      // x_offset = bilin interpolation and y_offset = 4
+    } else if (y_offset == 4) {
       __m256i filter, pw8, src_next_reg, src_pack;
       x_offset <<= 5;
       filter = _mm256_load_si256(