1 files changed, 375 insertions, 0 deletions
diff --git a/libvpx/vp9/encoder/x86/temporal_filter_sse4.c b/libvpx/vp9/encoder/x86/temporal_filter_sse4.c
new file mode 100644
index 000000000..be4cd8685
--- /dev/null
+++ b/libvpx/vp9/encoder/x86/temporal_filter_sse4.c
@@ -0,0 +1,375 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+// Division using multiplication and shifting. The C implementation does:
+// modifier *= 3;
+// modifier /= index;
+// where 'modifier' is a set of summed values and 'index' is the number of
+// summed values. 'index' may be 4, 6, or 9, representing a block of 9 values
+// which may be bound by the edges of the block being filtered.
+//
+// This equation works out to (m * 3) / i which reduces to:
+// m * 3/4
+// m * 1/2
+// m * 1/3
+//
+// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16):
+// m * C / 65536
+// we can create a C to replicate the division.
+//
+// m * 49152 / 65536 = m * 3/4
+// m * 32768 / 65536 = m * 1/2
+// m * 21846 / 65536 = m * 0.3333
+//
+// These are loaded using an instruction expecting int16_t values but are used
+// with _mm_mulhi_epu16(), which treats them as unsigned.
+#define NEIGHBOR_CONSTANT_4 (int16_t)49152  // 3/4 * 65536: 4 values summed.
+#define NEIGHBOR_CONSTANT_6 (int16_t)32768  // 1/2 * 65536: 6 values summed.
+#define NEIGHBOR_CONSTANT_9 (int16_t)21846  // ~1/3 * 65536: 9 values summed.
+
+// Computes the squared difference (a[i] - b[i])^2 for eight pixels and adds
+// each value to its left and right neighbors:
+// sum[i] = sq[i - 1] + sq[i] + sq[i + 1]
+// Neighbors that fall outside the eight loaded pixels count as 0.
+// The totals are stored to '*sum' as eight *unsigned* 16 bit values; they
+// saturate at UINT16_MAX instead of wrapping.
+static void sum_8(const uint8_t *a, const uint8_t *b, __m128i *sum) {
+  // Load eight bytes from each source and zero-extend them to 16 bits.
+  const __m128i src_a = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i *)a));
+  const __m128i src_b = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i *)b));
+
+  // The difference lies in [-255, 255]. The low 16 bits of its square hold
+  // the exact value (at most 255^2 = 65025) when interpreted as unsigned.
+  const __m128i delta = _mm_sub_epi16(src_a, src_b);
+  const __m128i sq = _mm_mullo_epi16(delta, delta);
+
+  // Shifting the register by one 16 bit lane in each direction lines up
+  // sq[i - 1] and sq[i + 1] with sq[i]; zeros are shifted in at the ends.
+  const __m128i prev = _mm_slli_si128(sq, 2);
+  const __m128i next = _mm_srli_si128(sq, 2);
+
+  // 255^2 fits in uint16_t but not in int16_t, so the adds must be unsigned.
+  // They also saturate: the filter only smooths small pixel changes, and a
+  // sum that has reached UINT16_MAX is already well outside the useful
+  // range.
+  __m128i total = _mm_adds_epu16(sq, prev);
+  total = _mm_adds_epu16(total, next);
+
+  *sum = total;
+}
+
+// Sixteen pixel version of the neighbor sum: computes (a[i] - b[i])^2 and adds
+// the values at i - 1, i and i + 1, treating positions outside the sixteen
+// loaded pixels as 0. Sums for pixels 0-7 are stored to '*sum_0' and sums for
+// pixels 8-15 to '*sum_1', both as *unsigned* 16 bit values.
+static void sum_16(const uint8_t *a, const uint8_t *b, __m128i *sum_0,
+                   __m128i *sum_1) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i src_a = _mm_loadu_si128((const __m128i *)a);
+  const __m128i src_b = _mm_loadu_si128((const __m128i *)b);
+
+  // Zero-extend the low and high eight bytes of each source to 16 bits.
+  const __m128i a_lo = _mm_cvtepu8_epi16(src_a);
+  const __m128i a_hi = _mm_unpackhi_epi8(src_a, zero);
+  const __m128i b_lo = _mm_cvtepu8_epi16(src_b);
+  const __m128i b_hi = _mm_unpackhi_epi8(src_b, zero);
+
+  const __m128i delta_lo = _mm_sub_epi16(a_lo, b_lo);
+  const __m128i delta_hi = _mm_sub_epi16(a_hi, b_hi);
+  const __m128i sq_lo = _mm_mullo_epi16(delta_lo, delta_lo);
+  const __m128i sq_hi = _mm_mullo_epi16(delta_hi, delta_hi);
+
+  // Low half: zero enters on the left, sq_hi[0] is shifted in on the right.
+  const __m128i lo_prev = _mm_slli_si128(sq_lo, 2);
+  const __m128i lo_next = _mm_alignr_epi8(sq_hi, sq_lo, 2);
+
+  // High half: sq_lo[7] is shifted in on the left, zero enters on the right.
+  const __m128i hi_prev = _mm_alignr_epi8(sq_hi, sq_lo, 14);
+  const __m128i hi_next = _mm_srli_si128(sq_hi, 2);
+
+  // Saturating unsigned adds, in the order current + previous + next.
+  *sum_0 = _mm_adds_epu16(_mm_adds_epu16(sq_lo, lo_prev), lo_next);
+  *sum_1 = _mm_adds_epu16(_mm_adds_epu16(sq_hi, hi_prev), hi_next);
+}
+
+// Converts eight summed squared differences into per-pixel filter weights.
+// 'mul_constants' holds, per lane, the NEIGHBOR_CONSTANT_* matching the number
+// of values summed there (9 away from the border, 4 in corners, 6 on other
+// edges), so the multiply-high yields sum * 3 / index.
+//
+// The result is then rounded and shifted right by 'strength', clamped to 16,
+// subtracted from 16 and multiplied by 'weight'.
+static __m128i average_8(__m128i sum, const __m128i mul_constants,
+                         const int strength, const int rounding,
+                         const int weight) {
+  // _mm_srl_epi16 takes its shift count from the low 64 bits of a register.
+  const __m128i shift_count = _mm_cvtsi32_si128(strength);
+  const __m128i round_u16 = _mm_set1_epi16(rounding);
+  const __m128i limit = _mm_set1_epi16(16);
+  __m128i modifier;
+
+  // modifier * 3 / index
+  modifier = _mm_mulhi_epu16(sum, mul_constants);
+  modifier = _mm_srl_epi16(_mm_adds_epu16(modifier, round_u16), shift_count);
+
+  // The value can still exceed INT16_MAX here (up to 49151 from the
+  // NEIGHBOR_CONSTANT_4 lanes with strength 0), so the unsigned minimum, which
+  // first appeared in SSE4.1, is required.
+  modifier = _mm_min_epu16(modifier, limit);
+
+  // 16 - modifier is in [0, 16]; scale it by the weight.
+  return _mm_mullo_epi16(_mm_sub_epi16(limit, modifier),
+                         _mm_set1_epi16(weight));
+}
+
+// Sixteen lane form of the averaging step, applied in place to '*sum_0_u16'
+// (with 'mul_constants_0') and '*sum_1_u16' (with 'mul_constants_1'): multiply
+// high by the constant, add 'rounding', shift right by 'strength', clamp to
+// 16, subtract from 16 and multiply by 'weight'.
+static void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16,
+                       const __m128i mul_constants_0,
+                       const __m128i mul_constants_1, const int strength,
+                       const int rounding, const int weight) {
+  // _mm_srl_epi16 takes its shift count from the low 64 bits of a register.
+  const __m128i shift_count = _mm_cvtsi32_si128(strength);
+  const __m128i round_u16 = _mm_set1_epi16(rounding);
+  const __m128i scale = _mm_set1_epi16(weight);
+  const __m128i limit = _mm_set1_epi16(16);
+
+  __m128i lo = _mm_mulhi_epu16(*sum_0_u16, mul_constants_0);
+  __m128i hi = _mm_mulhi_epu16(*sum_1_u16, mul_constants_1);
+
+  lo = _mm_srl_epi16(_mm_adds_epu16(lo, round_u16), shift_count);
+  hi = _mm_srl_epi16(_mm_adds_epu16(hi, round_u16), shift_count);
+
+  // Unsigned minimum: the shifted values may not fit in int16_t.
+  lo = _mm_sub_epi16(limit, _mm_min_epu16(lo, limit));
+  hi = _mm_sub_epi16(limit, _mm_min_epu16(hi, limit));
+
+  *sum_0_u16 = _mm_mullo_epi16(lo, scale);
+  *sum_1_u16 = _mm_mullo_epi16(hi, scale);
+}
+
+// Updates the running totals for eight pixels: adds the filter weights in
+// 'sum_u16' to 'count' (saturating), and adds weight * pred, zero-extended to
+// 32 bits, to 'accumulator'.
+static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred,
+                                   uint16_t *count, uint32_t *accumulator) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i *const count_ptr = (__m128i *)count;
+  __m128i *const accum_lo_ptr = (__m128i *)accumulator;
+  __m128i *const accum_hi_ptr = (__m128i *)(accumulator + 4);
+
+  // Eight predictor bytes, zero-extended to 16 bits.
+  const __m128i pred_u16 =
+      _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i *)pred));
+  // weight * pred; the low 16 bits of each product are kept.
+  const __m128i weighted = _mm_mullo_epi16(sum_u16, pred_u16);
+  // Zero-extend the low and high four products to 32 bits.
+  const __m128i weighted_lo = _mm_cvtepu16_epi32(weighted);
+  const __m128i weighted_hi = _mm_unpackhi_epi16(weighted, zero);
+
+  _mm_storeu_si128(count_ptr,
+                   _mm_adds_epu16(_mm_loadu_si128(count_ptr), sum_u16));
+
+  _mm_storeu_si128(accum_lo_ptr,
+                   _mm_add_epi32(weighted_lo, _mm_loadu_si128(accum_lo_ptr)));
+  _mm_storeu_si128(accum_hi_ptr,
+                   _mm_add_epi32(weighted_hi, _mm_loadu_si128(accum_hi_ptr)));
+}
+
+// Sixteen pixel accumulate step. Adds the filter weights ('sum_0_u16': pixels
+// 0-7, 'sum_1_u16': pixels 8-15) to 'count' with saturation, and adds
+// weight * pred, zero-extended to 32 bits, to 'accumulator'.
+static void accumulate_and_store_16(const __m128i sum_0_u16,
+                                    const __m128i sum_1_u16,
+                                    const uint8_t *pred, uint16_t *count,
+                                    uint32_t *accumulator) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i pred_u8 = _mm_loadu_si128((const __m128i *)pred);
+  __m128i *const count_lo_ptr = (__m128i *)count;
+  __m128i *const count_hi_ptr = (__m128i *)(count + 8);
+  __m128i *const accum_0_ptr = (__m128i *)accumulator;
+  __m128i *const accum_1_ptr = (__m128i *)(accumulator + 4);
+  __m128i *const accum_2_ptr = (__m128i *)(accumulator + 8);
+  __m128i *const accum_3_ptr = (__m128i *)(accumulator + 12);
+
+  // weight * pred for each half; the low 16 bits of each product are kept.
+  const __m128i weighted_lo =
+      _mm_mullo_epi16(sum_0_u16, _mm_cvtepu8_epi16(pred_u8));
+  const __m128i weighted_hi =
+      _mm_mullo_epi16(sum_1_u16, _mm_unpackhi_epi8(pred_u8, zero));
+
+  // Zero-extend the sixteen products to 32 bits, four per register.
+  const __m128i weighted_0 = _mm_cvtepu16_epi32(weighted_lo);
+  const __m128i weighted_1 = _mm_unpackhi_epi16(weighted_lo, zero);
+  const __m128i weighted_2 = _mm_cvtepu16_epi32(weighted_hi);
+  const __m128i weighted_3 = _mm_unpackhi_epi16(weighted_hi, zero);
+
+  _mm_storeu_si128(count_lo_ptr,
+                   _mm_adds_epu16(_mm_loadu_si128(count_lo_ptr), sum_0_u16));
+  _mm_storeu_si128(count_hi_ptr,
+                   _mm_adds_epu16(_mm_loadu_si128(count_hi_ptr), sum_1_u16));
+
+  _mm_storeu_si128(accum_0_ptr,
+                   _mm_add_epi32(weighted_0, _mm_loadu_si128(accum_0_ptr)));
+  _mm_storeu_si128(accum_1_ptr,
+                   _mm_add_epi32(weighted_1, _mm_loadu_si128(accum_1_ptr)));
+  _mm_storeu_si128(accum_2_ptr,
+                   _mm_add_epi32(weighted_2, _mm_loadu_si128(accum_2_ptr)));
+  _mm_storeu_si128(accum_3_ptr,
+                   _mm_add_epi32(weighted_3, _mm_loadu_si128(accum_3_ptr)));
+}
+
+// Applies the temporal filter to a block that is 8 or 16 pixels wide.
+//
+// For each pixel, the squared differences between 'a' and 'b' are summed over
+// its 3x3 neighborhood (clipped at the block edges) and converted to a filter
+// weight. The weight is added to 'count' and weight * b[pixel] is added to
+// 'accumulator'. Rows of 'a' are 'stride' bytes apart; 'b', 'count' and
+// 'accumulator' advance by 'width' elements per row. The first and last rows
+// are handled outside the row loop, so 'height' must be at least 2.
+void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride,
+                                      const uint8_t *b, unsigned int width,
+                                      unsigned int height, int strength,
+                                      int weight, uint32_t *accumulator,
+                                      uint16_t *count) {
+  unsigned int h;
+  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
+
+  assert(strength >= 0);
+  assert(strength <= 6);
+  assert(weight >= 0);
+  assert(weight <= 2);
+  assert(width == 8 || width == 16);
+  // 'height' is unsigned: height - 2 in the loops would wrap if it were < 2.
+  assert(height >= 2);
+
+  if (width == 8) {
+    // edge_consts: first/last row; inner_consts: the rows in between.
+    const __m128i edge_consts = _mm_setr_epi16(
+        NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+        NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+        NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
+    const __m128i inner_consts = _mm_setr_epi16(
+        NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
+        NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
+        NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6);
+    __m128i sum_row_a, sum_row_b, sum_row_c;
+
+    // First row: only this row and the one below contribute.
+    sum_8(a, b, &sum_row_a);
+    sum_8(a + stride, b + width, &sum_row_b);
+    sum_row_c = _mm_adds_epu16(sum_row_a, sum_row_b);
+    sum_row_c = average_8(sum_row_c, edge_consts, strength, rounding, weight);
+    accumulate_and_store_8(sum_row_c, b, count, accumulator);
+
+    a += stride + stride;
+    b += width;
+    count += width;
+    accumulator += width;
+
+    // Interior rows: 'a' stays one row ahead of 'b' to fetch the row below.
+    for (h = 0; h < height - 2; ++h) {
+      sum_8(a, b + width, &sum_row_c);
+      sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b);
+      sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_c);
+      sum_row_a =
+          average_8(sum_row_a, inner_consts, strength, rounding, weight);
+      accumulate_and_store_8(sum_row_a, b, count, accumulator);
+
+      a += stride;
+      b += width;
+      count += width;
+      accumulator += width;
+
+      sum_row_a = sum_row_b;
+      sum_row_b = sum_row_c;
+    }
+
+    // Last row: only the row above and this row contribute.
+    sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b);
+    sum_row_a = average_8(sum_row_a, edge_consts, strength, rounding, weight);
+    accumulate_and_store_8(sum_row_a, b, count, accumulator);
+  } else {  // width == 16
+    // "_0" constants apply to pixels 0-7, "_1" constants to pixels 8-15.
+    const __m128i edge_consts_0 = _mm_setr_epi16(
+        NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+        NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+        NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6);
+    const __m128i edge_consts_1 = _mm_setr_epi16(
+        NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+        NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+        NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
+    const __m128i inner_consts_0 = _mm_setr_epi16(
+        NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
+        NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
+        NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9);
+    const __m128i inner_consts_1 = _mm_setr_epi16(
+        NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
+        NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
+        NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6);
+    __m128i sum_row_a_0, sum_row_a_1;
+    __m128i sum_row_b_0, sum_row_b_1;
+    __m128i sum_row_c_0, sum_row_c_1;
+
+    // First row: only this row and the one below contribute.
+    sum_16(a, b, &sum_row_a_0, &sum_row_a_1);
+    sum_16(a + stride, b + width, &sum_row_b_0, &sum_row_b_1);
+
+    sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
+    sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
+    average_16(&sum_row_c_0, &sum_row_c_1, edge_consts_0, edge_consts_1,
+               strength, rounding, weight);
+    accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator);
+
+    a += stride + stride;
+    b += width;
+    count += width;
+    accumulator += width;
+
+    // Interior rows: 'a' stays one row ahead of 'b' to fetch the row below.
+    for (h = 0; h < height - 2; ++h) {
+      sum_16(a, b + width, &sum_row_c_0, &sum_row_c_1);
+
+      sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
+      sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_c_0);
+      sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
+      sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_c_1);
+
+      average_16(&sum_row_a_0, &sum_row_a_1, inner_consts_0, inner_consts_1,
+                 strength, rounding, weight);
+      accumulate_and_store_16(sum_row_a_0, sum_row_a_1, b, count, accumulator);
+
+      a += stride;
+      b += width;
+      count += width;
+      accumulator += width;
+
+      sum_row_a_0 = sum_row_b_0;
+      sum_row_a_1 = sum_row_b_1;
+      sum_row_b_0 = sum_row_c_0;
+      sum_row_b_1 = sum_row_c_1;
+    }
+
+    // Last row: only the row above and this row contribute.
+    sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
+    sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
+    average_16(&sum_row_c_0, &sum_row_c_1, edge_consts_0, edge_consts_1,
+               strength, rounding, weight);
+    accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator);
+  }
+}