aboutsummaryrefslogtreecommitdiff
path: root/libvpx/vp9/encoder/x86/temporal_filter_sse4.c
diff options
context:
space:
mode:
Diffstat (limited to 'libvpx/vp9/encoder/x86/temporal_filter_sse4.c')
-rw-r--r--libvpx/vp9/encoder/x86/temporal_filter_sse4.c375
1 files changed, 375 insertions, 0 deletions
diff --git a/libvpx/vp9/encoder/x86/temporal_filter_sse4.c b/libvpx/vp9/encoder/x86/temporal_filter_sse4.c
new file mode 100644
index 000000000..be4cd8685
--- /dev/null
+++ b/libvpx/vp9/encoder/x86/temporal_filter_sse4.c
@@ -0,0 +1,375 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+// Division using multiplication and shifting. The C implementation does:
+// modifier *= 3;
+// modifier /= index;
+// where 'modifier' is a set of summed values and 'index' is the number of
+// summed values. 'index' may be 4, 6, or 9, representing a block of 9 values
+// which may be bound by the edges of the block being filtered.
+//
+// This equation works out to (m * 3) / i which reduces to:
+// m * 3/4
+// m * 1/2
+// m * 1/3
+//
+// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16):
+// m * C / 65536
+// we can create a C to replicate the division.
+//
+// m * 49152 / 65536 = m * 3/4
+// m * 32768 / 65536 = m * 1/2
+// m * 21846 / 65536 = m * 0.3333
+//
+// These are loaded using an instruction expecting int16_t values but are used
+// with _mm_mulhi_epu16(), which treats them as unsigned.
+#define NEIGHBOR_CONSTANT_4 (int16_t)49152
+#define NEIGHBOR_CONSTANT_6 (int16_t)32768
+#define NEIGHBOR_CONSTANT_9 (int16_t)21846
+
+// Load values from 'a' and 'b'. Compute the difference squared and sum
+// neighboring values such that:
+// sum[1] = (a[0]-b[0])^2 + (a[1]-b[1])^2 + (a[2]-b[2])^2
+// Values to the left and right of the row are set to 0.
+// The values are returned in sum_0 and sum_1 as *unsigned* 16 bit values.
+static void sum_8(const uint8_t *a, const uint8_t *b, __m128i *sum) {
+ const __m128i a_u8 = _mm_loadl_epi64((const __m128i *)a);
+ const __m128i b_u8 = _mm_loadl_epi64((const __m128i *)b);
+
+ const __m128i a_u16 = _mm_cvtepu8_epi16(a_u8);
+ const __m128i b_u16 = _mm_cvtepu8_epi16(b_u8);
+
+ const __m128i diff_s16 = _mm_sub_epi16(a_u16, b_u16);
+ const __m128i diff_sq_u16 = _mm_mullo_epi16(diff_s16, diff_s16);
+
+ // Shift all the values one place to the left/right so we can efficiently sum
+ // diff_sq_u16[i - 1] + diff_sq_u16[i] + diff_sq_u16[i + 1].
+ const __m128i shift_left = _mm_slli_si128(diff_sq_u16, 2);
+ const __m128i shift_right = _mm_srli_si128(diff_sq_u16, 2);
+
+ // It becomes necessary to treat the values as unsigned at this point. The
+ // 255^2 fits in uint16_t but not int16_t. Use saturating adds from this point
+ // forward since the filter is only applied to smooth small pixel changes.
+ // Once the value has saturated to uint16_t it is well outside the useful
+ // range.
+ __m128i sum_u16 = _mm_adds_epu16(diff_sq_u16, shift_left);
+ sum_u16 = _mm_adds_epu16(sum_u16, shift_right);
+
+ *sum = sum_u16;
+}
+
+static void sum_16(const uint8_t *a, const uint8_t *b, __m128i *sum_0,
+ __m128i *sum_1) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a_u8 = _mm_loadu_si128((const __m128i *)a);
+ const __m128i b_u8 = _mm_loadu_si128((const __m128i *)b);
+
+ const __m128i a_0_u16 = _mm_cvtepu8_epi16(a_u8);
+ const __m128i a_1_u16 = _mm_unpackhi_epi8(a_u8, zero);
+ const __m128i b_0_u16 = _mm_cvtepu8_epi16(b_u8);
+ const __m128i b_1_u16 = _mm_unpackhi_epi8(b_u8, zero);
+
+ const __m128i diff_0_s16 = _mm_sub_epi16(a_0_u16, b_0_u16);
+ const __m128i diff_1_s16 = _mm_sub_epi16(a_1_u16, b_1_u16);
+ const __m128i diff_sq_0_u16 = _mm_mullo_epi16(diff_0_s16, diff_0_s16);
+ const __m128i diff_sq_1_u16 = _mm_mullo_epi16(diff_1_s16, diff_1_s16);
+
+ __m128i shift_left = _mm_slli_si128(diff_sq_0_u16, 2);
+ // Use _mm_alignr_epi8() to "shift in" diff_sq_u16[8].
+ __m128i shift_right = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 2);
+
+ __m128i sum_u16 = _mm_adds_epu16(diff_sq_0_u16, shift_left);
+ sum_u16 = _mm_adds_epu16(sum_u16, shift_right);
+
+ *sum_0 = sum_u16;
+
+ shift_left = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 14);
+ shift_right = _mm_srli_si128(diff_sq_1_u16, 2);
+
+ sum_u16 = _mm_adds_epu16(diff_sq_1_u16, shift_left);
+ sum_u16 = _mm_adds_epu16(sum_u16, shift_right);
+
+ *sum_1 = sum_u16;
+}
+
+// Average the value based on the number of values summed (9 for pixels away
+// from the border, 4 for pixels in corners, and 6 for other edge values).
+//
+// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply
+// by weight.
+static __m128i average_8(__m128i sum, const __m128i mul_constants,
+ const int strength, const int rounding,
+ const int weight) {
+ // _mm_srl_epi16 uses the lower 64 bit value for the shift.
+ const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
+ const __m128i rounding_u16 = _mm_set1_epi16(rounding);
+ const __m128i weight_u16 = _mm_set1_epi16(weight);
+ const __m128i sixteen = _mm_set1_epi16(16);
+
+ // modifier * 3 / index;
+ sum = _mm_mulhi_epu16(sum, mul_constants);
+
+ sum = _mm_adds_epu16(sum, rounding_u16);
+ sum = _mm_srl_epi16(sum, strength_u128);
+
+ // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4
+ // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385
+ // So this needs to use the epu16 version which did not come until SSE4.
+ sum = _mm_min_epu16(sum, sixteen);
+
+ sum = _mm_sub_epi16(sixteen, sum);
+
+ return _mm_mullo_epi16(sum, weight_u16);
+}
+
+static void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16,
+ const __m128i mul_constants_0,
+ const __m128i mul_constants_1, const int strength,
+ const int rounding, const int weight) {
+ const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
+ const __m128i rounding_u16 = _mm_set1_epi16(rounding);
+ const __m128i weight_u16 = _mm_set1_epi16(weight);
+ const __m128i sixteen = _mm_set1_epi16(16);
+ __m128i input_0, input_1;
+
+ input_0 = _mm_mulhi_epu16(*sum_0_u16, mul_constants_0);
+ input_0 = _mm_adds_epu16(input_0, rounding_u16);
+
+ input_1 = _mm_mulhi_epu16(*sum_1_u16, mul_constants_1);
+ input_1 = _mm_adds_epu16(input_1, rounding_u16);
+
+ input_0 = _mm_srl_epi16(input_0, strength_u128);
+ input_1 = _mm_srl_epi16(input_1, strength_u128);
+
+ input_0 = _mm_min_epu16(input_0, sixteen);
+ input_1 = _mm_min_epu16(input_1, sixteen);
+ input_0 = _mm_sub_epi16(sixteen, input_0);
+ input_1 = _mm_sub_epi16(sixteen, input_1);
+
+ *sum_0_u16 = _mm_mullo_epi16(input_0, weight_u16);
+ *sum_1_u16 = _mm_mullo_epi16(input_1, weight_u16);
+}
+
+// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.'
+static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred,
+ uint16_t *count, uint32_t *accumulator) {
+ const __m128i pred_u8 = _mm_loadl_epi64((const __m128i *)pred);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i count_u16 = _mm_loadu_si128((const __m128i *)count);
+ __m128i pred_u16 = _mm_cvtepu8_epi16(pred_u8);
+ __m128i pred_0_u32, pred_1_u32;
+ __m128i accum_0_u32, accum_1_u32;
+
+ count_u16 = _mm_adds_epu16(count_u16, sum_u16);
+ _mm_storeu_si128((__m128i *)count, count_u16);
+
+ pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16);
+
+ pred_0_u32 = _mm_cvtepu16_epi32(pred_u16);
+ pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero);
+
+ accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
+ accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
+
+ accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
+ accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
+
+ _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
+ _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
+}
+
+static void accumulate_and_store_16(const __m128i sum_0_u16,
+ const __m128i sum_1_u16,
+ const uint8_t *pred, uint16_t *count,
+ uint32_t *accumulator) {
+ const __m128i pred_u8 = _mm_loadu_si128((const __m128i *)pred);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i count_0_u16 = _mm_loadu_si128((const __m128i *)count),
+ count_1_u16 = _mm_loadu_si128((const __m128i *)(count + 8));
+ __m128i pred_0_u16 = _mm_cvtepu8_epi16(pred_u8),
+ pred_1_u16 = _mm_unpackhi_epi8(pred_u8, zero);
+ __m128i pred_0_u32, pred_1_u32, pred_2_u32, pred_3_u32;
+ __m128i accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32;
+
+ count_0_u16 = _mm_adds_epu16(count_0_u16, sum_0_u16);
+ _mm_storeu_si128((__m128i *)count, count_0_u16);
+
+ count_1_u16 = _mm_adds_epu16(count_1_u16, sum_1_u16);
+ _mm_storeu_si128((__m128i *)(count + 8), count_1_u16);
+
+ pred_0_u16 = _mm_mullo_epi16(sum_0_u16, pred_0_u16);
+ pred_1_u16 = _mm_mullo_epi16(sum_1_u16, pred_1_u16);
+
+ pred_0_u32 = _mm_cvtepu16_epi32(pred_0_u16);
+ pred_1_u32 = _mm_unpackhi_epi16(pred_0_u16, zero);
+ pred_2_u32 = _mm_cvtepu16_epi32(pred_1_u16);
+ pred_3_u32 = _mm_unpackhi_epi16(pred_1_u16, zero);
+
+ accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
+ accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
+ accum_2_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 8));
+ accum_3_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 12));
+
+ accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
+ accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
+ accum_2_u32 = _mm_add_epi32(pred_2_u32, accum_2_u32);
+ accum_3_u32 = _mm_add_epi32(pred_3_u32, accum_3_u32);
+
+ _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
+ _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
+ _mm_storeu_si128((__m128i *)(accumulator + 8), accum_2_u32);
+ _mm_storeu_si128((__m128i *)(accumulator + 12), accum_3_u32);
+}
+
// Apply the temporal filter to a width x height block — the SSE4.1
// counterpart of the C implementation described at the top of this file.
//
// a:           top-left of the block in the source frame; rows are
//              'stride' bytes apart.
// b:           the co-located block being compared against (and multiplied
//              into 'accumulator'); stored contiguously, row stride == width.
// width:       8 or 16 (asserted below).
// height:      number of rows. NOTE(review): the loop structure assumes
//              height >= 2 — confirm callers never pass smaller blocks.
// strength:    filter strength in [0, 6]; used as a down-shift amount.
// weight:      filter weight in [0, 2].
// accumulator: width * height running sums of weighted 'b' pixels,
//              row stride == width; updated in place.
// count:      width * height running sums of filter weights, row stride ==
//              width; updated in place.
//
// The per-pixel modifier is the 3x3 neighborhood sum of squared differences
// between 'a' and 'b'. Rows are processed with a three-register sliding
// window (sum_row_a/b/c hold the row sums of consecutive rows) so each row's
// squared diffs are computed only once.
void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride,
                                      const uint8_t *b, unsigned int width,
                                      unsigned int height, int strength,
                                      int weight, uint32_t *accumulator,
                                      uint16_t *count) {
  unsigned int h;
  // Rounding term for the '>> strength' in average_8()/average_16().
  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;

  assert(strength >= 0);
  assert(strength <= 6);

  assert(weight >= 0);
  assert(weight <= 2);

  assert(width == 8 || width == 16);

  if (width == 8) {
    __m128i sum_row_a, sum_row_b, sum_row_c;
    // Top row: no row above, so corners sum 4 values, the rest sum 6.
    __m128i mul_constants = _mm_setr_epi16(
        NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
        NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
        NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);

    sum_8(a, b, &sum_row_a);
    sum_8(a + stride, b + width, &sum_row_b);
    sum_row_c = _mm_adds_epu16(sum_row_a, sum_row_b);
    sum_row_c = average_8(sum_row_c, mul_constants, strength, rounding, weight);
    accumulate_and_store_8(sum_row_c, b, count, accumulator);

    // 'a' moves past the two rows already summed; the output pointers move
    // one row at a time.
    a += stride + stride;
    b += width;
    count += width;
    accumulator += width;

    // Middle rows: full neighborhoods of 9 values (6 at the side edges).
    mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9,
                                   NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                   NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                   NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6);

    for (h = 0; h < height - 2; ++h) {
      // Row sums for the row *below* the current output row.
      sum_8(a, b + width, &sum_row_c);
      sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b);
      sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_c);
      sum_row_a =
          average_8(sum_row_a, mul_constants, strength, rounding, weight);
      accumulate_and_store_8(sum_row_a, b, count, accumulator);

      a += stride;
      b += width;
      count += width;
      accumulator += width;

      // Slide the three-row window down by one row.
      sum_row_a = sum_row_b;
      sum_row_b = sum_row_c;
    }

    // Bottom row: no row below; same constants as the top row.
    mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6,
                                   NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                   NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                   NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
    sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b);
    sum_row_a = average_8(sum_row_a, mul_constants, strength, rounding, weight);
    accumulate_and_store_8(sum_row_a, b, count, accumulator);

  } else {  // width == 16
    // Same structure as the width == 8 path, with each row split across two
    // registers (low/high 8 lanes).
    __m128i sum_row_a_0, sum_row_a_1;
    __m128i sum_row_b_0, sum_row_b_1;
    __m128i sum_row_c_0, sum_row_c_1;
    // Top row: corner constants at lane 0 of the low half and lane 7 of the
    // high half.
    __m128i mul_constants_0 = _mm_setr_epi16(
        NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
        NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
        NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6),
            mul_constants_1 = _mm_setr_epi16(
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);

    sum_16(a, b, &sum_row_a_0, &sum_row_a_1);
    sum_16(a + stride, b + width, &sum_row_b_0, &sum_row_b_1);

    sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
    sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);

    average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1,
               strength, rounding, weight);
    accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator);

    // 'a' moves past the two rows already summed.
    a += stride + stride;
    b += width;
    count += width;
    accumulator += width;

    // Middle rows: 9-value neighborhoods, 6 at the side edges.
    mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9);
    mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6);
    for (h = 0; h < height - 2; ++h) {
      // Row sums for the row below the current output row.
      sum_16(a, b + width, &sum_row_c_0, &sum_row_c_1);

      sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
      sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_c_0);
      sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
      sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_c_1);

      average_16(&sum_row_a_0, &sum_row_a_1, mul_constants_0, mul_constants_1,
                 strength, rounding, weight);
      accumulate_and_store_16(sum_row_a_0, sum_row_a_1, b, count, accumulator);

      a += stride;
      b += width;
      count += width;
      accumulator += width;

      // Slide the three-row window down by one row.
      sum_row_a_0 = sum_row_b_0;
      sum_row_a_1 = sum_row_b_1;
      sum_row_b_0 = sum_row_c_0;
      sum_row_b_1 = sum_row_c_1;
    }

    // Bottom row: no row below; same constants as the top row.
    mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6);
    mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
    sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
    sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);

    average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1,
               strength, rounding, weight);
    accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator);
  }
}