aboutsummaryrefslogtreecommitdiff
path: root/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
diff options
context:
space:
mode:
Diffstat (limited to 'libvpx/vp9/encoder/x86/vp9_dct_sse2.c')
-rw-r--r--libvpx/vp9/encoder/x86/vp9_dct_sse2.c244
1 files changed, 226 insertions, 18 deletions
diff --git a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
index 686582238..487deef42 100644
--- a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
@@ -12,6 +12,35 @@
#include "vp9/common/vp9_idct.h" // for cospi constants
#include "vpx_ports/mem.h"
+void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride) {
+ __m128i in0, in1;
+ __m128i tmp;
+ const __m128i zero = _mm_setzero_si128();
+ in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
+ (input + 2 * stride)));
+ in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
+ (input + 3 * stride)));
+
+ tmp = _mm_add_epi16(in0, in1);
+ in0 = _mm_unpacklo_epi16(zero, tmp);
+ in1 = _mm_unpackhi_epi16(zero, tmp);
+ in0 = _mm_srai_epi32(in0, 16);
+ in1 = _mm_srai_epi32(in1, 16);
+
+ tmp = _mm_add_epi32(in0, in1);
+ in0 = _mm_unpacklo_epi32(tmp, zero);
+ in1 = _mm_unpackhi_epi32(tmp, zero);
+
+ tmp = _mm_add_epi32(in0, in1);
+ in0 = _mm_srli_si128(tmp, 8);
+
+ in1 = _mm_add_epi32(tmp, in0);
+ in0 = _mm_slli_epi32(in1, 1);
+ _mm_store_si128((__m128i *)(output), in0);
+}
+
void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
// This 2D transform implements 4 vertical 1D transforms followed
// by 4 horizontal 1D transforms. The multiplies and adds are as given
@@ -377,6 +406,46 @@ void vp9_fht4x4_sse2(const int16_t *input, int16_t *output,
}
}
+void vp9_fdct8x8_1_sse2(const int16_t *input, int16_t *output, int stride) {
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i u0, u1, sum;
+
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+
+ sum = _mm_add_epi16(u0, u1);
+
+ in0 = _mm_add_epi16(in0, in1);
+ in2 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, in0);
+
+ u0 = _mm_setzero_si128();
+ sum = _mm_add_epi16(sum, in2);
+
+ in0 = _mm_unpacklo_epi16(u0, sum);
+ in1 = _mm_unpackhi_epi16(u0, sum);
+ in0 = _mm_srai_epi32(in0, 16);
+ in1 = _mm_srai_epi32(in1, 16);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_unpacklo_epi32(sum, u0);
+ in1 = _mm_unpackhi_epi32(sum, u0);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_srli_si128(sum, 8);
+
+ in1 = _mm_add_epi32(sum, in0);
+ _mm_store_si128((__m128i *)(output), in1);
+}
+
void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
int pass;
// Constants
@@ -1168,6 +1237,74 @@ void vp9_fht8x8_sse2(const int16_t *input, int16_t *output,
}
}
+void vp9_fdct16x16_1_sse2(const int16_t *input, int16_t *output, int stride) {
+ __m128i in0, in1, in2, in3;
+ __m128i u0, u1;
+ __m128i sum = _mm_setzero_si128();
+ int i;
+
+ for (i = 0; i < 2; ++i) {
+ input += 8 * i;
+ in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 8 * stride));
+ in1 = _mm_load_si128((const __m128i *)(input + 9 * stride));
+ in2 = _mm_load_si128((const __m128i *)(input + 10 * stride));
+ in3 = _mm_load_si128((const __m128i *)(input + 11 * stride));
+
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 12 * stride));
+ in1 = _mm_load_si128((const __m128i *)(input + 13 * stride));
+ in2 = _mm_load_si128((const __m128i *)(input + 14 * stride));
+ in3 = _mm_load_si128((const __m128i *)(input + 15 * stride));
+
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ sum = _mm_add_epi16(sum, u1);
+ }
+
+ u0 = _mm_setzero_si128();
+ in0 = _mm_unpacklo_epi16(u0, sum);
+ in1 = _mm_unpackhi_epi16(u0, sum);
+ in0 = _mm_srai_epi32(in0, 16);
+ in1 = _mm_srai_epi32(in1, 16);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_unpacklo_epi32(sum, u0);
+ in1 = _mm_unpackhi_epi32(sum, u0);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_srli_si128(sum, 8);
+
+ in1 = _mm_add_epi32(sum, in0);
+ in1 = _mm_srai_epi32(in1, 1);
+ _mm_store_si128((__m128i *)(output), in1);
+}
+
void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
@@ -1187,7 +1324,7 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
@@ -1513,8 +1650,8 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24);
const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p24_p08);
// dct_const_round_shift
const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
@@ -1535,8 +1672,8 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08);
const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p08_m24);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p08_m24);
// dct_const_round_shift
const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
@@ -1554,10 +1691,10 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
{
step1_0 = _mm_add_epi16(step3_0, step2_1);
step1_1 = _mm_sub_epi16(step3_0, step2_1);
- step1_2 = _mm_sub_epi16(step3_3, step2_2);
- step1_3 = _mm_add_epi16(step3_3, step2_2);
- step1_4 = _mm_add_epi16(step3_4, step2_5);
- step1_5 = _mm_sub_epi16(step3_4, step2_5);
+ step1_2 = _mm_add_epi16(step3_3, step2_2);
+ step1_3 = _mm_sub_epi16(step3_3, step2_2);
+ step1_4 = _mm_sub_epi16(step3_4, step2_5);
+ step1_5 = _mm_add_epi16(step3_4, step2_5);
step1_6 = _mm_sub_epi16(step3_7, step2_6);
step1_7 = _mm_add_epi16(step3_7, step2_6);
}
@@ -1848,7 +1985,7 @@ void fdct16_8col(__m128i *in) {
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
@@ -2052,10 +2189,10 @@ void fdct16_8col(__m128i *in) {
v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
- v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08);
- v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08);
- v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24);
- v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24);
+ v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08);
+ v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24);
v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
@@ -2085,10 +2222,10 @@ void fdct16_8col(__m128i *in) {
// stage 5
s[0] = _mm_add_epi16(p[0], t[1]);
s[1] = _mm_sub_epi16(p[0], t[1]);
- s[2] = _mm_sub_epi16(p[3], t[2]);
- s[3] = _mm_add_epi16(p[3], t[2]);
- s[4] = _mm_add_epi16(p[4], t[5]);
- s[5] = _mm_sub_epi16(p[4], t[5]);
+ s[2] = _mm_add_epi16(p[3], t[2]);
+ s[3] = _mm_sub_epi16(p[3], t[2]);
+ s[4] = _mm_sub_epi16(p[4], t[5]);
+ s[5] = _mm_add_epi16(p[4], t[5]);
s[6] = _mm_sub_epi16(p[7], t[6]);
s[7] = _mm_add_epi16(p[7], t[6]);
@@ -2680,6 +2817,77 @@ void vp9_fht16x16_sse2(const int16_t *input, int16_t *output,
}
}
+void vp9_fdct32x32_1_sse2(const int16_t *input, int16_t *output, int stride) {
+ __m128i in0, in1, in2, in3;
+ __m128i u0, u1;
+ __m128i sum = _mm_setzero_si128();
+ int i;
+
+ for (i = 0; i < 8; ++i) {
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+ input += stride;
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+ input += stride;
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+ input += stride;
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+ input += stride;
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ sum = _mm_add_epi16(sum, u1);
+ }
+
+ u0 = _mm_setzero_si128();
+ in0 = _mm_unpacklo_epi16(u0, sum);
+ in1 = _mm_unpackhi_epi16(u0, sum);
+ in0 = _mm_srai_epi32(in0, 16);
+ in1 = _mm_srai_epi32(in1, 16);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_unpacklo_epi32(sum, u0);
+ in1 = _mm_unpackhi_epi32(sum, u0);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_srli_si128(sum, 8);
+
+ in1 = _mm_add_epi32(sum, in0);
+ in1 = _mm_srai_epi32(in1, 3);
+ _mm_store_si128((__m128i *)(output), in1);
+}
+
#define FDCT32x32_2D vp9_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c"