diff options
Diffstat (limited to 'libvpx/vp9/encoder/x86/vp9_dct_sse2.c')
-rw-r--r-- | libvpx/vp9/encoder/x86/vp9_dct_sse2.c | 244 |
1 files changed, 226 insertions, 18 deletions
diff --git a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c index 686582238..487deef42 100644 --- a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c +++ b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c @@ -12,6 +12,35 @@ #include "vp9/common/vp9_idct.h" // for cospi constants #include "vpx_ports/mem.h" +void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride) { + __m128i in0, in1; + __m128i tmp; + const __m128i zero = _mm_setzero_si128(); + in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *) + (input + 2 * stride))); + in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *) + (input + 3 * stride))); + + tmp = _mm_add_epi16(in0, in1); + in0 = _mm_unpacklo_epi16(zero, tmp); + in1 = _mm_unpackhi_epi16(zero, tmp); + in0 = _mm_srai_epi32(in0, 16); + in1 = _mm_srai_epi32(in1, 16); + + tmp = _mm_add_epi32(in0, in1); + in0 = _mm_unpacklo_epi32(tmp, zero); + in1 = _mm_unpackhi_epi32(tmp, zero); + + tmp = _mm_add_epi32(in0, in1); + in0 = _mm_srli_si128(tmp, 8); + + in1 = _mm_add_epi32(tmp, in0); + in0 = _mm_slli_epi32(in1, 1); + _mm_store_si128((__m128i *)(output), in0); +} + void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) { // This 2D transform implements 4 vertical 1D transforms followed // by 4 horizontal 1D transforms. The multiplies and adds are as given @@ -377,6 +406,46 @@ void vp9_fht4x4_sse2(const int16_t *input, int16_t *output, } } +void vp9_fdct8x8_1_sse2(const int16_t *input, int16_t *output, int stride) { + __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); + __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); + __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); + __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); + __m128i u0, u1, sum; + + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + + in0 = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in1 = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in2 = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in3 = _mm_load_si128((const __m128i *)(input + 7 * stride)); + + sum = _mm_add_epi16(u0, u1); + + in0 = _mm_add_epi16(in0, in1); + in2 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, in0); + + u0 = _mm_setzero_si128(); + sum = _mm_add_epi16(sum, in2); + + in0 = _mm_unpacklo_epi16(u0, sum); + in1 = _mm_unpackhi_epi16(u0, sum); + in0 = _mm_srai_epi32(in0, 16); + in1 = _mm_srai_epi32(in1, 16); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_unpacklo_epi32(sum, u0); + in1 = _mm_unpackhi_epi32(sum, u0); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_srli_si128(sum, 8); + + in1 = _mm_add_epi32(sum, in0); + _mm_store_si128((__m128i *)(output), in1); +} + void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) { int pass; // Constants @@ -1168,6 +1237,74 @@ void vp9_fht8x8_sse2(const int16_t *input, int16_t *output, } } +void vp9_fdct16x16_1_sse2(const int16_t *input, int16_t *output, int stride) { + __m128i in0, in1, in2, in3; + __m128i u0, u1; + __m128i sum = _mm_setzero_si128(); + int i; + + for (i = 0; i < 2; ++i) { + input += 8 * i; + in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); + in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); + + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in1 = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in2 = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in3 = _mm_load_si128((const __m128i *)(input + 7 * stride)); + + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 8 * stride)); + in1 = _mm_load_si128((const __m128i *)(input + 9 * stride)); + in2 = _mm_load_si128((const __m128i *)(input + 10 * stride)); + in3 = _mm_load_si128((const __m128i *)(input + 11 * stride)); + + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 12 * stride)); + in1 = _mm_load_si128((const __m128i *)(input + 13 * stride)); + in2 = _mm_load_si128((const __m128i *)(input + 14 * stride)); + in3 = _mm_load_si128((const __m128i *)(input + 15 * stride)); + + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + sum = _mm_add_epi16(sum, u1); + } + + u0 = _mm_setzero_si128(); + in0 = _mm_unpacklo_epi16(u0, sum); + in1 = _mm_unpackhi_epi16(u0, sum); + in0 = _mm_srai_epi32(in0, 16); + in1 = _mm_srai_epi32(in1, 16); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_unpacklo_epi32(sum, u0); + in1 = _mm_unpackhi_epi32(sum, u0); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_srli_si128(sum, 8); + + in1 = _mm_add_epi32(sum, in0); + in1 = _mm_srai_epi32(in1, 1); + _mm_store_si128((__m128i *)(output), in1); +} + void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose @@ -1187,7 +1324,7 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); @@ -1513,8 +1650,8 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24); - const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08); - const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08); + const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p24_p08); + const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p24_p08); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); @@ -1535,8 +1672,8 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08); - const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24); - const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24); + const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p08_m24); + const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p08_m24); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); @@ -1554,10 +1691,10 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { { step1_0 = _mm_add_epi16(step3_0, step2_1); step1_1 = _mm_sub_epi16(step3_0, step2_1); - step1_2 = _mm_sub_epi16(step3_3, step2_2); - step1_3 = _mm_add_epi16(step3_3, step2_2); - step1_4 = _mm_add_epi16(step3_4, step2_5); - step1_5 = _mm_sub_epi16(step3_4, step2_5); + step1_2 = _mm_add_epi16(step3_3, step2_2); + step1_3 = _mm_sub_epi16(step3_3, step2_2); + step1_4 = _mm_sub_epi16(step3_4, step2_5); + step1_5 = _mm_add_epi16(step3_4, step2_5); step1_6 = _mm_sub_epi16(step3_7, step2_6); step1_7 = _mm_add_epi16(step3_7, step2_6); } @@ -1848,7 +1985,7 @@ void fdct16_8col(__m128i *in) { const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); @@ -2052,10 +2189,10 @@ void fdct16_8col(__m128i *in) { v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24); v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24); - v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08); - v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08); - v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24); - v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24); + v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08); + v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08); + v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24); + v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24); v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08); v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08); @@ -2085,10 +2222,10 @@ void fdct16_8col(__m128i *in) { // stage 5 s[0] = _mm_add_epi16(p[0], t[1]); s[1] = _mm_sub_epi16(p[0], t[1]); - s[2] = _mm_sub_epi16(p[3], t[2]); - s[3] = _mm_add_epi16(p[3], t[2]); - s[4] = _mm_add_epi16(p[4], t[5]); - s[5] = _mm_sub_epi16(p[4], t[5]); + s[2] = _mm_add_epi16(p[3], t[2]); + s[3] = _mm_sub_epi16(p[3], t[2]); + s[4] = _mm_sub_epi16(p[4], t[5]); + s[5] = _mm_add_epi16(p[4], t[5]); s[6] = _mm_sub_epi16(p[7], t[6]); s[7] = _mm_add_epi16(p[7], t[6]); @@ -2680,6 +2817,77 @@ void vp9_fht16x16_sse2(const int16_t *input, int16_t *output, } } +void vp9_fdct32x32_1_sse2(const int16_t *input, int16_t *output, int stride) { + __m128i in0, in1, in2, in3; + __m128i u0, u1; + __m128i sum = _mm_setzero_si128(); + int i; + + for (i = 0; i < 8; ++i) { + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); + + input += stride; + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); + + input += stride; + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); + + input += stride; + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); + + input += stride; + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + sum = _mm_add_epi16(sum, u1); + } + + u0 = _mm_setzero_si128(); + in0 = _mm_unpacklo_epi16(u0, sum); + in1 = _mm_unpackhi_epi16(u0, sum); + in0 = _mm_srai_epi32(in0, 16); + in1 = _mm_srai_epi32(in1, 16); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_unpacklo_epi32(sum, u0); + in1 = _mm_unpackhi_epi32(sum, u0); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_srli_si128(sum, 8); + + in1 = _mm_add_epi32(sum, in0); + in1 = _mm_srai_epi32(in1, 3); + _mm_store_si128((__m128i *)(output), in1); +} + #define FDCT32x32_2D vp9_fdct32x32_rd_sse2 #define FDCT32x32_HIGH_PRECISION 0 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" |