Diffstat (limited to 'renderscript-toolkit/src/main/cpp/x86.cpp')
-rw-r--r-- | renderscript-toolkit/src/main/cpp/x86.cpp | 1321 |
1 files changed, 1321 insertions, 0 deletions
diff --git a/renderscript-toolkit/src/main/cpp/x86.cpp b/renderscript-toolkit/src/main/cpp/x86.cpp
new file mode 100644
index 0000000..ac3df27
--- /dev/null
+++ b/renderscript-toolkit/src/main/cpp/x86.cpp
@@ -0,0 +1,1321 @@
/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <stdint.h>
#include <x86intrin.h>

namespace renderscript {

/* Unsigned-extend packed 8-bit integers (in the LSBs) into packed 32-bit integers */
static inline __m128i cvtepu8_epi32(__m128i x) {
#if defined(__SSE4_1__)
    return _mm_cvtepu8_epi32(x);
#elif defined(__SSSE3__)
    const __m128i M8to32 = _mm_set_epi32(0xffffff03, 0xffffff02, 0xffffff01, 0xffffff00);
    x = _mm_shuffle_epi8(x, M8to32);
    return x;
#else
# error "Require at least SSSE3"
#endif
}

static inline __m128i packus_epi32(__m128i lo, __m128i hi) {
#if defined(__SSE4_1__)
    return _mm_packus_epi32(lo, hi);
#elif defined(__SSSE3__)
    const __m128i C0 = _mm_set_epi32(0x0000, 0x0000, 0x0000, 0x0000);
    const __m128i C1 = _mm_set_epi32(0xffff, 0xffff, 0xffff, 0xffff);
    const __m128i M32to16L = _mm_set_epi32(0xffffffff, 0xffffffff, 0x0d0c0908, 0x05040100);
    const __m128i M32to16H = _mm_set_epi32(0x0d0c0908, 0x05040100, 0xffffffff, 0xffffffff);
    lo = _mm_and_si128(lo, _mm_cmpgt_epi32(lo, C0));
    lo = _mm_or_si128(lo, _mm_cmpgt_epi32(lo, C1));
    hi = _mm_and_si128(hi, _mm_cmpgt_epi32(hi, C0));
    hi = _mm_or_si128(hi, _mm_cmpgt_epi32(hi, C1));
    return _mm_or_si128(_mm_shuffle_epi8(lo, M32to16L),
                        _mm_shuffle_epi8(hi, M32to16H));
#else
# error "Require at least SSSE3"
#endif
}

static inline __m128i mullo_epi32(__m128i x, __m128i y) {
#if defined(__SSE4_1__)
    return _mm_mullo_epi32(x, y);
#elif defined(__SSSE3__)
    const __m128i Meven = _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0xffffffff);
    __m128i even = _mm_mul_epu32(x, y);
    __m128i odd = _mm_mul_epu32(_mm_srli_si128(x, 4),
                                _mm_srli_si128(y, 4));
    even = _mm_and_si128(even, Meven);
    odd = _mm_and_si128(odd, Meven);
    return _mm_or_si128(even, _mm_slli_si128(odd, 4));
#else
# error "Require at least SSSE3"
#endif
}

/* 'mask' must be packed 8-bit values of 0x00 or 0xff */
static inline __m128i blendv_epi8(__m128i x, __m128i y, __m128i mask) {
#if defined(__SSE4_1__)
    return _mm_blendv_epi8(x, y, mask);
#elif defined(__SSSE3__)
    return _mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(y, mask));
#else
# error "Require at least SSSE3"
#endif
}
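/*
 * Illustrative sketch (editor's addition, not part of the original commit):
 * the SSSE3 fallback of packus_epi32() above clamps each signed 32-bit lane
 * to [0, 65535] before narrowing, matching SSE4.1 _mm_packus_epi32. A quick
 * self-check, assuming the helper exactly as defined above:
 */
static inline void example_packus_epi32_check() {
    __m128i lo = _mm_set_epi32(70000, -5, 300, 0);  // lanes 3..0; out-of-range lanes must clamp
    __m128i hi = _mm_set_epi32(1, 2, 3, 4);
    unsigned short out[8];
    _mm_storeu_si128((__m128i *)out, packus_epi32(lo, hi));
    // out is now {0, 300, 0, 65535, 4, 3, 2, 1}
}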
extern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0,
                                          const void *y1, const void *y2,
                                          const short *coef, uint32_t count) {
    __m128i x;
    __m128i c0, c2, c4, c6, c8;
    __m128i r0, r1, r2;
    __m128i p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11;
    __m128i o0, o1;
    uint32_t i;

    x = _mm_loadl_epi64((const __m128i *)(coef+0));
    c0 = _mm_shuffle_epi32(x, 0x00);
    c2 = _mm_shuffle_epi32(x, 0x55);
    x = _mm_loadl_epi64((const __m128i *)(coef+4));
    c4 = _mm_shuffle_epi32(x, 0x00);
    c6 = _mm_shuffle_epi32(x, 0x55);
    x = _mm_loadl_epi64((const __m128i *)(coef+8));
    c8 = _mm_shuffle_epi32(x, 0x00);

    for (i = 0; i < count; ++i) {
        p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0)), _mm_setzero_si128());
        p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
        p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
        p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
        p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
        p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
        p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
        p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
        p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
        p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
        p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
        p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());

        o0 = _mm_madd_epi16(_mm_unpacklo_epi16(p0, p1), c0);
        o1 = _mm_madd_epi16(_mm_unpacklo_epi16(p1, p2), c0);

        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p2, p4), c2));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p3, p5), c2));

        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p5, p6), c4));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p6, p7), c4));

        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p8, p9), c6));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p9, p10), c6));

        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p10, _mm_setzero_si128()), c8));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p11, _mm_setzero_si128()), c8));

        o0 = _mm_srai_epi32(o0, 8);
        o1 = _mm_srai_epi32(o1, 8);

        o0 = packus_epi32(o0, o1);
        o0 = _mm_packus_epi16(o0, o0);
        _mm_storel_epi64((__m128i *)dst, o0);

        y0 = (const char *)y0 + 8;
        y1 = (const char *)y1 + 8;
        y2 = (const char *)y2 + 8;
        dst = (char *)dst + 8;
    }
}

void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
                                  const short *coef, uint32_t count) {
    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                      14, 10, 6, 2,
                                      13, 9, 5, 1,
                                      12, 8, 4, 0);
    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
    __m128i c0, c1, c2, c3;
    __m128i i4, o4;
    __m128i xy, zw;
    __m128i x2, y2, z2, w2;
    uint32_t i;

    c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
    c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
    c0 = _mm_unpacklo_epi16(c0, c1);

    c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
    c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
    c2 = _mm_unpacklo_epi16(c2, c3);

    for (i = 0; i < count; ++i) {
        i4 = _mm_load_si128((const __m128i *)src);
        xy = _mm_shuffle_epi8(i4, Mxy);
        zw = _mm_shuffle_epi8(i4, Mzw);

        x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
        y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
        z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));
        w2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xff));

        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
        y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
        z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));
        w2 = _mm_add_epi32(w2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xff)));

        x2 = _mm_srai_epi32(x2, 8);
        y2 = _mm_srai_epi32(y2, 8);
        z2 = _mm_srai_epi32(z2, 8);
        w2 = _mm_srai_epi32(w2, 8);

        x2 = packus_epi32(x2, y2);
        z2 = packus_epi32(z2, w2);
        o4 = _mm_packus_epi16(x2, z2);

        o4 = _mm_shuffle_epi8(o4, T4x4);
        _mm_storeu_si128((__m128i *)dst, o4);

        src = (const char *)src + 16;
        dst = (char *)dst + 16;
    }
}
void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
                                  const short *coef, uint32_t count) {
    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                      14, 10, 6, 2,
                                      13, 9, 5, 1,
                                      12, 8, 4, 0);
    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
    __m128i c0, c1, c2, c3;
    __m128i i4, o4;
    __m128i xy, zw;
    __m128i x2, y2, z2, w2;
    uint32_t i;

    c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
    c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
    c0 = _mm_unpacklo_epi16(c0, c1);

    c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
    c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
    c2 = _mm_unpacklo_epi16(c2, c3);

    for (i = 0; i < count; ++i) {
        i4 = _mm_loadu_si128((const __m128i *)src);
        xy = _mm_shuffle_epi8(i4, Mxy);
        zw = _mm_shuffle_epi8(i4, Mzw);

        x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
        y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
        z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));

        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
        y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
        z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));

        x2 = _mm_srai_epi32(x2, 8);
        y2 = _mm_srai_epi32(y2, 8);
        z2 = _mm_srai_epi32(z2, 8);
        w2 = _mm_srli_epi32(zw, 16);

        x2 = packus_epi32(x2, y2);
        z2 = packus_epi32(z2, w2);
        o4 = _mm_packus_epi16(x2, z2);

        o4 = _mm_shuffle_epi8(o4, T4x4);
        _mm_storeu_si128((__m128i *)dst, o4);

        src = (const char *)src + 16;
        dst = (char *)dst + 16;
    }
}

void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
                                  const short *coef, uint32_t count) {
    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                      14, 10, 6, 2,
                                      13, 9, 5, 1,
                                      12, 8, 4, 0);
    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
    __m128i c0, c1, c2, c3;
    __m128i i4, o4;
    __m128i xy, zw;
    __m128i x2, y2, z2, w2;
    uint32_t i;

    c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
    c0 = _mm_shufflelo_epi16(c0, 0);
    c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
    c1 = _mm_shufflelo_epi16(c1, 0);
    c0 = _mm_unpacklo_epi16(c0, c1);

    c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
    c2 = _mm_shufflelo_epi16(c2, 0);
    c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
    c3 = _mm_shufflelo_epi16(c3, 0);
    c2 = _mm_unpacklo_epi16(c2, c3);

    for (i = 0; i < count; ++i) {
        i4 = _mm_loadu_si128((const __m128i *)src);

        xy = _mm_shuffle_epi8(i4, Mxy);
        zw = _mm_shuffle_epi8(i4, Mzw);

        x2 = _mm_madd_epi16(xy, c0);
        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, c2));

        x2 = _mm_srai_epi32(x2, 8);
        y2 = x2;
        z2 = x2;
        w2 = _mm_srli_epi32(zw, 16);

        x2 = packus_epi32(x2, y2);
        z2 = packus_epi32(z2, w2);
        o4 = _mm_packus_epi16(x2, z2);

        o4 = _mm_shuffle_epi8(o4, T4x4);
        _mm_storeu_si128((__m128i *)dst, o4);

        src = (const char *)src + 16;
        dst = (char *)dst + 16;
    }
}
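/*
 * Illustrative sketch (editor's addition, not part of the original commit):
 * judging from the interleaved coefficient loads above, coef[] holds signed
 * fixed-point entries scaled by 256, with coef[4*j + i] weighting input
 * channel j into output channel i; sums are shifted right by 8 afterwards.
 * An identity pass-through would look like this (note the 4x4 kernel reads
 * src with an aligned load, and each iteration consumes 16 bytes / 4 pixels):
 */
static inline void example_colormatrix_identity(void *dst, const void *src, uint32_t count) {
    const short coef[16] = {
        256, 0, 0, 0,    // contributions of input R to outputs R, G, B, A
        0, 256, 0, 0,    // input G
        0, 0, 256, 0,    // input B
        0, 0, 0, 256,    // input A
    };
    rsdIntrinsicColorMatrix4x4_K(dst, src, coef, count);
}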
void rsdIntrinsicBlurVFU4_K(void *dst,
                            const void *pin, int stride, const void *gptr,
                            int rct, int x1, int x2) {
    const char *pi;
    __m128i pi0, pi1;
    __m128 pf0, pf1;
    __m128 bp0, bp1;
    __m128 x;
    int r;

    for (; x1 < x2; x1 += 2) {
        pi = (const char *)pin + (x1 << 2);
        bp0 = _mm_setzero_ps();
        bp1 = _mm_setzero_ps();

        for (r = 0; r < rct; ++r) {
            x = _mm_load_ss((const float *)gptr + r);
            x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));

            pi0 = _mm_cvtsi32_si128(*(const int *)pi);
            pi1 = _mm_cvtsi32_si128(*((const int *)pi + 1));

            pf0 = _mm_cvtepi32_ps(cvtepu8_epi32(pi0));
            pf1 = _mm_cvtepi32_ps(cvtepu8_epi32(pi1));

            bp0 = _mm_add_ps(bp0, _mm_mul_ps(pf0, x));
            bp1 = _mm_add_ps(bp1, _mm_mul_ps(pf1, x));

            pi += stride;
        }

        _mm_storeu_ps((float *)dst, bp0);
        _mm_storeu_ps((float *)dst + 4, bp1);
        dst = (char *)dst + 32;
    }
}

void rsdIntrinsicBlurHFU4_K(void *dst,
                            const void *pin, const void *gptr,
                            int rct, int x1, int x2) {
    const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
    const float *pi;
    __m128 pf, x, y;
    __m128i o;
    int r;

    for (; x1 < x2; ++x1) {
        /* rct is defined as 2*r+1 by the caller */
        x = _mm_load_ss((const float *)gptr);
        x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));

        pi = (const float *)pin + (x1 << 2);
        pf = _mm_mul_ps(x, _mm_load_ps(pi));

        for (r = 1; r < rct; r += 2) {
            x = _mm_load_ss((const float *)gptr + r);
            y = _mm_load_ss((const float *)gptr + r + 1);
            x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
            y = _mm_shuffle_ps(y, y, _MM_SHUFFLE(0, 0, 0, 0));

            pf = _mm_add_ps(pf, _mm_mul_ps(x, _mm_load_ps(pi + (r << 2))));
            pf = _mm_add_ps(pf, _mm_mul_ps(y, _mm_load_ps(pi + (r << 2) + 4)));
        }

        o = _mm_cvtps_epi32(pf);
        *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
        dst = (char *)dst + 4;
    }
}

void rsdIntrinsicBlurHFU1_K(void *dst,
                            const void *pin, const void *gptr,
                            int rct, int x1, int x2) {
    const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
    const float *pi;
    __m128 pf, g0, g1, g2, g3, gx, p0, p1;
    __m128i o;
    int r;

    for (; x1 < x2; x1 += 4) {
        g0 = _mm_load_ss((const float *)gptr);
        g0 = _mm_shuffle_ps(g0, g0, _MM_SHUFFLE(0, 0, 0, 0));

        pi = (const float *)pin + x1;
        pf = _mm_mul_ps(g0, _mm_loadu_ps(pi));

        for (r = 1; r < rct; r += 4) {
            gx = _mm_loadu_ps((const float *)gptr + r);
            p0 = _mm_loadu_ps(pi + r);
            p1 = _mm_loadu_ps(pi + r + 4);

            g0 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(0, 0, 0, 0));
            pf = _mm_add_ps(pf, _mm_mul_ps(g0, p0));
            g1 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(1, 1, 1, 1));
            pf = _mm_add_ps(pf, _mm_mul_ps(g1, _mm_alignr_epi8(p1, p0, 4)));
            g2 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(2, 2, 2, 2));
            pf = _mm_add_ps(pf, _mm_mul_ps(g2, _mm_alignr_epi8(p1, p0, 8)));
            g3 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(3, 3, 3, 3));
            pf = _mm_add_ps(pf, _mm_mul_ps(g3, _mm_alignr_epi8(p1, p0, 12)));
        }

        o = _mm_cvtps_epi32(pf);
        *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
        dst = (char *)dst + 4;
    }
}
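/*
 * Illustrative sketch (editor's addition, not part of the original commit):
 * the blur is separable and two-pass. rsdIntrinsicBlurVFU4_K accumulates a
 * vertical window of RGBA8 rows against the float gaussian table into a
 * float4-per-pixel scratch row; rsdIntrinsicBlurHFU4_K then convolves that
 * row horizontally and repacks to bytes. Hypothetical glue for one output
 * row, with rct = 2*r + 1 weights; the edge padding a real caller must
 * provide around both buffers is omitted here:
 */
static inline void example_blur_row(void *out, const void *in, int stride,
                                    float *scratch, const float *gauss,
                                    int rct, int width) {
    rsdIntrinsicBlurVFU4_K(scratch, in, stride, gauss, rct, 0, width);  // bytes -> float4
    rsdIntrinsicBlurHFU4_K(out, scratch, gauss, rct, 0, width);         // float4 -> bytes
}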
void rsdIntrinsicYuv_K(void *dst,
                       const unsigned char *pY, const unsigned char *pUV,
                       uint32_t count, const short *param) {
    __m128i biasY, biasUV;
    __m128i c0, c1, c2, c3, c4;

    biasY = _mm_set1_epi32(param[8]);   /* 16 */
    biasUV = _mm_set1_epi32(param[16]); /* 128 */

    c0 = _mm_set1_epi32(param[0]); /* 298 */
    c1 = _mm_set1_epi32(param[1]); /* 409 */
    c2 = _mm_set1_epi32(param[2]); /* -100 */
    c3 = _mm_set1_epi32(param[3]); /* 516 */
    c4 = _mm_set1_epi32(param[4]); /* -208 */

    __m128i Y, UV, U, V, R, G, B, A;

    A = _mm_set1_epi32(255);
    uint32_t i;

    for (i = 0; i < (count << 1); ++i) {
        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
        UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));

        Y = _mm_sub_epi32(Y, biasY);
        UV = _mm_sub_epi32(UV, biasUV);

        U = _mm_shuffle_epi32(UV, 0xf5);
        V = _mm_shuffle_epi32(UV, 0xa0);

        Y = mullo_epi32(Y, c0);

        R = _mm_add_epi32(Y, mullo_epi32(V, c1));
        R = _mm_add_epi32(R, biasUV);
        R = _mm_srai_epi32(R, 8);

        G = _mm_add_epi32(Y, mullo_epi32(U, c2));
        G = _mm_add_epi32(G, mullo_epi32(V, c4));
        G = _mm_add_epi32(G, biasUV);
        G = _mm_srai_epi32(G, 8);

        B = _mm_add_epi32(Y, mullo_epi32(U, c3));
        B = _mm_add_epi32(B, biasUV);
        B = _mm_srai_epi32(B, 8);

        __m128i y1, y2, y3, y4;

        y1 = packus_epi32(R, G);
        y2 = packus_epi32(B, A);
        y3 = _mm_packus_epi16(y1, y2);
        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                          14, 10, 6, 2,
                                          13, 9, 5, 1,
                                          12, 8, 4, 0);
        y4 = _mm_shuffle_epi8(y3, T4x4);
        _mm_storeu_si128((__m128i *)dst, y4);
        pY += 4;
        pUV += 4;
        dst = (__m128i *)dst + 1;
    }
}

void rsdIntrinsicYuvR_K(void *dst,
                        const unsigned char *pY, const unsigned char *pUV,
                        uint32_t count, const short *param) {
    __m128i biasY, biasUV;
    __m128i c0, c1, c2, c3, c4;

    biasY = _mm_set1_epi32(param[8]);   /* 16 */
    biasUV = _mm_set1_epi32(param[16]); /* 128 */

    c0 = _mm_set1_epi32(param[0]); /* 298 */
    c1 = _mm_set1_epi32(param[1]); /* 409 */
    c2 = _mm_set1_epi32(param[2]); /* -100 */
    c3 = _mm_set1_epi32(param[3]); /* 516 */
    c4 = _mm_set1_epi32(param[4]); /* -208 */

    __m128i Y, UV, U, V, R, G, B, A;

    A = _mm_set1_epi32(255);
    uint32_t i;

    for (i = 0; i < (count << 1); ++i) {
        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
        UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));

        Y = _mm_sub_epi32(Y, biasY);
        UV = _mm_sub_epi32(UV, biasUV);

        V = _mm_shuffle_epi32(UV, 0xf5);
        U = _mm_shuffle_epi32(UV, 0xa0);

        Y = mullo_epi32(Y, c0);

        R = _mm_add_epi32(Y, mullo_epi32(V, c1));
        R = _mm_add_epi32(R, biasUV);
        R = _mm_srai_epi32(R, 8);

        G = _mm_add_epi32(Y, mullo_epi32(U, c2));
        G = _mm_add_epi32(G, mullo_epi32(V, c4));
        G = _mm_add_epi32(G, biasUV);
        G = _mm_srai_epi32(G, 8);

        B = _mm_add_epi32(Y, mullo_epi32(U, c3));
        B = _mm_add_epi32(B, biasUV);
        B = _mm_srai_epi32(B, 8);

        __m128i y1, y2, y3, y4;

        y1 = packus_epi32(R, G);
        y2 = packus_epi32(B, A);
        y3 = _mm_packus_epi16(y1, y2);
        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                          14, 10, 6, 2,
                                          13, 9, 5, 1,
                                          12, 8, 4, 0);
        y4 = _mm_shuffle_epi8(y3, T4x4);
        _mm_storeu_si128((__m128i *)dst, y4);
        pY += 4;
        pUV += 4;
        dst = (__m128i *)dst + 1;
    }
}

void rsdIntrinsicYuv2_K(void *dst,
                        const unsigned char *pY, const unsigned char *pU,
                        const unsigned char *pV, uint32_t count, const short *param) {
    __m128i biasY, biasUV;
    __m128i c0, c1, c2, c3, c4;

    biasY = _mm_set1_epi32(param[8]);   /* 16 */
    biasUV = _mm_set1_epi32(param[16]); /* 128 */

    c0 = _mm_set1_epi32(param[0]); /* 298 */
    c1 = _mm_set1_epi32(param[1]); /* 409 */
    c2 = _mm_set1_epi32(param[2]); /* -100 */
    c3 = _mm_set1_epi32(param[3]); /* 516 */
    c4 = _mm_set1_epi32(param[4]); /* -208 */

    __m128i Y, U, V, R, G, B, A;

    A = _mm_set1_epi32(255);
    uint32_t i;

    for (i = 0; i < (count << 1); ++i) {
        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
        U = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pU));
        V = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pV));

        Y = _mm_sub_epi32(Y, biasY);
        U = _mm_sub_epi32(U, biasUV);
        V = _mm_sub_epi32(V, biasUV);

        Y = mullo_epi32(Y, c0);

        R = _mm_add_epi32(Y, mullo_epi32(V, c1));
        R = _mm_add_epi32(R, biasUV);
        R = _mm_srai_epi32(R, 8);

        G = _mm_add_epi32(Y, mullo_epi32(U, c2));
        G = _mm_add_epi32(G, mullo_epi32(V, c4));
        G = _mm_add_epi32(G, biasUV);
        G = _mm_srai_epi32(G, 8);

        B = _mm_add_epi32(Y, mullo_epi32(U, c3));
        B = _mm_add_epi32(B, biasUV);
        B = _mm_srai_epi32(B, 8);

        __m128i y1, y2, y3, y4;

        y1 = packus_epi32(R, G);
        y2 = packus_epi32(B, A);
        y3 = _mm_packus_epi16(y1, y2);
        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                          14, 10, 6, 2,
                                          13, 9, 5, 1,
                                          12, 8, 4, 0);
        y4 = _mm_shuffle_epi8(y3, T4x4);
        _mm_storeu_si128((__m128i *)dst, y4);
        pY += 4;
        pU += 4;
        pV += 4;
        dst = (__m128i *)dst + 1;
    }
}
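/*
 * Illustrative sketch (editor's addition, not part of the original commit):
 * a scalar reference for one pixel using the coefficients the param[]
 * comments above name (298, 409, -100, 516, -208; biases 16 and 128). As in
 * the vector code, each channel adds the 128 bias before the >> 8; the
 * clamp that _mm_packus_epi16 supplies there is written out explicitly:
 */
static inline unsigned char example_clamp255(int v) {
    return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static inline void example_yuv_to_rgba(unsigned char y, unsigned char u, unsigned char v,
                                       unsigned char *rgba) {
    const int Y = (y - 16) * 298;
    const int U = u - 128;
    const int V = v - 128;
    rgba[0] = example_clamp255((Y + 409 * V + 128) >> 8);            /* R */
    rgba[1] = example_clamp255((Y - 100 * U - 208 * V + 128) >> 8);  /* G */
    rgba[2] = example_clamp255((Y + 516 * U + 128) >> 8);            /* B */
    rgba[3] = 255;                                                   /* A */
}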
extern "C" void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0,
                                          const void *y1, const void *y2,
                                          const void *y3, const void *y4,
                                          const short *coef, uint32_t count) {
    __m128i x;
    __m128i c0, c2, c4, c6, c8, c10, c12;
    __m128i c14, c16, c18, c20, c22, c24;
    __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9;
    __m128i p0, p1, p2, p3, p4, p5, p6, p7;
    __m128i p8, p9, p10, p11, p12, p13, p14, p15;
    __m128i p16, p17, p18, p19, p20, p21, p22, p23;
    __m128i p24, p25, p26, p27, p28, p29, p30, p31;
    __m128i p32, p33, p34, p35, p36, p37, p38, p39;
    __m128i o0, o1, o2, o3;
    uint32_t i;

    x = _mm_loadl_epi64((const __m128i *)(coef+0));
    c0 = _mm_shuffle_epi32(x, 0x00);
    c2 = _mm_shuffle_epi32(x, 0x55);

    x = _mm_loadl_epi64((const __m128i *)(coef+4));
    c4 = _mm_shuffle_epi32(x, 0x00);
    c6 = _mm_shuffle_epi32(x, 0x55);

    x = _mm_loadl_epi64((const __m128i *)(coef+8));
    c8 = _mm_shuffle_epi32(x, 0x00);
    c10 = _mm_shuffle_epi32(x, 0x55);

    x = _mm_loadl_epi64((const __m128i *)(coef+12));
    c12 = _mm_shuffle_epi32(x, 0x00);
    c14 = _mm_shuffle_epi32(x, 0x55);

    x = _mm_loadl_epi64((const __m128i *)(coef+16));
    c16 = _mm_shuffle_epi32(x, 0x00);
    c18 = _mm_shuffle_epi32(x, 0x55);

    x = _mm_loadl_epi64((const __m128i *)(coef+20));
    c20 = _mm_shuffle_epi32(x, 0x00);
    c22 = _mm_shuffle_epi32(x, 0x55);

    x = _mm_loadl_epi64((const __m128i *)(coef+24));
    c24 = _mm_shuffle_epi32(x, 0x00);

    for (i = 0; i < count; ++i) {
        p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int32_t *)y0), _mm_setzero_si128());
        p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
        p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
        p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
        p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+4)), _mm_setzero_si128());
        p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+5)), _mm_setzero_si128());
        p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+6)), _mm_setzero_si128());
        p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+7)), _mm_setzero_si128());

        p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
        p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
        p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
        p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
        p12 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+4)), _mm_setzero_si128());
        p13 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+5)), _mm_setzero_si128());
        p14 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+6)), _mm_setzero_si128());
        p15 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+7)), _mm_setzero_si128());

        p16 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
        p17 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
        p18 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
        p19 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
        p20 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+4)), _mm_setzero_si128());
        p21 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+5)), _mm_setzero_si128());
        p22 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+6)), _mm_setzero_si128());
        p23 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+7)), _mm_setzero_si128());

        p24 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3)), _mm_setzero_si128());
        p25 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+1)), _mm_setzero_si128());
        p26 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+2)), _mm_setzero_si128());
        p27 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+3)), _mm_setzero_si128());
        p28 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+4)), _mm_setzero_si128());
        p29 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+5)), _mm_setzero_si128());
        p30 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+6)), _mm_setzero_si128());
        p31 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+7)), _mm_setzero_si128());

        p32 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4)), _mm_setzero_si128());
        p33 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+1)), _mm_setzero_si128());
        p34 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+2)), _mm_setzero_si128());
        p35 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+3)), _mm_setzero_si128());
        p36 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+4)), _mm_setzero_si128());
        p37 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+5)), _mm_setzero_si128());
        p38 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+6)), _mm_setzero_si128());
        p39 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+7)), _mm_setzero_si128());

        o0 = _mm_madd_epi16(_mm_unpacklo_epi16(p0, p1), c0);
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p2, p3), c2));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p4, p8), c4));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p9, p10), c6));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p11, p12), c8));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p16, p17), c10));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p18, p19), c12));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p20, p24), c14));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p25, p26), c16));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p27, p28), c18));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p32, p33), c20));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p34, p35), c22));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p36, _mm_setzero_si128()), c24));
        o0 = _mm_srai_epi32(o0, 8);

        o1 = _mm_madd_epi16(_mm_unpacklo_epi16(p1, p2), c0);
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p3, p4), c2));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p5, p9), c4));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p10, p11), c6));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p12, p13), c8));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p17, p18), c10));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p19, p20), c12));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p21, p25), c14));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p26, p27), c16));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p28, p29), c18));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p33, p34), c20));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p35, p36), c22));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p37, _mm_setzero_si128()), c24));
        o1 = _mm_srai_epi32(o1, 8);

        o2 = _mm_madd_epi16(_mm_unpacklo_epi16(p2, p3), c0);
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p4, p5), c2));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p6, p10), c4));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p11, p12), c6));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p13, p14), c8));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p18, p19), c10));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p20, p21), c12));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p22, p26), c14));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p27, p28), c16));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p29, p30), c18));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p34, p35), c20));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p36, p37), c22));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p38, _mm_setzero_si128()), c24));
        o2 = _mm_srai_epi32(o2, 8);

        o3 = _mm_madd_epi16(_mm_unpacklo_epi16(p3, p4), c0);
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p5, p6), c2));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p7, p11), c4));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p12, p13), c6));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p14, p15), c8));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p19, p20), c10));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p21, p22), c12));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p23, p27), c14));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p28, p29), c16));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p30, p31), c18));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p35, p36), c20));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p37, p38), c22));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p39, _mm_setzero_si128()), c24));
        o3 = _mm_srai_epi32(o3, 8);

        o0 = packus_epi32(o0, o1);
        o2 = packus_epi32(o2, o3);
        o0 = _mm_packus_epi16(o0, o2);
        _mm_storeu_si128((__m128i *)dst, o0);

        y0 = (const char *)y0 + 16;
        y1 = (const char *)y1 + 16;
        y2 = (const char *)y2 + 16;
        y3 = (const char *)y3 + 16;
        y4 = (const char *)y4 + 16;
        dst = (char *)dst + 16;
    }
}
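/*
 * Illustrative sketch (editor's addition, not part of the original commit):
 * Convolve5x5 takes 25 coefficients scaled by 256, and the final 8-byte
 * coefficient load above reads coef[24..27], so a caller should pad the
 * array to 28 shorts. Each loop iteration emits 16 bytes (4 pixels). A
 * rough 5x5 box blur (10/256 per tap, total gain 250/256) could look like:
 */
static inline void example_box_blur_5x5(void *dst, const void *y0, const void *y1,
                                        const void *y2, const void *y3, const void *y4,
                                        uint32_t count) {
    short coef[28] = {0};                       // 25 taps used, padded for the last 4-short load
    for (int i = 0; i < 25; ++i) coef[i] = 10;  // ~1/25 in 8-bit fixed point
    rsdIntrinsicConvolve5x5_K(dst, y0, y1, y2, y3, y4, coef, count);
}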
void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8) {
    __m128i all1s, ina, ins;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
        t0 = _mm_srli_epi16(t0, 8);
        t0 = _mm_add_epi16(t0, ins);

        ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
        t1 = _mm_srli_epi16(t1, 8);
        t1 = _mm_add_epi16(t1, ins);

        ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
        t2 = _mm_srli_epi16(t2, 8);
        t2 = _mm_add_epi16(t2, ins);

        ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
        t3 = _mm_srli_epi16(t3, 8);
        t3 = _mm_add_epi16(t3, ins);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8) {
    __m128i all1s, outa, outs;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
        t0 = _mm_srli_epi16(t0, 8);
        t0 = _mm_add_epi16(t0, outs);

        outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
        t1 = _mm_srli_epi16(t1, 8);
        t1 = _mm_add_epi16(t1, outs);

        outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
        t2 = _mm_srli_epi16(t2, 8);
        t2 = _mm_add_epi16(t2, outs);

        outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
        t3 = _mm_srli_epi16(t3, 8);
        t3 = _mm_add_epi16(t3, outs);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}
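/*
 * Illustrative sketch (editor's addition, not part of the original commit):
 * per channel, the SrcOver kernel above computes
 * dst = src + dst * (255 - srcAlpha) / 256 (a >> 8 rather than a /255).
 * A scalar equivalent for one RGBA8 pixel:
 */
static inline void example_blend_src_over(const unsigned char *src, unsigned char *dst) {
    const unsigned sa = src[3];
    for (int c = 0; c < 4; ++c) {
        unsigned v = src[c] + ((dst[c] * (255u - sa)) >> 8);
        dst[c] = (unsigned char)(v > 255u ? 255u : v);  // the vector code clamps via _mm_packus_epi16
    }
}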
void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8) {
    __m128i outa;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, outa);
        t0 = _mm_srli_epi16(t0, 8);

        outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, outa);
        t1 = _mm_srli_epi16(t1, 8);

        outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, outa);
        t2 = _mm_srli_epi16(t2, 8);

        outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, outa);
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8) {
    __m128i ina;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, ina);
        t0 = _mm_srli_epi16(t0, 8);

        ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, ina);
        t1 = _mm_srli_epi16(t1, 8);

        ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, ina);
        t2 = _mm_srli_epi16(t2, 8);

        ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, ina);
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}
void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8) {
    __m128i all1s, outa;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
        t0 = _mm_srli_epi16(t0, 8);

        outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
        t1 = _mm_srli_epi16(t1, 8);

        outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
        t2 = _mm_srli_epi16(t2, 8);

        outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8) {
    __m128i all1s, ina;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
        t0 = _mm_srli_epi16(t0, 8);

        ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
        t1 = _mm_srli_epi16(t1, 8);

        ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
        t2 = _mm_srli_epi16(t2, 8);

        ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}
void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8) {
    const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
    __m128i all1s, ina, outa, ins, outs;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t0 = _mm_sub_epi16(all1s, ina);
        t0 = _mm_mullo_epi16(t0, outs);
        t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(outa, ins));
        t0 = _mm_srli_epi16(t0, 8);

        ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t1 = _mm_sub_epi16(all1s, ina);
        t1 = _mm_mullo_epi16(t1, outs);
        t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(outa, ins));
        t1 = _mm_srli_epi16(t1, 8);

        ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t2 = _mm_sub_epi16(all1s, ina);
        t2 = _mm_mullo_epi16(t2, outs);
        t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(outa, ins));
        t2 = _mm_srli_epi16(t2, 8);

        ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t3 = _mm_sub_epi16(all1s, ina);
        t3 = _mm_mullo_epi16(t3, outs);
        t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(outa, ins));
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t0 = blendv_epi8(t0, out0, M0001);
        t2 = _mm_packus_epi16(t2, t3);
        t2 = blendv_epi8(t2, out1, M0001);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

void rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8) {
    const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
    __m128i all1s, ina, ins, outa, outs;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t0 = _mm_sub_epi16(all1s, outa);
        t0 = _mm_mullo_epi16(t0, ins);
        t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(ina, outs));
        t0 = _mm_srli_epi16(t0, 8);

        ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t1 = _mm_sub_epi16(all1s, outa);
        t1 = _mm_mullo_epi16(t1, ins);
        t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(ina, outs));
        t1 = _mm_srli_epi16(t1, 8);

        ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t2 = _mm_sub_epi16(all1s, outa);
        t2 = _mm_mullo_epi16(t2, ins);
        t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(ina, outs));
        t2 = _mm_srli_epi16(t2, 8);

        ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t3 = _mm_sub_epi16(all1s, outa);
        t3 = _mm_mullo_epi16(t3, ins);
        t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(ina, outs));
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t0 = blendv_epi8(t0, in0, M0001);
        t2 = _mm_packus_epi16(t2, t3);
        t2 = blendv_epi8(t2, in1, M0001);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8) {
    __m128i in0, in1, out0, out1;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        out0 = _mm_xor_si128(out0, in0);
        out1 = _mm_xor_si128(out1, in1);

        _mm_storeu_si128((__m128i *)dst, out0);
        _mm_storeu_si128((__m128i *)dst + 1, out1);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8) {
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, _mm_unpacklo_epi8(out0, _mm_setzero_si128()));
        t0 = _mm_srli_epi16(t0, 8);

        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, _mm_unpackhi_epi8(out0, _mm_setzero_si128()));
        t1 = _mm_srli_epi16(t1, 8);

        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, _mm_unpacklo_epi8(out1, _mm_setzero_si128()));
        t2 = _mm_srli_epi16(t2, 8);

        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, _mm_unpackhi_epi8(out1, _mm_setzero_si128()));
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8) {
    __m128i in0, in1, out0, out1;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        out0 = _mm_adds_epu8(out0, in0);
        out1 = _mm_adds_epu8(out1, in1);

        _mm_storeu_si128((__m128i *)dst, out0);
        _mm_storeu_si128((__m128i *)dst + 1, out1);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}
void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8) {
    __m128i in0, in1, out0, out1;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        out0 = _mm_subs_epu8(out0, in0);
        out1 = _mm_subs_epu8(out1, in1);

        _mm_storeu_si128((__m128i *)dst, out0);
        _mm_storeu_si128((__m128i *)dst + 1, out1);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

} // namespace renderscript