diff options
author | Guilhem IMBERTON <guilhem.imberton@intel.com> | 2014-08-06 20:47:04 +0200 |
---|---|---|
committer | Patrick Tjin <pattjin@google.com> | 2014-08-07 14:31:21 -0700 |
commit | 82b428e49a70ddc051a36d2b3a25d90db79770dc (patch) | |
tree | 3c7387e0ff0d1a4dfebec762a9b0a80f09724ef1 /videodecoder/use_util_sse4.h | |
parent | 4d358311bdb7a2e02671ecf499effeb0262e1fc3 (diff) | |
download | libmix-82b428e49a70ddc051a36d2b3a25d90db79770dc.tar.gz |
Initial libmix commit
Change-Id: I7a0b9afdc83a3274189cef0788c7296a871a3d98
Signed-off-by: Guilhem IMBERTON <guilhem.imberton@intel.com>
Diffstat (limited to 'videodecoder/use_util_sse4.h')
-rw-r--r-- | videodecoder/use_util_sse4.h | 93 |
1 files changed, 93 insertions, 0 deletions
/*
 * Copyright (c) 2009-2011 Intel Corporation.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef USE_UTIL_SSE4_H
#define USE_UTIL_SSE4_H

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <emmintrin.h>
#include <x86intrin.h>

/*
 * _mm_stream_load_si128 is an SSE4.1 intrinsic.  When the including
 * translation unit is not already compiled with SSE4.1 enabled, ask GCC/Clang
 * to enable it for this one function so the header still compiles.  Builds
 * that already pass -msse4.1 are unaffected.
 */
#if defined(__GNUC__) && !defined(__SSE4_1__)
#define STREAM_MEMCPY_TARGET __attribute__((target("sse4.1")))
#else
#define STREAM_MEMCPY_TARGET
#endif

/**
 * Copy `size` bytes from `src_buff` to `dst_buff` using SSE4.1 streaming
 * (non-temporal) loads for the source and ordinary aligned stores for the
 * destination.
 *
 * If either pointer is not 16-byte aligned the call degrades to a plain
 * memcpy().  Otherwise the data is moved in three phases:
 *   1. blocks of 8 x 16 bytes (all eight loads issued before the stores),
 *   2. the remaining whole 16-byte vectors,
 *   3. the final 1..15 bytes, copied with memcpy() so that nothing beyond
 *      `src_buff + size` is ever read.
 *
 * NOTE(review): the original comments and variable names suggest the source
 * is expected to be write-combining memory; confirm against callers.
 *
 * @param dst_buff  destination buffer, at least `size` bytes
 * @param src_buff  source buffer, at least `size` bytes; must not overlap
 *                  the destination
 * @param size      number of bytes to copy (0 is allowed)
 */
STREAM_MEMCPY_TARGET
static inline void stream_memcpy(void* dst_buff, const void* src_buff, size_t size)
{
    const size_t vec_bytes = sizeof(__m128i);   /* 16 */
    const size_t vecs_per_block = 8;            /* eight XMM registers per pass */
    const size_t block_bytes = vec_bytes * vecs_per_block;

    /* Both addresses must be 16-byte aligned for the vector path. */
    const uintptr_t addr_bits = (uintptr_t)src_buff | (uintptr_t)dst_buff;
    if ((addr_bits & (vec_bytes - 1)) != 0) {
        memcpy(dst_buff, src_buff, size);
        return;
    }

    __m128i *dst = (__m128i *)dst_buff;
    /* const is cast away only because some compilers declare
     * _mm_stream_load_si128 with a non-const pointer; the source is never
     * written through this pointer. */
    __m128i *src = (__m128i *)src_buff;

    size_t blocks = size / block_bytes;
    size_t vecs = (size % block_bytes) / vec_bytes;
    const size_t tail = size % vec_bytes;

    /* Order earlier memory operations before the streaming loads begin. */
    _mm_mfence();

    for (; blocks != 0; --blocks) {
        /* Issue all loads first, then all stores. */
        const __m128i v0 = _mm_stream_load_si128(src + 0);
        const __m128i v1 = _mm_stream_load_si128(src + 1);
        const __m128i v2 = _mm_stream_load_si128(src + 2);
        const __m128i v3 = _mm_stream_load_si128(src + 3);
        const __m128i v4 = _mm_stream_load_si128(src + 4);
        const __m128i v5 = _mm_stream_load_si128(src + 5);
        const __m128i v6 = _mm_stream_load_si128(src + 6);
        const __m128i v7 = _mm_stream_load_si128(src + 7);
        src += vecs_per_block;

        _mm_store_si128(dst + 0, v0);
        _mm_store_si128(dst + 1, v1);
        _mm_store_si128(dst + 2, v2);
        _mm_store_si128(dst + 3, v3);
        _mm_store_si128(dst + 4, v4);
        _mm_store_si128(dst + 5, v5);
        _mm_store_si128(dst + 6, v6);
        _mm_store_si128(dst + 7, v7);
        dst += vecs_per_block;
    }

    /* Whole 16-byte vectors left over after the 128-byte blocks. */
    for (; vecs != 0; --vecs) {
        _mm_store_si128(dst, _mm_stream_load_si128(src));
        ++src;
        ++dst;
    }

    /* Final partial vector: copy exactly `tail` bytes.  A full 16-byte load
     * here would read past the end of the source buffer. */
    if (tail != 0) {
        memcpy(dst, src, tail);
    }
}

#undef STREAM_MEMCPY_TARGET

#endif /* USE_UTIL_SSE4_H */