aboutsummaryrefslogtreecommitdiff
path: root/videodecoder/use_util_sse4.h
diff options
context:
space:
mode:
author Guilhem IMBERTON <guilhem.imberton@intel.com> 2014-08-06 20:47:04 +0200
committer Patrick Tjin <pattjin@google.com> 2014-08-07 14:31:21 -0700
commit 82b428e49a70ddc051a36d2b3a25d90db79770dc (patch)
tree 3c7387e0ff0d1a4dfebec762a9b0a80f09724ef1 /videodecoder/use_util_sse4.h
parent 4d358311bdb7a2e02671ecf499effeb0262e1fc3 (diff)
download libmix-82b428e49a70ddc051a36d2b3a25d90db79770dc.tar.gz
Initial libmix commit
Change-Id: I7a0b9afdc83a3274189cef0788c7296a871a3d98 Signed-off-by: Guilhem IMBERTON <guilhem.imberton@intel.com>
Diffstat (limited to 'videodecoder/use_util_sse4.h')
-rw-r--r-- videodecoder/use_util_sse4.h 93
1 files changed, 93 insertions, 0 deletions
diff --git a/videodecoder/use_util_sse4.h b/videodecoder/use_util_sse4.h
new file mode 100644
index 0000000..454099d
--- /dev/null
+++ b/videodecoder/use_util_sse4.h
@@ -0,0 +1,93 @@
+/*
+* Copyright (c) 2009-2011 Intel Corporation. All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <emmintrin.h>
+#include <x86intrin.h>
+/**
+ * Copy `size` bytes from src_buff to dst_buff.
+ *
+ * If either pointer is not 16-byte aligned the call is forwarded to memcpy().
+ * Otherwise the copy runs in three stages:
+ *   1. blocks of 128 bytes (eight 16-byte registers), read with
+ *      _mm_stream_load_si128 and written with _mm_store_si128;
+ *   2. whole 16-byte chunks of what is left (fewer than 128 bytes);
+ *   3. the last 1..15 bytes, copied byte by byte out of a register.
+ *
+ * @param dst_buff  destination buffer
+ * @param src_buff  source buffer
+ * @param size      number of bytes to copy
+ *
+ * NOTE(review): stage 3 loads a full 16 bytes from the source even though
+ * fewer remain, so up to 15 bytes beyond `size` are read (never written).
+ * The load is 16-byte aligned; confirm that callers' source buffers
+ * tolerate this over-read.
+ *
+ * NOTE(review): this header uses memcpy, size_t and bool but the only
+ * includes visible in it are emmintrin.h and x86intrin.h; presumably the
+ * including file provides the declarations -- verify.
+ */
+inline void stream_memcpy(void* dst_buff, const void* src_buff, size_t size)
+{
+ bool isAligned = (((size_t)(src_buff) | (size_t)(dst_buff)) & 0xF) == 0; /* OR-ing the addresses checks both for 16-byte alignment at once */
+ if (!isAligned) {
+ memcpy(dst_buff, src_buff, size);
+ return;
+ }
+ /* From here on both pointers are known to be 16-byte aligned. */
+ static const size_t regs_count = 8; /* 16-byte registers moved per main-loop iteration */
+
+ __m128i xmm_data0, xmm_data1, xmm_data2, xmm_data3;
+ __m128i xmm_data4, xmm_data5, xmm_data6, xmm_data7;
+ /* remain_data: bytes left once all full 128-byte blocks are taken out (size mod 128). */
+ size_t remain_data = size & (regs_count * sizeof(xmm_data0) - 1);
+ size_t end_position = 0; /* 16-byte chunks copied by the second stage; stays 0 if that stage is skipped */
+
+ __m128i* pWb_buff = (__m128i*)dst_buff;
+ __m128i* pWb_buff_end = pWb_buff + ((size - remain_data) >> 4); /* end of the destination area covered by full 128-byte blocks */
+ __m128i* pWc_buff = (__m128i*)src_buff; /* NOTE(review): this cast drops the const qualifier of src_buff */
+
+ /* Full memory fence before the streaming loads start. The original note
+ * here reads "sync the wc memory data" -- presumably the source is
+ * write-combining memory; confirm against the callers. */
+ _mm_mfence();
+
+ while (pWb_buff < pWb_buff_end)
+ {
+ xmm_data0 = _mm_stream_load_si128(pWc_buff);
+ xmm_data1 = _mm_stream_load_si128(pWc_buff + 1);
+ xmm_data2 = _mm_stream_load_si128(pWc_buff + 2);
+ xmm_data3 = _mm_stream_load_si128(pWc_buff + 3);
+ xmm_data4 = _mm_stream_load_si128(pWc_buff + 4);
+ xmm_data5 = _mm_stream_load_si128(pWc_buff + 5);
+ xmm_data6 = _mm_stream_load_si128(pWc_buff + 6);
+ xmm_data7 = _mm_stream_load_si128(pWc_buff + 7);
+ /* All eight loads are issued before any store; then the source advances by 128 bytes. */
+ pWc_buff += regs_count;
+ _mm_store_si128(pWb_buff, xmm_data0);
+ _mm_store_si128(pWb_buff + 1, xmm_data1);
+ _mm_store_si128(pWb_buff + 2, xmm_data2);
+ _mm_store_si128(pWb_buff + 3, xmm_data3);
+ _mm_store_si128(pWb_buff + 4, xmm_data4);
+ _mm_store_si128(pWb_buff + 5, xmm_data5);
+ _mm_store_si128(pWb_buff + 6, xmm_data6);
+ _mm_store_si128(pWb_buff + 7, xmm_data7);
+
+ pWb_buff += regs_count;
+ }
+
+ /* Second stage: copy the whole 16-byte chunks contained in the
+ * remainder. `size` is reused as scratch for the remainder length, and
+ * remain_data shrinks to the final 0..15 bytes. */
+ if (remain_data >= 16)
+ {
+ size = remain_data;
+ remain_data = size & 15;
+ end_position = size >> 4;
+ for (size_t i = 0; i < end_position; ++i)
+ {
+ pWb_buff[i] = _mm_stream_load_si128(pWc_buff + i);
+ }
+ }
+
+ /* Third stage: copy the final 1..15 bytes, if any are left. A full
+ * 16-byte chunk is loaded into a temporary and only remain_data bytes of
+ * it are written to the destination.
+ * NOTE(review): the load reads past the requested size -- see the
+ * function header. */
+ if (remain_data)
+ {
+ __m128i temp_data = _mm_stream_load_si128(pWc_buff + end_position);
+
+ char* psrc_buf = (char*)(&temp_data);
+ char* pdst_buf = (char*)(pWb_buff + end_position);
+
+ for (size_t i = 0; i < remain_data; ++i)
+ {
+ pdst_buf[i] = psrc_buf[i];
+ }
+ }
+
+}