author      Jason Sams <jsams@google.com>   2014-03-01 00:26:33 +0000
committer   Gerrit Code Review <noreply-gerritcodereview@google.com>   2014-03-01 00:26:33 +0000
commit      039aa29085562aa0cf967e8584fd40ae818a20c6 (patch)
tree        e710756d37482e709ea5c28b8d57310fe3f76cb6 /driver
parent      66961ba075442e5ae297dc1eb4273fc40e6697df (diff)
parent      41660c4c73fc425a2e3511e2070b2748cdd1107c (diff)
download    rs-039aa29085562aa0cf967e8584fd40ae818a20c6.tar.gz
Merge "Implement vector load/store."
Diffstat (limited to 'driver')
-rw-r--r--  driver/runtime/allocation.ll    | 384
-rw-r--r--  driver/runtime/rs_allocation.c  |  76
2 files changed, 459 insertions, 1 deletion
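
This merge adds vector load/store support for allocations: allocation.ll gains per-type __rsAllocationVLoadXImpl_* and __rsAllocationVStoreXImpl_* stubs built on a new rsOffsetNs() address helper, and rs_allocation.c wraps them in overloadable rsAllocationVLoadX_* / rsAllocationVStoreX_* entry points via the VOP macro. A minimal script-side usage sketch follows; the allocation name, kernel, and package name are hypothetical, and it assumes the matching script-facing declarations shipped alongside this driver change.

// Sketch only: load a float4 cell, scale it, and store it back through the
// new vector load/store overloads defined in this change.
#pragma version(1)
#pragma rs java_package_name(com.example.vload)   // hypothetical package

rs_allocation gAlloc;   // hypothetical allocation bound from the host side

void scale_cell(uint32_t x, uint32_t y) {
    float4 v = rsAllocationVLoadX_float4(gAlloc, x, y);   // (x, y) overload -> z = 0
    v *= 2.0f;
    rsAllocationVStoreX_float4(gAlloc, v, x, y);
}
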
diff --git a/driver/runtime/allocation.ll b/driver/runtime/allocation.ll
index e1d6c7e1..2b04aef6 100644
--- a/driver/runtime/allocation.ll
+++ b/driver/runtime/allocation.ll
@@ -2,6 +2,7 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
target triple = "armv7-none-linux-gnueabi"
declare i8* @rsOffset([1 x i32] %a.coerce, i32 %sizeOf, i32 %x, i32 %y, i32 %z)
+declare i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z)
; The loads and stores in this file are annotated with RenderScript-specific
; information for the type based alias analysis, such that the TBAA analysis
@@ -648,6 +649,389 @@ define void @rsGetElementAtImpl_double4(<4 x double>* noalias nocapture sret %ag
ret void
}
+
+define <4 x i64> @__rsAllocationVLoadXImpl_long4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <4 x i64>*
+ %3 = load <4 x i64>* %2, align 8
+ ret <4 x i64> %3
+}
+define <3 x i64> @__rsAllocationVLoadXImpl_long3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <3 x i64>*
+ %3 = load <3 x i64>* %2, align 8
+ ret <3 x i64> %3
+}
+define <2 x i64> @__rsAllocationVLoadXImpl_long2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <2 x i64>*
+ %3 = load <2 x i64>* %2, align 8
+ ret <2 x i64> %3
+}
+
+define <4 x i64> @__rsAllocationVLoadXImpl_ulong4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <4 x i64>*
+ %3 = load <4 x i64>* %2, align 8
+ ret <4 x i64> %3
+}
+define <3 x i64> @__rsAllocationVLoadXImpl_ulong3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <3 x i64>*
+ %3 = load <3 x i64>* %2, align 8
+ ret <3 x i64> %3
+}
+define <2 x i64> @__rsAllocationVLoadXImpl_ulong2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <2 x i64>*
+ %3 = load <2 x i64>* %2, align 8
+ ret <2 x i64> %3
+}
+
+define <4 x i32> @__rsAllocationVLoadXImpl_int4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <4 x i32>*
+ %3 = load <4 x i32>* %2, align 4
+ ret <4 x i32> %3
+}
+define <3 x i32> @__rsAllocationVLoadXImpl_int3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <3 x i32>*
+ %3 = load <3 x i32>* %2, align 4
+ ret <3 x i32> %3
+}
+define <2 x i32> @__rsAllocationVLoadXImpl_int2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <2 x i32>*
+ %3 = load <2 x i32>* %2, align 4
+ ret <2 x i32> %3
+}
+
+define <4 x i32> @__rsAllocationVLoadXImpl_uint4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <4 x i32>*
+ %3 = load <4 x i32>* %2, align 4
+ ret <4 x i32> %3
+}
+define <3 x i32> @__rsAllocationVLoadXImpl_uint3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <3 x i32>*
+ %3 = load <3 x i32>* %2, align 4
+ ret <3 x i32> %3
+}
+define <2 x i32> @__rsAllocationVLoadXImpl_uint2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <2 x i32>*
+ %3 = load <2 x i32>* %2, align 4
+ ret <2 x i32> %3
+}
+
+define <4 x i16> @__rsAllocationVLoadXImpl_short4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <4 x i16>*
+ %3 = load <4 x i16>* %2, align 2
+ ret <4 x i16> %3
+}
+define <3 x i16> @__rsAllocationVLoadXImpl_short3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <3 x i16>*
+ %3 = load <3 x i16>* %2, align 2
+ ret <3 x i16> %3
+}
+define <2 x i16> @__rsAllocationVLoadXImpl_short2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <2 x i16>*
+ %3 = load <2 x i16>* %2, align 2
+ ret <2 x i16> %3
+}
+
+define <4 x i16> @__rsAllocationVLoadXImpl_ushort4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <4 x i16>*
+ %3 = load <4 x i16>* %2, align 2
+ ret <4 x i16> %3
+}
+define <3 x i16> @__rsAllocationVLoadXImpl_ushort3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <3 x i16>*
+ %3 = load <3 x i16>* %2, align 2
+ ret <3 x i16> %3
+}
+define <2 x i16> @__rsAllocationVLoadXImpl_ushort2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <2 x i16>*
+ %3 = load <2 x i16>* %2, align 2
+ ret <2 x i16> %3
+}
+
+define <4 x i8> @__rsAllocationVLoadXImpl_char4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <4 x i8>*
+ %3 = load <4 x i8>* %2, align 1
+ ret <4 x i8> %3
+}
+define <3 x i8> @__rsAllocationVLoadXImpl_char3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <3 x i8>*
+ %3 = load <3 x i8>* %2, align 1
+ ret <3 x i8> %3
+}
+define <2 x i8> @__rsAllocationVLoadXImpl_char2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <2 x i8>*
+ %3 = load <2 x i8>* %2, align 1
+ ret <2 x i8> %3
+}
+
+define <4 x i8> @__rsAllocationVLoadXImpl_uchar4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <4 x i8>*
+ %3 = load <4 x i8>* %2, align 1
+ ret <4 x i8> %3
+}
+define <3 x i8> @__rsAllocationVLoadXImpl_uchar3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <3 x i8>*
+ %3 = load <3 x i8>* %2, align 1
+ ret <3 x i8> %3
+}
+define <2 x i8> @__rsAllocationVLoadXImpl_uchar2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <2 x i8>*
+ %3 = load <2 x i8>* %2, align 1
+ ret <2 x i8> %3
+}
+
+define <4 x float> @__rsAllocationVLoadXImpl_float4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <4 x float>*
+ %3 = load <4 x float>* %2, align 4
+ ret <4 x float> %3
+}
+define <3 x float> @__rsAllocationVLoadXImpl_float3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <3 x float>*
+ %3 = load <3 x float>* %2, align 4
+ ret <3 x float> %3
+}
+define <2 x float> @__rsAllocationVLoadXImpl_float2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <2 x float>*
+ %3 = load <2 x float>* %2, align 4
+ ret <2 x float> %3
+}
+
+define <4 x double> @__rsAllocationVLoadXImpl_double4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <4 x double>*
+ %3 = load <4 x double>* %2, align 8
+ ret <4 x double> %3
+}
+define <3 x double> @__rsAllocationVLoadXImpl_double3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <3 x double>*
+ %3 = load <3 x double>* %2, align 8
+ ret <3 x double> %3
+}
+define <2 x double> @__rsAllocationVLoadXImpl_double2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <2 x double>*
+ %3 = load <2 x double>* %2, align 8
+ ret <2 x double> %3
+}
+
+
+define void @__rsAllocationVStoreXImpl_long4([1 x i32] %a.coerce, <4 x i64> %val, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <4 x i64>*
+ store <4 x i64> %val, <4 x i64>* %2, align 8
+ ret void
+}
+define void @__rsAllocationVStoreXImpl_long3([1 x i32] %a.coerce, <3 x i64> %val, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <3 x i64>*
+ store <3 x i64> %val, <3 x i64>* %2, align 8
+ ret void
+}
+define void @__rsAllocationVStoreXImpl_long2([1 x i32] %a.coerce, <2 x i64> %val, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <2 x i64>*
+ store <2 x i64> %val, <2 x i64>* %2, align 8
+ ret void
+}
+
+define void @__rsAllocationVStoreXImpl_ulong4([1 x i32] %a.coerce, <4 x i64> %val, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <4 x i64>*
+ store <4 x i64> %val, <4 x i64>* %2, align 8
+ ret void
+}
+define void @__rsAllocationVStoreXImpl_ulong3([1 x i32] %a.coerce, <3 x i64> %val, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <3 x i64>*
+ store <3 x i64> %val, <3 x i64>* %2, align 8
+ ret void
+}
+define void @__rsAllocationVStoreXImpl_ulong2([1 x i32] %a.coerce, <2 x i64> %val, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <2 x i64>*
+ store <2 x i64> %val, <2 x i64>* %2, align 8
+ ret void
+}
+
+define void @__rsAllocationVStoreXImpl_int4([1 x i32] %a.coerce, <4 x i32> %val, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <4 x i32>*
+ store <4 x i32> %val, <4 x i32>* %2, align 8
+ ret void
+}
+define void @__rsAllocationVStoreXImpl_int3([1 x i32] %a.coerce, <3 x i32> %val, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <3 x i32>*
+ store <3 x i32> %val, <3 x i32>* %2, align 8
+ ret void
+}
+define void @__rsAllocationVStoreXImpl_int2([1 x i32] %a.coerce, <2 x i32> %val, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <2 x i32>*
+ store <2 x i32> %val, <2 x i32>* %2, align 8
+ ret void
+}
+
+define void @__rsAllocationVStoreXImpl_uint4([1 x i32] %a.coerce, <4 x i32> %val, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <4 x i32>*
+ store <4 x i32> %val, <4 x i32>* %2, align 8
+ ret void
+}
+define void @__rsAllocationVStoreXImpl_uint3([1 x i32] %a.coerce, <3 x i32> %val, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <3 x i32>*
+ store <3 x i32> %val, <3 x i32>* %2, align 8
+ ret void
+}
+define void @__rsAllocationVStoreXImpl_uint2([1 x i32] %a.coerce, <2 x i32> %val, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <2 x i32>*
+ store <2 x i32> %val, <2 x i32>* %2, align 8
+ ret void
+}
+
+define void @__rsAllocationVStoreXImpl_short4([1 x i32] %a.coerce, <4 x i16> %val, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <4 x i16>*
+ store <4 x i16> %val, <4 x i16>* %2, align 8
+ ret void
+}
+define void @__rsAllocationVStoreXImpl_short3([1 x i32] %a.coerce, <3 x i16> %val, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <3 x i16>*
+ store <3 x i16> %val, <3 x i16>* %2, align 8
+ ret void
+}
+define void @__rsAllocationVStoreXImpl_short2([1 x i32] %a.coerce, <2 x i16> %val, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <2 x i16>*
+ store <2 x i16> %val, <2 x i16>* %2, align 8
+ ret void
+}
+
+define void @__rsAllocationVStoreXImpl_ushort4([1 x i32] %a.coerce, <4 x i16> %val, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <4 x i16>*
+ store <4 x i16> %val, <4 x i16>* %2, align 8
+ ret void
+}
+define void @__rsAllocationVStoreXImpl_ushort3([1 x i32] %a.coerce, <3 x i16> %val, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <3 x i16>*
+ store <3 x i16> %val, <3 x i16>* %2, align 8
+ ret void
+}
+define void @__rsAllocationVStoreXImpl_ushort2([1 x i32] %a.coerce, <2 x i16> %val, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <2 x i16>*
+ store <2 x i16> %val, <2 x i16>* %2, align 8
+ ret void
+}
+
+define void @__rsAllocationVStoreXImpl_char4([1 x i32] %a.coerce, <4 x i8> %val, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <4 x i8>*
+ store <4 x i8> %val, <4 x i8>* %2, align 8
+ ret void
+}
+define void @__rsAllocationVStoreXImpl_char3([1 x i32] %a.coerce, <3 x i8> %val, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <3 x i8>*
+ store <3 x i8> %val, <3 x i8>* %2, align 8
+ ret void
+}
+define void @__rsAllocationVStoreXImpl_char2([1 x i32] %a.coerce, <2 x i8> %val, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <2 x i8>*
+ store <2 x i8> %val, <2 x i8>* %2, align 8
+ ret void
+}
+
+define void @__rsAllocationVStoreXImpl_uchar4([1 x i32] %a.coerce, <4 x i8> %val, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <4 x i8>*
+ store <4 x i8> %val, <4 x i8>* %2, align 8
+ ret void
+}
+define void @__rsAllocationVStoreXImpl_uchar3([1 x i32] %a.coerce, <3 x i8> %val, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <3 x i8>*
+ store <3 x i8> %val, <3 x i8>* %2, align 8
+ ret void
+}
+define void @__rsAllocationVStoreXImpl_uchar2([1 x i32] %a.coerce, <2 x i8> %val, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <2 x i8>*
+ store <2 x i8> %val, <2 x i8>* %2, align 8
+ ret void
+}
+
+define void @__rsAllocationVStoreXImpl_float4([1 x i32] %a.coerce, <4 x float> %val, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <4 x float>*
+ store <4 x float> %val, <4 x float>* %2, align 8
+ ret void
+}
+define void @__rsAllocationVStoreXImpl_float3([1 x i32] %a.coerce, <3 x float> %val, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <3 x float>*
+ store <3 x float> %val, <3 x float>* %2, align 8
+ ret void
+}
+define void @__rsAllocationVStoreXImpl_float2([1 x i32] %a.coerce, <2 x float> %val, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <2 x float>*
+ store <2 x float> %val, <2 x float>* %2, align 8
+ ret void
+}
+
+define void @__rsAllocationVStoreXImpl_double4([1 x i32] %a.coerce, <4 x double> %val, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <4 x double>*
+ store <4 x double> %val, <4 x double>* %2, align 8
+ ret void
+}
+define void @__rsAllocationVStoreXImpl_double3([1 x i32] %a.coerce, <3 x double> %val, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <3 x double>*
+ store <3 x double> %val, <3 x double>* %2, align 8
+ ret void
+}
+define void @__rsAllocationVStoreXImpl_double2([1 x i32] %a.coerce, <2 x double> %val, i32 %x, i32 %y, i32 %z) #0 {
+ %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+ %2 = bitcast i8* %1 to <2 x double>*
+ store <2 x double> %val, <2 x double>* %2, align 8
+ ret void
+}
+
+
attributes #0 = { nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
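
Every IR stub above follows the same three-step pattern: call rsOffsetNs() for the byte address of element (x, y, z), bitcast it to a pointer to the vector type, and issue a single vector load or store at the stated alignment. Below is a rough C sketch of the float4 load stub, assuming the rs_allocation type and the rsOffsetNs() declaration from the runtime sources; the real implementation stays in allocation.ll so the vector width and alignment of each access remain explicit in the IR.

#include <stdint.h>

// float4 stand-in; the runtime headers define the vector types this way.
typedef float float4_t __attribute__((ext_vector_type(4)));

// Declared in this diff (rs_allocation.c): byte address of element (x, y, z),
// not scaled by a caller-supplied element size.
extern uint8_t *rsOffsetNs(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);

// Sketch of what __rsAllocationVLoadXImpl_float4 does: a single <4 x float>
// (16-byte) load from the element's address.
static float4_t vloadx_float4_sketch(rs_allocation a,
                                     uint32_t x, uint32_t y, uint32_t z) {
    const uint8_t *p = rsOffsetNs(a, x, y, z);
    return *(const float4_t *)p;
}
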
diff --git a/driver/runtime/rs_allocation.c b/driver/runtime/rs_allocation.c
index a307776d..0722680d 100644
--- a/driver/runtime/rs_allocation.c
+++ b/driver/runtime/rs_allocation.c
@@ -99,7 +99,6 @@ static void memcpy(void* dst, void* src, size_t size) {
rsGetElementAt_##T(a, &tmp, x, y, z); \
return tmp; \
}
-
#else
uint8_t*
@@ -114,6 +113,18 @@ rsOffset(rs_allocation a, uint32_t sizeOf, uint32_t x, uint32_t y,
return dp;
}
+uint8_t*
+rsOffsetNs(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+ Allocation_t *alloc = (Allocation_t *)a.p;
+ uint8_t *p = (uint8_t *)alloc->mHal.drvState.lod[0].mallocPtr;
+ const uint32_t stride = alloc->mHal.drvState.lod[0].stride;
+ const uint32_t dimY = alloc->mHal.drvState.lod[0].dimY;
+  const uint32_t sizeOf = alloc->mHal.state.elementSizeBytes;
+ uint8_t *dp = &p[(sizeOf * x) + (y * stride) +
+ (z * stride * dimY)];
+ return dp;
+}
+
#define ELEMENT_AT(T) \
\
void \
@@ -290,3 +301,66 @@ extern const uchar __attribute__((overloadable))
return pin[((x >> shift) * cstep) + ((y >> shift) * stride)];
}
+
+#define VOP(T) \
+ extern void __rsAllocationVStoreXImpl_##T(rs_allocation a, const T val, uint32_t x, uint32_t y, uint32_t z); \
+ extern T __rsAllocationVLoadXImpl_##T(rs_allocation a, uint32_t x, uint32_t y, uint32_t z); \
+ \
+ extern void __attribute__((overloadable)) \
+ rsAllocationVStoreX_##T(rs_allocation a, T val, uint32_t x) { \
+ __rsAllocationVStoreXImpl_##T(a, val, x, 0, 0); \
+ } \
+ extern void __attribute__((overloadable)) \
+ rsAllocationVStoreX_##T(rs_allocation a, T val, uint32_t x, uint32_t y) { \
+ __rsAllocationVStoreXImpl_##T(a, val, x, y, 0); \
+ } \
+ extern void __attribute__((overloadable)) \
+ rsAllocationVStoreX_##T(rs_allocation a, T val, uint32_t x, uint32_t y, uint32_t z) { \
+ __rsAllocationVStoreXImpl_##T(a, val, x, y, z); \
+ } \
+ extern T __attribute__((overloadable)) \
+ rsAllocationVLoadX_##T(rs_allocation a, uint32_t x) { \
+ return __rsAllocationVLoadXImpl_##T(a, x, 0, 0); \
+ } \
+ extern T __attribute__((overloadable)) \
+ rsAllocationVLoadX_##T(rs_allocation a, uint32_t x, uint32_t y) { \
+ return __rsAllocationVLoadXImpl_##T(a, x, y, 0); \
+ } \
+ extern T __attribute__((overloadable)) \
+ rsAllocationVLoadX_##T(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) { \
+ return __rsAllocationVLoadXImpl_##T(a, x, y, z); \
+ }
+
+VOP(char2)
+VOP(char3)
+VOP(char4)
+VOP(uchar2)
+VOP(uchar3)
+VOP(uchar4)
+VOP(short2)
+VOP(short3)
+VOP(short4)
+VOP(ushort2)
+VOP(ushort3)
+VOP(ushort4)
+VOP(int2)
+VOP(int3)
+VOP(int4)
+VOP(uint2)
+VOP(uint3)
+VOP(uint4)
+VOP(long2)
+VOP(long3)
+VOP(long4)
+VOP(ulong2)
+VOP(ulong3)
+VOP(ulong4)
+VOP(float2)
+VOP(float3)
+VOP(float4)
+VOP(double2)
+VOP(double3)
+VOP(double4)
+
+#undef VOP
+
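
For reference, the address rsOffsetNs() hands to these loads and stores is sizeOf*x + stride*y + stride*dimY*z bytes past the LOD-0 base pointer, with sizeOf taken from the allocation's element size rather than passed in by the caller (the difference from rsOffset() above). A small standalone check of that arithmetic, using made-up HAL values for illustration:

#include <assert.h>
#include <stdint.h>

// Illustration only: the same offset math as rsOffsetNs(), with invented
// values (float4 element => sizeOf = 16; stride and dimY are hypothetical).
int main(void) {
    const uint32_t sizeOf = 16, stride = 128, dimY = 8;
    const uint32_t x = 3, y = 5, z = 0;
    const uint32_t offset = (sizeOf * x) + (y * stride) + (z * stride * dimY);
    assert(offset == 688);   // 48 + 640 + 0
    return 0;
}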