| author | Jason Sams <jsams@google.com> | 2014-03-01 00:26:33 +0000 |
|---|---|---|
| committer | Gerrit Code Review <noreply-gerritcodereview@google.com> | 2014-03-01 00:26:33 +0000 |
| commit | 039aa29085562aa0cf967e8584fd40ae818a20c6 (patch) | |
| tree | e710756d37482e709ea5c28b8d57310fe3f76cb6 /driver | |
| parent | 66961ba075442e5ae297dc1eb4273fc40e6697df (diff) | |
| parent | 41660c4c73fc425a2e3511e2070b2748cdd1107c (diff) | |
| download | rs-039aa29085562aa0cf967e8584fd40ae818a20c6.tar.gz | |
Merge "Implement vector load/store."
Diffstat (limited to 'driver')
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | driver/runtime/allocation.ll | 384 |
| -rw-r--r-- | driver/runtime/rs_allocation.c | 76 |

2 files changed, 459 insertions(+), 1 deletion(-)
diff --git a/driver/runtime/allocation.ll b/driver/runtime/allocation.ll
index e1d6c7e1..2b04aef6 100644
--- a/driver/runtime/allocation.ll
+++ b/driver/runtime/allocation.ll
@@ -2,6 +2,7 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 target triple = "armv7-none-linux-gnueabi"
 
 declare i8* @rsOffset([1 x i32] %a.coerce, i32 %sizeOf, i32 %x, i32 %y, i32 %z)
+declare i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z)
 
 ; The loads and stores in this file are annotated with RenderScript-specific
 ; information for the type based alias analysis, such that the TBAA analysis
@@ -648,6 +649,389 @@ define void @rsGetElementAtImpl_double4(<4 x double>* noalias nocapture sret %ag
   ret void
 }
 
+
+define <4 x i64> @__rsAllocationVLoadXImpl_long4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <4 x i64>*
+  %3 = load <4 x i64>* %2, align 8
+  ret <4 x i64> %3
+}
+define <3 x i64> @__rsAllocationVLoadXImpl_long3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <3 x i64>*
+  %3 = load <3 x i64>* %2, align 8
+  ret <3 x i64> %3
+}
+define <2 x i64> @__rsAllocationVLoadXImpl_long2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <2 x i64>*
+  %3 = load <2 x i64>* %2, align 8
+  ret <2 x i64> %3
+}
+
+define <4 x i64> @__rsAllocationVLoadXImpl_ulong4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <4 x i64>*
+  %3 = load <4 x i64>* %2, align 8
+  ret <4 x i64> %3
+}
+define <3 x i64> @__rsAllocationVLoadXImpl_ulong3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <3 x i64>*
+  %3 = load <3 x i64>* %2, align 8
+  ret <3 x i64> %3
+}
+define <2 x i64> @__rsAllocationVLoadXImpl_ulong2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <2 x i64>*
+  %3 = load <2 x i64>* %2, align 8
+  ret <2 x i64> %3
+}
+
+define <4 x i32> @__rsAllocationVLoadXImpl_int4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <4 x i32>*
+  %3 = load <4 x i32>* %2, align 4
+  ret <4 x i32> %3
+}
+define <3 x i32> @__rsAllocationVLoadXImpl_int3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <3 x i32>*
+  %3 = load <3 x i32>* %2, align 4
+  ret <3 x i32> %3
+}
+define <2 x i32> @__rsAllocationVLoadXImpl_int2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <2 x i32>*
+  %3 = load <2 x i32>* %2, align 4
+  ret <2 x i32> %3
+}
+
+define <4 x i32> @__rsAllocationVLoadXImpl_uint4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <4 x i32>*
+  %3 = load <4 x i32>* %2, align 4
+  ret <4 x i32> %3
+}
+define <3 x i32> @__rsAllocationVLoadXImpl_uint3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <3 x i32>*
+  %3 = load <3 x i32>* %2, align 4
+  ret <3 x i32> %3
+}
+define <2 x i32> @__rsAllocationVLoadXImpl_uint2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <2 x i32>*
+  %3 = load <2 x i32>* %2, align 4
+  ret <2 x i32> %3
+}
+
+define <4 x i16> @__rsAllocationVLoadXImpl_short4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <4 x i16>*
+  %3 = load <4 x i16>* %2, align 2
+  ret <4 x i16> %3
+}
+define <3 x i16> @__rsAllocationVLoadXImpl_short3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <3 x i16>*
+  %3 = load <3 x i16>* %2, align 2
+  ret <3 x i16> %3
+}
+define <2 x i16> @__rsAllocationVLoadXImpl_short2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <2 x i16>*
+  %3 = load <2 x i16>* %2, align 2
+  ret <2 x i16> %3
+}
+
+define <4 x i16> @__rsAllocationVLoadXImpl_ushort4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <4 x i16>*
+  %3 = load <4 x i16>* %2, align 2
+  ret <4 x i16> %3
+}
+define <3 x i16> @__rsAllocationVLoadXImpl_ushort3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <3 x i16>*
+  %3 = load <3 x i16>* %2, align 2
+  ret <3 x i16> %3
+}
+define <2 x i16> @__rsAllocationVLoadXImpl_ushort2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <2 x i16>*
+  %3 = load <2 x i16>* %2, align 2
+  ret <2 x i16> %3
+}
+
+define <4 x i8> @__rsAllocationVLoadXImpl_char4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <4 x i8>*
+  %3 = load <4 x i8>* %2, align 1
+  ret <4 x i8> %3
+}
+define <3 x i8> @__rsAllocationVLoadXImpl_char3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <3 x i8>*
+  %3 = load <3 x i8>* %2, align 1
+  ret <3 x i8> %3
+}
+define <2 x i8> @__rsAllocationVLoadXImpl_char2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <2 x i8>*
+  %3 = load <2 x i8>* %2, align 1
+  ret <2 x i8> %3
+}
+
+define <4 x i8> @__rsAllocationVLoadXImpl_uchar4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <4 x i8>*
+  %3 = load <4 x i8>* %2, align 1
+  ret <4 x i8> %3
+}
+define <3 x i8> @__rsAllocationVLoadXImpl_uchar3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <3 x i8>*
+  %3 = load <3 x i8>* %2, align 1
+  ret <3 x i8> %3
+}
+define <2 x i8> @__rsAllocationVLoadXImpl_uchar2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <2 x i8>*
+  %3 = load <2 x i8>* %2, align 1
+  ret <2 x i8> %3
+}
+
+define <4 x float> @__rsAllocationVLoadXImpl_float4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <4 x float>*
+  %3 = load <4 x float>* %2, align 4
+  ret <4 x float> %3
+}
+define <3 x float> @__rsAllocationVLoadXImpl_float3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <3 x float>*
+  %3 = load <3 x float>* %2, align 4
+  ret <3 x float> %3
+}
+define <2 x float> @__rsAllocationVLoadXImpl_float2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <2 x float>*
+  %3 = load <2 x float>* %2, align 4
+  ret <2 x float> %3
+}
+
+define <4 x double> @__rsAllocationVLoadXImpl_double4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <4 x double>*
+  %3 = load <4 x double>* %2, align 8
+  ret <4 x double> %3
+}
+define <3 x double> @__rsAllocationVLoadXImpl_double3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <3 x double>*
+  %3 = load <3 x double>* %2, align 8
+  ret <3 x double> %3
+}
+define <2 x double> @__rsAllocationVLoadXImpl_double2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <2 x double>*
+  %3 = load <2 x double>* %2, align 8
+  ret <2 x double> %3
+}
+
+
+define void @__rsAllocationVStoreXImpl_long4([1 x i32] %a.coerce, <4 x i64> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <4 x i64>*
+  store <4 x i64> %val, <4 x i64>* %2, align 8
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_long3([1 x i32] %a.coerce, <3 x i64> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <3 x i64>*
+  store <3 x i64> %val, <3 x i64>* %2, align 8
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_long2([1 x i32] %a.coerce, <2 x i64> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <2 x i64>*
+  store <2 x i64> %val, <2 x i64>* %2, align 8
+  ret void
+}
+
+define void @__rsAllocationVStoreXImpl_ulong4([1 x i32] %a.coerce, <4 x i64> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <4 x i64>*
+  store <4 x i64> %val, <4 x i64>* %2, align 8
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_ulong3([1 x i32] %a.coerce, <3 x i64> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <3 x i64>*
+  store <3 x i64> %val, <3 x i64>* %2, align 8
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_ulong2([1 x i32] %a.coerce, <2 x i64> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <2 x i64>*
+  store <2 x i64> %val, <2 x i64>* %2, align 8
+  ret void
+}
+
+define void @__rsAllocationVStoreXImpl_int4([1 x i32] %a.coerce, <4 x i32> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <4 x i32>*
+  store <4 x i32> %val, <4 x i32>* %2, align 4
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_int3([1 x i32] %a.coerce, <3 x i32> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <3 x i32>*
+  store <3 x i32> %val, <3 x i32>* %2, align 4
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_int2([1 x i32] %a.coerce, <2 x i32> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <2 x i32>*
+  store <2 x i32> %val, <2 x i32>* %2, align 4
+  ret void
+}
+
+define void @__rsAllocationVStoreXImpl_uint4([1 x i32] %a.coerce, <4 x i32> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <4 x i32>*
+  store <4 x i32> %val, <4 x i32>* %2, align 4
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_uint3([1 x i32] %a.coerce, <3 x i32> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <3 x i32>*
+  store <3 x i32> %val, <3 x i32>* %2, align 4
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_uint2([1 x i32] %a.coerce, <2 x i32> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <2 x i32>*
+  store <2 x i32> %val, <2 x i32>* %2, align 4
+  ret void
+}
+
+define void @__rsAllocationVStoreXImpl_short4([1 x i32] %a.coerce, <4 x i16> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <4 x i16>*
+  store <4 x i16> %val, <4 x i16>* %2, align 2
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_short3([1 x i32] %a.coerce, <3 x i16> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <3 x i16>*
+  store <3 x i16> %val, <3 x i16>* %2, align 2
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_short2([1 x i32] %a.coerce, <2 x i16> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <2 x i16>*
+  store <2 x i16> %val, <2 x i16>* %2, align 2
+  ret void
+}
+
+define void @__rsAllocationVStoreXImpl_ushort4([1 x i32] %a.coerce, <4 x i16> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <4 x i16>*
+  store <4 x i16> %val, <4 x i16>* %2, align 2
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_ushort3([1 x i32] %a.coerce, <3 x i16> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <3 x i16>*
+  store <3 x i16> %val, <3 x i16>* %2, align 2
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_ushort2([1 x i32] %a.coerce, <2 x i16> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <2 x i16>*
+  store <2 x i16> %val, <2 x i16>* %2, align 2
+  ret void
+}
+
+define void @__rsAllocationVStoreXImpl_char4([1 x i32] %a.coerce, <4 x i8> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <4 x i8>*
+  store <4 x i8> %val, <4 x i8>* %2, align 1
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_char3([1 x i32] %a.coerce, <3 x i8> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <3 x i8>*
+  store <3 x i8> %val, <3 x i8>* %2, align 1
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_char2([1 x i32] %a.coerce, <2 x i8> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <2 x i8>*
+  store <2 x i8> %val, <2 x i8>* %2, align 1
+  ret void
+}
+
+define void @__rsAllocationVStoreXImpl_uchar4([1 x i32] %a.coerce, <4 x i8> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <4 x i8>*
+  store <4 x i8> %val, <4 x i8>* %2, align 1
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_uchar3([1 x i32] %a.coerce, <3 x i8> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <3 x i8>*
+  store <3 x i8> %val, <3 x i8>* %2, align 1
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_uchar2([1 x i32] %a.coerce, <2 x i8> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <2 x i8>*
+  store <2 x i8> %val, <2 x i8>* %2, align 1
+  ret void
+}
+
+define void @__rsAllocationVStoreXImpl_float4([1 x i32] %a.coerce, <4 x float> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <4 x float>*
+  store <4 x float> %val, <4 x float>* %2, align 4
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_float3([1 x i32] %a.coerce, <3 x float> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <3 x float>*
+  store <3 x float> %val, <3 x float>* %2, align 4
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_float2([1 x i32] %a.coerce, <2 x float> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <2 x float>*
+  store <2 x float> %val, <2 x float>* %2, align 4
+  ret void
+}
+
+define void @__rsAllocationVStoreXImpl_double4([1 x i32] %a.coerce, <4 x double> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <4 x double>*
+  store <4 x double> %val, <4 x double>* %2, align 8
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_double3([1 x i32] %a.coerce, <3 x double> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <3 x double>*
+  store <3 x double> %val, <3 x double>* %2, align 8
+  ret void
+}
+define void @__rsAllocationVStoreXImpl_double2([1 x i32] %a.coerce, <2 x double> %val, i32 %x, i32 %y, i32 %z) #1 {
+  %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #10
+  %2 = bitcast i8* %1 to <2 x double>*
+  store <2 x double> %val, <2 x double>* %2, align 8
+  ret void
+}
+
+
 attributes #0 = { nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/driver/runtime/rs_allocation.c b/driver/runtime/rs_allocation.c
index a307776d..0722680d 100644
--- a/driver/runtime/rs_allocation.c
+++ b/driver/runtime/rs_allocation.c
@@ -99,7 +99,6 @@ static void memcpy(void* dst, void* src, size_t size) {
         rsGetElementAt_##T(a, &tmp, x, y, z); \
         return tmp; \
     }
-
 #else
 
 uint8_t*
@@ -114,6 +113,18 @@ rsOffset(rs_allocation a, uint32_t sizeOf, uint32_t x, uint32_t y,
     return dp;
 }
 
+uint8_t*
+rsOffsetNs(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    Allocation_t *alloc = (Allocation_t *)a.p;
+    uint8_t *p = (uint8_t *)alloc->mHal.drvState.lod[0].mallocPtr;
+    const uint32_t stride = alloc->mHal.drvState.lod[0].stride;
+    const uint32_t dimY = alloc->mHal.drvState.lod[0].dimY;
+    const uint32_t sizeOf = alloc->mHal.state.elementSizeBytes;
+    uint8_t *dp = &p[(sizeOf * x) + (y * stride) +
+                     (z * stride * dimY)];
+    return dp;
+}
+
 #define ELEMENT_AT(T) \
 \
     void \
@@ -290,3 +301,66 @@ extern const uchar __attribute__((overloadable))
     return pin[((x >> shift) * cstep) + ((y >> shift) * stride)];
 }
 
+
+#define VOP(T) \
+    extern void __rsAllocationVStoreXImpl_##T(rs_allocation a, const T val, uint32_t x, uint32_t y, uint32_t z); \
+    extern T __rsAllocationVLoadXImpl_##T(rs_allocation a, uint32_t x, uint32_t y, uint32_t z); \
+ \
+    extern void __attribute__((overloadable)) \
+    rsAllocationVStoreX_##T(rs_allocation a, T val, uint32_t x) { \
+        __rsAllocationVStoreXImpl_##T(a, val, x, 0, 0); \
+    } \
+    extern void __attribute__((overloadable)) \
+    rsAllocationVStoreX_##T(rs_allocation a, T val, uint32_t x, uint32_t y) { \
+        __rsAllocationVStoreXImpl_##T(a, val, x, y, 0); \
+    } \
+    extern void __attribute__((overloadable)) \
+    rsAllocationVStoreX_##T(rs_allocation a, T val, uint32_t x, uint32_t y, uint32_t z) { \
+        __rsAllocationVStoreXImpl_##T(a, val, x, y, z); \
+    } \
+    extern T __attribute__((overloadable)) \
+    rsAllocationVLoadX_##T(rs_allocation a, uint32_t x) { \
+        return __rsAllocationVLoadXImpl_##T(a, x, 0, 0); \
+    } \
+    extern T __attribute__((overloadable)) \
+    rsAllocationVLoadX_##T(rs_allocation a, uint32_t x, uint32_t y) { \
+        return __rsAllocationVLoadXImpl_##T(a, x, y, 0); \
+    } \
+    extern T __attribute__((overloadable)) \
+    rsAllocationVLoadX_##T(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) { \
+        return __rsAllocationVLoadXImpl_##T(a, x, y, z); \
+    }
+
+VOP(char2)
+VOP(char3)
+VOP(char4)
+VOP(uchar2)
+VOP(uchar3)
+VOP(uchar4)
+VOP(short2)
+VOP(short3)
+VOP(short4)
+VOP(ushort2)
+VOP(ushort3)
+VOP(ushort4)
+VOP(int2)
+VOP(int3)
+VOP(int4)
+VOP(uint2)
+VOP(uint3)
+VOP(uint4)
+VOP(long2)
+VOP(long3)
+VOP(long4)
+VOP(ulong2)
+VOP(ulong3)
+VOP(ulong4)
+VOP(float2)
+VOP(float3)
+VOP(float4)
+VOP(double2)
+VOP(double3)
+VOP(double4)
+
+#undef VOP
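For reference, the offset arithmetic in `rsOffsetNs` above is plain row-major indexing over the level-0 LOD surface. A standalone C sketch of the same computation (the struct accesses are replaced by plain parameters; the function name `offset_ns` is hypothetical):

    #include <stdint.h>

    /* Mirrors rsOffsetNs above: stride is the byte pitch of one row,
       dimY the number of rows in one Z slice, and sizeOf the element
       size in bytes (elementSizeBytes in the driver state). */
    static uint8_t *offset_ns(uint8_t *base, uint32_t stride, uint32_t dimY,
                              uint32_t sizeOf, uint32_t x, uint32_t y, uint32_t z) {
        return &base[(sizeOf * x) + (y * stride) + (z * stride * dimY)];
    }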