diff options
author | Jason Sams <jsams@google.com> | 2013-01-04 10:50:05 -0800 |
---|---|---|
committer | Jason Sams <jsams@google.com> | 2013-01-04 10:50:05 -0800 |
commit | 7c4b888f2147edf99690b6af75470774ff31c43b (patch) | |
tree | e75f935cafad69db6f3c44ebfade1048c997a76a | |
parent | e7d4df3dd02ec7505a2d83348af16e327d1fce3a (diff) | |
download | rs-7c4b888f2147edf99690b6af75470774ff31c43b.tar.gz |
Add a functional 3D LUT intrinsic (CPU reference implementation with a NEON fast path).
Processing a 1600x1000 input takes ~23ms on manta.
Change-Id: I142d6dedded66df05aa5f49e3da409a34c6e1b6e
-rw-r--r-- | cpu_ref/Android.mk | 1 | ||||
-rw-r--r-- | cpu_ref/rsCpuCore.cpp | 5 | ||||
-rw-r--r-- | cpu_ref/rsCpuIntrinsic3DLUT.cpp | 191 | ||||
-rw-r--r-- | cpu_ref/rsCpuIntrinsicInlines.h | 20 | ||||
-rw-r--r-- | cpu_ref/rsCpuIntrinsics_neon.S | 207 | ||||
-rw-r--r-- | rsDefines.h | 3 | ||||
-rw-r--r-- | rsScriptIntrinsic.h | 1 |
7 files changed, 426 insertions, 2 deletions
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk index 062a916f..ddc87f96 100644 --- a/cpu_ref/Android.mk +++ b/cpu_ref/Android.mk @@ -21,6 +21,7 @@ LOCAL_SRC_FILES:= \ rsCpuRuntimeStubs.cpp \ rsCpuScriptGroup.cpp \ rsCpuIntrinsic.cpp \ + rsCpuIntrinsic3DLUT.cpp \ rsCpuIntrinsicBlend.cpp \ rsCpuIntrinsicBlur.cpp \ rsCpuIntrinsicColorMatrix.cpp \ diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp index e22b730c..75fc3f1f 100644 --- a/cpu_ref/rsCpuCore.cpp +++ b/cpu_ref/rsCpuCore.cpp @@ -433,6 +433,8 @@ RsdCpuReference::CpuScript * RsdCpuReferenceImpl::createScript(const ScriptC *s, return i; } +extern RsdCpuScriptImpl * rsdIntrinsic_3DLUT(RsdCpuReferenceImpl *ctx, + const Script *s, const Element *e); extern RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e); extern RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx, @@ -453,6 +455,9 @@ RsdCpuReference::CpuScript * RsdCpuReferenceImpl::createIntrinsic(const Script * RsdCpuScriptImpl *i = NULL; switch (iid) { + case RS_SCRIPT_INTRINSIC_ID_3DLUT: + i = rsdIntrinsic_3DLUT(this, s, e); + break; case RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3: i = rsdIntrinsic_Convolve3x3(this, s, e); break; diff --git a/cpu_ref/rsCpuIntrinsic3DLUT.cpp b/cpu_ref/rsCpuIntrinsic3DLUT.cpp new file mode 100644 index 00000000..7bb89103 --- /dev/null +++ b/cpu_ref/rsCpuIntrinsic3DLUT.cpp @@ -0,0 +1,191 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#include "rsCpuIntrinsic.h" +#include "rsCpuIntrinsicInlines.h" + +using namespace android; +using namespace android::renderscript; + +namespace android { +namespace renderscript { + + +class RsdCpuScriptIntrinsic3DLUT : public RsdCpuScriptIntrinsic { +public: + virtual void populateScript(Script *); + virtual void invokeFreeChildren(); + + virtual void setGlobalObj(uint32_t slot, ObjectBase *data); + + virtual ~RsdCpuScriptIntrinsic3DLUT(); + RsdCpuScriptIntrinsic3DLUT(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e); + +protected: + ObjectBaseRef<Allocation> mLUT; + + static void kernel(const RsForEachStubParamStruct *p, + uint32_t xstart, uint32_t xend, + uint32_t instep, uint32_t outstep); +}; + +} +} + + +void RsdCpuScriptIntrinsic3DLUT::setGlobalObj(uint32_t slot, ObjectBase *data) { + rsAssert(slot == 0); + mLUT.set(static_cast<Allocation *>(data)); +} + +extern "C" void rsdIntrinsic3DLUT_K(void *dst, const void *src, const void *lut, + size_t lut_stride_y, size_t lut_stride_z, + uint32_t count, const void *constants); + + +void RsdCpuScriptIntrinsic3DLUT::kernel(const RsForEachStubParamStruct *p, + uint32_t xstart, uint32_t xend, + uint32_t instep, uint32_t outstep) { + RsdCpuScriptIntrinsic3DLUT *cp = (RsdCpuScriptIntrinsic3DLUT *)p->usr; + + uchar4 *out = (uchar4 *)p->out; + uchar4 *in = (uchar4 *)p->in; + uint32_t x1 = xstart; + uint32_t x2 = xend; + + const uchar *bp = (const uchar *)cp->mLUT->mHal.drvState.lod[0].mallocPtr; + + int4 dims = { + cp->mLUT->mHal.drvState.lod[0].dimX, + cp->mLUT->mHal.drvState.lod[0].dimY, + cp->mLUT->mHal.drvState.lod[0].dimZ, + 0 + }; + const float4 m = (float4)(1.f / 255.f) * convert_float4(dims - 1); + const int4 coordMul = convert_int4(m * (float4)0x8000); + const size_t stride_y = cp->mLUT->mHal.drvState.lod[0].stride; + const size_t stride_z = stride_y * cp->mLUT->mHal.drvState.lod[0].dimY; 
+ + //ALOGE("strides %zu %zu", stride_y, stride_z); + + while (x1 < x2) { +#if defined(ARCH_ARM_HAVE_NEON) + int32_t len = (x2 - x1 - 1) >> 1; + if(len > 0) { + const short neon_constants[] = { + coordMul.x, coordMul.y, coordMul.z, 0, + 0, 0, 0, 0xffff, + + }; + + rsdIntrinsic3DLUT_K(out, in, bp, stride_y, stride_z, len, neon_constants); + x1 += len << 1; + out += len << 1; + in += len << 1; + } + +#endif + + int4 baseCoord = convert_int4(*in) * coordMul; + int4 coord1 = baseCoord >> (int4)15; + //int4 coord2 = min(coord1 + 1, gDims - 1); + + int4 weight2 = baseCoord & 0x7fff; + int4 weight1 = (int4)0x8000 - weight2; + + //ALOGE("coord1 %08x %08x %08x %08x", coord1.x, coord1.y, coord1.z, coord1.w); + const uchar *bp2 = bp + (coord1.x * 4) + (coord1.y * stride_y) + (coord1.z * stride_z); + const uchar4 *pt_00 = (const uchar4 *)&bp2[0]; + const uchar4 *pt_10 = (const uchar4 *)&bp2[stride_y]; + const uchar4 *pt_01 = (const uchar4 *)&bp2[stride_z]; + const uchar4 *pt_11 = (const uchar4 *)&bp2[stride_y + stride_z]; + + uint4 v000 = convert_uint4(pt_00[0]); + uint4 v100 = convert_uint4(pt_00[1]); + uint4 v010 = convert_uint4(pt_10[0]); + uint4 v110 = convert_uint4(pt_10[1]); + uint4 v001 = convert_uint4(pt_01[0]); + uint4 v101 = convert_uint4(pt_01[1]); + uint4 v011 = convert_uint4(pt_11[0]); + uint4 v111 = convert_uint4(pt_11[1]); + + uint4 yz00 = ((v000 * weight1.x) + (v100 * weight2.x)) >> (int4)7; + uint4 yz10 = ((v010 * weight1.x) + (v110 * weight2.x)) >> (int4)7; + uint4 yz01 = ((v001 * weight1.x) + (v101 * weight2.x)) >> (int4)7; + uint4 yz11 = ((v011 * weight1.x) + (v111 * weight2.x)) >> (int4)7; + + uint4 z0 = (yz00 * weight1.y) + (yz10 * weight2.y) >> (int4)15; + uint4 z1 = (yz01 * weight1.y) + (yz11 * weight2.y) >> (int4)15; + + uint4 v = (z0 * weight1.z) + (z1 * weight2.z) >> (int4)15; + uint4 v2 = (v + 0x7f) >> (int4)8; + + uchar4 ret = convert_uchar4(v2); + ret.a = in->a; + + #if 0 + if (!x1) { + ALOGE("in %08x %08x %08x %08x", in->r, in->g, in->b, in->a); 
+ ALOGE("baseCoord %08x %08x %08x %08x", baseCoord.x, baseCoord.y, baseCoord.z, baseCoord.w); + ALOGE("coord1 %08x %08x %08x %08x", coord1.x, coord1.y, coord1.z, coord1.w); + ALOGE("weight1 %08x %08x %08x %08x", weight1.x, weight1.y, weight1.z, weight1.w); + ALOGE("weight2 %08x %08x %08x %08x", weight2.x, weight2.y, weight2.z, weight2.w); + + ALOGE("v000 %08x %08x %08x %08x", v000.x, v000.y, v000.z, v000.w); + ALOGE("v100 %08x %08x %08x %08x", v100.x, v100.y, v100.z, v100.w); + ALOGE("yz00 %08x %08x %08x %08x", yz00.x, yz00.y, yz00.z, yz00.w); + ALOGE("z0 %08x %08x %08x %08x", z0.x, z0.y, z0.z, z0.w); + + ALOGE("v %08x %08x %08x %08x", v.x, v.y, v.z, v.w); + ALOGE("v2 %08x %08x %08x %08x", v2.x, v2.y, v2.z, v2.w); + } + #endif + *out = ret; + + + in++; + out++; + x1++; + } +} + +RsdCpuScriptIntrinsic3DLUT::RsdCpuScriptIntrinsic3DLUT(RsdCpuReferenceImpl *ctx, + const Script *s, const Element *e) + : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_3DLUT) { + + mRootPtr = &kernel; +} + +RsdCpuScriptIntrinsic3DLUT::~RsdCpuScriptIntrinsic3DLUT() { +} + +void RsdCpuScriptIntrinsic3DLUT::populateScript(Script *s) { + s->mHal.info.exportedVariableCount = 1; +} + +void RsdCpuScriptIntrinsic3DLUT::invokeFreeChildren() { + mLUT.clear(); +} + + +RsdCpuScriptImpl * rsdIntrinsic_3DLUT(RsdCpuReferenceImpl *ctx, + const Script *s, const Element *e) { + + return new RsdCpuScriptIntrinsic3DLUT(ctx, s, e); +} + + diff --git a/cpu_ref/rsCpuIntrinsicInlines.h b/cpu_ref/rsCpuIntrinsicInlines.h index ab11b4f5..d6644cab 100644 --- a/cpu_ref/rsCpuIntrinsicInlines.h +++ b/cpu_ref/rsCpuIntrinsicInlines.h @@ -57,6 +57,16 @@ static inline int4 convert_int4(uchar4 i) { return f4; } +static inline uint4 convert_uint4(uchar4 i) { + uint4 f4 = {i.x, i.y, i.z, i.w}; + return f4; +} + +static inline int4 convert_int4(float4 i) { + int4 f4 = {i.x, i.y, i.z, i.w}; + return f4; +} + static inline short4 convert_short4(uchar4 i) { short4 f4 = {i.x, i.y, i.z, i.w}; return f4; @@ -67,6 +77,11 @@ 
static inline float4 convert_float4(uchar4 i) { return f4; } +static inline float4 convert_float4(int4 i) { + float4 f4 = {i.x, i.y, i.z, i.w}; + return f4; +} + static inline uchar4 convert_uchar4(short4 i) { uchar4 f4 = {(uchar)i.x, (uchar)i.y, (uchar)i.z, (uchar)i.w}; return f4; @@ -77,6 +92,11 @@ static inline uchar4 convert_uchar4(int4 i) { return f4; } +static inline uchar4 convert_uchar4(uint4 i) { + uchar4 f4 = {(uchar)i.x, (uchar)i.y, (uchar)i.z, (uchar)i.w}; + return f4; +} + static inline uchar4 convert_uchar4(float4 i) { uchar4 f4 = {(uchar)i.x, (uchar)i.y, (uchar)i.z, (uchar)i.w}; return f4; diff --git a/cpu_ref/rsCpuIntrinsics_neon.S b/cpu_ref/rsCpuIntrinsics_neon.S index 07e9ffbe..76e709e3 100644 --- a/cpu_ref/rsCpuIntrinsics_neon.S +++ b/cpu_ref/rsCpuIntrinsics_neon.S @@ -1557,3 +1557,210 @@ ENTRY(rsdIntrinsicBlendSub_K) bx lr END(rsdIntrinsicBlendSub_K) + +/* 3D LUT */ + +/* + r0 = dst + r1 = src + r2 = cube base pointer + r3 = cube Y stride + r4 = cube Z stride + r5 = count + xr10 = * constants + + d0 / q0 = weight 1 p1 + d1 = weight 2 p1 + + d2 / q1 = weight 1 p2 + d3 = weight 2 p2 + + d4 / q2 = src1 + d5 = src2 + + d6 / q3 = baseCoord + d7 = baseCoord + + d8 / q4 = coord1 p1 + d9 = + + d10 / q5 = coord1 p2 + d11 = + + d12 / q6 = + d13 = + + d14 / q7 = + d15 = + + + d16 / q8 = x0 y0 z0 + d17 = x1 y0 z0 + d18 / q9 = x0 y1 z0 + d19 = x1 y1 z0 + d20 / q10 = x0 y0 z1 + d21 = x1 y0 z1 + d22 / q11 = x0 y1 z1 + d23 = x1 y1 z1 + + d24 / q12 = alpha mash + d25 = current pixel alpha + d26 / q13 = 4, y stride + d27 = z stride, 0 + d28 / q14 = 0x8000 + d29 = 0x7fff + d30 / q15 = 0, 0, 0, 0xffff + + + d31 = coordMult +*/ + +ENTRY(rsdIntrinsic3DLUT_K) + push {r4-r8, r10, r11, lr} + vpush {q4-q7} + + /* load Z stride in r4 */ + ldr r4, [sp, #32 + 64] + + /* Load count */ + ldr r5, [sp, #36 + 64] + + vmov.u16 d28, #0x8000 + vmov.u16 d29, #0x7fff + vmov.u32 d24, #0xff000000 + + /* load constants using r10 */ + ldr r10, [sp, #40 + 64] + vld1.32 {d31}, [r10]! 
+ vld1.32 {d30}, [r10]! + + mov r6, #4 + vmov d26, r6, r3 + mov r6, #0 + vmov d27, r4, r6 + + add r8, r3, r4 + + + +1: + vld1.8 {d4}, [r1]! + vand.u8 d25, d4, d24 + vmovl.u8 q2, d4 + + + vmull.u16 q3, d4, d31 + vshr.u32 q4, q3, #15 // coord1 p1 + vmovn.u32 d1, q3 + vand.u16 d1, d29 // weight 2 + vsub.u16 d0, d28, d1 // weight 1 + vmul.u32 q4, q4, q13 // q4 = x*4, y*ystride, z*zstride, 0 + + vmull.u16 q3, d5, d31 + vshr.u32 q5, q3, #15 // coord1 p2 + vmovn.u32 d3, q3 + vand.u16 d3, d29 // weight 2 + vsub.u16 d2, d28, d3 // weight 1 + vmul.u32 q5, q5, q13 // q5 = x*4, y*ystride, z*zstride, 0 + + vpadd.u32 d8, d8, d9 + vpadd.u32 d9, d10, d11 + vpadd.u32 d8, d8, d9 + vmov r6, r7, d8 // base pointers + + add r6, r6, r2 + add r7, r7, r2 + + vld1.8 {d16}, [r6] + add r11, r6, r3 + vld1.8 {d18}, [r11] + add r11, r6, r4 + vld1.8 {d20}, [r11] + add r11, r6, r8 + vld1.8 {d22}, [r11] + + vmovl.u8 q8, d16 + vmovl.u8 q9, d18 + vmovl.u8 q10, d20 + vmovl.u8 q11, d22 + + vmull.u16 q6, d16, d0[0] + vmlal.u16 q6, d17, d1[0] + vshrn.u32 d16, q6, #7 + vmull.u16 q6, d18, d0[0] + vmlal.u16 q6, d19, d1[0] + vshrn.u32 d18, q6, #7 + vmull.u16 q6, d20, d0[0] + vmlal.u16 q6, d21, d1[0] + vshrn.u32 d20, q6, #7 + vmull.u16 q6, d22, d0[0] + vmlal.u16 q6, d23, d1[0] + vshrn.u32 d22, q6, #7 + + vmull.u16 q6, d16, d0[1] + vmlal.u16 q6, d18, d1[1] + vshrn.u32 d16, q6, #15 + vmull.u16 q6, d20, d0[1] + vmlal.u16 q6, d22, d1[1] + vshrn.u32 d18, q6, #15 + + vmull.u16 q6, d16, d0[2] + vmlal.u16 q6, d18, d1[2] + vshrn.u32 d14, q6, #15 + + + vld1.8 {d16}, [r7] + add r11, r7, r3 + vld1.8 {d18}, [r11] + add r11, r7, r4 + vld1.8 {d20}, [r11] + add r11, r7, r8 + vld1.8 {d22}, [r11] + vmovl.u8 q8, d16 + vmovl.u8 q9, d18 + vmovl.u8 q10, d20 + vmovl.u8 q11, d22 + + vmull.u16 q6, d16, d2[0] + vmlal.u16 q6, d17, d3[0] + vshrn.u32 d16, q6, #7 + vmull.u16 q6, d18, d2[0] + vmlal.u16 q6, d19, d3[0] + vshrn.u32 d18, q6, #7 + vmull.u16 q6, d20, d2[0] + vmlal.u16 q6, d21, d3[0] + vshrn.u32 d20, q6, #7 + vmull.u16 q6, d22, 
d2[0] + vmlal.u16 q6, d23, d3[0] + vshrn.u32 d22, q6, #7 + + vmull.u16 q6, d16, d2[1] + vmlal.u16 q6, d18, d3[1] + vshrn.u32 d16, q6, #15 + vmull.u16 q6, d20, d2[1] + vmlal.u16 q6, d22, d3[1] + vshrn.u32 d18, q6, #15 + + vmull.u16 q6, d16, d2[2] + vmlal.u16 q6, d18, d3[2] + vshrn.u32 d15, q6, #15 + + vrshrn.u16 d14, q7, #8 + + vbic.u8 d14, d14, d24 // mix in alpha + vorr.u8 d14, d14, d25 + vst1.32 {d14}, [r0]! + + + /* Are we done? */ + subs r5, r5, #1 + bne 1b + + /* Yup, bye */ + vpop {q4-q7} + pop {r4-r8, r10, r11, lr} + bx lr + +END(rsdIntrinsic3DLUT_K) + + diff --git a/rsDefines.h b/rsDefines.h index b1f10419..dfa0f9d5 100644 --- a/rsDefines.h +++ b/rsDefines.h @@ -350,7 +350,8 @@ enum RsScriptIntrinsicID { RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5 = 4, RS_SCRIPT_INTRINSIC_ID_BLUR = 5, RS_SCRIPT_INTRINSIC_ID_YUV_TO_RGB = 6, - RS_SCRIPT_INTRINSIC_ID_BLEND = 7 + RS_SCRIPT_INTRINSIC_ID_BLEND = 7, + RS_SCRIPT_INTRINSIC_ID_3DLUT = 8 }; typedef struct { diff --git a/rsScriptIntrinsic.h b/rsScriptIntrinsic.h index 310cbec1..696f2db6 100644 --- a/rsScriptIntrinsic.h +++ b/rsScriptIntrinsic.h @@ -53,7 +53,6 @@ public: virtual uint32_t run(Context *); protected: uint32_t mIntrinsicID; - float mParams[9]; }; |