summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--cpu_ref/Android.mk1
-rw-r--r--cpu_ref/rsCpuCore.cpp5
-rw-r--r--cpu_ref/rsCpuIntrinsic3DLUT.cpp191
-rw-r--r--cpu_ref/rsCpuIntrinsicInlines.h20
-rw-r--r--cpu_ref/rsCpuIntrinsics_neon.S207
-rw-r--r--rsDefines.h3
-rw-r--r--rsScriptIntrinsic.h1
7 files changed, 426 insertions, 2 deletions
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index 062a916f..ddc87f96 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -21,6 +21,7 @@ LOCAL_SRC_FILES:= \
rsCpuRuntimeStubs.cpp \
rsCpuScriptGroup.cpp \
rsCpuIntrinsic.cpp \
+ rsCpuIntrinsic3DLUT.cpp \
rsCpuIntrinsicBlend.cpp \
rsCpuIntrinsicBlur.cpp \
rsCpuIntrinsicColorMatrix.cpp \
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index e22b730c..75fc3f1f 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -433,6 +433,8 @@ RsdCpuReference::CpuScript * RsdCpuReferenceImpl::createScript(const ScriptC *s,
return i;
}
+extern RsdCpuScriptImpl * rsdIntrinsic_3DLUT(RsdCpuReferenceImpl *ctx,
+ const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx,
const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
@@ -453,6 +455,9 @@ RsdCpuReference::CpuScript * RsdCpuReferenceImpl::createIntrinsic(const Script *
RsdCpuScriptImpl *i = NULL;
switch (iid) {
+ case RS_SCRIPT_INTRINSIC_ID_3DLUT:
+ i = rsdIntrinsic_3DLUT(this, s, e);
+ break;
case RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3:
i = rsdIntrinsic_Convolve3x3(this, s, e);
break;
diff --git a/cpu_ref/rsCpuIntrinsic3DLUT.cpp b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
new file mode 100644
index 00000000..7bb89103
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
@@ -0,0 +1,191 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "rsCpuIntrinsic.h"
+#include "rsCpuIntrinsicInlines.h"
+
+using namespace android;
+using namespace android::renderscript;
+
+namespace android {
+namespace renderscript {
+
+
+class RsdCpuScriptIntrinsic3DLUT : public RsdCpuScriptIntrinsic { // CPU reference implementation of the 3D LUT intrinsic: holds the LUT allocation and provides the kernel that samples it.
+public:
+ virtual void populateScript(Script *); // Reports one exported variable on the script.
+ virtual void invokeFreeChildren(); // Drops the reference held on the LUT allocation.
+
+ virtual void setGlobalObj(uint32_t slot, ObjectBase *data); // Binds the object for slot 0 as the LUT; other slots trip an assert.
+
+ virtual ~RsdCpuScriptIntrinsic3DLUT();
+ RsdCpuScriptIntrinsic3DLUT(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e); // Forwards to the base class with RS_SCRIPT_INTRINSIC_ID_3DLUT and installs kernel as the root function.
+
+protected:
+ ObjectBaseRef<Allocation> mLUT; // LUT allocation sampled by kernel(); set through setGlobalObj().
+
+ static void kernel(const RsForEachStubParamStruct *p, // p->usr is cast back to this class; p->in / p->out are read as uchar4 pixels.
+ uint32_t xstart, uint32_t xend, // Pixels in [xstart, xend) are processed.
+ uint32_t instep, uint32_t outstep); // Not used by the implementation; pixels advance one uchar4 at a time.
+};
+
+}
+}
+
+
+void RsdCpuScriptIntrinsic3DLUT::setGlobalObj(uint32_t slot, ObjectBase *data) { // Stores the object bound to a script global as the LUT sampled by kernel().
+ rsAssert(slot == 0); // Slot 0 is the only slot this intrinsic expects.
+ mLUT.set(static_cast<Allocation *>(data)); // Unchecked downcast: data is presumed to be an Allocation — TODO confirm callers guarantee this.
+}
+
+extern "C" void rsdIntrinsic3DLUT_K(void *dst, const void *src, const void *lut, // NEON routine (rsCpuIntrinsics_neon.S): dst/src are pixel buffers, lut is the base of the LUT data.
+ size_t lut_stride_y, size_t lut_stride_z, // Byte strides between consecutive y rows and z slices of the LUT.
+ uint32_t count, const void *constants); // count = loop iterations, each consuming two pixels; constants = eight 16-bit values (coordMul x,y,z,0 then 0,0,0,0xffff as built by the caller).
+
+
+/*
+ * Row kernel of the 3D LUT intrinsic.
+ *
+ * Each input pixel is read as a uchar4.  Its x/y/z channels are scaled into
+ * LUT cell coordinates with 15 fractional bits, the eight LUT entries around
+ * that position are blended (along x, then y, then z) using the fractional
+ * parts as weights, and the result is rounded back to 8 bits per channel.
+ * The fourth (alpha) channel of the input is copied to the output unchanged.
+ *
+ * p               - launch parameters: p->usr is the owning intrinsic, p->in
+ *                   and p->out are read/written as uchar4 pixels.
+ * xstart, xend    - pixels in [xstart, xend) are processed.
+ * instep, outstep - unused; pixels advance by one uchar4 at a time.
+ *
+ * NOTE(review): mLUT is dereferenced unconditionally.  A launch issued before
+ * setGlobalObj() has bound a LUT would crash here — confirm callers guarantee
+ * the ordering.
+ */
+void RsdCpuScriptIntrinsic3DLUT::kernel(const RsForEachStubParamStruct *p,
+                                        uint32_t xstart, uint32_t xend,
+                                        uint32_t instep, uint32_t outstep) {
+    RsdCpuScriptIntrinsic3DLUT *cp = (RsdCpuScriptIntrinsic3DLUT *)p->usr;
+
+    uchar4 *out = (uchar4 *)p->out;
+    uchar4 *in = (uchar4 *)p->in;
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+
+    const uchar *bp = (const uchar *)cp->mLUT->mHal.drvState.lod[0].mallocPtr;
+
+    int4 dims = {
+        (int)cp->mLUT->mHal.drvState.lod[0].dimX,
+        (int)cp->mLUT->mHal.drvState.lod[0].dimY,
+        (int)cp->mLUT->mHal.drvState.lod[0].dimZ,
+        0
+    };
+    // Factor taking an 8-bit channel (0..255) to a LUT coordinate (0..dim-1)
+    // with 15 fractional bits.
+    const float4 m = (float4)(1.f / 255.f) * convert_float4(dims - 1);
+    const int4 coordMul = convert_int4(m * (float4)0x8000);
+    const size_t stride_y = cp->mLUT->mHal.drvState.lod[0].stride;
+    const size_t stride_z = stride_y * cp->mLUT->mHal.drvState.lod[0].dimY;
+
+#if defined(ARCH_ARM_HAVE_NEON)
+    // The NEON routine always loads the entry at index + 1 next to the one it
+    // addresses.  That is only inside the LUT if no 8-bit input can land on
+    // the last cell of an axis, so check the largest reachable index once.
+    const bool pairLoadsInBounds =
+            (((coordMul.x * 255) >> 15) + 1 < dims.x) &&
+            (((coordMul.y * 255) >> 15) + 1 < dims.y) &&
+            (((coordMul.z * 255) >> 15) + 1 < dims.z);
+
+    if (pairLoadsInBounds && x1 < x2) {
+        // Two pixels per iteration; at least one pixel is always left for the
+        // scalar loop below.
+        const int32_t len = (x2 - x1 - 1) >> 1;
+        if (len > 0) {
+            // Explicit casts: the values are handed over as raw 16-bit lanes.
+            const short neon_constants[] = {
+                (short)coordMul.x, (short)coordMul.y, (short)coordMul.z, 0,
+                0, 0, 0, (short)0xffff,
+            };
+
+            rsdIntrinsic3DLUT_K(out, in, bp, stride_y, stride_z, len, neon_constants);
+            x1 += len << 1;
+            out += len << 1;
+            in += len << 1;
+        }
+    }
+#endif
+
+    while (x1 < x2) {
+        int4 baseCoord = convert_int4(*in) * coordMul;
+        int4 coord1 = baseCoord >> (int4)15;
+
+        int4 weight2 = baseCoord & 0x7fff;
+        int4 weight1 = (int4)0x8000 - weight2;
+
+        // Byte offsets to the +1 neighbour on each axis.  On the last cell of
+        // an axis the neighbour does not exist, so sample the cell itself
+        // instead of reading past the LUT; the fractional weight is zero in
+        // that situation, so the blended value is unaffected.
+        const size_t next_x = (coord1.x + 1 < dims.x) ? 4 : 0;
+        const size_t next_y = (coord1.y + 1 < dims.y) ? stride_y : 0;
+        const size_t next_z = (coord1.z + 1 < dims.z) ? stride_z : 0;
+
+        const uchar *bp2 = bp + (coord1.x * 4) + (coord1.y * stride_y) + (coord1.z * stride_z);
+
+        // vXYZ: 1 selects the +1 neighbour on that axis.
+        uint4 v000 = convert_uint4(*(const uchar4 *)&bp2[0]);
+        uint4 v100 = convert_uint4(*(const uchar4 *)&bp2[next_x]);
+        uint4 v010 = convert_uint4(*(const uchar4 *)&bp2[next_y]);
+        uint4 v110 = convert_uint4(*(const uchar4 *)&bp2[next_y + next_x]);
+        uint4 v001 = convert_uint4(*(const uchar4 *)&bp2[next_z]);
+        uint4 v101 = convert_uint4(*(const uchar4 *)&bp2[next_z + next_x]);
+        uint4 v011 = convert_uint4(*(const uchar4 *)&bp2[next_y + next_z]);
+        uint4 v111 = convert_uint4(*(const uchar4 *)&bp2[next_y + next_z + next_x]);
+
+        // Blend along x; the >> 7 leaves 8 fractional bits on the result.
+        uint4 yz00 = ((v000 * weight1.x) + (v100 * weight2.x)) >> (int4)7;
+        uint4 yz10 = ((v010 * weight1.x) + (v110 * weight2.x)) >> (int4)7;
+        uint4 yz01 = ((v001 * weight1.x) + (v101 * weight2.x)) >> (int4)7;
+        uint4 yz11 = ((v011 * weight1.x) + (v111 * weight2.x)) >> (int4)7;
+
+        // Blend along y, then z.
+        uint4 z0 = ((yz00 * weight1.y) + (yz10 * weight2.y)) >> (int4)15;
+        uint4 z1 = ((yz01 * weight1.y) + (yz11 * weight2.y)) >> (int4)15;
+
+        uint4 v = ((z0 * weight1.z) + (z1 * weight2.z)) >> (int4)15;
+        // Drop the remaining 8 fractional bits.
+        uint4 v2 = (v + 0x7f) >> (int4)8;
+
+        uchar4 ret = convert_uchar4(v2);
+        ret.a = in->a;
+        *out = ret;
+
+        in++;
+        out++;
+        x1++;
+    }
+}
+
+RsdCpuScriptIntrinsic3DLUT::RsdCpuScriptIntrinsic3DLUT(RsdCpuReferenceImpl *ctx, // Constructs the intrinsic; all arguments are forwarded to the base class.
+ const Script *s, const Element *e)
+ : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_3DLUT) { // Tags the base with the 3D LUT intrinsic id.
+
+ mRootPtr = &kernel; // Install the static kernel as this script's root function.
+}
+
+RsdCpuScriptIntrinsic3DLUT::~RsdCpuScriptIntrinsic3DLUT() { // Empty body: no explicit cleanup is performed here.
+}
+
+void RsdCpuScriptIntrinsic3DLUT::populateScript(Script *s) { // Fills in the script's HAL info for this intrinsic.
+ s->mHal.info.exportedVariableCount = 1; // One exported variable; setGlobalObj() accepts slot 0 only.
+}
+
+void RsdCpuScriptIntrinsic3DLUT::invokeFreeChildren() { // Releases objects this intrinsic holds references to.
+ mLUT.clear(); // Drop the reference on the LUT allocation; kernel() must not run afterwards without a new LUT being set.
+}
+
+
+RsdCpuScriptImpl * rsdIntrinsic_3DLUT(RsdCpuReferenceImpl *ctx, // Factory used by RsdCpuReferenceImpl::createIntrinsic for RS_SCRIPT_INTRINSIC_ID_3DLUT.
+ const Script *s, const Element *e) {
+
+ return new RsdCpuScriptIntrinsic3DLUT(ctx, s, e); // Heap-allocated; the returned raw pointer is handed to the caller.
+}
+
+
diff --git a/cpu_ref/rsCpuIntrinsicInlines.h b/cpu_ref/rsCpuIntrinsicInlines.h
index ab11b4f5..d6644cab 100644
--- a/cpu_ref/rsCpuIntrinsicInlines.h
+++ b/cpu_ref/rsCpuIntrinsicInlines.h
@@ -57,6 +57,16 @@ static inline int4 convert_int4(uchar4 i) {
return f4;
}
+static inline uint4 convert_uint4(uchar4 i) { // Widens each of the four uchar lanes to an unsigned int lane.
+ uint4 f4 = {i.x, i.y, i.z, i.w}; // Lane-by-lane implicit integer promotion; values are preserved.
+ return f4;
+}
+
+static inline int4 convert_int4(float4 i) { // Converts each of the four float lanes to an int lane.
+ int4 f4 = {i.x, i.y, i.z, i.w}; // Implicit float-to-int conversion per lane, which discards the fractional part.
+ return f4;
+}
+
static inline short4 convert_short4(uchar4 i) {
short4 f4 = {i.x, i.y, i.z, i.w};
return f4;
@@ -67,6 +77,11 @@ static inline float4 convert_float4(uchar4 i) {
return f4;
}
+static inline float4 convert_float4(int4 i) { // Converts each of the four int lanes to a float lane.
+ float4 f4 = {i.x, i.y, i.z, i.w}; // Implicit int-to-float conversion per lane.
+ return f4;
+}
+
static inline uchar4 convert_uchar4(short4 i) {
uchar4 f4 = {(uchar)i.x, (uchar)i.y, (uchar)i.z, (uchar)i.w};
return f4;
@@ -77,6 +92,11 @@ static inline uchar4 convert_uchar4(int4 i) {
return f4;
}
+static inline uchar4 convert_uchar4(uint4 i) { // Narrows each of the four unsigned int lanes to a uchar lane.
+ uchar4 f4 = {(uchar)i.x, (uchar)i.y, (uchar)i.z, (uchar)i.w}; // Plain casts: values above 255 wrap (low 8 bits kept), they are not saturated.
+ return f4;
+}
+
static inline uchar4 convert_uchar4(float4 i) {
uchar4 f4 = {(uchar)i.x, (uchar)i.y, (uchar)i.z, (uchar)i.w};
return f4;
diff --git a/cpu_ref/rsCpuIntrinsics_neon.S b/cpu_ref/rsCpuIntrinsics_neon.S
index 07e9ffbe..76e709e3 100644
--- a/cpu_ref/rsCpuIntrinsics_neon.S
+++ b/cpu_ref/rsCpuIntrinsics_neon.S
@@ -1557,3 +1557,210 @@ ENTRY(rsdIntrinsicBlendSub_K)
bx lr
END(rsdIntrinsicBlendSub_K)
+
+/* 3D LUT */
+
+/*
+ r0 = dst
+ r1 = src
+ r2 = cube base pointer
+ r3 = cube Y stride
+ r4 = cube Z stride
+ r5 = count
+ r10 = * constants
+
+ d0 / q0 = weight 1 p1
+ d1 = weight 2 p1
+
+ d2 / q1 = weight 1 p2
+ d3 = weight 2 p2
+
+ d4 / q2 = src1
+ d5 = src2
+
+ d6 / q3 = baseCoord
+ d7 = baseCoord
+
+ d8 / q4 = coord1 p1
+ d9 =
+
+ d10 / q5 = coord1 p2
+ d11 =
+
+ d12 / q6 =
+ d13 =
+
+ d14 / q7 =
+ d15 =
+
+
+ d16 / q8 = x0 y0 z0
+ d17 = x1 y0 z0
+ d18 / q9 = x0 y1 z0
+ d19 = x1 y1 z0
+ d20 / q10 = x0 y0 z1
+ d21 = x1 y0 z1
+ d22 / q11 = x0 y1 z1
+ d23 = x1 y1 z1
+
+ d24 / q12 = alpha mask
+ d25 = current pixel alpha
+ d26 / q13 = 4, y stride
+ d27 = z stride, 0
+ d28 / q14 = 0x8000
+ d29 = 0x7fff
+ d30 / q15 = 0, 0, 0, 0xffff
+
+
+ d31 = coordMult
+*/
+
+ENTRY(rsdIntrinsic3DLUT_K)
+ push {r4-r8, r10, r11, lr} // 8 core registers = 32 bytes
+ vpush {q4-q7} // 4 quad registers = 64 bytes, so stack arguments start at sp + 32 + 64
+
+ /* load Z stride in r4 */
+ ldr r4, [sp, #32 + 64] // first stack argument (lut_stride_z in the C prototype)
+
+ /* Load count */
+ ldr r5, [sp, #36 + 64] // second stack argument: number of loop iterations, two pixels each
+
+ vmov.u16 d28, #0x8000 // 1.0 in 15-bit fixed point, used to form weight 1
+ vmov.u16 d29, #0x7fff // mask for the 15 fractional bits (weight 2)
+ vmov.u32 d24, #0xff000000 // selects the top byte of each 32-bit pixel (alpha)
+
+ /* load constants using r10 */
+ ldr r10, [sp, #40 + 64] // third stack argument: pointer to eight 16-bit constants
+ vld1.32 {d31}, [r10]! // first four constants: coordMul
+ vld1.32 {d30}, [r10]! // next four constants; d30 is not referenced again in this routine
+
+ mov r6, #4
+ vmov d26, r6, r3 // d26 = {4, y stride}
+ mov r6, #0
+ vmov d27, r4, r6 // d27 = {z stride, 0}; q13 now holds the per-axis byte multipliers
+
+ add r8, r3, r4 // r8 = y stride + z stride (offset of the y+1, z+1 row)
+
+
+
+1:
+ vld1.8 {d4}, [r1]! // load two source pixels (8 bytes) and advance src
+ vand.u8 d25, d4, d24 // keep only the alpha bytes of both pixels for the final merge
+ vmovl.u8 q2, d4 // widen to 16 bits: d4 = pixel 1 channels, d5 = pixel 2 channels
+
+
+ vmull.u16 q3, d4, d31 // baseCoord for pixel 1 = channel * coordMul (32-bit lanes)
+ vshr.u32 q4, q3, #15 // coord1 p1
+ vmovn.u32 d1, q3 // low 16 bits of each baseCoord lane
+ vand.u16 d1, d29 // weight 2
+ vsub.u16 d0, d28, d1 // weight 1
+ vmul.u32 q4, q4, q13 // q4 = x*4, y*ystride, z*zstride, 0
+
+ vmull.u16 q3, d5, d31 // same steps for pixel 2
+ vshr.u32 q5, q3, #15 // coord1 p2
+ vmovn.u32 d3, q3
+ vand.u16 d3, d29 // weight 2
+ vsub.u16 d2, d28, d3 // weight 1
+ vmul.u32 q5, q5, q13 // q5 = x*4, y*ystride, z*zstride, 0
+
+ vpadd.u32 d8, d8, d9 // pairwise adds collapse the four offset lanes of pixel 1 ...
+ vpadd.u32 d9, d10, d11 // ... and of pixel 2 ...
+ vpadd.u32 d8, d8, d9 // ... into d8 = {byte offset p1, byte offset p2}
+ vmov r6, r7, d8 // base pointers
+
+ add r6, r6, r2 // r6 = LUT address for pixel 1
+ add r7, r7, r2 // r7 = LUT address for pixel 2
+
+ vld1.8 {d16}, [r6] // each 8-byte load fetches the entry at x and the one at x+1
+ add r11, r6, r3
+ vld1.8 {d18}, [r11] // y+1 row
+ add r11, r6, r4
+ vld1.8 {d20}, [r11] // z+1 slice
+ add r11, r6, r8
+ vld1.8 {d22}, [r11] // y+1 row of the z+1 slice
+
+ vmovl.u8 q8, d16 // widen: low d register = x entry, high d register = x+1 entry
+ vmovl.u8 q9, d18
+ vmovl.u8 q10, d20
+ vmovl.u8 q11, d22
+
+ vmull.u16 q6, d16, d0[0] // blend along x with the x weights (lane 0), for each of the four rows
+ vmlal.u16 q6, d17, d1[0]
+ vshrn.u32 d16, q6, #7 // narrowing by 7 keeps 8 fractional bits
+ vmull.u16 q6, d18, d0[0]
+ vmlal.u16 q6, d19, d1[0]
+ vshrn.u32 d18, q6, #7
+ vmull.u16 q6, d20, d0[0]
+ vmlal.u16 q6, d21, d1[0]
+ vshrn.u32 d20, q6, #7
+ vmull.u16 q6, d22, d0[0]
+ vmlal.u16 q6, d23, d1[0]
+ vshrn.u32 d22, q6, #7
+
+ vmull.u16 q6, d16, d0[1] // blend along y with the y weights (lane 1)
+ vmlal.u16 q6, d18, d1[1]
+ vshrn.u32 d16, q6, #15
+ vmull.u16 q6, d20, d0[1]
+ vmlal.u16 q6, d22, d1[1]
+ vshrn.u32 d18, q6, #15
+
+ vmull.u16 q6, d16, d0[2] // blend along z with the z weights (lane 2)
+ vmlal.u16 q6, d18, d1[2]
+ vshrn.u32 d14, q6, #15 // d14 = pixel 1 result, still carrying 8 fractional bits
+
+
+ vld1.8 {d16}, [r7] // repeat the whole fetch-and-blend sequence for pixel 2
+ add r11, r7, r3
+ vld1.8 {d18}, [r11]
+ add r11, r7, r4
+ vld1.8 {d20}, [r11]
+ add r11, r7, r8
+ vld1.8 {d22}, [r11]
+ vmovl.u8 q8, d16
+ vmovl.u8 q9, d18
+ vmovl.u8 q10, d20
+ vmovl.u8 q11, d22
+
+ vmull.u16 q6, d16, d2[0] // x blend, pixel 2 weights
+ vmlal.u16 q6, d17, d3[0]
+ vshrn.u32 d16, q6, #7
+ vmull.u16 q6, d18, d2[0]
+ vmlal.u16 q6, d19, d3[0]
+ vshrn.u32 d18, q6, #7
+ vmull.u16 q6, d20, d2[0]
+ vmlal.u16 q6, d21, d3[0]
+ vshrn.u32 d20, q6, #7
+ vmull.u16 q6, d22, d2[0]
+ vmlal.u16 q6, d23, d3[0]
+ vshrn.u32 d22, q6, #7
+
+ vmull.u16 q6, d16, d2[1] // y blend
+ vmlal.u16 q6, d18, d3[1]
+ vshrn.u32 d16, q6, #15
+ vmull.u16 q6, d20, d2[1]
+ vmlal.u16 q6, d22, d3[1]
+ vshrn.u32 d18, q6, #15
+
+ vmull.u16 q6, d16, d2[2] // z blend
+ vmlal.u16 q6, d18, d3[2]
+ vshrn.u32 d15, q6, #15 // d15 = pixel 2 result
+
+ vrshrn.u16 d14, q7, #8 // rounding narrow of both results (q7 = d14:d15) to 8 bits per channel
+
+ vbic.u8 d14, d14, d24 // mix in alpha
+ vorr.u8 d14, d14, d25 // insert the source alpha bytes saved in d25
+ vst1.32 {d14}, [r0]! // store two output pixels and advance dst
+
+
+ /* Are we done? */
+ subs r5, r5, #1 // decrement happens before the test, so count must be non-zero on entry
+ bne 1b
+
+ /* Yup, bye */
+ vpop {q4-q7}
+ pop {r4-r8, r10, r11, lr}
+ bx lr
+
+END(rsdIntrinsic3DLUT_K)
+
+
diff --git a/rsDefines.h b/rsDefines.h
index b1f10419..dfa0f9d5 100644
--- a/rsDefines.h
+++ b/rsDefines.h
@@ -350,7 +350,8 @@ enum RsScriptIntrinsicID {
RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5 = 4,
RS_SCRIPT_INTRINSIC_ID_BLUR = 5,
RS_SCRIPT_INTRINSIC_ID_YUV_TO_RGB = 6,
- RS_SCRIPT_INTRINSIC_ID_BLEND = 7
+ RS_SCRIPT_INTRINSIC_ID_BLEND = 7,
+ RS_SCRIPT_INTRINSIC_ID_3DLUT = 8
};
typedef struct {
diff --git a/rsScriptIntrinsic.h b/rsScriptIntrinsic.h
index 310cbec1..696f2db6 100644
--- a/rsScriptIntrinsic.h
+++ b/rsScriptIntrinsic.h
@@ -53,7 +53,6 @@ public:
virtual uint32_t run(Context *);
protected:
uint32_t mIntrinsicID;
- float mParams[9];
};