7 files changed, 426 insertions, 2 deletions
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index 062a916f..ddc87f96 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -21,6 +21,7 @@ LOCAL_SRC_FILES:= \
 	rsCpuRuntimeStubs.cpp \
 	rsCpuScriptGroup.cpp \
 	rsCpuIntrinsic.cpp \
+	rsCpuIntrinsic3DLUT.cpp \
 	rsCpuIntrinsicBlend.cpp \
 	rsCpuIntrinsicBlur.cpp \
 	rsCpuIntrinsicColorMatrix.cpp \
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index e22b730c..75fc3f1f 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -433,6 +433,8 @@ RsdCpuReference::CpuScript * RsdCpuReferenceImpl::createScript(const ScriptC *s,
     return i;
 }
 
+extern RsdCpuScriptImpl * rsdIntrinsic_3DLUT(RsdCpuReferenceImpl *ctx,
+                                             const Script *s, const Element *e);  // defined in rsCpuIntrinsic3DLUT.cpp
 extern RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx,
                                                    const Script *s, const Element *e);
 extern RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
@@ -453,6 +455,9 @@ RsdCpuReference::CpuScript * RsdCpuReferenceImpl::createIntrinsic(const Script *
 
     RsdCpuScriptImpl *i = NULL;
     switch (iid) {
+    case RS_SCRIPT_INTRINSIC_ID_3DLUT:
+        i = rsdIntrinsic_3DLUT(this, s, e);
+        break;
     case RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3:
         i = rsdIntrinsic_Convolve3x3(this, s, e);
         break;
diff --git a/cpu_ref/rsCpuIntrinsic3DLUT.cpp b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
new file mode 100644
index 00000000..7bb89103
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
@@ -0,0 +1,191 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "rsCpuIntrinsic.h"
+#include "rsCpuIntrinsicInlines.h"
+
+using namespace android;
+using namespace android::renderscript;
+
+namespace android {
+namespace renderscript {
+
+
+class RsdCpuScriptIntrinsic3DLUT : public RsdCpuScriptIntrinsic {  // CPU reference implementation of the 3D LUT intrinsic
+public:
+    virtual void populateScript(Script *);  // reports a single exported variable (the LUT)
+    virtual void invokeFreeChildren();      // clears mLUT
+    // Binds the LUT allocation; slot is asserted to be 0.
+    virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
+
+    virtual ~RsdCpuScriptIntrinsic3DLUT();
+    RsdCpuScriptIntrinsic3DLUT(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
+
+protected:
+    ObjectBaseRef<Allocation> mLUT;  // cube sampled by kernel(); assigned in setGlobalObj()
+    // Worker for one span of pixels; the constructor stores it in mRootPtr.
+    static void kernel(const RsForEachStubParamStruct *p,
+                       uint32_t xstart, uint32_t xend,
+                       uint32_t instep, uint32_t outstep);
+};
+
+}
+}
+
+
+void RsdCpuScriptIntrinsic3DLUT::setGlobalObj(uint32_t slot, ObjectBase *data) {
+    rsAssert(slot == 0);  // populateScript() exports exactly one variable, so 0 is the only valid slot
+    mLUT.set(static_cast<Allocation *>(data));  // data is taken to be the LUT Allocation; released again in invokeFreeChildren()
+}
+
+extern "C" void rsdIntrinsic3DLUT_K(void *dst, const void *src, const void *lut,
+                                    size_t lut_stride_y, size_t lut_stride_z,
+                                    uint32_t count, const void *constants);  // NEON routine in rsCpuIntrinsics_neon.S; count is in pixel pairs
+
+
+void RsdCpuScriptIntrinsic3DLUT::kernel(const RsForEachStubParamStruct *p,
+                                      uint32_t xstart, uint32_t xend,
+                                      uint32_t instep, uint32_t outstep) {
+    // Maps each uchar4 pixel of the span [xstart, xend) through the 3D LUT:
+    // r/g/b pick a cell along the cube's x/y/z axes, the output is blended in
+    // fixed point from the 8 corners of that cell, and alpha is copied from
+    // the input pixel.
+    //   p               - p->usr is the owning intrinsic; p->in and p->out
+    //                     are consumed pixel by pixel starting at xstart.
+    //   xstart, xend    - half-open range of x coordinates to process.
+    //   instep, outstep - not used; both buffers are walked as uchar4.
+    RsdCpuScriptIntrinsic3DLUT *cp = (RsdCpuScriptIntrinsic3DLUT *)p->usr;
+
+    // Without a LUT bound through setGlobalObj() every access below would go
+    // through a NULL allocation, so flag the misuse and leave the span alone.
+    rsAssert(cp->mLUT.get() != NULL);
+    if (cp->mLUT.get() == NULL) {
+        return;
+    }
+
+    uchar4 *out = (uchar4 *)p->out;
+    uchar4 *in = (uchar4 *)p->in;
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+
+    const uchar *bp = (const uchar *)cp->mLUT->mHal.drvState.lod[0].mallocPtr;
+
+    // Casts keep the brace initialization free of implicit narrowing.
+    int4 dims = {
+        (int)cp->mLUT->mHal.drvState.lod[0].dimX,
+        (int)cp->mLUT->mHal.drvState.lod[0].dimY,
+        (int)cp->mLUT->mHal.drvState.lod[0].dimZ,
+        0
+    };
+    // coordMul scales an 8-bit channel to a LUT coordinate with 15 fractional
+    // bits, i.e. channel * coordMul == coordinate * 0x8000.
+    const float4 m = (float4)(1.f / 255.f) * convert_float4(dims - 1);
+    const int4 coordMul = convert_int4(m * (float4)0x8000);
+    const size_t stride_y = cp->mLUT->mHal.drvState.lod[0].stride;
+    const size_t stride_z = stride_y * cp->mLUT->mHal.drvState.lod[0].dimY;
+
+    while (x1 < x2) {
+#if defined(ARCH_ARM_HAVE_NEON)
+        // The NEON routine consumes two pixels per count; the "- 1" leaves at
+        // least one pixel for the scalar code that follows.
+        int32_t len = (x2 - x1 - 1) >> 1;
+        if(len > 0) {
+            // Explicit casts: the int lanes and 0xffff would otherwise be
+            // narrowing conversions inside the brace initializer.
+            const short neon_constants[] = {
+                (short)coordMul.x, (short)coordMul.y, (short)coordMul.z, 0,
+                0, 0, 0, (short)0xffff,
+            };
+            rsdIntrinsic3DLUT_K(out, in, bp, stride_y, stride_z, len, neon_constants);
+            x1 += len << 1;
+            out += len << 1;
+            in += len << 1;
+        }
+#endif
+
+        // The integer part of each coordinate selects the cell; the low 15
+        // bits are the blend weight towards the next texel on that axis.
+        int4 baseCoord = convert_int4(*in) * coordMul;
+        int4 coord1 = baseCoord >> (int4)15;
+        int4 weight2 = baseCoord & 0x7fff;
+        int4 weight1 = (int4)0x8000 - weight2;
+
+        // Rows at (y, z), (y+1, z), (y, z+1) and (y+1, z+1); element [0] of
+        // each row is the texel at x and element [1] the texel at x+1.
+        const uchar *bp2 = bp + (coord1.x * 4) + (coord1.y * stride_y) + (coord1.z * stride_z);
+        const uchar4 *pt_00 = (const uchar4 *)&bp2[0];
+        const uchar4 *pt_10 = (const uchar4 *)&bp2[stride_y];
+        const uchar4 *pt_01 = (const uchar4 *)&bp2[stride_z];
+        const uchar4 *pt_11 = (const uchar4 *)&bp2[stride_y + stride_z];
+
+        uint4 v000 = convert_uint4(pt_00[0]);
+        uint4 v100 = convert_uint4(pt_00[1]);
+        uint4 v010 = convert_uint4(pt_10[0]);
+        uint4 v110 = convert_uint4(pt_10[1]);
+        uint4 v001 = convert_uint4(pt_01[0]);
+        uint4 v101 = convert_uint4(pt_01[1]);
+        uint4 v011 = convert_uint4(pt_11[0]);
+        uint4 v111 = convert_uint4(pt_11[1]);
+
+        // Blend along x (keeping 8 fractional bits), then along y, then z.
+        uint4 yz00 = ((v000 * weight1.x) + (v100 * weight2.x)) >> (int4)7;
+        uint4 yz10 = ((v010 * weight1.x) + (v110 * weight2.x)) >> (int4)7;
+        uint4 yz01 = ((v001 * weight1.x) + (v101 * weight2.x)) >> (int4)7;
+        uint4 yz11 = ((v011 * weight1.x) + (v111 * weight2.x)) >> (int4)7;
+
+        // Parenthesized so the shift visibly applies to the whole sum.
+        uint4 z0 = ((yz00 * weight1.y) + (yz10 * weight2.y)) >> (int4)15;
+        uint4 z1 = ((yz01 * weight1.y) + (yz11 * weight2.y)) >> (int4)15;
+        uint4 v = ((z0 * weight1.z) + (z1 * weight2.z)) >> (int4)15;
+        uint4 v2 = (v + 0x7f) >> (int4)8;  // NOTE(review): the NEON path rounds with vrshrn #8 (bias 0x80) - confirm 0x7f is intended
+
+        uchar4 ret = convert_uchar4(v2);
+        ret.a = in->a;
+        *out = ret;
+
+        in++;
+        out++;
+        x1++;
+    }
+}
+
+RsdCpuScriptIntrinsic3DLUT::RsdCpuScriptIntrinsic3DLUT(RsdCpuReferenceImpl *ctx,
+                                                     const Script *s, const Element *e)
+            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_3DLUT) {
+    // Install kernel() in mRootPtr (a member declared outside this class).
+    mRootPtr = &kernel;
+}
+
+RsdCpuScriptIntrinsic3DLUT::~RsdCpuScriptIntrinsic3DLUT() {  // no explicit teardown; mLUT is cleared in invokeFreeChildren()
+}
+
+void RsdCpuScriptIntrinsic3DLUT::populateScript(Script *s) {
+    s->mHal.info.exportedVariableCount = 1;  // the one variable is the LUT, bound at slot 0 by setGlobalObj()
+}
+
+void RsdCpuScriptIntrinsic3DLUT::invokeFreeChildren() {
+    mLUT.clear();  // let go of the allocation stored by setGlobalObj()
+}
+
+
+RsdCpuScriptImpl * rsdIntrinsic_3DLUT(RsdCpuReferenceImpl *ctx,
+                                    const Script *s, const Element *e) {
+    // Factory called from RsdCpuReferenceImpl::createIntrinsic() for RS_SCRIPT_INTRINSIC_ID_3DLUT.
+    return new RsdCpuScriptIntrinsic3DLUT(ctx, s, e);
+}
+
+
diff --git a/cpu_ref/rsCpuIntrinsicInlines.h b/cpu_ref/rsCpuIntrinsicInlines.h
index ab11b4f5..d6644cab 100644
--- a/cpu_ref/rsCpuIntrinsicInlines.h
+++ b/cpu_ref/rsCpuIntrinsicInlines.h
@@ -57,6 +57,16 @@ static inline int4 convert_int4(uchar4 i) {
     return f4;
 }
 
+static inline uint4 convert_uint4(uchar4 i) {
+    uint4 widened = {i.x, i.y, i.z, i.w};  // lane-by-lane widening of the unsigned bytes
+    return widened;
+}
+
+static inline int4 convert_int4(float4 i) {
+    int4 f4 = {(int)i.x, (int)i.y, (int)i.z, (int)i.w};  // explicit casts: float -> int inside braces is a narrowing conversion
+    return f4;
+}
+
 static inline short4 convert_short4(uchar4 i) {
     short4 f4 = {i.x, i.y, i.z, i.w};
     return f4;
@@ -67,6 +77,11 @@ static inline float4 convert_float4(uchar4 i) {
     return f4;
 }
 
+static inline float4 convert_float4(int4 i) {
+    float4 f4 = {(float)i.x, (float)i.y, (float)i.z, (float)i.w};  // explicit casts: int -> float inside braces is a narrowing conversion
+    return f4;
+}
+
 static inline uchar4 convert_uchar4(short4 i) {
     uchar4 f4 = {(uchar)i.x, (uchar)i.y, (uchar)i.z, (uchar)i.w};
     return f4;
@@ -77,6 +92,11 @@ static inline uchar4 convert_uchar4(int4 i) {
     return f4;
 }
 
+static inline uchar4 convert_uchar4(uint4 i) {
+    uchar4 narrowed = {(uchar)i.x, (uchar)i.y, (uchar)i.z, (uchar)i.w};  // plain cast per lane, no saturation
+    return narrowed;
+}
+
 static inline uchar4 convert_uchar4(float4 i) {
     uchar4 f4 = {(uchar)i.x, (uchar)i.y, (uchar)i.z, (uchar)i.w};
     return f4;
diff --git a/cpu_ref/rsCpuIntrinsics_neon.S b/cpu_ref/rsCpuIntrinsics_neon.S
index 07e9ffbe..76e709e3 100644
--- a/cpu_ref/rsCpuIntrinsics_neon.S
+++ b/cpu_ref/rsCpuIntrinsics_neon.S
@@ -1557,3 +1557,210 @@ ENTRY(rsdIntrinsicBlendSub_K)
         bx              lr
 END(rsdIntrinsicBlendSub_K)
 
+
+/* 3D LUT */
+
+/*
+        r0 = dst
+        r1 = src
+        r2 = cube base pointer
+        r3 = cube Y stride
+        r4 = cube Z stride
+        r5 = count
+        r10 = * constants
+
+        d0  / q0  = weight 1 p1
+        d1        = weight 2 p1
+
+        d2  / q1  = weight 1 p2
+        d3        = weight 2 p2
+
+        d4  / q2  = src1
+        d5        = src2
+
+        d6  / q3  = baseCoord
+        d7        = baseCoord
+
+        d8  / q4  = coord1 p1
+        d9        =
+
+        d10 / q5  = coord1 p2
+        d11       =
+
+        d12 / q6  =
+        d13       =
+
+        d14 / q7  =
+        d15       =
+
+
+        d16 / q8  = x0 y0 z0
+        d17       = x1 y0 z0
+        d18 / q9  = x0 y1 z0
+        d19       = x1 y1 z0
+        d20 / q10 = x0 y0 z1
+        d21       = x1 y0 z1
+        d22 / q11 = x0 y1 z1
+        d23       = x1 y1 z1
+
+        d24 / q12 = alpha mask
+        d25       = current pixel alpha
+        d26 / q13 = 4, y stride
+        d27       = z stride, 0
+        d28 / q14 = 0x8000
+        d29       = 0x7fff
+        d30 / q15 = 0, 0, 0, 0xffff
+
+
+        d31 = coordMult
+*/
+
+ENTRY(rsdIntrinsic3DLUT_K)
+        push        {r4-r8, r10, r11, lr}
+        vpush       {q4-q7}
+        /* The two saves above occupy 32 + 64 bytes, hence the offsets used for the stack arguments. */
+        /* load Z stride in r4 */
+        ldr     r4, [sp, #32 + 64]
+
+        /* Load count */
+        ldr     r5, [sp, #36 + 64]
+
+        vmov.u16 d28, #0x8000        // 0x8000 per 16-bit lane; weight 1 = 0x8000 - weight 2
+        vmov.u16 d29, #0x7fff        // mask selecting the 15 fractional bits of baseCoord
+        vmov.u32 d24, #0xff000000    // top byte of each 32-bit pixel, used as the alpha mask
+
+        /* load constants using r10 */
+        ldr     r10, [sp, #40 + 64]
+        vld1.32 {d31}, [r10]!        // d31 = first four 16-bit constants (coordMul lanes)
+        vld1.32 {d30}, [r10]!        // d30 = remaining constants; not referenced again in this routine
+
+        mov r6, #4
+        vmov d26, r6, r3             // d26 = {4, y stride}
+        mov r6, #0
+        vmov d27, r4, r6             // d27 = {z stride, 0}
+
+        add r8, r3, r4               // r8 = y stride + z stride
+
+        /* Main loop: each pass converts two pixels, called p1 and p2 below. */
+
+1:
+        vld1.8 {d4}, [r1]!           // fetch two source pixels (8 bytes) and advance src
+        vand.u8 d25, d4, d24         // d25 = alpha bytes of both pixels
+        vmovl.u8 q2, d4              // widen to 16-bit lanes: d4 = p1, d5 = p2
+
+        /* p1: baseCoord = channel * coordMul */
+        vmull.u16 q3, d4, d31
+        vshr.u32 q4, q3, #15       // coord1 p1
+        vmovn.u32 d1, q3
+        vand.u16 d1, d29           // weight 2
+        vsub.u16 d0, d28, d1       // weight 1
+        vmul.u32 q4, q4, q13           // q4 = x*4, y*ystride, z*zstride, 0
+        /* p2: same steps for the second pixel */
+        vmull.u16 q3, d5, d31
+        vshr.u32 q5, q3, #15       // coord1 p2
+        vmovn.u32 d3, q3
+        vand.u16 d3, d29           // weight 2
+        vsub.u16 d2, d28, d3       // weight 1
+        vmul.u32 q5, q5, q13       // q5 = x*4, y*ystride, z*zstride, 0
+
+        vpadd.u32 d8, d8, d9       // pairwise sums of the p1 offsets
+        vpadd.u32 d9, d10, d11     // pairwise sums of the p2 offsets
+        vpadd.u32 d8, d8, d9       // d8 = {p1 byte offset, p2 byte offset}
+        vmov r6, r7, d8            // base pointers
+
+        add  r6, r6, r2            // r6 = address of p1's cell in the cube
+        add  r7, r7, r2            // r7 = address of p2's cell in the cube
+        /* p1: each 8-byte load fetches the texels at x and x+1 of one row */
+        vld1.8 {d16}, [r6]
+        add r11, r6, r3
+        vld1.8 {d18}, [r11]          // row at +y stride
+        add r11, r6, r4
+        vld1.8 {d20}, [r11]          // row at +z stride
+        add r11, r6, r8
+        vld1.8 {d22}, [r11]          // row at +y stride +z stride
+
+        vmovl.u8 q8, d16
+        vmovl.u8 q9, d18
+        vmovl.u8 q10, d20
+        vmovl.u8 q11, d22
+        /* blend along x with weights d0[0] / d1[0]; the >> 7 keeps 8 fractional bits */
+        vmull.u16 q6, d16, d0[0]
+        vmlal.u16 q6, d17, d1[0]
+        vshrn.u32 d16, q6, #7
+        vmull.u16 q6, d18, d0[0]
+        vmlal.u16 q6, d19, d1[0]
+        vshrn.u32 d18, q6, #7
+        vmull.u16 q6, d20, d0[0]
+        vmlal.u16 q6, d21, d1[0]
+        vshrn.u32 d20, q6, #7
+        vmull.u16 q6, d22, d0[0]
+        vmlal.u16 q6, d23, d1[0]
+        vshrn.u32 d22, q6, #7
+        /* blend along y with weights d0[1] / d1[1] */
+        vmull.u16 q6, d16, d0[1]
+        vmlal.u16 q6, d18, d1[1]
+        vshrn.u32 d16, q6, #15
+        vmull.u16 q6, d20, d0[1]
+        vmlal.u16 q6, d22, d1[1]
+        vshrn.u32 d18, q6, #15
+        /* blend along z with weights d0[2] / d1[2] */
+        vmull.u16 q6, d16, d0[2]
+        vmlal.u16 q6, d18, d1[2]
+        vshrn.u32 d14, q6, #15       // d14 = p1 result, 16-bit lanes
+
+        /* p2: repeat the fetch and the three blends using r7 and weights d2 / d3 */
+        vld1.8 {d16}, [r7]
+        add r11, r7, r3
+        vld1.8 {d18}, [r11]
+        add r11, r7, r4
+        vld1.8 {d20}, [r11]
+        add r11, r7, r8
+        vld1.8 {d22}, [r11]
+        vmovl.u8 q8, d16
+        vmovl.u8 q9, d18
+        vmovl.u8 q10, d20
+        vmovl.u8 q11, d22
+
+        vmull.u16 q6, d16, d2[0]
+        vmlal.u16 q6, d17, d3[0]
+        vshrn.u32 d16, q6, #7
+        vmull.u16 q6, d18, d2[0]
+        vmlal.u16 q6, d19, d3[0]
+        vshrn.u32 d18, q6, #7
+        vmull.u16 q6, d20, d2[0]
+        vmlal.u16 q6, d21, d3[0]
+        vshrn.u32 d20, q6, #7
+        vmull.u16 q6, d22, d2[0]
+        vmlal.u16 q6, d23, d3[0]
+        vshrn.u32 d22, q6, #7
+
+        vmull.u16 q6, d16, d2[1]
+        vmlal.u16 q6, d18, d3[1]
+        vshrn.u32 d16, q6, #15
+        vmull.u16 q6, d20, d2[1]
+        vmlal.u16 q6, d22, d3[1]
+        vshrn.u32 d18, q6, #15
+
+        vmull.u16 q6, d16, d2[2]
+        vmlal.u16 q6, d18, d3[2]
+        vshrn.u32 d15, q6, #15       // d15 = p2 result, 16-bit lanes
+
+        vrshrn.u16 d14, q7, #8       // rounding shift by 8 and narrow: both pixels packed to bytes in d14
+
+        vbic.u8 d14, d14, d24  // mix in alpha
+        vorr.u8 d14, d14, d25        // alpha bytes come from the source pixels saved in d25
+        vst1.32 {d14}, [r0]!         // store two output pixels and advance dst
+
+
+        /* Are we done? */
+        subs r5, r5, #1              // r5 counts the remaining pixel pairs
+        bne 1b
+
+        /* Yup, bye */
+        vpop            {q4-q7}
+        pop         {r4-r8, r10, r11, lr}
+        bx          lr
+
+END(rsdIntrinsic3DLUT_K)
+
+
diff --git a/rsDefines.h b/rsDefines.h
index b1f10419..dfa0f9d5 100644
--- a/rsDefines.h
+++ b/rsDefines.h
@@ -350,7 +350,8 @@ enum RsScriptIntrinsicID {
     RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5 = 4,
     RS_SCRIPT_INTRINSIC_ID_BLUR = 5,
     RS_SCRIPT_INTRINSIC_ID_YUV_TO_RGB = 6,
-    RS_SCRIPT_INTRINSIC_ID_BLEND = 7
+    RS_SCRIPT_INTRINSIC_ID_BLEND = 7,
+    RS_SCRIPT_INTRINSIC_ID_3DLUT = 8
 };
 
 typedef struct {
diff --git a/rsScriptIntrinsic.h b/rsScriptIntrinsic.h
index 310cbec1..696f2db6 100644
--- a/rsScriptIntrinsic.h
+++ b/rsScriptIntrinsic.h
@@ -53,7 +53,6 @@ public:
     virtual uint32_t run(Context *);
 protected:
     uint32_t mIntrinsicID;
-    float mParams[9];
 
 };