1 files changed, 145 insertions, 97 deletions
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index db3cc7fa..a0564fc1 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -350,134 +350,180 @@ RsdCpuReferenceImpl::~RsdCpuReferenceImpl() {
 }
 
 typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);
-typedef void (*walk_loop_t)(MTLaunchStruct*,
-                            RsExpandKernelParams&,
-                            outer_foreach_t);
 
-
-static void walk_wrapper(void* usr, uint32_t idx, walk_loop_t walk_loop) {
+static void wc_xy(void *usr, uint32_t idx) {
     MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
 
-    uint32_t inLen = mtls->fep.inLen;
-
     RsExpandKernelParams kparams;
     kparams.takeFields(mtls->fep);
 
     // Used by CpuScriptGroup, IntrinsicBlur, and IntrinsicHistogram
     kparams.lid = idx;
 
-    if (inLen > 0) {
-        // Allocate space for our input base pointers.
-        kparams.ins = (const void**)alloca(inLen * sizeof(void*));
+    outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+    while (1) {
+        uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
+        uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
+        uint32_t yEnd   = yStart + mtls->mSliceSize;
 
-        // Allocate space for our input stride information.
-        kparams.inEStrides = (uint32_t*)alloca(inLen * sizeof(uint32_t));
+        yEnd = rsMin(yEnd, mtls->yEnd);
 
-        // Fill our stride information.
-        for (int inIndex = inLen; --inIndex >= 0;) {
-          kparams.inEStrides[inIndex] = mtls->fep.inStrides[inIndex].eStride;
+        if (yEnd <= yStart) {
+            return;
+        }
+
+        //ALOGE("usr idx %i, x %i,%i  y %i,%i", idx, mtls->xStart, mtls->xEnd, yStart, yEnd);
+        //ALOGE("usr ptr in %p,  out %p", mtls->fep.ptrIn, mtls->fep.ptrOut);
+
+        for (kparams.y = yStart; kparams.y < yEnd; kparams.y++) {
+            kparams.out = mtls->fep.ptrOut +
+                          (mtls->fep.yStrideOut * kparams.y) +
+                          (mtls->fep.eStrideOut * mtls->xStart);
+
+            kparams.in = mtls->fep.ptrIn +
+                         (mtls->fep.yStrideIn * kparams.y) +
+                         (mtls->fep.eStrideIn * mtls->xStart);
+
+
+            fn(&kparams, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn,
+               mtls->fep.eStrideOut);
         }
     }
+}
+
+static void wc_x(void *usr, uint32_t idx) {
+    MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
+
+    RsExpandKernelParams kparams;
+    kparams.takeFields(mtls->fep);
+
+    // Used by CpuScriptGroup, IntrinsicBlur, and IntrinsicHistogram
+    kparams.lid = idx;
 
     outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+    while (1) {
+        uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
+        uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
+        uint32_t xEnd   = xStart + mtls->mSliceSize;
+
+        xEnd = rsMin(xEnd, mtls->xEnd);
+
+        if (xEnd <= xStart) {
+            return;
+        }
+
+        //ALOGE("usr slice %i idx %i, x %i,%i", slice, idx, xStart, xEnd);
+        //ALOGE("usr ptr in %p,  out %p", mtls->fep.ptrIn, mtls->fep.ptrOut);
 
-    walk_loop(mtls, kparams, fn);
+        kparams.out = mtls->fep.ptrOut + (mtls->fep.eStrideOut * xStart);
+        kparams.in  = mtls->fep.ptrIn  + (mtls->fep.eStrideIn  * xStart);
+
+        fn(&kparams, xStart, xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
+    }
 }
 
-static void walk_2d(void *usr, uint32_t idx) {
-    walk_wrapper(usr, idx, [](MTLaunchStruct *mtls,
-                              RsExpandKernelParams &kparams,
-                              outer_foreach_t fn) {
+void RsdCpuReferenceImpl::launchThreads(const Allocation * ain, Allocation * aout,
+                                        const RsScriptCall *sc, MTLaunchStruct *mtls) {
 
-        while (1) {
-            uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
-            uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
-            uint32_t yEnd   = yStart + mtls->mSliceSize;
+    //android::StopWatch kernel_time("kernel time");
 
-            yEnd = rsMin(yEnd, mtls->yEnd);
+    if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) {
+        const size_t targetByteChunk = 16 * 1024;
+        mInForEach = true;
+        if (mtls->fep.dimY > 1) {
+            uint32_t s1 = mtls->fep.dimY / ((mWorkers.mCount + 1) * 4);
+            uint32_t s2 = 0;
 
-            if (yEnd <= yStart) {
-                return;
+            // This chooses our slice size to rate limit atomic ops to
+            // one per 16k bytes of reads/writes.
+            if (mtls->fep.yStrideOut) {
+                s2 = targetByteChunk / mtls->fep.yStrideOut;
+            } else {
+                s2 = targetByteChunk / mtls->fep.yStrideIn;
             }
+            mtls->mSliceSize = rsMin(s1, s2);
 
-            for (kparams.y = yStart; kparams.y < yEnd; kparams.y++) {
-                kparams.out = mtls->fep.outPtr +
-                              (mtls->fep.outStride.yStride * kparams.y) +
-                              (mtls->fep.outStride.eStride * mtls->xStart);
+            if(mtls->mSliceSize < 1) {
+                mtls->mSliceSize = 1;
+            }
 
-                for (int inIndex = mtls->fep.inLen; --inIndex >= 0;) {
-                    StridePair &strides = mtls->fep.inStrides[inIndex];
+         //   mtls->mSliceSize = 2;
+            launchThreads(wc_xy, mtls);
+        } else {
+            uint32_t s1 = mtls->fep.dimX / ((mWorkers.mCount + 1) * 4);
+            uint32_t s2 = 0;
 
-                    kparams.ins[inIndex] =
-                      mtls->fep.inPtrs[inIndex] +
-                      (strides.yStride * kparams.y) +
-                      (strides.eStride * mtls->xStart);
-                }
+            // This chooses our slice size to rate limit atomic ops to
+            // one per 16k bytes of reads/writes.
+            if (mtls->fep.eStrideOut) {
+                s2 = targetByteChunk / mtls->fep.eStrideOut;
+            } else {
+                s2 = targetByteChunk / mtls->fep.eStrideIn;
+            }
+            mtls->mSliceSize = rsMin(s1, s2);
 
-                // Kernels now get their input strides from kparams.
-                fn(&kparams, mtls->xStart, mtls->xEnd, 0,
-                   mtls->fep.outStride.eStride);
+            if(mtls->mSliceSize < 1) {
+                mtls->mSliceSize = 1;
             }
+
+            launchThreads(wc_x, mtls);
         }
-    });
-}
+        mInForEach = false;
 
-static void walk_1d(void *usr, uint32_t idx) {
-    walk_wrapper(usr, idx, [](MTLaunchStruct *mtls,
-                              RsExpandKernelParams &kparams,
-                              outer_foreach_t fn) {
+        //ALOGE("launch 1");
+    } else {
+        RsExpandKernelParams kparams;
+        kparams.takeFields(mtls->fep);
+
+        //ALOGE("launch 3");
+        outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+        for (uint32_t arrayIndex = mtls->arrayStart;
+             arrayIndex < mtls->arrayEnd; arrayIndex++) {
 
-        while (1) {
-            uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
-            uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
-            uint32_t xEnd   = xStart + mtls->mSliceSize;
+            for (kparams.z = mtls->zStart; kparams.z < mtls->zEnd;
+                 kparams.z++) {
 
-            xEnd = rsMin(xEnd, mtls->xEnd);
+                for (kparams.y = mtls->yStart; kparams.y < mtls->yEnd;
+                     kparams.y++) {
 
-            if (xEnd <= xStart) {
-                return;
-            }
+                    uint32_t offset =
+                      kparams.dimY * kparams.dimZ * arrayIndex +
+                      kparams.dimY * kparams.z + kparams.y;
 
-            kparams.out = mtls->fep.outPtr +
-                          (mtls->fep.outStride.eStride * xStart);
+                    kparams.out = mtls->fep.ptrOut +
+                                  (mtls->fep.yStrideOut * offset) +
+                                  (mtls->fep.eStrideOut * mtls->xStart);
 
-            for (int inIndex = mtls->fep.inLen; --inIndex >= 0;) {
-                StridePair &strides = mtls->fep.inStrides[inIndex];
+                    kparams.in = mtls->fep.ptrIn +
+                                 (mtls->fep.yStrideIn * offset) +
+                                 (mtls->fep.eStrideIn * mtls->xStart);
 
-                kparams.ins[inIndex] =
-                  mtls->fep.inPtrs[inIndex] + (strides.eStride * xStart);
+                    fn(&kparams, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn,
+                       mtls->fep.eStrideOut);
+                }
             }
-
-            // Kernels now get their input strides from kparams.
-            fn(&kparams, xStart, xEnd, 0, mtls->fep.outStride.eStride);
         }
-    });
+    }
 }
 
-
-void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains,
-                                        uint32_t inLen,
-                                        Allocation* aout,
-                                        const RsScriptCall* sc,
-                                        MTLaunchStruct* mtls) {
+void RsdCpuReferenceImpl::launchThreads(const Allocation** ains, uint32_t inLen, Allocation* aout,
+                                        const RsScriptCall* sc, MTLaunchStruct* mtls) {
 
     //android::StopWatch kernel_time("kernel time");
 
     if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) {
         const size_t targetByteChunk = 16 * 1024;
         mInForEach = true;
-
         if (mtls->fep.dimY > 1) {
             uint32_t s1 = mtls->fep.dimY / ((mWorkers.mCount + 1) * 4);
             uint32_t s2 = 0;
 
             // This chooses our slice size to rate limit atomic ops to
             // one per 16k bytes of reads/writes.
-            if (mtls->fep.outStride.yStride) {
-                s2 = targetByteChunk / mtls->fep.outStride.yStride;
+            if (mtls->fep.yStrideOut) {
+                s2 = targetByteChunk / mtls->fep.yStrideOut;
             } else {
-                // We know that there is either an output or an input.
-                s2 = targetByteChunk / mtls->fep.inStrides[0].yStride;
+                s2 = targetByteChunk / mtls->fep.yStrideIn;
             }
             mtls->mSliceSize = rsMin(s1, s2);
 
@@ -485,18 +531,18 @@ void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains,
                 mtls->mSliceSize = 1;
             }
 
-            launchThreads(walk_2d, mtls);
+         //   mtls->mSliceSize = 2;
+            launchThreads(wc_xy, mtls);
         } else {
             uint32_t s1 = mtls->fep.dimX / ((mWorkers.mCount + 1) * 4);
             uint32_t s2 = 0;
 
             // This chooses our slice size to rate limit atomic ops to
             // one per 16k bytes of reads/writes.
-            if (mtls->fep.outStride.eStride) {
-                s2 = targetByteChunk / mtls->fep.outStride.eStride;
+            if (mtls->fep.eStrideOut) {
+                s2 = targetByteChunk / mtls->fep.eStrideOut;
             } else {
-                // We know that there is either an output or an input.
-                s2 = targetByteChunk / mtls->fep.inStrides[0].eStride;
+                s2 = targetByteChunk / mtls->fep.eStrideIn;
             }
             mtls->mSliceSize = rsMin(s1, s2);
 
@@ -504,26 +550,24 @@ void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains,
                 mtls->mSliceSize = 1;
             }
 
-            launchThreads(walk_1d, mtls);
+            launchThreads(wc_x, mtls);
         }
         mInForEach = false;
 
+        //ALOGE("launch 1");
     } else {
         RsExpandKernelParams kparams;
         kparams.takeFields(mtls->fep);
 
-        if (inLen > 0) {
-            // Allocate space for our input base pointers.
-            kparams.ins = (const void**)alloca(inLen * sizeof(void*));
+        // Allocate space for our input base pointers.
+        kparams.ins = new const void*[inLen];
 
-            // Allocate space for our input stride information.
-            kparams.inEStrides = (uint32_t*)alloca(inLen * sizeof(uint32_t));
+        // Allocate space for our input stride information.
+        kparams.eStrideIns = new uint32_t[inLen];
 
-            // Fill our stride information.
-            for (int inIndex = inLen; --inIndex >= 0;) {
-                kparams.inEStrides[inIndex] =
-                    mtls->fep.inStrides[inIndex].eStride;
-            }
+        // Fill our stride information.
+        for (int inIndex = inLen; --inIndex >= 0;) {
+          kparams.eStrideIns[inIndex] = mtls->fep.inStrides[inIndex].eStride;
         }
 
         //ALOGE("launch 3");
@@ -541,15 +585,15 @@ void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains,
                       mtls->fep.dimY * mtls->fep.dimZ * arrayIndex +
                       mtls->fep.dimY * kparams.z + kparams.y;
 
-                    kparams.out = mtls->fep.outPtr +
-                                  (mtls->fep.outStride.yStride * offset) +
-                                  (mtls->fep.outStride.eStride * mtls->xStart);
+                    kparams.out = mtls->fep.ptrOut +
+                                  (mtls->fep.yStrideOut * offset) +
+                                  (mtls->fep.eStrideOut * mtls->xStart);
 
                     for (int inIndex = inLen; --inIndex >= 0;) {
                         StridePair &strides = mtls->fep.inStrides[inIndex];
 
                         kparams.ins[inIndex] =
-                          mtls->fep.inPtrs[inIndex] +
+                          mtls->fep.ptrIns[inIndex] +
                           (strides.yStride * offset) +
                           (strides.eStride * mtls->xStart);
                     }
@@ -560,10 +604,14 @@ void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains,
                      * that points to an array.
                      */
                     fn(&kparams, mtls->xStart, mtls->xEnd, 0,
-                       mtls->fep.outStride.eStride);
+                       mtls->fep.eStrideOut);
                 }
             }
         }
+
+        // Free our arrays.
+        delete[] kparams.ins;
+        delete[] kparams.eStrideIns;
     }
 }