summary | refs | log | tree | commit | diff
path: root/cpu_ref
diff options
context:
space:
mode:
author Stephen Hines <srhines@google.com> 2014-08-13 17:32:10 +0000
committer Stephen Hines <srhines@google.com> 2014-08-13 17:32:10 +0000
commit 4b2bea3dc20865f3a198797702e19912a6a2171c (patch)
tree b028521e6474ab22bc99571ead62e1e4f0cb2dc6 /cpu_ref
parent 818cfa034e257c7bb48356257f5cb67334e19aa6 (diff)
download rs-4b2bea3dc20865f3a198797702e19912a6a2171c.tar.gz
Revert "Collapse code paths for single- and multi-input kernels."
This reverts commit 818cfa034e257c7bb48356257f5cb67334e19aa6.

Change-Id: I59f39f52e6c8f60bb01cbcb8ccf2215eaf46a57f
Diffstat (limited to 'cpu_ref')
-rw-r--r-- cpu_ref/Android.mk | 2
-rw-r--r-- cpu_ref/rsCpuCore.cpp | 242
-rw-r--r-- cpu_ref/rsCpuCore.h | 65
-rw-r--r-- cpu_ref/rsCpuIntrinsic.cpp | 49
-rw-r--r-- cpu_ref/rsCpuIntrinsic.h | 55
-rw-r--r-- cpu_ref/rsCpuIntrinsic3DLUT.cpp | 10
-rw-r--r-- cpu_ref/rsCpuIntrinsicBlend.cpp | 5
-rw-r--r-- cpu_ref/rsCpuIntrinsicColorMatrix.cpp | 38
-rw-r--r-- cpu_ref/rsCpuIntrinsicHistogram.cpp | 58
-rw-r--r-- cpu_ref/rsCpuIntrinsicLUT.cpp | 4
-rw-r--r-- cpu_ref/rsCpuIntrinsicResize.cpp | 14
-rw-r--r-- cpu_ref/rsCpuScript.cpp | 214
-rw-r--r-- cpu_ref/rsCpuScript.h | 36
-rw-r--r-- cpu_ref/rsCpuScriptGroup.cpp | 103
-rw-r--r-- cpu_ref/rsd_cpu.h | 22
15 files changed, 537 insertions, 380 deletions
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index 729e7022..aeb75a65 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -91,7 +91,7 @@ include external/libcxx/libcxx.mk
endif
include frameworks/compile/libbcc/libbcc-targets.mk
-LOCAL_CFLAGS += $(rs_base_CFLAGS) -std=c++11
+LOCAL_CFLAGS += $(rs_base_CFLAGS)
LOCAL_MODULE_TAGS := optional
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index db3cc7fa..a0564fc1 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -350,134 +350,180 @@ RsdCpuReferenceImpl::~RsdCpuReferenceImpl() {
}
typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);
-typedef void (*walk_loop_t)(MTLaunchStruct*,
- RsExpandKernelParams&,
- outer_foreach_t);
-
-static void walk_wrapper(void* usr, uint32_t idx, walk_loop_t walk_loop) {
+static void wc_xy(void *usr, uint32_t idx) {
MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
- uint32_t inLen = mtls->fep.inLen;
-
RsExpandKernelParams kparams;
kparams.takeFields(mtls->fep);
// Used by CpuScriptGroup, IntrinsicBlur, and IntrinsicHistogram
kparams.lid = idx;
- if (inLen > 0) {
- // Allocate space for our input base pointers.
- kparams.ins = (const void**)alloca(inLen * sizeof(void*));
+ outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+ while (1) {
+ uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
+ uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
+ uint32_t yEnd = yStart + mtls->mSliceSize;
- // Allocate space for our input stride information.
- kparams.inEStrides = (uint32_t*)alloca(inLen * sizeof(uint32_t));
+ yEnd = rsMin(yEnd, mtls->yEnd);
- // Fill our stride information.
- for (int inIndex = inLen; --inIndex >= 0;) {
- kparams.inEStrides[inIndex] = mtls->fep.inStrides[inIndex].eStride;
+ if (yEnd <= yStart) {
+ return;
+ }
+
+ //ALOGE("usr idx %i, x %i,%i y %i,%i", idx, mtls->xStart, mtls->xEnd, yStart, yEnd);
+ //ALOGE("usr ptr in %p, out %p", mtls->fep.ptrIn, mtls->fep.ptrOut);
+
+ for (kparams.y = yStart; kparams.y < yEnd; kparams.y++) {
+ kparams.out = mtls->fep.ptrOut +
+ (mtls->fep.yStrideOut * kparams.y) +
+ (mtls->fep.eStrideOut * mtls->xStart);
+
+ kparams.in = mtls->fep.ptrIn +
+ (mtls->fep.yStrideIn * kparams.y) +
+ (mtls->fep.eStrideIn * mtls->xStart);
+
+
+ fn(&kparams, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn,
+ mtls->fep.eStrideOut);
}
}
+}
+
+static void wc_x(void *usr, uint32_t idx) {
+ MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
+
+ RsExpandKernelParams kparams;
+ kparams.takeFields(mtls->fep);
+
+ // Used by CpuScriptGroup, IntrinsicBlur, and IntrisicHistogram
+ kparams.lid = idx;
outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+ while (1) {
+ uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
+ uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
+ uint32_t xEnd = xStart + mtls->mSliceSize;
+
+ xEnd = rsMin(xEnd, mtls->xEnd);
+
+ if (xEnd <= xStart) {
+ return;
+ }
+
+ //ALOGE("usr slice %i idx %i, x %i,%i", slice, idx, xStart, xEnd);
+ //ALOGE("usr ptr in %p, out %p", mtls->fep.ptrIn, mtls->fep.ptrOut);
- walk_loop(mtls, kparams, fn);
+ kparams.out = mtls->fep.ptrOut + (mtls->fep.eStrideOut * xStart);
+ kparams.in = mtls->fep.ptrIn + (mtls->fep.eStrideIn * xStart);
+
+ fn(&kparams, xStart, xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
+ }
}
-static void walk_2d(void *usr, uint32_t idx) {
- walk_wrapper(usr, idx, [](MTLaunchStruct *mtls,
- RsExpandKernelParams &kparams,
- outer_foreach_t fn) {
+void RsdCpuReferenceImpl::launchThreads(const Allocation * ain, Allocation * aout,
+ const RsScriptCall *sc, MTLaunchStruct *mtls) {
- while (1) {
- uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
- uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
- uint32_t yEnd = yStart + mtls->mSliceSize;
+ //android::StopWatch kernel_time("kernel time");
- yEnd = rsMin(yEnd, mtls->yEnd);
+ if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) {
+ const size_t targetByteChunk = 16 * 1024;
+ mInForEach = true;
+ if (mtls->fep.dimY > 1) {
+ uint32_t s1 = mtls->fep.dimY / ((mWorkers.mCount + 1) * 4);
+ uint32_t s2 = 0;
- if (yEnd <= yStart) {
- return;
+ // This chooses our slice size to rate limit atomic ops to
+ // one per 16k bytes of reads/writes.
+ if (mtls->fep.yStrideOut) {
+ s2 = targetByteChunk / mtls->fep.yStrideOut;
+ } else {
+ s2 = targetByteChunk / mtls->fep.yStrideIn;
}
+ mtls->mSliceSize = rsMin(s1, s2);
- for (kparams.y = yStart; kparams.y < yEnd; kparams.y++) {
- kparams.out = mtls->fep.outPtr +
- (mtls->fep.outStride.yStride * kparams.y) +
- (mtls->fep.outStride.eStride * mtls->xStart);
+ if(mtls->mSliceSize < 1) {
+ mtls->mSliceSize = 1;
+ }
- for (int inIndex = mtls->fep.inLen; --inIndex >= 0;) {
- StridePair &strides = mtls->fep.inStrides[inIndex];
+ // mtls->mSliceSize = 2;
+ launchThreads(wc_xy, mtls);
+ } else {
+ uint32_t s1 = mtls->fep.dimX / ((mWorkers.mCount + 1) * 4);
+ uint32_t s2 = 0;
- kparams.ins[inIndex] =
- mtls->fep.inPtrs[inIndex] +
- (strides.yStride * kparams.y) +
- (strides.eStride * mtls->xStart);
- }
+ // This chooses our slice size to rate limit atomic ops to
+ // one per 16k bytes of reads/writes.
+ if (mtls->fep.eStrideOut) {
+ s2 = targetByteChunk / mtls->fep.eStrideOut;
+ } else {
+ s2 = targetByteChunk / mtls->fep.eStrideIn;
+ }
+ mtls->mSliceSize = rsMin(s1, s2);
- // Kernels now get their input strides from kparams.
- fn(&kparams, mtls->xStart, mtls->xEnd, 0,
- mtls->fep.outStride.eStride);
+ if(mtls->mSliceSize < 1) {
+ mtls->mSliceSize = 1;
}
+
+ launchThreads(wc_x, mtls);
}
- });
-}
+ mInForEach = false;
-static void walk_1d(void *usr, uint32_t idx) {
- walk_wrapper(usr, idx, [](MTLaunchStruct *mtls,
- RsExpandKernelParams &kparams,
- outer_foreach_t fn) {
+ //ALOGE("launch 1");
+ } else {
+ RsExpandKernelParams kparams;
+ kparams.takeFields(mtls->fep);
+
+ //ALOGE("launch 3");
+ outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+ for (uint32_t arrayIndex = mtls->arrayStart;
+ arrayIndex < mtls->arrayEnd; arrayIndex++) {
- while (1) {
- uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
- uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
- uint32_t xEnd = xStart + mtls->mSliceSize;
+ for (kparams.z = mtls->zStart; kparams.z < mtls->zEnd;
+ kparams.z++) {
- xEnd = rsMin(xEnd, mtls->xEnd);
+ for (kparams.y = mtls->yStart; kparams.y < mtls->yEnd;
+ kparams.y++) {
- if (xEnd <= xStart) {
- return;
- }
+ uint32_t offset =
+ kparams.dimY * kparams.dimZ * arrayIndex +
+ kparams.dimY * kparams.z + kparams.y;
- kparams.out = mtls->fep.outPtr +
- (mtls->fep.outStride.eStride * xStart);
+ kparams.out = mtls->fep.ptrOut +
+ (mtls->fep.yStrideOut * offset) +
+ (mtls->fep.eStrideOut * mtls->xStart);
- for (int inIndex = mtls->fep.inLen; --inIndex >= 0;) {
- StridePair &strides = mtls->fep.inStrides[inIndex];
+ kparams.in = mtls->fep.ptrIn +
+ (mtls->fep.yStrideIn * offset) +
+ (mtls->fep.eStrideIn * mtls->xStart);
- kparams.ins[inIndex] =
- mtls->fep.inPtrs[inIndex] + (strides.eStride * xStart);
+ fn(&kparams, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn,
+ mtls->fep.eStrideOut);
+ }
}
-
- // Kernels now get their input strides from kparams.
- fn(&kparams, xStart, xEnd, 0, mtls->fep.outStride.eStride);
}
- });
+ }
}
-
-void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains,
- uint32_t inLen,
- Allocation* aout,
- const RsScriptCall* sc,
- MTLaunchStruct* mtls) {
+void RsdCpuReferenceImpl::launchThreads(const Allocation** ains, uint32_t inLen, Allocation* aout,
+ const RsScriptCall* sc, MTLaunchStruct* mtls) {
//android::StopWatch kernel_time("kernel time");
if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) {
const size_t targetByteChunk = 16 * 1024;
mInForEach = true;
-
if (mtls->fep.dimY > 1) {
uint32_t s1 = mtls->fep.dimY / ((mWorkers.mCount + 1) * 4);
uint32_t s2 = 0;
// This chooses our slice size to rate limit atomic ops to
// one per 16k bytes of reads/writes.
- if (mtls->fep.outStride.yStride) {
- s2 = targetByteChunk / mtls->fep.outStride.yStride;
+ if (mtls->fep.yStrideOut) {
+ s2 = targetByteChunk / mtls->fep.yStrideOut;
} else {
- // We know that there is either an output or an input.
- s2 = targetByteChunk / mtls->fep.inStrides[0].yStride;
+ s2 = targetByteChunk / mtls->fep.yStrideIn;
}
mtls->mSliceSize = rsMin(s1, s2);
@@ -485,18 +531,18 @@ void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains,
mtls->mSliceSize = 1;
}
- launchThreads(walk_2d, mtls);
+ // mtls->mSliceSize = 2;
+ launchThreads(wc_xy, mtls);
} else {
uint32_t s1 = mtls->fep.dimX / ((mWorkers.mCount + 1) * 4);
uint32_t s2 = 0;
// This chooses our slice size to rate limit atomic ops to
// one per 16k bytes of reads/writes.
- if (mtls->fep.outStride.eStride) {
- s2 = targetByteChunk / mtls->fep.outStride.eStride;
+ if (mtls->fep.eStrideOut) {
+ s2 = targetByteChunk / mtls->fep.eStrideOut;
} else {
- // We know that there is either an output or an input.
- s2 = targetByteChunk / mtls->fep.inStrides[0].eStride;
+ s2 = targetByteChunk / mtls->fep.eStrideIn;
}
mtls->mSliceSize = rsMin(s1, s2);
@@ -504,26 +550,24 @@ void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains,
mtls->mSliceSize = 1;
}
- launchThreads(walk_1d, mtls);
+ launchThreads(wc_x, mtls);
}
mInForEach = false;
+ //ALOGE("launch 1");
} else {
RsExpandKernelParams kparams;
kparams.takeFields(mtls->fep);
- if (inLen > 0) {
- // Allocate space for our input base pointers.
- kparams.ins = (const void**)alloca(inLen * sizeof(void*));
+ // Allocate space for our input base pointers.
+ kparams.ins = new const void*[inLen];
- // Allocate space for our input stride information.
- kparams.inEStrides = (uint32_t*)alloca(inLen * sizeof(uint32_t));
+ // Allocate space for our input stride information.
+ kparams.eStrideIns = new uint32_t[inLen];
- // Fill our stride information.
- for (int inIndex = inLen; --inIndex >= 0;) {
- kparams.inEStrides[inIndex] =
- mtls->fep.inStrides[inIndex].eStride;
- }
+ // Fill our stride information.
+ for (int inIndex = inLen; --inIndex >= 0;) {
+ kparams.eStrideIns[inIndex] = mtls->fep.inStrides[inIndex].eStride;
}
//ALOGE("launch 3");
@@ -541,15 +585,15 @@ void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains,
mtls->fep.dimY * mtls->fep.dimZ * arrayIndex +
mtls->fep.dimY * kparams.z + kparams.y;
- kparams.out = mtls->fep.outPtr +
- (mtls->fep.outStride.yStride * offset) +
- (mtls->fep.outStride.eStride * mtls->xStart);
+ kparams.out = mtls->fep.ptrOut +
+ (mtls->fep.yStrideOut * offset) +
+ (mtls->fep.eStrideOut * mtls->xStart);
for (int inIndex = inLen; --inIndex >= 0;) {
StridePair &strides = mtls->fep.inStrides[inIndex];
kparams.ins[inIndex] =
- mtls->fep.inPtrs[inIndex] +
+ mtls->fep.ptrIns[inIndex] +
(strides.yStride * offset) +
(strides.eStride * mtls->xStart);
}
@@ -560,10 +604,14 @@ void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains,
* that points to an array.
*/
fn(&kparams, mtls->xStart, mtls->xEnd, 0,
- mtls->fep.outStride.eStride);
+ mtls->fep.eStrideOut);
}
}
}
+
+ // Free our arrays.
+ delete[] kparams.ins;
+ delete[] kparams.eStrideIns;
}
}
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
index 2fea3fcd..5d4b6cc5 100644
--- a/cpu_ref/rsCpuCore.h
+++ b/cpu_ref/rsCpuCore.h
@@ -25,8 +25,6 @@
#include <string>
-#define RS_KERNEL_INPUT_THRESHOLD 32
-
namespace bcc {
class BCCContext;
class RSCompilerDriver;
@@ -42,36 +40,31 @@ struct StridePair {
};
struct RsExpandKernelDriverInfo {
- const uint8_t **inPtrs;
- uint32_t inLen;
-
- uint8_t *outPtr;
-
- StridePair *inStrides;
- StridePair outStride;
+ const void *usr;
+ uint32_t usrLen;
uint32_t dimX;
uint32_t dimY;
uint32_t dimZ;
+ const uint8_t *ptrIn;
+ uint8_t *ptrOut;
+ uint32_t eStrideIn;
+ uint32_t eStrideOut;
+ uint32_t yStrideIn;
+ uint32_t yStrideOut;
uint32_t slot;
- const void *usr;
- uint32_t usrLen;
-
- bool heapAllocatedArrays;
-
- RsExpandKernelDriverInfo() : heapAllocatedArrays(false) {}
+ const uint8_t** ptrIns;
+ StridePair* inStrides;
~RsExpandKernelDriverInfo() {
- if (heapAllocatedArrays) {
- if (inPtrs != NULL) {
- delete[] inPtrs;
- }
-
- if (inStrides != NULL) {
- delete[] inStrides;
- }
+ if (ptrIns != NULL) {
+ delete[] ptrIns;
+ }
+
+ if (inStrides != NULL) {
+ delete[] inStrides;
}
}
};
@@ -79,13 +72,15 @@ struct RsExpandKernelDriverInfo {
struct RsExpandKernelParams {
// Used by kernels
- const void **ins;
- uint32_t *inEStrides;
+ const void *in;
void *out;
uint32_t y;
uint32_t z;
uint32_t lid;
+ const void **ins;
+ uint32_t *eStrideIns;
+
// Used by ScriptGroup and user kernels.
const void *usr;
@@ -120,13 +115,13 @@ typedef void (*WorkerCallback_t)(void *usr, uint32_t idx);
class RsdCpuScriptImpl;
class RsdCpuReferenceImpl;
-struct ScriptTLSStruct {
+typedef struct ScriptTLSStructRec {
android::renderscript::Context * mContext;
const android::renderscript::Script * mScript;
RsdCpuScriptImpl *mImpl;
-};
+} ScriptTLSStruct;
-struct MTLaunchStruct {
+typedef struct {
RsExpandKernelDriverInfo fep;
RsdCpuReferenceImpl *rsc;
@@ -134,7 +129,7 @@ struct MTLaunchStruct {
ForEachFunc_t kernel;
uint32_t sig;
- const Allocation ** ains;
+ const Allocation * ain;
Allocation * aout;
uint32_t mSliceSize;
@@ -150,9 +145,12 @@ struct MTLaunchStruct {
uint32_t arrayStart;
uint32_t arrayEnd;
- const uint8_t *inPtrsBuff[RS_KERNEL_INPUT_THRESHOLD];
- StridePair inStridesBuff[RS_KERNEL_INPUT_THRESHOLD];
-};
+ // Multi-input data.
+ const Allocation ** ains;
+} MTLaunchStruct;
+
+
+
class RsdCpuReferenceImpl : public RsdCpuReference {
public:
@@ -173,6 +171,9 @@ public:
return mWorkers.mCount + 1;
}
+ void launchThreads(const Allocation * ain, Allocation * aout,
+ const RsScriptCall *sc, MTLaunchStruct *mtls);
+
void launchThreads(const Allocation** ains, uint32_t inLen, Allocation* aout,
const RsScriptCall* sc, MTLaunchStruct* mtls);
diff --git a/cpu_ref/rsCpuIntrinsic.cpp b/cpu_ref/rsCpuIntrinsic.cpp
index 8437c998..5a7fffd5 100644
--- a/cpu_ref/rsCpuIntrinsic.cpp
+++ b/cpu_ref/rsCpuIntrinsic.cpp
@@ -73,29 +73,54 @@ void RsdCpuScriptIntrinsic::invokeFreeChildren() {
}
-void RsdCpuScriptIntrinsic::preLaunch(uint32_t slot, const Allocation ** ains,
- uint32_t inLen, Allocation * aout,
- const void * usr, uint32_t usrLen,
- const RsScriptCall *sc) {
+void RsdCpuScriptIntrinsic::preLaunch(uint32_t slot, const Allocation * ain,
+ Allocation * aout, const void * usr,
+ uint32_t usrLen, const RsScriptCall *sc) {
}
-void RsdCpuScriptIntrinsic::postLaunch(uint32_t slot, const Allocation ** ains,
- uint32_t inLen, Allocation * aout,
- const void * usr, uint32_t usrLen,
- const RsScriptCall *sc) {
+void RsdCpuScriptIntrinsic::postLaunch(uint32_t slot, const Allocation * ain,
+ Allocation * aout, const void * usr,
+ uint32_t usrLen, const RsScriptCall *sc) {
}
void RsdCpuScriptIntrinsic::invokeForEach(uint32_t slot,
- const Allocation ** ains,
- uint32_t inLen,
+ const Allocation * ain,
Allocation * aout,
const void * usr,
uint32_t usrLen,
const RsScriptCall *sc) {
MTLaunchStruct mtls;
+ preLaunch(slot, ain, aout, usr, usrLen, sc);
- preLaunch(slot, ains, inLen, aout, usr, usrLen, sc);
+ forEachMtlsSetup(ain, aout, usr, usrLen, sc, &mtls);
+ mtls.script = this;
+ mtls.fep.slot = slot;
+
+ mtls.kernel = (void (*)())mRootPtr;
+ mtls.fep.usr = this;
+
+ RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
+ mCtx->launchThreads(ain, aout, sc, &mtls);
+ mCtx->setTLS(oldTLS);
+
+ postLaunch(slot, ain, aout, usr, usrLen, sc);
+}
+
+void RsdCpuScriptIntrinsic::invokeForEachMulti(uint32_t slot,
+ const Allocation ** ains,
+ uint32_t inLen,
+ Allocation * aout,
+ const void * usr,
+ uint32_t usrLen,
+ const RsScriptCall *sc) {
+
+ MTLaunchStruct mtls;
+ /*
+ * FIXME: Possibly create new preLaunch and postLaunch functions that take
+ * all of the input allocation pointers.
+ */
+ preLaunch(slot, ains[0], aout, usr, usrLen, sc);
forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls);
mtls.script = this;
@@ -108,7 +133,7 @@ void RsdCpuScriptIntrinsic::invokeForEach(uint32_t slot,
mCtx->launchThreads(ains, inLen, aout, sc, &mtls);
mCtx->setTLS(oldTLS);
- postLaunch(slot, ains, inLen, aout, usr, usrLen, sc);
+ postLaunch(slot, ains[0], aout, usr, usrLen, sc);
}
void RsdCpuScriptIntrinsic::forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls) {
diff --git a/cpu_ref/rsCpuIntrinsic.h b/cpu_ref/rsCpuIntrinsic.h
index 95aaa141..bf6a8acd 100644
--- a/cpu_ref/rsCpuIntrinsic.h
+++ b/cpu_ref/rsCpuIntrinsic.h
@@ -28,42 +28,43 @@ class RsdCpuScriptIntrinsic : public RsdCpuScriptImpl {
public:
virtual void populateScript(Script *) = 0;
- virtual void invokeFunction(uint32_t slot, const void * params,
- size_t paramLength);
+ virtual void invokeFunction(uint32_t slot, const void *params, size_t paramLength);
virtual int invokeRoot();
-
virtual void invokeForEach(uint32_t slot,
- const Allocation ** ain,
- uint32_t inLen,
- Allocation * aout,
- const void * usr,
- uint32_t usrLen,
- const RsScriptCall *sc);
-
- virtual void forEachKernelSetup(uint32_t slot, MTLaunchStruct * mtls);
+ const Allocation * ain,
+ Allocation * aout,
+ const void * usr,
+ uint32_t usrLen,
+ const RsScriptCall *sc);
+
+ virtual void invokeForEachMulti(uint32_t slot,
+ const Allocation ** ain,
+ uint32_t inLen,
+ Allocation * aout,
+ const void * usr,
+ uint32_t usrLen,
+ const RsScriptCall *sc);
+
+ virtual void forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls);
virtual void invokeInit();
virtual void invokeFreeChildren();
- virtual void preLaunch(uint32_t slot, const Allocation ** ains,
- uint32_t inLen, Allocation * aout, const void * usr,
- uint32_t usrLen, const RsScriptCall * sc);
- virtual void postLaunch(uint32_t slot, const Allocation ** ains,
- uint32_t inLen, Allocation * aout,
- const void * usr, uint32_t usrLen,
- const RsScriptCall * sc);
-
- virtual void setGlobalVar(uint32_t slot, const void * data,
- size_t dataLength);
- virtual void setGlobalVarWithElemDims(uint32_t slot, const void * data,
- size_t dataLength, const Element * e,
- const uint32_t * dims,
- size_t dimLength);
+ virtual void preLaunch(uint32_t slot, const Allocation * ain,
+ Allocation * aout, const void * usr,
+ uint32_t usrLen, const RsScriptCall *sc);
+ virtual void postLaunch(uint32_t slot, const Allocation * ain,
+ Allocation * aout, const void * usr,
+ uint32_t usrLen, const RsScriptCall *sc);
+
+ virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
+ virtual void setGlobalVarWithElemDims(uint32_t slot, const void *data, size_t dataLength,
+ const Element *e, const uint32_t *dims, size_t dimLength);
virtual void setGlobalBind(uint32_t slot, Allocation *data);
virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
virtual ~RsdCpuScriptIntrinsic();
- RsdCpuScriptIntrinsic(RsdCpuReferenceImpl * ctx, const Script * s,
- const Element * e, RsScriptIntrinsicID iid);
+ RsdCpuScriptIntrinsic(RsdCpuReferenceImpl *ctx, const Script *s, const Element *,
+ RsScriptIntrinsicID iid);
protected:
RsScriptIntrinsicID mID;
diff --git a/cpu_ref/rsCpuIntrinsic3DLUT.cpp b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
index a19d8851..c839c19d 100644
--- a/cpu_ref/rsCpuIntrinsic3DLUT.cpp
+++ b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
@@ -64,7 +64,7 @@ void RsdCpuScriptIntrinsic3DLUT::kernel(const RsExpandKernelParams *p,
RsdCpuScriptIntrinsic3DLUT *cp = (RsdCpuScriptIntrinsic3DLUT *)p->usr;
uchar4 *out = (uchar4 *)p->out + xstart;
- uchar4 *in = (uchar4 *)p->ins[0] + xstart;
+ uchar4 *in = (uchar4 *)p->in + xstart;
uint32_t x1 = xstart;
uint32_t x2 = xend;
@@ -161,9 +161,9 @@ void RsdCpuScriptIntrinsic3DLUT::kernel(const RsExpandKernelParams *p,
}
}
-RsdCpuScriptIntrinsic3DLUT::RsdCpuScriptIntrinsic3DLUT(
- RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) :
- RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_3DLUT) {
+RsdCpuScriptIntrinsic3DLUT::RsdCpuScriptIntrinsic3DLUT(RsdCpuReferenceImpl *ctx,
+ const Script *s, const Element *e)
+ : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_3DLUT) {
mRootPtr = &kernel;
}
@@ -185,3 +185,5 @@ RsdCpuScriptImpl * rsdIntrinsic_3DLUT(RsdCpuReferenceImpl *ctx,
return new RsdCpuScriptIntrinsic3DLUT(ctx, s, e);
}
+
+
diff --git a/cpu_ref/rsCpuIntrinsicBlend.cpp b/cpu_ref/rsCpuIntrinsicBlend.cpp
index 0378e076..b6046584 100644
--- a/cpu_ref/rsCpuIntrinsicBlend.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlend.cpp
@@ -117,7 +117,7 @@ void RsdCpuScriptIntrinsicBlend::kernel(const RsExpandKernelParams *p,
// instep/outstep can be ignored--sizeof(uchar4) known at compile time
uchar4 *out = (uchar4 *)p->out;
- uchar4 *in = (uchar4 *)p->ins[0];
+ uchar4 *in = (uchar4 *)p->in;
uint32_t x1 = xstart;
uint32_t x2 = xend;
@@ -509,3 +509,6 @@ RsdCpuScriptImpl * rsdIntrinsic_Blend(RsdCpuReferenceImpl *ctx,
const Script *s, const Element *e) {
return new RsdCpuScriptIntrinsicBlend(ctx, s, e);
}
+
+
+
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
index 4e90ad72..bf78eb3e 100644
--- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -169,9 +169,10 @@ public:
virtual ~RsdCpuScriptIntrinsicColorMatrix();
RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
- virtual void preLaunch(uint32_t slot, const Allocation ** ains,
- uint32_t inLen, Allocation * aout, const void * usr,
- uint32_t usrLen, const RsScriptCall *sc);
+ virtual void preLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
+ const void * usr, uint32_t usrLen, const RsScriptCall *sc);
+ virtual void postLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
+ const void * usr, uint32_t usrLen, const RsScriptCall *sc);
protected:
float fp[16];
@@ -882,13 +883,8 @@ void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsExpandKernelParams *p,
uint32_t xstart, uint32_t xend,
uint32_t instep, uint32_t outstep) {
RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr;
-
- // Update the instep due to change in parameter passing.
- instep = p->inEStrides[0];
-
- uchar *out = (uchar *)p->out + outstep * xstart;
- uchar *in = (uchar *)p->ins[0] + instep * xstart;
-
+ uchar *out = (uchar *)p->out + outstep * xstart;
+ uchar *in = (uchar *)p->in + instep * xstart;
uint32_t x1 = xstart;
uint32_t x2 = xend;
@@ -936,15 +932,11 @@ void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsExpandKernelParams *p,
}
}
-void RsdCpuScriptIntrinsicColorMatrix::preLaunch(uint32_t slot,
- const Allocation ** ains,
- uint32_t inLen,
- Allocation * aout,
- const void * usr,
- uint32_t usrLen,
- const RsScriptCall *sc) {
+void RsdCpuScriptIntrinsicColorMatrix::preLaunch(
+ uint32_t slot, const Allocation * ain, Allocation * aout,
+ const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
- const Element *ein = ains[0]->mHal.state.type->getElement();
+ const Element *ein = ain->mHal.state.type->getElement();
const Element *eout = aout->mHal.state.type->getElement();
if (ein->getType() == eout->getType()) {
@@ -961,8 +953,8 @@ void RsdCpuScriptIntrinsicColorMatrix::preLaunch(uint32_t slot,
}
}
- Key_t key = computeKey(ein, eout);
-
+ Key_t key = computeKey(ain->mHal.state.type->getElement(),
+ aout->mHal.state.type->getElement());
#if defined(ARCH_X86_HAVE_SSSE3)
if ((mOptKernel == NULL) || (mLastKey.key != key.key)) {
// FIXME: Disable mOptKernel to pass RS color matrix CTS cases
@@ -1004,6 +996,12 @@ void RsdCpuScriptIntrinsicColorMatrix::preLaunch(uint32_t slot,
#endif //if !defined(ARCH_X86_HAVE_SSSE3)
}
+void RsdCpuScriptIntrinsicColorMatrix::postLaunch(
+ uint32_t slot, const Allocation * ain, Allocation * aout,
+ const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
+
+}
+
RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix(
RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
: RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {
diff --git a/cpu_ref/rsCpuIntrinsicHistogram.cpp b/cpu_ref/rsCpuIntrinsicHistogram.cpp
index b5dbfa80..1c430b72 100644
--- a/cpu_ref/rsCpuIntrinsicHistogram.cpp
+++ b/cpu_ref/rsCpuIntrinsicHistogram.cpp
@@ -36,10 +36,10 @@ public:
RsdCpuScriptIntrinsicHistogram(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
protected:
- void preLaunch(uint32_t slot, const Allocation ** ains, uint32_t inLen,
+ void preLaunch(uint32_t slot, const Allocation * ain,
Allocation * aout, const void * usr,
uint32_t usrLen, const RsScriptCall *sc);
- void postLaunch(uint32_t slot, const Allocation ** ains, uint32_t inLen,
+ void postLaunch(uint32_t slot, const Allocation * ain,
Allocation * aout, const void * usr,
uint32_t usrLen, const RsScriptCall *sc);
@@ -97,12 +97,9 @@ void RsdCpuScriptIntrinsicHistogram::setGlobalVar(uint32_t slot, const void *dat
-void
-RsdCpuScriptIntrinsicHistogram::preLaunch(uint32_t slot,
- const Allocation ** ains,
- uint32_t inLen, Allocation * aout,
- const void * usr, uint32_t usrLen,
- const RsScriptCall *sc) {
+void RsdCpuScriptIntrinsicHistogram::preLaunch(uint32_t slot, const Allocation * ain,
+ Allocation * aout, const void * usr,
+ uint32_t usrLen, const RsScriptCall *sc) {
const uint32_t threads = mCtx->getThreadCount();
uint32_t vSize = mAllocOut->getType()->getElement()->getVectorSize();
@@ -126,7 +123,7 @@ RsdCpuScriptIntrinsicHistogram::preLaunch(uint32_t slot,
}
break;
case 1:
- switch(ains[0]->getType()->getElement()->getVectorSize()) {
+ switch(ain->getType()->getElement()->getVectorSize()) {
case 1:
mRootPtr = &kernelP1L1;
break;
@@ -145,12 +142,9 @@ RsdCpuScriptIntrinsicHistogram::preLaunch(uint32_t slot,
memset(mSums, 0, 256 * sizeof(int32_t) * threads * vSize);
}
-void
-RsdCpuScriptIntrinsicHistogram::postLaunch(uint32_t slot,
- const Allocation ** ains,
- uint32_t inLen, Allocation * aout,
- const void * usr, uint32_t usrLen,
- const RsScriptCall *sc) {
+void RsdCpuScriptIntrinsicHistogram::postLaunch(uint32_t slot, const Allocation * ain,
+ Allocation * aout, const void * usr,
+ uint32_t usrLen, const RsScriptCall *sc) {
unsigned int *o = (unsigned int *)mAllocOut->mHal.drvState.lod[0].mallocPtr;
uint32_t threads = mCtx->getThreadCount();
@@ -171,7 +165,7 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1U4(const RsExpandKernelParams *p,
uint32_t instep, uint32_t outstep) {
RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
- uchar *in = (uchar *)p->ins[0];
+ uchar *in = (uchar *)p->in;
int * sums = &cp->mSums[256 * 4 * p->lid];
for (uint32_t x = xstart; x < xend; x++) {
@@ -179,7 +173,7 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1U4(const RsExpandKernelParams *p,
sums[(in[1] << 2) + 1] ++;
sums[(in[2] << 2) + 2] ++;
sums[(in[3] << 2) + 3] ++;
- in += p->inEStrides[0];
+ in += instep;
}
}
@@ -188,14 +182,14 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1U3(const RsExpandKernelParams *p,
uint32_t instep, uint32_t outstep) {
RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
- uchar *in = (uchar *)p->ins[0];
+ uchar *in = (uchar *)p->in;
int * sums = &cp->mSums[256 * 4 * p->lid];
for (uint32_t x = xstart; x < xend; x++) {
sums[(in[0] << 2) ] ++;
sums[(in[1] << 2) + 1] ++;
sums[(in[2] << 2) + 2] ++;
- in += p->inEStrides[0];
+ in += instep;
}
}
@@ -204,13 +198,13 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1U2(const RsExpandKernelParams *p,
uint32_t instep, uint32_t outstep) {
RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
- uchar *in = (uchar *)p->ins[0];
+ uchar *in = (uchar *)p->in;
int * sums = &cp->mSums[256 * 2 * p->lid];
for (uint32_t x = xstart; x < xend; x++) {
sums[(in[0] << 1) ] ++;
sums[(in[1] << 1) + 1] ++;
- in += p->inEStrides[0];
+ in += instep;
}
}
@@ -219,7 +213,7 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1L4(const RsExpandKernelParams *p,
uint32_t instep, uint32_t outstep) {
RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
- uchar *in = (uchar *)p->ins[0];
+ uchar *in = (uchar *)p->in;
int * sums = &cp->mSums[256 * p->lid];
for (uint32_t x = xstart; x < xend; x++) {
@@ -228,7 +222,7 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1L4(const RsExpandKernelParams *p,
(cp->mDotI[2] * in[2]) +
(cp->mDotI[3] * in[3]);
sums[(t + 0x7f) >> 8] ++;
- in += p->inEStrides[0];
+ in += instep;
}
}
@@ -237,7 +231,7 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1L3(const RsExpandKernelParams *p,
uint32_t instep, uint32_t outstep) {
RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
- uchar *in = (uchar *)p->ins[0];
+ uchar *in = (uchar *)p->in;
int * sums = &cp->mSums[256 * p->lid];
for (uint32_t x = xstart; x < xend; x++) {
@@ -245,7 +239,7 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1L3(const RsExpandKernelParams *p,
(cp->mDotI[1] * in[1]) +
(cp->mDotI[2] * in[2]);
sums[(t + 0x7f) >> 8] ++;
- in += p->inEStrides[0];
+ in += instep;
}
}
@@ -254,14 +248,14 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1L2(const RsExpandKernelParams *p,
uint32_t instep, uint32_t outstep) {
RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
- uchar *in = (uchar *)p->ins[0];
+ uchar *in = (uchar *)p->in;
int * sums = &cp->mSums[256 * p->lid];
for (uint32_t x = xstart; x < xend; x++) {
int t = (cp->mDotI[0] * in[0]) +
(cp->mDotI[1] * in[1]);
sums[(t + 0x7f) >> 8] ++;
- in += p->inEStrides[0];
+ in += instep;
}
}
@@ -270,13 +264,13 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1L1(const RsExpandKernelParams *p,
uint32_t instep, uint32_t outstep) {
RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
- uchar *in = (uchar *)p->ins[0];
+ uchar *in = (uchar *)p->in;
int * sums = &cp->mSums[256 * p->lid];
for (uint32_t x = xstart; x < xend; x++) {
int t = (cp->mDotI[0] * in[0]);
sums[(t + 0x7f) >> 8] ++;
- in += p->inEStrides[0];
+ in += instep;
}
}
@@ -285,12 +279,12 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1U1(const RsExpandKernelParams *p,
uint32_t instep, uint32_t outstep) {
RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
- uchar *in = (uchar *)p->ins[0];
+ uchar *in = (uchar *)p->in;
int * sums = &cp->mSums[256 * p->lid];
for (uint32_t x = xstart; x < xend; x++) {
sums[in[0]] ++;
- in += p->inEStrides[0];
+ in += instep;
}
}
@@ -329,3 +323,5 @@ RsdCpuScriptImpl * rsdIntrinsic_Histogram(RsdCpuReferenceImpl *ctx, const Script
return new RsdCpuScriptIntrinsicHistogram(ctx, s, e);
}
+
+
diff --git a/cpu_ref/rsCpuIntrinsicLUT.cpp b/cpu_ref/rsCpuIntrinsicLUT.cpp
index 9d3b4003..db73a838 100644
--- a/cpu_ref/rsCpuIntrinsicLUT.cpp
+++ b/cpu_ref/rsCpuIntrinsicLUT.cpp
@@ -59,7 +59,7 @@ void RsdCpuScriptIntrinsicLUT::kernel(const RsExpandKernelParams *p,
RsdCpuScriptIntrinsicLUT *cp = (RsdCpuScriptIntrinsicLUT *)p->usr;
uchar *out = (uchar *)p->out;
- const uchar *in = (uchar *)p->ins[0];
+ const uchar *in = (uchar *)p->in;
uint32_t x1 = xstart;
uint32_t x2 = xend;
@@ -103,3 +103,5 @@ RsdCpuScriptImpl * rsdIntrinsic_LUT(RsdCpuReferenceImpl *ctx,
return new RsdCpuScriptIntrinsicLUT(ctx, s, e);
}
+
+
diff --git a/cpu_ref/rsCpuIntrinsicResize.cpp b/cpu_ref/rsCpuIntrinsicResize.cpp
index 3a307d63..af1127e7 100644
--- a/cpu_ref/rsCpuIntrinsicResize.cpp
+++ b/cpu_ref/rsCpuIntrinsicResize.cpp
@@ -35,8 +35,8 @@ public:
virtual ~RsdCpuScriptIntrinsicResize();
RsdCpuScriptIntrinsicResize(RsdCpuReferenceImpl *ctx, const Script *s, const Element *);
- virtual void preLaunch(uint32_t slot, const Allocation ** ains,
- uint32_t inLen, Allocation * aout, const void * usr,
+ virtual void preLaunch(uint32_t slot, const Allocation * ain,
+ Allocation * aout, const void * usr,
uint32_t usrLen, const RsScriptCall *sc);
float scaleX;
@@ -308,11 +308,9 @@ RsdCpuScriptIntrinsicResize::RsdCpuScriptIntrinsicResize (
RsdCpuScriptIntrinsicResize::~RsdCpuScriptIntrinsicResize() {
}
-void RsdCpuScriptIntrinsicResize::preLaunch(uint32_t slot,
- const Allocation ** ains,
- uint32_t inLen, Allocation * aout,
- const void * usr, uint32_t usrLen,
- const RsScriptCall *sc)
+void RsdCpuScriptIntrinsicResize::preLaunch(uint32_t slot, const Allocation * ain,
+ Allocation * aout, const void * usr,
+ uint32_t usrLen, const RsScriptCall *sc)
{
if (!mAlloc.get()) {
ALOGE("Resize executed without input, skipping");
@@ -353,3 +351,5 @@ RsdCpuScriptImpl * rsdIntrinsic_Resize(RsdCpuReferenceImpl *ctx, const Script *s
return new RsdCpuScriptIntrinsicResize(ctx, s, e);
}
+
+
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index 05984207..a11fda19 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -789,33 +789,144 @@ void RsdCpuScriptImpl::populateScript(Script *script) {
typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);
-void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains,
- uint32_t inLen,
- Allocation * aout,
+void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation * ain, Allocation * aout,
const void * usr, uint32_t usrLen,
const RsScriptCall *sc,
MTLaunchStruct *mtls) {
memset(mtls, 0, sizeof(MTLaunchStruct));
- for (int index = inLen; --index >= 0;) {
- const Allocation* ain = ains[index];
+ // possible for this to occur if IO_OUTPUT/IO_INPUT with no bound surface
+ if (ain && (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr == NULL) {
+ mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null in allocations");
+ return;
+ }
+ if (aout && (const uint8_t *)aout->mHal.drvState.lod[0].mallocPtr == NULL) {
+ mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null out allocations");
+ return;
+ }
+
+ if (ain != NULL) {
+ const Type *inType = ain->getType();
- // possible for this to occur if IO_OUTPUT/IO_INPUT with no bound surface
- if (ain != NULL && (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr == NULL) {
+ mtls->fep.dimX = inType->getDimX();
+ mtls->fep.dimY = inType->getDimY();
+ mtls->fep.dimZ = inType->getDimZ();
+
+ } else if (aout != NULL) {
+ const Type *outType = aout->getType();
+
+ mtls->fep.dimX = outType->getDimX();
+ mtls->fep.dimY = outType->getDimY();
+ mtls->fep.dimZ = outType->getDimZ();
+
+ } else {
+ mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null allocations");
+ return;
+ }
+
+ if (ain != NULL && aout != NULL) {
+ if (!ain->hasSameDims(aout)) {
mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
- "rsForEach called with null in allocations");
+ "Failed to launch kernel; dimensions of input and output allocations do not match.");
+
return;
}
}
+ if (!sc || (sc->xEnd == 0)) {
+ mtls->xEnd = mtls->fep.dimX;
+ } else {
+ rsAssert(sc->xStart < mtls->fep.dimX);
+ rsAssert(sc->xEnd <= mtls->fep.dimX);
+ rsAssert(sc->xStart < sc->xEnd);
+ mtls->xStart = rsMin(mtls->fep.dimX, sc->xStart);
+ mtls->xEnd = rsMin(mtls->fep.dimX, sc->xEnd);
+ if (mtls->xStart >= mtls->xEnd) return;
+ }
+
+ if (!sc || (sc->yEnd == 0)) {
+ mtls->yEnd = mtls->fep.dimY;
+ } else {
+ rsAssert(sc->yStart < mtls->fep.dimY);
+ rsAssert(sc->yEnd <= mtls->fep.dimY);
+ rsAssert(sc->yStart < sc->yEnd);
+ mtls->yStart = rsMin(mtls->fep.dimY, sc->yStart);
+ mtls->yEnd = rsMin(mtls->fep.dimY, sc->yEnd);
+ if (mtls->yStart >= mtls->yEnd) return;
+ }
+
+ if (!sc || (sc->zEnd == 0)) {
+ mtls->zEnd = mtls->fep.dimZ;
+ } else {
+ rsAssert(sc->zStart < mtls->fep.dimZ);
+ rsAssert(sc->zEnd <= mtls->fep.dimZ);
+ rsAssert(sc->zStart < sc->zEnd);
+ mtls->zStart = rsMin(mtls->fep.dimZ, sc->zStart);
+ mtls->zEnd = rsMin(mtls->fep.dimZ, sc->zEnd);
+ if (mtls->zStart >= mtls->zEnd) return;
+ }
+
+ mtls->xEnd = rsMax((uint32_t)1, mtls->xEnd);
+ mtls->yEnd = rsMax((uint32_t)1, mtls->yEnd);
+ mtls->zEnd = rsMax((uint32_t)1, mtls->zEnd);
+ mtls->arrayEnd = rsMax((uint32_t)1, mtls->arrayEnd);
+
+ rsAssert(!ain || (ain->getType()->getDimZ() == 0));
+
+ mtls->rsc = mCtx;
+ mtls->ain = ain;
+ mtls->aout = aout;
+ mtls->fep.usr = usr;
+ mtls->fep.usrLen = usrLen;
+ mtls->mSliceSize = 1;
+ mtls->mSliceNum = 0;
+
+ mtls->fep.ptrIn = NULL;
+ mtls->fep.eStrideIn = 0;
+ mtls->isThreadable = mIsThreadable;
+
+ if (ain) {
+ mtls->fep.ptrIn = (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr;
+ mtls->fep.eStrideIn = ain->getType()->getElementSizeBytes();
+ mtls->fep.yStrideIn = ain->mHal.drvState.lod[0].stride;
+ }
+
+ mtls->fep.ptrOut = NULL;
+ mtls->fep.eStrideOut = 0;
+ if (aout) {
+ mtls->fep.ptrOut = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
+ mtls->fep.eStrideOut = aout->getType()->getElementSizeBytes();
+ mtls->fep.yStrideOut = aout->mHal.drvState.lod[0].stride;
+ }
+}
+
+void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains, uint32_t inLen,
+ Allocation * aout,
+ const void * usr, uint32_t usrLen,
+ const RsScriptCall *sc,
+ MTLaunchStruct *mtls) {
+
+ memset(mtls, 0, sizeof(MTLaunchStruct));
+
+ // possible for this to occur if IO_OUTPUT/IO_INPUT with no bound surface
+ if (ains != NULL) {
+ for (int index = inLen; --index >= 0;) {
+ const Allocation* ain = ains[index];
+
+ if (ain != NULL && (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr == NULL) {
+ mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null in allocations");
+ return;
+ }
+ }
+ }
+
if (aout && (const uint8_t *)aout->mHal.drvState.lod[0].mallocPtr == NULL) {
- mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
- "rsForEach called with null out allocations");
+ mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null out allocations");
return;
}
- if (inLen > 0) {
+ if (ains != NULL) {
const Allocation *ain0 = ains[0];
const Type *inType = ain0->getType();
@@ -840,12 +951,11 @@ void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains,
mtls->fep.dimZ = outType->getDimZ();
} else {
- mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
- "rsForEach called with null allocations");
+ mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null allocations");
return;
}
- if (inLen > 0 && aout != NULL) {
+ if (ains != NULL && aout != NULL) {
if (!ains[0]->hasSameDims(aout)) {
mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
"Failed to launch kernel; dimensions of input and output allocations do not match.");
@@ -892,7 +1002,7 @@ void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains,
mtls->zEnd = rsMax((uint32_t)1, mtls->zEnd);
mtls->arrayEnd = rsMax((uint32_t)1, mtls->arrayEnd);
- rsAssert(inLen == 0 || (ains[0]->getType()->getDimZ() == 0));
+ rsAssert(!ains || (ains[0]->getType()->getDimZ() == 0));
mtls->rsc = mCtx;
mtls->ains = ains;
@@ -902,28 +1012,18 @@ void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains,
mtls->mSliceSize = 1;
mtls->mSliceNum = 0;
- mtls->fep.inPtrs = NULL;
- mtls->fep.inStrides = NULL;
+ mtls->fep.ptrIns = NULL;
+ mtls->fep.eStrideIn = 0;
mtls->isThreadable = mIsThreadable;
- if (inLen > 0) {
-
- if (inLen <= RS_KERNEL_INPUT_THRESHOLD) {
- mtls->fep.inPtrs = (const uint8_t**)mtls->inPtrsBuff;
- mtls->fep.inStrides = mtls->inStridesBuff;
- } else {
- mtls->fep.heapAllocatedArrays = true;
-
- mtls->fep.inPtrs = new const uint8_t*[inLen];
- mtls->fep.inStrides = new StridePair[inLen];
- }
-
- mtls->fep.inLen = inLen;
+ if (ains) {
+ mtls->fep.ptrIns = new const uint8_t*[inLen];
+ mtls->fep.inStrides = new StridePair[inLen];
for (int index = inLen; --index >= 0;) {
const Allocation *ain = ains[index];
- mtls->fep.inPtrs[index] =
+ mtls->fep.ptrIns[index] =
(const uint8_t*)ain->mHal.drvState.lod[0].mallocPtr;
mtls->fep.inStrides[index].eStride =
@@ -933,27 +1033,41 @@ void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains,
}
}
- mtls->fep.outPtr = NULL;
- mtls->fep.outStride.eStride = 0;
- mtls->fep.outStride.yStride = 0;
- if (aout != NULL) {
- mtls->fep.outPtr = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
-
- mtls->fep.outStride.eStride = aout->getType()->getElementSizeBytes();
- mtls->fep.outStride.yStride = aout->mHal.drvState.lod[0].stride;
+ mtls->fep.ptrOut = NULL;
+ mtls->fep.eStrideOut = 0;
+ if (aout) {
+ mtls->fep.ptrOut = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
+ mtls->fep.eStrideOut = aout->getType()->getElementSizeBytes();
+ mtls->fep.yStrideOut = aout->mHal.drvState.lod[0].stride;
}
}
void RsdCpuScriptImpl::invokeForEach(uint32_t slot,
- const Allocation ** ains,
- uint32_t inLen,
+ const Allocation * ain,
Allocation * aout,
const void * usr,
uint32_t usrLen,
const RsScriptCall *sc) {
MTLaunchStruct mtls;
+ forEachMtlsSetup(ain, aout, usr, usrLen, sc, &mtls);
+ forEachKernelSetup(slot, &mtls);
+
+ RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
+ mCtx->launchThreads(ain, aout, sc, &mtls);
+ mCtx->setTLS(oldTLS);
+}
+
+void RsdCpuScriptImpl::invokeForEachMulti(uint32_t slot,
+ const Allocation ** ains,
+ uint32_t inLen,
+ Allocation * aout,
+ const void * usr,
+ uint32_t usrLen,
+ const RsScriptCall *sc) {
+
+ MTLaunchStruct mtls;
forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls);
forEachKernelSetup(slot, &mtls);
@@ -1224,15 +1338,17 @@ Allocation * RsdCpuScriptImpl::getAllocationForPointer(const void *ptr) const {
return NULL;
}
-void RsdCpuScriptImpl::preLaunch(uint32_t slot, const Allocation ** ains,
- uint32_t inLen, Allocation * aout,
- const void * usr, uint32_t usrLen,
- const RsScriptCall *sc) {}
+void RsdCpuScriptImpl::preLaunch(uint32_t slot, const Allocation * ain,
+ Allocation * aout, const void * usr,
+ uint32_t usrLen, const RsScriptCall *sc)
+{
+}
-void RsdCpuScriptImpl::postLaunch(uint32_t slot, const Allocation ** ains,
- uint32_t inLen, Allocation * aout,
- const void * usr, uint32_t usrLen,
- const RsScriptCall *sc) {}
+void RsdCpuScriptImpl::postLaunch(uint32_t slot, const Allocation * ain,
+ Allocation * aout, const void * usr,
+ uint32_t usrLen, const RsScriptCall *sc)
+{
+}
}
diff --git a/cpu_ref/rsCpuScript.h b/cpu_ref/rsCpuScript.h
index f0843cc0..d51e9e3f 100644
--- a/cpu_ref/rsCpuScript.h
+++ b/cpu_ref/rsCpuScript.h
@@ -64,22 +64,26 @@ public:
virtual void invokeFunction(uint32_t slot, const void *params, size_t paramLength);
virtual int invokeRoot();
- virtual void preLaunch(uint32_t slot, const Allocation ** ains,
- uint32_t inLen, Allocation * aout, const void * usr,
+ virtual void preLaunch(uint32_t slot, const Allocation * ain,
+ Allocation * aout, const void * usr,
uint32_t usrLen, const RsScriptCall *sc);
- virtual void postLaunch(uint32_t slot, const Allocation ** ains,
- uint32_t inLen, Allocation * aout,
- const void * usr, uint32_t usrLen,
- const RsScriptCall *sc);
-
+ virtual void postLaunch(uint32_t slot, const Allocation * ain,
+ Allocation * aout, const void * usr,
+ uint32_t usrLen, const RsScriptCall *sc);
virtual void invokeForEach(uint32_t slot,
- const Allocation ** ains,
- uint32_t inLen,
- Allocation* aout,
- const void* usr,
- uint32_t usrLen,
- const RsScriptCall* sc);
-
+ const Allocation * ain,
+ Allocation * aout,
+ const void * usr,
+ uint32_t usrLen,
+ const RsScriptCall *sc);
+
+ virtual void invokeForEachMulti(uint32_t slot,
+ const Allocation** ains,
+ uint32_t inLen,
+ Allocation* aout,
+ const void* usr,
+ uint32_t usrLen,
+ const RsScriptCall* sc);
virtual void invokeInit();
virtual void invokeFreeChildren();
@@ -96,6 +100,10 @@ public:
const Script * getScript() {return mScript;}
+ void forEachMtlsSetup(const Allocation * ain, Allocation * aout,
+ const void * usr, uint32_t usrLen,
+ const RsScriptCall *sc, MTLaunchStruct *mtls);
+
void forEachMtlsSetup(const Allocation ** ains, uint32_t inLen,
Allocation * aout, const void * usr, uint32_t usrLen,
const RsScriptCall *sc, MTLaunchStruct *mtls);
diff --git a/cpu_ref/rsCpuScriptGroup.cpp b/cpu_ref/rsCpuScriptGroup.cpp
index 20ee09db..08785523 100644
--- a/cpu_ref/rsCpuScriptGroup.cpp
+++ b/cpu_ref/rsCpuScriptGroup.cpp
@@ -53,45 +53,38 @@ void CpuScriptGroupImpl::scriptGroupRoot(const RsExpandKernelParams *kparams,
uint32_t instep, uint32_t outstep) {
- const ScriptList *sl = (const ScriptList *)kparams->usr;
+ const ScriptList *sl = (const ScriptList *)kparams->usr;
RsExpandKernelParams *mkparams = (RsExpandKernelParams *)kparams;
- const void **oldIns = mkparams->ins;
- uint32_t *oldStrides = mkparams->inEStrides;
-
- void *localIns[1];
- uint32_t localStride[1];
-
- mkparams->ins = (const void**)localIns;
- mkparams->inEStrides = localStride;
-
for (size_t ct = 0; ct < sl->count; ct++) {
ScriptGroupRootFunc_t func;
func = (ScriptGroupRootFunc_t)sl->fnPtrs[ct];
mkparams->usr = sl->usrPtrs[ct];
+ mkparams->in = NULL;
+ mkparams->out = NULL;
+
+ uint32_t istep = 0;
+ uint32_t ostep = 0;
+
if (sl->ins[ct]) {
- localIns[0] = sl->ins[ct]->mHal.drvState.lod[0].mallocPtr;
+ mkparams->in =
+ (const uint8_t *)sl->ins[ct]->mHal.drvState.lod[0].mallocPtr;
- localStride[0] = sl->ins[ct]->mHal.state.elementSizeBytes;
+ istep = sl->ins[ct]->mHal.state.elementSizeBytes;
if (sl->inExts[ct]) {
- localIns[0] = (void*)
- ((const uint8_t *)localIns[0] +
- sl->ins[ct]->mHal.drvState.lod[0].stride * kparams->y);
+ mkparams->in =
+ (const uint8_t *)mkparams->in +
+ sl->ins[ct]->mHal.drvState.lod[0].stride * kparams->y;
} else if (sl->ins[ct]->mHal.drvState.lod[0].dimY > kparams->lid) {
- localIns[0] = (void*)
- ((const uint8_t *)localIns[0] +
- sl->ins[ct]->mHal.drvState.lod[0].stride * kparams->lid);
+ mkparams->in =
+ (const uint8_t *)mkparams->in +
+ sl->ins[ct]->mHal.drvState.lod[0].stride * kparams->lid;
}
-
- } else {
- localIns[0] = NULL;
- localStride[0] = 0;
}
- uint32_t ostep;
if (sl->outs[ct]) {
mkparams->out =
(uint8_t *)sl->outs[ct]->mHal.drvState.lod[0].mallocPtr;
@@ -108,23 +101,14 @@ void CpuScriptGroupImpl::scriptGroupRoot(const RsExpandKernelParams *kparams,
(uint8_t *)mkparams->out +
sl->outs[ct]->mHal.drvState.lod[0].stride * kparams->lid;
}
- } else {
- mkparams->out = NULL;
- ostep = 0;
}
//ALOGE("kernel %i %p,%p %p,%p", ct, mp->ptrIn, mp->in, mp->ptrOut, mp->out);
- /*
- * The fourth argument is zero here because kernels get their stride
- * information from a member of p that points to an array.
- */
- func(kparams, xstart, xend, 0, ostep);
+ func(kparams, xstart, xend, istep, ostep);
}
//ALOGE("script group root");
- mkparams->ins = oldIns;
- mkparams->inEStrides = oldStrides;
- mkparams->usr = sl;
+ mkparams->usr = sl;
}
@@ -211,33 +195,17 @@ void CpuScriptGroupImpl::execute() {
MTLaunchStruct mtls;
- if (fieldDep) {
+ if(fieldDep) {
for (size_t ct=0; ct < ins.size(); ct++) {
Script *s = kernels[ct]->mScript;
RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
uint32_t slot = kernels[ct]->mSlot;
- uint32_t inLen;
- const Allocation **ains;
-
- if (ins[ct] == NULL) {
- inLen = 0;
- ains = NULL;
-
- } else {
- inLen = 1;
- ains = const_cast<const Allocation**>(&ins[ct]);
- }
-
- si->forEachMtlsSetup(ains, inLen, outs[ct], NULL, 0, NULL, &mtls);
-
+ si->forEachMtlsSetup(ins[ct], outs[ct], NULL, 0, NULL, &mtls);
si->forEachKernelSetup(slot, &mtls);
- si->preLaunch(slot, ains, inLen, outs[ct], mtls.fep.usr,
- mtls.fep.usrLen, NULL);
-
- mCtx->launchThreads(ains, inLen, outs[ct], NULL, &mtls);
-
- si->postLaunch(slot, ains, inLen, outs[ct], NULL, 0, NULL);
+ si->preLaunch(slot, ins[ct], outs[ct], mtls.fep.usr, mtls.fep.usrLen, NULL);
+ mCtx->launchThreads(ins[ct], outs[ct], NULL, &mtls);
+ si->postLaunch(slot, ins[ct], outs[ct], NULL, 0, NULL);
}
} else {
ScriptList sl;
@@ -246,18 +214,6 @@ void CpuScriptGroupImpl::execute() {
sl.kernels = kernels.array();
sl.count = kernels.size();
- uint32_t inLen;
- const Allocation **ains;
-
- if (ins[0] == NULL) {
- inLen = 0;
- ains = NULL;
-
- } else {
- inLen = 1;
- ains = const_cast<const Allocation**>(&ins[0]);
- }
-
Vector<const void *> usrPtrs;
Vector<const void *> fnPtrs;
Vector<uint32_t> sigs;
@@ -269,8 +225,7 @@ void CpuScriptGroupImpl::execute() {
fnPtrs.add((void *)mtls.kernel);
usrPtrs.add(mtls.fep.usr);
sigs.add(mtls.fep.usrLen);
- si->preLaunch(kernels[ct]->mSlot, ains, inLen, outs[ct],
- mtls.fep.usr, mtls.fep.usrLen, NULL);
+ si->preLaunch(kernels[ct]->mSlot, ins[ct], outs[ct], mtls.fep.usr, mtls.fep.usrLen, NULL);
}
sl.sigs = sigs.array();
sl.usrPtrs = usrPtrs.array();
@@ -280,20 +235,16 @@ void CpuScriptGroupImpl::execute() {
Script *s = kernels[0]->mScript;
RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
-
- si->forEachMtlsSetup(ains, inLen, outs[0], NULL, 0, NULL, &mtls);
-
+ si->forEachMtlsSetup(ins[0], outs[0], NULL, 0, NULL, &mtls);
mtls.script = NULL;
mtls.kernel = (void (*)())&scriptGroupRoot;
mtls.fep.usr = &sl;
-
- mCtx->launchThreads(ains, inLen, outs[0], NULL, &mtls);
+ mCtx->launchThreads(ins[0], outs[0], NULL, &mtls);
for (size_t ct=0; ct < kernels.size(); ct++) {
Script *s = kernels[ct]->mScript;
RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
- si->postLaunch(kernels[ct]->mSlot, ains, inLen, outs[ct], NULL, 0,
- NULL);
+ si->postLaunch(kernels[ct]->mSlot, ins[ct], outs[ct], NULL, 0, NULL);
}
}
}
diff --git a/cpu_ref/rsd_cpu.h b/cpu_ref/rsd_cpu.h
index 4728b7c8..0076cb98 100644
--- a/cpu_ref/rsd_cpu.h
+++ b/cpu_ref/rsd_cpu.h
@@ -69,15 +69,21 @@ public:
virtual void populateScript(Script *) = 0;
virtual void invokeFunction(uint32_t slot, const void *params, size_t paramLength) = 0;
virtual int invokeRoot() = 0;
-
virtual void invokeForEach(uint32_t slot,
- const Allocation ** ains,
- uint32_t inLen,
- Allocation * aout,
- const void * usr,
- uint32_t usrLen,
- const RsScriptCall *sc) = 0;
-
+ const Allocation * ain,
+ Allocation * aout,
+ const void * usr,
+ uint32_t usrLen,
+ const RsScriptCall *sc) = 0;
+
+ virtual void invokeForEachMulti(uint32_t slot,
+ const Allocation** ains,
+ uint32_t inLen,
+ Allocation * aout,
+ const void * usr,
+ uint32_t usrLen,
+ const RsScriptCall *sc) = 0;
+
virtual void invokeInit() = 0;
virtual void invokeFreeChildren() = 0;