summaryrefslogtreecommitdiff
path: root/cpu_ref
diff options
context:
space:
mode:
author: Chris Wailes <chriswailes@google.com> 2014-07-16 15:18:30 -0700
committer: Stephen Hines <srhines@google.com> 2014-08-12 17:02:01 -0700
commit: 818cfa034e257c7bb48356257f5cb67334e19aa6 (patch)
tree: 27ad9d05d771ae01aa678d71593a7c062b2d2105 /cpu_ref
parent: fb0a274983ae8bfb07aff8c292305389789d6e92 (diff)
download: rs-818cfa034e257c7bb48356257f5cb67334e19aa6.tar.gz
Collapse code paths for single- and multi-input kernels.
This patch simplifies the RenderScript driver and CPU reference implementation by removing the distinction between single- and multi-input kernels in many places. The distinction is maintained in some places due to the need to maintain backwards compatibility. This permits the deletion of some functions and struct members that are no longer needed. Several related functions were also cleaned up. Change-Id: I77e4b155cc7ca1581b05bf901c70ae53a9ff0b12
Diffstat (limited to 'cpu_ref')
-rw-r--r-- cpu_ref/Android.mk 2
-rw-r--r-- cpu_ref/rsCpuCore.cpp 242
-rw-r--r-- cpu_ref/rsCpuCore.h 65
-rw-r--r-- cpu_ref/rsCpuIntrinsic.cpp 49
-rw-r--r-- cpu_ref/rsCpuIntrinsic.h 55
-rw-r--r-- cpu_ref/rsCpuIntrinsic3DLUT.cpp 10
-rw-r--r-- cpu_ref/rsCpuIntrinsicBlend.cpp 5
-rw-r--r-- cpu_ref/rsCpuIntrinsicColorMatrix.cpp 38
-rw-r--r-- cpu_ref/rsCpuIntrinsicHistogram.cpp 58
-rw-r--r-- cpu_ref/rsCpuIntrinsicLUT.cpp 4
-rw-r--r-- cpu_ref/rsCpuIntrinsicResize.cpp 14
-rw-r--r-- cpu_ref/rsCpuScript.cpp 214
-rw-r--r-- cpu_ref/rsCpuScript.h 36
-rw-r--r-- cpu_ref/rsCpuScriptGroup.cpp 103
-rw-r--r-- cpu_ref/rsd_cpu.h 22
15 files changed, 380 insertions, 537 deletions
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index aeb75a65..729e7022 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -91,7 +91,7 @@ include external/libcxx/libcxx.mk
endif
include frameworks/compile/libbcc/libbcc-targets.mk
-LOCAL_CFLAGS += $(rs_base_CFLAGS)
+LOCAL_CFLAGS += $(rs_base_CFLAGS) -std=c++11
LOCAL_MODULE_TAGS := optional
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index a0564fc1..db3cc7fa 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -350,180 +350,134 @@ RsdCpuReferenceImpl::~RsdCpuReferenceImpl() {
}
typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);
+typedef void (*walk_loop_t)(MTLaunchStruct*,
+ RsExpandKernelParams&,
+ outer_foreach_t);
-static void wc_xy(void *usr, uint32_t idx) {
+
+static void walk_wrapper(void* usr, uint32_t idx, walk_loop_t walk_loop) {
MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
+ uint32_t inLen = mtls->fep.inLen;
+
RsExpandKernelParams kparams;
kparams.takeFields(mtls->fep);
// Used by CpuScriptGroup, IntrinsicBlur, and IntrinsicHistogram
kparams.lid = idx;
- outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
- while (1) {
- uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
- uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
- uint32_t yEnd = yStart + mtls->mSliceSize;
-
- yEnd = rsMin(yEnd, mtls->yEnd);
-
- if (yEnd <= yStart) {
- return;
- }
-
- //ALOGE("usr idx %i, x %i,%i y %i,%i", idx, mtls->xStart, mtls->xEnd, yStart, yEnd);
- //ALOGE("usr ptr in %p, out %p", mtls->fep.ptrIn, mtls->fep.ptrOut);
-
- for (kparams.y = yStart; kparams.y < yEnd; kparams.y++) {
- kparams.out = mtls->fep.ptrOut +
- (mtls->fep.yStrideOut * kparams.y) +
- (mtls->fep.eStrideOut * mtls->xStart);
-
- kparams.in = mtls->fep.ptrIn +
- (mtls->fep.yStrideIn * kparams.y) +
- (mtls->fep.eStrideIn * mtls->xStart);
+ if (inLen > 0) {
+ // Allocate space for our input base pointers.
+ kparams.ins = (const void**)alloca(inLen * sizeof(void*));
+ // Allocate space for our input stride information.
+ kparams.inEStrides = (uint32_t*)alloca(inLen * sizeof(uint32_t));
- fn(&kparams, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn,
- mtls->fep.eStrideOut);
+ // Fill our stride information.
+ for (int inIndex = inLen; --inIndex >= 0;) {
+ kparams.inEStrides[inIndex] = mtls->fep.inStrides[inIndex].eStride;
}
}
-}
-
-static void wc_x(void *usr, uint32_t idx) {
- MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
-
- RsExpandKernelParams kparams;
- kparams.takeFields(mtls->fep);
-
- // Used by CpuScriptGroup, IntrinsicBlur, and IntrisicHistogram
- kparams.lid = idx;
outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
- while (1) {
- uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
- uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
- uint32_t xEnd = xStart + mtls->mSliceSize;
-
- xEnd = rsMin(xEnd, mtls->xEnd);
-
- if (xEnd <= xStart) {
- return;
- }
-
- //ALOGE("usr slice %i idx %i, x %i,%i", slice, idx, xStart, xEnd);
- //ALOGE("usr ptr in %p, out %p", mtls->fep.ptrIn, mtls->fep.ptrOut);
- kparams.out = mtls->fep.ptrOut + (mtls->fep.eStrideOut * xStart);
- kparams.in = mtls->fep.ptrIn + (mtls->fep.eStrideIn * xStart);
-
- fn(&kparams, xStart, xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
- }
+ walk_loop(mtls, kparams, fn);
}
-void RsdCpuReferenceImpl::launchThreads(const Allocation * ain, Allocation * aout,
- const RsScriptCall *sc, MTLaunchStruct *mtls) {
+static void walk_2d(void *usr, uint32_t idx) {
+ walk_wrapper(usr, idx, [](MTLaunchStruct *mtls,
+ RsExpandKernelParams &kparams,
+ outer_foreach_t fn) {
- //android::StopWatch kernel_time("kernel time");
+ while (1) {
+ uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
+ uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
+ uint32_t yEnd = yStart + mtls->mSliceSize;
- if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) {
- const size_t targetByteChunk = 16 * 1024;
- mInForEach = true;
- if (mtls->fep.dimY > 1) {
- uint32_t s1 = mtls->fep.dimY / ((mWorkers.mCount + 1) * 4);
- uint32_t s2 = 0;
+ yEnd = rsMin(yEnd, mtls->yEnd);
- // This chooses our slice size to rate limit atomic ops to
- // one per 16k bytes of reads/writes.
- if (mtls->fep.yStrideOut) {
- s2 = targetByteChunk / mtls->fep.yStrideOut;
- } else {
- s2 = targetByteChunk / mtls->fep.yStrideIn;
+ if (yEnd <= yStart) {
+ return;
}
- mtls->mSliceSize = rsMin(s1, s2);
- if(mtls->mSliceSize < 1) {
- mtls->mSliceSize = 1;
- }
+ for (kparams.y = yStart; kparams.y < yEnd; kparams.y++) {
+ kparams.out = mtls->fep.outPtr +
+ (mtls->fep.outStride.yStride * kparams.y) +
+ (mtls->fep.outStride.eStride * mtls->xStart);
- // mtls->mSliceSize = 2;
- launchThreads(wc_xy, mtls);
- } else {
- uint32_t s1 = mtls->fep.dimX / ((mWorkers.mCount + 1) * 4);
- uint32_t s2 = 0;
+ for (int inIndex = mtls->fep.inLen; --inIndex >= 0;) {
+ StridePair &strides = mtls->fep.inStrides[inIndex];
- // This chooses our slice size to rate limit atomic ops to
- // one per 16k bytes of reads/writes.
- if (mtls->fep.eStrideOut) {
- s2 = targetByteChunk / mtls->fep.eStrideOut;
- } else {
- s2 = targetByteChunk / mtls->fep.eStrideIn;
- }
- mtls->mSliceSize = rsMin(s1, s2);
+ kparams.ins[inIndex] =
+ mtls->fep.inPtrs[inIndex] +
+ (strides.yStride * kparams.y) +
+ (strides.eStride * mtls->xStart);
+ }
- if(mtls->mSliceSize < 1) {
- mtls->mSliceSize = 1;
+ // Kernels now get their input strides from kparams.
+ fn(&kparams, mtls->xStart, mtls->xEnd, 0,
+ mtls->fep.outStride.eStride);
}
-
- launchThreads(wc_x, mtls);
}
- mInForEach = false;
-
- //ALOGE("launch 1");
- } else {
- RsExpandKernelParams kparams;
- kparams.takeFields(mtls->fep);
+ });
+}
- //ALOGE("launch 3");
- outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
- for (uint32_t arrayIndex = mtls->arrayStart;
- arrayIndex < mtls->arrayEnd; arrayIndex++) {
+static void walk_1d(void *usr, uint32_t idx) {
+ walk_wrapper(usr, idx, [](MTLaunchStruct *mtls,
+ RsExpandKernelParams &kparams,
+ outer_foreach_t fn) {
- for (kparams.z = mtls->zStart; kparams.z < mtls->zEnd;
- kparams.z++) {
+ while (1) {
+ uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
+ uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
+ uint32_t xEnd = xStart + mtls->mSliceSize;
- for (kparams.y = mtls->yStart; kparams.y < mtls->yEnd;
- kparams.y++) {
+ xEnd = rsMin(xEnd, mtls->xEnd);
- uint32_t offset =
- kparams.dimY * kparams.dimZ * arrayIndex +
- kparams.dimY * kparams.z + kparams.y;
+ if (xEnd <= xStart) {
+ return;
+ }
- kparams.out = mtls->fep.ptrOut +
- (mtls->fep.yStrideOut * offset) +
- (mtls->fep.eStrideOut * mtls->xStart);
+ kparams.out = mtls->fep.outPtr +
+ (mtls->fep.outStride.eStride * xStart);
- kparams.in = mtls->fep.ptrIn +
- (mtls->fep.yStrideIn * offset) +
- (mtls->fep.eStrideIn * mtls->xStart);
+ for (int inIndex = mtls->fep.inLen; --inIndex >= 0;) {
+ StridePair &strides = mtls->fep.inStrides[inIndex];
- fn(&kparams, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn,
- mtls->fep.eStrideOut);
- }
+ kparams.ins[inIndex] =
+ mtls->fep.inPtrs[inIndex] + (strides.eStride * xStart);
}
+
+ // Kernels now get their input strides from kparams.
+ fn(&kparams, xStart, xEnd, 0, mtls->fep.outStride.eStride);
}
- }
+ });
}
-void RsdCpuReferenceImpl::launchThreads(const Allocation** ains, uint32_t inLen, Allocation* aout,
- const RsScriptCall* sc, MTLaunchStruct* mtls) {
+
+void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains,
+ uint32_t inLen,
+ Allocation* aout,
+ const RsScriptCall* sc,
+ MTLaunchStruct* mtls) {
//android::StopWatch kernel_time("kernel time");
if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) {
const size_t targetByteChunk = 16 * 1024;
mInForEach = true;
+
if (mtls->fep.dimY > 1) {
uint32_t s1 = mtls->fep.dimY / ((mWorkers.mCount + 1) * 4);
uint32_t s2 = 0;
// This chooses our slice size to rate limit atomic ops to
// one per 16k bytes of reads/writes.
- if (mtls->fep.yStrideOut) {
- s2 = targetByteChunk / mtls->fep.yStrideOut;
+ if (mtls->fep.outStride.yStride) {
+ s2 = targetByteChunk / mtls->fep.outStride.yStride;
} else {
- s2 = targetByteChunk / mtls->fep.yStrideIn;
+ // We know that there is either an output or an input.
+ s2 = targetByteChunk / mtls->fep.inStrides[0].yStride;
}
mtls->mSliceSize = rsMin(s1, s2);
@@ -531,18 +485,18 @@ void RsdCpuReferenceImpl::launchThreads(const Allocation** ains, uint32_t inLen,
mtls->mSliceSize = 1;
}
- // mtls->mSliceSize = 2;
- launchThreads(wc_xy, mtls);
+ launchThreads(walk_2d, mtls);
} else {
uint32_t s1 = mtls->fep.dimX / ((mWorkers.mCount + 1) * 4);
uint32_t s2 = 0;
// This chooses our slice size to rate limit atomic ops to
// one per 16k bytes of reads/writes.
- if (mtls->fep.eStrideOut) {
- s2 = targetByteChunk / mtls->fep.eStrideOut;
+ if (mtls->fep.outStride.eStride) {
+ s2 = targetByteChunk / mtls->fep.outStride.eStride;
} else {
- s2 = targetByteChunk / mtls->fep.eStrideIn;
+ // We know that there is either an output or an input.
+ s2 = targetByteChunk / mtls->fep.inStrides[0].eStride;
}
mtls->mSliceSize = rsMin(s1, s2);
@@ -550,24 +504,26 @@ void RsdCpuReferenceImpl::launchThreads(const Allocation** ains, uint32_t inLen,
mtls->mSliceSize = 1;
}
- launchThreads(wc_x, mtls);
+ launchThreads(walk_1d, mtls);
}
mInForEach = false;
- //ALOGE("launch 1");
} else {
RsExpandKernelParams kparams;
kparams.takeFields(mtls->fep);
- // Allocate space for our input base pointers.
- kparams.ins = new const void*[inLen];
+ if (inLen > 0) {
+ // Allocate space for our input base pointers.
+ kparams.ins = (const void**)alloca(inLen * sizeof(void*));
- // Allocate space for our input stride information.
- kparams.eStrideIns = new uint32_t[inLen];
+ // Allocate space for our input stride information.
+ kparams.inEStrides = (uint32_t*)alloca(inLen * sizeof(uint32_t));
- // Fill our stride information.
- for (int inIndex = inLen; --inIndex >= 0;) {
- kparams.eStrideIns[inIndex] = mtls->fep.inStrides[inIndex].eStride;
+ // Fill our stride information.
+ for (int inIndex = inLen; --inIndex >= 0;) {
+ kparams.inEStrides[inIndex] =
+ mtls->fep.inStrides[inIndex].eStride;
+ }
}
//ALOGE("launch 3");
@@ -585,15 +541,15 @@ void RsdCpuReferenceImpl::launchThreads(const Allocation** ains, uint32_t inLen,
mtls->fep.dimY * mtls->fep.dimZ * arrayIndex +
mtls->fep.dimY * kparams.z + kparams.y;
- kparams.out = mtls->fep.ptrOut +
- (mtls->fep.yStrideOut * offset) +
- (mtls->fep.eStrideOut * mtls->xStart);
+ kparams.out = mtls->fep.outPtr +
+ (mtls->fep.outStride.yStride * offset) +
+ (mtls->fep.outStride.eStride * mtls->xStart);
for (int inIndex = inLen; --inIndex >= 0;) {
StridePair &strides = mtls->fep.inStrides[inIndex];
kparams.ins[inIndex] =
- mtls->fep.ptrIns[inIndex] +
+ mtls->fep.inPtrs[inIndex] +
(strides.yStride * offset) +
(strides.eStride * mtls->xStart);
}
@@ -604,14 +560,10 @@ void RsdCpuReferenceImpl::launchThreads(const Allocation** ains, uint32_t inLen,
* that points to an array.
*/
fn(&kparams, mtls->xStart, mtls->xEnd, 0,
- mtls->fep.eStrideOut);
+ mtls->fep.outStride.eStride);
}
}
}
-
- // Free our arrays.
- delete[] kparams.ins;
- delete[] kparams.eStrideIns;
}
}
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
index 5d4b6cc5..2fea3fcd 100644
--- a/cpu_ref/rsCpuCore.h
+++ b/cpu_ref/rsCpuCore.h
@@ -25,6 +25,8 @@
#include <string>
+#define RS_KERNEL_INPUT_THRESHOLD 32
+
namespace bcc {
class BCCContext;
class RSCompilerDriver;
@@ -40,31 +42,36 @@ struct StridePair {
};
struct RsExpandKernelDriverInfo {
- const void *usr;
- uint32_t usrLen;
+ const uint8_t **inPtrs;
+ uint32_t inLen;
+
+ uint8_t *outPtr;
+
+ StridePair *inStrides;
+ StridePair outStride;
uint32_t dimX;
uint32_t dimY;
uint32_t dimZ;
- const uint8_t *ptrIn;
- uint8_t *ptrOut;
- uint32_t eStrideIn;
- uint32_t eStrideOut;
- uint32_t yStrideIn;
- uint32_t yStrideOut;
uint32_t slot;
- const uint8_t** ptrIns;
- StridePair* inStrides;
+ const void *usr;
+ uint32_t usrLen;
- ~RsExpandKernelDriverInfo() {
- if (ptrIns != NULL) {
- delete[] ptrIns;
- }
+ bool heapAllocatedArrays;
- if (inStrides != NULL) {
- delete[] inStrides;
+ RsExpandKernelDriverInfo() : heapAllocatedArrays(false) {}
+
+ ~RsExpandKernelDriverInfo() {
+ if (heapAllocatedArrays) {
+ if (inPtrs != NULL) {
+ delete[] inPtrs;
+ }
+
+ if (inStrides != NULL) {
+ delete[] inStrides;
+ }
}
}
};
@@ -72,15 +79,13 @@ struct RsExpandKernelDriverInfo {
struct RsExpandKernelParams {
// Used by kernels
- const void *in;
+ const void **ins;
+ uint32_t *inEStrides;
void *out;
uint32_t y;
uint32_t z;
uint32_t lid;
- const void **ins;
- uint32_t *eStrideIns;
-
// Used by ScriptGroup and user kernels.
const void *usr;
@@ -115,13 +120,13 @@ typedef void (*WorkerCallback_t)(void *usr, uint32_t idx);
class RsdCpuScriptImpl;
class RsdCpuReferenceImpl;
-typedef struct ScriptTLSStructRec {
+struct ScriptTLSStruct {
android::renderscript::Context * mContext;
const android::renderscript::Script * mScript;
RsdCpuScriptImpl *mImpl;
-} ScriptTLSStruct;
+};
-typedef struct {
+struct MTLaunchStruct {
RsExpandKernelDriverInfo fep;
RsdCpuReferenceImpl *rsc;
@@ -129,7 +134,7 @@ typedef struct {
ForEachFunc_t kernel;
uint32_t sig;
- const Allocation * ain;
+ const Allocation ** ains;
Allocation * aout;
uint32_t mSliceSize;
@@ -145,12 +150,9 @@ typedef struct {
uint32_t arrayStart;
uint32_t arrayEnd;
- // Multi-input data.
- const Allocation ** ains;
-} MTLaunchStruct;
-
-
-
+ const uint8_t *inPtrsBuff[RS_KERNEL_INPUT_THRESHOLD];
+ StridePair inStridesBuff[RS_KERNEL_INPUT_THRESHOLD];
+};
class RsdCpuReferenceImpl : public RsdCpuReference {
public:
@@ -171,9 +173,6 @@ public:
return mWorkers.mCount + 1;
}
- void launchThreads(const Allocation * ain, Allocation * aout,
- const RsScriptCall *sc, MTLaunchStruct *mtls);
-
void launchThreads(const Allocation** ains, uint32_t inLen, Allocation* aout,
const RsScriptCall* sc, MTLaunchStruct* mtls);
diff --git a/cpu_ref/rsCpuIntrinsic.cpp b/cpu_ref/rsCpuIntrinsic.cpp
index 5a7fffd5..8437c998 100644
--- a/cpu_ref/rsCpuIntrinsic.cpp
+++ b/cpu_ref/rsCpuIntrinsic.cpp
@@ -73,54 +73,29 @@ void RsdCpuScriptIntrinsic::invokeFreeChildren() {
}
-void RsdCpuScriptIntrinsic::preLaunch(uint32_t slot, const Allocation * ain,
- Allocation * aout, const void * usr,
- uint32_t usrLen, const RsScriptCall *sc) {
+void RsdCpuScriptIntrinsic::preLaunch(uint32_t slot, const Allocation ** ains,
+ uint32_t inLen, Allocation * aout,
+ const void * usr, uint32_t usrLen,
+ const RsScriptCall *sc) {
}
-void RsdCpuScriptIntrinsic::postLaunch(uint32_t slot, const Allocation * ain,
- Allocation * aout, const void * usr,
- uint32_t usrLen, const RsScriptCall *sc) {
+void RsdCpuScriptIntrinsic::postLaunch(uint32_t slot, const Allocation ** ains,
+ uint32_t inLen, Allocation * aout,
+ const void * usr, uint32_t usrLen,
+ const RsScriptCall *sc) {
}
void RsdCpuScriptIntrinsic::invokeForEach(uint32_t slot,
- const Allocation * ain,
+ const Allocation ** ains,
+ uint32_t inLen,
Allocation * aout,
const void * usr,
uint32_t usrLen,
const RsScriptCall *sc) {
MTLaunchStruct mtls;
- preLaunch(slot, ain, aout, usr, usrLen, sc);
- forEachMtlsSetup(ain, aout, usr, usrLen, sc, &mtls);
- mtls.script = this;
- mtls.fep.slot = slot;
-
- mtls.kernel = (void (*)())mRootPtr;
- mtls.fep.usr = this;
-
- RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
- mCtx->launchThreads(ain, aout, sc, &mtls);
- mCtx->setTLS(oldTLS);
-
- postLaunch(slot, ain, aout, usr, usrLen, sc);
-}
-
-void RsdCpuScriptIntrinsic::invokeForEachMulti(uint32_t slot,
- const Allocation ** ains,
- uint32_t inLen,
- Allocation * aout,
- const void * usr,
- uint32_t usrLen,
- const RsScriptCall *sc) {
-
- MTLaunchStruct mtls;
- /*
- * FIXME: Possibly create new preLaunch and postLaunch functions that take
- * all of the input allocation pointers.
- */
- preLaunch(slot, ains[0], aout, usr, usrLen, sc);
+ preLaunch(slot, ains, inLen, aout, usr, usrLen, sc);
forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls);
mtls.script = this;
@@ -133,7 +108,7 @@ void RsdCpuScriptIntrinsic::invokeForEachMulti(uint32_t slot,
mCtx->launchThreads(ains, inLen, aout, sc, &mtls);
mCtx->setTLS(oldTLS);
- postLaunch(slot, ains[0], aout, usr, usrLen, sc);
+ postLaunch(slot, ains, inLen, aout, usr, usrLen, sc);
}
void RsdCpuScriptIntrinsic::forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls) {
diff --git a/cpu_ref/rsCpuIntrinsic.h b/cpu_ref/rsCpuIntrinsic.h
index bf6a8acd..95aaa141 100644
--- a/cpu_ref/rsCpuIntrinsic.h
+++ b/cpu_ref/rsCpuIntrinsic.h
@@ -28,43 +28,42 @@ class RsdCpuScriptIntrinsic : public RsdCpuScriptImpl {
public:
virtual void populateScript(Script *) = 0;
- virtual void invokeFunction(uint32_t slot, const void *params, size_t paramLength);
+ virtual void invokeFunction(uint32_t slot, const void * params,
+ size_t paramLength);
virtual int invokeRoot();
+
virtual void invokeForEach(uint32_t slot,
- const Allocation * ain,
- Allocation * aout,
- const void * usr,
- uint32_t usrLen,
- const RsScriptCall *sc);
-
- virtual void invokeForEachMulti(uint32_t slot,
- const Allocation ** ain,
- uint32_t inLen,
- Allocation * aout,
- const void * usr,
- uint32_t usrLen,
- const RsScriptCall *sc);
-
- virtual void forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls);
+ const Allocation ** ain,
+ uint32_t inLen,
+ Allocation * aout,
+ const void * usr,
+ uint32_t usrLen,
+ const RsScriptCall *sc);
+
+ virtual void forEachKernelSetup(uint32_t slot, MTLaunchStruct * mtls);
virtual void invokeInit();
virtual void invokeFreeChildren();
- virtual void preLaunch(uint32_t slot, const Allocation * ain,
- Allocation * aout, const void * usr,
- uint32_t usrLen, const RsScriptCall *sc);
- virtual void postLaunch(uint32_t slot, const Allocation * ain,
- Allocation * aout, const void * usr,
- uint32_t usrLen, const RsScriptCall *sc);
-
- virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
- virtual void setGlobalVarWithElemDims(uint32_t slot, const void *data, size_t dataLength,
- const Element *e, const uint32_t *dims, size_t dimLength);
+ virtual void preLaunch(uint32_t slot, const Allocation ** ains,
+ uint32_t inLen, Allocation * aout, const void * usr,
+ uint32_t usrLen, const RsScriptCall * sc);
+ virtual void postLaunch(uint32_t slot, const Allocation ** ains,
+ uint32_t inLen, Allocation * aout,
+ const void * usr, uint32_t usrLen,
+ const RsScriptCall * sc);
+
+ virtual void setGlobalVar(uint32_t slot, const void * data,
+ size_t dataLength);
+ virtual void setGlobalVarWithElemDims(uint32_t slot, const void * data,
+ size_t dataLength, const Element * e,
+ const uint32_t * dims,
+ size_t dimLength);
virtual void setGlobalBind(uint32_t slot, Allocation *data);
virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
virtual ~RsdCpuScriptIntrinsic();
- RsdCpuScriptIntrinsic(RsdCpuReferenceImpl *ctx, const Script *s, const Element *,
- RsScriptIntrinsicID iid);
+ RsdCpuScriptIntrinsic(RsdCpuReferenceImpl * ctx, const Script * s,
+ const Element * e, RsScriptIntrinsicID iid);
protected:
RsScriptIntrinsicID mID;
diff --git a/cpu_ref/rsCpuIntrinsic3DLUT.cpp b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
index c839c19d..a19d8851 100644
--- a/cpu_ref/rsCpuIntrinsic3DLUT.cpp
+++ b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
@@ -64,7 +64,7 @@ void RsdCpuScriptIntrinsic3DLUT::kernel(const RsExpandKernelParams *p,
RsdCpuScriptIntrinsic3DLUT *cp = (RsdCpuScriptIntrinsic3DLUT *)p->usr;
uchar4 *out = (uchar4 *)p->out + xstart;
- uchar4 *in = (uchar4 *)p->in + xstart;
+ uchar4 *in = (uchar4 *)p->ins[0] + xstart;
uint32_t x1 = xstart;
uint32_t x2 = xend;
@@ -161,9 +161,9 @@ void RsdCpuScriptIntrinsic3DLUT::kernel(const RsExpandKernelParams *p,
}
}
-RsdCpuScriptIntrinsic3DLUT::RsdCpuScriptIntrinsic3DLUT(RsdCpuReferenceImpl *ctx,
- const Script *s, const Element *e)
- : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_3DLUT) {
+RsdCpuScriptIntrinsic3DLUT::RsdCpuScriptIntrinsic3DLUT(
+ RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) :
+ RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_3DLUT) {
mRootPtr = &kernel;
}
@@ -185,5 +185,3 @@ RsdCpuScriptImpl * rsdIntrinsic_3DLUT(RsdCpuReferenceImpl *ctx,
return new RsdCpuScriptIntrinsic3DLUT(ctx, s, e);
}
-
-
diff --git a/cpu_ref/rsCpuIntrinsicBlend.cpp b/cpu_ref/rsCpuIntrinsicBlend.cpp
index b6046584..0378e076 100644
--- a/cpu_ref/rsCpuIntrinsicBlend.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlend.cpp
@@ -117,7 +117,7 @@ void RsdCpuScriptIntrinsicBlend::kernel(const RsExpandKernelParams *p,
// instep/outstep can be ignored--sizeof(uchar4) known at compile time
uchar4 *out = (uchar4 *)p->out;
- uchar4 *in = (uchar4 *)p->in;
+ uchar4 *in = (uchar4 *)p->ins[0];
uint32_t x1 = xstart;
uint32_t x2 = xend;
@@ -509,6 +509,3 @@ RsdCpuScriptImpl * rsdIntrinsic_Blend(RsdCpuReferenceImpl *ctx,
const Script *s, const Element *e) {
return new RsdCpuScriptIntrinsicBlend(ctx, s, e);
}
-
-
-
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
index bf78eb3e..4e90ad72 100644
--- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -169,10 +169,9 @@ public:
virtual ~RsdCpuScriptIntrinsicColorMatrix();
RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
- virtual void preLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
- const void * usr, uint32_t usrLen, const RsScriptCall *sc);
- virtual void postLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
- const void * usr, uint32_t usrLen, const RsScriptCall *sc);
+ virtual void preLaunch(uint32_t slot, const Allocation ** ains,
+ uint32_t inLen, Allocation * aout, const void * usr,
+ uint32_t usrLen, const RsScriptCall *sc);
protected:
float fp[16];
@@ -883,8 +882,13 @@ void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsExpandKernelParams *p,
uint32_t xstart, uint32_t xend,
uint32_t instep, uint32_t outstep) {
RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr;
- uchar *out = (uchar *)p->out + outstep * xstart;
- uchar *in = (uchar *)p->in + instep * xstart;
+
+ // Update the instep due to change in parameter passing.
+ instep = p->inEStrides[0];
+
+ uchar *out = (uchar *)p->out + outstep * xstart;
+ uchar *in = (uchar *)p->ins[0] + instep * xstart;
+
uint32_t x1 = xstart;
uint32_t x2 = xend;
@@ -932,11 +936,15 @@ void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsExpandKernelParams *p,
}
}
-void RsdCpuScriptIntrinsicColorMatrix::preLaunch(
- uint32_t slot, const Allocation * ain, Allocation * aout,
- const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
+void RsdCpuScriptIntrinsicColorMatrix::preLaunch(uint32_t slot,
+ const Allocation ** ains,
+ uint32_t inLen,
+ Allocation * aout,
+ const void * usr,
+ uint32_t usrLen,
+ const RsScriptCall *sc) {
- const Element *ein = ain->mHal.state.type->getElement();
+ const Element *ein = ains[0]->mHal.state.type->getElement();
const Element *eout = aout->mHal.state.type->getElement();
if (ein->getType() == eout->getType()) {
@@ -953,8 +961,8 @@ void RsdCpuScriptIntrinsicColorMatrix::preLaunch(
}
}
- Key_t key = computeKey(ain->mHal.state.type->getElement(),
- aout->mHal.state.type->getElement());
+ Key_t key = computeKey(ein, eout);
+
#if defined(ARCH_X86_HAVE_SSSE3)
if ((mOptKernel == NULL) || (mLastKey.key != key.key)) {
// FIXME: Disable mOptKernel to pass RS color matrix CTS cases
@@ -996,12 +1004,6 @@ void RsdCpuScriptIntrinsicColorMatrix::preLaunch(
#endif //if !defined(ARCH_X86_HAVE_SSSE3)
}
-void RsdCpuScriptIntrinsicColorMatrix::postLaunch(
- uint32_t slot, const Allocation * ain, Allocation * aout,
- const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
-
-}
-
RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix(
RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
: RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {
diff --git a/cpu_ref/rsCpuIntrinsicHistogram.cpp b/cpu_ref/rsCpuIntrinsicHistogram.cpp
index 1c430b72..b5dbfa80 100644
--- a/cpu_ref/rsCpuIntrinsicHistogram.cpp
+++ b/cpu_ref/rsCpuIntrinsicHistogram.cpp
@@ -36,10 +36,10 @@ public:
RsdCpuScriptIntrinsicHistogram(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
protected:
- void preLaunch(uint32_t slot, const Allocation * ain,
+ void preLaunch(uint32_t slot, const Allocation ** ains, uint32_t inLen,
Allocation * aout, const void * usr,
uint32_t usrLen, const RsScriptCall *sc);
- void postLaunch(uint32_t slot, const Allocation * ain,
+ void postLaunch(uint32_t slot, const Allocation ** ains, uint32_t inLen,
Allocation * aout, const void * usr,
uint32_t usrLen, const RsScriptCall *sc);
@@ -97,9 +97,12 @@ void RsdCpuScriptIntrinsicHistogram::setGlobalVar(uint32_t slot, const void *dat
-void RsdCpuScriptIntrinsicHistogram::preLaunch(uint32_t slot, const Allocation * ain,
- Allocation * aout, const void * usr,
- uint32_t usrLen, const RsScriptCall *sc) {
+void
+RsdCpuScriptIntrinsicHistogram::preLaunch(uint32_t slot,
+ const Allocation ** ains,
+ uint32_t inLen, Allocation * aout,
+ const void * usr, uint32_t usrLen,
+ const RsScriptCall *sc) {
const uint32_t threads = mCtx->getThreadCount();
uint32_t vSize = mAllocOut->getType()->getElement()->getVectorSize();
@@ -123,7 +126,7 @@ void RsdCpuScriptIntrinsicHistogram::preLaunch(uint32_t slot, const Allocation *
}
break;
case 1:
- switch(ain->getType()->getElement()->getVectorSize()) {
+ switch(ains[0]->getType()->getElement()->getVectorSize()) {
case 1:
mRootPtr = &kernelP1L1;
break;
@@ -142,9 +145,12 @@ void RsdCpuScriptIntrinsicHistogram::preLaunch(uint32_t slot, const Allocation *
memset(mSums, 0, 256 * sizeof(int32_t) * threads * vSize);
}
-void RsdCpuScriptIntrinsicHistogram::postLaunch(uint32_t slot, const Allocation * ain,
- Allocation * aout, const void * usr,
- uint32_t usrLen, const RsScriptCall *sc) {
+void
+RsdCpuScriptIntrinsicHistogram::postLaunch(uint32_t slot,
+ const Allocation ** ains,
+ uint32_t inLen, Allocation * aout,
+ const void * usr, uint32_t usrLen,
+ const RsScriptCall *sc) {
unsigned int *o = (unsigned int *)mAllocOut->mHal.drvState.lod[0].mallocPtr;
uint32_t threads = mCtx->getThreadCount();
@@ -165,7 +171,7 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1U4(const RsExpandKernelParams *p,
uint32_t instep, uint32_t outstep) {
RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
- uchar *in = (uchar *)p->in;
+ uchar *in = (uchar *)p->ins[0];
int * sums = &cp->mSums[256 * 4 * p->lid];
for (uint32_t x = xstart; x < xend; x++) {
@@ -173,7 +179,7 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1U4(const RsExpandKernelParams *p,
sums[(in[1] << 2) + 1] ++;
sums[(in[2] << 2) + 2] ++;
sums[(in[3] << 2) + 3] ++;
- in += instep;
+ in += p->inEStrides[0];
}
}
@@ -182,14 +188,14 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1U3(const RsExpandKernelParams *p,
uint32_t instep, uint32_t outstep) {
RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
- uchar *in = (uchar *)p->in;
+ uchar *in = (uchar *)p->ins[0];
int * sums = &cp->mSums[256 * 4 * p->lid];
for (uint32_t x = xstart; x < xend; x++) {
sums[(in[0] << 2) ] ++;
sums[(in[1] << 2) + 1] ++;
sums[(in[2] << 2) + 2] ++;
- in += instep;
+ in += p->inEStrides[0];
}
}
@@ -198,13 +204,13 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1U2(const RsExpandKernelParams *p,
uint32_t instep, uint32_t outstep) {
RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
- uchar *in = (uchar *)p->in;
+ uchar *in = (uchar *)p->ins[0];
int * sums = &cp->mSums[256 * 2 * p->lid];
for (uint32_t x = xstart; x < xend; x++) {
sums[(in[0] << 1) ] ++;
sums[(in[1] << 1) + 1] ++;
- in += instep;
+ in += p->inEStrides[0];
}
}
@@ -213,7 +219,7 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1L4(const RsExpandKernelParams *p,
uint32_t instep, uint32_t outstep) {
RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
- uchar *in = (uchar *)p->in;
+ uchar *in = (uchar *)p->ins[0];
int * sums = &cp->mSums[256 * p->lid];
for (uint32_t x = xstart; x < xend; x++) {
@@ -222,7 +228,7 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1L4(const RsExpandKernelParams *p,
(cp->mDotI[2] * in[2]) +
(cp->mDotI[3] * in[3]);
sums[(t + 0x7f) >> 8] ++;
- in += instep;
+ in += p->inEStrides[0];
}
}
@@ -231,7 +237,7 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1L3(const RsExpandKernelParams *p,
uint32_t instep, uint32_t outstep) {
RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
- uchar *in = (uchar *)p->in;
+ uchar *in = (uchar *)p->ins[0];
int * sums = &cp->mSums[256 * p->lid];
for (uint32_t x = xstart; x < xend; x++) {
@@ -239,7 +245,7 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1L3(const RsExpandKernelParams *p,
(cp->mDotI[1] * in[1]) +
(cp->mDotI[2] * in[2]);
sums[(t + 0x7f) >> 8] ++;
- in += instep;
+ in += p->inEStrides[0];
}
}
@@ -248,14 +254,14 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1L2(const RsExpandKernelParams *p,
uint32_t instep, uint32_t outstep) {
RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
- uchar *in = (uchar *)p->in;
+ uchar *in = (uchar *)p->ins[0];
int * sums = &cp->mSums[256 * p->lid];
for (uint32_t x = xstart; x < xend; x++) {
int t = (cp->mDotI[0] * in[0]) +
(cp->mDotI[1] * in[1]);
sums[(t + 0x7f) >> 8] ++;
- in += instep;
+ in += p->inEStrides[0];
}
}
@@ -264,13 +270,13 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1L1(const RsExpandKernelParams *p,
uint32_t instep, uint32_t outstep) {
RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
- uchar *in = (uchar *)p->in;
+ uchar *in = (uchar *)p->ins[0];
int * sums = &cp->mSums[256 * p->lid];
for (uint32_t x = xstart; x < xend; x++) {
int t = (cp->mDotI[0] * in[0]);
sums[(t + 0x7f) >> 8] ++;
- in += instep;
+ in += p->inEStrides[0];
}
}
@@ -279,12 +285,12 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1U1(const RsExpandKernelParams *p,
uint32_t instep, uint32_t outstep) {
RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
- uchar *in = (uchar *)p->in;
+ uchar *in = (uchar *)p->ins[0];
int * sums = &cp->mSums[256 * p->lid];
for (uint32_t x = xstart; x < xend; x++) {
sums[in[0]] ++;
- in += instep;
+ in += p->inEStrides[0];
}
}
@@ -323,5 +329,3 @@ RsdCpuScriptImpl * rsdIntrinsic_Histogram(RsdCpuReferenceImpl *ctx, const Script
return new RsdCpuScriptIntrinsicHistogram(ctx, s, e);
}
-
-
diff --git a/cpu_ref/rsCpuIntrinsicLUT.cpp b/cpu_ref/rsCpuIntrinsicLUT.cpp
index db73a838..9d3b4003 100644
--- a/cpu_ref/rsCpuIntrinsicLUT.cpp
+++ b/cpu_ref/rsCpuIntrinsicLUT.cpp
@@ -59,7 +59,7 @@ void RsdCpuScriptIntrinsicLUT::kernel(const RsExpandKernelParams *p,
RsdCpuScriptIntrinsicLUT *cp = (RsdCpuScriptIntrinsicLUT *)p->usr;
uchar *out = (uchar *)p->out;
- const uchar *in = (uchar *)p->in;
+ const uchar *in = (uchar *)p->ins[0];
uint32_t x1 = xstart;
uint32_t x2 = xend;
@@ -103,5 +103,3 @@ RsdCpuScriptImpl * rsdIntrinsic_LUT(RsdCpuReferenceImpl *ctx,
return new RsdCpuScriptIntrinsicLUT(ctx, s, e);
}
-
-
diff --git a/cpu_ref/rsCpuIntrinsicResize.cpp b/cpu_ref/rsCpuIntrinsicResize.cpp
index af1127e7..3a307d63 100644
--- a/cpu_ref/rsCpuIntrinsicResize.cpp
+++ b/cpu_ref/rsCpuIntrinsicResize.cpp
@@ -35,8 +35,8 @@ public:
virtual ~RsdCpuScriptIntrinsicResize();
RsdCpuScriptIntrinsicResize(RsdCpuReferenceImpl *ctx, const Script *s, const Element *);
- virtual void preLaunch(uint32_t slot, const Allocation * ain,
- Allocation * aout, const void * usr,
+ virtual void preLaunch(uint32_t slot, const Allocation ** ains,
+ uint32_t inLen, Allocation * aout, const void * usr,
uint32_t usrLen, const RsScriptCall *sc);
float scaleX;
@@ -308,9 +308,11 @@ RsdCpuScriptIntrinsicResize::RsdCpuScriptIntrinsicResize (
RsdCpuScriptIntrinsicResize::~RsdCpuScriptIntrinsicResize() {
}
-void RsdCpuScriptIntrinsicResize::preLaunch(uint32_t slot, const Allocation * ain,
- Allocation * aout, const void * usr,
- uint32_t usrLen, const RsScriptCall *sc)
+void RsdCpuScriptIntrinsicResize::preLaunch(uint32_t slot,
+ const Allocation ** ains,
+ uint32_t inLen, Allocation * aout,
+ const void * usr, uint32_t usrLen,
+ const RsScriptCall *sc)
{
if (!mAlloc.get()) {
ALOGE("Resize executed without input, skipping");
@@ -351,5 +353,3 @@ RsdCpuScriptImpl * rsdIntrinsic_Resize(RsdCpuReferenceImpl *ctx, const Script *s
return new RsdCpuScriptIntrinsicResize(ctx, s, e);
}
-
-
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index a11fda19..05984207 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -789,144 +789,33 @@ void RsdCpuScriptImpl::populateScript(Script *script) {
typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);
-void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation * ain, Allocation * aout,
+void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains,
+ uint32_t inLen,
+ Allocation * aout,
const void * usr, uint32_t usrLen,
const RsScriptCall *sc,
MTLaunchStruct *mtls) {
memset(mtls, 0, sizeof(MTLaunchStruct));
- // possible for this to occur if IO_OUTPUT/IO_INPUT with no bound surface
- if (ain && (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr == NULL) {
- mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null in allocations");
- return;
- }
- if (aout && (const uint8_t *)aout->mHal.drvState.lod[0].mallocPtr == NULL) {
- mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null out allocations");
- return;
- }
-
- if (ain != NULL) {
- const Type *inType = ain->getType();
+ for (int index = inLen; --index >= 0;) {
+ const Allocation* ain = ains[index];
- mtls->fep.dimX = inType->getDimX();
- mtls->fep.dimY = inType->getDimY();
- mtls->fep.dimZ = inType->getDimZ();
-
- } else if (aout != NULL) {
- const Type *outType = aout->getType();
-
- mtls->fep.dimX = outType->getDimX();
- mtls->fep.dimY = outType->getDimY();
- mtls->fep.dimZ = outType->getDimZ();
-
- } else {
- mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null allocations");
- return;
- }
-
- if (ain != NULL && aout != NULL) {
- if (!ain->hasSameDims(aout)) {
+ // possible for this to occur if IO_OUTPUT/IO_INPUT with no bound surface
+ if (ain != NULL && (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr == NULL) {
mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
- "Failed to launch kernel; dimensions of input and output allocations do not match.");
-
+ "rsForEach called with null in allocations");
return;
}
}
- if (!sc || (sc->xEnd == 0)) {
- mtls->xEnd = mtls->fep.dimX;
- } else {
- rsAssert(sc->xStart < mtls->fep.dimX);
- rsAssert(sc->xEnd <= mtls->fep.dimX);
- rsAssert(sc->xStart < sc->xEnd);
- mtls->xStart = rsMin(mtls->fep.dimX, sc->xStart);
- mtls->xEnd = rsMin(mtls->fep.dimX, sc->xEnd);
- if (mtls->xStart >= mtls->xEnd) return;
- }
-
- if (!sc || (sc->yEnd == 0)) {
- mtls->yEnd = mtls->fep.dimY;
- } else {
- rsAssert(sc->yStart < mtls->fep.dimY);
- rsAssert(sc->yEnd <= mtls->fep.dimY);
- rsAssert(sc->yStart < sc->yEnd);
- mtls->yStart = rsMin(mtls->fep.dimY, sc->yStart);
- mtls->yEnd = rsMin(mtls->fep.dimY, sc->yEnd);
- if (mtls->yStart >= mtls->yEnd) return;
- }
-
- if (!sc || (sc->zEnd == 0)) {
- mtls->zEnd = mtls->fep.dimZ;
- } else {
- rsAssert(sc->zStart < mtls->fep.dimZ);
- rsAssert(sc->zEnd <= mtls->fep.dimZ);
- rsAssert(sc->zStart < sc->zEnd);
- mtls->zStart = rsMin(mtls->fep.dimZ, sc->zStart);
- mtls->zEnd = rsMin(mtls->fep.dimZ, sc->zEnd);
- if (mtls->zStart >= mtls->zEnd) return;
- }
-
- mtls->xEnd = rsMax((uint32_t)1, mtls->xEnd);
- mtls->yEnd = rsMax((uint32_t)1, mtls->yEnd);
- mtls->zEnd = rsMax((uint32_t)1, mtls->zEnd);
- mtls->arrayEnd = rsMax((uint32_t)1, mtls->arrayEnd);
-
- rsAssert(!ain || (ain->getType()->getDimZ() == 0));
-
- mtls->rsc = mCtx;
- mtls->ain = ain;
- mtls->aout = aout;
- mtls->fep.usr = usr;
- mtls->fep.usrLen = usrLen;
- mtls->mSliceSize = 1;
- mtls->mSliceNum = 0;
-
- mtls->fep.ptrIn = NULL;
- mtls->fep.eStrideIn = 0;
- mtls->isThreadable = mIsThreadable;
-
- if (ain) {
- mtls->fep.ptrIn = (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr;
- mtls->fep.eStrideIn = ain->getType()->getElementSizeBytes();
- mtls->fep.yStrideIn = ain->mHal.drvState.lod[0].stride;
- }
-
- mtls->fep.ptrOut = NULL;
- mtls->fep.eStrideOut = 0;
- if (aout) {
- mtls->fep.ptrOut = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
- mtls->fep.eStrideOut = aout->getType()->getElementSizeBytes();
- mtls->fep.yStrideOut = aout->mHal.drvState.lod[0].stride;
- }
-}
-
-void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains, uint32_t inLen,
- Allocation * aout,
- const void * usr, uint32_t usrLen,
- const RsScriptCall *sc,
- MTLaunchStruct *mtls) {
-
- memset(mtls, 0, sizeof(MTLaunchStruct));
-
- // possible for this to occur if IO_OUTPUT/IO_INPUT with no bound surface
- if (ains != NULL) {
- for (int index = inLen; --index >= 0;) {
- const Allocation* ain = ains[index];
-
- if (ain != NULL && (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr == NULL) {
- mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null in allocations");
- return;
- }
- }
- }
-
if (aout && (const uint8_t *)aout->mHal.drvState.lod[0].mallocPtr == NULL) {
- mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null out allocations");
+ mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+ "rsForEach called with null out allocations");
return;
}
- if (ains != NULL) {
+ if (inLen > 0) {
const Allocation *ain0 = ains[0];
const Type *inType = ain0->getType();
@@ -951,11 +840,12 @@ void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains, uint32_t inLen
mtls->fep.dimZ = outType->getDimZ();
} else {
- mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null allocations");
+ mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+ "rsForEach called with null allocations");
return;
}
- if (ains != NULL && aout != NULL) {
+ if (inLen > 0 && aout != NULL) {
if (!ains[0]->hasSameDims(aout)) {
mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
"Failed to launch kernel; dimensions of input and output allocations do not match.");
@@ -1002,7 +892,7 @@ void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains, uint32_t inLen
mtls->zEnd = rsMax((uint32_t)1, mtls->zEnd);
mtls->arrayEnd = rsMax((uint32_t)1, mtls->arrayEnd);
- rsAssert(!ains || (ains[0]->getType()->getDimZ() == 0));
+ rsAssert(inLen == 0 || (ains[0]->getType()->getDimZ() == 0));
mtls->rsc = mCtx;
mtls->ains = ains;
@@ -1012,18 +902,28 @@ void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains, uint32_t inLen
mtls->mSliceSize = 1;
mtls->mSliceNum = 0;
- mtls->fep.ptrIns = NULL;
- mtls->fep.eStrideIn = 0;
+ mtls->fep.inPtrs = NULL;
+ mtls->fep.inStrides = NULL;
mtls->isThreadable = mIsThreadable;
- if (ains) {
- mtls->fep.ptrIns = new const uint8_t*[inLen];
- mtls->fep.inStrides = new StridePair[inLen];
+ if (inLen > 0) {
+
+ if (inLen <= RS_KERNEL_INPUT_THRESHOLD) {
+ mtls->fep.inPtrs = (const uint8_t**)mtls->inPtrsBuff;
+ mtls->fep.inStrides = mtls->inStridesBuff;
+ } else {
+ mtls->fep.heapAllocatedArrays = true;
+
+ mtls->fep.inPtrs = new const uint8_t*[inLen];
+ mtls->fep.inStrides = new StridePair[inLen];
+ }
+
+ mtls->fep.inLen = inLen;
for (int index = inLen; --index >= 0;) {
const Allocation *ain = ains[index];
- mtls->fep.ptrIns[index] =
+ mtls->fep.inPtrs[index] =
(const uint8_t*)ain->mHal.drvState.lod[0].mallocPtr;
mtls->fep.inStrides[index].eStride =
@@ -1033,41 +933,27 @@ void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains, uint32_t inLen
}
}
- mtls->fep.ptrOut = NULL;
- mtls->fep.eStrideOut = 0;
- if (aout) {
- mtls->fep.ptrOut = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
- mtls->fep.eStrideOut = aout->getType()->getElementSizeBytes();
- mtls->fep.yStrideOut = aout->mHal.drvState.lod[0].stride;
+ mtls->fep.outPtr = NULL;
+ mtls->fep.outStride.eStride = 0;
+ mtls->fep.outStride.yStride = 0;
+ if (aout != NULL) {
+ mtls->fep.outPtr = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
+
+ mtls->fep.outStride.eStride = aout->getType()->getElementSizeBytes();
+ mtls->fep.outStride.yStride = aout->mHal.drvState.lod[0].stride;
}
}
void RsdCpuScriptImpl::invokeForEach(uint32_t slot,
- const Allocation * ain,
+ const Allocation ** ains,
+ uint32_t inLen,
Allocation * aout,
const void * usr,
uint32_t usrLen,
const RsScriptCall *sc) {
MTLaunchStruct mtls;
- forEachMtlsSetup(ain, aout, usr, usrLen, sc, &mtls);
- forEachKernelSetup(slot, &mtls);
-
- RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
- mCtx->launchThreads(ain, aout, sc, &mtls);
- mCtx->setTLS(oldTLS);
-}
-
-void RsdCpuScriptImpl::invokeForEachMulti(uint32_t slot,
- const Allocation ** ains,
- uint32_t inLen,
- Allocation * aout,
- const void * usr,
- uint32_t usrLen,
- const RsScriptCall *sc) {
-
- MTLaunchStruct mtls;
forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls);
forEachKernelSetup(slot, &mtls);
@@ -1338,17 +1224,15 @@ Allocation * RsdCpuScriptImpl::getAllocationForPointer(const void *ptr) const {
return NULL;
}
-void RsdCpuScriptImpl::preLaunch(uint32_t slot, const Allocation * ain,
- Allocation * aout, const void * usr,
- uint32_t usrLen, const RsScriptCall *sc)
-{
-}
+void RsdCpuScriptImpl::preLaunch(uint32_t slot, const Allocation ** ains,
+ uint32_t inLen, Allocation * aout,
+ const void * usr, uint32_t usrLen,
+ const RsScriptCall *sc) {}
-void RsdCpuScriptImpl::postLaunch(uint32_t slot, const Allocation * ain,
- Allocation * aout, const void * usr,
- uint32_t usrLen, const RsScriptCall *sc)
-{
-}
+void RsdCpuScriptImpl::postLaunch(uint32_t slot, const Allocation ** ains,
+ uint32_t inLen, Allocation * aout,
+ const void * usr, uint32_t usrLen,
+ const RsScriptCall *sc) {}
}
diff --git a/cpu_ref/rsCpuScript.h b/cpu_ref/rsCpuScript.h
index d51e9e3f..f0843cc0 100644
--- a/cpu_ref/rsCpuScript.h
+++ b/cpu_ref/rsCpuScript.h
@@ -64,26 +64,22 @@ public:
virtual void invokeFunction(uint32_t slot, const void *params, size_t paramLength);
virtual int invokeRoot();
- virtual void preLaunch(uint32_t slot, const Allocation * ain,
- Allocation * aout, const void * usr,
+ virtual void preLaunch(uint32_t slot, const Allocation ** ains,
+ uint32_t inLen, Allocation * aout, const void * usr,
uint32_t usrLen, const RsScriptCall *sc);
- virtual void postLaunch(uint32_t slot, const Allocation * ain,
- Allocation * aout, const void * usr,
- uint32_t usrLen, const RsScriptCall *sc);
+ virtual void postLaunch(uint32_t slot, const Allocation ** ains,
+ uint32_t inLen, Allocation * aout,
+ const void * usr, uint32_t usrLen,
+ const RsScriptCall *sc);
+
virtual void invokeForEach(uint32_t slot,
- const Allocation * ain,
- Allocation * aout,
- const void * usr,
- uint32_t usrLen,
- const RsScriptCall *sc);
-
- virtual void invokeForEachMulti(uint32_t slot,
- const Allocation** ains,
- uint32_t inLen,
- Allocation* aout,
- const void* usr,
- uint32_t usrLen,
- const RsScriptCall* sc);
+ const Allocation ** ains,
+ uint32_t inLen,
+ Allocation* aout,
+ const void* usr,
+ uint32_t usrLen,
+ const RsScriptCall* sc);
+
virtual void invokeInit();
virtual void invokeFreeChildren();
@@ -100,10 +96,6 @@ public:
const Script * getScript() {return mScript;}
- void forEachMtlsSetup(const Allocation * ain, Allocation * aout,
- const void * usr, uint32_t usrLen,
- const RsScriptCall *sc, MTLaunchStruct *mtls);
-
void forEachMtlsSetup(const Allocation ** ains, uint32_t inLen,
Allocation * aout, const void * usr, uint32_t usrLen,
const RsScriptCall *sc, MTLaunchStruct *mtls);
diff --git a/cpu_ref/rsCpuScriptGroup.cpp b/cpu_ref/rsCpuScriptGroup.cpp
index 08785523..20ee09db 100644
--- a/cpu_ref/rsCpuScriptGroup.cpp
+++ b/cpu_ref/rsCpuScriptGroup.cpp
@@ -53,38 +53,45 @@ void CpuScriptGroupImpl::scriptGroupRoot(const RsExpandKernelParams *kparams,
uint32_t instep, uint32_t outstep) {
- const ScriptList *sl = (const ScriptList *)kparams->usr;
+ const ScriptList *sl = (const ScriptList *)kparams->usr;
RsExpandKernelParams *mkparams = (RsExpandKernelParams *)kparams;
+ const void **oldIns = mkparams->ins;
+ uint32_t *oldStrides = mkparams->inEStrides;
+
+ void *localIns[1];
+ uint32_t localStride[1];
+
+ mkparams->ins = (const void**)localIns;
+ mkparams->inEStrides = localStride;
+
for (size_t ct = 0; ct < sl->count; ct++) {
ScriptGroupRootFunc_t func;
func = (ScriptGroupRootFunc_t)sl->fnPtrs[ct];
mkparams->usr = sl->usrPtrs[ct];
- mkparams->in = NULL;
- mkparams->out = NULL;
-
- uint32_t istep = 0;
- uint32_t ostep = 0;
-
if (sl->ins[ct]) {
- mkparams->in =
- (const uint8_t *)sl->ins[ct]->mHal.drvState.lod[0].mallocPtr;
+ localIns[0] = sl->ins[ct]->mHal.drvState.lod[0].mallocPtr;
- istep = sl->ins[ct]->mHal.state.elementSizeBytes;
+ localStride[0] = sl->ins[ct]->mHal.state.elementSizeBytes;
if (sl->inExts[ct]) {
- mkparams->in =
- (const uint8_t *)mkparams->in +
- sl->ins[ct]->mHal.drvState.lod[0].stride * kparams->y;
+ localIns[0] = (void*)
+ ((const uint8_t *)localIns[0] +
+ sl->ins[ct]->mHal.drvState.lod[0].stride * kparams->y);
} else if (sl->ins[ct]->mHal.drvState.lod[0].dimY > kparams->lid) {
- mkparams->in =
- (const uint8_t *)mkparams->in +
- sl->ins[ct]->mHal.drvState.lod[0].stride * kparams->lid;
+ localIns[0] = (void*)
+ ((const uint8_t *)localIns[0] +
+ sl->ins[ct]->mHal.drvState.lod[0].stride * kparams->lid);
}
+
+ } else {
+ localIns[0] = NULL;
+ localStride[0] = 0;
}
+ uint32_t ostep;
if (sl->outs[ct]) {
mkparams->out =
(uint8_t *)sl->outs[ct]->mHal.drvState.lod[0].mallocPtr;
@@ -101,14 +108,23 @@ void CpuScriptGroupImpl::scriptGroupRoot(const RsExpandKernelParams *kparams,
(uint8_t *)mkparams->out +
sl->outs[ct]->mHal.drvState.lod[0].stride * kparams->lid;
}
+ } else {
+ mkparams->out = NULL;
+ ostep = 0;
}
//ALOGE("kernel %i %p,%p %p,%p", ct, mp->ptrIn, mp->in, mp->ptrOut, mp->out);
- func(kparams, xstart, xend, istep, ostep);
+ /*
+ * The fourth argument is zero here because kernels get their stride
+ * information from a member of p that points to an array.
+ */
+ func(kparams, xstart, xend, 0, ostep);
}
//ALOGE("script group root");
- mkparams->usr = sl;
+ mkparams->ins = oldIns;
+ mkparams->inEStrides = oldStrides;
+ mkparams->usr = sl;
}
@@ -195,17 +211,33 @@ void CpuScriptGroupImpl::execute() {
MTLaunchStruct mtls;
- if(fieldDep) {
+ if (fieldDep) {
for (size_t ct=0; ct < ins.size(); ct++) {
Script *s = kernels[ct]->mScript;
RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
uint32_t slot = kernels[ct]->mSlot;
- si->forEachMtlsSetup(ins[ct], outs[ct], NULL, 0, NULL, &mtls);
+ uint32_t inLen;
+ const Allocation **ains;
+
+ if (ins[ct] == NULL) {
+ inLen = 0;
+ ains = NULL;
+
+ } else {
+ inLen = 1;
+ ains = const_cast<const Allocation**>(&ins[ct]);
+ }
+
+ si->forEachMtlsSetup(ains, inLen, outs[ct], NULL, 0, NULL, &mtls);
+
si->forEachKernelSetup(slot, &mtls);
- si->preLaunch(slot, ins[ct], outs[ct], mtls.fep.usr, mtls.fep.usrLen, NULL);
- mCtx->launchThreads(ins[ct], outs[ct], NULL, &mtls);
- si->postLaunch(slot, ins[ct], outs[ct], NULL, 0, NULL);
+ si->preLaunch(slot, ains, inLen, outs[ct], mtls.fep.usr,
+ mtls.fep.usrLen, NULL);
+
+ mCtx->launchThreads(ains, inLen, outs[ct], NULL, &mtls);
+
+ si->postLaunch(slot, ains, inLen, outs[ct], NULL, 0, NULL);
}
} else {
ScriptList sl;
@@ -214,6 +246,18 @@ void CpuScriptGroupImpl::execute() {
sl.kernels = kernels.array();
sl.count = kernels.size();
+ uint32_t inLen;
+ const Allocation **ains;
+
+ if (ins[0] == NULL) {
+ inLen = 0;
+ ains = NULL;
+
+ } else {
+ inLen = 1;
+ ains = const_cast<const Allocation**>(&ins[0]);
+ }
+
Vector<const void *> usrPtrs;
Vector<const void *> fnPtrs;
Vector<uint32_t> sigs;
@@ -225,7 +269,8 @@ void CpuScriptGroupImpl::execute() {
fnPtrs.add((void *)mtls.kernel);
usrPtrs.add(mtls.fep.usr);
sigs.add(mtls.fep.usrLen);
- si->preLaunch(kernels[ct]->mSlot, ins[ct], outs[ct], mtls.fep.usr, mtls.fep.usrLen, NULL);
+ si->preLaunch(kernels[ct]->mSlot, ains, inLen, outs[ct],
+ mtls.fep.usr, mtls.fep.usrLen, NULL);
}
sl.sigs = sigs.array();
sl.usrPtrs = usrPtrs.array();
@@ -235,16 +280,20 @@ void CpuScriptGroupImpl::execute() {
Script *s = kernels[0]->mScript;
RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
- si->forEachMtlsSetup(ins[0], outs[0], NULL, 0, NULL, &mtls);
+
+ si->forEachMtlsSetup(ains, inLen, outs[0], NULL, 0, NULL, &mtls);
+
mtls.script = NULL;
mtls.kernel = (void (*)())&scriptGroupRoot;
mtls.fep.usr = &sl;
- mCtx->launchThreads(ins[0], outs[0], NULL, &mtls);
+
+ mCtx->launchThreads(ains, inLen, outs[0], NULL, &mtls);
for (size_t ct=0; ct < kernels.size(); ct++) {
Script *s = kernels[ct]->mScript;
RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
- si->postLaunch(kernels[ct]->mSlot, ins[ct], outs[ct], NULL, 0, NULL);
+ si->postLaunch(kernels[ct]->mSlot, ains, inLen, outs[ct], NULL, 0,
+ NULL);
}
}
}
diff --git a/cpu_ref/rsd_cpu.h b/cpu_ref/rsd_cpu.h
index 0076cb98..4728b7c8 100644
--- a/cpu_ref/rsd_cpu.h
+++ b/cpu_ref/rsd_cpu.h
@@ -69,21 +69,15 @@ public:
virtual void populateScript(Script *) = 0;
virtual void invokeFunction(uint32_t slot, const void *params, size_t paramLength) = 0;
virtual int invokeRoot() = 0;
+
virtual void invokeForEach(uint32_t slot,
- const Allocation * ain,
- Allocation * aout,
- const void * usr,
- uint32_t usrLen,
- const RsScriptCall *sc) = 0;
-
- virtual void invokeForEachMulti(uint32_t slot,
- const Allocation** ains,
- uint32_t inLen,
- Allocation * aout,
- const void * usr,
- uint32_t usrLen,
- const RsScriptCall *sc) = 0;
-
+ const Allocation ** ains,
+ uint32_t inLen,
+ Allocation * aout,
+ const void * usr,
+ uint32_t usrLen,
+ const RsScriptCall *sc) = 0;
+
virtual void invokeInit() = 0;
virtual void invokeFreeChildren() = 0;