diff options
author | Stephen Hines <srhines@google.com> | 2014-08-13 17:32:10 +0000 |
---|---|---|
committer | Stephen Hines <srhines@google.com> | 2014-08-13 17:32:10 +0000 |
commit | 4b2bea3dc20865f3a198797702e19912a6a2171c (patch) | |
tree | b028521e6474ab22bc99571ead62e1e4f0cb2dc6 /cpu_ref | |
parent | 818cfa034e257c7bb48356257f5cb67334e19aa6 (diff) | |
download | rs-4b2bea3dc20865f3a198797702e19912a6a2171c.tar.gz |
Revert "Collapse code paths for single- and multi-input kernels."
This reverts commit 818cfa034e257c7bb48356257f5cb67334e19aa6.
Change-Id: I59f39f52e6c8f60bb01cbcb8ccf2215eaf46a57f
Diffstat (limited to 'cpu_ref')
-rw-r--r-- | cpu_ref/Android.mk | 2 | ||||
-rw-r--r-- | cpu_ref/rsCpuCore.cpp | 242 | ||||
-rw-r--r-- | cpu_ref/rsCpuCore.h | 65 | ||||
-rw-r--r-- | cpu_ref/rsCpuIntrinsic.cpp | 49 | ||||
-rw-r--r-- | cpu_ref/rsCpuIntrinsic.h | 55 | ||||
-rw-r--r-- | cpu_ref/rsCpuIntrinsic3DLUT.cpp | 10 | ||||
-rw-r--r-- | cpu_ref/rsCpuIntrinsicBlend.cpp | 5 | ||||
-rw-r--r-- | cpu_ref/rsCpuIntrinsicColorMatrix.cpp | 38 | ||||
-rw-r--r-- | cpu_ref/rsCpuIntrinsicHistogram.cpp | 58 | ||||
-rw-r--r-- | cpu_ref/rsCpuIntrinsicLUT.cpp | 4 | ||||
-rw-r--r-- | cpu_ref/rsCpuIntrinsicResize.cpp | 14 | ||||
-rw-r--r-- | cpu_ref/rsCpuScript.cpp | 214 | ||||
-rw-r--r-- | cpu_ref/rsCpuScript.h | 36 | ||||
-rw-r--r-- | cpu_ref/rsCpuScriptGroup.cpp | 103 | ||||
-rw-r--r-- | cpu_ref/rsd_cpu.h | 22 |
15 files changed, 537 insertions, 380 deletions
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk index 729e7022..aeb75a65 100644 --- a/cpu_ref/Android.mk +++ b/cpu_ref/Android.mk @@ -91,7 +91,7 @@ include external/libcxx/libcxx.mk endif include frameworks/compile/libbcc/libbcc-targets.mk -LOCAL_CFLAGS += $(rs_base_CFLAGS) -std=c++11 +LOCAL_CFLAGS += $(rs_base_CFLAGS) LOCAL_MODULE_TAGS := optional diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp index db3cc7fa..a0564fc1 100644 --- a/cpu_ref/rsCpuCore.cpp +++ b/cpu_ref/rsCpuCore.cpp @@ -350,134 +350,180 @@ RsdCpuReferenceImpl::~RsdCpuReferenceImpl() { } typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t); -typedef void (*walk_loop_t)(MTLaunchStruct*, - RsExpandKernelParams&, - outer_foreach_t); - -static void walk_wrapper(void* usr, uint32_t idx, walk_loop_t walk_loop) { +static void wc_xy(void *usr, uint32_t idx) { MTLaunchStruct *mtls = (MTLaunchStruct *)usr; - uint32_t inLen = mtls->fep.inLen; - RsExpandKernelParams kparams; kparams.takeFields(mtls->fep); // Used by CpuScriptGroup, IntrinsicBlur, and IntrinsicHistogram kparams.lid = idx; - if (inLen > 0) { - // Allocate space for our input base pointers. - kparams.ins = (const void**)alloca(inLen * sizeof(void*)); + outer_foreach_t fn = (outer_foreach_t) mtls->kernel; + while (1) { + uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1); + uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize; + uint32_t yEnd = yStart + mtls->mSliceSize; - // Allocate space for our input stride information. - kparams.inEStrides = (uint32_t*)alloca(inLen * sizeof(uint32_t)); + yEnd = rsMin(yEnd, mtls->yEnd); - // Fill our stride information. - for (int inIndex = inLen; --inIndex >= 0;) { - kparams.inEStrides[inIndex] = mtls->fep.inStrides[inIndex].eStride; + if (yEnd <= yStart) { + return; + } + + //ALOGE("usr idx %i, x %i,%i y %i,%i", idx, mtls->xStart, mtls->xEnd, yStart, yEnd); + //ALOGE("usr ptr in %p, out %p", mtls->fep.ptrIn, mtls->fep.ptrOut); + + for (kparams.y = yStart; kparams.y < yEnd; kparams.y++) { + kparams.out = mtls->fep.ptrOut + + (mtls->fep.yStrideOut * kparams.y) + + (mtls->fep.eStrideOut * mtls->xStart); + + kparams.in = mtls->fep.ptrIn + + (mtls->fep.yStrideIn * kparams.y) + + (mtls->fep.eStrideIn * mtls->xStart); + + + fn(&kparams, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn, + mtls->fep.eStrideOut); } } +} + +static void wc_x(void *usr, uint32_t idx) { + MTLaunchStruct *mtls = (MTLaunchStruct *)usr; + + RsExpandKernelParams kparams; + kparams.takeFields(mtls->fep); + + // Used by CpuScriptGroup, IntrinsicBlur, and IntrisicHistogram + kparams.lid = idx; outer_foreach_t fn = (outer_foreach_t) mtls->kernel; + while (1) { + uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1); + uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize; + uint32_t xEnd = xStart + mtls->mSliceSize; + + xEnd = rsMin(xEnd, mtls->xEnd); + + if (xEnd <= xStart) { + return; + } + + //ALOGE("usr slice %i idx %i, x %i,%i", slice, idx, xStart, xEnd); + //ALOGE("usr ptr in %p, out %p", mtls->fep.ptrIn, mtls->fep.ptrOut); - walk_loop(mtls, kparams, fn); + kparams.out = mtls->fep.ptrOut + (mtls->fep.eStrideOut * xStart); + kparams.in = mtls->fep.ptrIn + (mtls->fep.eStrideIn * xStart); + + fn(&kparams, xStart, xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut); + } } -static void walk_2d(void *usr, uint32_t idx) { - walk_wrapper(usr, idx, [](MTLaunchStruct *mtls, - RsExpandKernelParams &kparams, - outer_foreach_t fn) { +void RsdCpuReferenceImpl::launchThreads(const Allocation * ain, Allocation * aout, + const RsScriptCall *sc, MTLaunchStruct *mtls) { - while (1) { - uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1); - uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize; - uint32_t yEnd = yStart + mtls->mSliceSize; + //android::StopWatch kernel_time("kernel time"); - yEnd = rsMin(yEnd, mtls->yEnd); + if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) { + const size_t targetByteChunk = 16 * 1024; + mInForEach = true; + if (mtls->fep.dimY > 1) { + uint32_t s1 = mtls->fep.dimY / ((mWorkers.mCount + 1) * 4); + uint32_t s2 = 0; - if (yEnd <= yStart) { - return; + // This chooses our slice size to rate limit atomic ops to + // one per 16k bytes of reads/writes. + if (mtls->fep.yStrideOut) { + s2 = targetByteChunk / mtls->fep.yStrideOut; + } else { + s2 = targetByteChunk / mtls->fep.yStrideIn; } + mtls->mSliceSize = rsMin(s1, s2); - for (kparams.y = yStart; kparams.y < yEnd; kparams.y++) { - kparams.out = mtls->fep.outPtr + - (mtls->fep.outStride.yStride * kparams.y) + - (mtls->fep.outStride.eStride * mtls->xStart); + if(mtls->mSliceSize < 1) { + mtls->mSliceSize = 1; + } - for (int inIndex = mtls->fep.inLen; --inIndex >= 0;) { - StridePair &strides = mtls->fep.inStrides[inIndex]; + // mtls->mSliceSize = 2; + launchThreads(wc_xy, mtls); + } else { + uint32_t s1 = mtls->fep.dimX / ((mWorkers.mCount + 1) * 4); + uint32_t s2 = 0; - kparams.ins[inIndex] = - mtls->fep.inPtrs[inIndex] + - (strides.yStride * kparams.y) + - (strides.eStride * mtls->xStart); - } + // This chooses our slice size to rate limit atomic ops to + // one per 16k bytes of reads/writes. + if (mtls->fep.eStrideOut) { + s2 = targetByteChunk / mtls->fep.eStrideOut; + } else { + s2 = targetByteChunk / mtls->fep.eStrideIn; + } + mtls->mSliceSize = rsMin(s1, s2); - // Kernels now get their input strides from kparams. - fn(&kparams, mtls->xStart, mtls->xEnd, 0, - mtls->fep.outStride.eStride); + if(mtls->mSliceSize < 1) { + mtls->mSliceSize = 1; } + + launchThreads(wc_x, mtls); } - }); -} + mInForEach = false; -static void walk_1d(void *usr, uint32_t idx) { - walk_wrapper(usr, idx, [](MTLaunchStruct *mtls, - RsExpandKernelParams &kparams, - outer_foreach_t fn) { + //ALOGE("launch 1"); + } else { + RsExpandKernelParams kparams; + kparams.takeFields(mtls->fep); + + //ALOGE("launch 3"); + outer_foreach_t fn = (outer_foreach_t) mtls->kernel; + for (uint32_t arrayIndex = mtls->arrayStart; + arrayIndex < mtls->arrayEnd; arrayIndex++) { - while (1) { - uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1); - uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize; - uint32_t xEnd = xStart + mtls->mSliceSize; + for (kparams.z = mtls->zStart; kparams.z < mtls->zEnd; + kparams.z++) { - xEnd = rsMin(xEnd, mtls->xEnd); + for (kparams.y = mtls->yStart; kparams.y < mtls->yEnd; + kparams.y++) { - if (xEnd <= xStart) { - return; - } + uint32_t offset = + kparams.dimY * kparams.dimZ * arrayIndex + + kparams.dimY * kparams.z + kparams.y; - kparams.out = mtls->fep.outPtr + - (mtls->fep.outStride.eStride * xStart); + kparams.out = mtls->fep.ptrOut + + (mtls->fep.yStrideOut * offset) + + (mtls->fep.eStrideOut * mtls->xStart); - for (int inIndex = mtls->fep.inLen; --inIndex >= 0;) { - StridePair &strides = mtls->fep.inStrides[inIndex]; + kparams.in = mtls->fep.ptrIn + + (mtls->fep.yStrideIn * offset) + + (mtls->fep.eStrideIn * mtls->xStart); - kparams.ins[inIndex] = - mtls->fep.inPtrs[inIndex] + (strides.eStride * xStart); + fn(&kparams, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn, + mtls->fep.eStrideOut); + } } - - // Kernels now get their input strides from kparams. - fn(&kparams, xStart, xEnd, 0, mtls->fep.outStride.eStride); } - }); + } } - -void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains, - uint32_t inLen, - Allocation* aout, - const RsScriptCall* sc, - MTLaunchStruct* mtls) { +void RsdCpuReferenceImpl::launchThreads(const Allocation** ains, uint32_t inLen, Allocation* aout, + const RsScriptCall* sc, MTLaunchStruct* mtls) { //android::StopWatch kernel_time("kernel time"); if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) { const size_t targetByteChunk = 16 * 1024; mInForEach = true; - if (mtls->fep.dimY > 1) { uint32_t s1 = mtls->fep.dimY / ((mWorkers.mCount + 1) * 4); uint32_t s2 = 0; // This chooses our slice size to rate limit atomic ops to // one per 16k bytes of reads/writes. - if (mtls->fep.outStride.yStride) { - s2 = targetByteChunk / mtls->fep.outStride.yStride; + if (mtls->fep.yStrideOut) { + s2 = targetByteChunk / mtls->fep.yStrideOut; } else { - // We know that there is either an output or an input. - s2 = targetByteChunk / mtls->fep.inStrides[0].yStride; + s2 = targetByteChunk / mtls->fep.yStrideIn; } mtls->mSliceSize = rsMin(s1, s2); @@ -485,18 +531,18 @@ void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains, mtls->mSliceSize = 1; } - launchThreads(walk_2d, mtls); + // mtls->mSliceSize = 2; + launchThreads(wc_xy, mtls); } else { uint32_t s1 = mtls->fep.dimX / ((mWorkers.mCount + 1) * 4); uint32_t s2 = 0; // This chooses our slice size to rate limit atomic ops to // one per 16k bytes of reads/writes. - if (mtls->fep.outStride.eStride) { - s2 = targetByteChunk / mtls->fep.outStride.eStride; + if (mtls->fep.eStrideOut) { + s2 = targetByteChunk / mtls->fep.eStrideOut; } else { - // We know that there is either an output or an input. - s2 = targetByteChunk / mtls->fep.inStrides[0].eStride; + s2 = targetByteChunk / mtls->fep.eStrideIn; } mtls->mSliceSize = rsMin(s1, s2); @@ -504,26 +550,24 @@ void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains, mtls->mSliceSize = 1; } - launchThreads(walk_1d, mtls); + launchThreads(wc_x, mtls); } mInForEach = false; + //ALOGE("launch 1"); } else { RsExpandKernelParams kparams; kparams.takeFields(mtls->fep); - if (inLen > 0) { - // Allocate space for our input base pointers. - kparams.ins = (const void**)alloca(inLen * sizeof(void*)); + // Allocate space for our input base pointers. + kparams.ins = new const void*[inLen]; - // Allocate space for our input stride information. - kparams.inEStrides = (uint32_t*)alloca(inLen * sizeof(uint32_t)); + // Allocate space for our input stride information. + kparams.eStrideIns = new uint32_t[inLen]; - // Fill our stride information. - for (int inIndex = inLen; --inIndex >= 0;) { - kparams.inEStrides[inIndex] = - mtls->fep.inStrides[inIndex].eStride; - } + // Fill our stride information. + for (int inIndex = inLen; --inIndex >= 0;) { + kparams.eStrideIns[inIndex] = mtls->fep.inStrides[inIndex].eStride; } //ALOGE("launch 3"); @@ -541,15 +585,15 @@ void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains, mtls->fep.dimY * mtls->fep.dimZ * arrayIndex + mtls->fep.dimY * kparams.z + kparams.y; - kparams.out = mtls->fep.outPtr + - (mtls->fep.outStride.yStride * offset) + - (mtls->fep.outStride.eStride * mtls->xStart); + kparams.out = mtls->fep.ptrOut + + (mtls->fep.yStrideOut * offset) + + (mtls->fep.eStrideOut * mtls->xStart); for (int inIndex = inLen; --inIndex >= 0;) { StridePair &strides = mtls->fep.inStrides[inIndex]; kparams.ins[inIndex] = - mtls->fep.inPtrs[inIndex] + + mtls->fep.ptrIns[inIndex] + (strides.yStride * offset) + (strides.eStride * mtls->xStart); } @@ -560,10 +604,14 @@ void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains, * that points to an array. */ fn(&kparams, mtls->xStart, mtls->xEnd, 0, - mtls->fep.outStride.eStride); + mtls->fep.eStrideOut); } } } + + // Free our arrays. + delete[] kparams.ins; + delete[] kparams.eStrideIns; } } diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h index 2fea3fcd..5d4b6cc5 100644 --- a/cpu_ref/rsCpuCore.h +++ b/cpu_ref/rsCpuCore.h @@ -25,8 +25,6 @@ #include <string> -#define RS_KERNEL_INPUT_THRESHOLD 32 - namespace bcc { class BCCContext; class RSCompilerDriver; @@ -42,36 +40,31 @@ struct StridePair { }; struct RsExpandKernelDriverInfo { - const uint8_t **inPtrs; - uint32_t inLen; - - uint8_t *outPtr; - - StridePair *inStrides; - StridePair outStride; + const void *usr; + uint32_t usrLen; uint32_t dimX; uint32_t dimY; uint32_t dimZ; + const uint8_t *ptrIn; + uint8_t *ptrOut; + uint32_t eStrideIn; + uint32_t eStrideOut; + uint32_t yStrideIn; + uint32_t yStrideOut; uint32_t slot; - const void *usr; - uint32_t usrLen; - - bool heapAllocatedArrays; - - RsExpandKernelDriverInfo() : heapAllocatedArrays(false) {} + const uint8_t** ptrIns; + StridePair* inStrides; ~RsExpandKernelDriverInfo() { - if (heapAllocatedArrays) { - if (inPtrs != NULL) { - delete[] inPtrs; - } - - if (inStrides != NULL) { - delete[] inStrides; - } + if (ptrIns != NULL) { + delete[] ptrIns; + } + + if (inStrides != NULL) { + delete[] inStrides; } } }; @@ -79,13 +72,15 @@ struct RsExpandKernelDriverInfo { struct RsExpandKernelParams { // Used by kernels - const void **ins; - uint32_t *inEStrides; + const void *in; void *out; uint32_t y; uint32_t z; uint32_t lid; + const void **ins; + uint32_t *eStrideIns; + // Used by ScriptGroup and user kernels. const void *usr; @@ -120,13 +115,13 @@ typedef void (*WorkerCallback_t)(void *usr, uint32_t idx); class RsdCpuScriptImpl; class RsdCpuReferenceImpl; -struct ScriptTLSStruct { +typedef struct ScriptTLSStructRec { android::renderscript::Context * mContext; const android::renderscript::Script * mScript; RsdCpuScriptImpl *mImpl; -}; +} ScriptTLSStruct; -struct MTLaunchStruct { +typedef struct { RsExpandKernelDriverInfo fep; RsdCpuReferenceImpl *rsc; @@ -134,7 +129,7 @@ struct MTLaunchStruct { ForEachFunc_t kernel; uint32_t sig; - const Allocation ** ains; + const Allocation * ain; Allocation * aout; uint32_t mSliceSize; @@ -150,9 +145,12 @@ struct MTLaunchStruct { uint32_t arrayStart; uint32_t arrayEnd; - const uint8_t *inPtrsBuff[RS_KERNEL_INPUT_THRESHOLD]; - StridePair inStridesBuff[RS_KERNEL_INPUT_THRESHOLD]; -}; + // Multi-input data. + const Allocation ** ains; +} MTLaunchStruct; + + + class RsdCpuReferenceImpl : public RsdCpuReference { public: @@ -173,6 +171,9 @@ public: return mWorkers.mCount + 1; } + void launchThreads(const Allocation * ain, Allocation * aout, + const RsScriptCall *sc, MTLaunchStruct *mtls); + void launchThreads(const Allocation** ains, uint32_t inLen, Allocation* aout, const RsScriptCall* sc, MTLaunchStruct* mtls); diff --git a/cpu_ref/rsCpuIntrinsic.cpp b/cpu_ref/rsCpuIntrinsic.cpp index 8437c998..5a7fffd5 100644 --- a/cpu_ref/rsCpuIntrinsic.cpp +++ b/cpu_ref/rsCpuIntrinsic.cpp @@ -73,29 +73,54 @@ void RsdCpuScriptIntrinsic::invokeFreeChildren() { } -void RsdCpuScriptIntrinsic::preLaunch(uint32_t slot, const Allocation ** ains, - uint32_t inLen, Allocation * aout, - const void * usr, uint32_t usrLen, - const RsScriptCall *sc) { +void RsdCpuScriptIntrinsic::preLaunch(uint32_t slot, const Allocation * ain, + Allocation * aout, const void * usr, + uint32_t usrLen, const RsScriptCall *sc) { } -void RsdCpuScriptIntrinsic::postLaunch(uint32_t slot, const Allocation ** ains, - uint32_t inLen, Allocation * aout, - const void * usr, uint32_t usrLen, - const RsScriptCall *sc) { +void RsdCpuScriptIntrinsic::postLaunch(uint32_t slot, const Allocation * ain, + Allocation * aout, const void * usr, + uint32_t usrLen, const RsScriptCall *sc) { } void RsdCpuScriptIntrinsic::invokeForEach(uint32_t slot, - const Allocation ** ains, - uint32_t inLen, + const Allocation * ain, Allocation * aout, const void * usr, uint32_t usrLen, const RsScriptCall *sc) { MTLaunchStruct mtls; + preLaunch(slot, ain, aout, usr, usrLen, sc); - preLaunch(slot, ains, inLen, aout, usr, usrLen, sc); + forEachMtlsSetup(ain, aout, usr, usrLen, sc, &mtls); + mtls.script = this; + mtls.fep.slot = slot; + + mtls.kernel = (void (*)())mRootPtr; + mtls.fep.usr = this; + + RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this); + mCtx->launchThreads(ain, aout, sc, &mtls); + mCtx->setTLS(oldTLS); + + postLaunch(slot, ain, aout, usr, usrLen, sc); +} + +void RsdCpuScriptIntrinsic::invokeForEachMulti(uint32_t slot, + const Allocation ** ains, + uint32_t inLen, + Allocation * aout, + const void * usr, + uint32_t usrLen, + const RsScriptCall *sc) { + + MTLaunchStruct mtls; + /* + * FIXME: Possibly create new preLaunch and postLaunch functions that take + * all of the input allocation pointers. + */ + preLaunch(slot, ains[0], aout, usr, usrLen, sc); forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls); mtls.script = this; @@ -108,7 +133,7 @@ void RsdCpuScriptIntrinsic::invokeForEach(uint32_t slot, mCtx->launchThreads(ains, inLen, aout, sc, &mtls); mCtx->setTLS(oldTLS); - postLaunch(slot, ains, inLen, aout, usr, usrLen, sc); + postLaunch(slot, ains[0], aout, usr, usrLen, sc); } void RsdCpuScriptIntrinsic::forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls) { diff --git a/cpu_ref/rsCpuIntrinsic.h b/cpu_ref/rsCpuIntrinsic.h index 95aaa141..bf6a8acd 100644 --- a/cpu_ref/rsCpuIntrinsic.h +++ b/cpu_ref/rsCpuIntrinsic.h @@ -28,42 +28,43 @@ class RsdCpuScriptIntrinsic : public RsdCpuScriptImpl { public: virtual void populateScript(Script *) = 0; - virtual void invokeFunction(uint32_t slot, const void * params, - size_t paramLength); + virtual void invokeFunction(uint32_t slot, const void *params, size_t paramLength); virtual int invokeRoot(); - virtual void invokeForEach(uint32_t slot, - const Allocation ** ain, - uint32_t inLen, - Allocation * aout, - const void * usr, - uint32_t usrLen, - const RsScriptCall *sc); - - virtual void forEachKernelSetup(uint32_t slot, MTLaunchStruct * mtls); + const Allocation * ain, + Allocation * aout, + const void * usr, + uint32_t usrLen, + const RsScriptCall *sc); + + virtual void invokeForEachMulti(uint32_t slot, + const Allocation ** ain, + uint32_t inLen, + Allocation * aout, + const void * usr, + uint32_t usrLen, + const RsScriptCall *sc); + + virtual void forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls); virtual void invokeInit(); virtual void invokeFreeChildren(); - virtual void preLaunch(uint32_t slot, const Allocation ** ains, - uint32_t inLen, Allocation * aout, const void * usr, - uint32_t usrLen, const RsScriptCall * sc); - virtual void postLaunch(uint32_t slot, const Allocation ** ains, - uint32_t inLen, Allocation * aout, - const void * usr, uint32_t usrLen, - const RsScriptCall * sc); - - virtual void setGlobalVar(uint32_t slot, const void * data, - size_t dataLength); - virtual void setGlobalVarWithElemDims(uint32_t slot, const void * data, - size_t dataLength, const Element * e, - const uint32_t * dims, - size_t dimLength); + virtual void preLaunch(uint32_t slot, const Allocation * ain, + Allocation * aout, const void * usr, + uint32_t usrLen, const RsScriptCall *sc); + virtual void postLaunch(uint32_t slot, const Allocation * ain, + Allocation * aout, const void * usr, + uint32_t usrLen, const RsScriptCall *sc); + + virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength); + virtual void setGlobalVarWithElemDims(uint32_t slot, const void *data, size_t dataLength, + const Element *e, const uint32_t *dims, size_t dimLength); virtual void setGlobalBind(uint32_t slot, Allocation *data); virtual void setGlobalObj(uint32_t slot, ObjectBase *data); virtual ~RsdCpuScriptIntrinsic(); - RsdCpuScriptIntrinsic(RsdCpuReferenceImpl * ctx, const Script * s, - const Element * e, RsScriptIntrinsicID iid); + RsdCpuScriptIntrinsic(RsdCpuReferenceImpl *ctx, const Script *s, const Element *, + RsScriptIntrinsicID iid); protected: RsScriptIntrinsicID mID; diff --git a/cpu_ref/rsCpuIntrinsic3DLUT.cpp b/cpu_ref/rsCpuIntrinsic3DLUT.cpp index a19d8851..c839c19d 100644 --- a/cpu_ref/rsCpuIntrinsic3DLUT.cpp +++ b/cpu_ref/rsCpuIntrinsic3DLUT.cpp @@ -64,7 +64,7 @@ void RsdCpuScriptIntrinsic3DLUT::kernel(const RsExpandKernelParams *p, RsdCpuScriptIntrinsic3DLUT *cp = (RsdCpuScriptIntrinsic3DLUT *)p->usr; uchar4 *out = (uchar4 *)p->out + xstart; - uchar4 *in = (uchar4 *)p->ins[0] + xstart; + uchar4 *in = (uchar4 *)p->in + xstart; uint32_t x1 = xstart; uint32_t x2 = xend; @@ -161,9 +161,9 @@ void RsdCpuScriptIntrinsic3DLUT::kernel(const RsExpandKernelParams *p, } } -RsdCpuScriptIntrinsic3DLUT::RsdCpuScriptIntrinsic3DLUT( - RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) : - RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_3DLUT) { +RsdCpuScriptIntrinsic3DLUT::RsdCpuScriptIntrinsic3DLUT(RsdCpuReferenceImpl *ctx, + const Script *s, const Element *e) + : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_3DLUT) { mRootPtr = &kernel; } @@ -185,3 +185,5 @@ RsdCpuScriptImpl * rsdIntrinsic_3DLUT(RsdCpuReferenceImpl *ctx, return new RsdCpuScriptIntrinsic3DLUT(ctx, s, e); } + + diff --git a/cpu_ref/rsCpuIntrinsicBlend.cpp b/cpu_ref/rsCpuIntrinsicBlend.cpp index 0378e076..b6046584 100644 --- a/cpu_ref/rsCpuIntrinsicBlend.cpp +++ b/cpu_ref/rsCpuIntrinsicBlend.cpp @@ -117,7 +117,7 @@ void RsdCpuScriptIntrinsicBlend::kernel(const RsExpandKernelParams *p, // instep/outstep can be ignored--sizeof(uchar4) known at compile time uchar4 *out = (uchar4 *)p->out; - uchar4 *in = (uchar4 *)p->ins[0]; + uchar4 *in = (uchar4 *)p->in; uint32_t x1 = xstart; uint32_t x2 = xend; @@ -509,3 +509,6 @@ RsdCpuScriptImpl * rsdIntrinsic_Blend(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) { return new RsdCpuScriptIntrinsicBlend(ctx, s, e); } + + + diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp index 4e90ad72..bf78eb3e 100644 --- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp +++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp @@ -169,9 +169,10 @@ public: virtual ~RsdCpuScriptIntrinsicColorMatrix(); RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e); - virtual void preLaunch(uint32_t slot, const Allocation ** ains, - uint32_t inLen, Allocation * aout, const void * usr, - uint32_t usrLen, const RsScriptCall *sc); + virtual void preLaunch(uint32_t slot, const Allocation * ain, Allocation * aout, + const void * usr, uint32_t usrLen, const RsScriptCall *sc); + virtual void postLaunch(uint32_t slot, const Allocation * ain, Allocation * aout, + const void * usr, uint32_t usrLen, const RsScriptCall *sc); protected: float fp[16]; @@ -882,13 +883,8 @@ void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsExpandKernelParams *p, uint32_t xstart, uint32_t xend, uint32_t instep, uint32_t outstep) { RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr; - - // Update the instep due to change in parameter passing. - instep = p->inEStrides[0]; - - uchar *out = (uchar *)p->out + outstep * xstart; - uchar *in = (uchar *)p->ins[0] + instep * xstart; - + uchar *out = (uchar *)p->out + outstep * xstart; + uchar *in = (uchar *)p->in + instep * xstart; uint32_t x1 = xstart; uint32_t x2 = xend; @@ -936,15 +932,11 @@ void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsExpandKernelParams *p, } } -void RsdCpuScriptIntrinsicColorMatrix::preLaunch(uint32_t slot, - const Allocation ** ains, - uint32_t inLen, - Allocation * aout, - const void * usr, - uint32_t usrLen, - const RsScriptCall *sc) { +void RsdCpuScriptIntrinsicColorMatrix::preLaunch( + uint32_t slot, const Allocation * ain, Allocation * aout, + const void * usr, uint32_t usrLen, const RsScriptCall *sc) { - const Element *ein = ains[0]->mHal.state.type->getElement(); + const Element *ein = ain->mHal.state.type->getElement(); const Element *eout = aout->mHal.state.type->getElement(); if (ein->getType() == eout->getType()) { @@ -961,8 +953,8 @@ void RsdCpuScriptIntrinsicColorMatrix::preLaunch(uint32_t slot, } } - Key_t key = computeKey(ein, eout); - + Key_t key = computeKey(ain->mHal.state.type->getElement(), + aout->mHal.state.type->getElement()); #if defined(ARCH_X86_HAVE_SSSE3) if ((mOptKernel == NULL) || (mLastKey.key != key.key)) { // FIXME: Disable mOptKernel to pass RS color matrix CTS cases @@ -1004,6 +996,12 @@ void RsdCpuScriptIntrinsicColorMatrix::preLaunch(uint32_t slot, #endif //if !defined(ARCH_X86_HAVE_SSSE3) } +void RsdCpuScriptIntrinsicColorMatrix::postLaunch( + uint32_t slot, const Allocation * ain, Allocation * aout, + const void * usr, uint32_t usrLen, const RsScriptCall *sc) { + +} + RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix( RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) { diff --git a/cpu_ref/rsCpuIntrinsicHistogram.cpp b/cpu_ref/rsCpuIntrinsicHistogram.cpp index b5dbfa80..1c430b72 100644 --- a/cpu_ref/rsCpuIntrinsicHistogram.cpp +++ b/cpu_ref/rsCpuIntrinsicHistogram.cpp @@ -36,10 +36,10 @@ public: RsdCpuScriptIntrinsicHistogram(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e); protected: - void preLaunch(uint32_t slot, const Allocation ** ains, uint32_t inLen, + void preLaunch(uint32_t slot, const Allocation * ain, Allocation * aout, const void * usr, uint32_t usrLen, const RsScriptCall *sc); - void postLaunch(uint32_t slot, const Allocation ** ains, uint32_t inLen, + void postLaunch(uint32_t slot, const Allocation * ain, Allocation * aout, const void * usr, uint32_t usrLen, const RsScriptCall *sc); @@ -97,12 +97,9 @@ void RsdCpuScriptIntrinsicHistogram::setGlobalVar(uint32_t slot, const void *dat -void -RsdCpuScriptIntrinsicHistogram::preLaunch(uint32_t slot, - const Allocation ** ains, - uint32_t inLen, Allocation * aout, - const void * usr, uint32_t usrLen, - const RsScriptCall *sc) { +void RsdCpuScriptIntrinsicHistogram::preLaunch(uint32_t slot, const Allocation * ain, + Allocation * aout, const void * usr, + uint32_t usrLen, const RsScriptCall *sc) { const uint32_t threads = mCtx->getThreadCount(); uint32_t vSize = mAllocOut->getType()->getElement()->getVectorSize(); @@ -126,7 +123,7 @@ RsdCpuScriptIntrinsicHistogram::preLaunch(uint32_t slot, } break; case 1: - switch(ains[0]->getType()->getElement()->getVectorSize()) { + switch(ain->getType()->getElement()->getVectorSize()) { case 1: mRootPtr = &kernelP1L1; break; @@ -145,12 +142,9 @@ RsdCpuScriptIntrinsicHistogram::preLaunch(uint32_t slot, memset(mSums, 0, 256 * sizeof(int32_t) * threads * vSize); } -void -RsdCpuScriptIntrinsicHistogram::postLaunch(uint32_t slot, - const Allocation ** ains, - uint32_t inLen, Allocation * aout, - const void * usr, uint32_t usrLen, - const RsScriptCall *sc) { +void RsdCpuScriptIntrinsicHistogram::postLaunch(uint32_t slot, const Allocation * ain, + Allocation * aout, const void * usr, + uint32_t usrLen, const RsScriptCall *sc) { unsigned int *o = (unsigned int *)mAllocOut->mHal.drvState.lod[0].mallocPtr; uint32_t threads = mCtx->getThreadCount(); @@ -171,7 +165,7 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1U4(const RsExpandKernelParams *p, uint32_t instep, uint32_t outstep) { RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr; - uchar *in = (uchar *)p->ins[0]; + uchar *in = (uchar *)p->in; int * sums = &cp->mSums[256 * 4 * p->lid]; for (uint32_t x = xstart; x < xend; x++) { @@ -179,7 +173,7 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1U4(const RsExpandKernelParams *p, sums[(in[1] << 2) + 1] ++; sums[(in[2] << 2) + 2] ++; sums[(in[3] << 2) + 3] ++; - in += p->inEStrides[0]; + in += instep; } } @@ -188,14 +182,14 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1U3(const RsExpandKernelParams *p, uint32_t instep, uint32_t outstep) { RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr; - uchar *in = (uchar *)p->ins[0]; + uchar *in = (uchar *)p->in; int * sums = &cp->mSums[256 * 4 * p->lid]; for (uint32_t x = xstart; x < xend; x++) { sums[(in[0] << 2) ] ++; sums[(in[1] << 2) + 1] ++; sums[(in[2] << 2) + 2] ++; - in += p->inEStrides[0]; + in += instep; } } @@ -204,13 +198,13 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1U2(const RsExpandKernelParams *p, uint32_t instep, uint32_t outstep) { RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr; - uchar *in = (uchar *)p->ins[0]; + uchar *in = (uchar *)p->in; int * sums = &cp->mSums[256 * 2 * p->lid]; for (uint32_t x = xstart; x < xend; x++) { sums[(in[0] << 1) ] ++; sums[(in[1] << 1) + 1] ++; - in += p->inEStrides[0]; + in += instep; } } @@ -219,7 +213,7 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1L4(const RsExpandKernelParams *p, uint32_t instep, uint32_t outstep) { RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr; - uchar *in = (uchar *)p->ins[0]; + uchar *in = (uchar *)p->in; int * sums = &cp->mSums[256 * p->lid]; for (uint32_t x = xstart; x < xend; x++) { @@ -228,7 +222,7 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1L4(const RsExpandKernelParams *p, (cp->mDotI[2] * in[2]) + (cp->mDotI[3] * in[3]); sums[(t + 0x7f) >> 8] ++; - in += p->inEStrides[0]; + in += instep; } } @@ -237,7 +231,7 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1L3(const RsExpandKernelParams *p, uint32_t instep, uint32_t outstep) { RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr; - uchar *in = (uchar *)p->ins[0]; + uchar *in = (uchar *)p->in; int * sums = &cp->mSums[256 * p->lid]; for (uint32_t x = xstart; x < xend; x++) { @@ -245,7 +239,7 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1L3(const RsExpandKernelParams *p, (cp->mDotI[1] * in[1]) + (cp->mDotI[2] * in[2]); sums[(t + 0x7f) >> 8] ++; - in += p->inEStrides[0]; + in += instep; } } @@ -254,14 +248,14 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1L2(const RsExpandKernelParams *p, uint32_t instep, uint32_t outstep) { RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr; - uchar *in = (uchar *)p->ins[0]; + uchar *in = (uchar *)p->in; int * sums = &cp->mSums[256 * p->lid]; for (uint32_t x = xstart; x < xend; x++) { int t = (cp->mDotI[0] * in[0]) + (cp->mDotI[1] * in[1]); sums[(t + 0x7f) >> 8] ++; - in += p->inEStrides[0]; + in += instep; } } @@ -270,13 +264,13 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1L1(const RsExpandKernelParams *p, uint32_t instep, uint32_t outstep) { RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr; - uchar *in = (uchar *)p->ins[0]; + uchar *in = (uchar *)p->in; int * sums = &cp->mSums[256 * p->lid]; for (uint32_t x = xstart; x < xend; x++) { int t = (cp->mDotI[0] * in[0]); sums[(t + 0x7f) >> 8] ++; - in += p->inEStrides[0]; + in += instep; } } @@ -285,12 +279,12 @@ void RsdCpuScriptIntrinsicHistogram::kernelP1U1(const RsExpandKernelParams *p, uint32_t instep, uint32_t outstep) { RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr; - uchar *in = (uchar *)p->ins[0]; + uchar *in = (uchar *)p->in; int * sums = &cp->mSums[256 * p->lid]; for (uint32_t x = xstart; x < xend; x++) { sums[in[0]] ++; - in += p->inEStrides[0]; + in += instep; } } @@ -329,3 +323,5 @@ RsdCpuScriptImpl * rsdIntrinsic_Histogram(RsdCpuReferenceImpl *ctx, const Script return new RsdCpuScriptIntrinsicHistogram(ctx, s, e); } + + diff --git a/cpu_ref/rsCpuIntrinsicLUT.cpp b/cpu_ref/rsCpuIntrinsicLUT.cpp index 9d3b4003..db73a838 100644 --- a/cpu_ref/rsCpuIntrinsicLUT.cpp +++ b/cpu_ref/rsCpuIntrinsicLUT.cpp @@ -59,7 +59,7 @@ void RsdCpuScriptIntrinsicLUT::kernel(const RsExpandKernelParams *p, RsdCpuScriptIntrinsicLUT *cp = (RsdCpuScriptIntrinsicLUT *)p->usr; uchar *out = (uchar *)p->out; - const uchar *in = (uchar *)p->ins[0]; + const uchar *in = (uchar *)p->in; uint32_t x1 = xstart; uint32_t x2 = xend; @@ -103,3 +103,5 @@ RsdCpuScriptImpl * rsdIntrinsic_LUT(RsdCpuReferenceImpl *ctx, return new RsdCpuScriptIntrinsicLUT(ctx, s, e); } + + diff --git a/cpu_ref/rsCpuIntrinsicResize.cpp b/cpu_ref/rsCpuIntrinsicResize.cpp index 3a307d63..af1127e7 100644 --- a/cpu_ref/rsCpuIntrinsicResize.cpp +++ b/cpu_ref/rsCpuIntrinsicResize.cpp @@ -35,8 +35,8 @@ public: virtual ~RsdCpuScriptIntrinsicResize(); RsdCpuScriptIntrinsicResize(RsdCpuReferenceImpl *ctx, const Script *s, const Element *); - virtual void preLaunch(uint32_t slot, const Allocation ** ains, - uint32_t inLen, Allocation * aout, const void * usr, + virtual void preLaunch(uint32_t slot, const Allocation * ain, + Allocation * aout, const void * usr, uint32_t usrLen, const RsScriptCall *sc); float scaleX; @@ -308,11 +308,9 @@ RsdCpuScriptIntrinsicResize::RsdCpuScriptIntrinsicResize ( RsdCpuScriptIntrinsicResize::~RsdCpuScriptIntrinsicResize() { } -void RsdCpuScriptIntrinsicResize::preLaunch(uint32_t slot, - const Allocation ** ains, - uint32_t inLen, Allocation * aout, - const void * usr, uint32_t usrLen, - const RsScriptCall *sc) +void RsdCpuScriptIntrinsicResize::preLaunch(uint32_t slot, const Allocation * ain, + Allocation * aout, const void * usr, + uint32_t usrLen, const RsScriptCall *sc) { if (!mAlloc.get()) { ALOGE("Resize executed without input, skipping"); @@ -353,3 +351,5 @@ RsdCpuScriptImpl * rsdIntrinsic_Resize(RsdCpuReferenceImpl *ctx, const Script *s return new RsdCpuScriptIntrinsicResize(ctx, s, e); } + + diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp index 05984207..a11fda19 100644 --- a/cpu_ref/rsCpuScript.cpp +++ b/cpu_ref/rsCpuScript.cpp @@ -789,33 +789,144 @@ void RsdCpuScriptImpl::populateScript(Script *script) { typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t); -void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains, - uint32_t inLen, - Allocation * aout, +void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation * ain, Allocation * aout, const void * usr, uint32_t usrLen, const RsScriptCall *sc, MTLaunchStruct *mtls) { memset(mtls, 0, sizeof(MTLaunchStruct)); - for (int index = inLen; --index >= 0;) { - const Allocation* ain = ains[index]; + // possible for this to occur if IO_OUTPUT/IO_INPUT with no bound surface + if (ain && (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr == NULL) { + mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null in allocations"); + return; + } + if (aout && (const uint8_t *)aout->mHal.drvState.lod[0].mallocPtr == NULL) { + mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null out allocations"); + return; + } + + if (ain != NULL) { + const Type *inType = ain->getType(); - // possible for this to occur if IO_OUTPUT/IO_INPUT with no bound surface - if (ain != NULL && (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr == NULL) { + mtls->fep.dimX = inType->getDimX(); + mtls->fep.dimY = inType->getDimY(); + mtls->fep.dimZ = inType->getDimZ(); + + } else if (aout != NULL) { + const Type *outType = aout->getType(); + + mtls->fep.dimX = outType->getDimX(); + mtls->fep.dimY = outType->getDimY(); + mtls->fep.dimZ = outType->getDimZ(); + + } else { + mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null allocations"); + return; + } + + if (ain != NULL && aout != NULL) { + if (!ain->hasSameDims(aout)) { mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, - "rsForEach called with null in allocations"); + "Failed to launch kernel; dimensions of input and output allocations do not match."); + return; } } + if (!sc || (sc->xEnd == 0)) { + mtls->xEnd = mtls->fep.dimX; + } else { + rsAssert(sc->xStart < mtls->fep.dimX); + rsAssert(sc->xEnd <= mtls->fep.dimX); + rsAssert(sc->xStart < sc->xEnd); + mtls->xStart = rsMin(mtls->fep.dimX, sc->xStart); + mtls->xEnd = rsMin(mtls->fep.dimX, sc->xEnd); + if (mtls->xStart >= mtls->xEnd) return; + } + + if (!sc || (sc->yEnd == 0)) { + mtls->yEnd = mtls->fep.dimY; + } else { + rsAssert(sc->yStart < mtls->fep.dimY); + rsAssert(sc->yEnd <= mtls->fep.dimY); + rsAssert(sc->yStart < sc->yEnd); + mtls->yStart = rsMin(mtls->fep.dimY, sc->yStart); + mtls->yEnd = rsMin(mtls->fep.dimY, sc->yEnd); + if (mtls->yStart >= mtls->yEnd) return; + } + + if (!sc || (sc->zEnd == 0)) { + mtls->zEnd = mtls->fep.dimZ; + } else { + rsAssert(sc->zStart < mtls->fep.dimZ); + rsAssert(sc->zEnd <= mtls->fep.dimZ); + rsAssert(sc->zStart < sc->zEnd); + mtls->zStart = rsMin(mtls->fep.dimZ, sc->zStart); + mtls->zEnd = rsMin(mtls->fep.dimZ, sc->zEnd); + if (mtls->zStart >= mtls->zEnd) return; + } + + mtls->xEnd = rsMax((uint32_t)1, mtls->xEnd); + mtls->yEnd = rsMax((uint32_t)1, mtls->yEnd); + mtls->zEnd = rsMax((uint32_t)1, mtls->zEnd); + mtls->arrayEnd = rsMax((uint32_t)1, mtls->arrayEnd); + + rsAssert(!ain || (ain->getType()->getDimZ() == 0)); + + mtls->rsc = mCtx; + mtls->ain = ain; + mtls->aout = aout; + mtls->fep.usr = usr; + mtls->fep.usrLen = usrLen; + mtls->mSliceSize = 1; + mtls->mSliceNum = 0; + + mtls->fep.ptrIn = NULL; + mtls->fep.eStrideIn = 0; + mtls->isThreadable = mIsThreadable; + + if (ain) { + mtls->fep.ptrIn = (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr; + mtls->fep.eStrideIn = ain->getType()->getElementSizeBytes(); + mtls->fep.yStrideIn = ain->mHal.drvState.lod[0].stride; + } + + mtls->fep.ptrOut = NULL; + mtls->fep.eStrideOut = 0; + if (aout) { + mtls->fep.ptrOut = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr; + mtls->fep.eStrideOut = aout->getType()->getElementSizeBytes(); + mtls->fep.yStrideOut = aout->mHal.drvState.lod[0].stride; + } +} + +void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains, uint32_t inLen, + Allocation * aout, + const void * usr, uint32_t usrLen, + const RsScriptCall *sc, + MTLaunchStruct *mtls) { + + memset(mtls, 0, sizeof(MTLaunchStruct)); + + // possible for this to occur if IO_OUTPUT/IO_INPUT with no bound surface + if (ains != NULL) { + for (int index = inLen; --index >= 0;) { + const Allocation* ain = ains[index]; + + if (ain != NULL && (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr == NULL) { + mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null in allocations"); + return; + } + } + } + if (aout && (const uint8_t *)aout->mHal.drvState.lod[0].mallocPtr == NULL) { - mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, - "rsForEach called with null out allocations"); + mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null out allocations"); return; } - if (inLen > 0) { + if (ains != NULL) { const Allocation *ain0 = ains[0]; const Type *inType = ain0->getType(); @@ -840,12 +951,11 @@ void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains, mtls->fep.dimZ = outType->getDimZ(); } else { - mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, - "rsForEach called with null allocations"); + mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null allocations"); return; } - if (inLen > 0 && aout != NULL) { + if (ains != NULL && aout != NULL) { if (!ains[0]->hasSameDims(aout)) { mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "Failed to launch kernel; dimensions of input and output allocations do not match."); @@ -892,7 +1002,7 @@ void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains, mtls->zEnd = rsMax((uint32_t)1, mtls->zEnd); mtls->arrayEnd = rsMax((uint32_t)1, mtls->arrayEnd); - rsAssert(inLen == 0 || (ains[0]->getType()->getDimZ() == 0)); + rsAssert(!ains || (ains[0]->getType()->getDimZ() == 0)); mtls->rsc = mCtx; mtls->ains = ains; @@ -902,28 +1012,18 @@ void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains, mtls->mSliceSize = 1; mtls->mSliceNum = 0; - mtls->fep.inPtrs = NULL; - mtls->fep.inStrides = NULL; + mtls->fep.ptrIns = NULL; + mtls->fep.eStrideIn = 0; mtls->isThreadable = mIsThreadable; - if (inLen > 0) { - - if (inLen <= RS_KERNEL_INPUT_THRESHOLD) { - mtls->fep.inPtrs = (const uint8_t**)mtls->inPtrsBuff; - mtls->fep.inStrides = mtls->inStridesBuff; - } else { - mtls->fep.heapAllocatedArrays = true; - - mtls->fep.inPtrs = new const uint8_t*[inLen]; - mtls->fep.inStrides = new StridePair[inLen]; - } - - mtls->fep.inLen = inLen; + if (ains) { + mtls->fep.ptrIns = new const uint8_t*[inLen]; + mtls->fep.inStrides = new StridePair[inLen]; for (int index = inLen; --index >= 0;) { const Allocation *ain = ains[index]; - mtls->fep.inPtrs[index] = + mtls->fep.ptrIns[index] = (const uint8_t*)ain->mHal.drvState.lod[0].mallocPtr; mtls->fep.inStrides[index].eStride = @@ -933,27 +1033,41 @@ void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains, } } - mtls->fep.outPtr = NULL; - mtls->fep.outStride.eStride = 0; - mtls->fep.outStride.yStride = 0; - if (aout != NULL) { - mtls->fep.outPtr = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr; - - mtls->fep.outStride.eStride = aout->getType()->getElementSizeBytes(); - mtls->fep.outStride.yStride = aout->mHal.drvState.lod[0].stride; + mtls->fep.ptrOut = NULL; + mtls->fep.eStrideOut = 0; + if (aout) { + mtls->fep.ptrOut = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr; + mtls->fep.eStrideOut = aout->getType()->getElementSizeBytes(); + mtls->fep.yStrideOut = aout->mHal.drvState.lod[0].stride; } } void RsdCpuScriptImpl::invokeForEach(uint32_t slot, - const Allocation ** ains, - uint32_t inLen, + const Allocation * ain, Allocation * aout, const void * usr, uint32_t usrLen, const RsScriptCall *sc) { MTLaunchStruct mtls; + forEachMtlsSetup(ain, aout, usr, usrLen, sc, &mtls); + forEachKernelSetup(slot, &mtls); + + RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this); + mCtx->launchThreads(ain, aout, sc, &mtls); + mCtx->setTLS(oldTLS); +} + +void RsdCpuScriptImpl::invokeForEachMulti(uint32_t slot, + const Allocation ** ains, + uint32_t inLen, + Allocation * aout, + const void * usr, + uint32_t usrLen, + const RsScriptCall *sc) { + + MTLaunchStruct mtls; forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls); forEachKernelSetup(slot, &mtls); @@ -1224,15 +1338,17 @@ Allocation * RsdCpuScriptImpl::getAllocationForPointer(const void *ptr) const { return NULL; } -void RsdCpuScriptImpl::preLaunch(uint32_t slot, const Allocation ** ains, - uint32_t inLen, Allocation * aout, - const void * usr, uint32_t usrLen, - const RsScriptCall *sc) {} +void RsdCpuScriptImpl::preLaunch(uint32_t slot, const Allocation * ain, + Allocation * aout, const void * usr, + uint32_t usrLen, const RsScriptCall *sc) +{ +} -void RsdCpuScriptImpl::postLaunch(uint32_t slot, const Allocation ** ains, - uint32_t inLen, Allocation * aout, - const void * usr, uint32_t usrLen, - const RsScriptCall *sc) {} +void RsdCpuScriptImpl::postLaunch(uint32_t slot, const Allocation * ain, + Allocation * aout, const void * usr, + uint32_t usrLen, const RsScriptCall *sc) +{ +} } diff --git a/cpu_ref/rsCpuScript.h b/cpu_ref/rsCpuScript.h index f0843cc0..d51e9e3f 100644 --- a/cpu_ref/rsCpuScript.h +++ b/cpu_ref/rsCpuScript.h @@ -64,22 +64,26 @@ public: virtual void invokeFunction(uint32_t slot, const void *params, size_t paramLength); virtual int invokeRoot(); - virtual void preLaunch(uint32_t slot, const Allocation ** ains, - uint32_t inLen, Allocation * aout, const void * usr, + virtual void preLaunch(uint32_t slot, const Allocation * ain, + Allocation * aout, const void * usr, uint32_t usrLen, const RsScriptCall *sc); - virtual void postLaunch(uint32_t slot, const Allocation ** ains, - uint32_t inLen, Allocation * aout, - const void * usr, uint32_t usrLen, - const RsScriptCall *sc); - + virtual void postLaunch(uint32_t slot, const Allocation * ain, + Allocation * aout, const void * usr, + uint32_t usrLen, const RsScriptCall *sc); virtual void invokeForEach(uint32_t slot, - const Allocation ** ains, - uint32_t inLen, - Allocation* aout, - const void* usr, - uint32_t usrLen, - const RsScriptCall* sc); - + const Allocation * ain, + Allocation * aout, + const void * usr, + uint32_t usrLen, + const RsScriptCall *sc); + + virtual void invokeForEachMulti(uint32_t slot, + const Allocation** ains, + uint32_t inLen, + Allocation* aout, + const void* usr, + uint32_t usrLen, + const RsScriptCall* sc); virtual void invokeInit(); virtual void invokeFreeChildren(); @@ -96,6 +100,10 @@ public: const Script * getScript() {return mScript;} + void forEachMtlsSetup(const Allocation * ain, Allocation * aout, + const void * usr, uint32_t usrLen, + const RsScriptCall *sc, MTLaunchStruct *mtls); + void forEachMtlsSetup(const Allocation ** ains, uint32_t inLen, Allocation * aout, const void * usr, uint32_t usrLen, const RsScriptCall *sc, MTLaunchStruct *mtls); diff --git a/cpu_ref/rsCpuScriptGroup.cpp b/cpu_ref/rsCpuScriptGroup.cpp index 20ee09db..08785523 100644 --- a/cpu_ref/rsCpuScriptGroup.cpp +++ b/cpu_ref/rsCpuScriptGroup.cpp @@ -53,45 +53,38 @@ void CpuScriptGroupImpl::scriptGroupRoot(const RsExpandKernelParams *kparams, uint32_t instep, uint32_t outstep) { - const ScriptList *sl = (const ScriptList *)kparams->usr; + const ScriptList *sl = (const ScriptList *)kparams->usr; RsExpandKernelParams *mkparams = (RsExpandKernelParams *)kparams; - const void **oldIns = mkparams->ins; - uint32_t *oldStrides = mkparams->inEStrides; - - void *localIns[1]; - uint32_t localStride[1]; - - mkparams->ins = (const void**)localIns; - mkparams->inEStrides = localStride; - for (size_t ct = 0; ct < sl->count; ct++) { ScriptGroupRootFunc_t func; func = (ScriptGroupRootFunc_t)sl->fnPtrs[ct]; mkparams->usr = sl->usrPtrs[ct]; + mkparams->in = NULL; + mkparams->out = NULL; + + uint32_t istep = 0; + uint32_t ostep = 0; + if (sl->ins[ct]) { - localIns[0] = sl->ins[ct]->mHal.drvState.lod[0].mallocPtr; + mkparams->in = + (const uint8_t *)sl->ins[ct]->mHal.drvState.lod[0].mallocPtr; - localStride[0] = sl->ins[ct]->mHal.state.elementSizeBytes; + istep = sl->ins[ct]->mHal.state.elementSizeBytes; if (sl->inExts[ct]) { - localIns[0] = (void*) - ((const uint8_t *)localIns[0] + - sl->ins[ct]->mHal.drvState.lod[0].stride * kparams->y); + mkparams->in = + (const uint8_t *)mkparams->in + + sl->ins[ct]->mHal.drvState.lod[0].stride * kparams->y; } else if (sl->ins[ct]->mHal.drvState.lod[0].dimY > kparams->lid) { - localIns[0] = (void*) - ((const uint8_t *)localIns[0] + - sl->ins[ct]->mHal.drvState.lod[0].stride * kparams->lid); + mkparams->in = + (const uint8_t *)mkparams->in + + sl->ins[ct]->mHal.drvState.lod[0].stride * kparams->lid; } - - } else { - localIns[0] = NULL; - localStride[0] = 0; } - uint32_t ostep; if (sl->outs[ct]) { mkparams->out = (uint8_t *)sl->outs[ct]->mHal.drvState.lod[0].mallocPtr; @@ -108,23 +101,14 @@ void CpuScriptGroupImpl::scriptGroupRoot(const RsExpandKernelParams *kparams, (uint8_t *)mkparams->out + sl->outs[ct]->mHal.drvState.lod[0].stride * kparams->lid; } - } else { - mkparams->out = NULL; - ostep = 0; } //ALOGE("kernel %i %p,%p %p,%p", ct, mp->ptrIn, mp->in, mp->ptrOut, mp->out); - /* - * The fourth argument is zero here because kernels get their stride - * information from a member of p that points to an array. - */ - func(kparams, xstart, xend, 0, ostep); + func(kparams, xstart, xend, istep, ostep); } //ALOGE("script group root"); - mkparams->ins = oldIns; - mkparams->inEStrides = oldStrides; - mkparams->usr = sl; + mkparams->usr = sl; } @@ -211,33 +195,17 @@ void CpuScriptGroupImpl::execute() { MTLaunchStruct mtls; - if (fieldDep) { + if(fieldDep) { for (size_t ct=0; ct < ins.size(); ct++) { Script *s = kernels[ct]->mScript; RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s); uint32_t slot = kernels[ct]->mSlot; - uint32_t inLen; - const Allocation **ains; - - if (ins[ct] == NULL) { - inLen = 0; - ains = NULL; - - } else { - inLen = 1; - ains = const_cast<const Allocation**>(&ins[ct]); - } - - si->forEachMtlsSetup(ains, inLen, outs[ct], NULL, 0, NULL, &mtls); - + si->forEachMtlsSetup(ins[ct], outs[ct], NULL, 0, NULL, &mtls); si->forEachKernelSetup(slot, &mtls); - si->preLaunch(slot, ains, inLen, outs[ct], mtls.fep.usr, - mtls.fep.usrLen, NULL); - - mCtx->launchThreads(ains, inLen, outs[ct], NULL, &mtls); - - si->postLaunch(slot, ains, inLen, outs[ct], NULL, 0, NULL); + si->preLaunch(slot, ins[ct], outs[ct], mtls.fep.usr, mtls.fep.usrLen, NULL); + mCtx->launchThreads(ins[ct], outs[ct], NULL, &mtls); + si->postLaunch(slot, ins[ct], outs[ct], NULL, 0, NULL); } } else { ScriptList sl; @@ -246,18 +214,6 @@ void CpuScriptGroupImpl::execute() { sl.kernels = kernels.array(); sl.count = kernels.size(); - uint32_t inLen; - const Allocation **ains; - - if (ins[0] == NULL) { - inLen = 0; - ains = NULL; - - } else { - inLen = 1; - ains = const_cast<const Allocation**>(&ins[0]); - } - Vector<const void *> usrPtrs; Vector<const void *> fnPtrs; Vector<uint32_t> sigs; @@ -269,8 +225,7 @@ void CpuScriptGroupImpl::execute() { fnPtrs.add((void *)mtls.kernel); usrPtrs.add(mtls.fep.usr); sigs.add(mtls.fep.usrLen); - si->preLaunch(kernels[ct]->mSlot, ains, inLen, outs[ct], - mtls.fep.usr, mtls.fep.usrLen, NULL); + si->preLaunch(kernels[ct]->mSlot, ins[ct], outs[ct], mtls.fep.usr, mtls.fep.usrLen, NULL); } sl.sigs = sigs.array(); sl.usrPtrs = usrPtrs.array(); @@ -280,20 +235,16 @@ void CpuScriptGroupImpl::execute() { Script *s = kernels[0]->mScript; RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s); - - si->forEachMtlsSetup(ains, inLen, outs[0], NULL, 0, NULL, &mtls); - + si->forEachMtlsSetup(ins[0], outs[0], NULL, 0, NULL, &mtls); mtls.script = NULL; mtls.kernel = (void (*)())&scriptGroupRoot; mtls.fep.usr = &sl; - - mCtx->launchThreads(ains, inLen, outs[0], NULL, &mtls); + mCtx->launchThreads(ins[0], outs[0], NULL, &mtls); for (size_t ct=0; ct < kernels.size(); ct++) { Script *s = kernels[ct]->mScript; RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s); - si->postLaunch(kernels[ct]->mSlot, ains, inLen, outs[ct], NULL, 0, - NULL); + si->postLaunch(kernels[ct]->mSlot, ins[ct], outs[ct], NULL, 0, NULL); } } } diff --git a/cpu_ref/rsd_cpu.h b/cpu_ref/rsd_cpu.h index 4728b7c8..0076cb98 100644 --- a/cpu_ref/rsd_cpu.h +++ b/cpu_ref/rsd_cpu.h @@ -69,15 +69,21 @@ public: virtual void populateScript(Script *) = 0; virtual void invokeFunction(uint32_t slot, const void *params, size_t paramLength) = 0; virtual int invokeRoot() = 0; - virtual void invokeForEach(uint32_t slot, - const Allocation ** ains, - uint32_t inLen, - Allocation * aout, - const void * usr, - uint32_t usrLen, - const RsScriptCall *sc) = 0; - + const Allocation * ain, + Allocation * aout, + const void * usr, + uint32_t usrLen, + const RsScriptCall *sc) = 0; + + virtual void invokeForEachMulti(uint32_t slot, + const Allocation** ains, + uint32_t inLen, + Allocation * aout, + const void * usr, + uint32_t usrLen, + const RsScriptCall *sc) = 0; + virtual void invokeInit() = 0; virtual void invokeFreeChildren() = 0; |