diff options
Diffstat (limited to 'cpu_ref/rsCpuCore.cpp')
-rw-r--r-- | cpu_ref/rsCpuCore.cpp | 242 |
1 files changed, 145 insertions, 97 deletions
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp index db3cc7fa..a0564fc1 100644 --- a/cpu_ref/rsCpuCore.cpp +++ b/cpu_ref/rsCpuCore.cpp @@ -350,134 +350,180 @@ RsdCpuReferenceImpl::~RsdCpuReferenceImpl() { } typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t); -typedef void (*walk_loop_t)(MTLaunchStruct*, - RsExpandKernelParams&, - outer_foreach_t); - -static void walk_wrapper(void* usr, uint32_t idx, walk_loop_t walk_loop) { +static void wc_xy(void *usr, uint32_t idx) { MTLaunchStruct *mtls = (MTLaunchStruct *)usr; - uint32_t inLen = mtls->fep.inLen; - RsExpandKernelParams kparams; kparams.takeFields(mtls->fep); // Used by CpuScriptGroup, IntrinsicBlur, and IntrinsicHistogram kparams.lid = idx; - if (inLen > 0) { - // Allocate space for our input base pointers. - kparams.ins = (const void**)alloca(inLen * sizeof(void*)); + outer_foreach_t fn = (outer_foreach_t) mtls->kernel; + while (1) { + uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1); + uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize; + uint32_t yEnd = yStart + mtls->mSliceSize; - // Allocate space for our input stride information. - kparams.inEStrides = (uint32_t*)alloca(inLen * sizeof(uint32_t)); + yEnd = rsMin(yEnd, mtls->yEnd); - // Fill our stride information. - for (int inIndex = inLen; --inIndex >= 0;) { - kparams.inEStrides[inIndex] = mtls->fep.inStrides[inIndex].eStride; + if (yEnd <= yStart) { + return; + } + + //ALOGE("usr idx %i, x %i,%i y %i,%i", idx, mtls->xStart, mtls->xEnd, yStart, yEnd); + //ALOGE("usr ptr in %p, out %p", mtls->fep.ptrIn, mtls->fep.ptrOut); + + for (kparams.y = yStart; kparams.y < yEnd; kparams.y++) { + kparams.out = mtls->fep.ptrOut + + (mtls->fep.yStrideOut * kparams.y) + + (mtls->fep.eStrideOut * mtls->xStart); + + kparams.in = mtls->fep.ptrIn + + (mtls->fep.yStrideIn * kparams.y) + + (mtls->fep.eStrideIn * mtls->xStart); + + + fn(&kparams, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn, + mtls->fep.eStrideOut); } } +} + +static void wc_x(void *usr, uint32_t idx) { + MTLaunchStruct *mtls = (MTLaunchStruct *)usr; + + RsExpandKernelParams kparams; + kparams.takeFields(mtls->fep); + + // Used by CpuScriptGroup, IntrinsicBlur, and IntrisicHistogram + kparams.lid = idx; outer_foreach_t fn = (outer_foreach_t) mtls->kernel; + while (1) { + uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1); + uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize; + uint32_t xEnd = xStart + mtls->mSliceSize; + + xEnd = rsMin(xEnd, mtls->xEnd); + + if (xEnd <= xStart) { + return; + } + + //ALOGE("usr slice %i idx %i, x %i,%i", slice, idx, xStart, xEnd); + //ALOGE("usr ptr in %p, out %p", mtls->fep.ptrIn, mtls->fep.ptrOut); - walk_loop(mtls, kparams, fn); + kparams.out = mtls->fep.ptrOut + (mtls->fep.eStrideOut * xStart); + kparams.in = mtls->fep.ptrIn + (mtls->fep.eStrideIn * xStart); + + fn(&kparams, xStart, xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut); + } } -static void walk_2d(void *usr, uint32_t idx) { - walk_wrapper(usr, idx, [](MTLaunchStruct *mtls, - RsExpandKernelParams &kparams, - outer_foreach_t fn) { +void RsdCpuReferenceImpl::launchThreads(const Allocation * ain, Allocation * aout, + const RsScriptCall *sc, MTLaunchStruct *mtls) { - while (1) { - uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1); - uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize; - uint32_t yEnd = yStart + mtls->mSliceSize; + //android::StopWatch kernel_time("kernel time"); - yEnd = rsMin(yEnd, mtls->yEnd); + if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) { + const size_t targetByteChunk = 16 * 1024; + mInForEach = true; + if (mtls->fep.dimY > 1) { + uint32_t s1 = mtls->fep.dimY / ((mWorkers.mCount + 1) * 4); + uint32_t s2 = 0; - if (yEnd <= yStart) { - return; + // This chooses our slice size to rate limit atomic ops to + // one per 16k bytes of reads/writes. + if (mtls->fep.yStrideOut) { + s2 = targetByteChunk / mtls->fep.yStrideOut; + } else { + s2 = targetByteChunk / mtls->fep.yStrideIn; } + mtls->mSliceSize = rsMin(s1, s2); - for (kparams.y = yStart; kparams.y < yEnd; kparams.y++) { - kparams.out = mtls->fep.outPtr + - (mtls->fep.outStride.yStride * kparams.y) + - (mtls->fep.outStride.eStride * mtls->xStart); + if(mtls->mSliceSize < 1) { + mtls->mSliceSize = 1; + } - for (int inIndex = mtls->fep.inLen; --inIndex >= 0;) { - StridePair &strides = mtls->fep.inStrides[inIndex]; + // mtls->mSliceSize = 2; + launchThreads(wc_xy, mtls); + } else { + uint32_t s1 = mtls->fep.dimX / ((mWorkers.mCount + 1) * 4); + uint32_t s2 = 0; - kparams.ins[inIndex] = - mtls->fep.inPtrs[inIndex] + - (strides.yStride * kparams.y) + - (strides.eStride * mtls->xStart); - } + // This chooses our slice size to rate limit atomic ops to + // one per 16k bytes of reads/writes. + if (mtls->fep.eStrideOut) { + s2 = targetByteChunk / mtls->fep.eStrideOut; + } else { + s2 = targetByteChunk / mtls->fep.eStrideIn; + } + mtls->mSliceSize = rsMin(s1, s2); - // Kernels now get their input strides from kparams. - fn(&kparams, mtls->xStart, mtls->xEnd, 0, - mtls->fep.outStride.eStride); + if(mtls->mSliceSize < 1) { + mtls->mSliceSize = 1; } + + launchThreads(wc_x, mtls); } - }); -} + mInForEach = false; -static void walk_1d(void *usr, uint32_t idx) { - walk_wrapper(usr, idx, [](MTLaunchStruct *mtls, - RsExpandKernelParams &kparams, - outer_foreach_t fn) { + //ALOGE("launch 1"); + } else { + RsExpandKernelParams kparams; + kparams.takeFields(mtls->fep); + + //ALOGE("launch 3"); + outer_foreach_t fn = (outer_foreach_t) mtls->kernel; + for (uint32_t arrayIndex = mtls->arrayStart; + arrayIndex < mtls->arrayEnd; arrayIndex++) { - while (1) { - uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1); - uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize; - uint32_t xEnd = xStart + mtls->mSliceSize; + for (kparams.z = mtls->zStart; kparams.z < mtls->zEnd; + kparams.z++) { - xEnd = rsMin(xEnd, mtls->xEnd); + for (kparams.y = mtls->yStart; kparams.y < mtls->yEnd; + kparams.y++) { - if (xEnd <= xStart) { - return; - } + uint32_t offset = + kparams.dimY * kparams.dimZ * arrayIndex + + kparams.dimY * kparams.z + kparams.y; - kparams.out = mtls->fep.outPtr + - (mtls->fep.outStride.eStride * xStart); + kparams.out = mtls->fep.ptrOut + + (mtls->fep.yStrideOut * offset) + + (mtls->fep.eStrideOut * mtls->xStart); - for (int inIndex = mtls->fep.inLen; --inIndex >= 0;) { - StridePair &strides = mtls->fep.inStrides[inIndex]; + kparams.in = mtls->fep.ptrIn + + (mtls->fep.yStrideIn * offset) + + (mtls->fep.eStrideIn * mtls->xStart); - kparams.ins[inIndex] = - mtls->fep.inPtrs[inIndex] + (strides.eStride * xStart); + fn(&kparams, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn, + mtls->fep.eStrideOut); + } } - - // Kernels now get their input strides from kparams. - fn(&kparams, xStart, xEnd, 0, mtls->fep.outStride.eStride); } - }); + } } - -void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains, - uint32_t inLen, - Allocation* aout, - const RsScriptCall* sc, - MTLaunchStruct* mtls) { +void RsdCpuReferenceImpl::launchThreads(const Allocation** ains, uint32_t inLen, Allocation* aout, + const RsScriptCall* sc, MTLaunchStruct* mtls) { //android::StopWatch kernel_time("kernel time"); if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) { const size_t targetByteChunk = 16 * 1024; mInForEach = true; - if (mtls->fep.dimY > 1) { uint32_t s1 = mtls->fep.dimY / ((mWorkers.mCount + 1) * 4); uint32_t s2 = 0; // This chooses our slice size to rate limit atomic ops to // one per 16k bytes of reads/writes. - if (mtls->fep.outStride.yStride) { - s2 = targetByteChunk / mtls->fep.outStride.yStride; + if (mtls->fep.yStrideOut) { + s2 = targetByteChunk / mtls->fep.yStrideOut; } else { - // We know that there is either an output or an input. - s2 = targetByteChunk / mtls->fep.inStrides[0].yStride; + s2 = targetByteChunk / mtls->fep.yStrideIn; } mtls->mSliceSize = rsMin(s1, s2); @@ -485,18 +531,18 @@ void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains, mtls->mSliceSize = 1; } - launchThreads(walk_2d, mtls); + // mtls->mSliceSize = 2; + launchThreads(wc_xy, mtls); } else { uint32_t s1 = mtls->fep.dimX / ((mWorkers.mCount + 1) * 4); uint32_t s2 = 0; // This chooses our slice size to rate limit atomic ops to // one per 16k bytes of reads/writes. - if (mtls->fep.outStride.eStride) { - s2 = targetByteChunk / mtls->fep.outStride.eStride; + if (mtls->fep.eStrideOut) { + s2 = targetByteChunk / mtls->fep.eStrideOut; } else { - // We know that there is either an output or an input. - s2 = targetByteChunk / mtls->fep.inStrides[0].eStride; + s2 = targetByteChunk / mtls->fep.eStrideIn; } mtls->mSliceSize = rsMin(s1, s2); @@ -504,26 +550,24 @@ void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains, mtls->mSliceSize = 1; } - launchThreads(walk_1d, mtls); + launchThreads(wc_x, mtls); } mInForEach = false; + //ALOGE("launch 1"); } else { RsExpandKernelParams kparams; kparams.takeFields(mtls->fep); - if (inLen > 0) { - // Allocate space for our input base pointers. - kparams.ins = (const void**)alloca(inLen * sizeof(void*)); + // Allocate space for our input base pointers. + kparams.ins = new const void*[inLen]; - // Allocate space for our input stride information. - kparams.inEStrides = (uint32_t*)alloca(inLen * sizeof(uint32_t)); + // Allocate space for our input stride information. + kparams.eStrideIns = new uint32_t[inLen]; - // Fill our stride information. - for (int inIndex = inLen; --inIndex >= 0;) { - kparams.inEStrides[inIndex] = - mtls->fep.inStrides[inIndex].eStride; - } + // Fill our stride information. + for (int inIndex = inLen; --inIndex >= 0;) { + kparams.eStrideIns[inIndex] = mtls->fep.inStrides[inIndex].eStride; } //ALOGE("launch 3"); @@ -541,15 +585,15 @@ void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains, mtls->fep.dimY * mtls->fep.dimZ * arrayIndex + mtls->fep.dimY * kparams.z + kparams.y; - kparams.out = mtls->fep.outPtr + - (mtls->fep.outStride.yStride * offset) + - (mtls->fep.outStride.eStride * mtls->xStart); + kparams.out = mtls->fep.ptrOut + + (mtls->fep.yStrideOut * offset) + + (mtls->fep.eStrideOut * mtls->xStart); for (int inIndex = inLen; --inIndex >= 0;) { StridePair &strides = mtls->fep.inStrides[inIndex]; kparams.ins[inIndex] = - mtls->fep.inPtrs[inIndex] + + mtls->fep.ptrIns[inIndex] + (strides.yStride * offset) + (strides.eStride * mtls->xStart); } @@ -560,10 +604,14 @@ void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains, * that points to an array. */ fn(&kparams, mtls->xStart, mtls->xEnd, 0, - mtls->fep.outStride.eStride); + mtls->fep.eStrideOut); } } } + + // Free our arrays. + delete[] kparams.ins; + delete[] kparams.eStrideIns; } } |