summary refs log tree commit diff
path: root/cpu_ref/rsCpuCore.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'cpu_ref/rsCpuCore.cpp')
-rw-r--r--  cpu_ref/rsCpuCore.cpp  242
1 file changed, 145 insertions(+), 97 deletions(-)
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index db3cc7fa..a0564fc1 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -350,134 +350,180 @@ RsdCpuReferenceImpl::~RsdCpuReferenceImpl() {
}
typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);
-typedef void (*walk_loop_t)(MTLaunchStruct*,
- RsExpandKernelParams&,
- outer_foreach_t);
-
-static void walk_wrapper(void* usr, uint32_t idx, walk_loop_t walk_loop) {
+static void wc_xy(void *usr, uint32_t idx) {
MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
- uint32_t inLen = mtls->fep.inLen;
-
RsExpandKernelParams kparams;
kparams.takeFields(mtls->fep);
// Used by CpuScriptGroup, IntrinsicBlur, and IntrinsicHistogram
kparams.lid = idx;
- if (inLen > 0) {
- // Allocate space for our input base pointers.
- kparams.ins = (const void**)alloca(inLen * sizeof(void*));
+ outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+ while (1) {
+ uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
+ uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
+ uint32_t yEnd = yStart + mtls->mSliceSize;
- // Allocate space for our input stride information.
- kparams.inEStrides = (uint32_t*)alloca(inLen * sizeof(uint32_t));
+ yEnd = rsMin(yEnd, mtls->yEnd);
- // Fill our stride information.
- for (int inIndex = inLen; --inIndex >= 0;) {
- kparams.inEStrides[inIndex] = mtls->fep.inStrides[inIndex].eStride;
+ if (yEnd <= yStart) {
+ return;
+ }
+
+ //ALOGE("usr idx %i, x %i,%i y %i,%i", idx, mtls->xStart, mtls->xEnd, yStart, yEnd);
+ //ALOGE("usr ptr in %p, out %p", mtls->fep.ptrIn, mtls->fep.ptrOut);
+
+ for (kparams.y = yStart; kparams.y < yEnd; kparams.y++) {
+ kparams.out = mtls->fep.ptrOut +
+ (mtls->fep.yStrideOut * kparams.y) +
+ (mtls->fep.eStrideOut * mtls->xStart);
+
+ kparams.in = mtls->fep.ptrIn +
+ (mtls->fep.yStrideIn * kparams.y) +
+ (mtls->fep.eStrideIn * mtls->xStart);
+
+
+ fn(&kparams, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn,
+ mtls->fep.eStrideOut);
}
}
+}
+
+static void wc_x(void *usr, uint32_t idx) {
+ MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
+
+ RsExpandKernelParams kparams;
+ kparams.takeFields(mtls->fep);
+
+    // Used by CpuScriptGroup, IntrinsicBlur, and IntrinsicHistogram
+ kparams.lid = idx;
outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+ while (1) {
+ uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
+ uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
+ uint32_t xEnd = xStart + mtls->mSliceSize;
+
+ xEnd = rsMin(xEnd, mtls->xEnd);
+
+ if (xEnd <= xStart) {
+ return;
+ }
+
+ //ALOGE("usr slice %i idx %i, x %i,%i", slice, idx, xStart, xEnd);
+ //ALOGE("usr ptr in %p, out %p", mtls->fep.ptrIn, mtls->fep.ptrOut);
- walk_loop(mtls, kparams, fn);
+ kparams.out = mtls->fep.ptrOut + (mtls->fep.eStrideOut * xStart);
+ kparams.in = mtls->fep.ptrIn + (mtls->fep.eStrideIn * xStart);
+
+ fn(&kparams, xStart, xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
+ }
}
-static void walk_2d(void *usr, uint32_t idx) {
- walk_wrapper(usr, idx, [](MTLaunchStruct *mtls,
- RsExpandKernelParams &kparams,
- outer_foreach_t fn) {
+void RsdCpuReferenceImpl::launchThreads(const Allocation * ain, Allocation * aout,
+ const RsScriptCall *sc, MTLaunchStruct *mtls) {
- while (1) {
- uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
- uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
- uint32_t yEnd = yStart + mtls->mSliceSize;
+ //android::StopWatch kernel_time("kernel time");
- yEnd = rsMin(yEnd, mtls->yEnd);
+ if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) {
+ const size_t targetByteChunk = 16 * 1024;
+ mInForEach = true;
+ if (mtls->fep.dimY > 1) {
+ uint32_t s1 = mtls->fep.dimY / ((mWorkers.mCount + 1) * 4);
+ uint32_t s2 = 0;
- if (yEnd <= yStart) {
- return;
+ // This chooses our slice size to rate limit atomic ops to
+ // one per 16k bytes of reads/writes.
+ if (mtls->fep.yStrideOut) {
+ s2 = targetByteChunk / mtls->fep.yStrideOut;
+ } else {
+ s2 = targetByteChunk / mtls->fep.yStrideIn;
}
+ mtls->mSliceSize = rsMin(s1, s2);
- for (kparams.y = yStart; kparams.y < yEnd; kparams.y++) {
- kparams.out = mtls->fep.outPtr +
- (mtls->fep.outStride.yStride * kparams.y) +
- (mtls->fep.outStride.eStride * mtls->xStart);
+ if(mtls->mSliceSize < 1) {
+ mtls->mSliceSize = 1;
+ }
- for (int inIndex = mtls->fep.inLen; --inIndex >= 0;) {
- StridePair &strides = mtls->fep.inStrides[inIndex];
+ // mtls->mSliceSize = 2;
+ launchThreads(wc_xy, mtls);
+ } else {
+ uint32_t s1 = mtls->fep.dimX / ((mWorkers.mCount + 1) * 4);
+ uint32_t s2 = 0;
- kparams.ins[inIndex] =
- mtls->fep.inPtrs[inIndex] +
- (strides.yStride * kparams.y) +
- (strides.eStride * mtls->xStart);
- }
+ // This chooses our slice size to rate limit atomic ops to
+ // one per 16k bytes of reads/writes.
+ if (mtls->fep.eStrideOut) {
+ s2 = targetByteChunk / mtls->fep.eStrideOut;
+ } else {
+ s2 = targetByteChunk / mtls->fep.eStrideIn;
+ }
+ mtls->mSliceSize = rsMin(s1, s2);
- // Kernels now get their input strides from kparams.
- fn(&kparams, mtls->xStart, mtls->xEnd, 0,
- mtls->fep.outStride.eStride);
+ if(mtls->mSliceSize < 1) {
+ mtls->mSliceSize = 1;
}
+
+ launchThreads(wc_x, mtls);
}
- });
-}
+ mInForEach = false;
-static void walk_1d(void *usr, uint32_t idx) {
- walk_wrapper(usr, idx, [](MTLaunchStruct *mtls,
- RsExpandKernelParams &kparams,
- outer_foreach_t fn) {
+ //ALOGE("launch 1");
+ } else {
+ RsExpandKernelParams kparams;
+ kparams.takeFields(mtls->fep);
+
+ //ALOGE("launch 3");
+ outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+ for (uint32_t arrayIndex = mtls->arrayStart;
+ arrayIndex < mtls->arrayEnd; arrayIndex++) {
- while (1) {
- uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
- uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
- uint32_t xEnd = xStart + mtls->mSliceSize;
+ for (kparams.z = mtls->zStart; kparams.z < mtls->zEnd;
+ kparams.z++) {
- xEnd = rsMin(xEnd, mtls->xEnd);
+ for (kparams.y = mtls->yStart; kparams.y < mtls->yEnd;
+ kparams.y++) {
- if (xEnd <= xStart) {
- return;
- }
+ uint32_t offset =
+ kparams.dimY * kparams.dimZ * arrayIndex +
+ kparams.dimY * kparams.z + kparams.y;
- kparams.out = mtls->fep.outPtr +
- (mtls->fep.outStride.eStride * xStart);
+ kparams.out = mtls->fep.ptrOut +
+ (mtls->fep.yStrideOut * offset) +
+ (mtls->fep.eStrideOut * mtls->xStart);
- for (int inIndex = mtls->fep.inLen; --inIndex >= 0;) {
- StridePair &strides = mtls->fep.inStrides[inIndex];
+ kparams.in = mtls->fep.ptrIn +
+ (mtls->fep.yStrideIn * offset) +
+ (mtls->fep.eStrideIn * mtls->xStart);
- kparams.ins[inIndex] =
- mtls->fep.inPtrs[inIndex] + (strides.eStride * xStart);
+ fn(&kparams, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn,
+ mtls->fep.eStrideOut);
+ }
}
-
- // Kernels now get their input strides from kparams.
- fn(&kparams, xStart, xEnd, 0, mtls->fep.outStride.eStride);
}
- });
+ }
}
-
-void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains,
- uint32_t inLen,
- Allocation* aout,
- const RsScriptCall* sc,
- MTLaunchStruct* mtls) {
+void RsdCpuReferenceImpl::launchThreads(const Allocation** ains, uint32_t inLen, Allocation* aout,
+ const RsScriptCall* sc, MTLaunchStruct* mtls) {
//android::StopWatch kernel_time("kernel time");
if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) {
const size_t targetByteChunk = 16 * 1024;
mInForEach = true;
-
if (mtls->fep.dimY > 1) {
uint32_t s1 = mtls->fep.dimY / ((mWorkers.mCount + 1) * 4);
uint32_t s2 = 0;
// This chooses our slice size to rate limit atomic ops to
// one per 16k bytes of reads/writes.
- if (mtls->fep.outStride.yStride) {
- s2 = targetByteChunk / mtls->fep.outStride.yStride;
+ if (mtls->fep.yStrideOut) {
+ s2 = targetByteChunk / mtls->fep.yStrideOut;
} else {
- // We know that there is either an output or an input.
- s2 = targetByteChunk / mtls->fep.inStrides[0].yStride;
+ s2 = targetByteChunk / mtls->fep.yStrideIn;
}
mtls->mSliceSize = rsMin(s1, s2);
@@ -485,18 +531,18 @@ void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains,
mtls->mSliceSize = 1;
}
- launchThreads(walk_2d, mtls);
+ // mtls->mSliceSize = 2;
+ launchThreads(wc_xy, mtls);
} else {
uint32_t s1 = mtls->fep.dimX / ((mWorkers.mCount + 1) * 4);
uint32_t s2 = 0;
// This chooses our slice size to rate limit atomic ops to
// one per 16k bytes of reads/writes.
- if (mtls->fep.outStride.eStride) {
- s2 = targetByteChunk / mtls->fep.outStride.eStride;
+ if (mtls->fep.eStrideOut) {
+ s2 = targetByteChunk / mtls->fep.eStrideOut;
} else {
- // We know that there is either an output or an input.
- s2 = targetByteChunk / mtls->fep.inStrides[0].eStride;
+ s2 = targetByteChunk / mtls->fep.eStrideIn;
}
mtls->mSliceSize = rsMin(s1, s2);
@@ -504,26 +550,24 @@ void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains,
mtls->mSliceSize = 1;
}
- launchThreads(walk_1d, mtls);
+ launchThreads(wc_x, mtls);
}
mInForEach = false;
+ //ALOGE("launch 1");
} else {
RsExpandKernelParams kparams;
kparams.takeFields(mtls->fep);
- if (inLen > 0) {
- // Allocate space for our input base pointers.
- kparams.ins = (const void**)alloca(inLen * sizeof(void*));
+ // Allocate space for our input base pointers.
+ kparams.ins = new const void*[inLen];
- // Allocate space for our input stride information.
- kparams.inEStrides = (uint32_t*)alloca(inLen * sizeof(uint32_t));
+ // Allocate space for our input stride information.
+ kparams.eStrideIns = new uint32_t[inLen];
- // Fill our stride information.
- for (int inIndex = inLen; --inIndex >= 0;) {
- kparams.inEStrides[inIndex] =
- mtls->fep.inStrides[inIndex].eStride;
- }
+ // Fill our stride information.
+ for (int inIndex = inLen; --inIndex >= 0;) {
+ kparams.eStrideIns[inIndex] = mtls->fep.inStrides[inIndex].eStride;
}
//ALOGE("launch 3");
@@ -541,15 +585,15 @@ void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains,
mtls->fep.dimY * mtls->fep.dimZ * arrayIndex +
mtls->fep.dimY * kparams.z + kparams.y;
- kparams.out = mtls->fep.outPtr +
- (mtls->fep.outStride.yStride * offset) +
- (mtls->fep.outStride.eStride * mtls->xStart);
+ kparams.out = mtls->fep.ptrOut +
+ (mtls->fep.yStrideOut * offset) +
+ (mtls->fep.eStrideOut * mtls->xStart);
for (int inIndex = inLen; --inIndex >= 0;) {
StridePair &strides = mtls->fep.inStrides[inIndex];
kparams.ins[inIndex] =
- mtls->fep.inPtrs[inIndex] +
+ mtls->fep.ptrIns[inIndex] +
(strides.yStride * offset) +
(strides.eStride * mtls->xStart);
}
@@ -560,10 +604,14 @@ void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains,
* that points to an array.
*/
fn(&kparams, mtls->xStart, mtls->xEnd, 0,
- mtls->fep.outStride.eStride);
+ mtls->fep.eStrideOut);
}
}
}
+
+ // Free our arrays.
+ delete[] kparams.ins;
+ delete[] kparams.eStrideIns;
}
}