From 14ce007a633b10e3b9a3fae29d8f53a7e8c9b59f Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 30 Jul 2015 17:30:25 -0700 Subject: Add a basic implementation of the reduce kernel API to the CPU reference implementation. Bug: 22631253 For now, this just runs a serial reduction on one thread. Change-Id: I34c96d24bb6f44274de72bb53160abcf79d143b0 --- cpu_ref/rsCpuCore.cpp | 60 ++++++++---- cpu_ref/rsCpuCore.h | 54 +++++++--- cpu_ref/rsCpuExecutable.cpp | 43 +++++++- cpu_ref/rsCpuExecutable.h | 10 ++ cpu_ref/rsCpuIntrinsic.cpp | 10 +- cpu_ref/rsCpuIntrinsic.h | 4 +- cpu_ref/rsCpuScript.cpp | 223 +++++++++++++++++++++++------------------- cpu_ref/rsCpuScript.h | 34 ++++--- cpu_ref/rsCpuScriptGroup.cpp | 8 +- cpu_ref/rsCpuScriptGroup2.cpp | 12 +-- cpu_ref/rsd_cpu.h | 5 + driver/rsdBcc.cpp | 9 ++ driver/rsdBcc.h | 7 ++ driver/rsdCore.cpp | 2 + rsDriverLoader.cpp | 1 + rsScript.cpp | 4 +- rsScript.h | 5 + rsScriptC.cpp | 32 +++++- rsScriptC.h | 3 + rsScriptIntrinsic.cpp | 5 + rsScriptIntrinsic.h | 35 ++++--- rs_hal.h | 5 + 22 files changed, 382 insertions(+), 189 deletions(-) diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp index 4367bd4a..48e8dbb1 100644 --- a/cpu_ref/rsCpuCore.cpp +++ b/cpu_ref/rsCpuCore.cpp @@ -45,11 +45,6 @@ static pid_t gettid() { using namespace android; using namespace android::renderscript; -typedef void (*outer_foreach_t)( - const RsExpandKernelDriverInfo *, - uint32_t x1, uint32_t x2, uint32_t outstep); - - static pthread_key_t gThreadTLSKey = 0; static uint32_t gThreadTLSKeyCount = 0; static pthread_mutex_t gInitMutex = PTHREAD_MUTEX_INITIALIZER; @@ -153,13 +148,15 @@ void * RsdCpuReferenceImpl::helperThreadProc(void *vrsc) { return nullptr; } +// Launch a kernel. +// The callback function is called to execute the kernel. void RsdCpuReferenceImpl::launchThreads(WorkerCallback_t cbk, void *data) { mWorkers.mLaunchData = data; mWorkers.mLaunchCallback = cbk; // fast path for very small launches - MTLaunchStruct *mtls = (MTLaunchStruct *)data; - if (mtls && mtls->fep.dim.y <= 1 && mtls->end.x <= mtls->start.x + mtls->mSliceSize) { + MTLaunchStructCommon *mtls = (MTLaunchStructCommon *)data; + if (mtls && mtls->dimPtr->y <= 1 && mtls->end.x <= mtls->start.x + mtls->mSliceSize) { if (mWorkers.mLaunchCallback) { mWorkers.mLaunchCallback(mWorkers.mLaunchData, 0); } @@ -220,7 +217,6 @@ static void GetCpuInfo() { bool RsdCpuReferenceImpl::init(uint32_t version_major, uint32_t version_minor, sym_lookup_t lfn, script_lookup_t slfn) { - mSymLookupFn = lfn; mScriptLookupFn = slfn; @@ -328,16 +324,19 @@ RsdCpuReferenceImpl::~RsdCpuReferenceImpl() { } -static inline void FepPtrSetup(const MTLaunchStruct *mtls, RsExpandKernelDriverInfo *fep, +// Set up the appropriate input and output pointers to the kernel driver info structure. +// Inputs: +// mtls - The MTLaunchStruct holding information about the kernel launch +// fep - The forEach parameters (driver info structure) +// x, y, z, lod, face, a1, a2, a3, a4 - The start offsets into each dimension +static inline void FepPtrSetup(const MTLaunchStructForEach *mtls, RsExpandKernelDriverInfo *fep, uint32_t x, uint32_t y, uint32_t z = 0, uint32_t lod = 0, RsAllocationCubemapFace face = RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X, uint32_t a1 = 0, uint32_t a2 = 0, uint32_t a3 = 0, uint32_t a4 = 0) { - for (uint32_t i = 0; i < fep->inLen; i++) { fep->inPtr[i] = (const uint8_t *)mtls->ains[i]->getPointerUnchecked(x, y, z, lod, face, a1, a2, a3, a4); } - if (mtls->aout[0] != nullptr) { fep->outPtr[0] = (uint8_t *)mtls->aout[0]->getPointerUnchecked(x, y, z, lod, face, a1, a2, a3, a4); } @@ -356,7 +355,7 @@ static uint32_t sliceInt(uint32_t *p, uint32_t val, uint32_t start, uint32_t end return n; } -static bool SelectOuterSlice(const MTLaunchStruct *mtls, RsExpandKernelDriverInfo* fep, uint32_t sliceNum) { +static bool SelectOuterSlice(const MTLaunchStructForEach *mtls, RsExpandKernelDriverInfo* fep, uint32_t sliceNum) { uint32_t r = sliceNum; r = sliceInt(&fep->current.z, r, mtls->start.z, mtls->end.z); @@ -371,10 +370,10 @@ static bool SelectOuterSlice(const MTLaunchStruct *mtls, RsExpandKernelDriverInf static void walk_general(void *usr, uint32_t idx) { - MTLaunchStruct *mtls = (MTLaunchStruct *)usr; + MTLaunchStructForEach *mtls = (MTLaunchStructForEach *)usr; RsExpandKernelDriverInfo fep = mtls->fep; fep.lid = idx; - outer_foreach_t fn = (outer_foreach_t) mtls->kernel; + ForEachFunc_t fn = mtls->kernel; while(1) { @@ -400,10 +399,10 @@ static void walk_general(void *usr, uint32_t idx) { } static void walk_2d(void *usr, uint32_t idx) { - MTLaunchStruct *mtls = (MTLaunchStruct *)usr; + MTLaunchStructForEach *mtls = (MTLaunchStructForEach *)usr; RsExpandKernelDriverInfo fep = mtls->fep; fep.lid = idx; - outer_foreach_t fn = (outer_foreach_t) mtls->kernel; + ForEachFunc_t fn = mtls->kernel; while (1) { uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1); @@ -425,10 +424,10 @@ static void walk_2d(void *usr, uint32_t idx) { } static void walk_1d(void *usr, uint32_t idx) { - MTLaunchStruct *mtls = (MTLaunchStruct *)usr; + MTLaunchStructForEach *mtls = (MTLaunchStructForEach *)usr; RsExpandKernelDriverInfo fep = mtls->fep; fep.lid = idx; - outer_foreach_t fn = (outer_foreach_t) mtls->kernel; + ForEachFunc_t fn = mtls->kernel; while (1) { uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1); @@ -447,11 +446,30 @@ static void walk_1d(void *usr, uint32_t idx) { } } -void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains, +// Launch a reduce-style kernel. +// Inputs: +// ain: The allocation that contains the input +// aout: The allocation that will hold the output +// mtls: Holds launch parameters +void RsdCpuReferenceImpl::launchReduce(const Allocation *ain, + Allocation *aout, + MTLaunchStructReduce *mtls) { + const uint32_t xStart = mtls->start.x; + const uint32_t xEnd = mtls->end.x; + + if (xStart >= xEnd) { + return; + } + + const uint32_t startOffset = ain->getType()->getElementSizeBytes() * xStart; + mtls->kernel(&mtls->inBuf[startOffset], mtls->outBuf, xEnd - xStart); +} + +void RsdCpuReferenceImpl::launchForEach(const Allocation ** ains, uint32_t inLen, Allocation* aout, const RsScriptCall* sc, - MTLaunchStruct* mtls) { + MTLaunchStructForEach* mtls) { //android::StopWatch kernel_time("kernel time"); @@ -519,7 +537,7 @@ void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains, mInForEach = false; } else { - outer_foreach_t fn = (outer_foreach_t) mtls->kernel; + ForEachFunc_t fn = mtls->kernel; uint32_t slice = 0; diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h index 0f784382..cfdb29a6 100644 --- a/cpu_ref/rsCpuCore.h +++ b/cpu_ref/rsCpuCore.h @@ -31,8 +31,14 @@ namespace renderscript { // Whether the CPU we're running on supports SIMD instructions extern bool gArchUseSIMD; -typedef void (* InvokeFunc_t)(void); -typedef void (* ForEachFunc_t)(void); +// Function types found in RenderScript code +typedef void (*ReduceFunc_t)(const uint8_t *inBuf, uint8_t *outBuf, uint32_t len); +typedef void (*ForEachFunc_t)(const RsExpandKernelDriverInfo *info, uint32_t x1, uint32_t x2, uint32_t outStride); +typedef void (*InvokeFunc_t)(void *params); +typedef void (*InitOrDtorFunc_t)(void); +typedef int (*RootFunc_t)(void); + +// Internal driver callback used to execute a kernel typedef void (*WorkerCallback_t)(void *usr, uint32_t idx); class RsdCpuScriptImpl; @@ -44,23 +50,38 @@ struct ScriptTLSStruct { RsdCpuScriptImpl *mImpl; }; -struct MTLaunchStruct { - RsExpandKernelDriverInfo fep; - - RsdCpuReferenceImpl *rsc; +// MTLaunchStruct passes information about a multithreaded kernel launch. +struct MTLaunchStructCommon { + RsdCpuReferenceImpl *rs; RsdCpuScriptImpl *script; - ForEachFunc_t kernel; - uint32_t sig; - const Allocation * ains[RS_KERNEL_INPUT_LIMIT]; - Allocation * aout[RS_KERNEL_INPUT_LIMIT]; - uint32_t mSliceSize; volatile int mSliceNum; bool isThreadable; + // Boundary information about the launch RsLaunchDimensions start; RsLaunchDimensions end; + // Points to MTLaunchStructForEach::fep::dim or + // MTLaunchStructReduce::inputDim. + RsLaunchDimensions *dimPtr; +}; + +struct MTLaunchStructForEach : public MTLaunchStructCommon { + // Driver info structure + RsExpandKernelDriverInfo fep; + + ForEachFunc_t kernel; + uint32_t sig; + const Allocation *ains[RS_KERNEL_INPUT_LIMIT]; + Allocation *aout[RS_KERNEL_INPUT_LIMIT]; +}; + +struct MTLaunchStructReduce : public MTLaunchStructCommon { + ReduceFunc_t kernel; + const uint8_t *inBuf; + uint8_t *outBuf; + RsLaunchDimensions inputDim; }; class RsdCpuReferenceImpl : public RsdCpuReference { @@ -82,8 +103,13 @@ public: return mWorkers.mCount + 1; } - void launchThreads(const Allocation** ains, uint32_t inLen, Allocation* aout, - const RsScriptCall* sc, MTLaunchStruct* mtls); + // Launch foreach kernel + void launchForEach(const Allocation **ains, uint32_t inLen, Allocation *aout, + const RsScriptCall *sc, MTLaunchStructForEach *mtls); + + // Launch a reduce kernel + void launchReduce(const Allocation *ain, Allocation *aout, + MTLaunchStructReduce *mtls); CpuScript * createScript(const ScriptC *s, char const *resName, char const *cacheDir, uint8_t const *bitcode, size_t bitcodeSize, uint32_t flags) override; @@ -92,7 +118,7 @@ public: const RsdCpuReference::CpuSymbol *symLookup(const char *); - RsdCpuReference::CpuScript * lookupScript(const Script *s) { + RsdCpuReference::CpuScript *lookupScript(const Script *s) { return mScriptLookupFn(mRSC, s); } diff --git a/cpu_ref/rsCpuExecutable.cpp b/cpu_ref/rsCpuExecutable.cpp index 867a2cd6..74d400f9 100644 --- a/cpu_ref/rsCpuExecutable.cpp +++ b/cpu_ref/rsCpuExecutable.cpp @@ -267,6 +267,7 @@ void* SharedLibraryUtils::loadSOHelper(const char *origName, const char *cacheDi #define EXPORT_VAR_STR "exportVarCount: " #define EXPORT_FUNC_STR "exportFuncCount: " #define EXPORT_FOREACH_STR "exportForEachCount: " +#define EXPORT_REDUCE_STR "exportReduceCount: " #define OBJECT_SLOT_STR "objectSlotCount: " #define PRAGMA_STR "pragmaCount: " #define THREADABLE_STR "isThreadable: " @@ -304,6 +305,7 @@ ScriptExecutable* ScriptExecutable::createFromSharedObject( size_t varCount = 0; size_t funcCount = 0; size_t forEachCount = 0; + size_t reduceCount = 0; size_t objectSlotCount = 0; size_t pragmaCount = 0; bool isThreadable = true; @@ -314,6 +316,7 @@ ScriptExecutable* ScriptExecutable::createFromSharedObject( InvokeFunc_t* invokeFunctions = nullptr; ForEachFunc_t* forEachFunctions = nullptr; uint32_t* forEachSignatures = nullptr; + ReduceFunc_t* reduceFunctions = nullptr; const char ** pragmaKeys = nullptr; const char ** pragmaValues = nullptr; uint32_t checksum = 0; @@ -439,12 +442,47 @@ ScriptExecutable* ScriptExecutable::createFromSharedObject( strcmp(tmpName, "root.expand")) { // Ignore missing root.expand functions. // root() is always specified at location 0. - ALOGE("Failed to find forEach function address for %s: %s", + ALOGE("Failed to find forEach function address for %s(): %s", tmpName, dlerror()); goto error; } } + // Read reduce kernels + if (strgets(line, MAXLINE, &rsInfo) == nullptr) { + goto error; + } + if (sscanf(line, EXPORT_REDUCE_STR "%zu", &reduceCount) != 1) { + ALOGE("Invalid export reduce count!: %s", line); + goto error; + } + + reduceFunctions = new ReduceFunc_t[reduceCount]; + if (reduceFunctions == nullptr) { + goto error; + } + + for (size_t i = 0; i < reduceCount; ++i) { + if (strgets(line, MAXLINE, &rsInfo) == nullptr) { + goto error; + } + char *c = strrchr(line, '\n'); + if (c) { + *c = '\0'; + } + + // Lookup the expanded reduce kernel. + strncat(line, ".expand", MAXLINE-1-strlen(line)); + + reduceFunctions[i] = + reinterpret_cast(dlsym(sharedObj, line)); + if (reduceFunctions[i] == nullptr) { + ALOGE("Failed to get function address for %s(): %s", + line, dlerror()); + goto error; + } + } + if (strgets(line, MAXLINE, &rsInfo) == nullptr) { goto error; } @@ -577,6 +615,7 @@ ScriptExecutable* ScriptExecutable::createFromSharedObject( RSContext, fieldAddress, fieldIsObject, fieldName, varCount, invokeFunctions, funcCount, forEachFunctions, forEachSignatures, forEachCount, + reduceFunctions, reduceCount, pragmaKeys, pragmaValues, pragmaCount, rsGlobalNames, rsGlobalAddresses, rsGlobalSizes, rsGlobalProperties, numEntries, isThreadable, checksum); @@ -594,6 +633,8 @@ error: delete[] pragmaKeys; #endif // RS_COMPATIBILITY_LIB + delete[] reduceFunctions; + delete[] forEachSignatures; delete[] forEachFunctions; diff --git a/cpu_ref/rsCpuExecutable.h b/cpu_ref/rsCpuExecutable.h index 68809706..fe9c2ad5 100644 --- a/cpu_ref/rsCpuExecutable.h +++ b/cpu_ref/rsCpuExecutable.h @@ -68,6 +68,7 @@ public: InvokeFunc_t* invokeFunctions, size_t funcCount, ForEachFunc_t* forEachFunctions, uint32_t* forEachSignatures, size_t forEachCount, + ReduceFunc_t* reduceFunctions, size_t reduceCount, const char** pragmaKeys, const char** pragmaValues, size_t pragmaCount, const char **globalNames, const void **globalAddresses, @@ -79,6 +80,7 @@ public: mInvokeFunctions(invokeFunctions), mFuncCount(funcCount), mForEachFunctions(forEachFunctions), mForEachSignatures(forEachSignatures), mForEachCount(forEachCount), + mReduceFunctions(reduceFunctions), mReduceCount(reduceCount), mPragmaKeys(pragmaKeys), mPragmaValues(pragmaValues), mPragmaCount(pragmaCount), mGlobalNames(globalNames), mGlobalAddresses(globalAddresses), mGlobalSizes(globalSizes), @@ -105,6 +107,8 @@ public: delete[] mPragmaValues; delete[] mPragmaKeys; + delete[] mReduceFunctions; + delete[] mForEachSignatures; delete[] mForEachFunctions; @@ -129,6 +133,7 @@ public: size_t getExportedVariableCount() const { return mExportedVarCount; } size_t getExportedFunctionCount() const { return mFuncCount; } size_t getExportedForEachCount() const { return mForEachCount; } + size_t getExportedReduceCount() const { return mReduceCount; } size_t getPragmaCount() const { return mPragmaCount; } void* getFieldAddress(int slot) const { return mFieldAddress[slot]; } @@ -141,6 +146,8 @@ public: ForEachFunc_t getForEachFunction(int slot) const { return mForEachFunctions[slot]; } uint32_t getForEachSignature(int slot) const { return mForEachSignatures[slot]; } + ReduceFunc_t getReduceFunction(int slot) const { return mReduceFunctions[slot]; } + const char ** getPragmaKeys() const { return mPragmaKeys; } const char ** getPragmaValues() const { return mPragmaValues; } @@ -193,6 +200,9 @@ private: uint32_t* mForEachSignatures; size_t mForEachCount; + ReduceFunc_t* mReduceFunctions; + size_t mReduceCount; + const char ** mPragmaKeys; const char ** mPragmaValues; size_t mPragmaCount; diff --git a/cpu_ref/rsCpuIntrinsic.cpp b/cpu_ref/rsCpuIntrinsic.cpp index 16363697..4cb3f9ff 100644 --- a/cpu_ref/rsCpuIntrinsic.cpp +++ b/cpu_ref/rsCpuIntrinsic.cpp @@ -93,7 +93,7 @@ void RsdCpuScriptIntrinsic::invokeForEach(uint32_t slot, uint32_t usrLen, const RsScriptCall *sc) { - MTLaunchStruct mtls; + MTLaunchStructForEach mtls; preLaunch(slot, ains, inLen, aout, usr, usrLen, sc); @@ -101,21 +101,21 @@ void RsdCpuScriptIntrinsic::invokeForEach(uint32_t slot, mtls.script = this; mtls.fep.slot = slot; - mtls.kernel = (void (*)())mRootPtr; + mtls.kernel = mRootPtr; mtls.fep.usr = this; RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this); - mCtx->launchThreads(ains, inLen, aout, sc, &mtls); + mCtx->launchForEach(ains, inLen, aout, sc, &mtls); mCtx->setTLS(oldTLS); } postLaunch(slot, ains, inLen, aout, usr, usrLen, sc); } -void RsdCpuScriptIntrinsic::forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls) { +void RsdCpuScriptIntrinsic::forEachKernelSetup(uint32_t slot, MTLaunchStructForEach *mtls) { mtls->script = this; mtls->fep.slot = slot; - mtls->kernel = (void (*)())mRootPtr; + mtls->kernel = mRootPtr; mtls->fep.usr = this; } diff --git a/cpu_ref/rsCpuIntrinsic.h b/cpu_ref/rsCpuIntrinsic.h index 0ac8e6e7..9c7e1726 100644 --- a/cpu_ref/rsCpuIntrinsic.h +++ b/cpu_ref/rsCpuIntrinsic.h @@ -39,7 +39,7 @@ public: uint32_t usrLen, const RsScriptCall *sc) override; - void forEachKernelSetup(uint32_t slot, MTLaunchStruct * mtls) override; + void forEachKernelSetup(uint32_t slot, MTLaunchStructForEach * mtls) override; void invokeInit() override; void invokeFreeChildren() override; @@ -65,7 +65,7 @@ public: protected: RsScriptIntrinsicID mID; - outer_foreach_t mRootPtr; + ForEachFunc_t mRootPtr; ObjectBaseRef mElement; }; diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp index 09e7ab79..5adca544 100644 --- a/cpu_ref/rsCpuScript.cpp +++ b/cpu_ref/rsCpuScript.cpp @@ -50,6 +50,12 @@ namespace { static const bool kDebugGlobalVariables = false; +static bool allocationLODIsNull(const android::renderscript::Allocation *alloc) { + // Even if alloc != nullptr, mallocPtr could be null if + // IO_OUTPUT/IO_INPUT with no bound surface. + return alloc && alloc->mHal.drvState.lod[0].mallocPtr == nullptr; +} + #ifndef RS_COMPATIBILITY_LIB static bool is_force_recompile() { @@ -282,11 +288,11 @@ bool RsdCpuScriptImpl::storeRSInfoFromSO() { if (mRootExpand) { //ALOGE("Found root.expand(): %p", mRootExpand); } - mInit = (InvokeFunc_t) dlsym(mScriptSO, "init"); + mInit = (InitOrDtorFunc_t) dlsym(mScriptSO, "init"); if (mInit) { //ALOGE("Found init(): %p", mInit); } - mFreeChildren = (InvokeFunc_t) dlsym(mScriptSO, ".rs.dtor"); + mFreeChildren = (InitOrDtorFunc_t) dlsym(mScriptSO, ".rs.dtor"); if (mFreeChildren) { //ALOGE("Found .rs.dtor(): %p", mFreeChildren); } @@ -490,6 +496,8 @@ const char* RsdCpuScriptImpl::findCoreLib(const bcinfo::MetadataExtractor& ME, c void RsdCpuScriptImpl::populateScript(Script *script) { // Copy info over to runtime script->mHal.info.exportedFunctionCount = mScriptExec->getExportedFunctionCount(); + script->mHal.info.exportedReduceCount = mScriptExec->getExportedReduceCount(); + script->mHal.info.exportedForEachCount = mScriptExec->getExportedForEachCount(); script->mHal.info.exportedVariableCount = mScriptExec->getExportedVariableCount(); script->mHal.info.exportedPragmaCount = mScriptExec->getPragmaCount();; script->mHal.info.exportedPragmaKeyList = mScriptExec->getPragmaKeys(); @@ -503,32 +511,105 @@ void RsdCpuScriptImpl::populateScript(Script *script) { } } +// Set up the launch dimensions, and write the values of the launch +// dimensions into the mtls start/end fields. +// +// Inputs: +// baseDim - base shape of the input +// sc - used to constrain the launch dimensions +// +// Returns: +// True on success, false on failure to set up +bool RsdCpuScriptImpl::setUpMtlsDimensions(MTLaunchStructCommon *mtls, + const RsLaunchDimensions &baseDim, + const RsScriptCall *sc) { + rsAssert(mtls); + +#define SET_UP_DIMENSION(DIM_FIELD, SC_FIELD) do { \ + if (!sc || (sc->SC_FIELD##End == 0)) { \ + mtls->end.DIM_FIELD = baseDim.DIM_FIELD; \ + } else { \ + mtls->start.DIM_FIELD = \ + rsMin(baseDim.DIM_FIELD, sc->SC_FIELD##Start); \ + mtls->end.DIM_FIELD = \ + rsMin(baseDim.DIM_FIELD, sc->SC_FIELD##End); \ + if (mtls->start.DIM_FIELD >= mtls->end.DIM_FIELD) { \ + mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, \ + "Failed to launch kernel; Invalid " \ + #SC_FIELD "Start or " #SC_FIELD "End."); \ + return false; \ + } \ + }} while(0) + + SET_UP_DIMENSION(x, x); + SET_UP_DIMENSION(y, y); + SET_UP_DIMENSION(z, z); + SET_UP_DIMENSION(array[0], array); + SET_UP_DIMENSION(array[1], array2); + SET_UP_DIMENSION(array[2], array3); + SET_UP_DIMENSION(array[3], array4); +#undef SET_UP_DIMENSION + + return true; +} + +// Preliminary work to prepare a reduce-style kernel for launch. +bool RsdCpuScriptImpl::reduceMtlsSetup(const Allocation *ain, + const Allocation *aout, + const RsScriptCall *sc, + MTLaunchStructReduce *mtls) { + rsAssert(ain && aout); + memset(mtls, 0, sizeof(MTLaunchStructReduce)); + mtls->dimPtr = &mtls->inputDim; + + if (allocationLODIsNull(ain) || allocationLODIsNull(aout)) { + mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, + "reduce called with a null allocation"); + return false; + } + + // Set up the dimensions of the input. + const Type *inType = ain->getType(); + mtls->inputDim.x = inType->getDimX(); + rsAssert(inType->getDimY() == 0); + + if (!setUpMtlsDimensions(mtls, mtls->inputDim, sc)) { + return false; + } + + mtls->rs = mCtx; + // Currently not threaded. + mtls->isThreadable = false; + mtls->mSliceNum = -1; + + // Set up input and output. + mtls->inBuf = static_cast(ain->getPointerUnchecked(0, 0)); + mtls->outBuf = static_cast(aout->getPointerUnchecked(0, 0)); + + rsAssert(mtls->inBuf && mtls->outBuf); + + return true; +} +// Preliminary work to prepare a forEach-style kernel for launch. bool RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains, uint32_t inLen, Allocation * aout, const void * usr, uint32_t usrLen, const RsScriptCall *sc, - MTLaunchStruct *mtls) { - - memset(mtls, 0, sizeof(MTLaunchStruct)); + MTLaunchStructForEach *mtls) { + memset(mtls, 0, sizeof(MTLaunchStructForEach)); + mtls->dimPtr = &mtls->fep.dim; for (int index = inLen; --index >= 0;) { - const Allocation* ain = ains[index]; - - // possible for this to occur if IO_OUTPUT/IO_INPUT with no bound surface - if (ain != nullptr && - (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr == nullptr) { - + if (allocationLODIsNull(ains[index])) { mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null in allocations"); return false; } } - if (aout && - (const uint8_t *)aout->mHal.drvState.lod[0].mallocPtr == nullptr) { - + if (allocationLODIsNull(aout)) { mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null out allocations"); return false; @@ -578,96 +659,14 @@ bool RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains, } } - if (!sc || (sc->xEnd == 0)) { - mtls->end.x = mtls->fep.dim.x; - } else { - mtls->start.x = rsMin(mtls->fep.dim.x, sc->xStart); - mtls->end.x = rsMin(mtls->fep.dim.x, sc->xEnd); - if (mtls->start.x >= mtls->end.x) { - mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, - "Failed to launch kernel; Invalid xStart or xEnd."); - return false; - } - } - - if (!sc || (sc->yEnd == 0)) { - mtls->end.y = mtls->fep.dim.y; - } else { - mtls->start.y = rsMin(mtls->fep.dim.y, sc->yStart); - mtls->end.y = rsMin(mtls->fep.dim.y, sc->yEnd); - if (mtls->start.y >= mtls->end.y) { - mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, - "Failed to launch kernel; Invalid yStart or yEnd."); - return false; - } - } - - if (!sc || (sc->zEnd == 0)) { - mtls->end.z = mtls->fep.dim.z; - } else { - mtls->start.z = rsMin(mtls->fep.dim.z, sc->zStart); - mtls->end.z = rsMin(mtls->fep.dim.z, sc->zEnd); - if (mtls->start.z >= mtls->end.z) { - mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, - "Failed to launch kernel; Invalid zStart or zEnd."); - return false; - } - } - - if (!sc || (sc->arrayEnd == 0)) { - mtls->end.array[0] = mtls->fep.dim.array[0]; - } else { - mtls->start.array[0] = rsMin(mtls->fep.dim.array[0], sc->arrayStart); - mtls->end.array[0] = rsMin(mtls->fep.dim.array[0], sc->arrayEnd); - if (mtls->start.array[0] >= mtls->end.array[0]) { - mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, - "Failed to launch kernel; Invalid arrayStart or arrayEnd."); - return false; - } - } - - if (!sc || (sc->array2End == 0)) { - mtls->end.array[1] = mtls->fep.dim.array[1]; - } else { - mtls->start.array[1] = rsMin(mtls->fep.dim.array[1], sc->array2Start); - mtls->end.array[1] = rsMin(mtls->fep.dim.array[1], sc->array2End); - if (mtls->start.array[1] >= mtls->end.array[1]) { - mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, - "Failed to launch kernel; Invalid array2Start or array2End."); - return false; - } - } - - if (!sc || (sc->array3End == 0)) { - mtls->end.array[2] = mtls->fep.dim.array[2]; - } else { - mtls->start.array[2] = rsMin(mtls->fep.dim.array[2], sc->array3Start); - mtls->end.array[2] = rsMin(mtls->fep.dim.array[2], sc->array3End); - if (mtls->start.array[2] >= mtls->end.array[2]) { - mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, - "Failed to launch kernel; Invalid array3Start or array3End."); - return false; - } - } - - if (!sc || (sc->array4End == 0)) { - mtls->end.array[3] = mtls->fep.dim.array[3]; - } else { - mtls->start.array[3] = rsMin(mtls->fep.dim.array[3], sc->array4Start); - mtls->end.array[3] = rsMin(mtls->fep.dim.array[3], sc->array4End); - if (mtls->start.array[3] >= mtls->end.array[3]) { - mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, - "Failed to launch kernel; Invalid array4Start or array4End."); - return false; - } + if (!setUpMtlsDimensions(mtls, mtls->fep.dim, sc)) { + return false; } - // The X & Y walkers always want 0-1 min even if dim is not present mtls->end.x = rsMax((uint32_t)1, mtls->end.x); mtls->end.y = rsMax((uint32_t)1, mtls->end.y); - - mtls->rsc = mCtx; + mtls->rs = mCtx; if (ains) { memcpy(mtls->ains, ains, inLen * sizeof(ains[0])); } @@ -705,18 +704,32 @@ void RsdCpuScriptImpl::invokeForEach(uint32_t slot, uint32_t usrLen, const RsScriptCall *sc) { - MTLaunchStruct mtls; + MTLaunchStructForEach mtls; if (forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls)) { forEachKernelSetup(slot, &mtls); RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this); - mCtx->launchThreads(ains, inLen, aout, sc, &mtls); + mCtx->launchForEach(ains, inLen, aout, sc, &mtls); mCtx->setTLS(oldTLS); } } -void RsdCpuScriptImpl::forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls) { +void RsdCpuScriptImpl::invokeReduce(uint32_t slot, + const Allocation *ain, + Allocation *aout, + const RsScriptCall *sc) { + MTLaunchStructReduce mtls; + + if (reduceMtlsSetup(ain, aout, sc, &mtls)) { + reduceKernelSetup(slot, &mtls); + RsdCpuScriptImpl *oldTLS = mCtx->setTLS(this); + mCtx->launchReduce(ain, aout, &mtls); + mCtx->setTLS(oldTLS); + } +} + +void RsdCpuScriptImpl::forEachKernelSetup(uint32_t slot, MTLaunchStructForEach *mtls) { mtls->script = this; mtls->fep.slot = slot; mtls->kernel = mScriptExec->getForEachFunction(slot); @@ -724,6 +737,12 @@ void RsdCpuScriptImpl::forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls) { mtls->sig = mScriptExec->getForEachSignature(slot); } +void RsdCpuScriptImpl::reduceKernelSetup(uint32_t slot, MTLaunchStructReduce *mtls) { + mtls->script = this; + mtls->kernel = mScriptExec->getReduceFunction(slot); + rsAssert(mtls->kernel != nullptr); +} + int RsdCpuScriptImpl::invokeRoot() { RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this); int ret = mRoot(); diff --git a/cpu_ref/rsCpuScript.h b/cpu_ref/rsCpuScript.h index 6059825a..248e5c73 100644 --- a/cpu_ref/rsCpuScript.h +++ b/cpu_ref/rsCpuScript.h @@ -37,18 +37,6 @@ class ScriptExecutable; class RsdCpuScriptImpl : public RsdCpuReferenceImpl::CpuScript { public: - typedef void (*outer_foreach_t)( - const RsExpandKernelDriverInfo *, - uint32_t x1, uint32_t x2, - uint32_t outstep); - - typedef void (* InvokeFunc_t)(void); - typedef void (* ForEachFunc_t)(void); - typedef int (* RootFunc_t)(void); -#ifdef RS_COMPATIBILITY_LIB - typedef void (*WorkerCallback_t)(void *usr, uint32_t idx); -#endif - bool init(char const *resName, char const *cacheDir, uint8_t const *bitcode, size_t bitcodeSize, uint32_t flags, char const *bccPluginName = nullptr); @@ -72,6 +60,11 @@ public: uint32_t usrLen, const RsScriptCall* sc) override; + void invokeReduce(uint32_t slot, + const Allocation* ain, + Allocation* aout, + const RsScriptCall* sc) override; + void invokeInit() override; void invokeFreeChildren() override; @@ -92,10 +85,15 @@ public: bool forEachMtlsSetup(const Allocation ** ains, uint32_t inLen, Allocation * aout, const void * usr, uint32_t usrLen, - const RsScriptCall *sc, MTLaunchStruct *mtls); + const RsScriptCall *sc, MTLaunchStructForEach *mtls); - virtual void forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls); + virtual void forEachKernelSetup(uint32_t slot, MTLaunchStructForEach *mtls); + // Build an MTLaunchStruct suitable for launching a reduce-style kernel. + bool reduceMtlsSetup(const Allocation *ain, const Allocation *aout, + const RsScriptCall *sc, MTLaunchStructReduce *mtls); + // Finalize an MTLaunchStruct for launching a reduce-style kernel. + virtual void reduceKernelSetup(uint32_t slot, MTLaunchStructReduce *mtls); const RsdCpuReference::CpuSymbol * lookupSymbolMath(const char *sym); static void * lookupRuntimeStub(void* pContext, char const* name); @@ -122,8 +120,8 @@ protected: RootFunc_t mRoot; RootFunc_t mRootExpand; - InvokeFunc_t mInit; - InvokeFunc_t mFreeChildren; + InitOrDtorFunc_t mInit; + InitOrDtorFunc_t mFreeChildren; ScriptExecutable* mScriptExec; Allocation **mBoundAllocs; @@ -135,6 +133,10 @@ public: const char* getBitcodeFilePath() const { return mBitcodeFilePath.string(); } private: + bool setUpMtlsDimensions(MTLaunchStructCommon *mtls, + const RsLaunchDimensions &baseDim, + const RsScriptCall *sc); + String8 mBitcodeFilePath; uint32_t mBuildChecksum; bool mChecksumNeeded; diff --git a/cpu_ref/rsCpuScriptGroup.cpp b/cpu_ref/rsCpuScriptGroup.cpp index 82208db9..9cc9b69d 100644 --- a/cpu_ref/rsCpuScriptGroup.cpp +++ b/cpu_ref/rsCpuScriptGroup.cpp @@ -203,7 +203,7 @@ void CpuScriptGroupImpl::execute() { } - MTLaunchStruct mtls; + MTLaunchStructForEach mtls; if (fieldDep) { for (size_t ct=0; ct < ins.size(); ct++) { @@ -230,7 +230,7 @@ void CpuScriptGroupImpl::execute() { mtls.fep.usrLen, nullptr); if (launchOK) { - mCtx->launchThreads(ains, inLen, outs[ct], nullptr, &mtls); + mCtx->launchForEach(ains, inLen, outs[ct], nullptr, &mtls); } si->postLaunch(slot, ains, inLen, outs[ct], nullptr, 0, nullptr); @@ -280,10 +280,10 @@ void CpuScriptGroupImpl::execute() { if (si->forEachMtlsSetup(ains, inLen, outs[0], nullptr, 0, nullptr, &mtls)) { mtls.script = nullptr; - mtls.kernel = (void (*)())&scriptGroupRoot; + mtls.kernel = &scriptGroupRoot; mtls.fep.usr = &sl; - mCtx->launchThreads(ains, inLen, outs[0], nullptr, &mtls); + mCtx->launchForEach(ains, inLen, outs[0], nullptr, &mtls); } for (size_t ct=0; ct < kernels.size(); ct++) { diff --git a/cpu_ref/rsCpuScriptGroup2.cpp b/cpu_ref/rsCpuScriptGroup2.cpp index 50b203d6..f0b657a2 100644 --- a/cpu_ref/rsCpuScriptGroup2.cpp +++ b/cpu_ref/rsCpuScriptGroup2.cpp @@ -165,7 +165,7 @@ CpuScriptGroup2Impl::CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl, RsdCpuScriptImpl* si = (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(funcID->mScript); if (closure->mIsKernel) { - MTLaunchStruct mtls; + MTLaunchStructForEach mtls; si->forEachKernelSetup(funcID->mSlot, &mtls); cc = new CPUClosure(closure, si, (ExpandFuncTy)mtls.kernel); } else { @@ -568,7 +568,7 @@ void Batch::run() { } if (mFunc != nullptr) { - MTLaunchStruct mtls; + MTLaunchStructForEach mtls; const CPUClosure* firstCpuClosure = mClosures.front(); const CPUClosure* lastCpuClosure = mClosures.back(); @@ -582,7 +582,7 @@ void Batch::run() { mtls.fep.usr = nullptr; mtls.kernel = (ForEachFunc_t)mFunc; - mGroup->getCpuRefImpl()->launchThreads( + mGroup->getCpuRefImpl()->launchForEach( (const Allocation**)firstCpuClosure->mClosure->mArgs, firstCpuClosure->mClosure->mNumArg, lastCpuClosure->mClosure->mReturnValue, @@ -603,7 +603,7 @@ void Batch::run() { const CPUClosure* cpuClosure = mClosures.front(); const Closure* closure = cpuClosure->mClosure; - MTLaunchStruct mtls; + MTLaunchStructForEach mtls; if (cpuClosure->mSi->forEachMtlsSetup((const Allocation**)closure->mArgs, closure->mNumArg, @@ -611,10 +611,10 @@ void Batch::run() { nullptr, 0, nullptr, &mtls)) { mtls.script = nullptr; - mtls.kernel = (void (*)())&groupRoot; + mtls.kernel = &groupRoot; mtls.fep.usr = &mClosures; - mGroup->getCpuRefImpl()->launchThreads(nullptr, 0, nullptr, nullptr, &mtls); + mGroup->getCpuRefImpl()->launchForEach(nullptr, 0, nullptr, nullptr, &mtls); } for (CPUClosure* cpuClosure : mClosures) { diff --git a/cpu_ref/rsd_cpu.h b/cpu_ref/rsd_cpu.h index 8e205d85..f2c7f19a 100644 --- a/cpu_ref/rsd_cpu.h +++ b/cpu_ref/rsd_cpu.h @@ -58,6 +58,11 @@ public: uint32_t usrLen, const RsScriptCall *sc) = 0; + virtual void invokeReduce(uint32_t slot, + const Allocation *ain, + Allocation *aout, + const RsScriptCall *sc) = 0; + virtual void invokeInit() = 0; virtual void invokeFreeChildren() = 0; diff --git a/driver/rsdBcc.cpp b/driver/rsdBcc.cpp index a57409d9..25659d86 100644 --- a/driver/rsdBcc.cpp +++ b/driver/rsdBcc.cpp @@ -124,6 +124,15 @@ void rsdScriptInvokeFunction(const Context *dc, Script *s, cs->invokeFunction(slot, params, paramLength); } +void rsdScriptInvokeReduce(const Context *dc, Script *s, + uint32_t slot, + const Allocation *ain, + Allocation *aout, + const RsScriptCall *sc) { + RsdCpuReference::CpuScript *cs = (RsdCpuReference::CpuScript *)s->mHal.drv; + cs->invokeReduce(slot, ain, aout, sc); +} + void rsdScriptSetGlobalVar(const Context *dc, const Script *s, uint32_t slot, void *data, size_t dataLength) { RsdCpuReference::CpuScript *cs = (RsdCpuReference::CpuScript *)s->mHal.drv; diff --git a/driver/rsdBcc.h b/driver/rsdBcc.h index d51fb80f..e95529b2 100644 --- a/driver/rsdBcc.h +++ b/driver/rsdBcc.h @@ -43,6 +43,13 @@ void rsdScriptInvokeForEach(const android::renderscript::Context *rsc, size_t usrLen, const RsScriptCall *sc); +void rsdScriptInvokeReduce(const android::renderscript::Context *rsc, + android::renderscript::Script *s, + uint32_t slot, + const android::renderscript::Allocation *ain, + android::renderscript::Allocation *aout, + const RsScriptCall *sc); + void rsdScriptInvokeForEachMulti(const android::renderscript::Context *rsc, android::renderscript::Script *s, uint32_t slot, diff --git a/driver/rsdCore.cpp b/driver/rsdCore.cpp index 1fcfcc2e..f70b79b6 100644 --- a/driver/rsdCore.cpp +++ b/driver/rsdCore.cpp @@ -99,6 +99,8 @@ extern "C" bool rsdHalQueryHal(RsHalInitEnums entry, void **fnPtr) { fnPtr[0] = (void *)rsdScriptInvokeForEachMulti; break; case RS_HAL_SCRIPT_UPDATE_CACHED_OBJECT: fnPtr[0] = (void *)rsdScriptUpdateCachedObject; break; + case RS_HAL_SCRIPT_INVOKE_REDUCE: + fnPtr[0] = (void *)rsdScriptInvokeReduce; break; case RS_HAL_ALLOCATION_INIT: fnPtr[0] = (void *)rsdAllocationInit; break; diff --git a/rsDriverLoader.cpp b/rsDriverLoader.cpp index 125a6df4..43e42949 100644 --- a/rsDriverLoader.cpp +++ b/rsDriverLoader.cpp @@ -70,6 +70,7 @@ static bool LoadHalTable(Context *rsc, HalQueryHal fn, bool loadGraphics) { ret &= fn(RS_HAL_SCRIPT_INVOKE_FUNCTION, (void **)&rsc->mHal.funcs.script.invokeFunction); ret &= fn(RS_HAL_SCRIPT_INVOKE_ROOT, (void **)&rsc->mHal.funcs.script.invokeRoot); ret &= fn(RS_HAL_SCRIPT_INVOKE_FOR_EACH, (void **)&rsc->mHal.funcs.script.invokeForEach); + ret &= fn(RS_HAL_SCRIPT_INVOKE_REDUCE, (void **)&rsc->mHal.funcs.script.invokeReduce); ret &= fn(RS_HAL_SCRIPT_INVOKE_INIT, (void **)&rsc->mHal.funcs.script.invokeInit); ret &= fn(RS_HAL_SCRIPT_INVOKE_FREE_CHILDREN, (void **)&rsc->mHal.funcs.script.invokeFreeChildren); ret &= fn(RS_HAL_SCRIPT_SET_GLOBAL_VAR, (void **)&rsc->mHal.funcs.script.setGlobalVar); diff --git a/rsScript.cpp b/rsScript.cpp index 483789cb..bc242921 100644 --- a/rsScript.cpp +++ b/rsScript.cpp @@ -227,7 +227,9 @@ void rsi_ScriptForEach(Context *rsc, RsScript vs, uint32_t slot, void rsi_ScriptReduce(Context *rsc, RsScript vs, uint32_t slot, RsAllocation vain, RsAllocation vaout, const RsScriptCall *sc, size_t scLen) { - // TODO(wala) + Script *s = static_cast