diff options
-rw-r--r-- | cpu_ref/rsCpuCore.cpp | 239 | ||||
-rw-r--r-- | cpu_ref/rsCpuCore.h | 59 | ||||
-rw-r--r-- | cpu_ref/rsCpuExecutable.cpp | 17 | ||||
-rw-r--r-- | cpu_ref/rsCpuScript.cpp | 7 | ||||
-rw-r--r-- | cpu_ref/rsd_cpu.h | 2 | ||||
-rw-r--r-- | driver/rsdRuntimeStubs.cpp | 2 | ||||
-rw-r--r-- | java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce.java | 34 | ||||
-rw-r--r-- | java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce_backward.java | 34 | ||||
-rw-r--r-- | java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce.rs | 18 | ||||
-rw-r--r-- | java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce_backward.rs | 18 | ||||
-rw-r--r-- | java/tests/RsTest/AndroidManifest.xml | 1 | ||||
-rw-r--r-- | java/tests/RsTest/src/com/android/rs/test/UT_reduce.java | 671 | ||||
-rw-r--r-- | java/tests/RsTest/src/com/android/rs/test/UT_reduce_backward.java | 34 | ||||
-rw-r--r-- | java/tests/RsTest/src/com/android/rs/test/reduce.rs | 39 | ||||
-rw-r--r-- | java/tests/RsTest/src/com/android/rs/test/reduce_backward.rs | 18 | ||||
-rw-r--r-- | rsContext.cpp | 2 | ||||
-rw-r--r-- | rsContext.h | 2 |
17 files changed, 930 insertions, 267 deletions
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp index b8b48387..9f9c429b 100644 --- a/cpu_ref/rsCpuCore.cpp +++ b/cpu_ref/rsCpuCore.cpp @@ -45,6 +45,8 @@ static pid_t gettid() { using namespace android; using namespace android::renderscript; +#define REDUCE_NEW_ALOGV(...) /* ALOGV(__VA_ARGS__) */ + static pthread_key_t gThreadTLSKey = 0; static uint32_t gThreadTLSKeyCount = 0; static pthread_mutex_t gInitMutex = PTHREAD_MUTEX_INITIALIZER; @@ -98,7 +100,7 @@ RsdCpuReferenceImpl::RsdCpuReferenceImpl(Context *rsc) { version_major = 0; version_minor = 0; - mInForEach = false; + mInKernel = false; memset(&mWorkers, 0, sizeof(mWorkers)); memset(&mTlsStruct, 0, sizeof(mTlsStruct)); mExit = false; @@ -239,6 +241,9 @@ bool RsdCpuReferenceImpl::init(uint32_t version_major, uint32_t version_minor, ALOGE("pthread_setspecific %i", status); } + mPageSize = sysconf(_SC_PAGE_SIZE); + REDUCE_NEW_ALOGV("page size = %ld", mPageSize); + GetCpuInfo(); int cpu = sysconf(_SC_NPROCESSORS_CONF); @@ -435,7 +440,7 @@ static void walk_2d(void *usr, uint32_t idx) { } } -static void walk_1d(void *usr, uint32_t idx) { +static void walk_1d_foreach(void *usr, uint32_t idx) { MTLaunchStructForEach *mtls = (MTLaunchStructForEach *)usr; RsExpandKernelDriverInfo fep = mtls->fep; fep.lid = idx; @@ -458,6 +463,103 @@ static void walk_1d(void *usr, uint32_t idx) { } } +// The function format_bytes() is an auxiliary function to assist in logging. +// +// Bytes are read from an input (inBuf) and written (as pairs of hex digits) +// to an output (outBuf). +// +// Output format: +// - starts with ": " +// - each input byte is translated to a pair of hex digits +// - bytes are separated by "." except that every fourth separator is "|" +// - if the input is sufficiently long, the output is truncated and terminated with "..." +// +// Arguments: +// - outBuf -- Pointer to buffer of type "FormatBuf" into which output is written +// - inBuf -- Pointer to bytes which are to be formatted into outBuf +// - inBytes -- Number of bytes in inBuf +// +// Constant: +// - kFormatInBytesMax -- Only min(kFormatInBytesMax, inBytes) bytes will be read +// from inBuf +// +// Return value: +// - pointer (const char *) to output (which is part of outBuf) +// +static const int kFormatInBytesMax = 16; +// ": " + 2 digits per byte + 1 separator between bytes + "..." + null +typedef char FormatBuf[2 + kFormatInBytesMax*2 + (kFormatInBytesMax - 1) + 3 + 1]; +static const char *format_bytes(FormatBuf *outBuf, const uint8_t *inBuf, const int inBytes) { + strcpy(*outBuf, ": "); + int pos = 2; + const int lim = std::min(kFormatInBytesMax, inBytes); + for (int i = 0; i < lim; ++i) { + if (i) { + sprintf(*outBuf + pos, (i % 4 ? "." : "|")); + ++pos; + } + sprintf(*outBuf + pos, "%02x", inBuf[i]); + pos += 2; + } + if (kFormatInBytesMax < inBytes) + strcpy(*outBuf + pos, "..."); + return *outBuf; +} + +static void walk_1d_reduce_new(void *usr, uint32_t idx) { + const MTLaunchStructReduceNew *mtls = (const MTLaunchStructReduceNew *)usr; + RsExpandKernelDriverInfo redp = mtls->redp; + + // find accumulator + uint8_t *&accumPtr = mtls->accumPtr[idx]; + if (!accumPtr) { + uint32_t accumIdx = (uint32_t)__sync_fetch_and_add(&mtls->accumCount, 1); + if (mtls->outFunc) { + accumPtr = mtls->accumAlloc + mtls->accumStride * accumIdx; + } else { + if (accumIdx == 0) { + accumPtr = mtls->redp.outPtr[0]; + } else { + accumPtr = mtls->accumAlloc + mtls->accumStride * (accumIdx - 1); + } + } + REDUCE_NEW_ALOGV("walk_1d_reduce_new(%p): idx = %u got accumCount %u and accumPtr %p", + mtls->accumFunc, idx, accumIdx, accumPtr); + // initialize accumulator + if (mtls->initFunc) { + mtls->initFunc(accumPtr); + } else { + memset(accumPtr, 0, mtls->accumSize); + } + } + + // accumulate + const ReduceNewAccumulatorFunc_t fn = mtls->accumFunc; + while (1) { + uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1); + uint32_t xStart = mtls->start.x + slice * mtls->mSliceSize; + uint32_t xEnd = xStart + mtls->mSliceSize; + + xEnd = rsMin(xEnd, mtls->end.x); + + if (xEnd <= xStart) { + return; + } + + RedpPtrSetup(mtls, &redp, xStart, 0, 0); + fn(&redp, xStart, xEnd, accumPtr); + + FormatBuf fmt; + if (mtls->logReduceAccum) { + format_bytes(&fmt, accumPtr, mtls->accumSize); + } else { + fmt[0] = 0; + } + REDUCE_NEW_ALOGV("walk_1d_reduce_new(%p): idx = %u [%u, %u)%s", + mtls->accumFunc, idx, xStart, xEnd, fmt); + } +} + // Launch a simple reduce-style kernel. // Inputs: // ain: The allocation that contains the input @@ -486,6 +588,25 @@ void RsdCpuReferenceImpl::launchReduceNew(const Allocation ** ains, uint32_t inLen, Allocation * aout, MTLaunchStructReduceNew *mtls) { + mtls->logReduceAccum = mRSC->props.mLogReduceAccum; + if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInKernel) { + launchReduceNewParallel(ains, inLen, aout, mtls); + } else { + launchReduceNewSerial(ains, inLen, aout, mtls); + } +} + +// Launch a general reduce-style kernel, single-threaded. +// Inputs: +// ains[0..inLen-1]: Array of allocations that contain the inputs +// aout: The allocation that will hold the output +// mtls: Holds launch parameters +void RsdCpuReferenceImpl::launchReduceNewSerial(const Allocation ** ains, + uint32_t inLen, + Allocation * aout, + MTLaunchStructReduceNew *mtls) { + ALOGV("launchReduceNewSerial(%p)", mtls->accumFunc); + // In the presence of outconverter, we allocate temporary memory for // the accumulator. // @@ -521,6 +642,112 @@ void RsdCpuReferenceImpl::launchReduceNew(const Allocation ** ains, } } +// Launch a general reduce-style kernel, multi-threaded. +// Inputs: +// ains[0..inLen-1]: Array of allocations that contain the inputs +// aout: The allocation that will hold the output +// mtls: Holds launch parameters +void RsdCpuReferenceImpl::launchReduceNewParallel(const Allocation ** ains, + uint32_t inLen, + Allocation * aout, + MTLaunchStructReduceNew *mtls) { + // For now, we don't know how to go parallel beyond 1D, or in the absence of a combiner. + if ((mtls->redp.dim.y > 1) || (mtls->redp.dim.z > 1) || !mtls->combFunc) { + launchReduceNewSerial(ains, inLen, aout, mtls); + return; + } + + // Number of threads = "main thread" + number of other (worker) threads + const uint32_t numThreads = mWorkers.mCount + 1; + + // In the absence of outconverter, we use the output allocation as + // an accumulator, and therefore need to allocate one fewer accumulator. + const uint32_t numAllocAccum = numThreads - (mtls->outFunc == nullptr); + + // If mDebugReduceSplitAccum, then we want each accumulator to start + // on a page boundary. (TODO: Would some unit smaller than a page + // be sufficient to avoid false sharing?) + if (mRSC->props.mDebugReduceSplitAccum) { + // Round up accumulator size to an integral number of pages + mtls->accumStride = + (unsigned(mtls->accumSize) + unsigned(mPageSize)-1) & + ~(unsigned(mPageSize)-1); + // Each accumulator gets its own page. Alternatively, if we just + // wanted to make sure no two accumulators are on the same page, + // we could instead do + // allocSize = mtls->accumStride * (numAllocation - 1) + mtls->accumSize + const size_t allocSize = mtls->accumStride * numAllocAccum; + mtls->accumAlloc = static_cast<uint8_t *>(memalign(mPageSize, allocSize)); + } else { + mtls->accumStride = mtls->accumSize; + mtls->accumAlloc = static_cast<uint8_t *>(malloc(mtls->accumStride * numAllocAccum)); + } + + const size_t accumPtrArrayBytes = sizeof(uint8_t *) * numThreads; + mtls->accumPtr = static_cast<uint8_t **>(malloc(accumPtrArrayBytes)); + memset(mtls->accumPtr, 0, accumPtrArrayBytes); + + mtls->accumCount = 0; + + rsAssert(!mInKernel); + mInKernel = true; + mtls->mSliceSize = rsMax(1U, mtls->redp.dim.x / (numThreads * 4)); + ALOGV("launchReduceNewParallel(%p): %u threads, accumAlloc = %p", + mtls->accumFunc, numThreads, mtls->accumAlloc); + launchThreads(walk_1d_reduce_new, mtls); + mInKernel = false; + + // Combine accumulators and identify final accumulator + uint8_t *finalAccumPtr = (mtls->outFunc ? nullptr : mtls->redp.outPtr[0]); + // Loop over accumulators, combining into finalAccumPtr. If finalAccumPtr + // is null, then the first accumulator I find becomes finalAccumPtr. + for (unsigned idx = 0; idx < mtls->accumCount; ++idx) { + uint8_t *const thisAccumPtr = mtls->accumPtr[idx]; + if (finalAccumPtr) { + if (finalAccumPtr != thisAccumPtr) { + if (mtls->combFunc) { + if (mtls->logReduceAccum) { + FormatBuf fmt; + REDUCE_NEW_ALOGV("launchReduceNewParallel(%p): accumulating into%s", + mtls->accumFunc, + format_bytes(&fmt, finalAccumPtr, mtls->accumSize)); + REDUCE_NEW_ALOGV("launchReduceNewParallel(%p): accumulator[%d]%s", + mtls->accumFunc, idx, + format_bytes(&fmt, thisAccumPtr, mtls->accumSize)); + } + mtls->combFunc(finalAccumPtr, thisAccumPtr); + } else { + rsAssert(!"expected combiner"); + } + } + } else { + finalAccumPtr = thisAccumPtr; + } + } + rsAssert(finalAccumPtr != nullptr); + if (mtls->logReduceAccum) { + FormatBuf fmt; + REDUCE_NEW_ALOGV("launchReduceNewParallel(%p): final accumulator%s", + mtls->accumFunc, format_bytes(&fmt, finalAccumPtr, mtls->accumSize)); + } + + // Outconvert + if (mtls->outFunc) { + mtls->outFunc(mtls->redp.outPtr[0], finalAccumPtr); + if (mtls->logReduceAccum) { + FormatBuf fmt; + REDUCE_NEW_ALOGV("launchReduceNewParallel(%p): final outconverted result%s", + mtls->accumFunc, + format_bytes(&fmt, mtls->redp.outPtr[0], mtls->redp.outStride[0])); + } + } + + // Clean up + free(mtls->accumPtr); + free(mtls->accumAlloc); +} + + void RsdCpuReferenceImpl::launchForEach(const Allocation ** ains, uint32_t inLen, Allocation* aout, @@ -537,9 +764,9 @@ void RsdCpuReferenceImpl::launchForEach(const Allocation ** ains, (mtls->start.array[2] != mtls->end.array[2]) || (mtls->start.array[3] != mtls->end.array[3]); - if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) { + if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInKernel) { const size_t targetByteChunk = 16 * 1024; - mInForEach = true; + mInKernel = true; // NOTE: The guard immediately above ensures this was !mInKernel if (outerDims) { // No fancy logic for chunk size @@ -588,9 +815,9 @@ void RsdCpuReferenceImpl::launchForEach(const Allocation ** ains, mtls->mSliceSize = 1; } - launchThreads(walk_1d, mtls); + launchThreads(walk_1d_foreach, mtls); } - mInForEach = false; + mInKernel = false; } else { ForEachFunc_t fn = mtls->kernel; diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h index 939b7ae2..c2a08640 100644 --- a/cpu_ref/rsCpuCore.h +++ b/cpu_ref/rsCpuCore.h @@ -34,6 +34,7 @@ extern bool gArchUseSIMD; // Function types found in RenderScript code typedef void (*ReduceFunc_t)(const uint8_t *inBuf, uint8_t *outBuf, uint32_t len); typedef void (*ReduceNewAccumulatorFunc_t)(const RsExpandKernelDriverInfo *info, uint32_t x1, uint32_t x2, uint8_t *accum); +typedef void (*ReduceNewCombinerFunc_t)(uint8_t *accum, const uint8_t *other); typedef void (*ReduceNewInitializerFunc_t)(uint8_t *accum); typedef void (*ReduceNewOutConverterFunc_t)(uint8_t *out, const uint8_t *accum); typedef void (*ForEachFunc_t)(const RsExpandKernelDriverInfo *info, uint32_t x1, uint32_t x2, uint32_t outStride); @@ -44,6 +45,7 @@ typedef int (*RootFunc_t)(void); struct ReduceNewDescription { ReduceNewAccumulatorFunc_t accumFunc; // expanded accumulator function ReduceNewInitializerFunc_t initFunc; // user initializer function + ReduceNewCombinerFunc_t combFunc; // user combiner function ReduceNewOutConverterFunc_t outFunc; // user outconverter function size_t accumSize; // accumulator datum size, in bytes }; @@ -73,7 +75,8 @@ struct MTLaunchStructCommon { RsLaunchDimensions start; RsLaunchDimensions end; // Points to MTLaunchStructForEach::fep::dim or - // MTLaunchStructReduce::inputDim. + // MTLaunchStructReduce::inputDim or + // MTLaunchStructReduceNew::redp::dim. RsLaunchDimensions *dimPtr; }; @@ -101,9 +104,51 @@ struct MTLaunchStructReduceNew : public MTLaunchStructCommon { ReduceNewAccumulatorFunc_t accumFunc; ReduceNewInitializerFunc_t initFunc; + ReduceNewCombinerFunc_t combFunc; ReduceNewOutConverterFunc_t outFunc; size_t accumSize; // accumulator datum size in bytes + + size_t accumStride; // stride between accumulators in accumAlloc (below) + + // These fields are used for managing accumulator data items in a + // multithreaded execution. + // + // Let the number of threads be N. + // Let Outc be true iff there is an outconverter. + // + // accumAlloc is a pointer to a single allocation of (N - !Outc) + // accumulators. (If there is no outconverter, then the output + // allocation acts as an accumulator.) It is created at kernel + // launch time. Within that allocation, the distance between the + // start of adjacent accumulators is accumStride bytes -- this + // might be the same as accumSize, or it might be larger, if we + // are attempting to avoid false sharing. + // + // accumCount is an atomic counter of how many accumulators have + // been grabbed by threads. It is initialized to zero at kernel + // launch time. See accumPtr for further description. + // + // accumPtr is pointer to an array of N pointers to accumulators. + // The array is created at kernel launch time, and each element is + // initialized to nullptr. When a particular thread goes to work, + // that thread obtains its accumulator from its entry in this + // array. If the entry is nullptr, that thread needs to obtain an + // accumulator, and initialize its entry in the array accordingly. + // It does so via atomic access (fetch-and-add) to accumCount. + // - If Outc, then the fetched value is used as an index into + // accumAlloc. + // - If !Outc, then + // - If the fetched value is zero, then this thread gets the + // output allocation for its accumulator. + // - If the fetched value is nonzero, then (fetched value - 1) + // is used as an index into accumAlloc. + uint8_t *accumAlloc; + uint8_t **accumPtr; + uint32_t accumCount; + + // Logging control + bool logReduceAccum; }; class RsdCpuReferenceImpl : public RsdCpuReference { @@ -161,7 +206,7 @@ public: virtual const char *getBccPluginName() const { return mBccPluginName.string(); } - bool getInForEach() override { return mInForEach; } + bool getInKernel() override { return mInKernel; } // Set to true if we should embed global variable information in the code. void setEmbedGlobalInfo(bool v) override { @@ -190,7 +235,7 @@ protected: uint32_t version_major; uint32_t version_minor; //bool mHasGraphics; - bool mInForEach; + bool mInKernel; // Is a parallel kernel execution underway? struct Workers { volatile int mRunningCount; @@ -222,6 +267,14 @@ protected: // when potentially embedding information about globals. // Defaults to true. bool mEmbedGlobalInfoSkipConstant; + + long mPageSize; + + // Launch a general reduce kernel + void launchReduceNewSerial(const Allocation ** ains, uint32_t inLen, Allocation *aout, + MTLaunchStructReduceNew *mtls); + void launchReduceNewParallel(const Allocation ** ains, uint32_t inLen, Allocation *aout, + MTLaunchStructReduceNew *mtls); }; diff --git a/cpu_ref/rsCpuExecutable.cpp b/cpu_ref/rsCpuExecutable.cpp index 34a6b20c..9d6e6236 100644 --- a/cpu_ref/rsCpuExecutable.cpp +++ b/cpu_ref/rsCpuExecutable.cpp @@ -538,8 +538,8 @@ ScriptExecutable* ScriptExecutable::createFromSharedObject( goto error; } - // The current implementation does not use the signature, - // reduce name, or combiner. + // The current implementation does not use the signature + // or reduce name. reduceNewDescriptions[i].accumSize = tmpSize; @@ -565,6 +565,19 @@ ScriptExecutable* ScriptExecutable::createFromSharedObject( goto error; } + // Process the (optional) combiner. + if (strcmp(tmpNameCombiner, kNoName)) { + // Lookup the original user-written combiner. + if (!(reduceNewDescriptions[i].combFunc = + (ReduceNewCombinerFunc_t) dlsym(sharedObj, tmpNameCombiner))) { + ALOGE("Failed to find combiner function address for %s(): %s", + tmpNameCombiner, dlerror()); + goto error; + } + } else { + reduceNewDescriptions[i].combFunc = nullptr; + } + // Process the (optional) outconverter. if (strcmp(tmpNameOutConverter, kNoName)) { // Lookup the original user-written outconverter. diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp index ef738d72..a88af2fe 100644 --- a/cpu_ref/rsCpuScript.cpp +++ b/cpu_ref/rsCpuScript.cpp @@ -645,9 +645,9 @@ bool RsdCpuScriptImpl::reduceNewMtlsSetup(const Allocation ** ains, mtls->rs = mCtx; - // Currently not threaded. - mtls->isThreadable = false; - mtls->mSliceNum = -1; + mtls->mSliceNum = 0; + mtls->mSliceSize = 1; + mtls->isThreadable = mIsThreadable; // Set up output, mtls->redp.outLen = 1; @@ -842,6 +842,7 @@ void RsdCpuScriptImpl::reduceNewKernelSetup(uint32_t slot, MTLaunchStructReduceN const ReduceNewDescription *desc = mScriptExec->getReduceNewDescription(slot); mtls->accumFunc = desc->accumFunc; mtls->initFunc = desc->initFunc; // might legally be nullptr + mtls->combFunc = desc->combFunc; // might legally be nullptr mtls->outFunc = desc->outFunc; // might legally be nullptr mtls->accumSize = desc->accumSize; diff --git a/cpu_ref/rsd_cpu.h b/cpu_ref/rsd_cpu.h index 49a999db..e226b934 100644 --- a/cpu_ref/rsd_cpu.h +++ b/cpu_ref/rsd_cpu.h @@ -131,7 +131,7 @@ public: uint32_t flags) = 0; virtual CpuScript * createIntrinsic(const Script *s, RsScriptIntrinsicID iid, Element *e) = 0; virtual void* createScriptGroup(const ScriptGroupBase *sg) = 0; - virtual bool getInForEach() = 0; + virtual bool getInKernel() = 0; // Is a parallel kernel execution underway? // Set to true if we should embed global variable information in the code. virtual void setEmbedGlobalInfo(bool v) = 0; diff --git a/driver/rsdRuntimeStubs.cpp b/driver/rsdRuntimeStubs.cpp index c7b88962..10775407 100644 --- a/driver/rsdRuntimeStubs.cpp +++ b/driver/rsdRuntimeStubs.cpp @@ -136,7 +136,7 @@ static bool failIfInKernel(Context *rsc, const char *funcName) { RsdHal *dc = (RsdHal *)rsc->mHal.drv; RsdCpuReference *impl = (RsdCpuReference *) dc->mCpuRef; - if (impl->getInForEach()) { + if (impl->getInKernel()) { char buf[256]; snprintf(buf, sizeof(buf), "Error: Call to unsupported function %s " "in kernel", funcName); diff --git a/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce.java b/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce.java index 608de473..c1e9c408 100644 --- a/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce.java +++ b/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce.java @@ -119,39 +119,6 @@ public class UT_reduce extends UnitTest { /////////////////////////////////////////////////////////////////// - private float dp(float[] input1, float[] input2) { - _RS_ASSERT("dp input length mismatch", input1.length == input2.length); - - float rslt = 0; - for (int idx = 0; idx < input1.length; ++idx) - rslt += input1[idx] * input2[idx]; - return rslt; - } - - private boolean dp(RenderScript RS, ScriptC_reduce s) { - final float[] input1 = createInputArrayFloat(100000, 2); - final float[] input2 = createInputArrayFloat(100000, 3); - - final float javaRslt = dp(input1, input2); - final float rsRslt = s.reduce_dp(input1, input2).get(); - - // NOTE: Using a floating point equality check to test for - // correctness -- as we do below -- is a bad idea. It's only - // reliable if the Java and RenderScript implementation of dp - // use the same algorithm. Equality could be broken by - // different optimizations between the two, or running the - // RenderScript algorithm multithreaded, or running the - // RenderScript algorithm on a GPU rather than the CPU. - // - // Should we be checking instead that the results are - // "sufficiently close"? Cooking the input set to try to - // ensure a deterministic result? Changing to integers - // instead? - return result("dp", javaRslt, rsRslt); - } - - /////////////////////////////////////////////////////////////////// - private Int2 findMinAndMax(float[] input) { float minVal = Float.POSITIVE_INFINITY; int minIdx = -1; @@ -322,7 +289,6 @@ public class UT_reduce extends UnitTest { boolean pass = true; pass &= addint1D(pRS, s); pass &= addint2D(pRS, s); - pass &= dp(pRS, s); pass &= findMinAndMax(pRS, s); pass &= fz(pRS, s); pass &= fz2(pRS, s); diff --git a/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce_backward.java b/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce_backward.java index 84d2c505..b998f518 100644 --- a/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce_backward.java +++ b/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce_backward.java @@ -119,39 +119,6 @@ public class UT_reduce_backward extends UnitTest { /////////////////////////////////////////////////////////////////// - private float dp(float[] input1, float[] input2) { - _RS_ASSERT("dp input length mismatch", input1.length == input2.length); - - float rslt = 0; - for (int idx = 0; idx < input1.length; ++idx) - rslt += input1[idx] * input2[idx]; - return rslt; - } - - private boolean dp(RenderScript RS, ScriptC_reduce_backward s) { - final float[] input1 = createInputArrayFloat(100000, 2); - final float[] input2 = createInputArrayFloat(100000, 3); - - final float javaRslt = dp(input1, input2); - final float rsRslt = s.reduce_dp(input1, input2).get(); - - // NOTE: Using a floating point equality check to test for - // correctness -- as we do below -- is a bad idea. It's only - // reliable if the Java and RenderScript implementation of dp - // use the same algorithm. Equality could be broken by - // different optimizations between the two, or running the - // RenderScript algorithm multithreaded, or running the - // RenderScript algorithm on a GPU rather than the CPU. - // - // Should we be checking instead that the results are - // "sufficiently close"? Cooking the input set to try to - // ensure a deterministic result? Changing to integers - // instead? - return result("dp", javaRslt, rsRslt); - } - - /////////////////////////////////////////////////////////////////// - private Int2 findMinAndMax(float[] input) { float minVal = Float.POSITIVE_INFINITY; int minIdx = -1; @@ -322,7 +289,6 @@ public class UT_reduce_backward extends UnitTest { boolean pass = true; pass &= addint1D(pRS, s); pass &= addint2D(pRS, s); - pass &= dp(pRS, s); pass &= findMinAndMax(pRS, s); pass &= fz(pRS, s); pass &= fz2(pRS, s); diff --git a/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce.rs b/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce.rs index be09dfb6..ec7be8b7 100644 --- a/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce.rs +++ b/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce.rs @@ -16,18 +16,6 @@ static void aiAccum(int *accum, int val) { *accum += val; } ///////////////////////////////////////////////////////////////////////// -#pragma rs reduce(dp) \ - accumulator(dpAccum) combiner(dpSum) - -static void dpAccum(float *accum, float in1, float in2) { - *accum += in1*in2; -} - -// combiner function -static void dpSum(float *accum, const float *val) { *accum += *val; } - -///////////////////////////////////////////////////////////////////////// - #pragma rs reduce(findMinAndMax) \ initializer(fMMInit) accumulator(fMMAccumulator) \ combiner(fMMCombiner) outconverter(fMMOutConverter) @@ -61,8 +49,10 @@ static void fMMAccumulator(MinAndMax *accum, float in, int x) { static void fMMCombiner(MinAndMax *accum, const MinAndMax *val) { - fMMAccumulator(accum, val->min.val, val->min.idx); - fMMAccumulator(accum, val->max.val, val->max.idx); + if (val->min.val < accum->min.val) + accum->min = val->min; + if (val->max.val > accum->max.val) + accum->max = val->max; } static void fMMOutConverter(int2 *result, diff --git a/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce_backward.rs b/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce_backward.rs index 419e7090..41252c8a 100644 --- a/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce_backward.rs +++ b/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce_backward.rs @@ -15,18 +15,6 @@ static void aiAccum(int *accum, int val) { *accum += val; } ///////////////////////////////////////////////////////////////////////// -static void dpAccum(float *accum, float in1, float in2) { - *accum += in1*in2; -} - -// combiner function -static void dpSum(float *accum, const float *val) { *accum += *val; } - -#pragma rs reduce(dp) \ - accumulator(dpAccum) combiner(dpSum) - -///////////////////////////////////////////////////////////////////////// - typedef struct { float val; int idx; @@ -56,8 +44,10 @@ static void fMMAccumulator(MinAndMax *accum, float in, int x) { static void fMMCombiner(MinAndMax *accum, const MinAndMax *val) { - fMMAccumulator(accum, val->min.val, val->min.idx); - fMMAccumulator(accum, val->max.val, val->max.idx); + if (val->min.val < accum->min.val) + accum->min = val->min; + if (val->max.val > accum->max.val) + accum->max = val->max; } static void fMMOutConverter(int2 *result, diff --git a/java/tests/RsTest/AndroidManifest.xml b/java/tests/RsTest/AndroidManifest.xml index b660398d..31da896a 100644 --- a/java/tests/RsTest/AndroidManifest.xml +++ b/java/tests/RsTest/AndroidManifest.xml @@ -2,6 +2,7 @@ <manifest xmlns:android="http://schemas.android.com/apk/res/android" package="com.android.rs.test"> <application + android:largeHeap="true" android:label="_RS_Test" android:icon="@drawable/test_pattern"> <activity android:name="RSTest" diff --git a/java/tests/RsTest/src/com/android/rs/test/UT_reduce.java b/java/tests/RsTest/src/com/android/rs/test/UT_reduce.java index a244646c..07692595 100644 --- a/java/tests/RsTest/src/com/android/rs/test/UT_reduce.java +++ b/java/tests/RsTest/src/com/android/rs/test/UT_reduce.java @@ -27,6 +27,7 @@ import android.content.res.Resources; import android.renderscript.*; import android.util.Log; import java.lang.Float; +import java.util.Arrays; import java.util.Random; public class UT_reduce extends UnitTest { @@ -36,6 +37,81 @@ public class UT_reduce extends UnitTest { super(rstc, "reduce", ctx); } + private static class timing { + timing(long myJavaStart, long myJavaEnd, long myRsStart, + long myCopyStart, long myKernelStart, long myRsEnd, + Allocation... myInputs) { + javaStart = myJavaStart; + javaEnd = myJavaEnd; + rsStart = myRsStart; + copyStart = myCopyStart; + kernelStart = myKernelStart; + rsEnd = myRsEnd; + + inputBytes = 0; + for (Allocation input : myInputs) + inputBytes += input.getBytesSize(); + + inputCells = (myInputs.length > 0) ? myInputs[0].getType().getCount() : 0; + } + + timing(long myInputCells) { + inputCells = myInputCells; + } + + private long javaStart = -1; + private long javaEnd = -1; + private long rsStart = -1; + private long copyStart = -1; + private long kernelStart = -1; + private long rsEnd = -1; + private long inputBytes = -1; + private long inputCells = -1; + + public long javaTime() { return javaEnd - javaStart; } + public long rsTime() { return rsEnd - rsStart; } + public long kernelTime() { return rsEnd - kernelStart; } + public long overheadTime() { return kernelStart - rsStart; } + public long allocationTime() { return copyStart - rsStart; } + public long copyTime() { return kernelStart - copyStart; } + + public static String string(long myJavaStart, long myJavaEnd, long myRsStart, + long myCopyStart, long myKernelStart, long myRsEnd, + Allocation... myInputs) { + return (new timing(myJavaStart, myJavaEnd, myRsStart, + myCopyStart, myKernelStart, myRsEnd, myInputs)).string(); + } + + public static String string(long myInputCells) { + return (new timing(myInputCells)).string(); + } + + public String string() { + String result; + if (javaStart >= 0) { + result = "(java " + javaTime() + "ms, rs " + rsTime() + "ms = overhead " + + overheadTime() + "ms (alloc " + allocationTime() + "ms + copy " + + copyTime() + "ms) + kernel+get() " + kernelTime() + "ms)"; + if (inputCells > 0) + result += " "; + } else { + result = ""; + } + if (inputCells > 0) { + result += "(" + fmt.format(inputCells) + " cells"; + if (inputBytes > 0) + result += ", " + fmt.format(inputBytes) + " bytes"; + result += ")"; + } + return result; + } + + private static java.text.DecimalFormat fmt; + static { + fmt = new java.text.DecimalFormat("###,###"); + } + }; + private byte[] createInputArrayByte(int len, int seed) { byte[] array = new byte[len]; (new Random(seed)).nextBytes(array); @@ -66,21 +142,60 @@ public class UT_reduce extends UnitTest { return array; } - private <T extends Number> boolean result(String testName, T javaRslt, T rsRslt) { + private <T extends Number> boolean result(String testName, final timing t, + T javaRslt, T rsRslt) { final boolean success = javaRslt.equals(rsRslt); + String status = (success ? "PASSED" : "FAILED"); + if (success && (t != null)) + status += " " + t.string(); + Log.i(TAG, testName + ": java " + javaRslt + ", rs " + rsRslt + ": " + status); + return success; + } + + private boolean result(String testName, final timing t, + final long[] javaRslt, final long[] rsRslt) { + if (javaRslt.length != rsRslt.length) { + Log.i(TAG, testName + ": java length " + javaRslt.length + + ", rs length " + rsRslt.length + ": FAILED"); + return false; + } + for (int i = 0; i < javaRslt.length; ++i) { + if (javaRslt[i] != rsRslt[i]) { + Log.i(TAG, testName + "[" + i + "]: java " + javaRslt[i] + + ", rs " + rsRslt[i] + ": FAILED"); + return false; + } + } + String status = "PASSED"; + if (t != null) + status += " " + t.string(); + Log.i(TAG, testName + ": " + status); + return true; + } + + private boolean result(String testName, final timing t, Int2 javaRslt, Int2 rsRslt) { + final boolean success = (javaRslt.x == rsRslt.x) && (javaRslt.y == rsRslt.y); + String status = (success ? "PASSED" : "FAILED"); + if (success && (t != null)) + status += " " + t.string(); Log.i(TAG, - testName + ": java " + javaRslt + ", rs " + rsRslt + ": " + - (success ? "PASSED" : "FAILED")); + testName + + ": java (" + javaRslt.x + ", " + javaRslt.y + ")" + + ", rs (" + rsRslt.x + ", " + rsRslt.y + ")" + + ": " + status); return success; } - private boolean result(String testName, Int2 javaRslt, Int2 rsRslt) { + private boolean result(String testName, final timing t, Float2 javaRslt, Float2 rsRslt) { final boolean success = (javaRslt.x == rsRslt.x) && (javaRslt.y == rsRslt.y); + String status = (success ? "PASSED" : "FAILED"); + if (success && (t != null)) + status += " " + t.string(); Log.i(TAG, testName + ": java (" + javaRslt.x + ", " + javaRslt.y + ")" + ", rs (" + rsRslt.x + ", " + rsRslt.y + ")" + - ": " + (success ? "PASSED" : "FAILED")); + ": " + status); return success; } @@ -93,61 +208,68 @@ public class UT_reduce extends UnitTest { return rslt; } - private boolean addint1D(RenderScript RS, ScriptC_reduce s) { - final int[] input = createInputArrayInt(100000, 0, 1 << 13); + private boolean addint1D_array(RenderScript RS, ScriptC_reduce s, int seed, int[] size) { + final int[] input = createInputArrayInt(size[0], seed, Integer.MAX_VALUE / size[0]); final int javaRslt = addint(input); final int rsRslt = s.reduce_addint(input).get(); - return result("addint1D", javaRslt, rsRslt); + return result("addint1D_array", new timing(size[0]), javaRslt, rsRslt); } - private boolean addint2D(RenderScript RS, ScriptC_reduce s) { - final int dimX = 450, dimY = 225; - - final int[] inputArray = createInputArrayInt(dimX * dimY, 1, 1 << 13); - Type.Builder typeBuilder = new Type.Builder(RS, Element.I32(RS)); - typeBuilder.setX(dimX).setY(dimY); - Allocation inputAllocation = Allocation.createTyped(RS, typeBuilder.create()); - inputAllocation.copy2DRangeFrom(0, 0, dimX, dimY, inputArray); + private boolean addint1D(RenderScript RS, ScriptC_reduce s, int seed, int[] size) { + final int[] inputArray = createInputArrayInt(size[0], seed, Integer.MAX_VALUE / size[0]); + final long javaTimeStart = java.lang.System.currentTimeMillis(); final int javaRslt = addint(inputArray); + final long javaTimeEnd = java.lang.System.currentTimeMillis(); + + final long rsTimeStart = java.lang.System.currentTimeMillis(); + + Allocation inputAllocation = Allocation.createSized(RS, Element.I32(RS), inputArray.length); + + final long copyTimeStart = java.lang.System.currentTimeMillis(); + + inputAllocation.copyFrom(inputArray); + + final long kernelTimeStart = java.lang.System.currentTimeMillis(); final int rsRslt = s.reduce_addint(inputAllocation).get(); + final long rsTimeEnd = java.lang.System.currentTimeMillis(); - return result("addint2D", javaRslt, rsRslt); + return result("addint1D", + new timing(javaTimeStart, javaTimeEnd, rsTimeStart, + copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation), + javaRslt, rsRslt); } - /////////////////////////////////////////////////////////////////// + private boolean addint2D(RenderScript RS, ScriptC_reduce s, int seed, int[] size) { + final int dimX = size[0]; + final int dimY = size[1]; - private float dp(float[] input1, float[] input2) { - _RS_ASSERT("dp input length mismatch", input1.length == input2.length); + final int[] inputArray = createInputArrayInt(dimX * dimY, seed, Integer.MAX_VALUE / (dimX * dimY)); - float rslt = 0; - for (int idx = 0; idx < input1.length; ++idx) - rslt += input1[idx] * input2[idx]; - return rslt; - } + final long javaTimeStart = java.lang.System.currentTimeMillis(); + final int javaRslt = addint(inputArray); + final long javaTimeEnd = java.lang.System.currentTimeMillis(); + + final long rsTimeStart = java.lang.System.currentTimeMillis(); - private boolean dp(RenderScript RS, ScriptC_reduce s) { - final float[] input1 = createInputArrayFloat(100000, 2); - final float[] input2 = createInputArrayFloat(100000, 3); + Type.Builder typeBuilder = new Type.Builder(RS, Element.I32(RS)); + typeBuilder.setX(dimX).setY(dimY); + Allocation inputAllocation = Allocation.createTyped(RS, typeBuilder.create()); - final float javaRslt = dp(input1, input2); - final float rsRslt = s.reduce_dp(input1, input2).get(); + final long copyTimeStart = java.lang.System.currentTimeMillis(); - // NOTE: Using a floating point equality check to test for - // correctness -- as we do below -- is a bad idea. It's only - // reliable if the Java and RenderScript implementation of dp - // use the same algorithm. Equality could be broken by - // different optimizations between the two, or running the - // RenderScript algorithm multithreaded, or running the - // RenderScript algorithm on a GPU rather than the CPU. - // - // Should we be checking instead that the results are - // "sufficiently close"? Cooking the input set to try to - // ensure a deterministic result? Changing to integers - // instead? - return result("dp", javaRslt, rsRslt); + inputAllocation.copy2DRangeFrom(0, 0, dimX, dimY, inputArray); + + final long kernelTimeStart = java.lang.System.currentTimeMillis(); + final int rsRslt = s.reduce_addint(inputAllocation).get(); + final long rsTimeEnd = java.lang.System.currentTimeMillis(); + + return result("addint2D", + new timing(javaTimeStart, javaTimeEnd, rsTimeStart, + copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation), + javaRslt, rsRslt); } /////////////////////////////////////////////////////////////////// @@ -172,79 +294,195 @@ public class UT_reduce extends UnitTest { return new Int2(minIdx, maxIdx); } - private boolean findMinAndMax(RenderScript RS, ScriptC_reduce s) { - final float[] input = createInputArrayFloat(100000, 4); + private boolean findMinAndMax_array(RenderScript RS, ScriptC_reduce s, int seed, int[] size) { + final float[] input = createInputArrayFloat(size[0], seed); final Int2 javaRslt = findMinAndMax(input); final Int2 rsRslt = s.reduce_findMinAndMax(input).get(); - return result("findMinAndMax", javaRslt, rsRslt); + // Note that the Java and RenderScript algorithms are not + // guaranteed to find the same cells -- but they should + // find cells of the same value. + final Float2 javaVal = new Float2(input[javaRslt.x], input[javaRslt.y]); + final Float2 rsVal = new Float2(input[rsRslt.x], input[rsRslt.y]); + + return result("findMinAndMax_array", new timing(size[0]), javaVal, rsVal); + } + + private boolean findMinAndMax(RenderScript RS, ScriptC_reduce s, int seed, int[] size) { + final float[] inputArray = createInputArrayFloat(size[0], seed); + + final long javaTimeStart = java.lang.System.currentTimeMillis(); + final Int2 javaRslt = findMinAndMax(inputArray); + final long javaTimeEnd = java.lang.System.currentTimeMillis(); + + final long rsTimeStart = java.lang.System.currentTimeMillis(); + + Allocation inputAllocation = Allocation.createSized(RS, Element.F32(RS), inputArray.length); + + final long copyTimeStart = java.lang.System.currentTimeMillis(); + + inputAllocation.copyFrom(inputArray); + + final long kernelTimeStart = java.lang.System.currentTimeMillis(); + final Int2 rsRslt = s.reduce_findMinAndMax(inputAllocation).get(); + final long rsTimeEnd = java.lang.System.currentTimeMillis(); + + // Note that the Java and RenderScript algorithms are not + // guaranteed to find the same cells -- but they should + // find cells of the same value. + final Float2 javaVal = new Float2(inputArray[javaRslt.x], inputArray[javaRslt.y]); + final Float2 rsVal = new Float2(inputArray[rsRslt.x], inputArray[rsRslt.y]); + + return result("findMinAndMax", + new timing(javaTimeStart, javaTimeEnd, rsTimeStart, + copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation), + javaVal, rsVal); } /////////////////////////////////////////////////////////////////// - private boolean fz(RenderScript RS, ScriptC_reduce s) { - final int inputLen = 100000; - int[] input = createInputArrayInt(inputLen, 5); + private int fz(final int[] input) { + for (int i = 0; i < input.length; ++i) + if (input[i] == 0) + return i; + return -1; + } + + private boolean fz_array(RenderScript RS, ScriptC_reduce s, int seed, int size[]) { + final int inputLen = size[0]; + int[] input = createInputArrayInt(inputLen, seed+0); // just in case we got unlucky - input[(new Random(6)).nextInt(inputLen)] = 0; + input[(new Random(seed+1)).nextInt(inputLen)] = 0; final int rsRslt = s.reduce_fz(input).get(); final boolean success = (input[rsRslt] == 0); Log.i(TAG, - "fz: input[" + rsRslt + "] == " + input[rsRslt] + ": " + - (success ? "PASSED" : "FAILED")); + "fz_array: input[" + rsRslt + "] == " + input[rsRslt] + ": " + + (success ? "PASSED " + timing.string(size[0]) : "FAILED")); + return success; + } + + private boolean fz(RenderScript RS, ScriptC_reduce s, int seed, int size[]) { + final int inputLen = size[0]; + int[] inputArray = createInputArrayInt(inputLen, seed+0); + // just in case we got unlucky + inputArray[(new Random(seed+1)).nextInt(inputLen)] = 0; + + final long javaTimeStart = java.lang.System.currentTimeMillis(); + final int javaRslt = fz(inputArray); + final long javaTimeEnd = java.lang.System.currentTimeMillis(); + + final long rsTimeStart = java.lang.System.currentTimeMillis(); + + Allocation inputAllocation = Allocation.createSized(RS, Element.I32(RS), inputArray.length); + + final long copyTimeStart = java.lang.System.currentTimeMillis(); + + inputAllocation.copyFrom(inputArray); + + final long kernelTimeStart = java.lang.System.currentTimeMillis(); + final int rsRslt = s.reduce_fz(inputAllocation).get(); + final long rsTimeEnd = java.lang.System.currentTimeMillis(); + + final boolean success = (inputArray[rsRslt] == 0); + String status = (success ? "PASSED" : "FAILED"); + if (success) + status += " " + timing.string(javaTimeStart, javaTimeEnd, rsTimeStart, + copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation); + Log.i(TAG, + "fz: java input[" + javaRslt + "] == " + inputArray[javaRslt] + + ", rs input[" + rsRslt + "] == " + inputArray[javaRslt] + ": " + status); return success; } /////////////////////////////////////////////////////////////////// - private boolean fz2(RenderScript RS, ScriptC_reduce s) { - final int dimX = 225, dimY = 450; + private boolean fz2(RenderScript RS, ScriptC_reduce s, int seed, int size[]) { + final int dimX = size[0], dimY = size[1]; final int inputLen = dimX * dimY; - int[] inputArray = createInputArrayInt(inputLen, 7); + int[] inputArray = createInputArrayInt(inputLen, seed+0); // just in case we got unlucky - inputArray[(new Random(8)).nextInt(inputLen)] = 0; + inputArray[(new Random(seed+1)).nextInt(inputLen)] = 0; + + final long javaTimeStart = java.lang.System.currentTimeMillis(); + final int javaRsltLinear = fz(inputArray); + final long javaTimeEnd = java.lang.System.currentTimeMillis(); + + final Int2 javaRslt = new Int2(javaRsltLinear % dimX, javaRsltLinear / dimX); + final int javaCellVal = inputArray[javaRslt.x + dimX * javaRslt.y]; + + final long rsTimeStart = java.lang.System.currentTimeMillis(); Type.Builder typeBuilder = new Type.Builder(RS, Element.I32(RS)); typeBuilder.setX(dimX).setY(dimY); Allocation inputAllocation = Allocation.createTyped(RS, typeBuilder.create()); + + final long copyTimeStart = java.lang.System.currentTimeMillis(); + inputAllocation.copy2DRangeFrom(0, 0, dimX, dimY, inputArray); + final long kernelTimeStart = java.lang.System.currentTimeMillis(); final Int2 rsRslt = s.reduce_fz2(inputAllocation).get(); - - final int cellVal = inputArray[rsRslt.x + dimX * rsRslt.y]; - final boolean success = (cellVal == 0); + final long rsTimeEnd = java.lang.System.currentTimeMillis(); + + final int rsCellVal = inputArray[rsRslt.x + dimX * rsRslt.y]; + final boolean success = (rsCellVal == 0); + String status = (success ? "PASSED" : "FAILED"); + if (success) + status += " " + timing.string(javaTimeStart, javaTimeEnd, rsTimeStart, + copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation); Log.i(TAG, - "fz2: input[" + rsRslt.x + ", " + rsRslt.y + "] == " + cellVal + ": " + - (success ? "PASSED" : "FAILED")); + "fz2: java input[" + javaRslt.x + ", " + javaRslt.y + "] == " + javaCellVal + + ", rs input[" + rsRslt.x + ", " + rsRslt.y + "] == " + rsCellVal + ": " + status); return success; } /////////////////////////////////////////////////////////////////// - private boolean fz3(RenderScript RS, ScriptC_reduce s) { - final int dimX = 59, dimY = 48, dimZ = 37; + private boolean fz3(RenderScript RS, ScriptC_reduce s, int seed, int[] size) { + final int dimX = size[0], dimY = size[1], dimZ = size[2]; final int inputLen = dimX * dimY * dimZ; - int[] inputArray = createInputArrayInt(inputLen, 9); + int[] inputArray = createInputArrayInt(inputLen, seed+0); // just in case we got unlucky - inputArray[(new Random(10)).nextInt(inputLen)] = 0; + inputArray[(new Random(seed+1)).nextInt(inputLen)] = 0; + + final long javaTimeStart = java.lang.System.currentTimeMillis(); + final int javaRsltLinear = fz(inputArray); + final long javaTimeEnd = java.lang.System.currentTimeMillis(); + + final Int3 javaRslt = new Int3( + javaRsltLinear % dimX, + (javaRsltLinear / dimX) % dimY, + javaRsltLinear / (dimX * dimY)); + final int javaCellVal = inputArray[javaRslt.x + dimX * javaRslt.y + dimX * dimY * javaRslt.z]; + + final long rsTimeStart = java.lang.System.currentTimeMillis(); Type.Builder typeBuilder = new Type.Builder(RS, Element.I32(RS)); typeBuilder.setX(dimX).setY(dimY).setZ(dimZ); Allocation inputAllocation = Allocation.createTyped(RS, typeBuilder.create()); + + final long copyTimeStart = java.lang.System.currentTimeMillis(); + inputAllocation.copy3DRangeFrom(0, 0, 0, dimX, dimY, dimZ, inputArray); + final long kernelTimeStart = java.lang.System.currentTimeMillis(); final Int3 rsRslt = s.reduce_fz3(inputAllocation).get(); - - final int cellVal = inputArray[rsRslt.x + dimX * rsRslt.y + dimX * dimY * rsRslt.z]; - final boolean success = (cellVal == 0); + final long rsTimeEnd = java.lang.System.currentTimeMillis(); + + final int rsCellVal = inputArray[rsRslt.x + dimX * rsRslt.y + dimX * dimY * rsRslt.z]; + final boolean success = (rsCellVal == 0); + String status = (success ? "PASSED" : "FAILED"); + if (success) + status += " " + timing.string(javaTimeStart, javaTimeEnd, rsTimeStart, + copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation); Log.i(TAG, - "fz3: input[" + rsRslt.x + ", " + rsRslt.y + ", " + rsRslt.z + "] == " + cellVal + ": " + - (success ? "PASSED" : "FAILED")); + "fz3: java input[" + javaRslt.x + ", " + javaRslt.y + ", " + javaRslt.z + "] == " + javaCellVal + + ", rs input[" + rsRslt.x + ", " + rsRslt.y + ", " + rsRslt.z + "] == " + rsCellVal + ": " + status); return success; } @@ -271,24 +509,43 @@ public class UT_reduce extends UnitTest { return outputArray; } - private boolean histogram(RenderScript RS, ScriptC_reduce s) { - final byte[] inputArray = createInputArrayByte(100000, 11); + private boolean histogram_array(RenderScript RS, ScriptC_reduce s, int seed, int size[]) { + final byte[] inputArray = createInputArrayByte(size[0], seed); final long[] javaRslt = histogram(RS, inputArray); _RS_ASSERT("javaRslt unexpected length: " + javaRslt.length, javaRslt.length == histogramBucketCount); final long[] rsRslt = s.reduce_histogram(inputArray).get(); _RS_ASSERT("rsRslt unexpected length: " + rsRslt.length, rsRslt.length == histogramBucketCount); - for (int i = 0; i < histogramBucketCount; ++i) { - if (javaRslt[i] != rsRslt[i]) { - Log.i(TAG, - "histogram[" + i + "]: java " + javaRslt[i] + ", rs " + rsRslt[i] + ": FAILED"); - return false; - } - } + return result("histogram_array", new timing(size[0]), javaRslt, rsRslt); + } - Log.i(TAG, "histogram: PASSED"); - return true; + private boolean histogram(RenderScript RS, ScriptC_reduce s, int seed, int size[]) { + final byte[] inputArray = createInputArrayByte(size[0], seed); + + final long javaTimeStart = java.lang.System.currentTimeMillis(); + final long[] javaRslt = histogram(RS, inputArray); + final long javaTimeEnd = java.lang.System.currentTimeMillis(); + _RS_ASSERT("javaRslt unexpected length: " + javaRslt.length, javaRslt.length == histogramBucketCount); + + final long rsTimeStart = java.lang.System.currentTimeMillis(); + + Allocation inputAllocation = Allocation.createSized(RS, Element.U8(RS), inputArray.length); + + final long copyTimeStart = java.lang.System.currentTimeMillis(); + + inputAllocation.copyFrom(inputArray); + + final long kernelTimeStart = java.lang.System.currentTimeMillis(); + final long[] rsRslt = s.reduce_histogram(inputAllocation).get(); + final long rsTimeEnd = java.lang.System.currentTimeMillis(); + _RS_ASSERT("rsRslt unexpected length: " + rsRslt.length, rsRslt.length == histogramBucketCount); + + // NOTE: The "java time" is actually for the RenderScript histogram intrinsic + return result("histogram", + new timing(javaTimeStart, javaTimeEnd, rsTimeStart, + copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation), + javaRslt, rsRslt); } //----------------------------------------------------------------- @@ -302,17 +559,250 @@ public class UT_reduce extends UnitTest { return new Int2(modeIdx, (int)hsg[modeIdx]); } - private boolean mode(RenderScript RS, ScriptC_reduce s) { - final byte[] inputArray = createInputArrayByte(100000, 12); + private boolean mode_array(RenderScript RS, ScriptC_reduce s, int seed, int size[]) { + final byte[] inputArray = createInputArrayByte(size[0], seed); final Int2 javaRslt = mode(RS, inputArray); final Int2 rsRslt = s.reduce_mode(inputArray).get(); - return result("mode", javaRslt, rsRslt); + return result("mode", new timing(size[0]), javaRslt, rsRslt); + } + + /////////////////////////////////////////////////////////////////// + + private long sumgcd(final int in1[], final int in2[]) { + _RS_ASSERT("sumgcd input length mismatch", in1.length == in2.length); + + long sum = 0; + for (int i = 0; i < in1.length; ++i) { + int a = in1[i], b = in2[i]; + + while (b != 0) { + final int aNew = b; + final int bNew = a % b; + + a = aNew; + b = bNew; + } + + sum += a; + } + return sum; + } + + private boolean sumgcd(RenderScript RS, ScriptC_reduce s, int seed, int size[]) { + final int len = size[0]; + + final int[] inputArrayA = createInputArrayInt(len, seed+0); + final int[] inputArrayB = createInputArrayInt(len, seed+1); + + final long javaTimeStart = java.lang.System.currentTimeMillis(); + final long javaRslt = sumgcd(inputArrayA, inputArrayB); + final long javaTimeEnd = java.lang.System.currentTimeMillis(); + + final long rsTimeStart = java.lang.System.currentTimeMillis(); + + Allocation inputAllocationA = Allocation.createSized(RS, Element.I32(RS), len); + Allocation inputAllocationB = Allocation.createSized(RS, Element.I32(RS), len); + + final long copyTimeStart = java.lang.System.currentTimeMillis(); + + inputAllocationA.copyFrom(inputArrayA); + inputAllocationB.copyFrom(inputArrayB); + + final long kernelTimeStart = java.lang.System.currentTimeMillis(); + final long rsRslt = s.reduce_sumgcd(inputAllocationA, inputAllocationB).get(); + final long rsTimeEnd = java.lang.System.currentTimeMillis(); + + return result("sumgcd", + new timing(javaTimeStart, javaTimeEnd, rsTimeStart, copyTimeStart, kernelTimeStart, rsTimeEnd, + inputAllocationA, inputAllocationB), + javaRslt, rsRslt); } /////////////////////////////////////////////////////////////////// + public static final int maxSeedsPerTest = 10; + + static interface Test { + // A test execution is characterized by two properties: A seed + // and a size. + // + // The seed is used for generating pseudorandom input data. + // Ideally, we use different seeds for different tests and for + // different executions of the same test at different sizes. + // A test with multiple blocks of input data (i.e., for a + // reduction with multiple inputs) may want multiple seeds; it + // may use the seeds seed..seed+maxSeedsPerTest-1. + // + // The size indicates the amount of input data. It is the number + // of cells in a particular dimension of the iteration space. + boolean run(RenderScript RS, ScriptC_reduce s, int seed, int[] size); + }; + + static class TestDescription { + public TestDescription(String myTestName, Test myTest, int mySeed, int[] myDefSize, int[] myLog2MaxSize) { + testName = myTestName; + test = myTest; + seed = mySeed; + defSize = myDefSize; + log2MaxSize = myLog2MaxSize; + }; + + public TestDescription(String myTestName, Test myTest, int mySeed, int[] myDefSize) { + testName = myTestName; + test = myTest; + seed = mySeed; + defSize = myDefSize; + log2MaxSize = null; + }; + + public final String testName; + + public final Test test; + + // When executing the test, scale this up by maxSeedsPerTest. + public final int seed; + + // If we're only going to run the test once, what size should + // we use? + public final int[] defSize; + + // If we're going to run the test over a range of sizes, what + // is the maximum size to use? + public final int[] log2MaxSize; + }; + + private boolean run(TestDescription td, RenderScript RS, ScriptC_reduce s, int seed, int[] size) { + String arrayContent = ""; + for (int i = 0; i < size.length; ++i) { + if (i != 0) + arrayContent += ", "; + arrayContent += size[i]; + } + Log.i(TAG, "Running " + td.testName + "(seed = " + seed + ", size[] = {" + arrayContent + "})"); + return td.test.run(RS, s, seed, size); + } + + private final TestDescription[] correctnessTests = { + // alloc and array variants of the same test will use the same + // seed, in case results need to be compared. + + new TestDescription("addint1D", this::addint1D, 0, new int[]{100000}, new int[]{20}), + new TestDescription("addint1D_array", this::addint1D_array, 0, new int[]{100000}, new int[]{20}), + new TestDescription("addint2D", this::addint2D, 1, new int[]{450, 225}), + new TestDescription("findMinAndMax", this::findMinAndMax, 3, new int[]{100000}, new int[]{20}), + new TestDescription("findMinAndMaxArray", this::findMinAndMax_array, 3, new int[]{100000}, new int[]{20}), + new TestDescription("fz", this::fz, 4, new int[]{100000}, new int[]{20}), + new TestDescription("fz_array", this::fz_array, 4, new int[]{100000}, new int[]{20}), + new TestDescription("fz2", this::fz2, 5, new int[]{225, 450}), + new TestDescription("fz3", this::fz3, 6, new int[]{59, 48, 37}), + new TestDescription("histogram", this::histogram, 7, new int[]{100000}, new int[]{20}), + new TestDescription("histogram_array", this::histogram_array, 7, new int[]{100000}, new int[]{20}), + // might want to add: new TestDescription("mode", this::mode, 8, new int[]{100000}, new int[]{20}), + new TestDescription("mode_array", this::mode_array, 8, new int[]{100000}, new int[]{20}), + new TestDescription("sumgcd", this::sumgcd, 9, new int[]{1 << 16}, new int[]{20}) + }; + + private boolean runCorrectnessQuick(RenderScript RS, ScriptC_reduce s) { + boolean pass = true; + + for (TestDescription td : correctnessTests) { + pass &= run(td, RS, s, maxSeedsPerTest * td.seed, td.defSize); + } + + return pass; + } + + private boolean runCorrectness(RenderScript RS, ScriptC_reduce s) { + boolean pass = true; + + for (TestDescription td : correctnessTests) { + if (td.log2MaxSize == null) // TODO: Eventually this should never happen? + continue; + + if (td.log2MaxSize.length == 1) { + final int log2MaxSize = td.log2MaxSize[0]; + // We will execute the test with the following sizes: + // (a) Each power of 2 from zero (2**0) up to log2MaxSize (2**log2MaxSize) + // (b) Each size from (a) +/-1 + // (c) 2 random sizes between adjacent points in (a) + int[] testSizes = new int[ + /* a */ (1 + log2MaxSize) + + /* b */ 2*(1 + log2MaxSize) + + /* c */ 2*log2MaxSize]; + + // NOTE: Each test execution gets maxSeedsPerTest, and + // there are up to 3 + 5*log2MaxSize test executions + // of a test, and we need a seed for (c). Assuming + // log2MaxSize does not exceed 32, then it should be + // sufficient to reserve 1 + 5*32*maxSeedsPerTest seeds + // per TestDescription. + final int seedForPickingTestSizes = td.seed * (1 + 5*32*maxSeedsPerTest); + + int nextTestIdx = 0; + + // Fill in (a) and (b) + for (int i = 0; i <= log2MaxSize; ++i) { + final int pwrOf2 = 1 << i; + testSizes[nextTestIdx++] = pwrOf2; /* a */ + testSizes[nextTestIdx++] = pwrOf2 - 1; /* b */ + testSizes[nextTestIdx++] = pwrOf2 + 1; /* b */ + } + + // Fill in (c) + Random r = new Random(seedForPickingTestSizes); + for (int i = 0; i < log2MaxSize; ++i) { + final int lo = (1 << i) + 1; + final int hi = 1 << (i + 1); + + if (lo < hi) { + for (int j = 0; j < 2; ++j) { + testSizes[nextTestIdx++] = r.nextInt(hi - lo) + lo; + } + } + } + + Arrays.sort(testSizes); + + int[] lastTestSizeArg = new int[]{-1}; + for (int i = 0; i < testSizes.length; ++i) { + if ((testSizes[i] > 0) && (testSizes[i] != lastTestSizeArg[0])) { + lastTestSizeArg[0] = testSizes[i]; + final int seedForTestExecution = seedForPickingTestSizes + 1 + i*maxSeedsPerTest; + pass &= run(td, RS, s, seedForTestExecution, lastTestSizeArg); + } + } + } + // TODO: lengths 2 and 3, and assert otherwise + } + + return pass; + } + + private final TestDescription[] performanceTests = { + new TestDescription("addint1D", this::addint1D, 0, new int[]{100000 << 10}), + new TestDescription("addint2D", this::addint2D, 1, new int[]{450 << 5, 225 << 5}), + new TestDescription("findMinAndMax", this::findMinAndMax, 3, new int[]{100000 << 9}), + new TestDescription("fz", this::fz, 4, new int[]{100000 << 10}), + new TestDescription("fz2", this::fz2, 5, new int[]{225 << 5, 450 << 5}), + new TestDescription("fz3", this::fz3, 6, new int[]{59 << 3, 48 << 3, 37 << 3}), + new TestDescription("histogram", this::histogram, 7, new int[]{100000 << 10}), + // might want to add: new TestDescription("mode", this::mode, 8, new int[]{100000}), + new TestDescription("sumgcd", this::sumgcd, 9, new int[]{1 << 21}) + }; + + private boolean runPerformanceQuick(RenderScript RS, ScriptC_reduce s) { + boolean pass = true; + + for (TestDescription td : performanceTests) { + pass &= run(td, RS, s, maxSeedsPerTest * td.seed, td.defSize); + } + + return pass; + } + + public void run() { RenderScript pRS = RenderScript.create(mCtx); ScriptC_reduce s = new ScriptC_reduce(pRS); @@ -320,15 +810,10 @@ public class UT_reduce extends UnitTest { s.set_posInf(Float.POSITIVE_INFINITY); boolean pass = true; - pass &= addint1D(pRS, s); - pass &= addint2D(pRS, s); - pass &= dp(pRS, s); - pass &= findMinAndMax(pRS, s); - pass &= fz(pRS, s); - pass &= fz2(pRS, s); - pass &= fz3(pRS, s); - pass &= histogram(pRS, s); - pass &= mode(pRS, s); + + pass &= runCorrectnessQuick(pRS, s); + pass &= runCorrectness(pRS, s); + // pass &= runPerformanceQuick(pRS, s); pRS.finish(); pRS.destroy(); diff --git a/java/tests/RsTest/src/com/android/rs/test/UT_reduce_backward.java b/java/tests/RsTest/src/com/android/rs/test/UT_reduce_backward.java index 3a64a738..6a50d2bf 100644 --- a/java/tests/RsTest/src/com/android/rs/test/UT_reduce_backward.java +++ b/java/tests/RsTest/src/com/android/rs/test/UT_reduce_backward.java @@ -119,39 +119,6 @@ public class UT_reduce_backward extends UnitTest { /////////////////////////////////////////////////////////////////// - private float dp(float[] input1, float[] input2) { - _RS_ASSERT("dp input length mismatch", input1.length == input2.length); - - float rslt = 0; - for (int idx = 0; idx < input1.length; ++idx) - rslt += input1[idx] * input2[idx]; - return rslt; - } - - private boolean dp(RenderScript RS, ScriptC_reduce_backward s) { - final float[] input1 = createInputArrayFloat(100000, 2); - final float[] input2 = createInputArrayFloat(100000, 3); - - final float javaRslt = dp(input1, input2); - final float rsRslt = s.reduce_dp(input1, input2).get(); - - // NOTE: Using a floating point equality check to test for - // correctness -- as we do below -- is a bad idea. It's only - // reliable if the Java and RenderScript implementation of dp - // use the same algorithm. Equality could be broken by - // different optimizations between the two, or running the - // RenderScript algorithm multithreaded, or running the - // RenderScript algorithm on a GPU rather than the CPU. - // - // Should we be checking instead that the results are - // "sufficiently close"? Cooking the input set to try to - // ensure a deterministic result? Changing to integers - // instead? - return result("dp", javaRslt, rsRslt); - } - - /////////////////////////////////////////////////////////////////// - private Int2 findMinAndMax(float[] input) { float minVal = Float.POSITIVE_INFINITY; int minIdx = -1; @@ -322,7 +289,6 @@ public class UT_reduce_backward extends UnitTest { boolean pass = true; pass &= addint1D(pRS, s); pass &= addint2D(pRS, s); - pass &= dp(pRS, s); pass &= findMinAndMax(pRS, s); pass &= fz(pRS, s); pass &= fz2(pRS, s); diff --git a/java/tests/RsTest/src/com/android/rs/test/reduce.rs b/java/tests/RsTest/src/com/android/rs/test/reduce.rs index be09dfb6..97b45e0c 100644 --- a/java/tests/RsTest/src/com/android/rs/test/reduce.rs +++ b/java/tests/RsTest/src/com/android/rs/test/reduce.rs @@ -16,18 +16,6 @@ static void aiAccum(int *accum, int val) { *accum += val; } ///////////////////////////////////////////////////////////////////////// -#pragma rs reduce(dp) \ - accumulator(dpAccum) combiner(dpSum) - -static void dpAccum(float *accum, float in1, float in2) { - *accum += in1*in2; -} - -// combiner function -static void dpSum(float *accum, const float *val) { *accum += *val; } - -///////////////////////////////////////////////////////////////////////// - #pragma rs reduce(findMinAndMax) \ initializer(fMMInit) accumulator(fMMAccumulator) \ combiner(fMMCombiner) outconverter(fMMOutConverter) @@ -61,8 +49,10 @@ static void fMMAccumulator(MinAndMax *accum, float in, int x) { static void fMMCombiner(MinAndMax *accum, const MinAndMax *val) { - fMMAccumulator(accum, val->min.val, val->min.idx); - fMMAccumulator(accum, val->max.val, val->max.idx); + if (val->min.val < accum->min.val) + accum->min = val->min; + if (val->max.val > accum->max.val) + accum->max = val->max; } static void fMMOutConverter(int2 *result, @@ -160,3 +150,24 @@ static void modeOutConvert(int2 *result, const Histogram *h) { result->x = mode; result->y = (*h)[mode]; } + +///////////////////////////////////////////////////////////////////////// + +#pragma rs reduce(sumgcd) accumulator(sgAccum) combiner(sgCombine) + +static int gcd(int a, int b) { + while (b != 0) { + const int aNew = b; + const int bNew = a % b; + + a = aNew; + b = bNew; + } + return a; +} + +static void sgAccum(long *accum, int a, int b) { + *accum += gcd(a, b); +} + +static void sgCombine(long *accum, const long *other) { *accum += *other; } diff --git a/java/tests/RsTest/src/com/android/rs/test/reduce_backward.rs b/java/tests/RsTest/src/com/android/rs/test/reduce_backward.rs index 419e7090..41252c8a 100644 --- a/java/tests/RsTest/src/com/android/rs/test/reduce_backward.rs +++ b/java/tests/RsTest/src/com/android/rs/test/reduce_backward.rs @@ -15,18 +15,6 @@ static void aiAccum(int *accum, int val) { *accum += val; } ///////////////////////////////////////////////////////////////////////// -static void dpAccum(float *accum, float in1, float in2) { - *accum += in1*in2; -} - -// combiner function -static void dpSum(float *accum, const float *val) { *accum += *val; } - -#pragma rs reduce(dp) \ - accumulator(dpAccum) combiner(dpSum) - -///////////////////////////////////////////////////////////////////////// - typedef struct { float val; int idx; @@ -56,8 +44,10 @@ static void fMMAccumulator(MinAndMax *accum, float in, int x) { static void fMMCombiner(MinAndMax *accum, const MinAndMax *val) { - fMMAccumulator(accum, val->min.val, val->min.idx); - fMMAccumulator(accum, val->max.val, val->max.idx); + if (val->min.val < accum->min.val) + accum->min = val->min; + if (val->max.val > accum->max.val) + accum->max = val->max; } static void fMMOutConverter(int2 *result, diff --git a/rsContext.cpp b/rsContext.cpp index 77e82f64..122815fb 100644 --- a/rsContext.cpp +++ b/rsContext.cpp @@ -260,6 +260,8 @@ void * Context::threadProc(void *vrsc) { rsc->props.mLogShadersAttr = getProp("debug.rs.shader.attributes") != 0; rsc->props.mLogShadersUniforms = getProp("debug.rs.shader.uniforms") != 0; rsc->props.mLogVisual = getProp("debug.rs.visual") != 0; + rsc->props.mLogReduceAccum = getProp("debug.rs.reduce-accum") != 0; + rsc->props.mDebugReduceSplitAccum = getProp("debug.rs.reduce-split-accum") != 0; rsc->props.mDebugMaxThreads = getProp("debug.rs.max-threads"); if (getProp("debug.rs.debug") != 0) { diff --git a/rsContext.h b/rsContext.h index fce22b56..dd2fc00e 100644 --- a/rsContext.h +++ b/rsContext.h @@ -226,6 +226,8 @@ public: bool mLogShadersAttr; bool mLogShadersUniforms; bool mLogVisual; + bool mLogReduceAccum; + bool mDebugReduceSplitAccum; uint32_t mDebugMaxThreads; } props; |