summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--cpu_ref/rsCpuCore.cpp239
-rw-r--r--cpu_ref/rsCpuCore.h59
-rw-r--r--cpu_ref/rsCpuExecutable.cpp17
-rw-r--r--cpu_ref/rsCpuScript.cpp7
-rw-r--r--cpu_ref/rsd_cpu.h2
-rw-r--r--driver/rsdRuntimeStubs.cpp2
-rw-r--r--java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce.java34
-rw-r--r--java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce_backward.java34
-rw-r--r--java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce.rs18
-rw-r--r--java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce_backward.rs18
-rw-r--r--java/tests/RsTest/AndroidManifest.xml1
-rw-r--r--java/tests/RsTest/src/com/android/rs/test/UT_reduce.java671
-rw-r--r--java/tests/RsTest/src/com/android/rs/test/UT_reduce_backward.java34
-rw-r--r--java/tests/RsTest/src/com/android/rs/test/reduce.rs39
-rw-r--r--java/tests/RsTest/src/com/android/rs/test/reduce_backward.rs18
-rw-r--r--rsContext.cpp2
-rw-r--r--rsContext.h2
17 files changed, 930 insertions, 267 deletions
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index b8b48387..9f9c429b 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -45,6 +45,8 @@ static pid_t gettid() {
using namespace android;
using namespace android::renderscript;
+#define REDUCE_NEW_ALOGV(...) /* ALOGV(__VA_ARGS__) */
+
static pthread_key_t gThreadTLSKey = 0;
static uint32_t gThreadTLSKeyCount = 0;
static pthread_mutex_t gInitMutex = PTHREAD_MUTEX_INITIALIZER;
@@ -98,7 +100,7 @@ RsdCpuReferenceImpl::RsdCpuReferenceImpl(Context *rsc) {
version_major = 0;
version_minor = 0;
- mInForEach = false;
+ mInKernel = false;
memset(&mWorkers, 0, sizeof(mWorkers));
memset(&mTlsStruct, 0, sizeof(mTlsStruct));
mExit = false;
@@ -239,6 +241,9 @@ bool RsdCpuReferenceImpl::init(uint32_t version_major, uint32_t version_minor,
ALOGE("pthread_setspecific %i", status);
}
+ mPageSize = sysconf(_SC_PAGE_SIZE);
+ REDUCE_NEW_ALOGV("page size = %ld", mPageSize);
+
GetCpuInfo();
int cpu = sysconf(_SC_NPROCESSORS_CONF);
@@ -435,7 +440,7 @@ static void walk_2d(void *usr, uint32_t idx) {
}
}
-static void walk_1d(void *usr, uint32_t idx) {
+static void walk_1d_foreach(void *usr, uint32_t idx) {
MTLaunchStructForEach *mtls = (MTLaunchStructForEach *)usr;
RsExpandKernelDriverInfo fep = mtls->fep;
fep.lid = idx;
@@ -458,6 +463,103 @@ static void walk_1d(void *usr, uint32_t idx) {
}
}
+// The function format_bytes() is an auxiliary function to assist in logging.
+//
+// Bytes are read from an input (inBuf) and written (as pairs of hex digits)
+// to an output (outBuf).
+//
+// Output format:
+// - starts with ": "
+// - each input byte is translated to a pair of hex digits
+// - bytes are separated by "." except that every fourth separator is "|"
+// - if the input is sufficiently long, the output is truncated and terminated with "..."
+//
+// Arguments:
+// - outBuf -- Pointer to buffer of type "FormatBuf" into which output is written
+// - inBuf -- Pointer to bytes which are to be formatted into outBuf
+// - inBytes -- Number of bytes in inBuf
+//
+// Constant:
+// - kFormatInBytesMax -- Only min(kFormatInBytesMax, inBytes) bytes will be read
+// from inBuf
+//
+// Return value:
+// - pointer (const char *) to output (which is part of outBuf)
+//
+// Upper bound on how many input bytes format_bytes() renders.
+static const int kFormatInBytesMax = 16;
+// ": " + 2 digits per byte + 1 separator between bytes + "..." + null
+typedef char FormatBuf[2 + kFormatInBytesMax*2 + (kFormatInBytesMax - 1) + 3 + 1];
+// Renders up to kFormatInBytesMax bytes of inBuf into *outBuf as
+// ": xx.xx.xx.xx|xx..." (lowercase hex) and returns a pointer to that text.
+static const char *format_bytes(FormatBuf *outBuf, const uint8_t *inBuf, const int inBytes) {
+    static const char kHexDigits[] = "0123456789abcdef";
+
+    char *cursor = *outBuf;
+    *cursor++ = ':';
+    *cursor++ = ' ';
+
+    // Number of bytes actually rendered; a non-positive inBytes renders none.
+    const int shown = (inBytes < kFormatInBytesMax) ? inBytes : kFormatInBytesMax;
+    for (int n = 0; n < shown; ++n) {
+        if (n != 0) {
+            // Every fourth separator is '|', all others are '.'.
+            *cursor++ = (n % 4 == 0) ? '|' : '.';
+        }
+        const uint8_t byte = inBuf[n];
+        *cursor++ = kHexDigits[byte >> 4];
+        *cursor++ = kHexDigits[byte & 0x0f];
+    }
+
+    // Mark truncated input with a trailing ellipsis.
+    if (inBytes > kFormatInBytesMax) {
+        *cursor++ = '.';
+        *cursor++ = '.';
+        *cursor++ = '.';
+    }
+    *cursor = '\0';
+    return *outBuf;
+}
+
+// Per-thread body of a parallel general-reduce launch; passed to
+// launchThreads() by launchReduceNewParallel().
+//
+// Arguments:
+// - usr -- the MTLaunchStructReduceNew describing the launch
+// - idx -- selects this invocation's slot in mtls->accumPtr
+//
+// The function first makes sure its slot has an accumulator (claiming and
+// initializing one if the slot is still null), then repeatedly claims slices
+// of the x range and runs the accumulator function on each until no slices
+// remain.
+static void walk_1d_reduce_new(void *usr, uint32_t idx) {
+ const MTLaunchStructReduceNew *mtls = (const MTLaunchStructReduceNew *)usr;
+ // Local copy of the launch's driver info; it is handed to RedpPtrSetup()
+ // for every slice below.
+ RsExpandKernelDriverInfo redp = mtls->redp;
+
+ // find accumulator
+ // accumPtr is a reference into the shared slot array, so assigning to it
+ // below records the accumulator where the launcher can find it afterwards.
+ uint8_t *&accumPtr = mtls->accumPtr[idx];
+ if (!accumPtr) {
+ // Atomically claim the next unused accumulator number.
+ uint32_t accumIdx = (uint32_t)__sync_fetch_and_add(&mtls->accumCount, 1);
+ if (mtls->outFunc) {
+ // With an outconverter, every accumulator lives in accumAlloc.
+ accumPtr = mtls->accumAlloc + mtls->accumStride * accumIdx;
+ } else {
+ // Without an outconverter, claim number 0 accumulates directly into
+ // the output and the remaining claims index accumAlloc shifted by one.
+ if (accumIdx == 0) {
+ accumPtr = mtls->redp.outPtr[0];
+ } else {
+ accumPtr = mtls->accumAlloc + mtls->accumStride * (accumIdx - 1);
+ }
+ }
+ REDUCE_NEW_ALOGV("walk_1d_reduce_new(%p): idx = %u got accumCount %u and accumPtr %p",
+ mtls->accumFunc, idx, accumIdx, accumPtr);
+ // initialize accumulator
+ // When no initializer function is set, the accumulator is zero-filled.
+ if (mtls->initFunc) {
+ mtls->initFunc(accumPtr);
+ } else {
+ memset(accumPtr, 0, mtls->accumSize);
+ }
+ }
+
+ // accumulate
+ const ReduceNewAccumulatorFunc_t fn = mtls->accumFunc;
+ while (1) {
+ // Atomically claim the next slice number and turn it into [xStart, xEnd).
+ uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
+ uint32_t xStart = mtls->start.x + slice * mtls->mSliceSize;
+ uint32_t xEnd = xStart + mtls->mSliceSize;
+
+ // The last slice may be short; clamp it to the end of the launch range.
+ xEnd = rsMin(xEnd, mtls->end.x);
+
+ // An empty range means every slice has already been claimed.
+ if (xEnd <= xStart) {
+ return;
+ }
+
+ RedpPtrSetup(mtls, &redp, xStart, 0, 0);
+ fn(&redp, xStart, xEnd, accumPtr);
+
+ // Only format the accumulator bytes when accumulator logging is on;
+ // otherwise pass an empty string for the trailing %s.
+ FormatBuf fmt;
+ if (mtls->logReduceAccum) {
+ format_bytes(&fmt, accumPtr, mtls->accumSize);
+ } else {
+ fmt[0] = 0;
+ }
+ REDUCE_NEW_ALOGV("walk_1d_reduce_new(%p): idx = %u [%u, %u)%s",
+ mtls->accumFunc, idx, xStart, xEnd, fmt);
+ }
+}
+
// Launch a simple reduce-style kernel.
// Inputs:
// ain: The allocation that contains the input
@@ -486,6 +588,25 @@ void RsdCpuReferenceImpl::launchReduceNew(const Allocation ** ains,
uint32_t inLen,
Allocation * aout,
MTLaunchStructReduceNew *mtls) {
+ mtls->logReduceAccum = mRSC->props.mLogReduceAccum;
+ if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInKernel) {
+ launchReduceNewParallel(ains, inLen, aout, mtls);
+ } else {
+ launchReduceNewSerial(ains, inLen, aout, mtls);
+ }
+}
+
+// Launch a general reduce-style kernel, single-threaded.
+// Inputs:
+// ains[0..inLen-1]: Array of allocations that contain the inputs
+// aout: The allocation that will hold the output
+// mtls: Holds launch parameters
+void RsdCpuReferenceImpl::launchReduceNewSerial(const Allocation ** ains,
+ uint32_t inLen,
+ Allocation * aout,
+ MTLaunchStructReduceNew *mtls) {
+ ALOGV("launchReduceNewSerial(%p)", mtls->accumFunc);
+
// In the presence of outconverter, we allocate temporary memory for
// the accumulator.
//
@@ -521,6 +642,112 @@ void RsdCpuReferenceImpl::launchReduceNew(const Allocation ** ains,
}
}
+// Launch a general reduce-style kernel, multi-threaded.
+// Inputs:
+//   ains[0..inLen-1]: Array of allocations that contain the inputs
+//   aout:             The allocation that will hold the output
+//   mtls:             Holds launch parameters
+//
+// Falls back to launchReduceNewSerial() when the launch is not 1D, when the
+// kernel has no combiner, or when the per-thread accumulator storage cannot
+// be allocated.
+void RsdCpuReferenceImpl::launchReduceNewParallel(const Allocation ** ains,
+                                                  uint32_t inLen,
+                                                  Allocation * aout,
+                                                  MTLaunchStructReduceNew *mtls) {
+    // For now, we don't know how to go parallel beyond 1D, or in the absence of a combiner.
+    if ((mtls->redp.dim.y > 1) || (mtls->redp.dim.z > 1) || !mtls->combFunc) {
+        launchReduceNewSerial(ains, inLen, aout, mtls);
+        return;
+    }
+
+    // Number of threads = "main thread" + number of other (worker) threads
+    const uint32_t numThreads = mWorkers.mCount + 1;
+
+    // In the absence of outconverter, we use the output allocation as
+    // an accumulator, and therefore need to allocate one fewer accumulator.
+    const uint32_t numAllocAccum = numThreads - (mtls->outFunc == nullptr);
+
+    // If mDebugReduceSplitAccum, then we want each accumulator to start
+    // on a page boundary.  (TODO: Would some unit smaller than a page
+    // be sufficient to avoid false sharing?)
+    if (mRSC->props.mDebugReduceSplitAccum) {
+        // Round up accumulator size to an integral number of pages.  The
+        // arithmetic is done in size_t so accumSize is never truncated.
+        const size_t pageSize = static_cast<size_t>(mPageSize);
+        mtls->accumStride = (mtls->accumSize + pageSize - 1) & ~(pageSize - 1);
+        // Each accumulator gets its own page.  Alternatively, if we just
+        // wanted to make sure no two accumulators are on the same page,
+        // we could instead do
+        //   allocSize = mtls->accumStride * (numAllocAccum - 1) + mtls->accumSize
+        const size_t allocSize = mtls->accumStride * numAllocAccum;
+        mtls->accumAlloc = static_cast<uint8_t *>(memalign(pageSize, allocSize));
+    } else {
+        mtls->accumStride = mtls->accumSize;
+        mtls->accumAlloc = static_cast<uint8_t *>(malloc(mtls->accumStride * numAllocAccum));
+    }
+
+    const size_t accumPtrArrayBytes = sizeof(uint8_t *) * numThreads;
+    mtls->accumPtr = static_cast<uint8_t **>(malloc(accumPtrArrayBytes));
+
+    // Without both buffers the workers would dereference null pointers, so
+    // release whatever was obtained and run single-threaded instead.
+    // NOTE(review): assumes launchReduceNewSerial() does not depend on
+    // accumAlloc/accumPtr/accumStride -- its body is not visible here; confirm.
+    if (!mtls->accumAlloc || !mtls->accumPtr) {
+        ALOGE("launchReduceNewParallel(%p): accumulator allocation failed; running serially",
+              mtls->accumFunc);
+        free(mtls->accumPtr);
+        free(mtls->accumAlloc);
+        mtls->accumPtr = nullptr;
+        mtls->accumAlloc = nullptr;
+        launchReduceNewSerial(ains, inLen, aout, mtls);
+        return;
+    }
+
+    // A null slot means "this thread has not claimed an accumulator yet".
+    memset(mtls->accumPtr, 0, accumPtrArrayBytes);
+    mtls->accumCount = 0;
+
+    rsAssert(!mInKernel);
+    mInKernel = true;
+    mtls->mSliceSize = rsMax(1U, mtls->redp.dim.x / (numThreads * 4));
+    ALOGV("launchReduceNewParallel(%p): %u threads, accumAlloc = %p",
+          mtls->accumFunc, numThreads, mtls->accumAlloc);
+    launchThreads(walk_1d_reduce_new, mtls);
+    mInKernel = false;
+
+    // Combine accumulators and identify final accumulator
+    uint8_t *finalAccumPtr = (mtls->outFunc ? nullptr : mtls->redp.outPtr[0]);
+    // Loop over every thread slot, combining into finalAccumPtr.  If
+    // finalAccumPtr is null, then the first accumulator found becomes
+    // finalAccumPtr.  Slots are indexed by thread, so unclaimed (null) slots
+    // are skipped rather than assuming the claimed ones are contiguous.
+    for (unsigned idx = 0; idx < numThreads; ++idx) {
+        uint8_t *const thisAccumPtr = mtls->accumPtr[idx];
+        if (!thisAccumPtr) {
+            continue;
+        }
+        if (!finalAccumPtr) {
+            finalAccumPtr = thisAccumPtr;
+            continue;
+        }
+        if (finalAccumPtr == thisAccumPtr) {
+            continue;
+        }
+        if (mtls->logReduceAccum) {
+            FormatBuf fmt;
+            REDUCE_NEW_ALOGV("launchReduceNewParallel(%p): accumulating into%s",
+                             mtls->accumFunc,
+                             format_bytes(&fmt, finalAccumPtr, mtls->accumSize));
+            REDUCE_NEW_ALOGV("launchReduceNewParallel(%p): accumulator[%u]%s",
+                             mtls->accumFunc, idx,
+                             format_bytes(&fmt, thisAccumPtr, mtls->accumSize));
+        }
+        // combFunc is non-null here: the guard at the top of the function
+        // sends combiner-less kernels down the serial path.
+        mtls->combFunc(finalAccumPtr, thisAccumPtr);
+    }
+    rsAssert(finalAccumPtr != nullptr);
+    if (mtls->logReduceAccum) {
+        FormatBuf fmt;
+        REDUCE_NEW_ALOGV("launchReduceNewParallel(%p): final accumulator%s",
+                         mtls->accumFunc, format_bytes(&fmt, finalAccumPtr, mtls->accumSize));
+    }
+
+    // Outconvert
+    if (mtls->outFunc) {
+        mtls->outFunc(mtls->redp.outPtr[0], finalAccumPtr);
+        if (mtls->logReduceAccum) {
+            FormatBuf fmt;
+            REDUCE_NEW_ALOGV("launchReduceNewParallel(%p): final outconverted result%s",
+                             mtls->accumFunc,
+                             format_bytes(&fmt, mtls->redp.outPtr[0], mtls->redp.outStride[0]));
+        }
+    }
+
+    // Clean up; clear the pointers so the launch struct holds no dangling references.
+    free(mtls->accumPtr);
+    free(mtls->accumAlloc);
+    mtls->accumPtr = nullptr;
+    mtls->accumAlloc = nullptr;
+}
+
+
void RsdCpuReferenceImpl::launchForEach(const Allocation ** ains,
uint32_t inLen,
Allocation* aout,
@@ -537,9 +764,9 @@ void RsdCpuReferenceImpl::launchForEach(const Allocation ** ains,
(mtls->start.array[2] != mtls->end.array[2]) ||
(mtls->start.array[3] != mtls->end.array[3]);
- if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) {
+ if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInKernel) {
const size_t targetByteChunk = 16 * 1024;
- mInForEach = true;
+ mInKernel = true; // NOTE: The guard immediately above ensures this was !mInKernel
if (outerDims) {
// No fancy logic for chunk size
@@ -588,9 +815,9 @@ void RsdCpuReferenceImpl::launchForEach(const Allocation ** ains,
mtls->mSliceSize = 1;
}
- launchThreads(walk_1d, mtls);
+ launchThreads(walk_1d_foreach, mtls);
}
- mInForEach = false;
+ mInKernel = false;
} else {
ForEachFunc_t fn = mtls->kernel;
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
index 939b7ae2..c2a08640 100644
--- a/cpu_ref/rsCpuCore.h
+++ b/cpu_ref/rsCpuCore.h
@@ -34,6 +34,7 @@ extern bool gArchUseSIMD;
// Function types found in RenderScript code
typedef void (*ReduceFunc_t)(const uint8_t *inBuf, uint8_t *outBuf, uint32_t len);
typedef void (*ReduceNewAccumulatorFunc_t)(const RsExpandKernelDriverInfo *info, uint32_t x1, uint32_t x2, uint8_t *accum);
+typedef void (*ReduceNewCombinerFunc_t)(uint8_t *accum, const uint8_t *other);
typedef void (*ReduceNewInitializerFunc_t)(uint8_t *accum);
typedef void (*ReduceNewOutConverterFunc_t)(uint8_t *out, const uint8_t *accum);
typedef void (*ForEachFunc_t)(const RsExpandKernelDriverInfo *info, uint32_t x1, uint32_t x2, uint32_t outStride);
@@ -44,6 +45,7 @@ typedef int (*RootFunc_t)(void);
struct ReduceNewDescription {
ReduceNewAccumulatorFunc_t accumFunc; // expanded accumulator function
ReduceNewInitializerFunc_t initFunc; // user initializer function
+ ReduceNewCombinerFunc_t combFunc; // user combiner function
ReduceNewOutConverterFunc_t outFunc; // user outconverter function
size_t accumSize; // accumulator datum size, in bytes
};
@@ -73,7 +75,8 @@ struct MTLaunchStructCommon {
RsLaunchDimensions start;
RsLaunchDimensions end;
// Points to MTLaunchStructForEach::fep::dim or
- // MTLaunchStructReduce::inputDim.
+ // MTLaunchStructReduce::inputDim or
+ // MTLaunchStructReduceNew::redp::dim.
RsLaunchDimensions *dimPtr;
};
@@ -101,9 +104,51 @@ struct MTLaunchStructReduceNew : public MTLaunchStructCommon {
ReduceNewAccumulatorFunc_t accumFunc;
ReduceNewInitializerFunc_t initFunc;
+ ReduceNewCombinerFunc_t combFunc;
ReduceNewOutConverterFunc_t outFunc;
size_t accumSize; // accumulator datum size in bytes
+
+ size_t accumStride; // stride between accumulators in accumAlloc (below)
+
+ // These fields are used for managing accumulator data items in a
+ // multithreaded execution.
+ //
+ // Let the number of threads be N.
+ // Let Outc be true iff there is an outconverter.
+ //
+ // accumAlloc is a pointer to a single allocation of (N - !Outc)
+ // accumulators. (If there is no outconverter, then the output
+ // allocation acts as an accumulator.) It is created at kernel
+ // launch time. Within that allocation, the distance between the
+ // start of adjacent accumulators is accumStride bytes -- this
+ // might be the same as accumSize, or it might be larger, if we
+ // are attempting to avoid false sharing.
+ //
+ // accumCount is an atomic counter of how many accumulators have
+ // been grabbed by threads. It is initialized to zero at kernel
+ // launch time. See accumPtr for further description.
+ //
+ // accumPtr is a pointer to an array of N pointers to accumulators.
+ // The array is created at kernel launch time, and each element is
+ // initialized to nullptr. When a particular thread goes to work,
+ // that thread obtains its accumulator from its entry in this
+ // array. If the entry is nullptr, that thread needs to obtain an
+ // accumulator, and initialize its entry in the array accordingly.
+ // It does so via atomic access (fetch-and-add) to accumCount.
+ // - If Outc, then the fetched value is used as an index into
+ // accumAlloc.
+ // - If !Outc, then
+ // - If the fetched value is zero, then this thread gets the
+ // output allocation for its accumulator.
+ // - If the fetched value is nonzero, then (fetched value - 1)
+ // is used as an index into accumAlloc.
+ uint8_t *accumAlloc;
+ uint8_t **accumPtr;
+ uint32_t accumCount;
+
+ // Logging control
+ bool logReduceAccum;
};
class RsdCpuReferenceImpl : public RsdCpuReference {
@@ -161,7 +206,7 @@ public:
virtual const char *getBccPluginName() const {
return mBccPluginName.string();
}
- bool getInForEach() override { return mInForEach; }
+ bool getInKernel() override { return mInKernel; }
// Set to true if we should embed global variable information in the code.
void setEmbedGlobalInfo(bool v) override {
@@ -190,7 +235,7 @@ protected:
uint32_t version_major;
uint32_t version_minor;
//bool mHasGraphics;
- bool mInForEach;
+ bool mInKernel; // Is a parallel kernel execution underway?
struct Workers {
volatile int mRunningCount;
@@ -222,6 +267,14 @@ protected:
// when potentially embedding information about globals.
// Defaults to true.
bool mEmbedGlobalInfoSkipConstant;
+
+ long mPageSize;
+
+ // Launch a general reduce kernel
+ void launchReduceNewSerial(const Allocation ** ains, uint32_t inLen, Allocation *aout,
+ MTLaunchStructReduceNew *mtls);
+ void launchReduceNewParallel(const Allocation ** ains, uint32_t inLen, Allocation *aout,
+ MTLaunchStructReduceNew *mtls);
};
diff --git a/cpu_ref/rsCpuExecutable.cpp b/cpu_ref/rsCpuExecutable.cpp
index 34a6b20c..9d6e6236 100644
--- a/cpu_ref/rsCpuExecutable.cpp
+++ b/cpu_ref/rsCpuExecutable.cpp
@@ -538,8 +538,8 @@ ScriptExecutable* ScriptExecutable::createFromSharedObject(
goto error;
}
- // The current implementation does not use the signature,
- // reduce name, or combiner.
+ // The current implementation does not use the signature
+ // or reduce name.
reduceNewDescriptions[i].accumSize = tmpSize;
@@ -565,6 +565,19 @@ ScriptExecutable* ScriptExecutable::createFromSharedObject(
goto error;
}
+ // Process the (optional) combiner.
+ if (strcmp(tmpNameCombiner, kNoName)) {
+ // Lookup the original user-written combiner.
+ if (!(reduceNewDescriptions[i].combFunc =
+ (ReduceNewCombinerFunc_t) dlsym(sharedObj, tmpNameCombiner))) {
+ ALOGE("Failed to find combiner function address for %s(): %s",
+ tmpNameCombiner, dlerror());
+ goto error;
+ }
+ } else {
+ reduceNewDescriptions[i].combFunc = nullptr;
+ }
+
// Process the (optional) outconverter.
if (strcmp(tmpNameOutConverter, kNoName)) {
// Lookup the original user-written outconverter.
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index ef738d72..a88af2fe 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -645,9 +645,9 @@ bool RsdCpuScriptImpl::reduceNewMtlsSetup(const Allocation ** ains,
mtls->rs = mCtx;
- // Currently not threaded.
- mtls->isThreadable = false;
- mtls->mSliceNum = -1;
+ mtls->mSliceNum = 0;
+ mtls->mSliceSize = 1;
+ mtls->isThreadable = mIsThreadable;
// Set up output,
mtls->redp.outLen = 1;
@@ -842,6 +842,7 @@ void RsdCpuScriptImpl::reduceNewKernelSetup(uint32_t slot, MTLaunchStructReduceN
const ReduceNewDescription *desc = mScriptExec->getReduceNewDescription(slot);
mtls->accumFunc = desc->accumFunc;
mtls->initFunc = desc->initFunc; // might legally be nullptr
+ mtls->combFunc = desc->combFunc; // might legally be nullptr
mtls->outFunc = desc->outFunc; // might legally be nullptr
mtls->accumSize = desc->accumSize;
diff --git a/cpu_ref/rsd_cpu.h b/cpu_ref/rsd_cpu.h
index 49a999db..e226b934 100644
--- a/cpu_ref/rsd_cpu.h
+++ b/cpu_ref/rsd_cpu.h
@@ -131,7 +131,7 @@ public:
uint32_t flags) = 0;
virtual CpuScript * createIntrinsic(const Script *s, RsScriptIntrinsicID iid, Element *e) = 0;
virtual void* createScriptGroup(const ScriptGroupBase *sg) = 0;
- virtual bool getInForEach() = 0;
+ virtual bool getInKernel() = 0; // Is a parallel kernel execution underway?
// Set to true if we should embed global variable information in the code.
virtual void setEmbedGlobalInfo(bool v) = 0;
diff --git a/driver/rsdRuntimeStubs.cpp b/driver/rsdRuntimeStubs.cpp
index c7b88962..10775407 100644
--- a/driver/rsdRuntimeStubs.cpp
+++ b/driver/rsdRuntimeStubs.cpp
@@ -136,7 +136,7 @@ static bool failIfInKernel(Context *rsc, const char *funcName) {
RsdHal *dc = (RsdHal *)rsc->mHal.drv;
RsdCpuReference *impl = (RsdCpuReference *) dc->mCpuRef;
- if (impl->getInForEach()) {
+ if (impl->getInKernel()) {
char buf[256];
snprintf(buf, sizeof(buf), "Error: Call to unsupported function %s "
"in kernel", funcName);
diff --git a/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce.java b/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce.java
index 608de473..c1e9c408 100644
--- a/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce.java
+++ b/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce.java
@@ -119,39 +119,6 @@ public class UT_reduce extends UnitTest {
///////////////////////////////////////////////////////////////////
- private float dp(float[] input1, float[] input2) {
- _RS_ASSERT("dp input length mismatch", input1.length == input2.length);
-
- float rslt = 0;
- for (int idx = 0; idx < input1.length; ++idx)
- rslt += input1[idx] * input2[idx];
- return rslt;
- }
-
- private boolean dp(RenderScript RS, ScriptC_reduce s) {
- final float[] input1 = createInputArrayFloat(100000, 2);
- final float[] input2 = createInputArrayFloat(100000, 3);
-
- final float javaRslt = dp(input1, input2);
- final float rsRslt = s.reduce_dp(input1, input2).get();
-
- // NOTE: Using a floating point equality check to test for
- // correctness -- as we do below -- is a bad idea. It's only
- // reliable if the Java and RenderScript implementation of dp
- // use the same algorithm. Equality could be broken by
- // different optimizations between the two, or running the
- // RenderScript algorithm multithreaded, or running the
- // RenderScript algorithm on a GPU rather than the CPU.
- //
- // Should we be checking instead that the results are
- // "sufficiently close"? Cooking the input set to try to
- // ensure a deterministic result? Changing to integers
- // instead?
- return result("dp", javaRslt, rsRslt);
- }
-
- ///////////////////////////////////////////////////////////////////
-
private Int2 findMinAndMax(float[] input) {
float minVal = Float.POSITIVE_INFINITY;
int minIdx = -1;
@@ -322,7 +289,6 @@ public class UT_reduce extends UnitTest {
boolean pass = true;
pass &= addint1D(pRS, s);
pass &= addint2D(pRS, s);
- pass &= dp(pRS, s);
pass &= findMinAndMax(pRS, s);
pass &= fz(pRS, s);
pass &= fz2(pRS, s);
diff --git a/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce_backward.java b/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce_backward.java
index 84d2c505..b998f518 100644
--- a/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce_backward.java
+++ b/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce_backward.java
@@ -119,39 +119,6 @@ public class UT_reduce_backward extends UnitTest {
///////////////////////////////////////////////////////////////////
- private float dp(float[] input1, float[] input2) {
- _RS_ASSERT("dp input length mismatch", input1.length == input2.length);
-
- float rslt = 0;
- for (int idx = 0; idx < input1.length; ++idx)
- rslt += input1[idx] * input2[idx];
- return rslt;
- }
-
- private boolean dp(RenderScript RS, ScriptC_reduce_backward s) {
- final float[] input1 = createInputArrayFloat(100000, 2);
- final float[] input2 = createInputArrayFloat(100000, 3);
-
- final float javaRslt = dp(input1, input2);
- final float rsRslt = s.reduce_dp(input1, input2).get();
-
- // NOTE: Using a floating point equality check to test for
- // correctness -- as we do below -- is a bad idea. It's only
- // reliable if the Java and RenderScript implementation of dp
- // use the same algorithm. Equality could be broken by
- // different optimizations between the two, or running the
- // RenderScript algorithm multithreaded, or running the
- // RenderScript algorithm on a GPU rather than the CPU.
- //
- // Should we be checking instead that the results are
- // "sufficiently close"? Cooking the input set to try to
- // ensure a deterministic result? Changing to integers
- // instead?
- return result("dp", javaRslt, rsRslt);
- }
-
- ///////////////////////////////////////////////////////////////////
-
private Int2 findMinAndMax(float[] input) {
float minVal = Float.POSITIVE_INFINITY;
int minIdx = -1;
@@ -322,7 +289,6 @@ public class UT_reduce_backward extends UnitTest {
boolean pass = true;
pass &= addint1D(pRS, s);
pass &= addint2D(pRS, s);
- pass &= dp(pRS, s);
pass &= findMinAndMax(pRS, s);
pass &= fz(pRS, s);
pass &= fz2(pRS, s);
diff --git a/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce.rs b/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce.rs
index be09dfb6..ec7be8b7 100644
--- a/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce.rs
+++ b/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce.rs
@@ -16,18 +16,6 @@ static void aiAccum(int *accum, int val) { *accum += val; }
/////////////////////////////////////////////////////////////////////////
-#pragma rs reduce(dp) \
- accumulator(dpAccum) combiner(dpSum)
-
-static void dpAccum(float *accum, float in1, float in2) {
- *accum += in1*in2;
-}
-
-// combiner function
-static void dpSum(float *accum, const float *val) { *accum += *val; }
-
-/////////////////////////////////////////////////////////////////////////
-
#pragma rs reduce(findMinAndMax) \
initializer(fMMInit) accumulator(fMMAccumulator) \
combiner(fMMCombiner) outconverter(fMMOutConverter)
@@ -61,8 +49,10 @@ static void fMMAccumulator(MinAndMax *accum, float in, int x) {
static void fMMCombiner(MinAndMax *accum,
const MinAndMax *val) {
- fMMAccumulator(accum, val->min.val, val->min.idx);
- fMMAccumulator(accum, val->max.val, val->max.idx);
+ if (val->min.val < accum->min.val)
+ accum->min = val->min;
+ if (val->max.val > accum->max.val)
+ accum->max = val->max;
}
static void fMMOutConverter(int2 *result,
diff --git a/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce_backward.rs b/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce_backward.rs
index 419e7090..41252c8a 100644
--- a/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce_backward.rs
+++ b/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce_backward.rs
@@ -15,18 +15,6 @@ static void aiAccum(int *accum, int val) { *accum += val; }
/////////////////////////////////////////////////////////////////////////
-static void dpAccum(float *accum, float in1, float in2) {
- *accum += in1*in2;
-}
-
-// combiner function
-static void dpSum(float *accum, const float *val) { *accum += *val; }
-
-#pragma rs reduce(dp) \
- accumulator(dpAccum) combiner(dpSum)
-
-/////////////////////////////////////////////////////////////////////////
-
typedef struct {
float val;
int idx;
@@ -56,8 +44,10 @@ static void fMMAccumulator(MinAndMax *accum, float in, int x) {
static void fMMCombiner(MinAndMax *accum,
const MinAndMax *val) {
- fMMAccumulator(accum, val->min.val, val->min.idx);
- fMMAccumulator(accum, val->max.val, val->max.idx);
+ if (val->min.val < accum->min.val)
+ accum->min = val->min;
+ if (val->max.val > accum->max.val)
+ accum->max = val->max;
}
static void fMMOutConverter(int2 *result,
diff --git a/java/tests/RsTest/AndroidManifest.xml b/java/tests/RsTest/AndroidManifest.xml
index b660398d..31da896a 100644
--- a/java/tests/RsTest/AndroidManifest.xml
+++ b/java/tests/RsTest/AndroidManifest.xml
@@ -2,6 +2,7 @@
<manifest xmlns:android="http://schemas.android.com/apk/res/android"
package="com.android.rs.test">
<application
+ android:largeHeap="true"
android:label="_RS_Test"
android:icon="@drawable/test_pattern">
<activity android:name="RSTest"
diff --git a/java/tests/RsTest/src/com/android/rs/test/UT_reduce.java b/java/tests/RsTest/src/com/android/rs/test/UT_reduce.java
index a244646c..07692595 100644
--- a/java/tests/RsTest/src/com/android/rs/test/UT_reduce.java
+++ b/java/tests/RsTest/src/com/android/rs/test/UT_reduce.java
@@ -27,6 +27,7 @@ import android.content.res.Resources;
import android.renderscript.*;
import android.util.Log;
import java.lang.Float;
+import java.util.Arrays;
import java.util.Random;
public class UT_reduce extends UnitTest {
@@ -36,6 +37,81 @@ public class UT_reduce extends UnitTest {
super(rstc, "reduce", ctx);
}
+ // Holds the timestamps and input sizes recorded for one test and renders
+ // them as a human-readable suffix for the test's log line. Every field
+ // starts at -1, meaning "not recorded"; string() omits whatever is missing.
+ private static class timing {
+ // Records a full set of timestamps plus the input allocations.
+ // inputBytes becomes the sum of getBytesSize() over all inputs, and
+ // inputCells is the cell count of the first input's type (0 if no inputs).
+ timing(long myJavaStart, long myJavaEnd, long myRsStart,
+ long myCopyStart, long myKernelStart, long myRsEnd,
+ Allocation... myInputs) {
+ javaStart = myJavaStart;
+ javaEnd = myJavaEnd;
+ rsStart = myRsStart;
+ copyStart = myCopyStart;
+ kernelStart = myKernelStart;
+ rsEnd = myRsEnd;
+
+ inputBytes = 0;
+ for (Allocation input : myInputs)
+ inputBytes += input.getBytesSize();
+
+ inputCells = (myInputs.length > 0) ? myInputs[0].getType().getCount() : 0;
+ }
+
+ // Records only the number of input cells; all timestamps stay at -1.
+ timing(long myInputCells) {
+ inputCells = myInputCells;
+ }
+
+ // Timestamps; string() labels the differences between them as "ms".
+ private long javaStart = -1;
+ private long javaEnd = -1;
+ private long rsStart = -1;
+ private long copyStart = -1;
+ private long kernelStart = -1;
+ private long rsEnd = -1;
+ // Input size, in bytes and in cells.
+ private long inputBytes = -1;
+ private long inputCells = -1;
+
+ // Durations derived from the recorded timestamps.
+ public long javaTime() { return javaEnd - javaStart; }
+ public long rsTime() { return rsEnd - rsStart; }
+ public long kernelTime() { return rsEnd - kernelStart; }
+ public long overheadTime() { return kernelStart - rsStart; }
+ public long allocationTime() { return copyStart - rsStart; }
+ public long copyTime() { return kernelStart - copyStart; }
+
+ // Convenience wrapper: builds a fully-timed instance and formats it.
+ public static String string(long myJavaStart, long myJavaEnd, long myRsStart,
+ long myCopyStart, long myKernelStart, long myRsEnd,
+ Allocation... myInputs) {
+ return (new timing(myJavaStart, myJavaEnd, myRsStart,
+ myCopyStart, myKernelStart, myRsEnd, myInputs)).string();
+ }
+
+ // Convenience wrapper: builds a cells-only instance and formats it.
+ public static String string(long myInputCells) {
+ return (new timing(myInputCells)).string();
+ }
+
+ // Formats the recorded data. The timing part is emitted only when
+ // javaStart was recorded (>= 0); the size part only when inputCells > 0,
+ // with the byte count appended only when inputBytes > 0. Returns "" when
+ // neither part applies.
+ public String string() {
+ String result;
+ if (javaStart >= 0) {
+ result = "(java " + javaTime() + "ms, rs " + rsTime() + "ms = overhead " +
+ overheadTime() + "ms (alloc " + allocationTime() + "ms + copy " +
+ copyTime() + "ms) + kernel+get() " + kernelTime() + "ms)";
+ // Separate the timing part from the size part that follows.
+ if (inputCells > 0)
+ result += " ";
+ } else {
+ result = "";
+ }
+ if (inputCells > 0) {
+ result += "(" + fmt.format(inputCells) + " cells";
+ if (inputBytes > 0)
+ result += ", " + fmt.format(inputBytes) + " bytes";
+ result += ")";
+ }
+ return result;
+ }
+
+ // Shared formatter that groups digits in threes for the cell/byte counts.
+ private static java.text.DecimalFormat fmt;
+ static {
+ fmt = new java.text.DecimalFormat("###,###");
+ }
+ };
+
private byte[] createInputArrayByte(int len, int seed) {
byte[] array = new byte[len];
(new Random(seed)).nextBytes(array);
@@ -66,21 +142,60 @@ public class UT_reduce extends UnitTest {
return array;
}
- private <T extends Number> boolean result(String testName, T javaRslt, T rsRslt) {
+ private <T extends Number> boolean result(String testName, final timing t,
+ T javaRslt, T rsRslt) {
final boolean success = javaRslt.equals(rsRslt);
+ String status = (success ? "PASSED" : "FAILED");
+ if (success && (t != null))
+ status += " " + t.string();
+ Log.i(TAG, testName + ": java " + javaRslt + ", rs " + rsRslt + ": " + status);
+ return success;
+ }
+
+ private boolean result(String testName, final timing t,
+ final long[] javaRslt, final long[] rsRslt) {
+ if (javaRslt.length != rsRslt.length) {
+ Log.i(TAG, testName + ": java length " + javaRslt.length +
+ ", rs length " + rsRslt.length + ": FAILED");
+ return false;
+ }
+ for (int i = 0; i < javaRslt.length; ++i) {
+ if (javaRslt[i] != rsRslt[i]) {
+ Log.i(TAG, testName + "[" + i + "]: java " + javaRslt[i] +
+ ", rs " + rsRslt[i] + ": FAILED");
+ return false;
+ }
+ }
+ String status = "PASSED";
+ if (t != null)
+ status += " " + t.string();
+ Log.i(TAG, testName + ": " + status);
+ return true;
+ }
+
+ private boolean result(String testName, final timing t, Int2 javaRslt, Int2 rsRslt) {
+ final boolean success = (javaRslt.x == rsRslt.x) && (javaRslt.y == rsRslt.y);
+ String status = (success ? "PASSED" : "FAILED");
+ if (success && (t != null))
+ status += " " + t.string();
Log.i(TAG,
- testName + ": java " + javaRslt + ", rs " + rsRslt + ": " +
- (success ? "PASSED" : "FAILED"));
+ testName +
+ ": java (" + javaRslt.x + ", " + javaRslt.y + ")" +
+ ", rs (" + rsRslt.x + ", " + rsRslt.y + ")" +
+ ": " + status);
return success;
}
- private boolean result(String testName, Int2 javaRslt, Int2 rsRslt) {
+ private boolean result(String testName, final timing t, Float2 javaRslt, Float2 rsRslt) {
final boolean success = (javaRslt.x == rsRslt.x) && (javaRslt.y == rsRslt.y);
+ String status = (success ? "PASSED" : "FAILED");
+ if (success && (t != null))
+ status += " " + t.string();
Log.i(TAG,
testName +
": java (" + javaRslt.x + ", " + javaRslt.y + ")" +
", rs (" + rsRslt.x + ", " + rsRslt.y + ")" +
- ": " + (success ? "PASSED" : "FAILED"));
+ ": " + status);
return success;
}
@@ -93,61 +208,68 @@ public class UT_reduce extends UnitTest {
return rslt;
}
- private boolean addint1D(RenderScript RS, ScriptC_reduce s) {
- final int[] input = createInputArrayInt(100000, 0, 1 << 13);
+ private boolean addint1D_array(RenderScript RS, ScriptC_reduce s, int seed, int[] size) {
+ final int[] input = createInputArrayInt(size[0], seed, Integer.MAX_VALUE / size[0]);
final int javaRslt = addint(input);
final int rsRslt = s.reduce_addint(input).get();
- return result("addint1D", javaRslt, rsRslt);
+ return result("addint1D_array", new timing(size[0]), javaRslt, rsRslt);
}
- private boolean addint2D(RenderScript RS, ScriptC_reduce s) {
- final int dimX = 450, dimY = 225;
-
- final int[] inputArray = createInputArrayInt(dimX * dimY, 1, 1 << 13);
- Type.Builder typeBuilder = new Type.Builder(RS, Element.I32(RS));
- typeBuilder.setX(dimX).setY(dimY);
- Allocation inputAllocation = Allocation.createTyped(RS, typeBuilder.create());
- inputAllocation.copy2DRangeFrom(0, 0, dimX, dimY, inputArray);
+ private boolean addint1D(RenderScript RS, ScriptC_reduce s, int seed, int[] size) {
+ final int[] inputArray = createInputArrayInt(size[0], seed, Integer.MAX_VALUE / size[0]);
+ final long javaTimeStart = java.lang.System.currentTimeMillis();
final int javaRslt = addint(inputArray);
+ final long javaTimeEnd = java.lang.System.currentTimeMillis();
+
+ final long rsTimeStart = java.lang.System.currentTimeMillis();
+
+ Allocation inputAllocation = Allocation.createSized(RS, Element.I32(RS), inputArray.length);
+
+ final long copyTimeStart = java.lang.System.currentTimeMillis();
+
+ inputAllocation.copyFrom(inputArray);
+
+ final long kernelTimeStart = java.lang.System.currentTimeMillis();
final int rsRslt = s.reduce_addint(inputAllocation).get();
+ final long rsTimeEnd = java.lang.System.currentTimeMillis();
- return result("addint2D", javaRslt, rsRslt);
+ return result("addint1D",
+ new timing(javaTimeStart, javaTimeEnd, rsTimeStart,
+ copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation),
+ javaRslt, rsRslt);
}
- ///////////////////////////////////////////////////////////////////
+ private boolean addint2D(RenderScript RS, ScriptC_reduce s, int seed, int[] size) {
+ final int dimX = size[0];
+ final int dimY = size[1];
- private float dp(float[] input1, float[] input2) {
- _RS_ASSERT("dp input length mismatch", input1.length == input2.length);
+ final int[] inputArray = createInputArrayInt(dimX * dimY, seed, Integer.MAX_VALUE / (dimX * dimY));
- float rslt = 0;
- for (int idx = 0; idx < input1.length; ++idx)
- rslt += input1[idx] * input2[idx];
- return rslt;
- }
+ final long javaTimeStart = java.lang.System.currentTimeMillis();
+ final int javaRslt = addint(inputArray);
+ final long javaTimeEnd = java.lang.System.currentTimeMillis();
+
+ final long rsTimeStart = java.lang.System.currentTimeMillis();
- private boolean dp(RenderScript RS, ScriptC_reduce s) {
- final float[] input1 = createInputArrayFloat(100000, 2);
- final float[] input2 = createInputArrayFloat(100000, 3);
+ Type.Builder typeBuilder = new Type.Builder(RS, Element.I32(RS));
+ typeBuilder.setX(dimX).setY(dimY);
+ Allocation inputAllocation = Allocation.createTyped(RS, typeBuilder.create());
- final float javaRslt = dp(input1, input2);
- final float rsRslt = s.reduce_dp(input1, input2).get();
+ final long copyTimeStart = java.lang.System.currentTimeMillis();
- // NOTE: Using a floating point equality check to test for
- // correctness -- as we do below -- is a bad idea. It's only
- // reliable if the Java and RenderScript implementation of dp
- // use the same algorithm. Equality could be broken by
- // different optimizations between the two, or running the
- // RenderScript algorithm multithreaded, or running the
- // RenderScript algorithm on a GPU rather than the CPU.
- //
- // Should we be checking instead that the results are
- // "sufficiently close"? Cooking the input set to try to
- // ensure a deterministic result? Changing to integers
- // instead?
- return result("dp", javaRslt, rsRslt);
+ inputAllocation.copy2DRangeFrom(0, 0, dimX, dimY, inputArray);
+
+ final long kernelTimeStart = java.lang.System.currentTimeMillis();
+ final int rsRslt = s.reduce_addint(inputAllocation).get();
+ final long rsTimeEnd = java.lang.System.currentTimeMillis();
+
+ return result("addint2D",
+ new timing(javaTimeStart, javaTimeEnd, rsTimeStart,
+ copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation),
+ javaRslt, rsRslt);
}
///////////////////////////////////////////////////////////////////
@@ -172,79 +294,195 @@ public class UT_reduce extends UnitTest {
return new Int2(minIdx, maxIdx);
}
- private boolean findMinAndMax(RenderScript RS, ScriptC_reduce s) {
- final float[] input = createInputArrayFloat(100000, 4);
+ private boolean findMinAndMax_array(RenderScript RS, ScriptC_reduce s, int seed, int[] size) {
+ final float[] input = createInputArrayFloat(size[0], seed);
final Int2 javaRslt = findMinAndMax(input);
final Int2 rsRslt = s.reduce_findMinAndMax(input).get();
- return result("findMinAndMax", javaRslt, rsRslt);
+ // Note that the Java and RenderScript algorithms are not
+ // guaranteed to find the same cells -- but they should
+ // find cells of the same value.
+ final Float2 javaVal = new Float2(input[javaRslt.x], input[javaRslt.y]);
+ final Float2 rsVal = new Float2(input[rsRslt.x], input[rsRslt.y]);
+
+ return result("findMinAndMax_array", new timing(size[0]), javaVal, rsVal);
+ }
+
+ private boolean findMinAndMax(RenderScript RS, ScriptC_reduce s, int seed, int[] size) {
+ final float[] inputArray = createInputArrayFloat(size[0], seed);
+
+ final long javaTimeStart = java.lang.System.currentTimeMillis();
+ final Int2 javaRslt = findMinAndMax(inputArray);
+ final long javaTimeEnd = java.lang.System.currentTimeMillis();
+
+ final long rsTimeStart = java.lang.System.currentTimeMillis();
+
+ Allocation inputAllocation = Allocation.createSized(RS, Element.F32(RS), inputArray.length);
+
+ final long copyTimeStart = java.lang.System.currentTimeMillis();
+
+ inputAllocation.copyFrom(inputArray);
+
+ final long kernelTimeStart = java.lang.System.currentTimeMillis();
+ final Int2 rsRslt = s.reduce_findMinAndMax(inputAllocation).get();
+ final long rsTimeEnd = java.lang.System.currentTimeMillis();
+
+ // Note that the Java and RenderScript algorithms are not
+ // guaranteed to find the same cells -- but they should
+ // find cells of the same value.
+ final Float2 javaVal = new Float2(inputArray[javaRslt.x], inputArray[javaRslt.y]);
+ final Float2 rsVal = new Float2(inputArray[rsRslt.x], inputArray[rsRslt.y]);
+
+ return result("findMinAndMax",
+ new timing(javaTimeStart, javaTimeEnd, rsTimeStart,
+ copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation),
+ javaVal, rsVal);
}
///////////////////////////////////////////////////////////////////
- private boolean fz(RenderScript RS, ScriptC_reduce s) {
- final int inputLen = 100000;
- int[] input = createInputArrayInt(inputLen, 5);
+ // Java reference for fz: index of the first zero cell in input, or -1 if there is none.
+ private int fz(final int[] input) {
+ int idx = 0;
+ while ((idx < input.length) && (input[idx] != 0))
+ ++idx;
+ return (idx < input.length) ? idx : -1;
+ }
+ private boolean fz_array(RenderScript RS, ScriptC_reduce s, int seed, int size[]) {
+ final int inputLen = size[0];
+ int[] input = createInputArrayInt(inputLen, seed+0);
// just in case we got unlucky
- input[(new Random(6)).nextInt(inputLen)] = 0;
+ input[(new Random(seed+1)).nextInt(inputLen)] = 0;
final int rsRslt = s.reduce_fz(input).get();
final boolean success = (input[rsRslt] == 0);
Log.i(TAG,
- "fz: input[" + rsRslt + "] == " + input[rsRslt] + ": " +
- (success ? "PASSED" : "FAILED"));
+ "fz_array: input[" + rsRslt + "] == " + input[rsRslt] + ": " +
+ (success ? "PASSED " + timing.string(size[0]) : "FAILED"));
+ return success;
+ }
+ // Allocation-input variant of the fz ("find a zero") test, with timing of the Java and RenderScript paths.
+ private boolean fz(RenderScript RS, ScriptC_reduce s, int seed, int size[]) {
+ final int inputLen = size[0];
+ int[] inputArray = createInputArrayInt(inputLen, seed+0);
+ // just in case we got unlucky
+ inputArray[(new Random(seed+1)).nextInt(inputLen)] = 0;
+ // Java reference result (index of the first zero), timed on its own.
+ final long javaTimeStart = java.lang.System.currentTimeMillis();
+ final int javaRslt = fz(inputArray);
+ final long javaTimeEnd = java.lang.System.currentTimeMillis();
+
+ final long rsTimeStart = java.lang.System.currentTimeMillis();
+
+ Allocation inputAllocation = Allocation.createSized(RS, Element.I32(RS), inputArray.length);
+
+ final long copyTimeStart = java.lang.System.currentTimeMillis();
+
+ inputAllocation.copyFrom(inputArray);
+
+ final long kernelTimeStart = java.lang.System.currentTimeMillis();
+ final int rsRslt = s.reduce_fz(inputAllocation).get();
+ final long rsTimeEnd = java.lang.System.currentTimeMillis();
+ // Pass/fail looks only at the cell the reduction reported, not at whether it equals javaRslt.
+ final boolean success = (inputArray[rsRslt] == 0);
+ String status = (success ? "PASSED" : "FAILED");
+ if (success)
+ status += " " + timing.string(javaTimeStart, javaTimeEnd, rsTimeStart,
+ copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation);
+ Log.i(TAG,
+ "fz: java input[" + javaRslt + "] == " + inputArray[javaRslt] +
+ ", rs input[" + rsRslt + "] == " + inputArray[rsRslt] + ": " + status);
 return success;
 }
///////////////////////////////////////////////////////////////////
- private boolean fz2(RenderScript RS, ScriptC_reduce s) {
- final int dimX = 225, dimY = 450;
+ private boolean fz2(RenderScript RS, ScriptC_reduce s, int seed, int size[]) {
+ final int dimX = size[0], dimY = size[1];
final int inputLen = dimX * dimY;
- int[] inputArray = createInputArrayInt(inputLen, 7);
+ int[] inputArray = createInputArrayInt(inputLen, seed+0);
// just in case we got unlucky
- inputArray[(new Random(8)).nextInt(inputLen)] = 0;
+ inputArray[(new Random(seed+1)).nextInt(inputLen)] = 0;
+
+ final long javaTimeStart = java.lang.System.currentTimeMillis();
+ final int javaRsltLinear = fz(inputArray);
+ final long javaTimeEnd = java.lang.System.currentTimeMillis();
+
+ final Int2 javaRslt = new Int2(javaRsltLinear % dimX, javaRsltLinear / dimX);
+ final int javaCellVal = inputArray[javaRslt.x + dimX * javaRslt.y];
+
+ final long rsTimeStart = java.lang.System.currentTimeMillis();
Type.Builder typeBuilder = new Type.Builder(RS, Element.I32(RS));
typeBuilder.setX(dimX).setY(dimY);
Allocation inputAllocation = Allocation.createTyped(RS, typeBuilder.create());
+
+ final long copyTimeStart = java.lang.System.currentTimeMillis();
+
inputAllocation.copy2DRangeFrom(0, 0, dimX, dimY, inputArray);
+ final long kernelTimeStart = java.lang.System.currentTimeMillis();
final Int2 rsRslt = s.reduce_fz2(inputAllocation).get();
-
- final int cellVal = inputArray[rsRslt.x + dimX * rsRslt.y];
- final boolean success = (cellVal == 0);
+ final long rsTimeEnd = java.lang.System.currentTimeMillis();
+
+ final int rsCellVal = inputArray[rsRslt.x + dimX * rsRslt.y];
+ final boolean success = (rsCellVal == 0);
+ String status = (success ? "PASSED" : "FAILED");
+ if (success)
+ status += " " + timing.string(javaTimeStart, javaTimeEnd, rsTimeStart,
+ copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation);
Log.i(TAG,
- "fz2: input[" + rsRslt.x + ", " + rsRslt.y + "] == " + cellVal + ": " +
- (success ? "PASSED" : "FAILED"));
+ "fz2: java input[" + javaRslt.x + ", " + javaRslt.y + "] == " + javaCellVal +
+ ", rs input[" + rsRslt.x + ", " + rsRslt.y + "] == " + rsCellVal + ": " + status);
return success;
}
///////////////////////////////////////////////////////////////////
- private boolean fz3(RenderScript RS, ScriptC_reduce s) {
- final int dimX = 59, dimY = 48, dimZ = 37;
+ private boolean fz3(RenderScript RS, ScriptC_reduce s, int seed, int[] size) {
+ final int dimX = size[0], dimY = size[1], dimZ = size[2];
final int inputLen = dimX * dimY * dimZ;
- int[] inputArray = createInputArrayInt(inputLen, 9);
+ int[] inputArray = createInputArrayInt(inputLen, seed+0);
// just in case we got unlucky
- inputArray[(new Random(10)).nextInt(inputLen)] = 0;
+ inputArray[(new Random(seed+1)).nextInt(inputLen)] = 0;
+
+ final long javaTimeStart = java.lang.System.currentTimeMillis();
+ final int javaRsltLinear = fz(inputArray);
+ final long javaTimeEnd = java.lang.System.currentTimeMillis();
+
+ final Int3 javaRslt = new Int3(
+ javaRsltLinear % dimX,
+ (javaRsltLinear / dimX) % dimY,
+ javaRsltLinear / (dimX * dimY));
+ final int javaCellVal = inputArray[javaRslt.x + dimX * javaRslt.y + dimX * dimY * javaRslt.z];
+
+ final long rsTimeStart = java.lang.System.currentTimeMillis();
Type.Builder typeBuilder = new Type.Builder(RS, Element.I32(RS));
typeBuilder.setX(dimX).setY(dimY).setZ(dimZ);
Allocation inputAllocation = Allocation.createTyped(RS, typeBuilder.create());
+
+ final long copyTimeStart = java.lang.System.currentTimeMillis();
+
inputAllocation.copy3DRangeFrom(0, 0, 0, dimX, dimY, dimZ, inputArray);
+ final long kernelTimeStart = java.lang.System.currentTimeMillis();
final Int3 rsRslt = s.reduce_fz3(inputAllocation).get();
-
- final int cellVal = inputArray[rsRslt.x + dimX * rsRslt.y + dimX * dimY * rsRslt.z];
- final boolean success = (cellVal == 0);
+ final long rsTimeEnd = java.lang.System.currentTimeMillis();
+
+ final int rsCellVal = inputArray[rsRslt.x + dimX * rsRslt.y + dimX * dimY * rsRslt.z];
+ final boolean success = (rsCellVal == 0);
+ String status = (success ? "PASSED" : "FAILED");
+ if (success)
+ status += " " + timing.string(javaTimeStart, javaTimeEnd, rsTimeStart,
+ copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation);
Log.i(TAG,
- "fz3: input[" + rsRslt.x + ", " + rsRslt.y + ", " + rsRslt.z + "] == " + cellVal + ": " +
- (success ? "PASSED" : "FAILED"));
+ "fz3: java input[" + javaRslt.x + ", " + javaRslt.y + ", " + javaRslt.z + "] == " + javaCellVal +
+ ", rs input[" + rsRslt.x + ", " + rsRslt.y + ", " + rsRslt.z + "] == " + rsCellVal + ": " + status);
return success;
}
@@ -271,24 +509,43 @@ public class UT_reduce extends UnitTest {
return outputArray;
}
- private boolean histogram(RenderScript RS, ScriptC_reduce s) {
- final byte[] inputArray = createInputArrayByte(100000, 11);
+ private boolean histogram_array(RenderScript RS, ScriptC_reduce s, int seed, int size[]) {
+ final byte[] inputArray = createInputArrayByte(size[0], seed);
final long[] javaRslt = histogram(RS, inputArray);
_RS_ASSERT("javaRslt unexpected length: " + javaRslt.length, javaRslt.length == histogramBucketCount);
final long[] rsRslt = s.reduce_histogram(inputArray).get();
_RS_ASSERT("rsRslt unexpected length: " + rsRslt.length, rsRslt.length == histogramBucketCount);
- for (int i = 0; i < histogramBucketCount; ++i) {
- if (javaRslt[i] != rsRslt[i]) {
- Log.i(TAG,
- "histogram[" + i + "]: java " + javaRslt[i] + ", rs " + rsRslt[i] + ": FAILED");
- return false;
- }
- }
+ return result("histogram_array", new timing(size[0]), javaRslt, rsRslt);
+ }
- Log.i(TAG, "histogram: PASSED");
- return true;
+ private boolean histogram(RenderScript RS, ScriptC_reduce s, int seed, int size[]) {
+ final byte[] inputArray = createInputArrayByte(size[0], seed);
+
+ final long javaTimeStart = java.lang.System.currentTimeMillis();
+ final long[] javaRslt = histogram(RS, inputArray);
+ final long javaTimeEnd = java.lang.System.currentTimeMillis();
+ _RS_ASSERT("javaRslt unexpected length: " + javaRslt.length, javaRslt.length == histogramBucketCount);
+
+ final long rsTimeStart = java.lang.System.currentTimeMillis();
+
+ Allocation inputAllocation = Allocation.createSized(RS, Element.U8(RS), inputArray.length);
+
+ final long copyTimeStart = java.lang.System.currentTimeMillis();
+
+ inputAllocation.copyFrom(inputArray);
+
+ final long kernelTimeStart = java.lang.System.currentTimeMillis();
+ final long[] rsRslt = s.reduce_histogram(inputAllocation).get();
+ final long rsTimeEnd = java.lang.System.currentTimeMillis();
+ _RS_ASSERT("rsRslt unexpected length: " + rsRslt.length, rsRslt.length == histogramBucketCount);
+
+ // NOTE: The "java time" is actually for the RenderScript histogram intrinsic
+ return result("histogram",
+ new timing(javaTimeStart, javaTimeEnd, rsTimeStart,
+ copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation),
+ javaRslt, rsRslt);
}
//-----------------------------------------------------------------
@@ -302,17 +559,250 @@ public class UT_reduce extends UnitTest {
return new Int2(modeIdx, (int)hsg[modeIdx]);
}
- private boolean mode(RenderScript RS, ScriptC_reduce s) {
- final byte[] inputArray = createInputArrayByte(100000, 12);
+ private boolean mode_array(RenderScript RS, ScriptC_reduce s, int seed, int size[]) { // array-input variant of the mode test
+ final byte[] inputArray = createInputArrayByte(size[0], seed);
 final Int2 javaRslt = mode(RS, inputArray);
 final Int2 rsRslt = s.reduce_mode(inputArray).get();
- return result("mode", javaRslt, rsRslt);
+ return result("mode_array", new timing(size[0]), javaRslt, rsRslt); // logged under this test's own name
+ }
+
+ ///////////////////////////////////////////////////////////////////
+
+ private long sumgcd(final int in1[], final int in2[]) {
+ _RS_ASSERT("sumgcd input length mismatch", in1.length == in2.length);
+
+ // Java reference for the sumgcd reduction: add up gcd(in1[i], in2[i])
+ // over all i, with each gcd computed by Euclid's algorithm.
+ long total = 0;
+ for (int idx = 0; idx < in1.length; ++idx) {
+ int x = in1[idx];
+ int y = in2[idx];
+
+ while (y != 0) {
+ final int rem = x % y;
+ x = y;
+ y = rem;
+ }
+ total += x;
+ }
+ return total;
+ }
+
+ private boolean sumgcd(RenderScript RS, ScriptC_reduce s, int seed, int size[]) {
+ final int len = size[0];
+
+ final int[] inputArrayA = createInputArrayInt(len, seed+0);
+ final int[] inputArrayB = createInputArrayInt(len, seed+1);
+
+ final long javaTimeStart = java.lang.System.currentTimeMillis();
+ final long javaRslt = sumgcd(inputArrayA, inputArrayB);
+ final long javaTimeEnd = java.lang.System.currentTimeMillis();
+
+ final long rsTimeStart = java.lang.System.currentTimeMillis();
+
+ Allocation inputAllocationA = Allocation.createSized(RS, Element.I32(RS), len);
+ Allocation inputAllocationB = Allocation.createSized(RS, Element.I32(RS), len);
+
+ final long copyTimeStart = java.lang.System.currentTimeMillis();
+
+ inputAllocationA.copyFrom(inputArrayA);
+ inputAllocationB.copyFrom(inputArrayB);
+
+ final long kernelTimeStart = java.lang.System.currentTimeMillis();
+ final long rsRslt = s.reduce_sumgcd(inputAllocationA, inputAllocationB).get();
+ final long rsTimeEnd = java.lang.System.currentTimeMillis();
+
+ return result("sumgcd",
+ new timing(javaTimeStart, javaTimeEnd, rsTimeStart, copyTimeStart, kernelTimeStart, rsTimeEnd,
+ inputAllocationA, inputAllocationB),
+ javaRslt, rsRslt);
}
///////////////////////////////////////////////////////////////////
+ public static final int maxSeedsPerTest = 10;
+
+ static interface Test {
+ // A test execution is characterized by two properties: A seed
+ // and a size.
+ //
+ // The seed is used for generating pseudorandom input data.
+ // Ideally, we use different seeds for different tests and for
+ // different executions of the same test at different sizes.
+ // A test with multiple blocks of input data (i.e., for a
+ // reduction with multiple inputs) may want multiple seeds; it
+ // may use the seeds seed..seed+maxSeedsPerTest-1.
+ //
+ // The size indicates the amount of input data. It is the number
+ // of cells in a particular dimension of the iteration space.
+ boolean run(RenderScript RS, ScriptC_reduce s, int seed, int[] size);
+ };
+
+ static class TestDescription {
+ // Full form: the test may also be run over a range of sizes.
+ public TestDescription(String myTestName, Test myTest, int mySeed, int[] myDefSize, int[] myLog2MaxSize) {
+ testName = myTestName;
+ test = myTest;
+ seed = mySeed;
+ defSize = myDefSize;
+ log2MaxSize = myLog2MaxSize;
+ }
+
+ // Short form: no maximum size is given, so log2MaxSize is null.
+ public TestDescription(String myTestName, Test myTest, int mySeed, int[] myDefSize) {
+ this(myTestName, myTest, mySeed, myDefSize, null);
+ }
+
+ // Name printed when an execution of the test is logged.
+ public final String testName;
+
+ // The test body to invoke.
+ public final Test test;
+
+ // Base seed. Callers scale this up by maxSeedsPerTest (or a
+ // larger stride) before passing it to the test.
+ public final int seed;
+
+ // If we're only going to run the test once, the size to use.
+ public final int[] defSize;
+
+ // If we're going to run the test over a range of sizes, the
+ // log2 of the maximum size to use; null if no range is defined.
+ public final int[] log2MaxSize;
+ }
+
+ private boolean run(TestDescription td, RenderScript RS, ScriptC_reduce s, int seed, int[] size) {
+ // Render size[] as "a, b, c" for the log line, then execute the test.
+ final StringBuilder dims = new StringBuilder();
+ for (int dim = 0; dim < size.length; ++dim) {
+ if (dim > 0)
+ dims.append(", ");
+ dims.append(size[dim]);
+ }
+ Log.i(TAG, "Running " + td.testName + "(seed = " + seed + ", size[] = {" + dims + "})");
+ return td.test.run(RS, s, seed, size);
+ }
+ private final TestDescription[] correctnessTests = {
+ // alloc and array variants of the same test will use the same
+ // seed, in case results need to be compared.
+ // Arguments: name, test, seed, default size[, log2 of maximum size].
+ new TestDescription("addint1D", this::addint1D, 0, new int[]{100000}, new int[]{20}),
+ new TestDescription("addint1D_array", this::addint1D_array, 0, new int[]{100000}, new int[]{20}),
+ new TestDescription("addint2D", this::addint2D, 1, new int[]{450, 225}),
+ new TestDescription("findMinAndMax", this::findMinAndMax, 3, new int[]{100000}, new int[]{20}),
+ new TestDescription("findMinAndMax_array", this::findMinAndMax_array, 3, new int[]{100000}, new int[]{20}),
+ new TestDescription("fz", this::fz, 4, new int[]{100000}, new int[]{20}),
+ new TestDescription("fz_array", this::fz_array, 4, new int[]{100000}, new int[]{20}),
+ new TestDescription("fz2", this::fz2, 5, new int[]{225, 450}),
+ new TestDescription("fz3", this::fz3, 6, new int[]{59, 48, 37}),
+ new TestDescription("histogram", this::histogram, 7, new int[]{100000}, new int[]{20}),
+ new TestDescription("histogram_array", this::histogram_array, 7, new int[]{100000}, new int[]{20}),
+ // might want to add: new TestDescription("mode", this::mode, 8, new int[]{100000}, new int[]{20}),
+ new TestDescription("mode_array", this::mode_array, 8, new int[]{100000}, new int[]{20}),
+ new TestDescription("sumgcd", this::sumgcd, 9, new int[]{1 << 16}, new int[]{20})
+ };
+
+ private boolean runCorrectnessQuick(RenderScript RS, ScriptC_reduce s) {
+ // One execution per correctness test, at its default size. Every
+ // test runs even after a failure: &= evaluates its right-hand side.
+ boolean allPassed = true;
+ for (int i = 0; i < correctnessTests.length; ++i) {
+ final TestDescription td = correctnessTests[i];
+ allPassed &= run(td, RS, s, maxSeedsPerTest * td.seed, td.defSize);
+ }
+ return allPassed;
+ }
+ private boolean runCorrectness(RenderScript RS, ScriptC_reduce s) {
+ boolean pass = true;
+
+ for (TestDescription td : correctnessTests) {
+ if (td.log2MaxSize == null) // TODO: Eventually this should never happen?
+ continue;
+
+ if (td.log2MaxSize.length == 1) {
+ final int log2MaxSize = td.log2MaxSize[0];
+ // We will execute the test with the following sizes:
+ // (a) Each power of 2 from zero (2**0) up to log2MaxSize (2**log2MaxSize)
+ // (b) Each size from (a) +/-1
+ // (c) 2 random sizes between adjacent points in (a)
+ int[] testSizes = new int[
+ /* a */ (1 + log2MaxSize) +
+ /* b */ 2*(1 + log2MaxSize) +
+ /* c */ 2*log2MaxSize];
+
+ // NOTE: Each test execution gets maxSeedsPerTest seeds,
+ // there are up to 3 + 5*log2MaxSize test executions of
+ // a test, and we need one more seed for (c). Assuming
+ // log2MaxSize does not exceed 31, it is sufficient to
+ // reserve 1 + 5*32*maxSeedsPerTest seeds per
+ // TestDescription.
+ final int seedForPickingTestSizes = td.seed * (1 + 5*32*maxSeedsPerTest);
+
+ int nextTestIdx = 0;
+
+ // Fill in (a) and (b)
+ for (int i = 0; i <= log2MaxSize; ++i) {
+ final int pwrOf2 = 1 << i;
+ testSizes[nextTestIdx++] = pwrOf2; /* a */
+ testSizes[nextTestIdx++] = pwrOf2 - 1; /* b */
+ testSizes[nextTestIdx++] = pwrOf2 + 1; /* b */
+ }
+
+ // Fill in (c)
+ Random r = new Random(seedForPickingTestSizes);
+ for (int i = 0; i < log2MaxSize; ++i) {
+ final int lo = (1 << i) + 1;
+ final int hi = 1 << (i + 1);
+
+ if (lo < hi) {
+ for (int j = 0; j < 2; ++j) {
+ testSizes[nextTestIdx++] = r.nextInt(hi - lo) + lo;
+ }
+ }
+ }
+
+ Arrays.sort(testSizes);
+
+ int[] lastTestSizeArg = new int[]{-1};
+ for (int i = 0; i < testSizes.length; ++i) {
+ if ((testSizes[i] > 0) && (testSizes[i] != lastTestSizeArg[0])) {
+ lastTestSizeArg[0] = testSizes[i];
+ final int seedForTestExecution = seedForPickingTestSizes + 1 + i*maxSeedsPerTest;
+ pass &= run(td, RS, s, seedForTestExecution, lastTestSizeArg);
+ }
+ }
+ }
+ // TODO: lengths 2 and 3, and assert otherwise
+ }
+
+ return pass;
+ }
+
+ private final TestDescription[] performanceTests = {
+ new TestDescription("addint1D", this::addint1D, 0, new int[]{100000 << 10}),
+ new TestDescription("addint2D", this::addint2D, 1, new int[]{450 << 5, 225 << 5}),
+ new TestDescription("findMinAndMax", this::findMinAndMax, 3, new int[]{100000 << 9}),
+ new TestDescription("fz", this::fz, 4, new int[]{100000 << 10}),
+ new TestDescription("fz2", this::fz2, 5, new int[]{225 << 5, 450 << 5}),
+ new TestDescription("fz3", this::fz3, 6, new int[]{59 << 3, 48 << 3, 37 << 3}),
+ new TestDescription("histogram", this::histogram, 7, new int[]{100000 << 10}),
+ // might want to add: new TestDescription("mode", this::mode, 8, new int[]{100000}),
+ new TestDescription("sumgcd", this::sumgcd, 9, new int[]{1 << 21})
+ };
+
+ private boolean runPerformanceQuick(RenderScript RS, ScriptC_reduce s) {
+ // One execution per performance test, at its default size. Every
+ // test runs even after a failure: &= evaluates its right-hand side.
+ boolean allPassed = true;
+ for (int i = 0; i < performanceTests.length; ++i) {
+ final TestDescription td = performanceTests[i];
+ allPassed &= run(td, RS, s, maxSeedsPerTest * td.seed, td.defSize);
+ }
+ return allPassed;
+ }
+
public void run() {
RenderScript pRS = RenderScript.create(mCtx);
ScriptC_reduce s = new ScriptC_reduce(pRS);
@@ -320,15 +810,10 @@ public class UT_reduce extends UnitTest {
s.set_posInf(Float.POSITIVE_INFINITY);
boolean pass = true;
- pass &= addint1D(pRS, s);
- pass &= addint2D(pRS, s);
- pass &= dp(pRS, s);
- pass &= findMinAndMax(pRS, s);
- pass &= fz(pRS, s);
- pass &= fz2(pRS, s);
- pass &= fz3(pRS, s);
- pass &= histogram(pRS, s);
- pass &= mode(pRS, s);
+
+ pass &= runCorrectnessQuick(pRS, s);
+ pass &= runCorrectness(pRS, s);
+ // pass &= runPerformanceQuick(pRS, s);
pRS.finish();
pRS.destroy();
diff --git a/java/tests/RsTest/src/com/android/rs/test/UT_reduce_backward.java b/java/tests/RsTest/src/com/android/rs/test/UT_reduce_backward.java
index 3a64a738..6a50d2bf 100644
--- a/java/tests/RsTest/src/com/android/rs/test/UT_reduce_backward.java
+++ b/java/tests/RsTest/src/com/android/rs/test/UT_reduce_backward.java
@@ -119,39 +119,6 @@ public class UT_reduce_backward extends UnitTest {
///////////////////////////////////////////////////////////////////
- private float dp(float[] input1, float[] input2) {
- _RS_ASSERT("dp input length mismatch", input1.length == input2.length);
-
- float rslt = 0;
- for (int idx = 0; idx < input1.length; ++idx)
- rslt += input1[idx] * input2[idx];
- return rslt;
- }
-
- private boolean dp(RenderScript RS, ScriptC_reduce_backward s) {
- final float[] input1 = createInputArrayFloat(100000, 2);
- final float[] input2 = createInputArrayFloat(100000, 3);
-
- final float javaRslt = dp(input1, input2);
- final float rsRslt = s.reduce_dp(input1, input2).get();
-
- // NOTE: Using a floating point equality check to test for
- // correctness -- as we do below -- is a bad idea. It's only
- // reliable if the Java and RenderScript implementation of dp
- // use the same algorithm. Equality could be broken by
- // different optimizations between the two, or running the
- // RenderScript algorithm multithreaded, or running the
- // RenderScript algorithm on a GPU rather than the CPU.
- //
- // Should we be checking instead that the results are
- // "sufficiently close"? Cooking the input set to try to
- // ensure a deterministic result? Changing to integers
- // instead?
- return result("dp", javaRslt, rsRslt);
- }
-
- ///////////////////////////////////////////////////////////////////
-
private Int2 findMinAndMax(float[] input) {
float minVal = Float.POSITIVE_INFINITY;
int minIdx = -1;
@@ -322,7 +289,6 @@ public class UT_reduce_backward extends UnitTest {
boolean pass = true;
pass &= addint1D(pRS, s);
pass &= addint2D(pRS, s);
- pass &= dp(pRS, s);
pass &= findMinAndMax(pRS, s);
pass &= fz(pRS, s);
pass &= fz2(pRS, s);
diff --git a/java/tests/RsTest/src/com/android/rs/test/reduce.rs b/java/tests/RsTest/src/com/android/rs/test/reduce.rs
index be09dfb6..97b45e0c 100644
--- a/java/tests/RsTest/src/com/android/rs/test/reduce.rs
+++ b/java/tests/RsTest/src/com/android/rs/test/reduce.rs
@@ -16,18 +16,6 @@ static void aiAccum(int *accum, int val) { *accum += val; }
/////////////////////////////////////////////////////////////////////////
-#pragma rs reduce(dp) \
- accumulator(dpAccum) combiner(dpSum)
-
-static void dpAccum(float *accum, float in1, float in2) {
- *accum += in1*in2;
-}
-
-// combiner function
-static void dpSum(float *accum, const float *val) { *accum += *val; }
-
-/////////////////////////////////////////////////////////////////////////
-
#pragma rs reduce(findMinAndMax) \
initializer(fMMInit) accumulator(fMMAccumulator) \
combiner(fMMCombiner) outconverter(fMMOutConverter)
@@ -61,8 +49,10 @@ static void fMMAccumulator(MinAndMax *accum, float in, int x) {
static void fMMCombiner(MinAndMax *accum,
const MinAndMax *val) {
- fMMAccumulator(accum, val->min.val, val->min.idx);
- fMMAccumulator(accum, val->max.val, val->max.idx);
+ if (val->min.val < accum->min.val)
+ accum->min = val->min;
+ if (val->max.val > accum->max.val)
+ accum->max = val->max;
}
static void fMMOutConverter(int2 *result,
@@ -160,3 +150,24 @@ static void modeOutConvert(int2 *result, const Histogram *h) {
result->x = mode;
result->y = (*h)[mode];
}
+
+/////////////////////////////////////////////////////////////////////////
+
+#pragma rs reduce(sumgcd) accumulator(sgAccum) combiner(sgCombine)
+
+static int gcd(int a, int b) {
+ // Euclid's algorithm: replace (a, b) by (b, a % b) until b is zero;
+ // the value then left in a is the result.
+ while (b != 0) {
+ const int rem = a % b;
+ a = b;
+ b = rem;
+ }
+ return a;
+}
+// accumulator function: folds gcd(a, b) of one input pair into the running sum
+static void sgAccum(long *accum, int a, int b) {
+ *accum = *accum + gcd(a, b);
+}
+// combiner function: adds the partial sum *other into *accum
+static void sgCombine(long *accum, const long *other) { *accum = *accum + *other; }
diff --git a/java/tests/RsTest/src/com/android/rs/test/reduce_backward.rs b/java/tests/RsTest/src/com/android/rs/test/reduce_backward.rs
index 419e7090..41252c8a 100644
--- a/java/tests/RsTest/src/com/android/rs/test/reduce_backward.rs
+++ b/java/tests/RsTest/src/com/android/rs/test/reduce_backward.rs
@@ -15,18 +15,6 @@ static void aiAccum(int *accum, int val) { *accum += val; }
/////////////////////////////////////////////////////////////////////////
-static void dpAccum(float *accum, float in1, float in2) {
- *accum += in1*in2;
-}
-
-// combiner function
-static void dpSum(float *accum, const float *val) { *accum += *val; }
-
-#pragma rs reduce(dp) \
- accumulator(dpAccum) combiner(dpSum)
-
-/////////////////////////////////////////////////////////////////////////
-
typedef struct {
float val;
int idx;
@@ -56,8 +44,10 @@ static void fMMAccumulator(MinAndMax *accum, float in, int x) {
static void fMMCombiner(MinAndMax *accum,
const MinAndMax *val) {
- fMMAccumulator(accum, val->min.val, val->min.idx);
- fMMAccumulator(accum, val->max.val, val->max.idx);
+ if (val->min.val < accum->min.val)
+ accum->min = val->min;
+ if (val->max.val > accum->max.val)
+ accum->max = val->max;
}
static void fMMOutConverter(int2 *result,
diff --git a/rsContext.cpp b/rsContext.cpp
index 77e82f64..122815fb 100644
--- a/rsContext.cpp
+++ b/rsContext.cpp
@@ -260,6 +260,8 @@ void * Context::threadProc(void *vrsc) {
rsc->props.mLogShadersAttr = getProp("debug.rs.shader.attributes") != 0;
rsc->props.mLogShadersUniforms = getProp("debug.rs.shader.uniforms") != 0;
rsc->props.mLogVisual = getProp("debug.rs.visual") != 0;
+ rsc->props.mLogReduceAccum = getProp("debug.rs.reduce-accum") != 0;
+ rsc->props.mDebugReduceSplitAccum = getProp("debug.rs.reduce-split-accum") != 0;
rsc->props.mDebugMaxThreads = getProp("debug.rs.max-threads");
if (getProp("debug.rs.debug") != 0) {
diff --git a/rsContext.h b/rsContext.h
index fce22b56..dd2fc00e 100644
--- a/rsContext.h
+++ b/rsContext.h
@@ -226,6 +226,8 @@ public:
bool mLogShadersAttr;
bool mLogShadersUniforms;
bool mLogVisual;
+ bool mLogReduceAccum;
+ bool mDebugReduceSplitAccum;
uint32_t mDebugMaxThreads;
} props;