17 files changed, 930 insertions, 267 deletions
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index b8b48387..9f9c429b 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -45,6 +45,8 @@ static pid_t gettid() {
 using namespace android;
 using namespace android::renderscript;
 
+#define REDUCE_NEW_ALOGV(...) /* ALOGV(__VA_ARGS__) */  // expands to nothing; restore the commented-out ALOGV body to enable this logging
+
 static pthread_key_t gThreadTLSKey = 0;
 static uint32_t gThreadTLSKeyCount = 0;
 static pthread_mutex_t gInitMutex = PTHREAD_MUTEX_INITIALIZER;
@@ -98,7 +100,7 @@ RsdCpuReferenceImpl::RsdCpuReferenceImpl(Context *rsc) {
 
     version_major = 0;
     version_minor = 0;
-    mInForEach = false;
+    mInKernel = false;
     memset(&mWorkers, 0, sizeof(mWorkers));
     memset(&mTlsStruct, 0, sizeof(mTlsStruct));
     mExit = false;
@@ -239,6 +241,9 @@ bool RsdCpuReferenceImpl::init(uint32_t version_major, uint32_t version_minor,
         ALOGE("pthread_setspecific %i", status);
     }
 
+    mPageSize = sysconf(_SC_PAGE_SIZE);
+    REDUCE_NEW_ALOGV("page size = %ld", mPageSize);
+
     GetCpuInfo();
 
     int cpu = sysconf(_SC_NPROCESSORS_CONF);
@@ -435,7 +440,7 @@ static void walk_2d(void *usr, uint32_t idx) {
     }
 }
 
-static void walk_1d(void *usr, uint32_t idx) {
+static void walk_1d_foreach(void *usr, uint32_t idx) {
     MTLaunchStructForEach *mtls = (MTLaunchStructForEach *)usr;
     RsExpandKernelDriverInfo fep = mtls->fep;
     fep.lid = idx;
@@ -458,6 +463,103 @@ static void walk_1d(void *usr, uint32_t idx) {
     }
 }
 
+// The function format_bytes() is an auxiliary function to assist in logging.
+//
+// Bytes are read from an input (inBuf) and written (as pairs of hex digits)
+// to an output (outBuf).
+//
+// Output format:
+// - starts with ": "
+// - each input byte is translated to a pair of lowercase hex digits ("%02x")
+// - bytes are separated by "." except that every fourth separator is "|"
+// - if inBytes > kFormatInBytesMax, the output is truncated and terminated with "..."
+//
+// Arguments:
+// - outBuf  -- Pointer to buffer of type "FormatBuf" into which output is written
+// - inBuf   -- Pointer to bytes which are to be formatted into outBuf
+// - inBytes -- Number of bytes in inBuf
+//
+// Constant:
+// - kFormatInBytesMax -- Only min(kFormatInBytesMax, inBytes) bytes will be read
+//                        from inBuf
+//
+// Return value:
+// - pointer (const char *) to the NUL-terminated output (the start of *outBuf)
+//
+static const int kFormatInBytesMax = 16;
+// ": " + 2 digits per byte + 1 separator between bytes + "..." + null
+typedef char FormatBuf[2 + kFormatInBytesMax*2 + (kFormatInBytesMax - 1) + 3 + 1];
+static const char *format_bytes(FormatBuf *outBuf, const uint8_t *inBuf, const int inBytes) {
+  strcpy(*outBuf, ": ");
+  int pos = 2;  // next write offset in *outBuf, just past the ": " prefix
+  const int lim = std::min(kFormatInBytesMax, inBytes);  // inBytes <= 0 formats no bytes
+  for (int i = 0; i < lim; ++i) {
+    if (i) {
+      sprintf(*outBuf + pos, (i % 4 ? "." : "|"));  // "|" ahead of bytes 4, 8, 12; "." otherwise
+      ++pos;
+    }
+    sprintf(*outBuf + pos, "%02x", inBuf[i]);
+    pos += 2;
+  }
+  if (kFormatInBytesMax < inBytes)  // more input than we were willing to print
+    strcpy(*outBuf + pos, "...");
+  return *outBuf;
+}
+
+static void walk_1d_reduce_new(void *usr, uint32_t idx) {  // callback given to launchThreads(); idx picks the accumPtr slot
+  const MTLaunchStructReduceNew *mtls = (const MTLaunchStructReduceNew *)usr;
+  RsExpandKernelDriverInfo redp = mtls->redp;  // local copy, handed by address to RedpPtrSetup() for each slice
+
+  // find accumulator (claimed lazily: only when this idx's slot is still null)
+  uint8_t *&accumPtr = mtls->accumPtr[idx];  // reference, so the assignments below fill the slot itself
+  if (!accumPtr) {
+    uint32_t accumIdx = (uint32_t)__sync_fetch_and_add(&mtls->accumCount, 1);  // atomically take the next accumulator number
+    if (mtls->outFunc) {  // with an outconverter, every accumulator lives in accumAlloc
+      accumPtr = mtls->accumAlloc + mtls->accumStride * accumIdx;
+    } else {
+      if (accumIdx == 0) {  // no outconverter: accumulator 0 is the output cell itself
+        accumPtr = mtls->redp.outPtr[0];
+      } else {
+        accumPtr = mtls->accumAlloc + mtls->accumStride * (accumIdx - 1);  // -1: number 0 was the output, not accumAlloc
+      }
+    }
+    REDUCE_NEW_ALOGV("walk_1d_reduce_new(%p): idx = %u got accumCount %u and accumPtr %p",
+                     mtls->accumFunc, idx, accumIdx, accumPtr);
+    // initialize accumulator: user initializer if present, otherwise zero-fill accumSize bytes
+    if (mtls->initFunc) {
+      mtls->initFunc(accumPtr);
+    } else {
+      memset(accumPtr, 0, mtls->accumSize);
+    }
+  }
+
+  // accumulate: keep claiming mSliceSize-wide slices of [start.x, end.x) until none are left
+  const ReduceNewAccumulatorFunc_t fn = mtls->accumFunc;
+  while (1) {
+    uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);  // atomically take the next slice number
+    uint32_t xStart = mtls->start.x + slice * mtls->mSliceSize;
+    uint32_t xEnd   = xStart + mtls->mSliceSize;
+
+    xEnd = rsMin(xEnd, mtls->end.x);  // the last slice may be short
+
+    if (xEnd <= xStart) {  // slice starts at or past end.x: nothing left to do
+      return;
+    }
+
+    RedpPtrSetup(mtls, &redp, xStart, 0, 0);
+    fn(&redp, xStart, xEnd, accumPtr);  // fold cells [xStart, xEnd) into this caller's accumulator
+
+    FormatBuf fmt;
+    if (mtls->logReduceAccum) {  // dump the accumulator bytes only when accumulator logging is on
+      format_bytes(&fmt, accumPtr, mtls->accumSize);
+    } else {
+      fmt[0] = 0;  // empty string, so the "%s" below prints nothing
+    }
+    REDUCE_NEW_ALOGV("walk_1d_reduce_new(%p): idx = %u [%u, %u)%s",
+                     mtls->accumFunc, idx, xStart, xEnd, fmt);
+  }
+}
+
 // Launch a simple reduce-style kernel.
 // Inputs:
 //  ain:  The allocation that contains the input
@@ -486,6 +588,25 @@ void RsdCpuReferenceImpl::launchReduceNew(const Allocation ** ains,
                                           uint32_t inLen,
                                           Allocation * aout,
                                           MTLaunchStructReduceNew *mtls) {
+  mtls->logReduceAccum = mRSC->props.mLogReduceAccum;
+  if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInKernel) {
+    launchReduceNewParallel(ains, inLen, aout, mtls);
+  } else {
+    launchReduceNewSerial(ains, inLen, aout, mtls);
+  }
+}
+
+// Launch a general reduce-style kernel, single-threaded.
+// Inputs:
+//   ains[0..inLen-1]: Array of allocations that contain the inputs
+//   aout:             The allocation that will hold the output
+//   mtls:             Holds launch parameters
+void RsdCpuReferenceImpl::launchReduceNewSerial(const Allocation ** ains,
+                                                uint32_t inLen,
+                                                Allocation * aout,
+                                                MTLaunchStructReduceNew *mtls) {
+  ALOGV("launchReduceNewSerial(%p)", mtls->accumFunc);
+
   // In the presence of outconverter, we allocate temporary memory for
   // the accumulator.
   //
@@ -521,6 +642,112 @@ void RsdCpuReferenceImpl::launchReduceNew(const Allocation ** ains,
   }
 }
 
+// Launch a general reduce-style kernel, multi-threaded (serial fallback when that is not possible).
+// Inputs:
+//   ains[0..inLen-1]: Array of allocations that contain the inputs
+//   aout:             The allocation that will hold the output
+//   mtls:             Holds launch parameters; the accum* bookkeeping fields are set up here
+void RsdCpuReferenceImpl::launchReduceNewParallel(const Allocation ** ains,
+                                                  uint32_t inLen,
+                                                  Allocation * aout,
+                                                  MTLaunchStructReduceNew *mtls) {
+  // For now, we don't know how to go parallel beyond 1D, or in the absence of a combiner.
+  if ((mtls->redp.dim.y > 1) || (mtls->redp.dim.z > 1) || !mtls->combFunc) {
+    launchReduceNewSerial(ains, inLen, aout, mtls);
+    return;
+  }
+
+  // Number of threads = "main thread" + number of other (worker) threads
+  const uint32_t numThreads = mWorkers.mCount + 1;
+
+  // In the absence of outconverter, we use the output allocation as
+  // an accumulator, and therefore need to allocate one fewer accumulator.
+  const uint32_t numAllocAccum = numThreads - (mtls->outFunc == nullptr);
+
+  // If mDebugReduceSplitAccum, then we want each accumulator to start
+  // on a page boundary.  (TODO: Would some unit smaller than a page
+  // be sufficient to avoid false sharing?)
+  if (mRSC->props.mDebugReduceSplitAccum) {
+    // Round up accumulator size to an integral number of pages
+    mtls->accumStride =
+        (unsigned(mtls->accumSize) + unsigned(mPageSize)-1) &
+        ~(unsigned(mPageSize)-1);
+    // Each accumulator gets its own page.  Alternatively, if we just
+    // wanted to make sure no two accumulators are on the same page,
+    // we could instead do
+    //   allocSize = mtls->accumStride * (numAllocAccum - 1) + mtls->accumSize
+    const size_t allocSize = mtls->accumStride * numAllocAccum;
+    mtls->accumAlloc = static_cast<uint8_t *>(memalign(mPageSize, allocSize));
+  } else {
+    mtls->accumStride = mtls->accumSize;
+    mtls->accumAlloc = static_cast<uint8_t *>(malloc(mtls->accumStride * numAllocAccum));
+  }
+
+  mtls->accumPtr = static_cast<uint8_t **>(calloc(numThreads, sizeof(uint8_t *)));  // zeroed: null slot = unclaimed
+  if (!mtls->accumAlloc || !mtls->accumPtr) {  // out of memory: release what we got and run single-threaded
+    ALOGE("launchReduceNewParallel(%p): accumulator allocation failed", mtls->accumFunc);
+    free(mtls->accumPtr);
+    free(mtls->accumAlloc);
+    launchReduceNewSerial(ains, inLen, aout, mtls);
+    return;
+  }
+  mtls->accumCount = 0;
+
+  rsAssert(!mInKernel);
+  mInKernel = true;
+  mtls->mSliceSize = rsMax(1U, mtls->redp.dim.x / (numThreads * 4));  // aim for about 4 slices per thread
+  ALOGV("launchReduceNewParallel(%p): %u threads, accumAlloc = %p",
+        mtls->accumFunc, numThreads, mtls->accumAlloc);
+  launchThreads(walk_1d_reduce_new, mtls);
+  mInKernel = false;
+
+  // Combine accumulators and identify final accumulator
+  uint8_t *finalAccumPtr = (mtls->outFunc ? nullptr : mtls->redp.outPtr[0]);
+  //   Loop over accumulators, combining into finalAccumPtr.  If finalAccumPtr
+  //   is null, then the first accumulator I find becomes finalAccumPtr.
+  for (unsigned idx = 0; idx < mtls->accumCount; ++idx) {
+    uint8_t *const thisAccumPtr = mtls->accumPtr[idx];
+    if (finalAccumPtr) {
+      if (finalAccumPtr != thisAccumPtr) {
+        if (mtls->logReduceAccum) {
+          FormatBuf fmt;
+          REDUCE_NEW_ALOGV("launchReduceNewParallel(%p): accumulating into%s",
+                           mtls->accumFunc,
+                           format_bytes(&fmt, finalAccumPtr, mtls->accumSize));
+          REDUCE_NEW_ALOGV("launchReduceNewParallel(%p):    accumulator[%u]%s",
+                           mtls->accumFunc, idx,
+                           format_bytes(&fmt, thisAccumPtr, mtls->accumSize));
+        }
+        mtls->combFunc(finalAccumPtr, thisAccumPtr);  // combFunc is non-null: checked by the guard on entry
+      }
+    } else {
+      finalAccumPtr = thisAccumPtr;
+    }
+  }
+  rsAssert(finalAccumPtr != nullptr);
+  if (mtls->logReduceAccum) {
+    FormatBuf fmt;
+    REDUCE_NEW_ALOGV("launchReduceNewParallel(%p): final accumulator%s",
+                     mtls->accumFunc, format_bytes(&fmt, finalAccumPtr, mtls->accumSize));
+  }
+
+  // Outconvert
+  if (mtls->outFunc) {
+    mtls->outFunc(mtls->redp.outPtr[0], finalAccumPtr);
+    if (mtls->logReduceAccum) {
+      FormatBuf fmt;
+      REDUCE_NEW_ALOGV("launchReduceNewParallel(%p): final outconverted result%s",
+                       mtls->accumFunc,
+                       format_bytes(&fmt, mtls->redp.outPtr[0], mtls->redp.outStride[0]));
+    }
+  }
+
+  // Clean up
+  free(mtls->accumPtr);
+  free(mtls->accumAlloc);
+}
+
+
 void RsdCpuReferenceImpl::launchForEach(const Allocation ** ains,
                                         uint32_t inLen,
                                         Allocation* aout,
@@ -537,9 +764,9 @@ void RsdCpuReferenceImpl::launchForEach(const Allocation ** ains,
                      (mtls->start.array[2] != mtls->end.array[2]) ||
                      (mtls->start.array[3] != mtls->end.array[3]);
 
-    if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) {
+    if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInKernel) {
         const size_t targetByteChunk = 16 * 1024;
-        mInForEach = true;
+        mInKernel = true;  // NOTE: The guard immediately above ensures this was !mInKernel
 
         if (outerDims) {
             // No fancy logic for chunk size
@@ -588,9 +815,9 @@ void RsdCpuReferenceImpl::launchForEach(const Allocation ** ains,
                 mtls->mSliceSize = 1;
             }
 
-            launchThreads(walk_1d, mtls);
+            launchThreads(walk_1d_foreach, mtls);
         }
-        mInForEach = false;
+        mInKernel = false;
 
     } else {
         ForEachFunc_t fn = mtls->kernel;
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
index 939b7ae2..c2a08640 100644
--- a/cpu_ref/rsCpuCore.h
+++ b/cpu_ref/rsCpuCore.h
@@ -34,6 +34,7 @@ extern bool gArchUseSIMD;
 // Function types found in RenderScript code
 typedef void (*ReduceFunc_t)(const uint8_t *inBuf, uint8_t *outBuf, uint32_t len);
 typedef void (*ReduceNewAccumulatorFunc_t)(const RsExpandKernelDriverInfo *info, uint32_t x1, uint32_t x2, uint8_t *accum);
+typedef void (*ReduceNewCombinerFunc_t)(uint8_t *accum, const uint8_t *other);
 typedef void (*ReduceNewInitializerFunc_t)(uint8_t *accum);
 typedef void (*ReduceNewOutConverterFunc_t)(uint8_t *out, const uint8_t *accum);
 typedef void (*ForEachFunc_t)(const RsExpandKernelDriverInfo *info, uint32_t x1, uint32_t x2, uint32_t outStride);
@@ -44,6 +45,7 @@ typedef int  (*RootFunc_t)(void);
 struct ReduceNewDescription {
     ReduceNewAccumulatorFunc_t  accumFunc;  // expanded accumulator function
     ReduceNewInitializerFunc_t  initFunc;   // user initializer function
+    ReduceNewCombinerFunc_t     combFunc;   // user combiner function
     ReduceNewOutConverterFunc_t outFunc;    // user outconverter function
     size_t                      accumSize;  // accumulator datum size, in bytes
 };
@@ -73,7 +75,8 @@ struct MTLaunchStructCommon {
     RsLaunchDimensions start;
     RsLaunchDimensions end;
     // Points to MTLaunchStructForEach::fep::dim or
-    // MTLaunchStructReduce::inputDim.
+    // MTLaunchStructReduce::inputDim or
+    // MTLaunchStructReduceNew::redp::dim.
     RsLaunchDimensions *dimPtr;
 };
 
@@ -101,9 +104,51 @@ struct MTLaunchStructReduceNew : public MTLaunchStructCommon {
 
     ReduceNewAccumulatorFunc_t accumFunc;
     ReduceNewInitializerFunc_t initFunc;
+    ReduceNewCombinerFunc_t combFunc;
     ReduceNewOutConverterFunc_t outFunc;
 
     size_t accumSize;  // accumulator datum size in bytes
+
+    size_t accumStride;  // stride between accumulators in accumAlloc (below)
+
+    // These fields are used for managing accumulator data items in a
+    // multithreaded execution.
+    //
+    // Let the number of threads be N.
+    // Let Outc be true iff there is an outconverter.
+    //
+    // accumAlloc is a pointer to a single allocation of (N - !Outc)
+    // accumulators.  (If there is no outconverter, then the output
+    // allocation acts as an accumulator.)  It is created at kernel
+    // launch time.  Within that allocation, the distance between the
+    // start of adjacent accumulators is accumStride bytes -- this
+    // might be the same as accumSize, or it might be larger, if we
+    // are attempting to avoid false sharing.
+    //
+    // accumCount is an atomic counter of how many accumulators have
+    // been grabbed by threads.  It is initialized to zero at kernel
+    // launch time.  See accumPtr for further description.
+    //
+    // accumPtr is a pointer to an array of N pointers to accumulators.
+    // The array is created at kernel launch time, and each element is
+    // initialized to nullptr.  When a particular thread goes to work,
+    // that thread obtains its accumulator from its entry in this
+    // array.  If the entry is nullptr, that thread needs to obtain an
+    // accumulator, and initialize its entry in the array accordingly.
+    // It does so via atomic access (fetch-and-add) to accumCount.
+    // - If Outc, then the fetched value is used as an index into
+    //   accumAlloc.
+    // - If !Outc, then
+    //   - If the fetched value is zero, then this thread gets the
+    //     output allocation for its accumulator.
+    //   - If the fetched value is nonzero, then (fetched value - 1)
+    //     is used as an index into accumAlloc.
+    uint8_t *accumAlloc;
+    uint8_t **accumPtr;
+    uint32_t accumCount;
+
+    // Logging control
+    bool logReduceAccum;
 };
 
 class RsdCpuReferenceImpl : public RsdCpuReference {
@@ -161,7 +206,7 @@ public:
     virtual const char *getBccPluginName() const {
         return mBccPluginName.string();
     }
-    bool getInForEach() override { return mInForEach; }
+    bool getInKernel() override { return mInKernel; }
 
     // Set to true if we should embed global variable information in the code.
     void setEmbedGlobalInfo(bool v) override {
@@ -190,7 +235,7 @@ protected:
     uint32_t version_major;
     uint32_t version_minor;
     //bool mHasGraphics;
-    bool mInForEach;
+    bool mInKernel;  // Is a parallel kernel execution underway?
 
     struct Workers {
         volatile int mRunningCount;
@@ -222,6 +267,14 @@ protected:
     // when potentially embedding information about globals.
     // Defaults to true.
     bool mEmbedGlobalInfoSkipConstant;
+
+    long mPageSize;
+
+    // Launch a general reduce kernel
+    void launchReduceNewSerial(const Allocation ** ains, uint32_t inLen, Allocation *aout,
+                               MTLaunchStructReduceNew *mtls);
+    void launchReduceNewParallel(const Allocation ** ains, uint32_t inLen, Allocation *aout,
+                                 MTLaunchStructReduceNew *mtls);
 };
 
 
diff --git a/cpu_ref/rsCpuExecutable.cpp b/cpu_ref/rsCpuExecutable.cpp
index 34a6b20c..9d6e6236 100644
--- a/cpu_ref/rsCpuExecutable.cpp
+++ b/cpu_ref/rsCpuExecutable.cpp
@@ -538,8 +538,8 @@ ScriptExecutable* ScriptExecutable::createFromSharedObject(
             goto error;
         }
 
-        // The current implementation does not use the signature,
-        // reduce name, or combiner.
+        // The current implementation does not use the signature
+        // or reduce name.
 
         reduceNewDescriptions[i].accumSize = tmpSize;
 
@@ -565,6 +565,19 @@ ScriptExecutable* ScriptExecutable::createFromSharedObject(
             goto error;
         }
 
+        // Process the (optional) combiner.
+        if (strcmp(tmpNameCombiner, kNoName)) {
+          // Lookup the original user-written combiner.
+          if (!(reduceNewDescriptions[i].combFunc =
+                (ReduceNewCombinerFunc_t) dlsym(sharedObj, tmpNameCombiner))) {
+            ALOGE("Failed to find combiner function address for %s(): %s",
+                  tmpNameCombiner, dlerror());
+            goto error;
+          }
+        } else {
+          reduceNewDescriptions[i].combFunc = nullptr;
+        }
+
         // Process the (optional) outconverter.
         if (strcmp(tmpNameOutConverter, kNoName)) {
           // Lookup the original user-written outconverter.
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index ef738d72..a88af2fe 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -645,9 +645,9 @@ bool RsdCpuScriptImpl::reduceNewMtlsSetup(const Allocation ** ains,
 
     mtls->rs = mCtx;
 
-    // Currently not threaded.
-    mtls->isThreadable = false;
-    mtls->mSliceNum = -1;
+    mtls->mSliceNum    = 0;
+    mtls->mSliceSize   = 1;
+    mtls->isThreadable = mIsThreadable;
 
     // Set up output,
     mtls->redp.outLen = 1;
@@ -842,6 +842,7 @@ void RsdCpuScriptImpl::reduceNewKernelSetup(uint32_t slot, MTLaunchStructReduceN
     const ReduceNewDescription *desc = mScriptExec->getReduceNewDescription(slot);
     mtls->accumFunc = desc->accumFunc;
     mtls->initFunc  = desc->initFunc;   // might legally be nullptr
+    mtls->combFunc  = desc->combFunc;   // might legally be nullptr
     mtls->outFunc   = desc->outFunc;    // might legally be nullptr
     mtls->accumSize = desc->accumSize;
 
diff --git a/cpu_ref/rsd_cpu.h b/cpu_ref/rsd_cpu.h
index 49a999db..e226b934 100644
--- a/cpu_ref/rsd_cpu.h
+++ b/cpu_ref/rsd_cpu.h
@@ -131,7 +131,7 @@ public:
                                      uint32_t flags) = 0;
     virtual CpuScript * createIntrinsic(const Script *s, RsScriptIntrinsicID iid, Element *e) = 0;
     virtual void* createScriptGroup(const ScriptGroupBase *sg) = 0;
-    virtual bool getInForEach() = 0;
+    virtual bool getInKernel() = 0;  // Is a parallel kernel execution underway?
 
     // Set to true if we should embed global variable information in the code.
     virtual void setEmbedGlobalInfo(bool v) = 0;
diff --git a/driver/rsdRuntimeStubs.cpp b/driver/rsdRuntimeStubs.cpp
index c7b88962..10775407 100644
--- a/driver/rsdRuntimeStubs.cpp
+++ b/driver/rsdRuntimeStubs.cpp
@@ -136,7 +136,7 @@ static bool failIfInKernel(Context *rsc, const char *funcName) {
     RsdHal *dc = (RsdHal *)rsc->mHal.drv;
     RsdCpuReference *impl = (RsdCpuReference *) dc->mCpuRef;
 
-    if (impl->getInForEach()) {
+    if (impl->getInKernel()) {
         char buf[256];
         snprintf(buf, sizeof(buf), "Error: Call to unsupported function %s "
                          "in kernel", funcName);
diff --git a/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce.java b/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce.java
index 608de473..c1e9c408 100644
--- a/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce.java
+++ b/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce.java
@@ -119,39 +119,6 @@ public class UT_reduce extends UnitTest {
 
     ///////////////////////////////////////////////////////////////////
 
-    private float dp(float[] input1, float[] input2) {
-        _RS_ASSERT("dp input length mismatch", input1.length == input2.length);
-
-        float rslt = 0;
-        for (int idx = 0; idx < input1.length; ++idx)
-            rslt += input1[idx] * input2[idx];
-        return rslt;
-    }
-
-    private boolean dp(RenderScript RS, ScriptC_reduce s) {
-        final float[] input1 = createInputArrayFloat(100000, 2);
-        final float[] input2 = createInputArrayFloat(100000, 3);
-
-        final float javaRslt = dp(input1, input2);
-        final float rsRslt = s.reduce_dp(input1, input2).get();
-
-        // NOTE: Using a floating point equality check to test for
-        // correctness -- as we do below -- is a bad idea.  It's only
-        // reliable if the Java and RenderScript implementation of dp
-        // use the same algorithm.  Equality could be broken by
-        // different optimizations between the two, or running the
-        // RenderScript algorithm multithreaded, or running the
-        // RenderScript algorithm on a GPU rather than the CPU.
-        //
-        // Should we be checking instead that the results are
-        // "sufficiently close"?  Cooking the input set to try to
-        // ensure a deterministic result?  Changing to integers
-        // instead?
-        return result("dp", javaRslt, rsRslt);
-    }
-
-    ///////////////////////////////////////////////////////////////////
-
     private Int2 findMinAndMax(float[] input) {
         float minVal = Float.POSITIVE_INFINITY;
         int minIdx = -1;
@@ -322,7 +289,6 @@ public class UT_reduce extends UnitTest {
         boolean pass = true;
         pass &= addint1D(pRS, s);
         pass &= addint2D(pRS, s);
-        pass &= dp(pRS, s);
         pass &= findMinAndMax(pRS, s);
         pass &= fz(pRS, s);
         pass &= fz2(pRS, s);
diff --git a/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce_backward.java b/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce_backward.java
index 84d2c505..b998f518 100644
--- a/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce_backward.java
+++ b/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce_backward.java
@@ -119,39 +119,6 @@ public class UT_reduce_backward extends UnitTest {
 
     ///////////////////////////////////////////////////////////////////
 
-    private float dp(float[] input1, float[] input2) {
-        _RS_ASSERT("dp input length mismatch", input1.length == input2.length);
-
-        float rslt = 0;
-        for (int idx = 0; idx < input1.length; ++idx)
-            rslt += input1[idx] * input2[idx];
-        return rslt;
-    }
-
-    private boolean dp(RenderScript RS, ScriptC_reduce_backward s) {
-        final float[] input1 = createInputArrayFloat(100000, 2);
-        final float[] input2 = createInputArrayFloat(100000, 3);
-
-        final float javaRslt = dp(input1, input2);
-        final float rsRslt = s.reduce_dp(input1, input2).get();
-
-        // NOTE: Using a floating point equality check to test for
-        // correctness -- as we do below -- is a bad idea.  It's only
-        // reliable if the Java and RenderScript implementation of dp
-        // use the same algorithm.  Equality could be broken by
-        // different optimizations between the two, or running the
-        // RenderScript algorithm multithreaded, or running the
-        // RenderScript algorithm on a GPU rather than the CPU.
-        //
-        // Should we be checking instead that the results are
-        // "sufficiently close"?  Cooking the input set to try to
-        // ensure a deterministic result?  Changing to integers
-        // instead?
-        return result("dp", javaRslt, rsRslt);
-    }
-
-    ///////////////////////////////////////////////////////////////////
-
     private Int2 findMinAndMax(float[] input) {
         float minVal = Float.POSITIVE_INFINITY;
         int minIdx = -1;
@@ -322,7 +289,6 @@ public class UT_reduce_backward extends UnitTest {
         boolean pass = true;
         pass &= addint1D(pRS, s);
         pass &= addint2D(pRS, s);
-        pass &= dp(pRS, s);
         pass &= findMinAndMax(pRS, s);
         pass &= fz(pRS, s);
         pass &= fz2(pRS, s);
diff --git a/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce.rs b/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce.rs
index be09dfb6..ec7be8b7 100644
--- a/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce.rs
+++ b/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce.rs
@@ -16,18 +16,6 @@ static void aiAccum(int *accum, int val) { *accum += val; }
 
 /////////////////////////////////////////////////////////////////////////
 
-#pragma rs reduce(dp) \
-  accumulator(dpAccum) combiner(dpSum)
-
-static void dpAccum(float *accum, float in1, float in2) {
-  *accum += in1*in2;
-}
-
-// combiner function
-static void dpSum(float *accum, const float *val) { *accum += *val; }
-
-/////////////////////////////////////////////////////////////////////////
-
 #pragma rs reduce(findMinAndMax) \
   initializer(fMMInit) accumulator(fMMAccumulator) \
   combiner(fMMCombiner) outconverter(fMMOutConverter)
@@ -61,8 +49,10 @@ static void fMMAccumulator(MinAndMax *accum, float in, int x) {
 
 static void fMMCombiner(MinAndMax *accum,
                         const MinAndMax *val) {
-  fMMAccumulator(accum, val->min.val, val->min.idx);
-  fMMAccumulator(accum, val->max.val, val->max.idx);
+  if (val->min.val < accum->min.val)
+    accum->min = val->min;
+  if (val->max.val > accum->max.val)
+    accum->max = val->max;
 }
 
 static void fMMOutConverter(int2 *result,
diff --git a/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce_backward.rs b/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce_backward.rs
index 419e7090..41252c8a 100644
--- a/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce_backward.rs
+++ b/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce_backward.rs
@@ -15,18 +15,6 @@ static void aiAccum(int *accum, int val) { *accum += val; }
 
 /////////////////////////////////////////////////////////////////////////
 
-static void dpAccum(float *accum, float in1, float in2) {
-  *accum += in1*in2;
-}
-
-// combiner function
-static void dpSum(float *accum, const float *val) { *accum += *val; }
-
-#pragma rs reduce(dp) \
-  accumulator(dpAccum) combiner(dpSum)
-
-/////////////////////////////////////////////////////////////////////////
-
 typedef struct {
   float val;
   int idx;
@@ -56,8 +44,10 @@ static void fMMAccumulator(MinAndMax *accum, float in, int x) {
 
 static void fMMCombiner(MinAndMax *accum,
                         const MinAndMax *val) {
-  fMMAccumulator(accum, val->min.val, val->min.idx);
-  fMMAccumulator(accum, val->max.val, val->max.idx);
+  if (val->min.val < accum->min.val)
+    accum->min = val->min;
+  if (val->max.val > accum->max.val)
+    accum->max = val->max;
 }
 
 static void fMMOutConverter(int2 *result,
diff --git a/java/tests/RsTest/AndroidManifest.xml b/java/tests/RsTest/AndroidManifest.xml
index b660398d..31da896a 100644
--- a/java/tests/RsTest/AndroidManifest.xml
+++ b/java/tests/RsTest/AndroidManifest.xml
@@ -2,6 +2,7 @@
 <manifest xmlns:android="http://schemas.android.com/apk/res/android"
     package="com.android.rs.test">
     <application 
+	android:largeHeap="true"
         android:label="_RS_Test"
         android:icon="@drawable/test_pattern">
         <activity android:name="RSTest"
diff --git a/java/tests/RsTest/src/com/android/rs/test/UT_reduce.java b/java/tests/RsTest/src/com/android/rs/test/UT_reduce.java
index a244646c..07692595 100644
--- a/java/tests/RsTest/src/com/android/rs/test/UT_reduce.java
+++ b/java/tests/RsTest/src/com/android/rs/test/UT_reduce.java
@@ -27,6 +27,7 @@ import android.content.res.Resources;
 import android.renderscript.*;
 import android.util.Log;
 import java.lang.Float;
+import java.util.Arrays;
 import java.util.Random;
 
 public class UT_reduce extends UnitTest {
@@ -36,6 +37,81 @@ public class UT_reduce extends UnitTest {
         super(rstc, "reduce", ctx);
     }
 
+    private static class timing {  // timestamps and input sizes for one test, rendered by string() for the log
+        timing(long myJavaStart, long myJavaEnd, long myRsStart,
+               long myCopyStart, long myKernelStart, long myRsEnd,
+               Allocation... myInputs) {
+            javaStart = myJavaStart;
+            javaEnd = myJavaEnd;
+            rsStart = myRsStart;
+            copyStart = myCopyStart;
+            kernelStart = myKernelStart;
+            rsEnd = myRsEnd;
+
+            inputBytes = 0;  // total size over all inputs
+            for (Allocation input : myInputs)
+                inputBytes += input.getBytesSize();
+
+            inputCells = (myInputs.length > 0) ? myInputs[0].getType().getCount() : 0;  // cell count comes from the first input only
+        }
+
+        timing(long myInputCells) {  // size-only form: timestamps and inputBytes keep their -1 defaults
+            inputCells = myInputCells;
+        }
+
+        private long javaStart = -1;  // -1 = not recorded; string() omits the timing part when javaStart < 0
+        private long javaEnd = -1;
+        private long rsStart = -1;
+        private long copyStart = -1;
+        private long kernelStart = -1;
+        private long rsEnd = -1;
+        private long inputBytes = -1;  // string() prints this only when > 0
+        private long inputCells = -1;  // string() prints this only when > 0
+
+        public long javaTime() { return javaEnd - javaStart; }
+        public long rsTime() { return rsEnd - rsStart; }
+        public long kernelTime() { return rsEnd - kernelStart; }  // labelled "kernel+get()" in string()
+        public long overheadTime() { return kernelStart - rsStart; }  // equals allocationTime() + copyTime()
+        public long allocationTime() { return copyStart - rsStart; }
+        public long copyTime() { return kernelStart - copyStart; }
+
+        public static String string(long myJavaStart, long myJavaEnd, long myRsStart,
+                                    long myCopyStart, long myKernelStart, long myRsEnd,
+                                    Allocation... myInputs) {  // convenience: build a timing and format it in one call
+            return (new timing(myJavaStart, myJavaEnd, myRsStart,
+                               myCopyStart, myKernelStart, myRsEnd, myInputs)).string();
+        }
+
+        public static String string(long myInputCells) {  // convenience for the size-only form
+            return (new timing(myInputCells)).string();
+        }
+
+        public String string() {  // "(java ..ms, rs ..ms = ...)" and/or "(N cells[, M bytes])"; may be empty
+            String result;
+            if (javaStart >= 0) {
+                result = "(java " + javaTime() + "ms, rs " + rsTime() + "ms = overhead " +
+                         overheadTime() + "ms (alloc " + allocationTime() + "ms + copy " +
+                         copyTime() + "ms) + kernel+get() " + kernelTime() + "ms)";
+                if (inputCells > 0)
+                    result += " ";  // separator between the timing group and the size group
+            } else {
+                result = "";
+            }
+            if (inputCells > 0) {
+                result += "(" + fmt.format(inputCells) + " cells";
+                if (inputBytes > 0)
+                    result += ", " + fmt.format(inputBytes) + " bytes";
+                result += ")";
+            }
+            return result;
+        }
+
+        private static java.text.DecimalFormat fmt;  // shared formatter for the cell and byte counts
+        static {
+            fmt = new java.text.DecimalFormat("###,###");
+        }
+    };
+
     private byte[] createInputArrayByte(int len, int seed) {
         byte[] array = new byte[len];
         (new Random(seed)).nextBytes(array);
@@ -66,21 +142,60 @@ public class UT_reduce extends UnitTest {
         return array;
     }
 
-    private <T extends Number> boolean result(String testName, T javaRslt, T rsRslt) {
+    private <T extends Number> boolean result(String testName, final timing t,
+                                              T javaRslt, T rsRslt) {
         final boolean success = javaRslt.equals(rsRslt);
+        String status = (success ? "PASSED" : "FAILED");
+        if (success && (t != null))
+            status += " " + t.string();
+        Log.i(TAG, testName + ": java " + javaRslt + ", rs " + rsRslt + ": " + status);
+        return success;
+    }
+
+    private boolean result(String testName, final timing t,
+                           final long[] javaRslt, final long[] rsRslt) {
+        if (javaRslt.length != rsRslt.length) {
+            Log.i(TAG, testName + ": java length " + javaRslt.length +
+                       ", rs length " + rsRslt.length + ": FAILED");
+            return false;
+        }
+        for (int i = 0; i < javaRslt.length; ++i) {
+            if (javaRslt[i] != rsRslt[i]) {
+                Log.i(TAG, testName + "[" + i + "]: java " + javaRslt[i] +
+                           ", rs " + rsRslt[i] + ": FAILED");
+                return false;
+            }
+        }
+        String status = "PASSED";
+        if (t != null)
+            status += " " + t.string();
+        Log.i(TAG, testName + ": " + status);
+        return true;
+    }
+
+    private boolean result(String testName, final timing t, Int2 javaRslt, Int2 rsRslt) {
+        final boolean success = (javaRslt.x == rsRslt.x) && (javaRslt.y == rsRslt.y);
+        String status = (success ? "PASSED" : "FAILED");
+        if (success && (t != null))
+            status += " " + t.string();
         Log.i(TAG,
-                testName + ": java " + javaRslt + ", rs " + rsRslt + ": " +
-                (success ? "PASSED" : "FAILED"));
+                testName +
+                ": java (" + javaRslt.x + ", " + javaRslt.y + ")" +
+                ", rs (" + rsRslt.x + ", " + rsRslt.y + ")" +
+                ": " + status);
         return success;
     }
 
-    private boolean result(String testName, Int2 javaRslt, Int2 rsRslt) {
+    private boolean result(String testName, final timing t, Float2 javaRslt, Float2 rsRslt) {
         final boolean success = (javaRslt.x == rsRslt.x) && (javaRslt.y == rsRslt.y);
+        String status = (success ? "PASSED" : "FAILED");
+        if (success && (t != null))
+            status += " " + t.string();
         Log.i(TAG,
                 testName +
                 ": java (" + javaRslt.x + ", " + javaRslt.y + ")" +
                 ", rs (" + rsRslt.x + ", " + rsRslt.y + ")" +
-                ": " + (success ? "PASSED" : "FAILED"));
+                ": " + status);
         return success;
     }
 
@@ -93,61 +208,68 @@ public class UT_reduce extends UnitTest {
         return rslt;
     }
 
-    private boolean addint1D(RenderScript RS, ScriptC_reduce s) {
-        final int[] input = createInputArrayInt(100000, 0, 1 << 13);
+    private boolean addint1D_array(RenderScript RS, ScriptC_reduce s, int seed, int[] size) {
+        final int[] input = createInputArrayInt(size[0], seed, Integer.MAX_VALUE / size[0]);
 
         final int javaRslt = addint(input);
         final int rsRslt = s.reduce_addint(input).get();
 
-        return result("addint1D", javaRslt, rsRslt);
+        return result("addint1D_array", new timing(size[0]), javaRslt, rsRslt);
     }
 
-    private boolean addint2D(RenderScript RS, ScriptC_reduce s) {
-        final int dimX = 450, dimY = 225;
-
-        final int[] inputArray = createInputArrayInt(dimX * dimY, 1, 1 << 13);
-        Type.Builder typeBuilder = new Type.Builder(RS, Element.I32(RS));
-        typeBuilder.setX(dimX).setY(dimY);
-        Allocation inputAllocation = Allocation.createTyped(RS, typeBuilder.create());
-        inputAllocation.copy2DRangeFrom(0, 0, dimX, dimY, inputArray);
+    private boolean addint1D(RenderScript RS, ScriptC_reduce s, int seed, int[] size) {
+        final int[] inputArray = createInputArrayInt(size[0], seed, Integer.MAX_VALUE / size[0]);
 
+        final long javaTimeStart = java.lang.System.currentTimeMillis();
         final int javaRslt = addint(inputArray);
+        final long javaTimeEnd = java.lang.System.currentTimeMillis();
+
+        final long rsTimeStart = java.lang.System.currentTimeMillis();
+
+        Allocation inputAllocation = Allocation.createSized(RS, Element.I32(RS), inputArray.length);
+
+        final long copyTimeStart = java.lang.System.currentTimeMillis();
+
+        inputAllocation.copyFrom(inputArray);
+
+        final long kernelTimeStart = java.lang.System.currentTimeMillis();
         final int rsRslt = s.reduce_addint(inputAllocation).get();
+        final long rsTimeEnd = java.lang.System.currentTimeMillis();
 
-        return result("addint2D", javaRslt, rsRslt);
+        return result("addint1D",
+                new timing(javaTimeStart, javaTimeEnd, rsTimeStart,
+                           copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation),
+                javaRslt, rsRslt);
     }
 
-    ///////////////////////////////////////////////////////////////////
+    private boolean addint2D(RenderScript RS, ScriptC_reduce s, int seed, int[] size) {
+        final int dimX = size[0];
+        final int dimY = size[1];
 
-    private float dp(float[] input1, float[] input2) {
-        _RS_ASSERT("dp input length mismatch", input1.length == input2.length);
+        final int[] inputArray = createInputArrayInt(dimX * dimY, seed, Integer.MAX_VALUE / (dimX * dimY));
 
-        float rslt = 0;
-        for (int idx = 0; idx < input1.length; ++idx)
-            rslt += input1[idx] * input2[idx];
-        return rslt;
-    }
+        final long javaTimeStart = java.lang.System.currentTimeMillis();
+        final int javaRslt = addint(inputArray);
+        final long javaTimeEnd = java.lang.System.currentTimeMillis();
+
+        final long rsTimeStart = java.lang.System.currentTimeMillis();
 
-    private boolean dp(RenderScript RS, ScriptC_reduce s) {
-        final float[] input1 = createInputArrayFloat(100000, 2);
-        final float[] input2 = createInputArrayFloat(100000, 3);
+        Type.Builder typeBuilder = new Type.Builder(RS, Element.I32(RS));
+        typeBuilder.setX(dimX).setY(dimY);
+        Allocation inputAllocation = Allocation.createTyped(RS, typeBuilder.create());
 
-        final float javaRslt = dp(input1, input2);
-        final float rsRslt = s.reduce_dp(input1, input2).get();
+        final long copyTimeStart = java.lang.System.currentTimeMillis();
 
-        // NOTE: Using a floating point equality check to test for
-        // correctness -- as we do below -- is a bad idea.  It's only
-        // reliable if the Java and RenderScript implementation of dp
-        // use the same algorithm.  Equality could be broken by
-        // different optimizations between the two, or running the
-        // RenderScript algorithm multithreaded, or running the
-        // RenderScript algorithm on a GPU rather than the CPU.
-        //
-        // Should we be checking instead that the results are
-        // "sufficiently close"?  Cooking the input set to try to
-        // ensure a deterministic result?  Changing to integers
-        // instead?
-        return result("dp", javaRslt, rsRslt);
+        inputAllocation.copy2DRangeFrom(0, 0, dimX, dimY, inputArray);
+
+        final long kernelTimeStart = java.lang.System.currentTimeMillis();
+        final int rsRslt = s.reduce_addint(inputAllocation).get();
+        final long rsTimeEnd = java.lang.System.currentTimeMillis();
+
+        return result("addint2D",
+                new timing(javaTimeStart, javaTimeEnd, rsTimeStart,
+                           copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation),
+                javaRslt, rsRslt);
     }
 
     ///////////////////////////////////////////////////////////////////
@@ -172,79 +294,195 @@ public class UT_reduce extends UnitTest {
         return new Int2(minIdx, maxIdx);
     }
 
-    private boolean findMinAndMax(RenderScript RS, ScriptC_reduce s) {
-        final float[] input = createInputArrayFloat(100000, 4);
+    private boolean findMinAndMax_array(RenderScript RS, ScriptC_reduce s, int seed, int[] size) {
+        final float[] input = createInputArrayFloat(size[0], seed);
 
         final Int2 javaRslt = findMinAndMax(input);
         final Int2 rsRslt = s.reduce_findMinAndMax(input).get();
 
-        return result("findMinAndMax", javaRslt, rsRslt);
+        // Note that the Java and RenderScript algorithms are not
+        // guaranteed to find the same cells -- but they should
+        // find cells of the same value.
+        final Float2 javaVal = new Float2(input[javaRslt.x], input[javaRslt.y]);
+        final Float2 rsVal = new Float2(input[rsRslt.x], input[rsRslt.y]);
+
+        return result("findMinAndMax_array", new timing(size[0]), javaVal, rsVal);
+    }
+
+    private boolean findMinAndMax(RenderScript RS, ScriptC_reduce s, int seed, int[] size) {
+        final float[] inputArray = createInputArrayFloat(size[0], seed);
+
+        final long javaTimeStart = java.lang.System.currentTimeMillis();
+        final Int2 javaRslt = findMinAndMax(inputArray);
+        final long javaTimeEnd = java.lang.System.currentTimeMillis();
+
+        final long rsTimeStart = java.lang.System.currentTimeMillis();
+
+        Allocation inputAllocation = Allocation.createSized(RS, Element.F32(RS), inputArray.length);
+
+        final long copyTimeStart = java.lang.System.currentTimeMillis();
+
+        inputAllocation.copyFrom(inputArray);
+
+        final long kernelTimeStart = java.lang.System.currentTimeMillis();
+        final Int2 rsRslt = s.reduce_findMinAndMax(inputAllocation).get();
+        final long rsTimeEnd = java.lang.System.currentTimeMillis();
+
+        // Note that the Java and RenderScript algorithms are not
+        // guaranteed to find the same cells -- but they should
+        // find cells of the same value.
+        final Float2 javaVal = new Float2(inputArray[javaRslt.x], inputArray[javaRslt.y]);
+        final Float2 rsVal = new Float2(inputArray[rsRslt.x], inputArray[rsRslt.y]);
+
+        return result("findMinAndMax",
+                new timing(javaTimeStart, javaTimeEnd, rsTimeStart,
+                           copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation),
+                javaVal, rsVal);
     }
 
     ///////////////////////////////////////////////////////////////////
 
-    private boolean fz(RenderScript RS, ScriptC_reduce s) {
-        final int inputLen = 100000;
-        int[] input = createInputArrayInt(inputLen, 5);
+    private int fz(final int[] input) {
+        for (int i = 0; i < input.length; ++i)
+            if (input[i] == 0)
+                return i;
+        return -1;
+    }
+
+    private boolean fz_array(RenderScript RS, ScriptC_reduce s, int seed, int size[]) {
+        final int inputLen = size[0];
+        int[] input = createInputArrayInt(inputLen, seed+0);
         // just in case we got unlucky
-        input[(new Random(6)).nextInt(inputLen)] = 0;
+        input[(new Random(seed+1)).nextInt(inputLen)] = 0;
 
         final int rsRslt = s.reduce_fz(input).get();
 
         final boolean success = (input[rsRslt] == 0);
         Log.i(TAG,
-                "fz: input[" + rsRslt + "] == " + input[rsRslt] + ": " +
-                (success ? "PASSED" : "FAILED"));
+                "fz_array: input[" + rsRslt + "] == " + input[rsRslt] + ": " +
+                (success ? "PASSED " + timing.string(size[0]) : "FAILED"));
+        return success;
+    }
+
+    private boolean fz(RenderScript RS, ScriptC_reduce s, int seed, int size[]) {  // find-zero over a 1D Allocation, with timing
+        final int inputLen = size[0];
+        int[] inputArray = createInputArrayInt(inputLen, seed+0);
+        // force at least one zero cell, in case the pseudorandom data contains none
+        inputArray[(new Random(seed+1)).nextInt(inputLen)] = 0;
+
+        final long javaTimeStart = java.lang.System.currentTimeMillis();
+        final int javaRslt = fz(inputArray);
+        final long javaTimeEnd = java.lang.System.currentTimeMillis();
+
+        final long rsTimeStart = java.lang.System.currentTimeMillis();
+
+        Allocation inputAllocation = Allocation.createSized(RS, Element.I32(RS), inputArray.length);
+
+        final long copyTimeStart = java.lang.System.currentTimeMillis();
+
+        inputAllocation.copyFrom(inputArray);
+
+        final long kernelTimeStart = java.lang.System.currentTimeMillis();
+        final int rsRslt = s.reduce_fz(inputAllocation).get();
+        final long rsTimeEnd = java.lang.System.currentTimeMillis();
+
+        final boolean success = (inputArray[rsRslt] == 0);  // any index holding a zero is accepted; it need not equal javaRslt
+        String status = (success ? "PASSED" : "FAILED");
+        if (success)
+            status += " " + timing.string(javaTimeStart, javaTimeEnd, rsTimeStart,
+                                          copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation);
+        Log.i(TAG,
+                "fz: java input[" + javaRslt + "] == " + inputArray[javaRslt] +
+                ", rs input[" + rsRslt + "] == " + inputArray[rsRslt] + ": " + status);  // report the cell RenderScript actually found
         return success;
     }
 
     ///////////////////////////////////////////////////////////////////
 
-    private boolean fz2(RenderScript RS, ScriptC_reduce s) {
-        final int dimX = 225, dimY = 450;
+    private boolean fz2(RenderScript RS, ScriptC_reduce s, int seed, int size[]) {
+        final int dimX = size[0], dimY = size[1];
         final int inputLen = dimX * dimY;
 
-        int[] inputArray = createInputArrayInt(inputLen, 7);
+        int[] inputArray = createInputArrayInt(inputLen, seed+0);
         // just in case we got unlucky
-        inputArray[(new Random(8)).nextInt(inputLen)] = 0;
+        inputArray[(new Random(seed+1)).nextInt(inputLen)] = 0;
+
+        final long javaTimeStart = java.lang.System.currentTimeMillis();
+        final int javaRsltLinear = fz(inputArray);
+        final long javaTimeEnd = java.lang.System.currentTimeMillis();
+
+        final Int2 javaRslt = new Int2(javaRsltLinear % dimX, javaRsltLinear / dimX);
+        final int javaCellVal = inputArray[javaRslt.x + dimX * javaRslt.y];
+
+        final long rsTimeStart = java.lang.System.currentTimeMillis();
 
         Type.Builder typeBuilder = new Type.Builder(RS, Element.I32(RS));
         typeBuilder.setX(dimX).setY(dimY);
         Allocation inputAllocation = Allocation.createTyped(RS, typeBuilder.create());
+
+        final long copyTimeStart = java.lang.System.currentTimeMillis();
+
         inputAllocation.copy2DRangeFrom(0, 0, dimX, dimY, inputArray);
 
+        final long kernelTimeStart = java.lang.System.currentTimeMillis();
         final Int2 rsRslt = s.reduce_fz2(inputAllocation).get();
-
-        final int cellVal = inputArray[rsRslt.x + dimX * rsRslt.y];
-        final boolean success = (cellVal == 0);
+        final long rsTimeEnd = java.lang.System.currentTimeMillis();
+
+        final int rsCellVal = inputArray[rsRslt.x + dimX * rsRslt.y];
+        final boolean success = (rsCellVal == 0);
+        String status = (success ? "PASSED" : "FAILED");
+        if (success)
+            status += " " + timing.string(javaTimeStart, javaTimeEnd, rsTimeStart,
+                                          copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation);
         Log.i(TAG,
-                "fz2: input[" + rsRslt.x + ", " + rsRslt.y + "] == " + cellVal + ": " +
-                (success ? "PASSED" : "FAILED"));
+                "fz2: java input[" + javaRslt.x + ", " + javaRslt.y + "] == " + javaCellVal +
+                ", rs input[" + rsRslt.x + ", " + rsRslt.y + "] == " + rsCellVal + ": " + status);
         return success;
     }
 
     ///////////////////////////////////////////////////////////////////
 
-    private boolean fz3(RenderScript RS, ScriptC_reduce s) {
-        final int dimX = 59, dimY = 48, dimZ = 37;
+    private boolean fz3(RenderScript RS, ScriptC_reduce s, int seed, int[] size) {
+        final int dimX = size[0], dimY = size[1], dimZ = size[2];
         final int inputLen = dimX * dimY * dimZ;
 
-        int[] inputArray = createInputArrayInt(inputLen, 9);
+        int[] inputArray = createInputArrayInt(inputLen, seed+0);
         // just in case we got unlucky
-        inputArray[(new Random(10)).nextInt(inputLen)] = 0;
+        inputArray[(new Random(seed+1)).nextInt(inputLen)] = 0;
+
+        final long javaTimeStart = java.lang.System.currentTimeMillis();
+        final int javaRsltLinear = fz(inputArray);
+        final long javaTimeEnd = java.lang.System.currentTimeMillis();
+
+        final Int3 javaRslt = new Int3(
+            javaRsltLinear % dimX,
+            (javaRsltLinear / dimX) % dimY,
+            javaRsltLinear / (dimX * dimY));
+        final int javaCellVal = inputArray[javaRslt.x + dimX * javaRslt.y + dimX * dimY * javaRslt.z];
+
+        final long rsTimeStart = java.lang.System.currentTimeMillis();
 
         Type.Builder typeBuilder = new Type.Builder(RS, Element.I32(RS));
         typeBuilder.setX(dimX).setY(dimY).setZ(dimZ);
         Allocation inputAllocation = Allocation.createTyped(RS, typeBuilder.create());
+
+        final long copyTimeStart = java.lang.System.currentTimeMillis();
+
         inputAllocation.copy3DRangeFrom(0, 0, 0, dimX, dimY, dimZ, inputArray);
 
+        final long kernelTimeStart = java.lang.System.currentTimeMillis();
         final Int3 rsRslt = s.reduce_fz3(inputAllocation).get();
-
-        final int cellVal = inputArray[rsRslt.x + dimX * rsRslt.y + dimX * dimY * rsRslt.z];
-        final boolean success = (cellVal == 0);
+        final long rsTimeEnd = java.lang.System.currentTimeMillis();
+
+        final int rsCellVal = inputArray[rsRslt.x + dimX * rsRslt.y + dimX * dimY * rsRslt.z];
+        final boolean success = (rsCellVal == 0);
+        String status = (success ? "PASSED" : "FAILED");
+        if (success)
+            status += " " + timing.string(javaTimeStart, javaTimeEnd, rsTimeStart,
+                                          copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation);
         Log.i(TAG,
-                "fz3: input[" + rsRslt.x + ", " + rsRslt.y + ", " + rsRslt.z + "] == " + cellVal + ": " +
-                (success ? "PASSED" : "FAILED"));
+                "fz3: java input[" + javaRslt.x + ", " + javaRslt.y + ", " + javaRslt.z + "] == " + javaCellVal +
+                ", rs input[" + rsRslt.x + ", " + rsRslt.y + ", " + rsRslt.z + "] == " + rsCellVal + ": " + status);
         return success;
     }
 
@@ -271,24 +509,43 @@ public class UT_reduce extends UnitTest {
         return outputArray;
     }
 
-    private boolean histogram(RenderScript RS, ScriptC_reduce s) {
-        final byte[] inputArray = createInputArrayByte(100000, 11);
+    private boolean histogram_array(RenderScript RS, ScriptC_reduce s, int seed, int size[]) {
+        final byte[] inputArray = createInputArrayByte(size[0], seed);
 
         final long[] javaRslt = histogram(RS, inputArray);
         _RS_ASSERT("javaRslt unexpected length: " + javaRslt.length, javaRslt.length == histogramBucketCount);
         final long[] rsRslt = s.reduce_histogram(inputArray).get();
         _RS_ASSERT("rsRslt unexpected length: " + rsRslt.length, rsRslt.length == histogramBucketCount);
 
-        for (int i = 0; i < histogramBucketCount; ++i) {
-            if (javaRslt[i] != rsRslt[i]) {
-                Log.i(TAG,
-                        "histogram[" + i + "]: java " + javaRslt[i] + ", rs " + rsRslt[i] + ": FAILED");
-                return false;
-            }
-        }
+        return result("histogram_array", new timing(size[0]), javaRslt, rsRslt);
+    }
 
-        Log.i(TAG, "histogram: PASSED");
-        return true;
+    private boolean histogram(RenderScript RS, ScriptC_reduce s, int seed, int size[]) {
+        final byte[] inputArray = createInputArrayByte(size[0], seed);
+
+        final long javaTimeStart = java.lang.System.currentTimeMillis();
+        final long[] javaRslt = histogram(RS, inputArray);
+        final long javaTimeEnd = java.lang.System.currentTimeMillis();
+        _RS_ASSERT("javaRslt unexpected length: " + javaRslt.length, javaRslt.length == histogramBucketCount);
+
+        final long rsTimeStart = java.lang.System.currentTimeMillis();
+
+        Allocation inputAllocation = Allocation.createSized(RS, Element.U8(RS), inputArray.length);
+
+        final long copyTimeStart = java.lang.System.currentTimeMillis();
+
+        inputAllocation.copyFrom(inputArray);
+
+        final long kernelTimeStart = java.lang.System.currentTimeMillis();
+        final long[] rsRslt = s.reduce_histogram(inputAllocation).get();
+        final long rsTimeEnd = java.lang.System.currentTimeMillis();
+        _RS_ASSERT("rsRslt unexpected length: " + rsRslt.length, rsRslt.length == histogramBucketCount);
+
+        // NOTE: The "java time" is actually for the RenderScript histogram intrinsic
+        return result("histogram",
+                new timing(javaTimeStart, javaTimeEnd, rsTimeStart,
+                           copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation),
+                javaRslt, rsRslt);
     }
 
     //-----------------------------------------------------------------
@@ -302,17 +559,250 @@ public class UT_reduce extends UnitTest {
         return new Int2(modeIdx, (int)hsg[modeIdx]);
     }
 
-    private boolean mode(RenderScript RS, ScriptC_reduce s) {
-        final byte[] inputArray = createInputArrayByte(100000, 12);
+    private boolean mode_array(RenderScript RS, ScriptC_reduce s, int seed, int size[]) {  // mode reduction fed directly from a Java array
+        final byte[] inputArray = createInputArrayByte(size[0], seed);
 
         final Int2 javaRslt = mode(RS, inputArray);
         final Int2 rsRslt = s.reduce_mode(inputArray).get();
 
-        return result("mode", javaRslt, rsRslt);
+        return result("mode_array", new timing(size[0]), javaRslt, rsRslt);  // label matches the method and its TestDescription name
+    }
+
+    ///////////////////////////////////////////////////////////////////
+
+    private long sumgcd(final int in1[], final int in2[]) {
+        _RS_ASSERT("sumgcd input length mismatch", in1.length == in2.length);
+
+        long sum = 0;
+        for (int i = 0; i < in1.length; ++i) {
+            int a = in1[i], b = in2[i];
+
+            while (b != 0) {
+                final int aNew = b;
+                final int bNew = a % b;
+
+                a = aNew;
+                b = bNew;
+            }
+
+            sum += a;
+        }
+        return sum;
+    }
+
+    private boolean sumgcd(RenderScript RS, ScriptC_reduce s, int seed, int size[]) {
+        final int len = size[0];
+
+        final int[] inputArrayA = createInputArrayInt(len, seed+0);
+        final int[] inputArrayB = createInputArrayInt(len, seed+1);
+
+        final long javaTimeStart = java.lang.System.currentTimeMillis();
+        final long javaRslt = sumgcd(inputArrayA, inputArrayB);
+        final long javaTimeEnd = java.lang.System.currentTimeMillis();
+
+        final long rsTimeStart = java.lang.System.currentTimeMillis();
+
+        Allocation inputAllocationA = Allocation.createSized(RS, Element.I32(RS), len);
+        Allocation inputAllocationB = Allocation.createSized(RS, Element.I32(RS), len);
+
+        final long copyTimeStart = java.lang.System.currentTimeMillis();
+
+        inputAllocationA.copyFrom(inputArrayA);
+        inputAllocationB.copyFrom(inputArrayB);
+
+        final long kernelTimeStart = java.lang.System.currentTimeMillis();
+        final long rsRslt = s.reduce_sumgcd(inputAllocationA, inputAllocationB).get();
+        final long rsTimeEnd = java.lang.System.currentTimeMillis();
+
+        return result("sumgcd",
+                new timing(javaTimeStart, javaTimeEnd, rsTimeStart, copyTimeStart, kernelTimeStart, rsTimeEnd,
+                        inputAllocationA, inputAllocationB),
+                javaRslt, rsRslt);
     }
 
     ///////////////////////////////////////////////////////////////////
 
+    public static final int maxSeedsPerTest = 10;
+
+    static interface Test {
+        // A test execution is characterized by two properties: A seed
+        // and a size.
+        //
+        // The seed is used for generating pseudorandom input data.
+        // Ideally, we use different seeds for different tests and for
+        // different executions of the same test at different sizes.
+        // A test with multiple blocks of input data (i.e., for a
+        // reduction with multiple inputs) may want multiple seeds; it
+        // may use the seeds seed..seed+maxSeedsPerTest-1.
+        //
+        // The size indicates the amount of input data.  It is the number
+        // of cells in a particular dimension of the iteration space.
+        boolean run(RenderScript RS, ScriptC_reduce s, int seed, int[] size);
+    };
+
+    static class TestDescription {
+        public TestDescription(String myTestName, Test myTest, int mySeed, int[] myDefSize, int[] myLog2MaxSize) {
+            testName    = myTestName;
+            test        = myTest;
+            seed        = mySeed;
+            defSize     = myDefSize;
+            log2MaxSize = myLog2MaxSize;
+        };
+
+        public TestDescription(String myTestName, Test myTest, int mySeed, int[] myDefSize) {
+            testName    = myTestName;
+            test        = myTest;
+            seed        = mySeed;
+            defSize     = myDefSize;
+            log2MaxSize = null;
+        };
+
+        public final String testName;
+
+        public final Test test;
+
+        // When executing the test, scale this up by maxSeedsPerTest.
+        public final int seed;
+
+        // If we're only going to run the test once, what size should
+        // we use?
+        public final int[] defSize;
+
+        // If we're going to run the test over a range of sizes, what
+        // is the maximum size to use?
+        public final int[] log2MaxSize;
+    };
+
+    private boolean run(TestDescription td, RenderScript RS, ScriptC_reduce s, int seed, int[] size) {
+        String arrayContent = "";
+        for (int i = 0; i < size.length; ++i) {
+            if (i != 0)
+                arrayContent += ", ";
+            arrayContent += size[i];
+        }
+        Log.i(TAG, "Running " + td.testName + "(seed = " + seed + ", size[] = {" + arrayContent + "})");
+        return td.test.run(RS, s, seed, size);
+    }
+
+    private final TestDescription[] correctnessTests = {
+        // alloc and array variants of the same test will use the same
+        // seed, in case results need to be compared.
+
+        new TestDescription("addint1D", this::addint1D, 0, new int[]{100000}, new int[]{20}),
+        new TestDescription("addint1D_array", this::addint1D_array, 0, new int[]{100000}, new int[]{20}),
+        new TestDescription("addint2D", this::addint2D, 1, new int[]{450, 225}),
+        new TestDescription("findMinAndMax", this::findMinAndMax, 3, new int[]{100000}, new int[]{20}),
+        new TestDescription("findMinAndMax_array", this::findMinAndMax_array, 3, new int[]{100000}, new int[]{20}),
+        new TestDescription("fz", this::fz, 4, new int[]{100000}, new int[]{20}),
+        new TestDescription("fz_array", this::fz_array, 4, new int[]{100000}, new int[]{20}),
+        new TestDescription("fz2", this::fz2, 5, new int[]{225, 450}),
+        new TestDescription("fz3", this::fz3, 6, new int[]{59, 48, 37}),
+        new TestDescription("histogram", this::histogram, 7, new int[]{100000}, new int[]{20}),
+        new TestDescription("histogram_array", this::histogram_array, 7, new int[]{100000}, new int[]{20}),
+        // might want to add: new TestDescription("mode", this::mode, 8, new int[]{100000}, new int[]{20}),
+        new TestDescription("mode_array", this::mode_array, 8, new int[]{100000}, new int[]{20}),
+        new TestDescription("sumgcd", this::sumgcd, 9, new int[]{1 << 16}, new int[]{20})
+    };
+
+    private boolean runCorrectnessQuick(RenderScript RS, ScriptC_reduce s) {
+        boolean pass = true;
+
+        for (TestDescription td : correctnessTests) {
+            pass &= run(td, RS, s, maxSeedsPerTest * td.seed, td.defSize);
+        }
+
+        return pass;
+    }
+
+    private boolean runCorrectness(RenderScript RS, ScriptC_reduce s) {
+        boolean pass = true;
+
+        for (TestDescription td : correctnessTests) {
+            if (td.log2MaxSize == null)  // TODO: Eventually this should never happen?
+                continue;
+
+            if (td.log2MaxSize.length == 1) {
+                final int log2MaxSize = td.log2MaxSize[0];
+                // We will execute the test with the following sizes:
+                // (a) Each power of 2 from zero (2**0) up to log2MaxSize (2**log2MaxSize)
+                // (b) Each size from (a) +/-1
+                // (c) 2 random sizes between adjacent points in (a)
+                int[] testSizes = new int[
+                    /* a */ (1 + log2MaxSize) +
+                    /* b */ 2*(1 + log2MaxSize) +
+                    /* c */ 2*log2MaxSize];
+
+                // NOTE: Each test execution gets maxSeedsPerTest, and
+                // there are up to 3 + 5*log2MaxSize test executions
+                // of a test, and we need a seed for (c).  Assuming
+                // log2MaxSize does not exceed 32, then it should be
+                // sufficient to reserve 1 + 5*32*maxSeedsPerTest seeds
+                // per TestDescription.
+                final int seedForPickingTestSizes = td.seed * (1 + 5*32*maxSeedsPerTest);
+
+                int nextTestIdx = 0;
+
+                // Fill in (a) and (b)
+                for (int i = 0; i <= log2MaxSize; ++i) {
+                    final int pwrOf2 = 1 << i;
+                    testSizes[nextTestIdx++] = pwrOf2;      /* a */
+                    testSizes[nextTestIdx++] = pwrOf2 - 1;  /* b */
+                    testSizes[nextTestIdx++] = pwrOf2 + 1;  /* b */
+                }
+
+                // Fill in (c)
+                Random r = new Random(seedForPickingTestSizes);
+                for (int i = 0; i < log2MaxSize; ++i) {
+                    final int lo = (1 << i) + 1;
+                    final int hi = 1 << (i + 1);
+
+                    if (lo < hi) {
+                        for (int j = 0; j < 2; ++j) {
+                            testSizes[nextTestIdx++] = r.nextInt(hi - lo) + lo;
+                        }
+                    }
+                }
+
+                Arrays.sort(testSizes);
+
+                int[] lastTestSizeArg = new int[]{-1};
+                for (int i = 0; i < testSizes.length; ++i) {
+                    if ((testSizes[i] > 0) && (testSizes[i] != lastTestSizeArg[0])) {
+                        lastTestSizeArg[0] = testSizes[i];
+                        final int seedForTestExecution = seedForPickingTestSizes + 1 + i*maxSeedsPerTest;
+                        pass &= run(td, RS, s, seedForTestExecution, lastTestSizeArg);
+                    }
+                }
+            }
+            // TODO: lengths 2 and 3, and assert otherwise
+        }
+
+        return pass;
+    }
+
+    private final TestDescription[] performanceTests = {
+        new TestDescription("addint1D", this::addint1D, 0, new int[]{100000 << 10}),
+        new TestDescription("addint2D", this::addint2D, 1, new int[]{450 << 5, 225 << 5}),
+        new TestDescription("findMinAndMax", this::findMinAndMax, 3, new int[]{100000 << 9}),
+        new TestDescription("fz", this::fz, 4, new int[]{100000 << 10}),
+        new TestDescription("fz2", this::fz2, 5, new int[]{225 << 5, 450 << 5}),
+        new TestDescription("fz3", this::fz3, 6, new int[]{59 << 3, 48 << 3, 37 << 3}),
+        new TestDescription("histogram", this::histogram, 7, new int[]{100000 << 10}),
+        // might want to add: new TestDescription("mode", this::mode, 8, new int[]{100000}),
+        new TestDescription("sumgcd", this::sumgcd, 9, new int[]{1 << 21})
+    };
+
+    private boolean runPerformanceQuick(RenderScript RS, ScriptC_reduce s) {
+        boolean pass = true;
+
+        for (TestDescription td : performanceTests) {
+            pass &= run(td, RS, s, maxSeedsPerTest * td.seed, td.defSize);
+        }
+
+        return pass;
+    }
+
+
     public void run() {
         RenderScript pRS = RenderScript.create(mCtx);
         ScriptC_reduce s = new ScriptC_reduce(pRS);
@@ -320,15 +810,10 @@ public class UT_reduce extends UnitTest {
         s.set_posInf(Float.POSITIVE_INFINITY);
 
         boolean pass = true;
-        pass &= addint1D(pRS, s);
-        pass &= addint2D(pRS, s);
-        pass &= dp(pRS, s);
-        pass &= findMinAndMax(pRS, s);
-        pass &= fz(pRS, s);
-        pass &= fz2(pRS, s);
-        pass &= fz3(pRS, s);
-        pass &= histogram(pRS, s);
-        pass &= mode(pRS, s);
+
+        pass &= runCorrectnessQuick(pRS, s);
+        pass &= runCorrectness(pRS, s);
+        // pass &= runPerformanceQuick(pRS, s);
 
         pRS.finish();
         pRS.destroy();
diff --git a/java/tests/RsTest/src/com/android/rs/test/UT_reduce_backward.java b/java/tests/RsTest/src/com/android/rs/test/UT_reduce_backward.java
index 3a64a738..6a50d2bf 100644
--- a/java/tests/RsTest/src/com/android/rs/test/UT_reduce_backward.java
+++ b/java/tests/RsTest/src/com/android/rs/test/UT_reduce_backward.java
@@ -119,39 +119,6 @@ public class UT_reduce_backward extends UnitTest {
 
     ///////////////////////////////////////////////////////////////////
 
-    private float dp(float[] input1, float[] input2) {
-        _RS_ASSERT("dp input length mismatch", input1.length == input2.length);
-
-        float rslt = 0;
-        for (int idx = 0; idx < input1.length; ++idx)
-            rslt += input1[idx] * input2[idx];
-        return rslt;
-    }
-
-    private boolean dp(RenderScript RS, ScriptC_reduce_backward s) {
-        final float[] input1 = createInputArrayFloat(100000, 2);
-        final float[] input2 = createInputArrayFloat(100000, 3);
-
-        final float javaRslt = dp(input1, input2);
-        final float rsRslt = s.reduce_dp(input1, input2).get();
-
-        // NOTE: Using a floating point equality check to test for
-        // correctness -- as we do below -- is a bad idea.  It's only
-        // reliable if the Java and RenderScript implementation of dp
-        // use the same algorithm.  Equality could be broken by
-        // different optimizations between the two, or running the
-        // RenderScript algorithm multithreaded, or running the
-        // RenderScript algorithm on a GPU rather than the CPU.
-        //
-        // Should we be checking instead that the results are
-        // "sufficiently close"?  Cooking the input set to try to
-        // ensure a deterministic result?  Changing to integers
-        // instead?
-        return result("dp", javaRslt, rsRslt);
-    }
-
-    ///////////////////////////////////////////////////////////////////
-
     private Int2 findMinAndMax(float[] input) {
         float minVal = Float.POSITIVE_INFINITY;
         int minIdx = -1;
@@ -322,7 +289,6 @@ public class UT_reduce_backward extends UnitTest {
         boolean pass = true;
         pass &= addint1D(pRS, s);
         pass &= addint2D(pRS, s);
-        pass &= dp(pRS, s);
         pass &= findMinAndMax(pRS, s);
         pass &= fz(pRS, s);
         pass &= fz2(pRS, s);
diff --git a/java/tests/RsTest/src/com/android/rs/test/reduce.rs b/java/tests/RsTest/src/com/android/rs/test/reduce.rs
index be09dfb6..97b45e0c 100644
--- a/java/tests/RsTest/src/com/android/rs/test/reduce.rs
+++ b/java/tests/RsTest/src/com/android/rs/test/reduce.rs
@@ -16,18 +16,6 @@ static void aiAccum(int *accum, int val) { *accum += val; }
 
 /////////////////////////////////////////////////////////////////////////
 
-#pragma rs reduce(dp) \
-  accumulator(dpAccum) combiner(dpSum)
-
-static void dpAccum(float *accum, float in1, float in2) {
-  *accum += in1*in2;
-}
-
-// combiner function
-static void dpSum(float *accum, const float *val) { *accum += *val; }
-
-/////////////////////////////////////////////////////////////////////////
-
 #pragma rs reduce(findMinAndMax) \
   initializer(fMMInit) accumulator(fMMAccumulator) \
   combiner(fMMCombiner) outconverter(fMMOutConverter)
@@ -61,8 +49,10 @@ static void fMMAccumulator(MinAndMax *accum, float in, int x) {
 
 static void fMMCombiner(MinAndMax *accum,
                         const MinAndMax *val) {
-  fMMAccumulator(accum, val->min.val, val->min.idx);
-  fMMAccumulator(accum, val->max.val, val->max.idx);
+  if (val->min.val < accum->min.val)  // strict '<': on a tie accum keeps its existing min entry
+    accum->min = val->min;  // take val's whole min entry, so its val and idx stay paired
+  if (val->max.val > accum->max.val)  // strict '>': on a tie accum keeps its existing max entry
+    accum->max = val->max;  // take val's whole max entry, so its val and idx stay paired
 }
 
 static void fMMOutConverter(int2 *result,
@@ -160,3 +150,24 @@ static void modeOutConvert(int2 *result, const Histogram *h) {
   result->x = mode;
   result->y = (*h)[mode];
 }
+
+/////////////////////////////////////////////////////////////////////////
+
+#pragma rs reduce(sumgcd) accumulator(sgAccum) combiner(sgCombine)
+
+static int gcd(int a, int b) {  // iterative Euclidean algorithm; returns a unchanged when b == 0 (so gcd(0, 0) is 0)
+  while (b != 0) {
+    const int aNew = b;
+    const int bNew = a % b;  // remainder takes the sign of a, so a negative input can produce a negative result
+
+    a = aNew;
+    b = bNew;
+  }
+  return a;
+}
+
+static void sgAccum(long *accum, int a, int b) {  // accumulator function of reduce(sumgcd): adds gcd(a, b) for one input pair to the running sum
+  *accum += gcd(a, b);
+}
+
+static void sgCombine(long *accum, const long *other) { *accum += *other; }  // combiner function of reduce(sumgcd): adds the sum in *other into *accum
diff --git a/java/tests/RsTest/src/com/android/rs/test/reduce_backward.rs b/java/tests/RsTest/src/com/android/rs/test/reduce_backward.rs
index 419e7090..41252c8a 100644
--- a/java/tests/RsTest/src/com/android/rs/test/reduce_backward.rs
+++ b/java/tests/RsTest/src/com/android/rs/test/reduce_backward.rs
@@ -15,18 +15,6 @@ static void aiAccum(int *accum, int val) { *accum += val; }
 
 /////////////////////////////////////////////////////////////////////////
 
-static void dpAccum(float *accum, float in1, float in2) {
-  *accum += in1*in2;
-}
-
-// combiner function
-static void dpSum(float *accum, const float *val) { *accum += *val; }
-
-#pragma rs reduce(dp) \
-  accumulator(dpAccum) combiner(dpSum)
-
-/////////////////////////////////////////////////////////////////////////
-
 typedef struct {
   float val;
   int idx;
@@ -56,8 +44,10 @@ static void fMMAccumulator(MinAndMax *accum, float in, int x) {
 
 static void fMMCombiner(MinAndMax *accum,
                         const MinAndMax *val) {
-  fMMAccumulator(accum, val->min.val, val->min.idx);
-  fMMAccumulator(accum, val->max.val, val->max.idx);
+  if (val->min.val < accum->min.val)  // strict '<': on a tie accum keeps its existing min entry
+    accum->min = val->min;  // take val's whole min entry, so its val and idx stay paired
+  if (val->max.val > accum->max.val)  // strict '>': on a tie accum keeps its existing max entry
+    accum->max = val->max;  // take val's whole max entry, so its val and idx stay paired
 }
 
 static void fMMOutConverter(int2 *result,
diff --git a/rsContext.cpp b/rsContext.cpp
index 77e82f64..122815fb 100644
--- a/rsContext.cpp
+++ b/rsContext.cpp
@@ -260,6 +260,8 @@ void * Context::threadProc(void *vrsc) {
     rsc->props.mLogShadersAttr = getProp("debug.rs.shader.attributes") != 0;
     rsc->props.mLogShadersUniforms = getProp("debug.rs.shader.uniforms") != 0;
     rsc->props.mLogVisual = getProp("debug.rs.visual") != 0;
+    rsc->props.mLogReduceAccum = getProp("debug.rs.reduce-accum") != 0;
+    rsc->props.mDebugReduceSplitAccum = getProp("debug.rs.reduce-split-accum") != 0;
     rsc->props.mDebugMaxThreads = getProp("debug.rs.max-threads");
 
     if (getProp("debug.rs.debug") != 0) {
diff --git a/rsContext.h b/rsContext.h
index fce22b56..dd2fc00e 100644
--- a/rsContext.h
+++ b/rsContext.h
@@ -226,6 +226,8 @@ public:
         bool mLogShadersAttr;
         bool mLogShadersUniforms;
         bool mLogVisual;
+        bool mLogReduceAccum;
+        bool mDebugReduceSplitAccum;
         uint32_t mDebugMaxThreads;
     } props;