diff options
author | Simon Hosie <simon.hosie@arm.com> | 2014-02-19 22:08:48 -0800 |
---|---|---|
committer | Simon Hosie <simon.hosie@arm.com> | 2014-03-04 13:59:27 -0800 |
commit | 446788007efe0a673d0366284026adfa17b36fed (patch) | |
tree | 5f749c1d902299dbf629408df074039e85e58e32 /cpu_ref/rsCpuIntrinsicBlur.cpp | |
parent | ccd7a46d0c0052209bf3ab8657f40622065d1d1f (diff) | |
download | rs-446788007efe0a673d0366284026adfa17b36fed.tar.gz |
Optimisations to blur intrinsic.
Try to keep all data in-register wherever possible, and use only a minimal
circular buffer on the stack when necessary. Implementations in AArch32 and
AArch64 NEON.
Change-Id: If3dd4932a94ee1cadde46e298b8f6bf14b6c2bdc
Diffstat (limited to 'cpu_ref/rsCpuIntrinsicBlur.cpp')
-rw-r--r-- | cpu_ref/rsCpuIntrinsicBlur.cpp | 78 |
1 files changed, 23 insertions, 55 deletions
diff --git a/cpu_ref/rsCpuIntrinsicBlur.cpp b/cpu_ref/rsCpuIntrinsicBlur.cpp index 52e31f90..1d6c57b0 100644 --- a/cpu_ref/rsCpuIntrinsicBlur.cpp +++ b/cpu_ref/rsCpuIntrinsicBlur.cpp @@ -37,7 +37,7 @@ public: protected: float mFp[104]; - short mIp[104]; + uint16_t mIp[104]; void **mScratch; size_t *mScratchSize; float mRadius; @@ -95,7 +95,7 @@ void RsdCpuScriptIntrinsicBlur::ComputeGaussianWeights() { normalizeFactor = 1.0f / normalizeFactor; for (r = -mIradius; r <= mIradius; r ++) { mFp[r + mIradius] *= normalizeFactor; - mIp[r + mIradius] = (short)(mIp[r + mIradius] * 32768); + mIp[r + mIradius] = (uint16_t)(mFp[r + mIradius] * 65536.0f + 0.5f); } } @@ -147,27 +147,16 @@ static void OneVU1(const RsForEachStubParamStruct *p, float *out, int32_t x, int out[0] = blurredPixel; } -extern "C" void rsdIntrinsicBlurVFU4_K(void *dst, const void *pin, int stride, const void *gptr, int rct, int x1, int ct); -extern "C" void rsdIntrinsicBlurHFU4_K(void *dst, const void *pin, const void *gptr, int rct, int x1, int ct); -extern "C" void rsdIntrinsicBlurHFU1_K(void *dst, const void *pin, const void *gptr, int rct, int x1, int ct); + +extern "C" void rsdIntrinsicBlurU1_K(uchar *out, uchar const *in, size_t w, size_t h, + size_t p, size_t x, size_t y, size_t count, size_t r, uint16_t const *tab); +extern "C" void rsdIntrinsicBlurU4_K(uchar4 *out, uchar4 const *in, size_t w, size_t h, + size_t p, size_t x, size_t y, size_t count, size_t r, uint16_t const *tab); static void OneVFU4(float4 *out, const uchar *ptrIn, int iStride, const float* gPtr, int ct, int x1, int x2) { -#if defined(ARCH_ARM_HAVE_VFP) - if (gArchUseSIMD) { - int t = (x2 - x1); - t &= ~1; - if(t) { - rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, x1, x1 + t); - x1 += t; - ptrIn += t << 2; - out += t; - } - } -#endif - while(x2 > x1) { const uchar *pi = ptrIn; float4 blurredPixel = 0; @@ -209,19 +198,6 @@ static void OneVFU1(float *out, len--; } -#if defined(ARCH_ARM_HAVE_VFP) - if (gArchUseSIMD && (x2 > 
x1)) { - int t = (x2 - x1) >> 2; - t &= ~1; - if(t) { - rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, 0, t ); - len -= t << 2; - ptrIn += t << 2; - out += t << 2; - } - } -#endif - while(len > 0) { const uchar *pi = ptrIn; float blurredPixel = 0; @@ -289,6 +265,14 @@ void RsdCpuScriptIntrinsicBlur::kernelU4(const RsForEachStubParamStruct *p, uint32_t x1 = xstart; uint32_t x2 = xend; +#if defined(ARCH_ARM_HAVE_VFP) + if (gArchUseSIMD) { + rsdIntrinsicBlurU4_K(out, (uchar4 const *)(pin + stride * p->y), p->dimX, p->dimY, + stride, x1, p->y, x2 - x1, cp->mIradius, cp->mIp + cp->mIradius); + return; + } +#endif + if (p->dimX > 2048) { if ((p->dimX > cp->mScratchSize[p->lid]) || !cp->mScratch[p->lid]) { // Pad the side of the allocation by one unit to allow alignment later @@ -317,16 +301,6 @@ void RsdCpuScriptIntrinsicBlur::kernelU4(const RsForEachStubParamStruct *p, out++; x1++; } -#if defined(ARCH_ARM_HAVE_VFP) - if (gArchUseSIMD) { - if ((x1 + cp->mIradius) < x2) { - rsdIntrinsicBlurHFU4_K(out, buf - cp->mIradius, cp->mFp, - cp->mIradius * 2 + 1, x1, x2 - cp->mIradius); - out += (x2 - cp->mIradius) - x1; - x1 = x2 - cp->mIradius; - } - } -#endif while(x2 > x1) { OneHU4(p, out, x1, buf, cp->mFp, cp->mIradius); out++; @@ -350,6 +324,14 @@ void RsdCpuScriptIntrinsicBlur::kernelU1(const RsForEachStubParamStruct *p, uint32_t x1 = xstart; uint32_t x2 = xend; +#if defined(ARCH_ARM_HAVE_VFP) + if (gArchUseSIMD) { + rsdIntrinsicBlurU1_K(out, pin + stride * p->y, p->dimX, p->dimY, + stride, x1, p->y, x2 - x1, cp->mIradius, cp->mIp + cp->mIradius); + return; + } +#endif + float *fout = (float *)buf; int y = p->y; if ((y > cp->mIradius) && (y < ((int)p->dimY - cp->mIradius -1))) { @@ -370,20 +352,6 @@ void RsdCpuScriptIntrinsicBlur::kernelU1(const RsForEachStubParamStruct *p, out++; x1++; } -#if defined(ARCH_ARM_HAVE_VFP) - if (gArchUseSIMD) { - if ((x1 + cp->mIradius) < x2) { - uint32_t len = x2 - (x1 + cp->mIradius); - len &= ~3; - if (len > 0) { - 
rsdIntrinsicBlurHFU1_K(out, ((float *)buf) - cp->mIradius, cp->mFp, - cp->mIradius * 2 + 1, x1, x1 + len); - out += len; - x1 += len; - } - } - } -#endif while(x2 > x1) { OneHU1(p, out, x1, buf, cp->mFp, cp->mIradius); out++; |