diff options
author | Jason Sams <jsams@google.com> | 2012-11-27 18:26:33 -0800 |
---|---|---|
committer | Jason Sams <jsams@google.com> | 2012-11-27 18:26:33 -0800 |
commit | 7079cd8ddb0874d431446aea305b8c3b10e4f0ba (patch) | |
tree | a9632163bb1c73e15ff0eceb1475c7adb832797d /cpu_ref/rsCpuIntrinsicBlur.cpp | |
parent | c905efd76fdcc1b8846b229bf7d991d185a7b4b7 (diff) | |
download | rs-7079cd8ddb0874d431446aea305b8c3b10e4f0ba.tar.gz |
Fix uchar blur performance.
Fix bug with misaligned access on odd width images.
Change-Id: Ie9f40121cf8eddbafdb2149814fcff28fff62036
Diffstat (limited to 'cpu_ref/rsCpuIntrinsicBlur.cpp')
-rw-r--r-- | cpu_ref/rsCpuIntrinsicBlur.cpp | 46 |
1 file changed, 37 insertions, 9 deletions
diff --git a/cpu_ref/rsCpuIntrinsicBlur.cpp b/cpu_ref/rsCpuIntrinsicBlur.cpp index 1229f79e..cdeac0b5 100644 --- a/cpu_ref/rsCpuIntrinsicBlur.cpp +++ b/cpu_ref/rsCpuIntrinsicBlur.cpp @@ -56,6 +56,9 @@ protected: void RsdCpuScriptIntrinsicBlur::ComputeGaussianWeights() { + memset(fp, 0, sizeof(fp)); + memset(ip, 0, sizeof(ip)); + // Compute gaussian weights for the blur // e is the euler's number float e = 2.718281828459045f; @@ -144,6 +147,7 @@ static void OneVU1(const RsForEachStubParamStruct *p, float *out, int32_t x, int extern "C" void rsdIntrinsicBlurVFU4_K(void *dst, const void *pin, int stride, const void *gptr, int rct, int x1, int ct); extern "C" void rsdIntrinsicBlurHFU4_K(void *dst, const void *pin, const void *gptr, int rct, int x1, int ct); +extern "C" void rsdIntrinsicBlurHFU1_K(void *dst, const void *pin, const void *gptr, int rct, int x1, int ct); static void OneVFU4(float4 *out, const uchar *ptrIn, int iStride, const float* gPtr, int ct, @@ -174,22 +178,40 @@ static void OneVFU4(float4 *out, out->xyzw = blurredPixel; x1++; out++; + ptrIn++; } } static void OneVFU1(float *out, const uchar *ptrIn, int iStride, const float* gPtr, int ct, int len) { + while(len && (((int)ptrIn) & 0x3)) { + const uchar *pi = ptrIn; + float blurredPixel = 0; + const float* gp = gPtr; + + for (int r = 0; r < ct; r++) { + float pf = (float)pi[0]; + blurredPixel += pf * gp[0]; + pi += iStride; + gp++; + } + out[0] = blurredPixel; + len--; + out++; + ptrIn++; + } + #if defined(ARCH_ARM_HAVE_NEON) { int t = len >> 2; t &= ~1; if(t) { - rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, 0, len); + rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, 0, t << 2); + len -= t << 2; + ptrIn += t << 2; + out += t << 2; } - len -= t << 2; - ptrIn += t << 2; - out += t << 2; } #endif @@ -207,6 +229,7 @@ static void OneVFU1(float *out, out[0] = blurredPixel; len--; out++; + ptrIn++; } } @@ -244,6 +267,7 @@ static void OneHU1(const RsForEachStubParamStruct *p, uchar *out, int32_t x, 
void RsdCpuScriptIntrinsicBlur::kernelU4(const RsForEachStubParamStruct *p, uint32_t xstart, uint32_t xend, uint32_t instep, uint32_t outstep) { + // fixme float buf[4 * 2048]; RsdCpuScriptIntrinsicBlur *cp = (RsdCpuScriptIntrinsicBlur *)p->usr; if (!cp->alloc.get()) { @@ -321,16 +345,20 @@ void RsdCpuScriptIntrinsicBlur::kernelU1(const RsForEachStubParamStruct *p, } x1 = xstart; - while ((x1 < (uint32_t)cp->iradius) && (x1 < x2)) { + while ((x1 < x2) && + ((x1 < (uint32_t)cp->iradius) || (((int)out) & 0x3))) { OneHU1(p, out, x1, buf, cp->fp, cp->iradius); out++; x1++; } -#if 0//defined(ARCH_ARM_HAVE_NEON) +#if defined(ARCH_ARM_HAVE_NEON) if ((x1 + cp->iradius) < x2) { - rsdIntrinsicBlurHFU4_K(out, ((float4 *)buf) - cp->iradius, cp->fp, cp->iradius * 2 + 1, x1, 0, x2 - cp->iradius); - out += (x2 - cp->iradius) - x1; - x1 = x2 - cp->iradius; + uint32_t len = x2 - (x1 + cp->iradius); + len >>= 2; + rsdIntrinsicBlurHFU1_K(out, ((float4 *)buf) - cp->iradius, cp->fp, + cp->iradius * 2 + 1, x1, x1+len); + out += len << 2; + x1 += len << 2; } #endif while(x2 > x1) { |