diff options
Diffstat (limited to 'renderscript-toolkit/src/main/cpp/Convolve5x5.cpp')
-rw-r--r-- | renderscript-toolkit/src/main/cpp/Convolve5x5.cpp | 348 |
1 files changed, 348 insertions, 0 deletions
diff --git a/renderscript-toolkit/src/main/cpp/Convolve5x5.cpp b/renderscript-toolkit/src/main/cpp/Convolve5x5.cpp new file mode 100644 index 0000000..6731bf4 --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/Convolve5x5.cpp @@ -0,0 +1,348 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <cstdint> + +#include "RenderScriptToolkit.h" +#include "TaskProcessor.h" +#include "Utils.h" + +namespace renderscript { + +#define LOG_TAG "renderscript.toolkit.Convolve5x5" + +extern "C" void rsdIntrinsicConvolve5x5_K(void* dst, const void* y0, const void* y1, const void* y2, + const void* y3, const void* y4, const int16_t* coef, + uint32_t count); + +class Convolve5x5Task : public Task { + const void* mIn; + void* mOut; + // Even though we have exactly 25 coefficients, store them in an array of size 28 so that + // the SIMD instructions can load them in three chunks of 8 and 1 of chunk of 4. + float mFp[28]; + int16_t mIp[28]; + + void kernelU4(uchar* out, uint32_t xstart, uint32_t xend, const uchar* py0, const uchar* py1, + const uchar* py2, const uchar* py3, const uchar* py4); + void convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY, + size_t startX, size_t startY, size_t endX, size_t endY); + + // Process a 2D tile of the overall work. threadIndex identifies which thread does the work. + void processData(int threadIndex, size_t startX, size_t startY, size_t endX, + size_t endY) override; + + public: + Convolve5x5Task(const void* in, void* out, size_t vectorSize, size_t sizeX, size_t sizeY, + const float* coefficients, const Restriction* restriction) + : Task{sizeX, sizeY, vectorSize, false, restriction}, mIn{in}, mOut{out} { + for (int ct = 0; ct < 25; ct++) { + mFp[ct] = coefficients[ct]; + if (mFp[ct] >= 0) { + mIp[ct] = (int16_t)(mFp[ct] * 256.f + 0.5f); + } else { + mIp[ct] = (int16_t)(mFp[ct] * 256.f - 0.5f); + } + } + } +}; + +template <typename InputOutputType, typename ComputationType> +static void ConvolveOneU(uint32_t x, InputOutputType* out, const InputOutputType* py0, + const InputOutputType* py1, const InputOutputType* py2, + const InputOutputType* py3, const InputOutputType* py4, const float* coeff, + int32_t width) { + uint32_t x0 = std::max((int32_t)x - 2, 0); + uint32_t x1 = std::max((int32_t)x - 1, 0); + uint32_t x2 = x; + uint32_t x3 = std::min((int32_t)x + 1, width - 1); + uint32_t x4 = std::min((int32_t)x + 2, width - 1); + + ComputationType px = convert<ComputationType>(py0[x0]) * coeff[0] + + convert<ComputationType>(py0[x1]) * coeff[1] + + convert<ComputationType>(py0[x2]) * coeff[2] + + convert<ComputationType>(py0[x3]) * coeff[3] + + convert<ComputationType>(py0[x4]) * coeff[4] + + + convert<ComputationType>(py1[x0]) * coeff[5] + + convert<ComputationType>(py1[x1]) * coeff[6] + + convert<ComputationType>(py1[x2]) * coeff[7] + + convert<ComputationType>(py1[x3]) * coeff[8] + + convert<ComputationType>(py1[x4]) * coeff[9] + + + convert<ComputationType>(py2[x0]) * coeff[10] + + convert<ComputationType>(py2[x1]) * coeff[11] + + convert<ComputationType>(py2[x2]) * coeff[12] + + convert<ComputationType>(py2[x3]) * coeff[13] + + convert<ComputationType>(py2[x4]) * coeff[14] + + + convert<ComputationType>(py3[x0]) * coeff[15] + + convert<ComputationType>(py3[x1]) * coeff[16] + + convert<ComputationType>(py3[x2]) * coeff[17] + + convert<ComputationType>(py3[x3]) * coeff[18] + + convert<ComputationType>(py3[x4]) * coeff[19] + + + convert<ComputationType>(py4[x0]) * coeff[20] + + convert<ComputationType>(py4[x1]) * coeff[21] + + convert<ComputationType>(py4[x2]) * coeff[22] + + convert<ComputationType>(py4[x3]) * coeff[23] + + convert<ComputationType>(py4[x4]) * coeff[24]; + px = clamp(px + 0.5f, 0.f, 255.f); + *out = convert<InputOutputType>(px); +} + +#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT +template <typename InputOutputType> +static void ConvolveOneF(uint32_t x, InputOutputType* out, const InputOutputType* py0, + const InputOutputType* py1, const InputOutputType* py2, + const InputOutputType* py3, const InputOutputType* py4, const float* coeff, + int32_t width) { + uint32_t x0 = std::max((int32_t)x - 2, 0); + uint32_t x1 = std::max((int32_t)x - 1, 0); + uint32_t x2 = x; + uint32_t x3 = std::min((int32_t)x + 1, width - 1); + uint32_t x4 = std::min((int32_t)x + 2, width - 1); + + InputOutputType px = py0[x0] * coeff[0] + py0[x1] * coeff[1] + py0[x2] * coeff[2] + + py0[x3] * coeff[3] + py0[x4] * coeff[4] + + + py1[x0] * coeff[5] + py1[x1] * coeff[6] + py1[x2] * coeff[7] + + py1[x3] * coeff[8] + py1[x4] * coeff[9] + + + py2[x0] * coeff[10] + py2[x1] * coeff[11] + py2[x2] * coeff[12] + + py2[x3] * coeff[13] + py2[x4] * coeff[14] + + + py3[x0] * coeff[15] + py3[x1] * coeff[16] + py3[x2] * coeff[17] + + py3[x3] * coeff[18] + py3[x4] * coeff[19] + + + py4[x0] * coeff[20] + py4[x1] * coeff[21] + py4[x2] * coeff[22] + + py4[x3] * coeff[23] + py4[x4] * coeff[24]; + *out = px; +} +#endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT + +/** + * This function convolves one line. + * + * @param pout Where to place the next output. + * @param xstart Index in the X direction of where to start. + * @param xend End index + * @param ppy0 Points to the start of the line two above. + * @param ppy1 Points to the start of the line one above. + * @param ppy2 Points to the start of the current line. + * @param ppy3 Points to the start of the line one below. + * @param ppy4 Points to the start of the line two below. + */ +void Convolve5x5Task::kernelU4(uchar* pout, uint32_t x1, uint32_t x2, const uchar* ppy0, + const uchar* ppy1, const uchar* ppy2, const uchar* ppy3, + const uchar* ppy4) { + uchar4* out = (uchar4*)pout; + const uchar4* py0 = (const uchar4*)ppy0; + const uchar4* py1 = (const uchar4*)ppy1; + const uchar4* py2 = (const uchar4*)ppy2; + const uchar4* py3 = (const uchar4*)ppy3; + const uchar4* py4 = (const uchar4*)ppy4; + + while ((x1 < x2) && (x1 < 2)) { + ConvolveOneU<uchar4, float4>(x1, out, py0, py1, py2, py3, py4, mFp, mSizeX); + out++; + x1++; + } +#if defined(ARCH_X86_HAVE_SSSE3) + // for x86 SIMD, require minimum of 7 elements (4 for SIMD, + // 3 for end boundary where x may hit the end boundary) + if (mUsesSimd && ((x1 + 6) < x2)) { + // subtract 3 for end boundary + uint32_t len = (x2 - x1 - 3) >> 2; + rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, + py4 + x1 - 2, mIp, len); + out += len << 2; + x1 += len << 2; + } +#endif + +#if defined(ARCH_ARM_USE_INTRINSICS) + if (mUsesSimd && ((x1 + 3) < x2)) { + uint32_t len = (x2 - x1 - 3) >> 1; + rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, + py4 + x1 - 2, mIp, len); + out += len << 1; + x1 += len << 1; + } +#endif + + while (x1 < x2) { + ConvolveOneU<uchar4, float4>(x1, out, py0, py1, py2, py3, py4, mFp, mSizeX); + out++; + x1++; + } +} + +#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT +// This will need more cleanup before it can be used. +void Convolve5x5Task::kernelF4(const ConvolveInfo* info, float4* out, + uint32_t xstart, uint32_t xend, uint32_t currentY) { + const uchar* pin = (const uchar*)info->in; + const size_t stride = info->stride; + + uint32_t y0 = std::max((int32_t)currentY - 2, 0); + uint32_t y1 = std::max((int32_t)currentY - 1, 0); + uint32_t y2 = currentY; + uint32_t y3 = std::min((int32_t)currentY + 1, sizeY); + uint32_t y4 = std::min((int32_t)currentY + 2, sizeY); + + const float4* py0 = (const float4*)(pin + stride * y0); + const float4* py1 = (const float4*)(pin + stride * y1); + const float4* py2 = (const float4*)(pin + stride * y2); + const float4* py3 = (const float4*)(pin + stride * y3); + const float4* py4 = (const float4*)(pin + stride * y4); + + for (uint32_t x = xstart; x < xend; x++, out++) { + ConvolveOneF<float4>(x, out, py0, py1, py2, py3, py4, mFp, sizeX); + } +} + +void RsdCpuScriptIntrinsicConvolve5x5_kernelF2(const ConvolveInfo* info, float2* out, + uint32_t xstart, uint32_t xend, uint32_t currentY) { + const uchar* pin = (const uchar*)info->in; + const size_t stride = info->stride; + + uint32_t y0 = std::max((int32_t)currentY - 2, 0); + uint32_t y1 = std::max((int32_t)currentY - 1, 0); + uint32_t y2 = currentY; + uint32_t y3 = std::min((int32_t)currentY + 1, sizeY); + uint32_t y4 = std::min((int32_t)currentY + 2, sizeY); + + const float2* py0 = (const float2*)(pin + stride * y0); + const float2* py1 = (const float2*)(pin + stride * y1); + const float2* py2 = (const float2*)(pin + stride * y2); + const float2* py3 = (const float2*)(pin + stride * y3); + const float2* py4 = (const float2*)(pin + stride * y4); + + for (uint32_t x = xstart; x < xend; x++, out++) { + ConvolveOneF<float2>(x, out, py0, py1, py2, py3, py4, mFp, sizeX); + } +} + +void RsdCpuScriptIntrinsicConvolve5x5_kernelF1(const ConvolveInfo* info, float* out, + uint32_t xstart, uint32_t xend, uint32_t currentY) { + const uchar* pin = (const uchar*)info->in; + const size_t stride = info->stride; + + uint32_t y0 = std::max((int32_t)currentY - 2, 0); + uint32_t y1 = std::max((int32_t)currentY - 1, 0); + uint32_t y2 = currentY; + uint32_t y3 = std::min((int32_t)currentY + 1, sizeY); + uint32_t y4 = std::min((int32_t)currentY + 2, sizeY); + + const float* py0 = (const float*)(pin + stride * y0); + const float* py1 = (const float*)(pin + stride * y1); + const float* py2 = (const float*)(pin + stride * y2); + const float* py3 = (const float*)(pin + stride * y3); + const float* py4 = (const float*)(pin + stride * y4); + + for (uint32_t x = xstart; x < xend; x++, out++) { + ConvolveOneF<float>(x, out, py0, py1, py2, py3, py4, mFp, sizeX); + } +} +#endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT + +template <typename InputOutputType, typename ComputationType> +static void convolveU(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY, + size_t startX, size_t startY, size_t endX, size_t endY, float* mFp) { + const size_t stride = vectorSize * sizeX; + for (size_t y = startY; y < endY; y++) { + uint32_t y0 = std::max((int32_t)y - 2, 0); + uint32_t y1 = std::max((int32_t)y - 1, 0); + uint32_t y2 = y; + uint32_t y3 = std::min((int32_t)y + 1, (int32_t)(sizeY - 1)); + uint32_t y4 = std::min((int32_t)y + 2, (int32_t)(sizeY - 1)); + + size_t offset = (y * sizeX + startX) * vectorSize; + InputOutputType* px = (InputOutputType*)(pout + offset); + InputOutputType* py0 = (InputOutputType*)(pin + stride * y0); + InputOutputType* py1 = (InputOutputType*)(pin + stride * y1); + InputOutputType* py2 = (InputOutputType*)(pin + stride * y2); + InputOutputType* py3 = (InputOutputType*)(pin + stride * y3); + InputOutputType* py4 = (InputOutputType*)(pin + stride * y4); + for (uint32_t x = startX; x < endX; x++, px++) { + ConvolveOneU<InputOutputType, ComputationType>(x, px, py0, py1, py2, py3, py4, mFp, + sizeX); + } + } +} + +void Convolve5x5Task::convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, + size_t sizeY, size_t startX, size_t startY, size_t endX, + size_t endY) { + const size_t stride = paddedSize(vectorSize) * sizeX; + for (size_t y = startY; y < endY; y++) { + uint32_t y0 = std::max((int32_t)y - 2, 0); + uint32_t y1 = std::max((int32_t)y - 1, 0); + uint32_t y2 = y; + uint32_t y3 = std::min((int32_t)y + 1, (int32_t)(sizeY - 1)); + uint32_t y4 = std::min((int32_t)y + 2, (int32_t)(sizeY - 1)); + + size_t offset = (y * sizeX + startX) * paddedSize(vectorSize); + uchar* px = pout + offset; + const uchar* py0 = pin + stride * y0; + const uchar* py1 = pin + stride * y1; + const uchar* py2 = pin + stride * y2; + const uchar* py3 = pin + stride * y3; + const uchar* py4 = pin + stride * y4; + kernelU4(px, startX, endX, py0, py1, py2, py3, py4); + } +} + +void Convolve5x5Task::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX, + size_t endY) { + // ALOGI("Thread %d start tile from (%zd, %zd) to (%zd, %zd)", threadIndex, startX, startY, + // endX, endY); + switch (mVectorSize) { + case 1: + convolveU<uchar, float>((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY, + startX, startY, endX, endY, mFp); + break; + case 2: + convolveU<uchar2, float2>((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY, + startX, startY, endX, endY, mFp); + break; + case 3: + case 4: + convolveU4((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY, startX, startY, + endX, endY); + break; + } +} + +void RenderScriptToolkit::convolve5x5(const void* in, void* out, size_t vectorSize, size_t sizeX, + size_t sizeY, const float* coefficients, + const Restriction* restriction) { +#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE + if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) { + return; + } + if (vectorSize < 1 || vectorSize > 4) { + ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize); + return; + } +#endif + + Convolve5x5Task task(in, out, vectorSize, sizeX, sizeY, coefficients, restriction); + processor->doTask(&task); +} + +} // namespace renderscript |