/* * Copyright (C) 2012 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include "RenderScriptToolkit.h" #include "TaskProcessor.h" #include "Utils.h" namespace android { namespace renderscript { #define LOG_TAG "renderscript.toolkit.Convolve5x5" extern "C" void rsdIntrinsicConvolve5x5_K(void* dst, const void* y0, const void* y1, const void* y2, const void* y3, const void* y4, const int16_t* coef, uint32_t count); class Convolve5x5Task : public Task { const void* mIn; void* mOut; // Even though we have exactly 25 coefficients, store them in an array of size 28 so that // the SIMD instructions can load them in three chunks of 8 and 1 of chunk of 4. float mFp[28]; int16_t mIp[28]; void kernelU4(uchar* out, uint32_t xstart, uint32_t xend, const uchar* py0, const uchar* py1, const uchar* py2, const uchar* py3, const uchar* py4); void convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY, size_t startX, size_t startY, size_t endX, size_t endY); // Process a 2D tile of the overall work. threadIndex identifies which thread does the work. virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX, size_t endY) override; public: Convolve5x5Task(const void* in, void* out, size_t vectorSize, size_t sizeX, size_t sizeY, const float* coefficients, const Restriction* restriction) : Task{sizeX, sizeY, vectorSize, false, restriction}, mIn{in}, mOut{out} { for (int ct = 0; ct < 25; ct++) { mFp[ct] = coefficients[ct]; if (mFp[ct] >= 0) { mIp[ct] = (int16_t)(mFp[ct] * 256.f + 0.5f); } else { mIp[ct] = (int16_t)(mFp[ct] * 256.f - 0.5f); } } } }; template static void ConvolveOneU(uint32_t x, InputOutputType* out, const InputOutputType* py0, const InputOutputType* py1, const InputOutputType* py2, const InputOutputType* py3, const InputOutputType* py4, const float* coeff, int32_t width) { uint32_t x0 = std::max((int32_t)x - 2, 0); uint32_t x1 = std::max((int32_t)x - 1, 0); uint32_t x2 = x; uint32_t x3 = std::min((int32_t)x + 1, width - 1); uint32_t x4 = std::min((int32_t)x + 2, width - 1); ComputationType px = convert(py0[x0]) * coeff[0] + convert(py0[x1]) * coeff[1] + convert(py0[x2]) * coeff[2] + convert(py0[x3]) * coeff[3] + convert(py0[x4]) * coeff[4] + convert(py1[x0]) * coeff[5] + convert(py1[x1]) * coeff[6] + convert(py1[x2]) * coeff[7] + convert(py1[x3]) * coeff[8] + convert(py1[x4]) * coeff[9] + convert(py2[x0]) * coeff[10] + convert(py2[x1]) * coeff[11] + convert(py2[x2]) * coeff[12] + convert(py2[x3]) * coeff[13] + convert(py2[x4]) * coeff[14] + convert(py3[x0]) * coeff[15] + convert(py3[x1]) * coeff[16] + convert(py3[x2]) * coeff[17] + convert(py3[x3]) * coeff[18] + convert(py3[x4]) * coeff[19] + convert(py4[x0]) * coeff[20] + convert(py4[x1]) * coeff[21] + convert(py4[x2]) * coeff[22] + convert(py4[x3]) * coeff[23] + convert(py4[x4]) * coeff[24]; px = clamp(px + 0.5f, 0.f, 255.f); *out = convert(px); } #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT template static void ConvolveOneF(uint32_t x, InputOutputType* out, const InputOutputType* py0, const InputOutputType* py1, const InputOutputType* py2, const InputOutputType* py3, const InputOutputType* py4, const float* coeff, int32_t width) { uint32_t x0 = std::max((int32_t)x - 2, 0); uint32_t x1 = std::max((int32_t)x - 1, 0); uint32_t x2 = x; uint32_t x3 = std::min((int32_t)x + 1, width - 1); uint32_t x4 = std::min((int32_t)x + 2, width - 1); InputOutputType px = py0[x0] * coeff[0] + py0[x1] * coeff[1] + py0[x2] * coeff[2] + py0[x3] * coeff[3] + py0[x4] * coeff[4] + py1[x0] * coeff[5] + py1[x1] * coeff[6] + py1[x2] * coeff[7] + py1[x3] * coeff[8] + py1[x4] * coeff[9] + py2[x0] * coeff[10] + py2[x1] * coeff[11] + py2[x2] * coeff[12] + py2[x3] * coeff[13] + py2[x4] * coeff[14] + py3[x0] * coeff[15] + py3[x1] * coeff[16] + py3[x2] * coeff[17] + py3[x3] * coeff[18] + py3[x4] * coeff[19] + py4[x0] * coeff[20] + py4[x1] * coeff[21] + py4[x2] * coeff[22] + py4[x3] * coeff[23] + py4[x4] * coeff[24]; *out = px; } #endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT /** * This function convolves one line. * * @param pout Where to place the next output. * @param xstart Index in the X direction of where to start. * @param xend End index * @param ppy0 Points to the start of the line two above. * @param ppy1 Points to the start of the line one above. * @param ppy2 Points to the start of the current line. * @param ppy3 Points to the start of the line one below. * @param ppy4 Points to the start of the line two below. */ void Convolve5x5Task::kernelU4(uchar* pout, uint32_t x1, uint32_t x2, const uchar* ppy0, const uchar* ppy1, const uchar* ppy2, const uchar* ppy3, const uchar* ppy4) { uchar4* out = (uchar4*)pout; const uchar4* py0 = (const uchar4*)ppy0; const uchar4* py1 = (const uchar4*)ppy1; const uchar4* py2 = (const uchar4*)ppy2; const uchar4* py3 = (const uchar4*)ppy3; const uchar4* py4 = (const uchar4*)ppy4; while ((x1 < x2) && (x1 < 2)) { ConvolveOneU(x1, out, py0, py1, py2, py3, py4, mFp, mSizeX); out++; x1++; } #if defined(ARCH_X86_HAVE_SSSE3) // for x86 SIMD, require minimum of 7 elements (4 for SIMD, // 3 for end boundary where x may hit the end boundary) if (mUsesSimd && ((x1 + 6) < x2)) { // subtract 3 for end boundary uint32_t len = (x2 - x1 - 3) >> 2; rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, py4 + x1 - 2, mIp, len); out += len << 2; x1 += len << 2; } #endif #if defined(ARCH_ARM_USE_INTRINSICS) if (mUsesSimd && ((x1 + 3) < x2)) { uint32_t len = (x2 - x1 - 3) >> 1; rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, py4 + x1 - 2, mIp, len); out += len << 1; x1 += len << 1; } #endif while (x1 < x2) { ConvolveOneU(x1, out, py0, py1, py2, py3, py4, mFp, mSizeX); out++; x1++; } } #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT // This will need more cleanup before it can be used. void Convolve5x5Task::kernelF4(const ConvolveInfo* info, float4* out, uint32_t xstart, uint32_t xend, uint32_t currentY) { const uchar* pin = (const uchar*)info->in; const size_t stride = info->stride; uint32_t y0 = std::max((int32_t)currentY - 2, 0); uint32_t y1 = std::max((int32_t)currentY - 1, 0); uint32_t y2 = currentY; uint32_t y3 = std::min((int32_t)currentY + 1, sizeY); uint32_t y4 = std::min((int32_t)currentY + 2, sizeY); const float4* py0 = (const float4*)(pin + stride * y0); const float4* py1 = (const float4*)(pin + stride * y1); const float4* py2 = (const float4*)(pin + stride * y2); const float4* py3 = (const float4*)(pin + stride * y3); const float4* py4 = (const float4*)(pin + stride * y4); for (uint32_t x = xstart; x < xend; x++, out++) { ConvolveOneF(x, out, py0, py1, py2, py3, py4, mFp, sizeX); } } void RsdCpuScriptIntrinsicConvolve5x5_kernelF2(const ConvolveInfo* info, float2* out, uint32_t xstart, uint32_t xend, uint32_t currentY) { const uchar* pin = (const uchar*)info->in; const size_t stride = info->stride; uint32_t y0 = std::max((int32_t)currentY - 2, 0); uint32_t y1 = std::max((int32_t)currentY - 1, 0); uint32_t y2 = currentY; uint32_t y3 = std::min((int32_t)currentY + 1, sizeY); uint32_t y4 = std::min((int32_t)currentY + 2, sizeY); const float2* py0 = (const float2*)(pin + stride * y0); const float2* py1 = (const float2*)(pin + stride * y1); const float2* py2 = (const float2*)(pin + stride * y2); const float2* py3 = (const float2*)(pin + stride * y3); const float2* py4 = (const float2*)(pin + stride * y4); for (uint32_t x = xstart; x < xend; x++, out++) { ConvolveOneF(x, out, py0, py1, py2, py3, py4, mFp, sizeX); } } void RsdCpuScriptIntrinsicConvolve5x5_kernelF1(const ConvolveInfo* info, float* out, uint32_t xstart, uint32_t xend, uint32_t currentY) { const uchar* pin = (const uchar*)info->in; const size_t stride = info->stride; uint32_t y0 = std::max((int32_t)currentY - 2, 0); uint32_t y1 = std::max((int32_t)currentY - 1, 0); uint32_t y2 = currentY; uint32_t y3 = std::min((int32_t)currentY + 1, sizeY); uint32_t y4 = std::min((int32_t)currentY + 2, sizeY); const float* py0 = (const float*)(pin + stride * y0); const float* py1 = (const float*)(pin + stride * y1); const float* py2 = (const float*)(pin + stride * y2); const float* py3 = (const float*)(pin + stride * y3); const float* py4 = (const float*)(pin + stride * y4); for (uint32_t x = xstart; x < xend; x++, out++) { ConvolveOneF(x, out, py0, py1, py2, py3, py4, mFp, sizeX); } } #endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT template static void convolveU(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY, size_t startX, size_t startY, size_t endX, size_t endY, float* mFp) { const size_t stride = vectorSize * sizeX; for (size_t y = startY; y < endY; y++) { uint32_t y0 = std::max((int32_t)y - 2, 0); uint32_t y1 = std::max((int32_t)y - 1, 0); uint32_t y2 = y; uint32_t y3 = std::min((int32_t)y + 1, (int32_t)(sizeY - 1)); uint32_t y4 = std::min((int32_t)y + 2, (int32_t)(sizeY - 1)); size_t offset = (y * sizeX + startX) * vectorSize; InputOutputType* px = (InputOutputType*)(pout + offset); InputOutputType* py0 = (InputOutputType*)(pin + stride * y0); InputOutputType* py1 = (InputOutputType*)(pin + stride * y1); InputOutputType* py2 = (InputOutputType*)(pin + stride * y2); InputOutputType* py3 = (InputOutputType*)(pin + stride * y3); InputOutputType* py4 = (InputOutputType*)(pin + stride * y4); for (uint32_t x = startX; x < endX; x++, px++) { ConvolveOneU(x, px, py0, py1, py2, py3, py4, mFp, sizeX); } } } void Convolve5x5Task::convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY, size_t startX, size_t startY, size_t endX, size_t endY) { const size_t stride = paddedSize(vectorSize) * sizeX; for (size_t y = startY; y < endY; y++) { uint32_t y0 = std::max((int32_t)y - 2, 0); uint32_t y1 = std::max((int32_t)y - 1, 0); uint32_t y2 = y; uint32_t y3 = std::min((int32_t)y + 1, (int32_t)(sizeY - 1)); uint32_t y4 = std::min((int32_t)y + 2, (int32_t)(sizeY - 1)); size_t offset = (y * sizeX + startX) * paddedSize(vectorSize); uchar* px = pout + offset; const uchar* py0 = pin + stride * y0; const uchar* py1 = pin + stride * y1; const uchar* py2 = pin + stride * y2; const uchar* py3 = pin + stride * y3; const uchar* py4 = pin + stride * y4; kernelU4(px, startX, endX, py0, py1, py2, py3, py4); } } void Convolve5x5Task::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX, size_t endY) { // ALOGI("Thread %d start tile from (%zd, %zd) to (%zd, %zd)", threadIndex, startX, startY, // endX, endY); switch (mVectorSize) { case 1: convolveU((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY, startX, startY, endX, endY, mFp); break; case 2: convolveU((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY, startX, startY, endX, endY, mFp); break; case 3: case 4: convolveU4((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY, startX, startY, endX, endY); break; } } void RenderScriptToolkit::convolve5x5(const void* in, void* out, size_t vectorSize, size_t sizeX, size_t sizeY, const float* coefficients, const Restriction* restriction) { #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) { return; } if (vectorSize < 1 || vectorSize > 4) { ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize); return; } #endif Convolve5x5Task task(in, out, vectorSize, sizeX, sizeY, coefficients, restriction); processor->doTask(&task); } } // namespace renderscript } // namespace android