1 /*
2  * Copyright (C) 2012 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <cstdint>
18 
19 #include "RenderScriptToolkit.h"
20 #include "TaskProcessor.h"
21 #include "Utils.h"
22 
23 namespace android {
24 namespace renderscript {
25 
26 #define LOG_TAG "renderscript.toolkit.Convolve5x5"
27 
// SIMD implementation of the 5x5 convolution. It is only declared here (with C linkage); the
// definition lives outside this file.
//
// As used by Convolve5x5Task::kernelU4 below:
//  - dst is the location of the next output pixel,
//  - y0..y4 point into the five input rows, each positioned two pixels to the left of the
//    first pixel being produced,
//  - coef is the int16_t coefficient table (Convolve5x5Task::mIp),
//  - count is an iteration count, not a pixel count. The caller advances by 4 pixels per count
//    on the SSSE3 path and by 2 pixels per count on the ARM path.
//    NOTE(review): the per-iteration pixel count is inferred from the caller — confirm against
//    the kernel implementation.
extern "C" void rsdIntrinsicConvolve5x5_K(void* dst, const void* y0, const void* y1, const void* y2,
                                          const void* y3, const void* y4, const int16_t* coef,
                                          uint32_t count);
31 
32 class Convolve5x5Task : public Task {
33     const void* mIn;
34     void* mOut;
35     // Even though we have exactly 25 coefficients, store them in an array of size 28 so that
36     // the SIMD instructions can load them in three chunks of 8 and 1 of chunk of 4.
37     float mFp[28];
38     int16_t mIp[28];
39 
40     void kernelU4(uchar* out, uint32_t xstart, uint32_t xend, const uchar* py0, const uchar* py1,
41                   const uchar* py2, const uchar* py3, const uchar* py4);
42     void convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY,
43                     size_t startX, size_t startY, size_t endX, size_t endY);
44 
45     // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
46     virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
47                              size_t endY) override;
48 
49    public:
Convolve5x5Task(const void * in,void * out,size_t vectorSize,size_t sizeX,size_t sizeY,const float * coefficients,const Restriction * restriction)50     Convolve5x5Task(const void* in, void* out, size_t vectorSize, size_t sizeX, size_t sizeY,
51                     const float* coefficients, const Restriction* restriction)
52         : Task{sizeX, sizeY, vectorSize, false, restriction}, mIn{in}, mOut{out} {
53         for (int ct = 0; ct < 25; ct++) {
54             mFp[ct] = coefficients[ct];
55             if (mFp[ct] >= 0) {
56                 mIp[ct] = (int16_t)(mFp[ct] * 256.f + 0.5f);
57             } else {
58                 mIp[ct] = (int16_t)(mFp[ct] * 256.f - 0.5f);
59             }
60         }
61     }
62 };
63 
64 template <typename InputOutputType, typename ComputationType>
ConvolveOneU(uint32_t x,InputOutputType * out,const InputOutputType * py0,const InputOutputType * py1,const InputOutputType * py2,const InputOutputType * py3,const InputOutputType * py4,const float * coeff,int32_t width)65 static void ConvolveOneU(uint32_t x, InputOutputType* out, const InputOutputType* py0,
66                          const InputOutputType* py1, const InputOutputType* py2,
67                          const InputOutputType* py3, const InputOutputType* py4, const float* coeff,
68                          int32_t width) {
69     uint32_t x0 = std::max((int32_t)x - 2, 0);
70     uint32_t x1 = std::max((int32_t)x - 1, 0);
71     uint32_t x2 = x;
72     uint32_t x3 = std::min((int32_t)x + 1, width - 1);
73     uint32_t x4 = std::min((int32_t)x + 2, width - 1);
74 
75     ComputationType px = convert<ComputationType>(py0[x0]) * coeff[0] +
76                          convert<ComputationType>(py0[x1]) * coeff[1] +
77                          convert<ComputationType>(py0[x2]) * coeff[2] +
78                          convert<ComputationType>(py0[x3]) * coeff[3] +
79                          convert<ComputationType>(py0[x4]) * coeff[4] +
80 
81                          convert<ComputationType>(py1[x0]) * coeff[5] +
82                          convert<ComputationType>(py1[x1]) * coeff[6] +
83                          convert<ComputationType>(py1[x2]) * coeff[7] +
84                          convert<ComputationType>(py1[x3]) * coeff[8] +
85                          convert<ComputationType>(py1[x4]) * coeff[9] +
86 
87                          convert<ComputationType>(py2[x0]) * coeff[10] +
88                          convert<ComputationType>(py2[x1]) * coeff[11] +
89                          convert<ComputationType>(py2[x2]) * coeff[12] +
90                          convert<ComputationType>(py2[x3]) * coeff[13] +
91                          convert<ComputationType>(py2[x4]) * coeff[14] +
92 
93                          convert<ComputationType>(py3[x0]) * coeff[15] +
94                          convert<ComputationType>(py3[x1]) * coeff[16] +
95                          convert<ComputationType>(py3[x2]) * coeff[17] +
96                          convert<ComputationType>(py3[x3]) * coeff[18] +
97                          convert<ComputationType>(py3[x4]) * coeff[19] +
98 
99                          convert<ComputationType>(py4[x0]) * coeff[20] +
100                          convert<ComputationType>(py4[x1]) * coeff[21] +
101                          convert<ComputationType>(py4[x2]) * coeff[22] +
102                          convert<ComputationType>(py4[x3]) * coeff[23] +
103                          convert<ComputationType>(py4[x4]) * coeff[24];
104     px = clamp(px + 0.5f, 0.f, 255.f);
105     *out = convert<InputOutputType>(px);
106 }
107 
108 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
/**
 * Produces a single output cell of the 5x5 convolution for float-based data.
 *
 * The five column indices around x are clamped to [0, width - 1]. The 25 weighted taps are
 * accumulated row by row and stored without any rounding or clamping.
 *
 * @param x Column of the cell to compute.
 * @param out Where the resulting cell is stored.
 * @param py0 Start of the row two above the current one.
 * @param py1 Start of the row one above the current one.
 * @param py2 Start of the current row.
 * @param py3 Start of the row one below the current one.
 * @param py4 Start of the row two below the current one.
 * @param coeff The 25 coefficients, row-major.
 * @param width Number of cells in a row.
 */
template <typename InputOutputType>
static void ConvolveOneF(uint32_t x, InputOutputType* out, const InputOutputType* py0,
                         const InputOutputType* py1, const InputOutputType* py2,
                         const InputOutputType* py3, const InputOutputType* py4, const float* coeff,
                         int32_t width) {
    const int32_t column = (int32_t)x;
    const int32_t lastColumn = width - 1;

    // Column indices of the five taps, clamped to the row.
    const uint32_t left2 = std::max(column - 2, 0);
    const uint32_t left1 = std::max(column - 1, 0);
    const uint32_t center = x;
    const uint32_t right1 = std::min(column + 1, lastColumn);
    const uint32_t right2 = std::min(column + 2, lastColumn);

    // The accumulation is kept strictly left to right, one row per statement, so the floating
    // point evaluation order matches a single 25-term sum.
    InputOutputType sum = py0[left2] * coeff[0] + py0[left1] * coeff[1] +
                          py0[center] * coeff[2] + py0[right1] * coeff[3] +
                          py0[right2] * coeff[4];

    sum = sum + py1[left2] * coeff[5] + py1[left1] * coeff[6] + py1[center] * coeff[7] +
          py1[right1] * coeff[8] + py1[right2] * coeff[9];

    sum = sum + py2[left2] * coeff[10] + py2[left1] * coeff[11] + py2[center] * coeff[12] +
          py2[right1] * coeff[13] + py2[right2] * coeff[14];

    sum = sum + py3[left2] * coeff[15] + py3[left1] * coeff[16] + py3[center] * coeff[17] +
          py3[right1] * coeff[18] + py3[right2] * coeff[19];

    sum = sum + py4[left2] * coeff[20] + py4[left1] * coeff[21] + py4[center] * coeff[22] +
          py4[right1] * coeff[23] + py4[right2] * coeff[24];

    *out = sum;
}
136 #endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
137 
138 /**
139  * This function convolves one line.
140  *
141  * @param pout Where to place the next output.
142  * @param xstart Index in the X direction of where to start.
143  * @param xend End index
144  * @param ppy0 Points to the start of the line two above.
145  * @param ppy1 Points to the start of the line one above.
146  * @param ppy2 Points to the start of the current line.
147  * @param ppy3 Points to the start of the line one below.
148  * @param ppy4 Points to the start of the line two below.
149  */
kernelU4(uchar * pout,uint32_t x1,uint32_t x2,const uchar * ppy0,const uchar * ppy1,const uchar * ppy2,const uchar * ppy3,const uchar * ppy4)150 void Convolve5x5Task::kernelU4(uchar* pout, uint32_t x1, uint32_t x2, const uchar* ppy0,
151                                const uchar* ppy1, const uchar* ppy2, const uchar* ppy3,
152                                const uchar* ppy4) {
153     uchar4* out = (uchar4*)pout;
154     const uchar4* py0 = (const uchar4*)ppy0;
155     const uchar4* py1 = (const uchar4*)ppy1;
156     const uchar4* py2 = (const uchar4*)ppy2;
157     const uchar4* py3 = (const uchar4*)ppy3;
158     const uchar4* py4 = (const uchar4*)ppy4;
159 
160     while ((x1 < x2) && (x1 < 2)) {
161         ConvolveOneU<uchar4, float4>(x1, out, py0, py1, py2, py3, py4, mFp, mSizeX);
162         out++;
163         x1++;
164     }
165 #if defined(ARCH_X86_HAVE_SSSE3)
166     // for x86 SIMD, require minimum of 7 elements (4 for SIMD,
167     // 3 for end boundary where x may hit the end boundary)
168     if (mUsesSimd && ((x1 + 6) < x2)) {
169         // subtract 3 for end boundary
170         uint32_t len = (x2 - x1 - 3) >> 2;
171         rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2,
172                                   py4 + x1 - 2, mIp, len);
173         out += len << 2;
174         x1 += len << 2;
175     }
176 #endif
177 
178 #if defined(ARCH_ARM_USE_INTRINSICS)
179     if (mUsesSimd && ((x1 + 3) < x2)) {
180         uint32_t len = (x2 - x1 - 3) >> 1;
181         rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2,
182                                   py4 + x1 - 2, mIp, len);
183         out += len << 1;
184         x1 += len << 1;
185     }
186 #endif
187 
188     while (x1 < x2) {
189         ConvolveOneU<uchar4, float4>(x1, out, py0, py1, py2, py3, py4, mFp, mSizeX);
190         out++;
191         x1++;
192     }
193 }
194 
195 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
196 // This will need more cleanup before it can be used.
kernelF4(const ConvolveInfo * info,float4 * out,uint32_t xstart,uint32_t xend,uint32_t currentY)197 void Convolve5x5Task::kernelF4(const ConvolveInfo* info, float4* out,
198                                uint32_t xstart, uint32_t xend, uint32_t currentY) {
199     const uchar* pin = (const uchar*)info->in;
200     const size_t stride = info->stride;
201 
202     uint32_t y0 = std::max((int32_t)currentY - 2, 0);
203     uint32_t y1 = std::max((int32_t)currentY - 1, 0);
204     uint32_t y2 = currentY;
205     uint32_t y3 = std::min((int32_t)currentY + 1, sizeY);
206     uint32_t y4 = std::min((int32_t)currentY + 2, sizeY);
207 
208     const float4* py0 = (const float4*)(pin + stride * y0);
209     const float4* py1 = (const float4*)(pin + stride * y1);
210     const float4* py2 = (const float4*)(pin + stride * y2);
211     const float4* py3 = (const float4*)(pin + stride * y3);
212     const float4* py4 = (const float4*)(pin + stride * y4);
213 
214     for (uint32_t x = xstart; x < xend; x++, out++) {
215         ConvolveOneF<float4>(x, out, py0, py1, py2, py3, py4, mFp, sizeX);
216     }
217 }
218 
RsdCpuScriptIntrinsicConvolve5x5_kernelF2(const ConvolveInfo * info,float2 * out,uint32_t xstart,uint32_t xend,uint32_t currentY)219 void RsdCpuScriptIntrinsicConvolve5x5_kernelF2(const ConvolveInfo* info, float2* out,
220                                                uint32_t xstart, uint32_t xend, uint32_t currentY) {
221     const uchar* pin = (const uchar*)info->in;
222     const size_t stride = info->stride;
223 
224     uint32_t y0 = std::max((int32_t)currentY - 2, 0);
225     uint32_t y1 = std::max((int32_t)currentY - 1, 0);
226     uint32_t y2 = currentY;
227     uint32_t y3 = std::min((int32_t)currentY + 1, sizeY);
228     uint32_t y4 = std::min((int32_t)currentY + 2, sizeY);
229 
230     const float2* py0 = (const float2*)(pin + stride * y0);
231     const float2* py1 = (const float2*)(pin + stride * y1);
232     const float2* py2 = (const float2*)(pin + stride * y2);
233     const float2* py3 = (const float2*)(pin + stride * y3);
234     const float2* py4 = (const float2*)(pin + stride * y4);
235 
236     for (uint32_t x = xstart; x < xend; x++, out++) {
237         ConvolveOneF<float2>(x, out, py0, py1, py2, py3, py4, mFp, sizeX);
238     }
239 }
240 
RsdCpuScriptIntrinsicConvolve5x5_kernelF1(const ConvolveInfo * info,float * out,uint32_t xstart,uint32_t xend,uint32_t currentY)241 void RsdCpuScriptIntrinsicConvolve5x5_kernelF1(const ConvolveInfo* info, float* out,
242                                                uint32_t xstart, uint32_t xend, uint32_t currentY) {
243     const uchar* pin = (const uchar*)info->in;
244     const size_t stride = info->stride;
245 
246     uint32_t y0 = std::max((int32_t)currentY - 2, 0);
247     uint32_t y1 = std::max((int32_t)currentY - 1, 0);
248     uint32_t y2 = currentY;
249     uint32_t y3 = std::min((int32_t)currentY + 1, sizeY);
250     uint32_t y4 = std::min((int32_t)currentY + 2, sizeY);
251 
252     const float* py0 = (const float*)(pin + stride * y0);
253     const float* py1 = (const float*)(pin + stride * y1);
254     const float* py2 = (const float*)(pin + stride * y2);
255     const float* py3 = (const float*)(pin + stride * y3);
256     const float* py4 = (const float*)(pin + stride * y4);
257 
258     for (uint32_t x = xstart; x < xend; x++, out++) {
259         ConvolveOneF<float>(x, out, py0, py1, py2, py3, py4, mFp, sizeX);
260     }
261 }
262 #endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
263 
264 template <typename InputOutputType, typename ComputationType>
convolveU(const uchar * pin,uchar * pout,size_t vectorSize,size_t sizeX,size_t sizeY,size_t startX,size_t startY,size_t endX,size_t endY,float * mFp)265 static void convolveU(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY,
266                       size_t startX, size_t startY, size_t endX, size_t endY, float* mFp) {
267     const size_t stride = vectorSize * sizeX;
268     for (size_t y = startY; y < endY; y++) {
269         uint32_t y0 = std::max((int32_t)y - 2, 0);
270         uint32_t y1 = std::max((int32_t)y - 1, 0);
271         uint32_t y2 = y;
272         uint32_t y3 = std::min((int32_t)y + 1, (int32_t)(sizeY - 1));
273         uint32_t y4 = std::min((int32_t)y + 2, (int32_t)(sizeY - 1));
274 
275         size_t offset = (y * sizeX + startX) * vectorSize;
276         InputOutputType* px = (InputOutputType*)(pout + offset);
277         InputOutputType* py0 = (InputOutputType*)(pin + stride * y0);
278         InputOutputType* py1 = (InputOutputType*)(pin + stride * y1);
279         InputOutputType* py2 = (InputOutputType*)(pin + stride * y2);
280         InputOutputType* py3 = (InputOutputType*)(pin + stride * y3);
281         InputOutputType* py4 = (InputOutputType*)(pin + stride * y4);
282         for (uint32_t x = startX; x < endX; x++, px++) {
283             ConvolveOneU<InputOutputType, ComputationType>(x, px, py0, py1, py2, py3, py4, mFp,
284                                                            sizeX);
285         }
286     }
287 }
288 
convolveU4(const uchar * pin,uchar * pout,size_t vectorSize,size_t sizeX,size_t sizeY,size_t startX,size_t startY,size_t endX,size_t endY)289 void Convolve5x5Task::convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX,
290                                  size_t sizeY, size_t startX, size_t startY, size_t endX,
291                                  size_t endY) {
292     const size_t stride = paddedSize(vectorSize) * sizeX;
293     for (size_t y = startY; y < endY; y++) {
294         uint32_t y0 = std::max((int32_t)y - 2, 0);
295         uint32_t y1 = std::max((int32_t)y - 1, 0);
296         uint32_t y2 = y;
297         uint32_t y3 = std::min((int32_t)y + 1, (int32_t)(sizeY - 1));
298         uint32_t y4 = std::min((int32_t)y + 2, (int32_t)(sizeY - 1));
299 
300         size_t offset = (y * sizeX + startX) * paddedSize(vectorSize);
301         uchar* px = pout + offset;
302         const uchar* py0 = pin + stride * y0;
303         const uchar* py1 = pin + stride * y1;
304         const uchar* py2 = pin + stride * y2;
305         const uchar* py3 = pin + stride * y3;
306         const uchar* py4 = pin + stride * y4;
307         kernelU4(px, startX, endX, py0, py1, py2, py3, py4);
308     }
309 }
310 
processData(int,size_t startX,size_t startY,size_t endX,size_t endY)311 void Convolve5x5Task::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
312                                   size_t endY) {
313     // ALOGI("Thread %d start tile from (%zd, %zd) to (%zd, %zd)", threadIndex, startX, startY,
314     // endX, endY);
315     switch (mVectorSize) {
316         case 1:
317             convolveU<uchar, float>((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY,
318                                     startX, startY, endX, endY, mFp);
319             break;
320         case 2:
321             convolveU<uchar2, float2>((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY,
322                                       startX, startY, endX, endY, mFp);
323             break;
324         case 3:
325         case 4:
326             convolveU4((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY, startX, startY,
327                        endX, endY);
328             break;
329     }
330 }
331 
convolve5x5(const void * in,void * out,size_t vectorSize,size_t sizeX,size_t sizeY,const float * coefficients,const Restriction * restriction)332 void RenderScriptToolkit::convolve5x5(const void* in, void* out, size_t vectorSize, size_t sizeX,
333                                       size_t sizeY, const float* coefficients,
334                                       const Restriction* restriction) {
335 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
336     if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
337         return;
338     }
339     if (vectorSize < 1 || vectorSize > 4) {
340         ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize);
341         return;
342     }
343 #endif
344 
345     Convolve5x5Task task(in, out, vectorSize, sizeX, sizeY, coefficients, restriction);
346     processor->doTask(&task);
347 }
348 
349 }  // namespace renderscript
350 }  // namespace android
351