1 /*
2  * Copyright (C) 2021 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "TaskProcessor.h"
18 
19 #include <assert.h>
20 #include <sys/prctl.h>
21 
22 #include "RenderScriptToolkit.h"
23 #include "Utils.h"
24 
25 #define LOG_TAG "renderscript.toolkit.TaskProcessor"
26 
27 namespace android {
28 namespace renderscript {
29 
setTiling(unsigned int targetTileSizeInBytes)30 int Task::setTiling(unsigned int targetTileSizeInBytes) {
31     // Empirically, values smaller than 1000 are unlikely to give good performance.
32     targetTileSizeInBytes = std::max(1000u, targetTileSizeInBytes);
33     const size_t cellSizeInBytes =
34             mVectorSize;  // If we add float support, vectorSize * 4 for that.
35     const size_t targetCellsPerTile = targetTileSizeInBytes / cellSizeInBytes;
36     assert(targetCellsPerTile > 0);
37 
38     size_t cellsToProcessY;
39     size_t cellsToProcessX;
40     if (mRestriction == nullptr) {
41         cellsToProcessX = mSizeX;
42         cellsToProcessY = mSizeY;
43     } else {
44         assert(mRestriction->endX > mRestriction->startX);
45         assert(mRestriction->endY > mRestriction->startY);
46         cellsToProcessX = mRestriction->endX - mRestriction->startX;
47         cellsToProcessY = mRestriction->endY - mRestriction->startY;
48     }
49 
50     // We want rows as large as possible, as the SIMD code we have is more efficient with
51     // large rows.
52     mTilesPerRow = divideRoundingUp(cellsToProcessX, targetCellsPerTile);
53     // Once we know the number of tiles per row, we divide that row evenly. We round up to make
54     // sure all cells are included in the last tile of the row.
55     mCellsPerTileX = divideRoundingUp(cellsToProcessX, mTilesPerRow);
56 
57     // We do the same thing for the Y direction.
58     size_t targetRowsPerTile = divideRoundingUp(targetCellsPerTile, mCellsPerTileX);
59     mTilesPerColumn = divideRoundingUp(cellsToProcessY, targetRowsPerTile);
60     mCellsPerTileY = divideRoundingUp(cellsToProcessY, mTilesPerColumn);
61 
62     return mTilesPerRow * mTilesPerColumn;
63 }
64 
processTile(unsigned int threadIndex,size_t tileIndex)65 void Task::processTile(unsigned int threadIndex, size_t tileIndex) {
66     // Figure out the overall boundaries.
67     size_t startWorkX;
68     size_t startWorkY;
69     size_t endWorkX;
70     size_t endWorkY;
71     if (mRestriction == nullptr) {
72         startWorkX = 0;
73         startWorkY = 0;
74         endWorkX = mSizeX;
75         endWorkY = mSizeY;
76     } else {
77         startWorkX = mRestriction->startX;
78         startWorkY = mRestriction->startY;
79         endWorkX = mRestriction->endX;
80         endWorkY = mRestriction->endY;
81     }
82     // Figure out the rectangle for this tileIndex. All our tiles form a 2D grid. Identify
83     // first the X, Y coordinate of our tile in that grid.
84     size_t tileIndexY = tileIndex / mTilesPerRow;
85     size_t tileIndexX = tileIndex % mTilesPerRow;
86     // Calculate the starting and ending point of that tile.
87     size_t startCellX = startWorkX + tileIndexX * mCellsPerTileX;
88     size_t startCellY = startWorkY + tileIndexY * mCellsPerTileY;
89     size_t endCellX = std::min(startCellX + mCellsPerTileX, endWorkX);
90     size_t endCellY = std::min(startCellY + mCellsPerTileY, endWorkY);
91 
92     // Call the derived class to do the specific work.
93     if (mPrefersDataAsOneRow && startCellX == 0 && endCellX == mSizeX) {
94         // When the tile covers entire rows, we can take advantage that some ops are not 2D.
95         processData(threadIndex, 0, startCellY, mSizeX * (endCellY - startCellY), startCellY + 1);
96     } else {
97         processData(threadIndex, startCellX, startCellY, endCellX, endCellY);
98     }
99 }
100 
TaskProcessor(unsigned int numThreads)101 TaskProcessor::TaskProcessor(unsigned int numThreads)
102     : mUsesSimd{cpuSupportsSimd()},
103       /* If the requested number of threads is 0, we'll decide based on the number of cores.
104        * Through empirical testing, we've found that using more than 6 threads does not help.
105        * There may be more optimal choices to make depending on the SoC but we'll stick to
106        * this simple heuristic for now.
107        *
108        * We'll re-use the thread that calls the processor doTask method, so we'll spawn one less
109        * worker pool thread than the total number of threads.
110        */
111       mNumberOfPoolThreads{numThreads ? numThreads - 1
112                                       : std::min(6u, std::thread::hardware_concurrency() - 1)} {
113     for (size_t i = 0; i < mNumberOfPoolThreads; i++) {
114         mPoolThreads.emplace_back(
115                 std::bind(&TaskProcessor::processTilesOfWork, this, i + 1, false));
116     }
117 }
118 
~TaskProcessor()119 TaskProcessor::~TaskProcessor() {
120     {
121         std::lock_guard<std::mutex> lock(mQueueMutex);
122         mStopThreads = true;
123         mWorkAvailableOrStop.notify_all();
124     }
125 
126     for (auto& thread : mPoolThreads) {
127         thread.join();
128     }
129 }
130 
processTilesOfWork(int threadIndex,bool returnWhenNoWork)131 void TaskProcessor::processTilesOfWork(int threadIndex, bool returnWhenNoWork) {
132     if (threadIndex != 0) {
133         // Set the name of the thread, except for thread 0, which is not part of the pool.
134         // PR_SET_NAME takes a maximum of 16 characters, including the terminating null.
135         char name[16]{"RenderScToolkit"};
136         prctl(PR_SET_NAME, name, 0, 0, 0);
137         // ALOGI("Starting thread%d", threadIndex);
138     }
139 
140     std::unique_lock<std::mutex> lock(mQueueMutex);
141     while (true) {
142         mWorkAvailableOrStop.wait(lock, [this, returnWhenNoWork]() REQUIRES(mQueueMutex) {
143             return mStopThreads || (mTilesNotYetStarted > 0) ||
144                    (returnWhenNoWork && (mTilesNotYetStarted == 0));
145         });
146         // ALOGI("Woke thread%d", threadIndex);
147 
148         // This ScopedLockAssertion is to help the compiler when it checks thread annotations
149         // to realize that we have the lock. It's however not completely true; we don't
150         // hold the lock while processing the tile.
151         // TODO Figure out how to fix that.
152         android::base::ScopedLockAssertion lockAssert(mQueueMutex);
153         if (mStopThreads || (returnWhenNoWork && mTilesNotYetStarted == 0)) {
154             break;
155         }
156 
157         while (mTilesNotYetStarted > 0 && !mStopThreads) {
158             // This picks the tiles in decreasing order but that does not matter.
159             int myTile = --mTilesNotYetStarted;
160             mTilesInProcess++;
161             lock.unlock();
162             {
163                 // We won't be executing this code unless the main thread is
164                 // holding the mTaskMutex lock, which guards mCurrentTask.
165                 // The compiler can't figure this out.
166                 android::base::ScopedLockAssertion lockAssert(mTaskMutex);
167                 mCurrentTask->processTile(threadIndex, myTile);
168             }
169             lock.lock();
170             mTilesInProcess--;
171             if (mTilesInProcess == 0 && mTilesNotYetStarted == 0) {
172                 mWorkIsFinished.notify_one();
173             }
174         }
175     }
176     // if (threadIndex != 0) {
177     //     ALOGI("Ending thread%d", threadIndex);
178     // }
179 }
180 
doTask(Task * task)181 void TaskProcessor::doTask(Task* task) {
182     std::lock_guard<std::mutex> lockGuard(mTaskMutex);
183     task->setUsesSimd(mUsesSimd);
184     mCurrentTask = task;
185     // Notify the thread pool of available work.
186     startWork(task);
187     // Start processing some of the tiles on the calling thread.
188     processTilesOfWork(0, true);
189     // Wait for all the pool workers to complete.
190     waitForPoolWorkersToComplete();
191     mCurrentTask = nullptr;
192 }
193 
startWork(Task * task)194 void TaskProcessor::startWork(Task* task) {
195     /**
196      * The size in bytes that we're hoping each tile will be. If this value is too small,
197      * we'll spend too much time in synchronization. If it's too large, some cores may be
198      * idle while others still have a lot of work to do. Ideally, it would depend on the
199      * device we're running. 16k is the same value used by RenderScript and seems reasonable
200      * from ad-hoc tests.
201      */
202     const size_t targetTileSize = 16 * 1024;
203 
204     std::lock_guard<std::mutex> lock(mQueueMutex);
205     assert(mTilesInProcess == 0);
206     mTilesNotYetStarted = task->setTiling(targetTileSize);
207     mWorkAvailableOrStop.notify_all();
208 }
209 
waitForPoolWorkersToComplete()210 void TaskProcessor::waitForPoolWorkersToComplete() {
211     std::unique_lock<std::mutex> lock(mQueueMutex);
212     // The predicate, i.e. the lambda, will make sure that
213     // we terminate even if the main thread calls this after
214     // mWorkIsFinished is signaled.
215     mWorkIsFinished.wait(lock, [this]() REQUIRES(mQueueMutex) {
216         return mTilesNotYetStarted == 0 && mTilesInProcess == 0;
217     });
218 }
219 
220 }  // namespace renderscript
221 }  // namespace android
222