1 /*
2  * Copyright (C) 2022 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #pragma once
17 
#include <chrono>
#include <cstdint>
#include <functional>
#include <future>
#include <memory>
#include <optional>
#include <queue>
#include <stack>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <variant>
30 
31 #include "aemu/base/synchronization/ConditionVariable.h"
32 #include "aemu/base/synchronization/Lock.h"
33 #include "aemu/base/Metrics.h"
34 #include "aemu/base/threads/Thread.h"
35 #include "host-common/GfxstreamFatalError.h"
36 #include "host-common/logging.h"
37 
38 using android::base::EventHangMetadata;
39 using android::base::getCurrentThreadId;
40 
// Creates a ::emugl::HealthWatchdogBuilder for the monitor `healthMonitorPtr` points to, tagging
// it with the call site (__FILE__, __func__, __LINE__) and the caller supplied `msg`.
//
// The builder's monitor type is deduced from `*(healthMonitorPtr)`. That dereference appears only
// inside `decltype`, an unevaluated operand, so `healthMonitorPtr` is evaluated exactly once (as
// the constructor argument).
//
// Typical use chains builder setters and finishes with build(), e.g.
//   auto watchdog = WATCHDOG_BUILDER(monitor, "message").setTimeoutMs(100).build();
#define WATCHDOG_BUILDER(healthMonitorPtr, msg)                                    \
    ::emugl::HealthWatchdogBuilder<                                                \
        std::decay_t<decltype(*(healthMonitorPtr))>>((healthMonitorPtr), __FILE__, \
                                                     __func__, msg, __LINE__)
44 
45 namespace emugl {
46 
47 using android::base::ConditionVariable;
48 using android::base::Lock;
49 using android::base::MetricsLogger;
50 using std::chrono::duration;
51 using std::chrono::steady_clock;
52 using std::chrono::time_point;
53 using HangAnnotations = EventHangMetadata::HangAnnotations;
54 
// Default values shared by the declarations in this header. They are `inline constexpr` so that
// every translation unit including this header refers to the same immutable objects (a mutable
// `static` in a header would give each translation unit its own writable copy).

// Default for the `heartbeatInterval` parameters, in milliseconds.
inline constexpr uint64_t kDefaultIntervalMs = 1'000;
// Default for the `timeout` parameters, in milliseconds.
inline constexpr uint64_t kDefaultTimeoutMs = 5'000;
// A one nanosecond duration. NOTE(review): not referenced in this header; presumably consumed by
// the HealthMonitor implementation - confirm before changing.
inline constexpr std::chrono::nanoseconds kTimeEpsilon(1);
58 
59 // HealthMonitor provides the ability to register arbitrary start/touch/stop events associated
60 // with client defined tasks. At some pre-defined interval, it will periodically consume
61 // all logged events to assess whether the system is hanging on any task. Via the
62 // MetricsLogger, it will log hang and unhang events when it detects tasks hanging/resuming.
63 // Design doc: http://go/gfxstream-health-monitor
64 template <class Clock = steady_clock>
65 class HealthMonitor : public android::base::Thread {
66    public:
67     // Alias for task id.
68     using Id = uint64_t;
69 
70     // Constructor
71     // `heatbeatIntervalMs` is the interval, in milleseconds, that the thread will sleep for
72     // in between health checks.
73     HealthMonitor(MetricsLogger& metricsLogger, uint64_t heartbeatInterval = kDefaultIntervalMs);
74 
75     // Destructor
76     // Enqueues an event to end monitoring and waits on thread to process remaining queued events.
77     ~HealthMonitor();
78 
79     // Start monitoring a task. Returns an id that is used for touch and stop operations.
80     // `metadata` is a struct containing info on the task watchdog to be passed through to the
81     // metrics logger.
82     // `onHangAnnotationsCallback` is an optional containing a callable that will return key-value
83     // string pairs to be recorded at the time a hang is detected, which is useful for debugging.
84     // `timeout` is the duration in milliseconds a task is allowed to run before it's
85     // considered "hung". Because `timeout` must be larger than the monitor's heartbeat
86     // interval, as shorter timeout periods would not be detected, this method will set actual
87     // timeout to the lesser of `timeout` and twice the heartbeat interval.
88     // `parentId` can be the Id of another task. Events in this monitored task will update
89     // the parent task recursively.
90     Id startMonitoringTask(std::unique_ptr<EventHangMetadata> metadata,
91                            std::optional<std::function<std::unique_ptr<HangAnnotations>()>>
92                                onHangAnnotationsCallback = std::nullopt,
93                            uint64_t timeout = kDefaultTimeoutMs,
94                            std::optional<Id> parentId = std::nullopt);
95 
96     // Touch a monitored task. Resets the timeout countdown for that task.
97     void touchMonitoredTask(Id id);
98 
99     // Stop monitoring a task.
100     void stopMonitoringTask(Id id);
101 
102    private:
103     using Duration = typename Clock::duration;  // duration<double>;
104     using Timestamp = time_point<Clock, Duration>;
105 
106     // Allow test class access to private functions
107     friend class HealthMonitorTest;
108 
109     struct MonitoredEventType {
110         struct Start {
111             Id id;
112             std::unique_ptr<EventHangMetadata> metadata;
113             Timestamp timeOccurred;
114             std::optional<std::function<std::unique_ptr<HangAnnotations>()>>
115                 onHangAnnotationsCallback;
116             Duration timeoutThreshold;
117             std::optional<Id> parentId;
118         };
119         struct Touch {
120             Id id;
121             Timestamp timeOccurred;
122         };
123         struct Stop {
124             Id id;
125             Timestamp timeOccurred;
126         };
127         struct EndMonitoring {};
128         struct Poll {
129             std::promise<void> complete;
130         };
131     };
132 
133     using MonitoredEvent =
134         std::variant<std::monostate, typename MonitoredEventType::Start,
135                      typename MonitoredEventType::Touch, typename MonitoredEventType::Stop,
136                      typename MonitoredEventType::EndMonitoring, typename MonitoredEventType::Poll>;
137 
138     struct MonitoredTask {
139         Id id;
140         Timestamp timeoutTimestamp;
141         Duration timeoutThreshold;
142         std::optional<Timestamp> hungTimestamp;
143         std::unique_ptr<EventHangMetadata> metadata;
144         std::optional<std::function<std::unique_ptr<HangAnnotations>()>> onHangAnnotationsCallback;
145         std::optional<Id> parentId;
146     };
147 
148     // Thread's main loop
149     intptr_t main() override;
150 
151     // Update the parent task
152     void updateTaskParent(std::queue<std::unique_ptr<MonitoredEvent>>& events,
153                           const MonitoredTask& task, Timestamp eventTime);
154 
155     // Explicitly wake the monitor thread. Returns a future that can be used to wait until the
156     // poll event has been processed.
157     std::future<void> poll();
158 
159     // Immutable. Multi-thread access is safe.
160     const Duration mInterval;
161 
162     // Members accessed only on the worker thread. Not protected by mutex.
163     int mHungTasks = 0;
164     MetricsLogger& mLogger;
165     std::unordered_map<Id, MonitoredTask> mMonitoredTasks;
166 
167     // Lock and cv control access to queue and id counter
168     android::base::ConditionVariable mCv;
169     Lock mLock;
170     Id mNextId = 0;
171     std::queue<std::unique_ptr<MonitoredEvent>> mEventQueue;
172 };
173 
174 // This class provides an RAII mechanism for monitoring a task.
175 // HealthMonitorT should have the exact same interface as HealthMonitor. Note that HealthWatchdog
176 // can be used in performance critical path, so we use a template to dispatch a call here to
177 // overcome the performance cost of virtual function dispatch.
178 template <class HealthMonitorT = HealthMonitor<>>
179 class HealthWatchdog {
180    public:
181     HealthWatchdog(HealthMonitorT* healthMonitor, std::unique_ptr<EventHangMetadata> metadata,
182                    std::optional<std::function<std::unique_ptr<HangAnnotations>()>>
183                        onHangAnnotationsCallback = std::nullopt,
184                    uint64_t timeout = kDefaultTimeoutMs)
mHealthMonitor(healthMonitor)185         : mHealthMonitor(healthMonitor), mThreadId(getCurrentThreadId()) {
186         if (!mHealthMonitor) {
187             mId = std::nullopt;
188             return;
189         }
190         auto& threadTasks = getMonitoredThreadTasks();
191         auto& stack = threadTasks[mHealthMonitor];
192         typename HealthMonitorT::Id id = mHealthMonitor->startMonitoringTask(
193             std::move(metadata), std::move(onHangAnnotationsCallback), timeout,
194             stack.empty() ? std::nullopt : std::make_optional(stack.top()));
195         mId = id;
196         stack.push(id);
197     }
198 
~HealthWatchdog()199     ~HealthWatchdog() {
200         if (!mId.has_value()) {
201             return;
202         }
203         mHealthMonitor->stopMonitoringTask(*mId);
204         checkedStackPop();
205     }
206 
touch()207     void touch() {
208         if (!mId.has_value()) {
209             return;
210         }
211         mHealthMonitor->touchMonitoredTask(*mId);
212     }
213 
214     // Return the underlying Id, and don't issue a stop on destruction.
release()215     std::optional<typename HealthMonitorT::Id> release() {
216         if (mId.has_value()) {
217             checkedStackPop();
218         }
219         return std::exchange(mId, std::nullopt);
220     }
221 
222    private:
223     using ThreadTasks =
224         std::unordered_map<HealthMonitorT*, std::stack<typename HealthMonitorT::Id>>;
225     std::optional<typename HealthMonitorT::Id> mId;
226     HealthMonitorT* mHealthMonitor;
227     const unsigned long mThreadId;
228 
229     // Thread local stack of task Ids enables better reentrant behavior.
230     // Multiple health monitors are not expected or advised, but as an injected dependency,
231     // it is possible.
getMonitoredThreadTasks()232     ThreadTasks& getMonitoredThreadTasks() {
233         static thread_local ThreadTasks threadTasks;
234         return threadTasks;
235     }
236 
237     // Pop the stack for the current thread, but with validation. Must be called with a non-empty
238     // WatchDog.
checkedStackPop()239     void checkedStackPop() {
240         typename HealthMonitorT::Id id = *mId;
241         auto& threadTasks = getMonitoredThreadTasks();
242         auto& stack = threadTasks[mHealthMonitor];
243         if (getCurrentThreadId() != mThreadId) {
244             GFXSTREAM_ABORT(FatalError(ABORT_REASON_OTHER))
245                 << "HealthWatchdog destructor thread does not match origin. Destructor must be "
246                    "called on the same thread.";
247         }
248         if (stack.empty()) {
249             GFXSTREAM_ABORT(FatalError(ABORT_REASON_OTHER))
250                 << "HealthWatchdog thread local stack is empty!";
251         }
252         if (stack.top() != id) {
253             GFXSTREAM_ABORT(FatalError(ABORT_REASON_OTHER))
254                 << "HealthWatchdog id " << id << " does not match top of stack: " << stack.top();
255         }
256         stack.pop();
257     }
258 };
259 
260 // HealthMonitorT should have the exact same interface as HealthMonitor. This template parameter is
261 // used for injecting a different type for testing.
262 template <class HealthMonitorT>
263 class HealthWatchdogBuilder {
264    public:
HealthWatchdogBuilder(HealthMonitorT * healthMonitor,const char * fileName,const char * functionName,const char * message,uint32_t line)265     HealthWatchdogBuilder(HealthMonitorT* healthMonitor, const char* fileName,
266                           const char* functionName, const char* message, uint32_t line)
267         : mHealthMonitor(healthMonitor),
268           mMetadata(std::make_unique<EventHangMetadata>(
269               fileName, functionName, message, line, EventHangMetadata::HangType::kOther, nullptr)),
270           mTimeoutMs(kDefaultTimeoutMs),
271           mOnHangCallback(std::nullopt) {}
272 
273     DISALLOW_COPY_ASSIGN_AND_MOVE(HealthWatchdogBuilder);
274 
setHangType(EventHangMetadata::HangType hangType)275     HealthWatchdogBuilder& setHangType(EventHangMetadata::HangType hangType) {
276         if (mHealthMonitor) mMetadata->hangType = hangType;
277         return *this;
278     }
setTimeoutMs(uint32_t timeoutMs)279     HealthWatchdogBuilder& setTimeoutMs(uint32_t timeoutMs) {
280         if (mHealthMonitor) mTimeoutMs = timeoutMs;
281         return *this;
282     }
283     // F should be a callable that returns a std::unique_ptr<EventHangMetadata::HangAnnotations>. We
284     // use template instead of std::function here to avoid extra copy.
285     template <class F>
setOnHangCallback(F && callback)286     HealthWatchdogBuilder& setOnHangCallback(F&& callback) {
287         if (mHealthMonitor) {
288             mOnHangCallback =
289                 std::function<std::unique_ptr<HangAnnotations>()>(std::forward<F>(callback));
290         }
291         return *this;
292     }
293 
setAnnotations(std::unique_ptr<HangAnnotations> annotations)294     HealthWatchdogBuilder& setAnnotations(std::unique_ptr<HangAnnotations> annotations) {
295         if (mHealthMonitor) mMetadata->data = std::move(annotations);
296         return *this;
297     }
298 
build()299     std::unique_ptr<HealthWatchdog<HealthMonitorT>> build() {
300         // We are allocating on the heap, so there is a performance hit. However we also allocate
301         // EventHangMetadata on the heap, so this should be Ok. If we see performance issues with
302         // these allocations, for HealthWatchdog, we can always use placement new + noop deleter to
303         // avoid heap allocation for HealthWatchdog.
304         return std::make_unique<HealthWatchdog<HealthMonitorT>>(
305             mHealthMonitor, std::move(mMetadata), std::move(mOnHangCallback), mTimeoutMs);
306     }
307 
308    private:
309     HealthMonitorT* mHealthMonitor;
310     std::unique_ptr<EventHangMetadata> mMetadata;
311     uint32_t mTimeoutMs;
312     std::optional<std::function<std::unique_ptr<HangAnnotations>()>> mOnHangCallback;
313 };
314 
315 std::unique_ptr<HealthMonitor<>> CreateHealthMonitor(
316     MetricsLogger& metricsLogger, uint64_t heartbeatInterval = kDefaultIntervalMs);
317 
318 }  // namespace emugl
319