1 /*
2  * Copyright (C) 2022 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #pragma once
17 
18 #include <chrono>
19 #include <functional>
20 #include <future>
21 #include <optional>
22 #include <queue>
23 #include <stack>
24 #include <string>
25 #include <type_traits>
26 #include <unordered_map>
27 #include <unordered_set>
28 #include <variant>
29 #include <utility>
30 
31 #include "aemu/base/AndroidHealthMonitorConsumer.h"
32 #include "aemu/base/synchronization/AndroidConditionVariable.h"
33 #include "aemu/base/synchronization/AndroidLock.h"
34 #include "aemu/base/threads/AndroidThread.h"
35 
36 #include <log/log.h>
37 
38 using gfxstream::guest::EventHangMetadata;
39 
// Expands to a temporary gfxstream::guest::HealthWatchdogBuilder for the monitor pointed to by
// `healthMonitorPtr`. The builder's template argument is deduced from the pointee type
// (std::decay_t<decltype(*ptr)>), and the call site (__FILE__, __func__, __LINE__) plus `msg`
// are passed to the builder's constructor. Typically followed by setter calls and `.build()`.
#define WATCHDOG_BUILDER(healthMonitorPtr, msg)                                              \
    gfxstream::guest::HealthWatchdogBuilder<std::decay_t<decltype(*(healthMonitorPtr))>>( \
        (healthMonitorPtr), __FILE__, __func__, msg, __LINE__)
43 
44 namespace gfxstream {
45 namespace guest {
46 
47 using gfxstream::guest::ConditionVariable;
48 using gfxstream::guest::Lock;
49 using std::chrono::duration;
50 using std::chrono::steady_clock;
51 using std::chrono::time_point;
52 using HangAnnotations = EventHangMetadata::HangAnnotations;
53 
// Default heartbeat interval, in milliseconds; used as the default `heartbeatInterval` argument.
// Declared `inline constexpr` so the header provides one immutable definition program-wide
// instead of a separate mutable copy in every translation unit.
inline constexpr uint64_t kDefaultIntervalMs = 1'000;
// Default task timeout, in milliseconds; used as the default `timeout` argument.
inline constexpr uint64_t kDefaultTimeoutMs = 5'000;
// A duration of exactly one nanosecond.
inline constexpr std::chrono::nanoseconds kTimeEpsilon(1);
57 
// HealthMonitor provides the ability to register arbitrary start/touch/stop events associated
// with client defined tasks. At some pre-defined interval, it will periodically consume
// all logged events to assess whether the system is hanging on any task. Via the
// HealthMonitorConsumer, it will log hang and unhang events when it detects tasks hanging/resuming.
// Design doc: http://go/gfxstream-health-monitor
//
// `Clock` supplies the duration and time_point types used for all timestamps (see the
// `Duration` and `Timestamp` aliases below); it defaults to std::chrono::steady_clock.
// Only declarations appear in this header; the member definitions live elsewhere.
template <class Clock = steady_clock>
class HealthMonitor : public gfxstream::guest::Thread {
   public:
    // Alias for task id.
    using Id = uint64_t;

    // Constructor
    // `consumer` is the sink for hang/unhang events. The class keeps a
    // `HealthMonitorConsumer&` member (mConsumer), so `consumer` presumably must outlive the
    // monitor -- NOTE(review): confirm against the constructor definition.
    // `heartbeatInterval` is the interval, in milliseconds, that the thread will sleep for
    // in between health checks.
    HealthMonitor(HealthMonitorConsumer& consumer, uint64_t heartbeatInterval = kDefaultIntervalMs);

    // Destructor
    // Enqueues an event to end monitoring and waits on thread to process remaining queued events.
    ~HealthMonitor();

    // Start monitoring a task. Returns an id that is used for touch and stop operations.
    // `metadata` is a struct containing info on the task watchdog to be passed through to the
    // metrics logger.
    // `onHangAnnotationsCallback` is an optional containing a callable that will return key-value
    // string pairs to be recorded at the time a hang is detected, which is useful for debugging.
    // `timeout` is the duration in milliseconds a task is allowed to run before it's
    // considered "hung". Because `timeout` must be larger than the monitor's heartbeat
    // interval, as shorter timeout periods would not be detected, this method will set actual
    // timeout to the lesser of `timeout` and twice the heartbeat interval.
    // NOTE(review): "lesser of" contradicts the requirement stated just above that `timeout`
    // be larger than the heartbeat interval; a too-small `timeout` is presumably raised rather
    // than lowered. Confirm against the definition, which is not in this header.
    // `parentId` can be the Id of another task. Events in this monitored task will update
    // the parent task recursively.
    Id startMonitoringTask(std::unique_ptr<EventHangMetadata> metadata,
                           std::optional<std::function<std::unique_ptr<HangAnnotations>()>>
                               onHangAnnotationsCallback = std::nullopt,
                           uint64_t timeout = kDefaultTimeoutMs,
                           std::optional<Id> parentId = std::nullopt);

    // Touch a monitored task. Resets the timeout countdown for that task.
    // `id` is a value previously returned by startMonitoringTask().
    void touchMonitoredTask(Id id);

    // Stop monitoring a task.
    // `id` is a value previously returned by startMonitoringTask().
    void stopMonitoringTask(Id id);

   private:
    // Time types derived from the `Clock` template parameter.
    using Duration = typename Clock::duration;  // duration<double>;
    using Timestamp = time_point<Clock, Duration>;

    // Allow test class access to private functions
    friend class HealthMonitorTest;

    // Groups the payload structs for each kind of event that can be placed on mEventQueue.
    struct MonitoredEventType {
        // A task started; the fields mirror the arguments of startMonitoringTask() plus the
        // id assigned to the task and the time the event occurred.
        struct Start {
            Id id;
            std::unique_ptr<EventHangMetadata> metadata;
            Timestamp timeOccurred;
            std::optional<std::function<std::unique_ptr<HangAnnotations>()>>
                onHangAnnotationsCallback;
            Duration timeoutThreshold;
            std::optional<Id> parentId;
        };
        // A task was touched at `timeOccurred`.
        struct Touch {
            Id id;
            Timestamp timeOccurred;
        };
        // A task was stopped at `timeOccurred`.
        struct Stop {
            Id id;
            Timestamp timeOccurred;
        };
        // Request to end monitoring (see the destructor comment above).
        struct EndMonitoring {};
        // Explicit wake-up request; `complete` is the promise side of the future returned by
        // poll().
        struct Poll {
            std::promise<void> complete;
        };
    };

    // Variant over all event kinds. std::monostate is the first alternative, so a
    // default-constructed MonitoredEvent holds no event.
    using MonitoredEvent =
        std::variant<std::monostate, typename MonitoredEventType::Start,
                     typename MonitoredEventType::Touch, typename MonitoredEventType::Stop,
                     typename MonitoredEventType::EndMonitoring, typename MonitoredEventType::Poll>;

    // Per-task record stored in mMonitoredTasks, keyed by task id.
    struct MonitoredTask {
        Id id;
        Timestamp timeoutTimestamp;
        Duration timeoutThreshold;
        // NOTE(review): presumably engaged only while the task is considered hung -- confirm
        // against main().
        std::optional<Timestamp> hungTimestamp;
        std::unique_ptr<EventHangMetadata> metadata;
        std::optional<std::function<std::unique_ptr<HangAnnotations>()>> onHangAnnotationsCallback;
        std::optional<Id> parentId;
    };

    // Thread's main loop
    intptr_t main() override;

    // Update the parent task
    void updateTaskParent(std::queue<std::unique_ptr<MonitoredEvent>>& events,
                          const MonitoredTask& task, Timestamp eventTime);

    // Explicitly wake the monitor thread. Returns a future that can be used to wait until the
    // poll event has been processed.
    std::future<void> poll();

    // Immutable. Multi-thread access is safe.
    const Duration mInterval;

    // Members accessed only on the worker thread. Not protected by mutex.
    int mHungTasks = 0;
    HealthMonitorConsumer& mConsumer;
    std::unordered_map<Id, MonitoredTask> mMonitoredTasks;

    // Lock and cv control access to queue and id counter
    ConditionVariable mCv;
    Lock mLock;
    Id mNextId = 0;
    std::queue<std::unique_ptr<MonitoredEvent>> mEventQueue;
};
172 
173 // This class provides an RAII mechanism for monitoring a task.
174 // HealthMonitorT should have the exact same interface as HealthMonitor. Note that HealthWatchdog
175 // can be used in performance critical path, so we use a template to dispatch a call here to
176 // overcome the performance cost of virtual function dispatch.
177 template <class HealthMonitorT = HealthMonitor<>>
178 class HealthWatchdog {
179    public:
180     HealthWatchdog(HealthMonitorT* healthMonitor, std::unique_ptr<EventHangMetadata> metadata,
181                    std::optional<std::function<std::unique_ptr<HangAnnotations>()>>
182                        onHangAnnotationsCallback = std::nullopt,
183                    uint64_t timeout = kDefaultTimeoutMs)
mHealthMonitor(healthMonitor)184         : mHealthMonitor(healthMonitor), mThreadId(getCurrentThreadId()) {
185         if (!mHealthMonitor) {
186             mId = std::nullopt;
187             return;
188         }
189         // TODO: willho@ re-enable thread awareness b/253483619
190         typename HealthMonitorT::Id id = mHealthMonitor->startMonitoringTask(
191             std::move(metadata), std::move(onHangAnnotationsCallback), timeout, std::nullopt);
192         mId = id;
193     }
194 
~HealthWatchdog()195     ~HealthWatchdog() {
196         if (!mId.has_value()) {
197             return;
198         }
199         mHealthMonitor->stopMonitoringTask(*mId);
200     }
201 
touch()202     void touch() {
203         if (!mId.has_value()) {
204             return;
205         }
206         mHealthMonitor->touchMonitoredTask(*mId);
207     }
208 
209     // Return the underlying Id, and don't issue a stop on destruction.
release()210     std::optional<typename HealthMonitorT::Id> release() {
211         return std::exchange(mId, std::nullopt);
212     }
213 
214    private:
215     using ThreadTasks =
216         std::unordered_map<HealthMonitorT*, std::stack<typename HealthMonitorT::Id>>;
217     std::optional<typename HealthMonitorT::Id> mId;
218     HealthMonitorT* mHealthMonitor;
219     const unsigned long mThreadId;
220 };
221 
222 // HealthMonitorT should have the exact same interface as HealthMonitor. This template parameter is
223 // used for injecting a different type for testing.
224 template <class HealthMonitorT>
225 class HealthWatchdogBuilder {
226    public:
HealthWatchdogBuilder(HealthMonitorT * healthMonitor,const char * fileName,const char * functionName,const char * message,uint32_t line)227     HealthWatchdogBuilder(HealthMonitorT* healthMonitor, const char* fileName,
228                           const char* functionName, const char* message, uint32_t line)
229         : mHealthMonitor(healthMonitor),
230           mMetadata(std::make_unique<EventHangMetadata>(
231               fileName, functionName, message, line, EventHangMetadata::HangType::kOther, nullptr)),
232           mTimeoutMs(kDefaultTimeoutMs),
233           mOnHangCallback(std::nullopt) {}
234 
235     DISALLOW_COPY_ASSIGN_AND_MOVE(HealthWatchdogBuilder);
236 
setHangType(EventHangMetadata::HangType hangType)237     HealthWatchdogBuilder& setHangType(EventHangMetadata::HangType hangType) {
238         if (mHealthMonitor) mMetadata->hangType = hangType;
239         return *this;
240     }
setTimeoutMs(uint32_t timeoutMs)241     HealthWatchdogBuilder& setTimeoutMs(uint32_t timeoutMs) {
242         if (mHealthMonitor) mTimeoutMs = timeoutMs;
243         return *this;
244     }
245     // F should be a callable that returns a std::unique_ptr<EventHangMetadata::HangAnnotations>. We
246     // use template instead of std::function here to avoid extra copy.
247     template <class F>
setOnHangCallback(F && callback)248     HealthWatchdogBuilder& setOnHangCallback(F&& callback) {
249         if (mHealthMonitor) {
250             mOnHangCallback =
251                 std::function<std::unique_ptr<HangAnnotations>()>(std::forward<F>(callback));
252         }
253         return *this;
254     }
255 
setAnnotations(std::unique_ptr<HangAnnotations> annotations)256     HealthWatchdogBuilder& setAnnotations(std::unique_ptr<HangAnnotations> annotations) {
257         if (mHealthMonitor) mMetadata->data = std::move(annotations);
258         return *this;
259     }
260 
build()261     std::unique_ptr<HealthWatchdog<HealthMonitorT>> build() {
262         // We are allocating on the heap, so there is a performance hit. However we also allocate
263         // EventHangMetadata on the heap, so this should be Ok. If we see performance issues with
264         // these allocations, for HealthWatchdog, we can always use placement new + noop deleter to
265         // avoid heap allocation for HealthWatchdog.
266         return std::make_unique<HealthWatchdog<HealthMonitorT>>(
267             mHealthMonitor, std::move(mMetadata), std::move(mOnHangCallback), mTimeoutMs);
268     }
269 
270    private:
271     HealthMonitorT* mHealthMonitor;
272     std::unique_ptr<EventHangMetadata> mMetadata;
273     uint32_t mTimeoutMs;
274     std::optional<std::function<std::unique_ptr<HangAnnotations>()>> mOnHangCallback;
275 };
276 
// Factory returning a heap-allocated HealthMonitor that uses the default clock
// (std::chrono::steady_clock). Only the declaration appears in this header.
// `consumer` and `heartbeatInterval` have the same types and default as the HealthMonitor
// constructor parameters. NOTE(review): presumably forwarded to that constructor -- confirm
// against the definition.
std::unique_ptr<HealthMonitor<>> CreateHealthMonitor(
    HealthMonitorConsumer& consumer, uint64_t heartbeatInterval = kDefaultIntervalMs);
279 
280 } // namespace guest
281 } // namespace gfxstream
282