1 /* 2 * Copyright (C) 2022 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #pragma once 17 18 #include <chrono> 19 #include <functional> 20 #include <future> 21 #include <optional> 22 #include <queue> 23 #include <stack> 24 #include <string> 25 #include <type_traits> 26 #include <unordered_map> 27 #include <unordered_set> 28 #include <utility> 29 #include <variant> 30 31 #include "aemu/base/synchronization/ConditionVariable.h" 32 #include "aemu/base/synchronization/Lock.h" 33 #include "aemu/base/Metrics.h" 34 #include "aemu/base/threads/Thread.h" 35 #include "host-common/GfxstreamFatalError.h" 36 #include "host-common/logging.h" 37 38 using android::base::EventHangMetadata; 39 using android::base::getCurrentThreadId; 40 41 #define WATCHDOG_BUILDER(healthMonitorPtr, msg) \ 42 ::emugl::HealthWatchdogBuilder<std::decay_t<decltype(*(healthMonitorPtr))>>( \ 43 (healthMonitorPtr), __FILE__, __func__, msg, __LINE__) 44 45 namespace emugl { 46 47 using android::base::ConditionVariable; 48 using android::base::Lock; 49 using android::base::MetricsLogger; 50 using std::chrono::duration; 51 using std::chrono::steady_clock; 52 using std::chrono::time_point; 53 using HangAnnotations = EventHangMetadata::HangAnnotations; 54 55 static uint64_t kDefaultIntervalMs = 1'000; 56 static uint64_t kDefaultTimeoutMs = 5'000; 57 static std::chrono::nanoseconds kTimeEpsilon(1); 58 59 // HealthMonitor provides the ability to register arbitrary start/touch/stop events associated 60 // with client defined tasks. At some pre-defined interval, it will periodically consume 61 // all logged events to assess whether the system is hanging on any task. Via the 62 // MetricsLogger, it will log hang and unhang events when it detects tasks hanging/resuming. 63 // Design doc: http://go/gfxstream-health-monitor 64 template <class Clock = steady_clock> 65 class HealthMonitor : public android::base::Thread { 66 public: 67 // Alias for task id. 68 using Id = uint64_t; 69 70 // Constructor 71 // `heatbeatIntervalMs` is the interval, in milleseconds, that the thread will sleep for 72 // in between health checks. 73 HealthMonitor(MetricsLogger& metricsLogger, uint64_t heartbeatInterval = kDefaultIntervalMs); 74 75 // Destructor 76 // Enqueues an event to end monitoring and waits on thread to process remaining queued events. 77 ~HealthMonitor(); 78 79 // Start monitoring a task. Returns an id that is used for touch and stop operations. 80 // `metadata` is a struct containing info on the task watchdog to be passed through to the 81 // metrics logger. 82 // `onHangAnnotationsCallback` is an optional containing a callable that will return key-value 83 // string pairs to be recorded at the time a hang is detected, which is useful for debugging. 84 // `timeout` is the duration in milliseconds a task is allowed to run before it's 85 // considered "hung". Because `timeout` must be larger than the monitor's heartbeat 86 // interval, as shorter timeout periods would not be detected, this method will set actual 87 // timeout to the lesser of `timeout` and twice the heartbeat interval. 88 // `parentId` can be the Id of another task. Events in this monitored task will update 89 // the parent task recursively. 90 Id startMonitoringTask(std::unique_ptr<EventHangMetadata> metadata, 91 std::optional<std::function<std::unique_ptr<HangAnnotations>()>> 92 onHangAnnotationsCallback = std::nullopt, 93 uint64_t timeout = kDefaultTimeoutMs, 94 std::optional<Id> parentId = std::nullopt); 95 96 // Touch a monitored task. Resets the timeout countdown for that task. 97 void touchMonitoredTask(Id id); 98 99 // Stop monitoring a task. 100 void stopMonitoringTask(Id id); 101 102 private: 103 using Duration = typename Clock::duration; // duration<double>; 104 using Timestamp = time_point<Clock, Duration>; 105 106 // Allow test class access to private functions 107 friend class HealthMonitorTest; 108 109 struct MonitoredEventType { 110 struct Start { 111 Id id; 112 std::unique_ptr<EventHangMetadata> metadata; 113 Timestamp timeOccurred; 114 std::optional<std::function<std::unique_ptr<HangAnnotations>()>> 115 onHangAnnotationsCallback; 116 Duration timeoutThreshold; 117 std::optional<Id> parentId; 118 }; 119 struct Touch { 120 Id id; 121 Timestamp timeOccurred; 122 }; 123 struct Stop { 124 Id id; 125 Timestamp timeOccurred; 126 }; 127 struct EndMonitoring {}; 128 struct Poll { 129 std::promise<void> complete; 130 }; 131 }; 132 133 using MonitoredEvent = 134 std::variant<std::monostate, typename MonitoredEventType::Start, 135 typename MonitoredEventType::Touch, typename MonitoredEventType::Stop, 136 typename MonitoredEventType::EndMonitoring, typename MonitoredEventType::Poll>; 137 138 struct MonitoredTask { 139 Id id; 140 Timestamp timeoutTimestamp; 141 Duration timeoutThreshold; 142 std::optional<Timestamp> hungTimestamp; 143 std::unique_ptr<EventHangMetadata> metadata; 144 std::optional<std::function<std::unique_ptr<HangAnnotations>()>> onHangAnnotationsCallback; 145 std::optional<Id> parentId; 146 }; 147 148 // Thread's main loop 149 intptr_t main() override; 150 151 // Update the parent task 152 void updateTaskParent(std::queue<std::unique_ptr<MonitoredEvent>>& events, 153 const MonitoredTask& task, Timestamp eventTime); 154 155 // Explicitly wake the monitor thread. Returns a future that can be used to wait until the 156 // poll event has been processed. 157 std::future<void> poll(); 158 159 // Immutable. Multi-thread access is safe. 160 const Duration mInterval; 161 162 // Members accessed only on the worker thread. Not protected by mutex. 163 int mHungTasks = 0; 164 MetricsLogger& mLogger; 165 std::unordered_map<Id, MonitoredTask> mMonitoredTasks; 166 167 // Lock and cv control access to queue and id counter 168 android::base::ConditionVariable mCv; 169 Lock mLock; 170 Id mNextId = 0; 171 std::queue<std::unique_ptr<MonitoredEvent>> mEventQueue; 172 }; 173 174 // This class provides an RAII mechanism for monitoring a task. 175 // HealthMonitorT should have the exact same interface as HealthMonitor. Note that HealthWatchdog 176 // can be used in performance critical path, so we use a template to dispatch a call here to 177 // overcome the performance cost of virtual function dispatch. 178 template <class HealthMonitorT = HealthMonitor<>> 179 class HealthWatchdog { 180 public: 181 HealthWatchdog(HealthMonitorT* healthMonitor, std::unique_ptr<EventHangMetadata> metadata, 182 std::optional<std::function<std::unique_ptr<HangAnnotations>()>> 183 onHangAnnotationsCallback = std::nullopt, 184 uint64_t timeout = kDefaultTimeoutMs) mHealthMonitor(healthMonitor)185 : mHealthMonitor(healthMonitor), mThreadId(getCurrentThreadId()) { 186 if (!mHealthMonitor) { 187 mId = std::nullopt; 188 return; 189 } 190 auto& threadTasks = getMonitoredThreadTasks(); 191 auto& stack = threadTasks[mHealthMonitor]; 192 typename HealthMonitorT::Id id = mHealthMonitor->startMonitoringTask( 193 std::move(metadata), std::move(onHangAnnotationsCallback), timeout, 194 stack.empty() ? std::nullopt : std::make_optional(stack.top())); 195 mId = id; 196 stack.push(id); 197 } 198 ~HealthWatchdog()199 ~HealthWatchdog() { 200 if (!mId.has_value()) { 201 return; 202 } 203 mHealthMonitor->stopMonitoringTask(*mId); 204 checkedStackPop(); 205 } 206 touch()207 void touch() { 208 if (!mId.has_value()) { 209 return; 210 } 211 mHealthMonitor->touchMonitoredTask(*mId); 212 } 213 214 // Return the underlying Id, and don't issue a stop on destruction. release()215 std::optional<typename HealthMonitorT::Id> release() { 216 if (mId.has_value()) { 217 checkedStackPop(); 218 } 219 return std::exchange(mId, std::nullopt); 220 } 221 222 private: 223 using ThreadTasks = 224 std::unordered_map<HealthMonitorT*, std::stack<typename HealthMonitorT::Id>>; 225 std::optional<typename HealthMonitorT::Id> mId; 226 HealthMonitorT* mHealthMonitor; 227 const unsigned long mThreadId; 228 229 // Thread local stack of task Ids enables better reentrant behavior. 230 // Multiple health monitors are not expected or advised, but as an injected dependency, 231 // it is possible. getMonitoredThreadTasks()232 ThreadTasks& getMonitoredThreadTasks() { 233 static thread_local ThreadTasks threadTasks; 234 return threadTasks; 235 } 236 237 // Pop the stack for the current thread, but with validation. Must be called with a non-empty 238 // WatchDog. checkedStackPop()239 void checkedStackPop() { 240 typename HealthMonitorT::Id id = *mId; 241 auto& threadTasks = getMonitoredThreadTasks(); 242 auto& stack = threadTasks[mHealthMonitor]; 243 if (getCurrentThreadId() != mThreadId) { 244 GFXSTREAM_ABORT(FatalError(ABORT_REASON_OTHER)) 245 << "HealthWatchdog destructor thread does not match origin. Destructor must be " 246 "called on the same thread."; 247 } 248 if (stack.empty()) { 249 GFXSTREAM_ABORT(FatalError(ABORT_REASON_OTHER)) 250 << "HealthWatchdog thread local stack is empty!"; 251 } 252 if (stack.top() != id) { 253 GFXSTREAM_ABORT(FatalError(ABORT_REASON_OTHER)) 254 << "HealthWatchdog id " << id << " does not match top of stack: " << stack.top(); 255 } 256 stack.pop(); 257 } 258 }; 259 260 // HealthMonitorT should have the exact same interface as HealthMonitor. This template parameter is 261 // used for injecting a different type for testing. 262 template <class HealthMonitorT> 263 class HealthWatchdogBuilder { 264 public: HealthWatchdogBuilder(HealthMonitorT * healthMonitor,const char * fileName,const char * functionName,const char * message,uint32_t line)265 HealthWatchdogBuilder(HealthMonitorT* healthMonitor, const char* fileName, 266 const char* functionName, const char* message, uint32_t line) 267 : mHealthMonitor(healthMonitor), 268 mMetadata(std::make_unique<EventHangMetadata>( 269 fileName, functionName, message, line, EventHangMetadata::HangType::kOther, nullptr)), 270 mTimeoutMs(kDefaultTimeoutMs), 271 mOnHangCallback(std::nullopt) {} 272 273 DISALLOW_COPY_ASSIGN_AND_MOVE(HealthWatchdogBuilder); 274 setHangType(EventHangMetadata::HangType hangType)275 HealthWatchdogBuilder& setHangType(EventHangMetadata::HangType hangType) { 276 if (mHealthMonitor) mMetadata->hangType = hangType; 277 return *this; 278 } setTimeoutMs(uint32_t timeoutMs)279 HealthWatchdogBuilder& setTimeoutMs(uint32_t timeoutMs) { 280 if (mHealthMonitor) mTimeoutMs = timeoutMs; 281 return *this; 282 } 283 // F should be a callable that returns a std::unique_ptr<EventHangMetadata::HangAnnotations>. We 284 // use template instead of std::function here to avoid extra copy. 285 template <class F> setOnHangCallback(F && callback)286 HealthWatchdogBuilder& setOnHangCallback(F&& callback) { 287 if (mHealthMonitor) { 288 mOnHangCallback = 289 std::function<std::unique_ptr<HangAnnotations>()>(std::forward<F>(callback)); 290 } 291 return *this; 292 } 293 setAnnotations(std::unique_ptr<HangAnnotations> annotations)294 HealthWatchdogBuilder& setAnnotations(std::unique_ptr<HangAnnotations> annotations) { 295 if (mHealthMonitor) mMetadata->data = std::move(annotations); 296 return *this; 297 } 298 build()299 std::unique_ptr<HealthWatchdog<HealthMonitorT>> build() { 300 // We are allocating on the heap, so there is a performance hit. However we also allocate 301 // EventHangMetadata on the heap, so this should be Ok. If we see performance issues with 302 // these allocations, for HealthWatchdog, we can always use placement new + noop deleter to 303 // avoid heap allocation for HealthWatchdog. 304 return std::make_unique<HealthWatchdog<HealthMonitorT>>( 305 mHealthMonitor, std::move(mMetadata), std::move(mOnHangCallback), mTimeoutMs); 306 } 307 308 private: 309 HealthMonitorT* mHealthMonitor; 310 std::unique_ptr<EventHangMetadata> mMetadata; 311 uint32_t mTimeoutMs; 312 std::optional<std::function<std::unique_ptr<HangAnnotations>()>> mOnHangCallback; 313 }; 314 315 std::unique_ptr<HealthMonitor<>> CreateHealthMonitor( 316 MetricsLogger& metricsLogger, uint64_t heartbeatInterval = kDefaultIntervalMs); 317 318 } // namespace emugl 319