diff options
author | William Ho <willho@google.com> | 2022-09-12 13:51:56 -0400 |
---|---|---|
committer | Joshua Duong <joshuaduong@google.com> | 2022-09-15 08:03:11 -0700 |
commit | 3a20ff5c908e0c40a7ad814541834864b6c1c40a (patch) | |
tree | d706e651887f01b433252c86c98ad3bbd88f7397 /base | |
parent | 82cc35a4efb65c87958288eeda1939e6bbe91a65 (diff) | |
download | aemu-3a20ff5c908e0c40a7ad814541834864b6c1c40a.tar.gz |
Add parent id arg to HealthMonitor tasks
...to support desired reentrant behavior.
Bug: 244354559
Test: Unit tests. Built and ran emulator
Change-Id: Ie629a1115866124ca6a33425399e85fe1ebd0c3c
Diffstat (limited to 'base')
-rw-r--r-- | base/HealthMonitor.cpp | 62 | ||||
-rw-r--r-- | base/HealthMonitor_unittest.cpp | 108 | ||||
-rw-r--r-- | base/include/base/HealthMonitor.h | 61 | ||||
-rw-r--r-- | base/include/base/Metrics.h | 2 |
4 files changed, 209 insertions, 24 deletions
diff --git a/base/HealthMonitor.cpp b/base/HealthMonitor.cpp index 67c9285..4a9ab93 100644 --- a/base/HealthMonitor.cpp +++ b/base/HealthMonitor.cpp @@ -60,7 +60,7 @@ template <class Clock> typename HealthMonitor<Clock>::Id HealthMonitor<Clock>::startMonitoringTask( std::unique_ptr<EventHangMetadata> metadata, std::optional<std::function<std::unique_ptr<HangAnnotations>()>> onHangAnnotationsCallback, - uint64_t timeout) { + uint64_t timeout, std::optional<Id> parentId) { auto intervalMs = duration_cast<std::chrono::milliseconds>(mInterval).count(); if (timeout < intervalMs) { WARN("Timeout value %d is too low (heartbeat is every %d). Increasing to %d", timeout, @@ -75,7 +75,8 @@ typename HealthMonitor<Clock>::Id HealthMonitor<Clock>::startMonitoringTask( .metadata = std::move(metadata), .timeOccurred = Clock::now(), .onHangAnnotationsCallback = std::move(onHangAnnotationsCallback), - .timeoutThreshold = Duration(std::chrono::milliseconds(timeout))}); + .timeoutThreshold = Duration(std::chrono::milliseconds(timeout)), + .parentId = parentId}); mEventQueue.push(std::move(event)); return id; } @@ -140,24 +141,34 @@ intptr_t HealthMonitor<Clock>::main() { GFXSTREAM_ABORT(FatalError(ABORT_REASON_OTHER)) << "MonitoredEvent type not found"; }, - [this](typename MonitoredEventType::Start& event) { + [this, &events](typename MonitoredEventType::Start& event) { auto it = mMonitoredTasks.find(event.id); if (it != mMonitoredTasks.end()) { ERR("Registered multiple start events for task %d", event.id); return; } - mMonitoredTasks.emplace( - event.id, std::move(MonitoredTask{ - .id = event.id, - .timeoutTimestamp = - event.timeOccurred + event.timeoutThreshold, - .timeoutThreshold = event.timeoutThreshold, - .hungTimestamp = std::nullopt, - .metadata = std::move(event.metadata), - .onHangAnnotationsCallback = - std::move(event.onHangAnnotationsCallback)})); + if (event.parentId && mMonitoredTasks.find(event.parentId.value()) == + mMonitoredTasks.end()) { + WARN("Requested parent task %d does not exist.", + event.parentId.value()); + event.parentId = std::nullopt; + } + it = mMonitoredTasks + .emplace(event.id, + std::move(MonitoredTask{ + .id = event.id, + .timeoutTimestamp = event.timeOccurred + + event.timeoutThreshold, + .timeoutThreshold = event.timeoutThreshold, + .hungTimestamp = std::nullopt, + .metadata = std::move(event.metadata), + .onHangAnnotationsCallback = + std::move(event.onHangAnnotationsCallback), + .parentId = event.parentId})) + .first; + updateTaskParent(events, it->second, event.timeOccurred); }, - [this](typename MonitoredEventType::Touch& event) { + [this, &events](typename MonitoredEventType::Touch& event) { auto it = mMonitoredTasks.find(event.id); if (it == mMonitoredTasks.end()) { ERR("HealthMonitor has no task in progress for id %d", event.id); @@ -166,8 +177,10 @@ intptr_t HealthMonitor<Clock>::main() { auto& task = it->second; task.timeoutTimestamp = event.timeOccurred + task.timeoutThreshold; + updateTaskParent(events, task, event.timeOccurred); }, - [this, &tasksToRemove](typename MonitoredEventType::Stop& event) { + [this, &tasksToRemove, + &events](typename MonitoredEventType::Stop& event) { auto it = mMonitoredTasks.find(event.id); if (it == mMonitoredTasks.end()) { ERR("HealthMonitor has no task in progress for id %d", event.id); @@ -176,6 +189,7 @@ intptr_t HealthMonitor<Clock>::main() { auto& task = it->second; task.timeoutTimestamp = event.timeOccurred + task.timeoutThreshold; + updateTaskParent(events, task, event.timeOccurred); // Mark it for deletion, but retain it until the end of // the health check concurrent tasks hung @@ -206,7 +220,8 @@ intptr_t HealthMonitor<Clock>::main() { auto newAnnotations = (*task.onHangAnnotationsCallback)(); task.metadata->mergeAnnotations(std::move(newAnnotations)); } - mLogger.logMetricEvent(MetricEventHang{.metadata = task.metadata.get(), + mLogger.logMetricEvent(MetricEventHang{.taskId = task.id, + .metadata = task.metadata.get(), .otherHungTasks = newHungTasks}); task.hungTimestamp = task.timeoutTimestamp; newHungTasks++; @@ -218,8 +233,8 @@ intptr_t HealthMonitor<Clock>::main() { task.timeoutTimestamp - (task.hungTimestamp.value() + task.timeoutThreshold)) .count(); - mLogger.logMetricEvent( - MetricEventUnHang{.metadata = task.metadata.get(), .hung_ms = hangTime}); + mLogger.logMetricEvent(MetricEventUnHang{ + .taskId = task.id, .metadata = task.metadata.get(), .hung_ms = hangTime}); task.hungTimestamp = std::nullopt; newHungTasks--; } @@ -243,6 +258,17 @@ intptr_t HealthMonitor<Clock>::main() { return 0; } +template <class Clock> +void HealthMonitor<Clock>::updateTaskParent(std::queue<std::unique_ptr<MonitoredEvent>>& events, + const MonitoredTask& task, Timestamp eventTime) { + std::optional<Id> parentId = task.parentId; + if (parentId) { + auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::Touch{ + .id = parentId.value(), .timeOccurred = eventTime + Duration(kTimeEpsilon)}); + events.push(std::move(event)); + } +} + template class HealthMonitor<steady_clock>; template class HealthMonitor<TestClock>; diff --git a/base/HealthMonitor_unittest.cpp b/base/HealthMonitor_unittest.cpp index 4fa3757..7001916 100644 --- a/base/HealthMonitor_unittest.cpp +++ b/base/HealthMonitor_unittest.cpp @@ -348,4 +348,112 @@ TEST_F(HealthMonitorTest, taskHungWithAttachedCallback) { healthMonitor.stopMonitoringTask(id); } +TEST_F(HealthMonitorTest, healthyTaskWithParent) { + EXPECT_CALL(logger, logMetricEvent(_)).Times(0); + + auto id = healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>()); + step(defaultHangThresholdS - 1); + auto child = healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>(), + std::nullopt, kDefaultTimeoutMs, id); + step(defaultHangThresholdS - 1); + healthMonitor.stopMonitoringTask(child); + step(defaultHangThresholdS - 1); + healthMonitor.stopMonitoringTask(id); +} + +TEST_F(HealthMonitorTest, threeChainOfHungTasks) { + { + InSequence s; + EXPECT_CALL(logger, logMetricEvent( + VariantWith<MetricEventHang>(Field(&MetricEventHang::taskId, 2)))) + .Times(1); + EXPECT_CALL(logger, logMetricEvent( + VariantWith<MetricEventHang>(Field(&MetricEventHang::taskId, 1)))) + .Times(1); + EXPECT_CALL(logger, logMetricEvent( + VariantWith<MetricEventHang>(Field(&MetricEventHang::taskId, 0)))) + .Times(1); + EXPECT_CALL( + logger, + logMetricEvent(VariantWith<MetricEventUnHang>(Field(&MetricEventUnHang::taskId, 2)))) + .Times(1); + EXPECT_CALL( + logger, + logMetricEvent(VariantWith<MetricEventUnHang>(Field(&MetricEventUnHang::taskId, 1)))) + .Times(1); + EXPECT_CALL( + logger, + logMetricEvent(VariantWith<MetricEventUnHang>(Field(&MetricEventUnHang::taskId, 0)))) + .Times(1); + } + + auto id = healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>()); + step(defaultHangThresholdS - 1); + auto child = healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>(), + std::nullopt, kDefaultTimeoutMs, id); + step(defaultHangThresholdS - 1); + auto grandchild = healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>(), + std::nullopt, kDefaultTimeoutMs, child); + step(defaultHangThresholdS + 1); + healthMonitor.touchMonitoredTask(grandchild); + step(1); + healthMonitor.stopMonitoringTask(grandchild); + healthMonitor.stopMonitoringTask(child); + healthMonitor.stopMonitoringTask(id); +} + +TEST_F(HealthMonitorTest, parentEndsBeforeChild) { + { + InSequence s; + EXPECT_CALL(logger, logMetricEvent( + VariantWith<MetricEventHang>(Field(&MetricEventHang::taskId, 1)))) + .Times(1); + EXPECT_CALL( + logger, + logMetricEvent(VariantWith<MetricEventUnHang>(Field(&MetricEventUnHang::taskId, 1)))) + .Times(1); + } + + auto id = healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>()); + step(defaultHangThresholdS - 1); + auto child = healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>(), + std::nullopt, kDefaultTimeoutMs, id); + healthMonitor.stopMonitoringTask(id); + step(defaultHangThresholdS + 1); + healthMonitor.stopMonitoringTask(child); +} + +TEST_F(HealthMonitorTest, siblingsHangParentStillHealthy) { + { + InSequence s; + EXPECT_CALL(logger, logMetricEvent( + VariantWith<MetricEventHang>(Field(&MetricEventHang::taskId, 1)))) + .Times(1); + EXPECT_CALL( + logger, + logMetricEvent(VariantWith<MetricEventUnHang>(Field(&MetricEventUnHang::taskId, 1)))) + .Times(1); + EXPECT_CALL(logger, logMetricEvent( + VariantWith<MetricEventHang>(Field(&MetricEventHang::taskId, 2)))) + .Times(1); + EXPECT_CALL( + logger, + logMetricEvent(VariantWith<MetricEventUnHang>(Field(&MetricEventUnHang::taskId, 2)))) + .Times(1); + } + + auto parent = healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>()); + step(defaultHangThresholdS - 1); + /* 1 */ auto child = healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>(), + std::nullopt, kDefaultTimeoutMs, parent); + step(defaultHangThresholdS - 1); + /* 2 */ auto secondChild = healthMonitor.startMonitoringTask( + std::make_unique<EventHangMetadata>(), std::nullopt, kDefaultTimeoutMs, parent); + step(2); + healthMonitor.stopMonitoringTask(child); + step(defaultHangThresholdS - 1); + healthMonitor.stopMonitoringTask(secondChild); + healthMonitor.stopMonitoringTask(parent); +} + } // namespace emugl diff --git a/base/include/base/HealthMonitor.h b/base/include/base/HealthMonitor.h index 41ba10f..16b9228 100644 --- a/base/include/base/HealthMonitor.h +++ b/base/include/base/HealthMonitor.h @@ -20,6 +20,7 @@ #include <future>
#include <optional>
#include <queue>
+#include <stack>
#include <string>
#include <unordered_map>
#include <unordered_set>
@@ -29,9 +30,11 @@ #include "base/Lock.h"
#include "base/Metrics.h"
#include "base/Thread.h"
+#include "host-common/GfxstreamFatalError.h"
#include "host-common/logging.h"
using android::base::EventHangMetadata;
+using android::base::getCurrentThreadId;
#define WATCHDOG_DATA(msg, hangType, data) \
std::make_unique<EventHangMetadata>(__FILE__, __func__, msg, __LINE__, hangType, data)
@@ -48,12 +51,12 @@ using HangAnnotations = EventHangMetadata::HangAnnotations; static uint64_t kDefaultIntervalMs = 1'000;
static uint64_t kDefaultTimeoutMs = 5'000;
+static std::chrono::nanoseconds kTimeEpsilon(1);
// HealthMonitor provides the ability to register arbitrary start/touch/stop events associated
// with client defined tasks. At some pre-defined interval, it will periodically consume
// all logged events to assess whether the system is hanging on any task. Via the
// MetricsLogger, it will log hang and unhang events when it detects tasks hanging/resuming.
-// TODO: willho@ Integrate with crashpad to upload host dumps when a hang is detected.
// Design doc: http://go/gfxstream-health-monitor
template <class Clock = steady_clock>
class HealthMonitor : public android::base::Thread {
@@ -79,10 +82,13 @@ class HealthMonitor : public android::base::Thread { // considered "hung". Because `timeout` must be larger than the monitor's heartbeat
// interval, as shorter timeout periods would not be detected, this method will set actual
// timeout to the lesser of `timeout` and twice the heartbeat interval.
+ // `parentId` can be the Id of another task. Events in this monitored task will update
+ // the parent task recursively.
Id startMonitoringTask(std::unique_ptr<EventHangMetadata> metadata,
std::optional<std::function<std::unique_ptr<HangAnnotations>()>>
onHangAnnotationsCallback = std::nullopt,
- uint64_t timeout = kDefaultTimeoutMs);
+ uint64_t timeout = kDefaultTimeoutMs,
+ std::optional<Id> parentId = std::nullopt);
// Touch a monitored task. Resets the timeout countdown for that task.
void touchMonitoredTask(Id id);
@@ -105,6 +111,7 @@ class HealthMonitor : public android::base::Thread { std::optional<std::function<std::unique_ptr<HangAnnotations>()>>
onHangAnnotationsCallback;
Duration timeoutThreshold;
+ std::optional<Id> parentId;
};
struct Touch {
Id id;
@@ -132,11 +139,16 @@ class HealthMonitor : public android::base::Thread { std::optional<Timestamp> hungTimestamp;
std::unique_ptr<EventHangMetadata> metadata;
std::optional<std::function<std::unique_ptr<HangAnnotations>()>> onHangAnnotationsCallback;
+ std::optional<Id> parentId;
};
// Thread's main loop
intptr_t main() override;
+ // Update the parent task
+ void updateTaskParent(std::queue<std::unique_ptr<MonitoredEvent>>& events,
+ const MonitoredTask& task, Timestamp eventTime);
+
// Explicitly wake the monitor thread. Returns a future that can be used to wait until the
// poll event has been processed.
std::future<void> poll();
@@ -164,18 +176,55 @@ class HealthWatchdog { std::optional<std::function<std::unique_ptr<HangAnnotations>()>>
onHangAnnotationsCallback = std::nullopt,
uint64_t timeout = kDefaultTimeoutMs)
- : mHealthMonitor(healthMonitor) {
- mId = mHealthMonitor.startMonitoringTask(std::move(metadata),
- std::move(onHangAnnotationsCallback), timeout);
+ : mHealthMonitor(healthMonitor), mThreadId(getCurrentThreadId()) {
+ auto& threadTasks = getMonitoredThreadTasks();
+ auto& stack = threadTasks[&mHealthMonitor];
+ mId = mHealthMonitor.startMonitoringTask(
+ std::move(metadata), std::move(onHangAnnotationsCallback), timeout,
+ stack.empty() ? std::nullopt : std::make_optional(stack.top()));
+ stack.push(mId);
}
- ~HealthWatchdog() { mHealthMonitor.stopMonitoringTask(mId); }
+ ~HealthWatchdog() {
+ mHealthMonitor.stopMonitoringTask(mId);
+ auto& threadTasks = getMonitoredThreadTasks();
+ auto& stack = threadTasks[&mHealthMonitor];
+ if (getCurrentThreadId() != mThreadId) {
+ ERR("HealthWatchdog destructor thread does not match origin. Destructor must be called "
+ "on the same thread.");
+ GFXSTREAM_ABORT(FatalError(ABORT_REASON_OTHER))
+ << "HealthWatchdog destructor thread does not match origin. Destructor must be "
+ "called on the same thread.";
+ }
+ if (stack.empty()) {
+ ERR("HealthWatchdog thread local stack is empty!");
+ GFXSTREAM_ABORT(FatalError(ABORT_REASON_OTHER))
+ << "HealthWatchdog thread local stack is empty!";
+ }
+ if (stack.top() != mId) {
+ ERR("HealthWatchdog id %d does not match top of stack: %d ", mId, stack.top());
+ GFXSTREAM_ABORT(FatalError(ABORT_REASON_OTHER))
+ << "HealthWatchdog id " << mId << " does not match top of stack: " << stack.top();
+ }
+ stack.pop();
+ }
void touch() { mHealthMonitor.touchMonitoredTask(mId); }
private:
+ using ThreadTasks =
+ std::unordered_map<HealthMonitor<Clock>*, std::stack<typename HealthMonitor<Clock>::Id>>;
typename HealthMonitor<Clock>::Id mId;
HealthMonitor<Clock>& mHealthMonitor;
+ const unsigned long mThreadId;
+
+ // Thread local stack of task Ids enables better reentrant behavior.
+ // Multiple health monitors are not expected or advised, but as an injected dependency,
+ // it is possible.
+ ThreadTasks& getMonitoredThreadTasks() {
+ static thread_local ThreadTasks threadTasks;
+ return threadTasks;
+ }
};
} // namespace emugl
diff --git a/base/include/base/Metrics.h b/base/include/base/Metrics.h index 2eea15b..87b3ea7 100644 --- a/base/include/base/Metrics.h +++ b/base/include/base/Metrics.h @@ -71,10 +71,12 @@ struct EventHangMetadata { struct MetricEventFreeze {}; struct MetricEventUnFreeze { int64_t frozen_ms; }; struct MetricEventHang { + uint64_t taskId; /* From HealthMonitor */ EventHangMetadata* metadata; int64_t otherHungTasks; }; struct MetricEventUnHang { + uint64_t taskId; /* From HealthMonitor */ EventHangMetadata* metadata; int64_t hung_ms; }; |