aboutsummaryrefslogtreecommitdiff
path: root/base
diff options
context:
space:
mode:
authorWilliam Ho <willho@google.com>2022-09-12 13:51:56 -0400
committerJoshua Duong <joshuaduong@google.com>2022-09-15 08:03:11 -0700
commit3a20ff5c908e0c40a7ad814541834864b6c1c40a (patch)
treed706e651887f01b433252c86c98ad3bbd88f7397 /base
parent82cc35a4efb65c87958288eeda1939e6bbe91a65 (diff)
downloadaemu-3a20ff5c908e0c40a7ad814541834864b6c1c40a.tar.gz
Add parent id arg to HealthMonitor tasks
...to support desired reentrant behavior. Bug: 244354559 Test: Unit tests. Built and ran emulator Change-Id: Ie629a1115866124ca6a33425399e85fe1ebd0c3c
Diffstat (limited to 'base')
-rw-r--r--base/HealthMonitor.cpp62
-rw-r--r--base/HealthMonitor_unittest.cpp108
-rw-r--r--base/include/base/HealthMonitor.h61
-rw-r--r--base/include/base/Metrics.h2
4 files changed, 209 insertions, 24 deletions
diff --git a/base/HealthMonitor.cpp b/base/HealthMonitor.cpp
index 67c9285..4a9ab93 100644
--- a/base/HealthMonitor.cpp
+++ b/base/HealthMonitor.cpp
@@ -60,7 +60,7 @@ template <class Clock>
typename HealthMonitor<Clock>::Id HealthMonitor<Clock>::startMonitoringTask(
std::unique_ptr<EventHangMetadata> metadata,
std::optional<std::function<std::unique_ptr<HangAnnotations>()>> onHangAnnotationsCallback,
- uint64_t timeout) {
+ uint64_t timeout, std::optional<Id> parentId) {
auto intervalMs = duration_cast<std::chrono::milliseconds>(mInterval).count();
if (timeout < intervalMs) {
WARN("Timeout value %d is too low (heartbeat is every %d). Increasing to %d", timeout,
@@ -75,7 +75,8 @@ typename HealthMonitor<Clock>::Id HealthMonitor<Clock>::startMonitoringTask(
.metadata = std::move(metadata),
.timeOccurred = Clock::now(),
.onHangAnnotationsCallback = std::move(onHangAnnotationsCallback),
- .timeoutThreshold = Duration(std::chrono::milliseconds(timeout))});
+ .timeoutThreshold = Duration(std::chrono::milliseconds(timeout)),
+ .parentId = parentId});
mEventQueue.push(std::move(event));
return id;
}
@@ -140,24 +141,34 @@ intptr_t HealthMonitor<Clock>::main() {
GFXSTREAM_ABORT(FatalError(ABORT_REASON_OTHER)) <<
"MonitoredEvent type not found";
},
- [this](typename MonitoredEventType::Start& event) {
+ [this, &events](typename MonitoredEventType::Start& event) {
auto it = mMonitoredTasks.find(event.id);
if (it != mMonitoredTasks.end()) {
ERR("Registered multiple start events for task %d", event.id);
return;
}
- mMonitoredTasks.emplace(
- event.id, std::move(MonitoredTask{
- .id = event.id,
- .timeoutTimestamp =
- event.timeOccurred + event.timeoutThreshold,
- .timeoutThreshold = event.timeoutThreshold,
- .hungTimestamp = std::nullopt,
- .metadata = std::move(event.metadata),
- .onHangAnnotationsCallback =
- std::move(event.onHangAnnotationsCallback)}));
+ if (event.parentId && mMonitoredTasks.find(event.parentId.value()) ==
+ mMonitoredTasks.end()) {
+ WARN("Requested parent task %d does not exist.",
+ event.parentId.value());
+ event.parentId = std::nullopt;
+ }
+ it = mMonitoredTasks
+ .emplace(event.id,
+ std::move(MonitoredTask{
+ .id = event.id,
+ .timeoutTimestamp = event.timeOccurred +
+ event.timeoutThreshold,
+ .timeoutThreshold = event.timeoutThreshold,
+ .hungTimestamp = std::nullopt,
+ .metadata = std::move(event.metadata),
+ .onHangAnnotationsCallback =
+ std::move(event.onHangAnnotationsCallback),
+ .parentId = event.parentId}))
+ .first;
+ updateTaskParent(events, it->second, event.timeOccurred);
},
- [this](typename MonitoredEventType::Touch& event) {
+ [this, &events](typename MonitoredEventType::Touch& event) {
auto it = mMonitoredTasks.find(event.id);
if (it == mMonitoredTasks.end()) {
ERR("HealthMonitor has no task in progress for id %d", event.id);
@@ -166,8 +177,10 @@ intptr_t HealthMonitor<Clock>::main() {
auto& task = it->second;
task.timeoutTimestamp = event.timeOccurred + task.timeoutThreshold;
+ updateTaskParent(events, task, event.timeOccurred);
},
- [this, &tasksToRemove](typename MonitoredEventType::Stop& event) {
+ [this, &tasksToRemove,
+ &events](typename MonitoredEventType::Stop& event) {
auto it = mMonitoredTasks.find(event.id);
if (it == mMonitoredTasks.end()) {
ERR("HealthMonitor has no task in progress for id %d", event.id);
@@ -176,6 +189,7 @@ intptr_t HealthMonitor<Clock>::main() {
auto& task = it->second;
task.timeoutTimestamp = event.timeOccurred + task.timeoutThreshold;
+ updateTaskParent(events, task, event.timeOccurred);
// Mark it for deletion, but retain it until the end of
// the health check concurrent tasks hung
@@ -206,7 +220,8 @@ intptr_t HealthMonitor<Clock>::main() {
auto newAnnotations = (*task.onHangAnnotationsCallback)();
task.metadata->mergeAnnotations(std::move(newAnnotations));
}
- mLogger.logMetricEvent(MetricEventHang{.metadata = task.metadata.get(),
+ mLogger.logMetricEvent(MetricEventHang{.taskId = task.id,
+ .metadata = task.metadata.get(),
.otherHungTasks = newHungTasks});
task.hungTimestamp = task.timeoutTimestamp;
newHungTasks++;
@@ -218,8 +233,8 @@ intptr_t HealthMonitor<Clock>::main() {
task.timeoutTimestamp -
(task.hungTimestamp.value() + task.timeoutThreshold))
.count();
- mLogger.logMetricEvent(
- MetricEventUnHang{.metadata = task.metadata.get(), .hung_ms = hangTime});
+ mLogger.logMetricEvent(MetricEventUnHang{
+ .taskId = task.id, .metadata = task.metadata.get(), .hung_ms = hangTime});
task.hungTimestamp = std::nullopt;
newHungTasks--;
}
@@ -243,6 +258,17 @@ intptr_t HealthMonitor<Clock>::main() {
return 0;
}
+template <class Clock>
+void HealthMonitor<Clock>::updateTaskParent(std::queue<std::unique_ptr<MonitoredEvent>>& events,
+ const MonitoredTask& task, Timestamp eventTime) {
+ std::optional<Id> parentId = task.parentId;
+ if (parentId) {
+ auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::Touch{
+ .id = parentId.value(), .timeOccurred = eventTime + Duration(kTimeEpsilon)});
+ events.push(std::move(event));
+ }
+}
+
template class HealthMonitor<steady_clock>;
template class HealthMonitor<TestClock>;
diff --git a/base/HealthMonitor_unittest.cpp b/base/HealthMonitor_unittest.cpp
index 4fa3757..7001916 100644
--- a/base/HealthMonitor_unittest.cpp
+++ b/base/HealthMonitor_unittest.cpp
@@ -348,4 +348,112 @@ TEST_F(HealthMonitorTest, taskHungWithAttachedCallback) {
healthMonitor.stopMonitoringTask(id);
}
+TEST_F(HealthMonitorTest, healthyTaskWithParent) {
+ EXPECT_CALL(logger, logMetricEvent(_)).Times(0);
+
+ auto id = healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>());
+ step(defaultHangThresholdS - 1);
+ auto child = healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>(),
+ std::nullopt, kDefaultTimeoutMs, id);
+ step(defaultHangThresholdS - 1);
+ healthMonitor.stopMonitoringTask(child);
+ step(defaultHangThresholdS - 1);
+ healthMonitor.stopMonitoringTask(id);
+}
+
+TEST_F(HealthMonitorTest, threeChainOfHungTasks) {
+ {
+ InSequence s;
+ EXPECT_CALL(logger, logMetricEvent(
+ VariantWith<MetricEventHang>(Field(&MetricEventHang::taskId, 2))))
+ .Times(1);
+ EXPECT_CALL(logger, logMetricEvent(
+ VariantWith<MetricEventHang>(Field(&MetricEventHang::taskId, 1))))
+ .Times(1);
+ EXPECT_CALL(logger, logMetricEvent(
+ VariantWith<MetricEventHang>(Field(&MetricEventHang::taskId, 0))))
+ .Times(1);
+ EXPECT_CALL(
+ logger,
+ logMetricEvent(VariantWith<MetricEventUnHang>(Field(&MetricEventUnHang::taskId, 2))))
+ .Times(1);
+ EXPECT_CALL(
+ logger,
+ logMetricEvent(VariantWith<MetricEventUnHang>(Field(&MetricEventUnHang::taskId, 1))))
+ .Times(1);
+ EXPECT_CALL(
+ logger,
+ logMetricEvent(VariantWith<MetricEventUnHang>(Field(&MetricEventUnHang::taskId, 0))))
+ .Times(1);
+ }
+
+ auto id = healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>());
+ step(defaultHangThresholdS - 1);
+ auto child = healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>(),
+ std::nullopt, kDefaultTimeoutMs, id);
+ step(defaultHangThresholdS - 1);
+ auto grandchild = healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>(),
+ std::nullopt, kDefaultTimeoutMs, child);
+ step(defaultHangThresholdS + 1);
+ healthMonitor.touchMonitoredTask(grandchild);
+ step(1);
+ healthMonitor.stopMonitoringTask(grandchild);
+ healthMonitor.stopMonitoringTask(child);
+ healthMonitor.stopMonitoringTask(id);
+}
+
+TEST_F(HealthMonitorTest, parentEndsBeforeChild) {
+ {
+ InSequence s;
+ EXPECT_CALL(logger, logMetricEvent(
+ VariantWith<MetricEventHang>(Field(&MetricEventHang::taskId, 1))))
+ .Times(1);
+ EXPECT_CALL(
+ logger,
+ logMetricEvent(VariantWith<MetricEventUnHang>(Field(&MetricEventUnHang::taskId, 1))))
+ .Times(1);
+ }
+
+ auto id = healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>());
+ step(defaultHangThresholdS - 1);
+ auto child = healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>(),
+ std::nullopt, kDefaultTimeoutMs, id);
+ healthMonitor.stopMonitoringTask(id);
+ step(defaultHangThresholdS + 1);
+ healthMonitor.stopMonitoringTask(child);
+}
+
+TEST_F(HealthMonitorTest, siblingsHangParentStillHealthy) {
+ {
+ InSequence s;
+ EXPECT_CALL(logger, logMetricEvent(
+ VariantWith<MetricEventHang>(Field(&MetricEventHang::taskId, 1))))
+ .Times(1);
+ EXPECT_CALL(
+ logger,
+ logMetricEvent(VariantWith<MetricEventUnHang>(Field(&MetricEventUnHang::taskId, 1))))
+ .Times(1);
+ EXPECT_CALL(logger, logMetricEvent(
+ VariantWith<MetricEventHang>(Field(&MetricEventHang::taskId, 2))))
+ .Times(1);
+ EXPECT_CALL(
+ logger,
+ logMetricEvent(VariantWith<MetricEventUnHang>(Field(&MetricEventUnHang::taskId, 2))))
+ .Times(1);
+ }
+
+ auto parent = healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>());
+ step(defaultHangThresholdS - 1);
+ /* 1 */ auto child = healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>(),
+ std::nullopt, kDefaultTimeoutMs, parent);
+ step(defaultHangThresholdS - 1);
+ /* 2 */ auto secondChild = healthMonitor.startMonitoringTask(
+ std::make_unique<EventHangMetadata>(), std::nullopt, kDefaultTimeoutMs, parent);
+ step(2);
+ healthMonitor.stopMonitoringTask(child);
+ step(defaultHangThresholdS - 1);
+ healthMonitor.stopMonitoringTask(secondChild);
+ healthMonitor.stopMonitoringTask(parent);
+}
+
} // namespace emugl
diff --git a/base/include/base/HealthMonitor.h b/base/include/base/HealthMonitor.h
index 41ba10f..16b9228 100644
--- a/base/include/base/HealthMonitor.h
+++ b/base/include/base/HealthMonitor.h
@@ -20,6 +20,7 @@
#include <future>
#include <optional>
#include <queue>
+#include <stack>
#include <string>
#include <unordered_map>
#include <unordered_set>
@@ -29,9 +30,11 @@
#include "base/Lock.h"
#include "base/Metrics.h"
#include "base/Thread.h"
+#include "host-common/GfxstreamFatalError.h"
#include "host-common/logging.h"
using android::base::EventHangMetadata;
+using android::base::getCurrentThreadId;
#define WATCHDOG_DATA(msg, hangType, data) \
std::make_unique<EventHangMetadata>(__FILE__, __func__, msg, __LINE__, hangType, data)
@@ -48,12 +51,12 @@ using HangAnnotations = EventHangMetadata::HangAnnotations;
static uint64_t kDefaultIntervalMs = 1'000;
static uint64_t kDefaultTimeoutMs = 5'000;
+static std::chrono::nanoseconds kTimeEpsilon(1);
// HealthMonitor provides the ability to register arbitrary start/touch/stop events associated
// with client defined tasks. At some pre-defined interval, it will periodically consume
// all logged events to assess whether the system is hanging on any task. Via the
// MetricsLogger, it will log hang and unhang events when it detects tasks hanging/resuming.
-// TODO: willho@ Integrate with crashpad to upload host dumps when a hang is detected.
// Design doc: http://go/gfxstream-health-monitor
template <class Clock = steady_clock>
class HealthMonitor : public android::base::Thread {
@@ -79,10 +82,13 @@ class HealthMonitor : public android::base::Thread {
// considered "hung". Because `timeout` must be larger than the monitor's heartbeat
// interval, as shorter timeout periods would not be detected, this method will set actual
// timeout to the lesser of `timeout` and twice the heartbeat interval.
+ // `parentId` can be the Id of another task. Events in this monitored task will update
+ // the parent task recursively.
Id startMonitoringTask(std::unique_ptr<EventHangMetadata> metadata,
std::optional<std::function<std::unique_ptr<HangAnnotations>()>>
onHangAnnotationsCallback = std::nullopt,
- uint64_t timeout = kDefaultTimeoutMs);
+ uint64_t timeout = kDefaultTimeoutMs,
+ std::optional<Id> parentId = std::nullopt);
// Touch a monitored task. Resets the timeout countdown for that task.
void touchMonitoredTask(Id id);
@@ -105,6 +111,7 @@ class HealthMonitor : public android::base::Thread {
std::optional<std::function<std::unique_ptr<HangAnnotations>()>>
onHangAnnotationsCallback;
Duration timeoutThreshold;
+ std::optional<Id> parentId;
};
struct Touch {
Id id;
@@ -132,11 +139,16 @@ class HealthMonitor : public android::base::Thread {
std::optional<Timestamp> hungTimestamp;
std::unique_ptr<EventHangMetadata> metadata;
std::optional<std::function<std::unique_ptr<HangAnnotations>()>> onHangAnnotationsCallback;
+ std::optional<Id> parentId;
};
// Thread's main loop
intptr_t main() override;
+ // Update the parent task
+ void updateTaskParent(std::queue<std::unique_ptr<MonitoredEvent>>& events,
+ const MonitoredTask& task, Timestamp eventTime);
+
// Explicitly wake the monitor thread. Returns a future that can be used to wait until the
// poll event has been processed.
std::future<void> poll();
@@ -164,18 +176,55 @@ class HealthWatchdog {
std::optional<std::function<std::unique_ptr<HangAnnotations>()>>
onHangAnnotationsCallback = std::nullopt,
uint64_t timeout = kDefaultTimeoutMs)
- : mHealthMonitor(healthMonitor) {
- mId = mHealthMonitor.startMonitoringTask(std::move(metadata),
- std::move(onHangAnnotationsCallback), timeout);
+ : mHealthMonitor(healthMonitor), mThreadId(getCurrentThreadId()) {
+ auto& threadTasks = getMonitoredThreadTasks();
+ auto& stack = threadTasks[&mHealthMonitor];
+ mId = mHealthMonitor.startMonitoringTask(
+ std::move(metadata), std::move(onHangAnnotationsCallback), timeout,
+ stack.empty() ? std::nullopt : std::make_optional(stack.top()));
+ stack.push(mId);
}
- ~HealthWatchdog() { mHealthMonitor.stopMonitoringTask(mId); }
+ ~HealthWatchdog() {
+ mHealthMonitor.stopMonitoringTask(mId);
+ auto& threadTasks = getMonitoredThreadTasks();
+ auto& stack = threadTasks[&mHealthMonitor];
+ if (getCurrentThreadId() != mThreadId) {
+ ERR("HealthWatchdog destructor thread does not match origin. Destructor must be called "
+ "on the same thread.");
+ GFXSTREAM_ABORT(FatalError(ABORT_REASON_OTHER))
+ << "HealthWatchdog destructor thread does not match origin. Destructor must be "
+ "called on the same thread.";
+ }
+ if (stack.empty()) {
+ ERR("HealthWatchdog thread local stack is empty!");
+ GFXSTREAM_ABORT(FatalError(ABORT_REASON_OTHER))
+ << "HealthWatchdog thread local stack is empty!";
+ }
+ if (stack.top() != mId) {
+ ERR("HealthWatchdog id %d does not match top of stack: %d ", mId, stack.top());
+ GFXSTREAM_ABORT(FatalError(ABORT_REASON_OTHER))
+ << "HealthWatchdog id " << mId << " does not match top of stack: " << stack.top();
+ }
+ stack.pop();
+ }
void touch() { mHealthMonitor.touchMonitoredTask(mId); }
private:
+ using ThreadTasks =
+ std::unordered_map<HealthMonitor<Clock>*, std::stack<typename HealthMonitor<Clock>::Id>>;
typename HealthMonitor<Clock>::Id mId;
HealthMonitor<Clock>& mHealthMonitor;
+ const unsigned long mThreadId;
+
+ // Thread local stack of task Ids enables better reentrant behavior.
+ // Multiple health monitors are not expected or advised, but as an injected dependency,
+ // it is possible.
+ ThreadTasks& getMonitoredThreadTasks() {
+ static thread_local ThreadTasks threadTasks;
+ return threadTasks;
+ }
};
} // namespace emugl
diff --git a/base/include/base/Metrics.h b/base/include/base/Metrics.h
index 2eea15b..87b3ea7 100644
--- a/base/include/base/Metrics.h
+++ b/base/include/base/Metrics.h
@@ -71,10 +71,12 @@ struct EventHangMetadata {
struct MetricEventFreeze {};
struct MetricEventUnFreeze { int64_t frozen_ms; };
struct MetricEventHang {
+ uint64_t taskId; /* From HealthMonitor */
EventHangMetadata* metadata;
int64_t otherHungTasks;
};
struct MetricEventUnHang {
+ uint64_t taskId; /* From HealthMonitor */
EventHangMetadata* metadata;
int64_t hung_ms;
};