diff options
Diffstat (limited to 'base/HealthMonitor.cpp')
-rw-r--r-- | base/HealthMonitor.cpp | 62 |
1 files changed, 44 insertions, 18 deletions
diff --git a/base/HealthMonitor.cpp b/base/HealthMonitor.cpp index 67c9285..4a9ab93 100644 --- a/base/HealthMonitor.cpp +++ b/base/HealthMonitor.cpp @@ -60,7 +60,7 @@ template <class Clock> typename HealthMonitor<Clock>::Id HealthMonitor<Clock>::startMonitoringTask( std::unique_ptr<EventHangMetadata> metadata, std::optional<std::function<std::unique_ptr<HangAnnotations>()>> onHangAnnotationsCallback, - uint64_t timeout) { + uint64_t timeout, std::optional<Id> parentId) { auto intervalMs = duration_cast<std::chrono::milliseconds>(mInterval).count(); if (timeout < intervalMs) { WARN("Timeout value %d is too low (heartbeat is every %d). Increasing to %d", timeout, @@ -75,7 +75,8 @@ typename HealthMonitor<Clock>::Id HealthMonitor<Clock>::startMonitoringTask( .metadata = std::move(metadata), .timeOccurred = Clock::now(), .onHangAnnotationsCallback = std::move(onHangAnnotationsCallback), - .timeoutThreshold = Duration(std::chrono::milliseconds(timeout))}); + .timeoutThreshold = Duration(std::chrono::milliseconds(timeout)), + .parentId = parentId}); mEventQueue.push(std::move(event)); return id; } @@ -140,24 +141,34 @@ intptr_t HealthMonitor<Clock>::main() { GFXSTREAM_ABORT(FatalError(ABORT_REASON_OTHER)) << "MonitoredEvent type not found"; }, - [this](typename MonitoredEventType::Start& event) { + [this, &events](typename MonitoredEventType::Start& event) { auto it = mMonitoredTasks.find(event.id); if (it != mMonitoredTasks.end()) { ERR("Registered multiple start events for task %d", event.id); return; } - mMonitoredTasks.emplace( - event.id, std::move(MonitoredTask{ - .id = event.id, - .timeoutTimestamp = - event.timeOccurred + event.timeoutThreshold, - .timeoutThreshold = event.timeoutThreshold, - .hungTimestamp = std::nullopt, - .metadata = std::move(event.metadata), - .onHangAnnotationsCallback = - std::move(event.onHangAnnotationsCallback)})); + if (event.parentId && mMonitoredTasks.find(event.parentId.value()) == + mMonitoredTasks.end()) { + WARN("Requested parent task %d does not exist.", + event.parentId.value()); + event.parentId = std::nullopt; + } + it = mMonitoredTasks + .emplace(event.id, + std::move(MonitoredTask{ + .id = event.id, + .timeoutTimestamp = event.timeOccurred + + event.timeoutThreshold, + .timeoutThreshold = event.timeoutThreshold, + .hungTimestamp = std::nullopt, + .metadata = std::move(event.metadata), + .onHangAnnotationsCallback = + std::move(event.onHangAnnotationsCallback), + .parentId = event.parentId})) + .first; + updateTaskParent(events, it->second, event.timeOccurred); }, - [this](typename MonitoredEventType::Touch& event) { + [this, &events](typename MonitoredEventType::Touch& event) { auto it = mMonitoredTasks.find(event.id); if (it == mMonitoredTasks.end()) { ERR("HealthMonitor has no task in progress for id %d", event.id); @@ -166,8 +177,10 @@ intptr_t HealthMonitor<Clock>::main() { auto& task = it->second; task.timeoutTimestamp = event.timeOccurred + task.timeoutThreshold; + updateTaskParent(events, task, event.timeOccurred); }, - [this, &tasksToRemove](typename MonitoredEventType::Stop& event) { + [this, &tasksToRemove, + &events](typename MonitoredEventType::Stop& event) { auto it = mMonitoredTasks.find(event.id); if (it == mMonitoredTasks.end()) { ERR("HealthMonitor has no task in progress for id %d", event.id); @@ -176,6 +189,7 @@ intptr_t HealthMonitor<Clock>::main() { auto& task = it->second; task.timeoutTimestamp = event.timeOccurred + task.timeoutThreshold; + updateTaskParent(events, task, event.timeOccurred); // Mark it for deletion, but retain it until the end of // the health check concurrent tasks hung @@ -206,7 +220,8 @@ intptr_t HealthMonitor<Clock>::main() { auto newAnnotations = (*task.onHangAnnotationsCallback)(); task.metadata->mergeAnnotations(std::move(newAnnotations)); } - mLogger.logMetricEvent(MetricEventHang{.metadata = task.metadata.get(), + mLogger.logMetricEvent(MetricEventHang{.taskId = task.id, + .metadata = task.metadata.get(), .otherHungTasks = newHungTasks}); task.hungTimestamp = task.timeoutTimestamp; newHungTasks++; @@ -218,8 +233,8 @@ intptr_t HealthMonitor<Clock>::main() { task.timeoutTimestamp - (task.hungTimestamp.value() + task.timeoutThreshold)) .count(); - mLogger.logMetricEvent( - MetricEventUnHang{.metadata = task.metadata.get(), .hung_ms = hangTime}); + mLogger.logMetricEvent(MetricEventUnHang{ + .taskId = task.id, .metadata = task.metadata.get(), .hung_ms = hangTime}); task.hungTimestamp = std::nullopt; newHungTasks--; } @@ -243,6 +258,17 @@ intptr_t HealthMonitor<Clock>::main() { return 0; } +template <class Clock> +void HealthMonitor<Clock>::updateTaskParent(std::queue<std::unique_ptr<MonitoredEvent>>& events, + const MonitoredTask& task, Timestamp eventTime) { + std::optional<Id> parentId = task.parentId; + if (parentId) { + auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::Touch{ + .id = parentId.value(), .timeOccurred = eventTime + Duration(kTimeEpsilon)}); + events.push(std::move(event)); + } +} + template class HealthMonitor<steady_clock>; template class HealthMonitor<TestClock>; |