summaryrefslogtreecommitdiff
path: root/mali_kbase
diff options
context:
space:
mode:
author: Varad Gautam <varadgautam@google.com> 2023-01-09 13:11:50 +0000
committer: TreeHugger Robot <treehugger-gerrit@google.com> 2023-03-23 22:45:01 +0000
commit: 8141064fa8984d1b2d25b9c4a84b5e5b7916b83b (patch)
tree: 9adcdf9a5074547b525557a9305ec4b957018a51 /mali_kbase
parent: c79291e74ddfb1b157e18e9f698752e61357b348 (diff)
download: gpu-8141064fa8984d1b2d25b9c4a84b5e5b7916b83b.tar.gz
Powercycle mali to recover from a PM timeout
The existing reset flow (kbase_pm_do_reset()) is:
1. Write to SOFT_RESET and wait for the RESET_COMPLETED irq until timeout.
2. If the RESET_COMPLETED irq timed out, write to HARD_RESET and wait for the irq until timeout.
3. If the RESET_COMPLETED irq timed out again, powercycle the GPU via kbase_pm_hw_reset().

If a power transition timed out (i.e., we are in kbase_pm_timed_out()), writing to the SOFT/HARD_RESET regs is unreliable and can send the GPU into an undefined state (e.g., when the SOFT/HARD_RESET regs are written while L2 is transitioning) and prevent recovery.

Introduce a RESET_FLAGS_FORCE_PM_HW_RESET flag to allow resetting the GPU via powercycle, which currently only happens when soft and hard reset both fail, and use only this method to reset the GPU from kbase_pm_timed_out().

Note: Originally pushed as pa/Ic57680225, re-merge this patch per go/p22-udc-gfx-rollout kbase upstreaming: WIP: b/243522189#comment23

Change-Id: I5b8ca3b9e49cf355f665c0b56061e06ef3ed9e0b
Signed-off-by: Varad Gautam <varadgautam@google.com>
Bug: 241217496
Bug: 270305834
Test: (v2) SST ~5700h (b/271438225#comment14) / (v1) SST ~2500h (b/265003962)
Diffstat (limited to 'mali_kbase')
-rw-r--r-- mali_kbase/backend/gpu/mali_kbase_pm_driver.c 24
-rw-r--r-- mali_kbase/csf/mali_kbase_csf_defs.h 2
-rw-r--r-- mali_kbase/csf/mali_kbase_csf_reset_gpu.c 4
-rw-r--r-- mali_kbase/mali_kbase_reset_gpu.h 3
4 files changed, 32 insertions, 1 deletions
diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_driver.c b/mali_kbase/backend/gpu/mali_kbase_pm_driver.c
index b046903..2bfc27a 100644
--- a/mali_kbase/backend/gpu/mali_kbase_pm_driver.c
+++ b/mali_kbase/backend/gpu/mali_kbase_pm_driver.c
@@ -2421,8 +2421,22 @@ static void kbase_pm_timed_out(struct kbase_device *kbdev)
dev_err(kbdev->dev, "Power transition timed out unexpectedly\n");
kbase_gpu_timeout_debug_message(kbdev);
dev_err(kbdev->dev, "Sending reset to GPU - all running jobs will be lost\n");
+
+ /* pixel: If either:
+ * 1. L2/MCU power transition timed out, or,
+ * 2. kbase state machine fell out of sync with the hw state,
+ * a soft/hard reset (ie writing to SOFT/HARD_RESET regs) is insufficient to resume
+ * operation.
+ *
+ * Besides, Odin TRM advises against touching SOFT/HARD_RESET
+ * regs if L2_PWRTRANS is 1 to avoid undefined state.
+ *
+ * We have already lost work if we end up here, so send a powercycle to reset the hw,
+ * which is more reliable.
+ */
if (kbase_prepare_to_reset_gpu(kbdev,
- RESET_FLAGS_HWC_UNRECOVERABLE_ERROR))
+ RESET_FLAGS_HWC_UNRECOVERABLE_ERROR |
+ RESET_FLAGS_FORCE_PM_HW_RESET))
kbase_reset_gpu(kbdev);
}
@@ -3179,6 +3193,14 @@ static int kbase_pm_do_reset(struct kbase_device *kbdev)
struct kbasep_reset_timeout_data rtdata;
int ret;
+#if MALI_USE_CSF
+ if (kbdev->csf.reset.force_pm_hw_reset && kbdev->pm.backend.callback_hardware_reset) {
+ dev_err(kbdev->dev, "Power Cycle reset mali");
+ kbdev->csf.reset.force_pm_hw_reset = false;
+ return kbase_pm_hw_reset(kbdev);
+ }
+#endif
+
KBASE_KTRACE_ADD(kbdev, CORE_GPU_SOFT_RESET, NULL, 0);
KBASE_TLSTREAM_JD_GPU_SOFT_RESET(kbdev, kbdev);
diff --git a/mali_kbase/csf/mali_kbase_csf_defs.h b/mali_kbase/csf/mali_kbase_csf_defs.h
index af93fb3..520a41b 100644
--- a/mali_kbase/csf/mali_kbase_csf_defs.h
+++ b/mali_kbase/csf/mali_kbase_csf_defs.h
@@ -828,6 +828,7 @@ struct kbase_csf_context {
* mechanism to check for deadlocks involving reset waits.
* @state: Tracks if the GPU reset is in progress or not.
* The state is represented by enum @kbase_csf_reset_gpu_state.
+ * @force_pm_hw_reset: pixel: Powercycle the GPU instead of attempting a soft/hard reset.
*/
struct kbase_csf_reset_gpu {
struct workqueue_struct *workq;
@@ -835,6 +836,7 @@ struct kbase_csf_reset_gpu {
wait_queue_head_t wait;
struct rw_semaphore sem;
atomic_t state;
+ bool force_pm_hw_reset;
};
/**
diff --git a/mali_kbase/csf/mali_kbase_csf_reset_gpu.c b/mali_kbase/csf/mali_kbase_csf_reset_gpu.c
index 77aa3ce..4d9cc99 100644
--- a/mali_kbase/csf/mali_kbase_csf_reset_gpu.c
+++ b/mali_kbase/csf/mali_kbase_csf_reset_gpu.c
@@ -513,6 +513,9 @@ bool kbase_prepare_to_reset_gpu(struct kbase_device *kbdev, unsigned int flags)
/* Some other thread is already resetting the GPU */
return false;
+ if (flags & RESET_FLAGS_FORCE_PM_HW_RESET)
+ kbdev->csf.reset.force_pm_hw_reset = true;
+
return true;
}
KBASE_EXPORT_TEST_API(kbase_prepare_to_reset_gpu);
@@ -631,6 +634,7 @@ int kbase_reset_gpu_init(struct kbase_device *kbdev)
init_waitqueue_head(&kbdev->csf.reset.wait);
init_rwsem(&kbdev->csf.reset.sem);
+ kbdev->csf.reset.force_pm_hw_reset = false;
return 0;
}
diff --git a/mali_kbase/mali_kbase_reset_gpu.h b/mali_kbase/mali_kbase_reset_gpu.h
index ff631e9..a78a75a 100644
--- a/mali_kbase/mali_kbase_reset_gpu.h
+++ b/mali_kbase/mali_kbase_reset_gpu.h
@@ -151,6 +151,9 @@ void kbase_reset_gpu_assert_failed_or_prevented(struct kbase_device *kbdev);
/* This reset should be treated as an unrecoverable error by HW counter logic */
#define RESET_FLAGS_HWC_UNRECOVERABLE_ERROR ((unsigned int)(1 << 0))
+/* pixel: Powercycle the GPU instead of attempting a soft/hard reset (only used on CSF hw). */
+#define RESET_FLAGS_FORCE_PM_HW_RESET ((unsigned int)(1 << 1))
+
/**
* kbase_prepare_to_reset_gpu_locked - Prepare for resetting the GPU.
* @kbdev: Device pointer