author    Suzanne Candanedo <suzanne.candanedo@arm.com>  2023-03-20 20:31:29 +0000
committer Guus Sliepen <gsliepen@google.com>  2023-03-28 08:13:19 +0000
commit    d4a9cc691fdde6aae0f5d40ad3d949ab76518e42 (patch)
tree      e50f914be2492f8fa22e18eb6e3d9e0f22496607
parent    04bf4049652e9aa3e952bdc30c560054e1c0f060 (diff)
download  gpu-d4a9cc691fdde6aae0f5d40ad3d949ab76518e42.tar.gz
GPUCORE-36682 Lock MMU while disabling AS to prevent use after free

*Affects CSF GPUs only, but changes to common code.*

During an invalid GPU page fault, kbase will try to flush the GPU cache
and disable the faulting address space (AS). There is a small window
between flushing of the GPU L2 cache (MMU resumes) and when the AS is
disabled where existing jobs on the GPU may access memory for that AS,
dirtying the GPU cache. This is a problem as the kctx->as_nr is marked
as KBASEP_AS_NR_INVALID and thus no cache maintenance will be performed
on the AS of the faulty context when cleaning up the csg_slot and
releasing the context.

This patch addresses that issue by:
1. locking the AS via a GPU command
2. flushing the cache
3. disabling the AS
4. unlocking the AS

This ensures that any jobs remaining on the GPU will not be able to
access the memory due to the locked AS. Once the AS is unlocked, any
memory access will fail as the AS is now disabled.

Change-Id: I5e02face6ca0fa4526576dd70d0261ea3ee69506
(cherry picked from commit 566789dffda3dfec00ecf00f9819e7a515fb2c61)
Provenance: https://code.ipdelivery.arm.com/c/GPU/mali-ddk/+/5071
Bug: 274014055
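For orientation before reading the diff, here is a minimal standalone C sketch of the ordering the patch introduces in kbase_mmu_disable(). The struct and helpers (mmu_as, as_lock, flush_l2_lsc, as_disable, as_unlock) are hypothetical stand-ins, not the kbase APIs (kbase_mmu_hw_do_lock(), kbase_gpu_cache_flush_and_busy_wait(), mmu_mode->disable_as(), kbase_mmu_hw_do_unlock_no_addr()); only the lock -> flush (when L2 is powered on) -> disable -> unlock sequence and the shape of the error handling mirror the real change.

/*
 * Standalone model of the new AS-disable ordering (not kbase code).
 * All helpers here are hypothetical stand-ins; only the ordering and
 * the error-handling shape mirror the patch below.
 */
#include <stdbool.h>
#include <stdio.h>

struct mmu_as {
	int nr;
	bool locked;
	bool enabled;
	bool l2_on;
};

static int as_lock(struct mmu_as *as)            { as->locked = true;  return 0; }
static int flush_l2_lsc(const struct mmu_as *as) { printf("flush L2/LSC for AS %d\n", as->nr); return 0; }
static void as_disable(struct mmu_as *as)        { as->enabled = false; }
static int as_unlock(struct mmu_as *as)          { as->locked = false; return 0; }

static void model_mmu_disable(struct mmu_as *as)
{
	/* 1. Lock the AS so jobs still running on the GPU cannot translate
	 *    through it (and so cannot re-dirty the caches) while we work.
	 */
	int lock_err = as_lock(as);

	/* 2. Flush only while the L2 is in a stable powered-on state; any
	 *    other state means the shader cores are off and nothing executes.
	 */
	if (as->l2_on)
		(void)flush_l2_lsc(as);

	/* 3. Disable the AS while it is still locked. */
	as_disable(as);

	/* 4. Unlock only if the lock succeeded; any later access now faults
	 *    because the AS is already disabled.
	 */
	if (!lock_err)
		(void)as_unlock(as);
}

int main(void)
{
	struct mmu_as as = { .nr = 3, .locked = false, .enabled = true, .l2_on = true };

	model_mmu_disable(&as);
	printf("AS %d: locked=%d enabled=%d\n", as.nr, as.locked, as.enabled);
	return 0;
}

The real patch additionally resets the GPU if either the lock or the unlock command fails, while the cache-flush helper handles its own timeout-driven reset, as the code below shows.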
-rw-r--r--  mali_kbase/mmu/mali_kbase_mmu.c            | 122
-rw-r--r--  mali_kbase/mmu/mali_kbase_mmu_hw.h         |  15
-rw-r--r--  mali_kbase/mmu/mali_kbase_mmu_hw_direct.c  |   4
3 files changed, 75 insertions, 66 deletions
diff --git a/mali_kbase/mmu/mali_kbase_mmu.c b/mali_kbase/mmu/mali_kbase_mmu.c
index 4828cdc..88b0197 100644
--- a/mali_kbase/mmu/mali_kbase_mmu.c
+++ b/mali_kbase/mmu/mali_kbase_mmu.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
/*
*
- * (C) COPYRIGHT 2010-2022 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2010-2023 ARM Limited. All rights reserved.
*
* This program is free software and is provided to you under the terms of the
* GNU General Public License version 2 as published by the Free Software
@@ -2066,60 +2066,6 @@ int kbase_mmu_insert_pages(struct kbase_device *kbdev,
KBASE_EXPORT_TEST_API(kbase_mmu_insert_pages);
-/**
- * kbase_mmu_flush_noretain() - Flush and invalidate the GPU caches
- * without retaining the kbase context.
- * @kctx: The KBase context.
- * @vpfn: The virtual page frame number to start the flush on.
- * @nr: The number of pages to flush.
- *
- * As per kbase_mmu_flush_invalidate but doesn't retain the kctx or do any
- * other locking.
- */
-static void kbase_mmu_flush_noretain(struct kbase_context *kctx, u64 vpfn, size_t nr)
-{
- struct kbase_device *kbdev = kctx->kbdev;
- int err;
- /* Calls to this function are inherently asynchronous, with respect to
- * MMU operations.
- */
- const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC;
- struct kbase_mmu_hw_op_param op_param;
-
- lockdep_assert_held(&kctx->kbdev->hwaccess_lock);
- lockdep_assert_held(&kctx->kbdev->mmu_hw_mutex);
-
- /* Early out if there is nothing to do */
- if (nr == 0)
- return;
-
- /* flush L2 and unlock the VA (resumes the MMU) */
- op_param.vpfn = vpfn;
- op_param.nr = nr;
- op_param.op = KBASE_MMU_OP_FLUSH_MEM;
- op_param.kctx_id = kctx->id;
- op_param.mmu_sync_info = mmu_sync_info;
- if (mmu_flush_cache_on_gpu_ctrl(kbdev)) {
- /* Value used to prevent skipping of any levels when flushing */
- op_param.flush_skip_levels = pgd_level_to_skip_flush(0xF);
- err = kbase_mmu_hw_do_flush_on_gpu_ctrl(kbdev, &kbdev->as[kctx->as_nr],
- &op_param);
- } else {
- err = kbase_mmu_hw_do_flush_locked(kbdev, &kbdev->as[kctx->as_nr],
- &op_param);
- }
-
- if (err) {
- /* Flush failed to complete, assume the
- * GPU has hung and perform a reset to recover
- */
- dev_err(kbdev->dev, "Flush for GPU page table update did not complete. Issuing GPU soft-reset to recover");
-
- if (kbase_prepare_to_reset_gpu_locked(kbdev, RESET_FLAGS_NONE))
- kbase_reset_gpu_locked(kbdev);
- }
-}
-
void kbase_mmu_update(struct kbase_device *kbdev,
struct kbase_mmu_table *mmut,
int as_nr)
@@ -2142,6 +2088,14 @@ void kbase_mmu_disable_as(struct kbase_device *kbdev, int as_nr)
void kbase_mmu_disable(struct kbase_context *kctx)
{
+ /* Calls to this function are inherently asynchronous, with respect to
+ * MMU operations.
+ */
+ const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC;
+ struct kbase_device *kbdev = kctx->kbdev;
+ struct kbase_mmu_hw_op_param op_param = { 0 };
+ int lock_err, flush_err;
+
/* ASSERT that the context has a valid as_nr, which is only the case
* when it's scheduled in.
*
@@ -2152,16 +2106,49 @@ void kbase_mmu_disable(struct kbase_context *kctx)
lockdep_assert_held(&kctx->kbdev->hwaccess_lock);
lockdep_assert_held(&kctx->kbdev->mmu_hw_mutex);
- /*
- * The address space is being disabled, drain all knowledge of it out
- * from the caches as pages and page tables might be freed after this.
- *
- * The job scheduler code will already be holding the locks and context
- * so just do the flush.
+ op_param.vpfn = 0;
+ op_param.nr = ~0;
+ op_param.op = KBASE_MMU_OP_FLUSH_MEM;
+ op_param.kctx_id = kctx->id;
+ op_param.mmu_sync_info = mmu_sync_info;
+
+#if MALI_USE_CSF
+ /* 0xF value used to prevent skipping of any levels when flushing */
+ if (mmu_flush_cache_on_gpu_ctrl(kbdev))
+ op_param.flush_skip_levels = pgd_level_to_skip_flush(0xF);
+#endif
+
+ /* lock MMU to prevent existing jobs on GPU from executing while the AS is
+ * not yet disabled
+ */
+ lock_err = kbase_mmu_hw_do_lock(kbdev, &kbdev->as[kctx->as_nr], &op_param);
+ if (lock_err)
+ dev_err(kbdev->dev, "Failed to lock AS %d for ctx %d_%d", kctx->as_nr, kctx->tgid,
+ kctx->id);
+
+ /* Issue the flush command only when L2 cache is in stable power on state.
+ * Any other state for L2 cache implies that shader cores are powered off,
+ * which in turn implies there is no execution happening on the GPU.
*/
- kbase_mmu_flush_noretain(kctx, 0, ~0);
+ if (kbdev->pm.backend.l2_state == KBASE_L2_ON) {
+ flush_err = kbase_gpu_cache_flush_and_busy_wait(kbdev,
+ GPU_COMMAND_CACHE_CLN_INV_L2_LSC);
+ if (flush_err)
+ dev_err(kbdev->dev,
+ "Failed to flush GPU cache when disabling AS %d for ctx %d_%d",
+ kctx->as_nr, kctx->tgid, kctx->id);
+ }
+ kbdev->mmu_mode->disable_as(kbdev, kctx->as_nr);
+
+ if (!lock_err) {
+ /* unlock the MMU to allow it to resume */
+ lock_err =
+ kbase_mmu_hw_do_unlock_no_addr(kbdev, &kbdev->as[kctx->as_nr], &op_param);
+ if (lock_err)
+ dev_err(kbdev->dev, "Failed to unlock AS %d for ctx %d_%d", kctx->as_nr,
+ kctx->tgid, kctx->id);
+ }
- kctx->kbdev->mmu_mode->disable_as(kctx->kbdev, kctx->as_nr);
#if !MALI_USE_CSF
/*
* JM GPUs has some L1 read only caches that need to be invalidated
@@ -2169,8 +2156,15 @@ void kbase_mmu_disable(struct kbase_context *kctx)
* the slot_rb tracking field so such invalidation is performed when
* a new katom is executed on the affected slots.
*/
- kbase_backend_slot_kctx_purge_locked(kctx->kbdev, kctx);
+ kbase_backend_slot_kctx_purge_locked(kbdev, kctx);
#endif
+
+ /* kbase_gpu_cache_flush_and_busy_wait() will reset the GPU on timeout. Only
+ * reset the GPU if locking or unlocking fails.
+ */
+ if (lock_err)
+ if (kbase_prepare_to_reset_gpu_locked(kbdev, RESET_FLAGS_NONE))
+ kbase_reset_gpu_locked(kbdev);
}
KBASE_EXPORT_TEST_API(kbase_mmu_disable);
diff --git a/mali_kbase/mmu/mali_kbase_mmu_hw.h b/mali_kbase/mmu/mali_kbase_mmu_hw.h
index 438dd5e..3291143 100644
--- a/mali_kbase/mmu/mali_kbase_mmu_hw.h
+++ b/mali_kbase/mmu/mali_kbase_mmu_hw.h
@@ -133,6 +133,21 @@ int kbase_mmu_hw_do_unlock_no_addr(struct kbase_device *kbdev, struct kbase_as *
int kbase_mmu_hw_do_unlock(struct kbase_device *kbdev, struct kbase_as *as,
const struct kbase_mmu_hw_op_param *op_param);
/**
+ * kbase_mmu_hw_do_lock - Issue a LOCK operation to the MMU.
+ *
+ * @kbdev: Kbase device to issue the MMU operation on.
+ * @as: Address space to issue the MMU operation on.
+ * @op_param: Pointer to struct containing information about the MMU
+ * operation to perform.
+ *
+ * Context: Acquires the hwaccess_lock, expects the caller to hold the mmu_hw_mutex
+ *
+ * Return: Zero if the operation was successful, non-zero otherwise.
+ */
+int kbase_mmu_hw_do_lock(struct kbase_device *kbdev, struct kbase_as *as,
+ const struct kbase_mmu_hw_op_param *op_param);
+
+/**
* kbase_mmu_hw_do_flush - Issue a flush operation to the MMU.
*
* @kbdev: Kbase device to issue the MMU operation on.
diff --git a/mali_kbase/mmu/mali_kbase_mmu_hw_direct.c b/mali_kbase/mmu/mali_kbase_mmu_hw_direct.c
index 1a6157a..9b41894 100644
--- a/mali_kbase/mmu/mali_kbase_mmu_hw_direct.c
+++ b/mali_kbase/mmu/mali_kbase_mmu_hw_direct.c
@@ -387,7 +387,7 @@ static int mmu_hw_do_lock_no_wait(struct kbase_device *kbdev, struct kbase_as *a
return ret;
}
-static int mmu_hw_do_lock(struct kbase_device *kbdev, struct kbase_as *as,
+int kbase_mmu_hw_do_lock(struct kbase_device *kbdev, struct kbase_as *as,
const struct kbase_mmu_hw_op_param *op_param)
{
int ret;
@@ -550,7 +550,7 @@ int kbase_mmu_hw_do_flush_on_gpu_ctrl(struct kbase_device *kbdev, struct kbase_a
gpu_cmd = GPU_COMMAND_CACHE_CLN_INV_L2;
/* 1. Issue MMU_AS_CONTROL.COMMAND.LOCK operation. */
- ret = mmu_hw_do_lock(kbdev, as, op_param);
+ ret = kbase_mmu_hw_do_lock(kbdev, as, op_param);
if (ret)
return ret;