author     Akash Goel <akash.goel@arm.com>        2022-10-27 22:44:30 +0100
committer  Jeremy Kemp <jeremykemp@google.com>    2022-11-23 10:19:50 +0000
commit     014047d6f3353765c422bf6a5c096908a76ffc32 (patch)
tree       f843642076beb96200f91e46b48ce5a56dd625c3
parent     2a499c8d84a8594fd01888892f55b188419a5bc1 (diff)
GPUCORE-36251: Make HeapContext GPU VA to be GPU cacheline aligned
Customer reported an issue where an unexpected GPU page fault happened due to
the Tiler trying to access a chunk that had already been freed by userspace.
The issue was root caused to cacheline sharing between the HeapContexts of 2
Tiler heaps of the same Kbase context.

The page fault occurred for an application that made use of more than 1 GPU
queue group, where one of the groups, and its corresponding Tiler heap
instance, is created and destroyed multiple times over the lifetime of the
application.

Kbase sub-allocates memory for a HeapContext from a 4KB page that is mapped as
cached on the GPU side, and the memory for a HeapContext is zeroed on
allocation through an uncached CPU mapping. Since the size of a HeapContext is
32 bytes, 2 HeapContexts (corresponding to 2 Tiler heaps of the same context)
can end up sharing the same GPU cacheline (which is 64 bytes in size).

The GPU page fault occurred because FW found a non-NULL, stale value for the
'free_list_head' pointer in the HeapContext even though the heap was newly
created, so FW assumed a free chunk was available and passed its address to
the Tiler instead of raising an OoM event for the Host. The stale value was
found because the zeroing of the new HeapContext's memory on allocation was
lost due to the eviction of the cacheline from the L2 cache. The cacheline
became dirty when FW updated the contents of the older HeapContext (sharing
the cacheline with the new HeapContext) on a CSG suspend operation.

This commit makes the GPU VA of a HeapContext GPU cacheline aligned to avoid
cacheline sharing. The alignment suffices and no explicit cache flush is
needed when a HeapContext is freed, as the whole GPU cache is flushed anyway
on Odin & Turse GPUs when the initial chunks are freed just before the
HeapContext is freed.

Provenance: https://code.ipdelivery.arm.com/c/GPU/mali-ddk/+/4724/
Test: Boot to home
Bug: 259523790
Change-Id: Ie9e8bffcadbd2ca7705dcd44f9be76754e28138d
Signed-off-by: Jeremy Kemp <jeremykemp@google.com>
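For reference, below is a minimal standalone sketch of the align-up idiom the
patch applies. HEAP_CTX_SIZE and GPU_CACHE_LINE_SIZE are illustrative
constants taken from the commit message (32-byte heap context, 64-byte GPU
cacheline), and align_up() is a hypothetical helper mirroring the
(size + line - 1) & ~(line - 1) expression added in
kbase_csf_heap_context_allocator_init(); it is a demonstration, not kernel
code. It shows why the old 32-byte stride lets two heap contexts share one GPU
cacheline and why the aligned stride avoids it.

    #include <stdio.h>

    /*
     * Illustrative values from the commit message: a 32-byte heap context
     * sub-allocated from a region whose GPU cacheline is 64 bytes.
     */
    #define HEAP_CTX_SIZE       32u
    #define GPU_CACHE_LINE_SIZE 64u

    /* Round size up to the next multiple of align (align is a power of 2). */
    static unsigned int align_up(unsigned int size, unsigned int align)
    {
            return (size + align - 1) & ~(align - 1);
    }

    int main(void)
    {
            unsigned int aligned = align_up(HEAP_CTX_SIZE, GPU_CACHE_LINE_SIZE);
            unsigned int i;

            /* Offsets and cacheline indices of the first two sub-allocations. */
            for (i = 0; i < 2; i++) {
                    unsigned int off_old = i * HEAP_CTX_SIZE;
                    unsigned int off_new = i * aligned;

                    printf("ctx%u: old offset %2u (line %u), new offset %2u (line %u)\n",
                           i, off_old, off_old / GPU_CACHE_LINE_SIZE,
                           off_new, off_new / GPU_CACHE_LINE_SIZE);
            }
            /*
             * With the old 32-byte stride both contexts land in GPU cacheline 0;
             * with the 64-byte aligned stride they occupy separate cachelines,
             * so FW writes to one context can no longer dirty (and later
             * clobber) the other.
             */
            return 0;
    }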
-rw-r--r--  mali_kbase/csf/mali_kbase_csf_defs.h                 3
-rw-r--r--  mali_kbase/csf/mali_kbase_csf_heap_context_alloc.c  36
2 files changed, 22 insertions, 17 deletions
diff --git a/mali_kbase/csf/mali_kbase_csf_defs.h b/mali_kbase/csf/mali_kbase_csf_defs.h
index 836b558..af93fb3 100644
--- a/mali_kbase/csf/mali_kbase_csf_defs.h
+++ b/mali_kbase/csf/mali_kbase_csf_defs.h
@@ -599,6 +599,8 @@ struct kbase_csf_cpu_queue_context {
* @lock: Lock preventing concurrent access to the @in_use bitmap.
* @in_use: Bitmap that indicates which heap context structures are currently
* allocated (in @region).
+ * @heap_context_size_aligned: Size of a heap context structure, in bytes,
+ * aligned to GPU cacheline size.
*
* Heap context structures are allocated by the kernel for use by the firmware.
* The current implementation subdivides a single GPU memory region for use as
@@ -610,6 +612,7 @@ struct kbase_csf_heap_context_allocator {
u64 gpu_va;
struct mutex lock;
DECLARE_BITMAP(in_use, MAX_TILER_HEAPS);
+ u32 heap_context_size_aligned;
};
/**
diff --git a/mali_kbase/csf/mali_kbase_csf_heap_context_alloc.c b/mali_kbase/csf/mali_kbase_csf_heap_context_alloc.c
index 1876d50..828e8a6 100644
--- a/mali_kbase/csf/mali_kbase_csf_heap_context_alloc.c
+++ b/mali_kbase/csf/mali_kbase_csf_heap_context_alloc.c
@@ -23,10 +23,7 @@
#include "mali_kbase_csf_heap_context_alloc.h"
/* Size of one heap context structure, in bytes. */
-#define HEAP_CTX_SIZE ((size_t)32)
-
-/* Total size of the GPU memory region allocated for heap contexts, in bytes. */
-#define HEAP_CTX_REGION_SIZE (MAX_TILER_HEAPS * HEAP_CTX_SIZE)
+#define HEAP_CTX_SIZE ((u32)32)
/**
* sub_alloc - Sub-allocate a heap context from a GPU memory region
@@ -38,8 +35,8 @@
static u64 sub_alloc(struct kbase_csf_heap_context_allocator *const ctx_alloc)
{
struct kbase_context *const kctx = ctx_alloc->kctx;
- int heap_nr = 0;
- size_t ctx_offset = 0;
+ unsigned long heap_nr = 0;
+ u32 ctx_offset = 0;
u64 heap_gpu_va = 0;
struct kbase_vmap_struct mapping;
void *ctx_ptr = NULL;
@@ -55,24 +52,24 @@ static u64 sub_alloc(struct kbase_csf_heap_context_allocator *const ctx_alloc)
return 0;
}
- ctx_offset = heap_nr * HEAP_CTX_SIZE;
+ ctx_offset = heap_nr * ctx_alloc->heap_context_size_aligned;
heap_gpu_va = ctx_alloc->gpu_va + ctx_offset;
ctx_ptr = kbase_vmap_prot(kctx, heap_gpu_va,
- HEAP_CTX_SIZE, KBASE_REG_CPU_WR, &mapping);
+ ctx_alloc->heap_context_size_aligned, KBASE_REG_CPU_WR, &mapping);
if (unlikely(!ctx_ptr)) {
dev_err(kctx->kbdev->dev,
- "Failed to map tiler heap context %d (0x%llX)\n",
+ "Failed to map tiler heap context %lu (0x%llX)\n",
heap_nr, heap_gpu_va);
return 0;
}
- memset(ctx_ptr, 0, HEAP_CTX_SIZE);
+ memset(ctx_ptr, 0, ctx_alloc->heap_context_size_aligned);
kbase_vunmap(ctx_ptr, &mapping);
bitmap_set(ctx_alloc->in_use, heap_nr, 1);
- dev_dbg(kctx->kbdev->dev, "Allocated tiler heap context %d (0x%llX)\n",
+ dev_dbg(kctx->kbdev->dev, "Allocated tiler heap context %lu (0x%llX)\n",
heap_nr, heap_gpu_va);
return heap_gpu_va;
@@ -88,7 +85,7 @@ static void sub_free(struct kbase_csf_heap_context_allocator *const ctx_alloc,
u64 const heap_gpu_va)
{
struct kbase_context *const kctx = ctx_alloc->kctx;
- u64 ctx_offset = 0;
+ u32 ctx_offset = 0;
unsigned int heap_nr = 0;
lockdep_assert_held(&ctx_alloc->lock);
@@ -99,13 +96,13 @@ static void sub_free(struct kbase_csf_heap_context_allocator *const ctx_alloc,
if (WARN_ON(heap_gpu_va < ctx_alloc->gpu_va))
return;
- ctx_offset = heap_gpu_va - ctx_alloc->gpu_va;
+ ctx_offset = (u32)(heap_gpu_va - ctx_alloc->gpu_va);
- if (WARN_ON(ctx_offset >= HEAP_CTX_REGION_SIZE) ||
- WARN_ON(ctx_offset % HEAP_CTX_SIZE))
+ if (WARN_ON(ctx_offset >= (ctx_alloc->region->nr_pages << PAGE_SHIFT)) ||
+ WARN_ON(ctx_offset % ctx_alloc->heap_context_size_aligned))
return;
- heap_nr = ctx_offset / HEAP_CTX_SIZE;
+ heap_nr = ctx_offset / ctx_alloc->heap_context_size_aligned;
dev_dbg(kctx->kbdev->dev,
"Freed tiler heap context %d (0x%llX)\n", heap_nr, heap_gpu_va);
@@ -116,12 +113,17 @@ int kbase_csf_heap_context_allocator_init(
struct kbase_csf_heap_context_allocator *const ctx_alloc,
struct kbase_context *const kctx)
{
+ const u32 gpu_cache_line_size =
+ (1U << kctx->kbdev->gpu_props.props.l2_props.log2_line_size);
+
/* We cannot pre-allocate GPU memory here because the
* custom VA zone may not have been created yet.
*/
ctx_alloc->kctx = kctx;
ctx_alloc->region = NULL;
ctx_alloc->gpu_va = 0;
+ ctx_alloc->heap_context_size_aligned =
+ (HEAP_CTX_SIZE + gpu_cache_line_size - 1) & ~(gpu_cache_line_size - 1);
mutex_init(&ctx_alloc->lock);
bitmap_zero(ctx_alloc->in_use, MAX_TILER_HEAPS);
@@ -156,7 +158,7 @@ u64 kbase_csf_heap_context_allocator_alloc(
struct kbase_context *const kctx = ctx_alloc->kctx;
u64 flags = BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR | BASE_MEM_PROT_CPU_WR |
BASEP_MEM_NO_USER_FREE | BASE_MEM_PROT_CPU_RD;
- u64 nr_pages = PFN_UP(HEAP_CTX_REGION_SIZE);
+ u64 nr_pages = PFN_UP(MAX_TILER_HEAPS * ctx_alloc->heap_context_size_aligned);
u64 heap_gpu_va = 0;
/* Calls to this function are inherently asynchronous, with respect to