From a0a0021e080c185aaff8b7173604de78e30bf65b Mon Sep 17 00:00:00 2001
From: Kevin Park
Date: Wed, 26 Jul 2023 12:35:54 +0100
Subject: [Official] MIDCET-4806,GPUCORE-38732 Continue FLUSH_MEM after power
 transition timeout

This patch continues the FLUSH_MEM operation at the function caller and
increases the timeout value for the core power transition.

Change-Id: I8cf5417976ab31b8d1e9e33e6359a91455d37144
(cherry picked from commit 71f7017252aabbb0013a2f151d894959b7c7c226)
Bug: 295942377
Provenance: https://code.ipdelivery.arm.com/c/GPU/mali-ddk/+/5812
---
 mali_kbase/mmu/mali_kbase_mmu_hw_direct.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

(limited to 'mali_kbase')

diff --git a/mali_kbase/mmu/mali_kbase_mmu_hw_direct.c b/mali_kbase/mmu/mali_kbase_mmu_hw_direct.c
index 122e9ef..50eeb4e 100644
--- a/mali_kbase/mmu/mali_kbase_mmu_hw_direct.c
+++ b/mali_kbase/mmu/mali_kbase_mmu_hw_direct.c
@@ -174,7 +174,7 @@ static int write_cmd(struct kbase_device *kbdev, int as_nr, u32 cmd)
 #if MALI_USE_CSF && !IS_ENABLED(CONFIG_MALI_NO_MALI)
 static int wait_cores_power_trans_complete(struct kbase_device *kbdev)
 {
-#define WAIT_TIMEOUT 1000 /* 1ms timeout */
+#define WAIT_TIMEOUT 50000 /* 50ms timeout */
 #define DELAY_TIME_IN_US 1
         const int max_iterations = WAIT_TIMEOUT;
         int loop;
@@ -194,7 +194,9 @@ static int wait_cores_power_trans_complete(struct kbase_device *kbdev)
         }
 
         if (loop == max_iterations) {
-                dev_warn(kbdev->dev, "SHADER_PWRTRANS set for too long");
+                dev_warn(kbdev->dev, "SHADER_PWRTRANS %08x%08x set for too long",
+                         kbase_reg_read(kbdev, GPU_CONTROL_REG(SHADER_PWRTRANS_HI)),
+                         kbase_reg_read(kbdev, GPU_CONTROL_REG(SHADER_PWRTRANS_LO)));
                 return -ETIMEDOUT;
         }
 
@@ -230,9 +232,8 @@ static int apply_hw_issue_GPU2019_3901_wa(struct kbase_device *kbdev,
          * the workaround can be safely skipped.
          */
         if (kbdev->pm.backend.l2_state != KBASE_L2_OFF) {
-                if (*mmu_cmd != AS_COMMAND_FLUSH_MEM) {
-                        dev_warn(kbdev->dev,
-                                 "Unexpected mmu command received");
+                if (unlikely(*mmu_cmd != AS_COMMAND_FLUSH_MEM)) {
+                        dev_warn(kbdev->dev, "Unexpected MMU command(%u) received", *mmu_cmd);
                         ret = -EINVAL;
                         goto unlock;
                 }
@@ -502,8 +503,14 @@ static int mmu_hw_do_flush(struct kbase_device *kbdev, struct kbase_as *as,
         if (mmu_cmd == AS_COMMAND_FLUSH_MEM) {
                 ret = apply_hw_issue_GPU2019_3901_wa(kbdev, &mmu_cmd, as->number, hwaccess_locked);
-                if (ret)
-                        return ret;
+                if (ret) {
+                        dev_warn(kbdev->dev,
+                                 "Failed to apply WA for HW issue when doing MMU flush op on VA range %llx-%llx for AS %u",
+                                 op_param->vpfn << PAGE_SHIFT,
+                                 ((op_param->vpfn + op_param->nr) << PAGE_SHIFT) - 1,
+                                 as->number);
+                        /* Continue with the MMU flush operation */
+                }
         }
 #endif
-- 
cgit v1.2.3


From 662a6ec447752444340d113d6c4d5256930fb894 Mon Sep 17 00:00:00 2001
From: Rivaldo Hong
Date: Tue, 14 Feb 2023 14:39:34 +0900
Subject: [Official] MIDCET-4820,GPUCORE-36255 Sync whole USER_BUFFER pages
 upon GPU mapping

As a result of this change, the whole of each page underlying a memory region
imported as a USER_BUFFER region is now synchronized, and the driver disables
automatic CPU cache synchronization upon DMA map/unmap operations.
(cherry picked from commit 0c2790fb2b006edb3e648a8a9a732e3aadcf2436)
Provenance: https://code.ipdelivery.arm.com/c/GPU/mali-ddk/+/5829
Bug: 295897369
Change-Id: Ifbd656b78e49ae614ca87e156321d0ccd6dd8887
---
 mali_kbase/csf/mali_kbase_csf_kcpu.c |   7 ++
 mali_kbase/mali_kbase_mem.c          | 155 ++++++++++++++++++++++++++++-------
 mali_kbase/mali_kbase_mem_linux.c    |  77 ++++++++++-------
 3 files changed, 181 insertions(+), 58 deletions(-)

(limited to 'mali_kbase')

diff --git a/mali_kbase/csf/mali_kbase_csf_kcpu.c b/mali_kbase/csf/mali_kbase_csf_kcpu.c
index dc66f62..1cb93b4 100644
--- a/mali_kbase/csf/mali_kbase_csf_kcpu.c
+++ b/mali_kbase/csf/mali_kbase_csf_kcpu.c
@@ -80,7 +80,14 @@ static int kbase_kcpu_map_import_prepare(
          * on the physical pages tracking object. When the last
          * reference to the tracking object is dropped the pages
          * would be unpinned if they weren't unpinned before.
+         *
+         * Region should be CPU cached: abort if it isn't.
          */
+        if (WARN_ON(!(reg->flags & KBASE_REG_CPU_CACHED))) {
+                ret = -EINVAL;
+                goto out;
+        }
+
         ret = kbase_jd_user_buf_pin_pages(kctx, reg);
         if (ret)
                 goto out;
diff --git a/mali_kbase/mali_kbase_mem.c b/mali_kbase/mali_kbase_mem.c
index 7c09772..0d00b14 100644
--- a/mali_kbase/mali_kbase_mem.c
+++ b/mali_kbase/mali_kbase_mem.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
 /*
  *
- * (C) COPYRIGHT 2010-2022 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2010-2023 ARM Limited. All rights reserved.
  *
  * This program is free software and is provided to you under the terms of the
  * GNU General Public License version 2 as published by the Free Software
@@ -2059,6 +2059,7 @@ void kbase_sync_single(struct kbase_context *kctx,
                 src = ((unsigned char *)kmap(gpu_page)) + offset;
                 dst = ((unsigned char *)kmap(cpu_page)) + offset;
         }
+
         memcpy(dst, src, size);
         kunmap(gpu_page);
         kunmap(cpu_page);
@@ -4940,10 +4941,7 @@ static int kbase_jd_user_buf_map(struct kbase_context *kctx,
         struct page **pages;
         struct tagged_addr *pa;
         long i, dma_mapped_pages;
-        unsigned long address;
         struct device *dev;
-        unsigned long offset_within_page;
-        unsigned long remaining_size;
         unsigned long gwt_mask = ~0;
         /* Calls to this function are inherently asynchronous, with respect to
          * MMU operations.
@@ -4959,20 +4957,34 @@ static int kbase_jd_user_buf_map(struct kbase_context *kctx,
         alloc = reg->gpu_alloc;
         pa = kbase_get_gpu_phy_pages(reg);
-        address = alloc->imported.user_buf.address;
         pinned_pages = alloc->nents;
         pages = alloc->imported.user_buf.pages;
         dev = kctx->kbdev->dev;
-        offset_within_page = address & ~PAGE_MASK;
-        remaining_size = alloc->imported.user_buf.size;
+        /* Manual CPU cache synchronization.
+         *
+         * The driver disables automatic CPU cache synchronization because the
+         * memory pages that enclose the imported region may also contain
+         * sub-regions which are not imported and that are allocated and used
+         * by the user process. This may be the case of memory at the beginning
+         * of the first page and at the end of the last page. Automatic CPU cache
+         * synchronization would force some operations on those memory allocations,
+         * unbeknown to the user process: in particular, a CPU cache invalidate
+         * upon unmapping would destroy the content of dirty CPU caches and cause
+         * the user process to lose CPU writes to the non-imported sub-regions.
+         *
+         * When the GPU claims ownership of the imported memory buffer, it shall
+         * commit CPU writes for the whole of all pages that enclose the imported
+         * region, otherwise the initial content of memory would be wrong.
+         */
         for (i = 0; i < pinned_pages; i++) {
-                unsigned long map_size =
-                        MIN(PAGE_SIZE - offset_within_page, remaining_size);
-                dma_addr_t dma_addr = dma_map_page(dev, pages[i],
-                                offset_within_page, map_size,
-                                DMA_BIDIRECTIONAL);
-
+                dma_addr_t dma_addr;
+#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE)
+                dma_addr = dma_map_page(dev, pages[i], 0, PAGE_SIZE, DMA_BIDIRECTIONAL);
+#else
+                dma_addr = dma_map_page_attrs(dev, pages[i], 0, PAGE_SIZE, DMA_BIDIRECTIONAL,
+                                              DMA_ATTR_SKIP_CPU_SYNC);
+#endif
                 err = dma_mapping_error(dev, dma_addr);
                 if (err)
                         goto unwind;
@@ -4980,8 +4992,7 @@ static int kbase_jd_user_buf_map(struct kbase_context *kctx,
                 alloc->imported.user_buf.dma_addrs[i] = dma_addr;
                 pa[i] = as_tagged(page_to_phys(pages[i]));
 
-                remaining_size -= map_size;
-                offset_within_page = 0;
+                dma_sync_single_for_device(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
         }
 
 #ifdef CONFIG_MALI_CINSTR_GWT
@@ -4999,19 +5010,26 @@ static int kbase_jd_user_buf_map(struct kbase_context *kctx,
         /* fall down */
 unwind:
         alloc->nents = 0;
-        offset_within_page = address & ~PAGE_MASK;
-        remaining_size = alloc->imported.user_buf.size;
         dma_mapped_pages = i;
-        /* Run the unmap loop in the same order as map loop */
+        /* Run the unmap loop in the same order as map loop, and perform again
+         * CPU cache synchronization to re-write the content of dirty CPU caches
+         * to memory. This is a precautionary measure in case a GPU job has taken
+         * advantage of a partially GPU-mapped range to write and corrupt the
+         * content of memory, either inside or outside the imported region.
+         *
+         * Notice that this error recovery path doesn't try to be optimal and just
+         * flushes the entire page range.
+         */
         for (i = 0; i < dma_mapped_pages; i++) {
-                unsigned long unmap_size =
-                        MIN(PAGE_SIZE - offset_within_page, remaining_size);
+                dma_addr_t dma_addr = alloc->imported.user_buf.dma_addrs[i];
 
-                dma_unmap_page(kctx->kbdev->dev,
-                               alloc->imported.user_buf.dma_addrs[i],
-                               unmap_size, DMA_BIDIRECTIONAL);
-                remaining_size -= unmap_size;
-                offset_within_page = 0;
+                dma_sync_single_for_device(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
+#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE)
+                dma_unmap_page(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
+#else
+                dma_unmap_page_attrs(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL,
+                                     DMA_ATTR_SKIP_CPU_SYNC);
+#endif
         }
 
         /* The user buffer could already have been previously pinned before
@@ -5052,12 +5070,89 @@ static void kbase_jd_user_buf_unmap(struct kbase_context *kctx, struct kbase_mem
 #endif
 
         for (i = 0; i < alloc->imported.user_buf.nr_pages; i++) {
-                unsigned long unmap_size =
-                        MIN(remaining_size, PAGE_SIZE - offset_within_page);
+                unsigned long imported_size = MIN(remaining_size, PAGE_SIZE - offset_within_page);
+                /* Notice: this is a temporary variable that is used for DMA sync
+                 * operations, and that could be incremented by an offset if the
+                 * current page contains both imported and non-imported memory
+                 * sub-regions.
+                 *
+                 * It is valid to add an offset to this value, because the offset
+                 * is always kept within the physically contiguous dma-mapped range
+                 * and there's no need to translate to physical address to offset it.
+                 *
+                 * This variable is not going to be used for the actual DMA unmap
+                 * operation, which shall always use the original DMA address of the
+                 * whole memory page.
+                 */
                 dma_addr_t dma_addr = alloc->imported.user_buf.dma_addrs[i];
 
-                dma_unmap_page(kctx->kbdev->dev, dma_addr, unmap_size,
-                               DMA_BIDIRECTIONAL);
+                /* Manual CPU cache synchronization.
+                 *
+                 * When the GPU returns ownership of the buffer to the CPU, the driver
+                 * needs to treat imported and non-imported memory differently.
+                 *
+                 * The first case to consider is non-imported sub-regions at the
+                 * beginning of the first page and at the end of last page. For these
+                 * sub-regions: CPU cache shall be committed with a clean+invalidate,
+                 * in order to keep the last CPU write.
+                 *
+                 * Imported region prefers the opposite treatment: this memory has been
+                 * legitimately mapped and used by the GPU, hence GPU writes shall be
+                 * committed to memory, while CPU cache shall be invalidated to make
+                 * sure that CPU reads the correct memory content.
+                 *
+                 * The following diagram shows the expected value of the variables
+                 * used in this loop in the corner case of an imported region enclosed
+                 * by a single memory page:
+                 *
+                 * page boundary ->|-----------| <- dma_addr (initial value)
+                 *                 |           |
+                 *                 | - - - - - | <- offset_within_page
+                 *                 |XXXXXXXXXXX|\
+                 *                 |XXXXXXXXXXX| \
+                 *                 |XXXXXXXXXXX| }- imported_size
+                 *                 |XXXXXXXXXXX| /
+                 *                 |XXXXXXXXXXX|/
+                 *                 | - - - - - | <- offset_within_page + imported_size
+                 *                 |           |\
+                 *                 |           | }- PAGE_SIZE - imported_size - offset_within_page
+                 *                 |           |/
+                 * page boundary ->|-----------|
+                 *
+                 * If the imported region is enclosed by more than one page, then
+                 * offset_within_page = 0 for any page after the first.
+                 */
+
+                /* Only for first page: handle non-imported range at the beginning. */
+                if (offset_within_page > 0) {
+                        dma_sync_single_for_device(kctx->kbdev->dev, dma_addr, offset_within_page,
+                                                   DMA_BIDIRECTIONAL);
+                        dma_addr += offset_within_page;
+                }
+
+                /* For every page: handle imported range. */
+                if (imported_size > 0)
+                        dma_sync_single_for_cpu(kctx->kbdev->dev, dma_addr, imported_size,
+                                                DMA_BIDIRECTIONAL);
+
+                /* Only for last page (that may coincide with first page):
+                 * handle non-imported range at the end.
+                 */
+                if ((imported_size + offset_within_page) < PAGE_SIZE) {
+                        dma_addr += imported_size;
+                        dma_sync_single_for_device(kctx->kbdev->dev, dma_addr,
+                                                   PAGE_SIZE - imported_size - offset_within_page,
+                                                   DMA_BIDIRECTIONAL);
+                }
+
+                /* Notice: use the original DMA address to unmap the whole memory page. */
+#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE)
+                dma_unmap_page(kctx->kbdev->dev, alloc->imported.user_buf.dma_addrs[i], PAGE_SIZE,
+                               DMA_BIDIRECTIONAL);
+#else
+                dma_unmap_page_attrs(kctx->kbdev->dev, alloc->imported.user_buf.dma_addrs[i],
+                                     PAGE_SIZE, DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
+#endif
                 if (writeable)
                         set_page_dirty_lock(pages[i]);
 #if !MALI_USE_CSF
@@ -5065,7 +5160,7 @@ static void kbase_jd_user_buf_unmap(struct kbase_context *kctx, struct kbase_mem
                         pages[i] = NULL;
 #endif
 
-                remaining_size -= unmap_size;
+                remaining_size -= imported_size;
                 offset_within_page = 0;
         }
 #if !MALI_USE_CSF
diff --git a/mali_kbase/mali_kbase_mem_linux.c b/mali_kbase/mali_kbase_mem_linux.c
index ffa0ebc..5289532 100644
--- a/mali_kbase/mali_kbase_mem_linux.c
+++ b/mali_kbase/mali_kbase_mem_linux.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
 /*
  *
- * (C) COPYRIGHT 2010-2022 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2010-2023 ARM Limited. All rights reserved.
  *
  * This program is free software and is provided to you under the terms of the
  * GNU General Public License version 2 as published by the Free Software
@@ -36,7 +36,7 @@
 #include
 #include
 #include
-
+#include
 #include
 #include
 #include
@@ -1529,10 +1529,10 @@ static struct kbase_va_region *kbase_mem_from_user_buffer(
         int zone = KBASE_REG_ZONE_CUSTOM_VA;
         bool shared_zone = false;
         u32 cache_line_alignment = kbase_get_cache_line_alignment(kctx->kbdev);
-        unsigned long offset_within_page;
-        unsigned long remaining_size;
         struct kbase_alloc_import_user_buf *user_buf;
         struct page **pages = NULL;
+        struct tagged_addr *pa;
+        struct device *dev;
         int write;
 
         /* Flag supported only for dma-buf imported memory */
@@ -1680,29 +1680,44 @@ static struct kbase_va_region *kbase_mem_from_user_buffer(
         reg->gpu_alloc->nents = 0;
         reg->extension = 0;
 
-        if (pages) {
-                struct device *dev = kctx->kbdev->dev;
-                struct tagged_addr *pa = kbase_get_gpu_phy_pages(reg);
+        pa = kbase_get_gpu_phy_pages(reg);
+        dev = kctx->kbdev->dev;
 
+        if (pages) {
                 /* Top bit signifies that this was pinned on import */
                 user_buf->current_mapping_usage_count |= PINNED_ON_IMPORT;
 
-                offset_within_page = user_buf->address & ~PAGE_MASK;
-                remaining_size = user_buf->size;
+                /* Manual CPU cache synchronization.
+                 *
+                 * The driver disables automatic CPU cache synchronization because the
+                 * memory pages that enclose the imported region may also contain
+                 * sub-regions which are not imported and that are allocated and used
+                 * by the user process. This may be the case of memory at the beginning
+                 * of the first page and at the end of the last page. Automatic CPU cache
+                 * synchronization would force some operations on those memory allocations,
+                 * unbeknown to the user process: in particular, a CPU cache invalidate
+                 * upon unmapping would destroy the content of dirty CPU caches and cause
+                 * the user process to lose CPU writes to the non-imported sub-regions.
+                 *
+                 * When the GPU claims ownership of the imported memory buffer, it shall
+                 * commit CPU writes for the whole of all pages that enclose the imported
+                 * region, otherwise the initial content of memory would be wrong.
+                 */
                 for (i = 0; i < faulted_pages; i++) {
-                        unsigned long map_size =
-                                MIN(PAGE_SIZE - offset_within_page, remaining_size);
-                        dma_addr_t dma_addr = dma_map_page(dev, pages[i],
-                                        offset_within_page, map_size, DMA_BIDIRECTIONAL);
-
+                        dma_addr_t dma_addr;
+#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE)
+                        dma_addr = dma_map_page(dev, pages[i], 0, PAGE_SIZE, DMA_BIDIRECTIONAL);
+#else
+                        dma_addr = dma_map_page_attrs(dev, pages[i], 0, PAGE_SIZE,
+                                                      DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
+#endif
                         if (dma_mapping_error(dev, dma_addr))
                                 goto unwind_dma_map;
 
                         user_buf->dma_addrs[i] = dma_addr;
                         pa[i] = as_tagged(page_to_phys(pages[i]));
 
-                        remaining_size -= map_size;
-                        offset_within_page = 0;
+                        dma_sync_single_for_device(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
                 }
 
                 reg->gpu_alloc->nents = faulted_pages;
@@ -1711,19 +1726,23 @@ static struct kbase_va_region *kbase_mem_from_user_buffer(
         return reg;
 
 unwind_dma_map:
-        offset_within_page = user_buf->address & ~PAGE_MASK;
-        remaining_size = user_buf->size;
         dma_mapped_pages = i;
-        /* Run the unmap loop in the same order as map loop */
+        /* Run the unmap loop in the same order as map loop, and perform again
+         * CPU cache synchronization to re-write the content of dirty CPU caches
+         * to memory. This precautionary measure is kept here to keep this code
+         * aligned with kbase_jd_user_buf_map() to allow for a potential refactor
+         * in the future.
+         */
         for (i = 0; i < dma_mapped_pages; i++) {
-                unsigned long unmap_size =
-                        MIN(PAGE_SIZE - offset_within_page, remaining_size);
+                dma_addr_t dma_addr = user_buf->dma_addrs[i];
 
-                dma_unmap_page(kctx->kbdev->dev,
-                               user_buf->dma_addrs[i],
-                               unmap_size, DMA_BIDIRECTIONAL);
-                remaining_size -= unmap_size;
-                offset_within_page = 0;
+                dma_sync_single_for_device(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
+#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE)
+                dma_unmap_page(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
+#else
+                dma_unmap_page_attrs(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL,
+                                     DMA_ATTR_SKIP_CPU_SYNC);
+#endif
         }
 fault_mismatch:
         if (pages) {
@@ -1743,7 +1762,6 @@
 no_alloc_obj:
 no_region:
 bad_size:
         return NULL;
-
 }
@@ -2026,7 +2044,10 @@ int kbase_mem_import(struct kbase_context *kctx, enum base_mem_import_type type,
                 /* Remove COHERENT_SYSTEM flag if coherent mem is unavailable */
                 *flags &= ~BASE_MEM_COHERENT_SYSTEM;
         }
-
+        if (((*flags & BASE_MEM_CACHED_CPU) == 0) && (type == BASE_MEM_IMPORT_TYPE_USER_BUFFER)) {
+                dev_warn(kctx->kbdev->dev, "USER_BUFFER must be CPU cached");
+                goto bad_flags;
+        }
         if ((padding != 0) && (type != BASE_MEM_IMPORT_TYPE_UMM)) {
                 dev_warn(kctx->kbdev->dev,
                          "padding is only supported for UMM");
-- 
cgit v1.2.3
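
The map-time pattern introduced by the second patch can be read in isolation from the kbase data structures. The sketch below is illustrative only and is not taken verbatim from the driver: my_import_page() and its parameters are hypothetical names, and it assumes a kernel of 4.10 or newer, where dma_map_page_attrs() takes an unsigned long attrs argument. The whole page is mapped with DMA_ATTR_SKIP_CPU_SYNC and then cleaned explicitly, so that dirty CPU cache lines covering non-imported bytes in the same page are written back rather than silently discarded.

#include <linux/device.h>
#include <linux/dma-mapping.h>
#include <linux/errno.h>
#include <linux/mm.h>

/* Hypothetical helper: map one pinned user page for GPU use without
 * automatic CPU cache maintenance, then commit CPU writes explicitly.
 */
static int my_import_page(struct device *dev, struct page *page, dma_addr_t *dma_out)
{
        dma_addr_t dma_addr;

        /* Map the whole page; skip the automatic CPU cache sync on map. */
        dma_addr = dma_map_page_attrs(dev, page, 0, PAGE_SIZE, DMA_BIDIRECTIONAL,
                                      DMA_ATTR_SKIP_CPU_SYNC);
        if (dma_mapping_error(dev, dma_addr))
                return -ENOMEM;

        /* Clean (write back) the whole page so the GPU sees the latest CPU
         * writes, including writes to non-imported bytes sharing the page.
         */
        dma_sync_single_for_device(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);

        *dma_out = dma_addr;
        return 0;
}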
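
At unmap time, the same patch splits each page into a non-imported head, the imported middle and a non-imported tail, as drawn in the diagram inside kbase_jd_user_buf_unmap(). The stand-alone sketch below mirrors that release path under the same assumptions as the previous sketch: my_release_partial_page() is a hypothetical name, and offset_within_page/imported_size carry the same meaning as in the patch. The head and tail are cleaned so CPU writes survive, the imported middle is handed back to the CPU so it reads what the GPU wrote, and the unmap itself uses DMA_ATTR_SKIP_CPU_SYNC on the original page address.

#include <linux/device.h>
#include <linux/dma-mapping.h>
#include <linux/mm.h>

/* Hypothetical helper: give one partially-imported page back to the CPU.
 * page_dma is the DMA address returned when the whole page was mapped.
 */
static void my_release_partial_page(struct device *dev, dma_addr_t page_dma,
                                    unsigned long offset_within_page,
                                    unsigned long imported_size)
{
        dma_addr_t sync_addr = page_dma;

        /* Non-imported head (first page only): write back dirty CPU lines. */
        if (offset_within_page > 0) {
                dma_sync_single_for_device(dev, sync_addr, offset_within_page,
                                           DMA_BIDIRECTIONAL);
                sync_addr += offset_within_page;
        }

        /* Imported middle: return ownership to the CPU so it observes GPU writes. */
        if (imported_size > 0)
                dma_sync_single_for_cpu(dev, sync_addr, imported_size,
                                        DMA_BIDIRECTIONAL);

        /* Non-imported tail (last page only): write back dirty CPU lines. */
        if (offset_within_page + imported_size < PAGE_SIZE) {
                sync_addr += imported_size;
                dma_sync_single_for_device(dev, sync_addr,
                                           PAGE_SIZE - imported_size - offset_within_page,
                                           DMA_BIDIRECTIONAL);
        }

        /* Unmap the whole page without any further automatic maintenance. */
        dma_unmap_page_attrs(dev, page_dma, PAGE_SIZE, DMA_BIDIRECTIONAL,
                             DMA_ATTR_SKIP_CPU_SYNC);
}

For every page after the first, offset_within_page is 0, and for every page before the last the tail branch is not taken, which matches the behaviour described by the in-code diagram.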