author    | Jack Diver <diverj@google.com> | 2022-09-02 11:38:04 +0000
committer | Jack Diver <diverj@google.com> | 2022-09-02 14:33:02 +0000
commit    | c30533582604fe0365bc3ce4e9e8e19dec3109da (patch)
tree      | 2dc4d074c820b535e9f18b8cd81d7e91bff042e5 /mali_kbase/mmu
parent    | 88d7d984fed1c2a4358ce2bbc334e82d71e3a391 (diff)
download  | gpu-c30533582604fe0365bc3ce4e9e8e19dec3109da.tar.gz
Mali Valhall Android DDK r38p1-01eac0
VX504X08X-BU-00000-r38p1-01eac0 - Valhall Android DDK
VX504X08X-BU-60000-r38p1-01eac0 - Valhall Android Document Bundle
VX504X08X-DC-11001-r38p1-01eac0 - Valhall Android DDK Software Errata
VX504X08X-SW-99006-r38p1-01eac0 - Valhall Android Renderscript AOSP parts
Signed-off-by: Jack Diver <diverj@google.com>
Change-Id: I242060ad8ddc14475bda657cbbbe6b6c26ecfd57
Diffstat (limited to 'mali_kbase/mmu')
-rw-r--r-- | mali_kbase/mmu/backend/mali_kbase_mmu_csf.c  |    6
-rw-r--r-- | mali_kbase/mmu/backend/mali_kbase_mmu_jm.c   |    4
-rw-r--r-- | mali_kbase/mmu/mali_kbase_mmu.c              | 1309
-rw-r--r-- | mali_kbase/mmu/mali_kbase_mmu.h              |   14
-rw-r--r-- | mali_kbase/mmu/mali_kbase_mmu_hw.h           |   99
-rw-r--r-- | mali_kbase/mmu/mali_kbase_mmu_hw_direct.c    |  409
-rw-r--r-- | mali_kbase/mmu/mali_kbase_mmu_mode_aarch64.c |   44
7 files changed, 1235 insertions, 650 deletions
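
Much of the change in mali_kbase_mmu.c tracks which page-table levels were actually modified in a dirty_pgds bitmask and converts it into the flush-skip mask consumed by the MMU flush operations, via the pgd_level_to_skip_flush() macro introduced by this patch. The snippet below is a minimal, standalone sketch of that conversion: the macro body is taken verbatim from the patch, while the main() harness and the example bitmask values are illustrative only and not part of the driver.

#include <stdio.h>
#include <stdint.h>

/* Macro from the patch: levels NOT marked dirty are skipped during flush
 * (4 page-table levels, one bit per level).
 */
#define pgd_level_to_skip_flush(dirty_pgds) (~(dirty_pgds) & 0xF)

int main(void)
{
	/* Illustrative values: suppose only the bottom level (level 3) and
	 * level 2 were updated while mapping a range of pages.
	 */
	uint64_t dirty_pgds = (1ULL << 3) | (1ULL << 2);

	/* Levels 0 and 1 were untouched, so their bits are set in the skip
	 * mask and the flush can ignore them. Prints 0x3 for this example.
	 */
	printf("flush_skip_levels = 0x%llx\n",
	       (unsigned long long)pgd_level_to_skip_flush(dirty_pgds));

	return 0;
}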
diff --git a/mali_kbase/mmu/backend/mali_kbase_mmu_csf.c b/mali_kbase/mmu/backend/mali_kbase_mmu_csf.c index c9ba3fc..04f5cdf 100644 --- a/mali_kbase/mmu/backend/mali_kbase_mmu_csf.c +++ b/mali_kbase/mmu/backend/mali_kbase_mmu_csf.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2019-2022 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -152,8 +152,8 @@ void kbase_gpu_report_bus_fault_and_kill(struct kbase_context *kctx, /* terminal fault, print info about the fault */ dev_err(kbdev->dev, - "GPU bus fault in AS%d at VA 0x%016llX\n" - "VA_VALID: %s\n" + "GPU bus fault in AS%d at PA 0x%016llX\n" + "PA_VALID: %s\n" "raw fault status: 0x%X\n" "exception type 0x%X: %s\n" "access type 0x%X: %s\n" diff --git a/mali_kbase/mmu/backend/mali_kbase_mmu_jm.c b/mali_kbase/mmu/backend/mali_kbase_mmu_jm.c index fad5554..3130b33 100644 --- a/mali_kbase/mmu/backend/mali_kbase_mmu_jm.c +++ b/mali_kbase/mmu/backend/mali_kbase_mmu_jm.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2019-2022 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -66,7 +66,7 @@ void kbase_gpu_report_bus_fault_and_kill(struct kbase_context *kctx, /* terminal fault, print info about the fault */ dev_err(kbdev->dev, - "GPU bus fault in AS%d at VA 0x%016llX\n" + "GPU bus fault in AS%d at PA 0x%016llX\n" "raw fault status: 0x%X\n" "exception type 0x%X: %s\n" "exception data 0x%X\n" diff --git a/mali_kbase/mmu/mali_kbase_mmu.c b/mali_kbase/mmu/mali_kbase_mmu.c index 5814b46..3246586 100644 --- a/mali_kbase/mmu/mali_kbase_mmu.c +++ b/mali_kbase/mmu/mali_kbase_mmu.c @@ -49,6 +49,25 @@ #include <mali_kbase_trace_gpu_mem.h> #include <backend/gpu/mali_kbase_pm_internal.h> +/* Threshold used to decide whether to flush full caches or just a physical range */ +#define KBASE_PA_RANGE_THRESHOLD_NR_PAGES 20 +#define MGM_DEFAULT_PTE_GROUP (0) + +/* Macro to convert updated PDGs to flags indicating levels skip in flush */ +#define pgd_level_to_skip_flush(dirty_pgds) (~(dirty_pgds) & 0xF) + +/* Small wrapper function to factor out GPU-dependent context releasing */ +static void release_ctx(struct kbase_device *kbdev, + struct kbase_context *kctx) +{ +#if MALI_USE_CSF + CSTD_UNUSED(kbdev); + kbase_ctx_sched_release_ctx_lock(kctx); +#else /* MALI_USE_CSF */ + kbasep_js_runpool_release_ctx(kbdev, kctx); +#endif /* MALI_USE_CSF */ +} + static void mmu_hw_operation_begin(struct kbase_device *kbdev) { #if !IS_ENABLED(CONFIG_MALI_NO_MALI) @@ -109,112 +128,187 @@ static bool mmu_flush_cache_on_gpu_ctrl(struct kbase_device *kbdev) } /** - * mmu_flush_invalidate_on_gpu_ctrl() - Flush and invalidate the GPU caches - * through GPU_CONTROL interface. - * @kbdev: kbase device to issue the MMU operation on. - * @as: address space to issue the MMU operation on. - * @op_param: parameters for the operation. + * mmu_flush_pa_range() - Flush physical address range * - * This wrapper function alternates AS_COMMAND_FLUSH_PT and AS_COMMAND_FLUSH_MEM - * to equivalent GPU_CONTROL command FLUSH_CACHES. 
- * The function first issue LOCK to MMU-AS with kbase_mmu_hw_do_operation(). - * And issues cache-flush with kbase_gpu_cache_flush_and_busy_wait() function - * then issue UNLOCK to MMU-AS with kbase_mmu_hw_do_operation(). + * @kbdev: kbase device to issue the MMU operation on. + * @phys: Starting address of the physical range to start the operation on. + * @nr_bytes: Number of bytes to work on. + * @op: Type of cache flush operation to perform. * - * Return: Zero if the operation was successful, non-zero otherwise. + * Issue a cache flush physical range command. */ -static int -mmu_flush_invalidate_on_gpu_ctrl(struct kbase_device *kbdev, - struct kbase_as *as, - struct kbase_mmu_hw_op_param *op_param) + +/** + * mmu_invalidate() - Perform an invalidate operation on MMU caches. + * @kbdev: The Kbase device. + * @kctx: The Kbase context. + * @as_nr: GPU address space number for which invalidate is required. + * @op_param: Non-NULL pointer to struct containing information about the MMU + * operation to perform. + * + * Perform an MMU invalidate operation on a particual address space + * by issuing a UNLOCK command. + */ +static void mmu_invalidate(struct kbase_device *kbdev, struct kbase_context *kctx, int as_nr, + const struct kbase_mmu_hw_op_param *op_param) { - u32 flush_op; - int ret, ret2; + int err = 0; + unsigned long flags; - if (WARN_ON(kbdev == NULL) || - WARN_ON(as == NULL) || - WARN_ON(op_param == NULL)) - return -EINVAL; + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - lockdep_assert_held(&kbdev->hwaccess_lock); - lockdep_assert_held(&kbdev->mmu_hw_mutex); + if (kbdev->pm.backend.gpu_powered && (!kctx || kctx->as_nr >= 0)) { + as_nr = kctx ? kctx->as_nr : as_nr; + err = kbase_mmu_hw_do_unlock(kbdev, &kbdev->as[as_nr], op_param); + } - /* Translate operation to command */ - if (op_param->op == KBASE_MMU_OP_FLUSH_PT) { - flush_op = GPU_COMMAND_CACHE_CLN_INV_L2; - } else if (op_param->op == KBASE_MMU_OP_FLUSH_MEM) { - flush_op = GPU_COMMAND_CACHE_CLN_INV_L2_LSC; - } else { - dev_warn(kbdev->dev, "Invalid flush request (op = %d)\n", - op_param->op); - return -EINVAL; + if (err) { + dev_err(kbdev->dev, + "Invalidate after GPU page table update did not complete. Issuing GPU soft-reset to recover"); + if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_HWC_UNRECOVERABLE_ERROR)) + kbase_reset_gpu(kbdev); } - /* 1. Issue MMU_AS_CONTROL.COMMAND.LOCK operation. */ - op_param->op = KBASE_MMU_OP_LOCK; - ret = kbase_mmu_hw_do_operation(kbdev, as, op_param); - if (ret) - return ret; + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); +} + +/* Perform a flush/invalidate on a particular address space + */ +static void mmu_flush_invalidate_as(struct kbase_device *kbdev, struct kbase_as *as, + const struct kbase_mmu_hw_op_param *op_param) +{ + int err = 0; + unsigned long flags; + + /* AS transaction begin */ + mutex_lock(&kbdev->mmu_hw_mutex); + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + + if (kbdev->pm.backend.gpu_powered) + err = kbase_mmu_hw_do_flush_locked(kbdev, as, op_param); - /* 2. Issue GPU_CONTROL.COMMAND.FLUSH_CACHES operation */ - ret = kbase_gpu_cache_flush_and_busy_wait(kbdev, flush_op); + if (err) { + /* Flush failed to complete, assume the GPU has hung and + * perform a reset to recover. + */ + dev_err(kbdev->dev, "Flush for GPU page table update did not complete. Issuing GPU soft-reset to recover"); - /* 3. Issue MMU_AS_CONTROL.COMMAND.UNLOCK operation. 
*/ - op_param->op = KBASE_MMU_OP_UNLOCK; - ret2 = kbase_mmu_hw_do_operation(kbdev, as, op_param); + if (kbase_prepare_to_reset_gpu( + kbdev, RESET_FLAGS_HWC_UNRECOVERABLE_ERROR)) + kbase_reset_gpu(kbdev); + } - return ret ?: ret2; + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + mutex_unlock(&kbdev->mmu_hw_mutex); + /* AS transaction end */ } /** - * kbase_mmu_flush_invalidate() - Flush and invalidate the GPU caches. - * @kctx: The KBase context. - * @vpfn: The virtual page frame number to start the flush on. - * @nr: The number of pages to flush. - * @sync: Set if the operation should be synchronous or not. + * mmu_flush_invalidate() - Perform a flush operation on GPU caches. + * @kbdev: The Kbase device. + * @kctx: The Kbase context. + * @as_nr: GPU address space number for which flush + invalidate is required. + * @op_param: Non-NULL pointer to struct containing information about the MMU + * operation to perform. * - * Issue a cache flush + invalidate to the GPU caches and invalidate the TLBs. + * This function performs the cache flush operation described by @op_param. + * The function retains a reference to the given @kctx and releases it + * after performing the flush operation. * - * If sync is not set then transactions still in flight when the flush is issued - * may use the old page tables and the data they write will not be written out - * to memory, this function returns after the flush has been issued but - * before all accesses which might effect the flushed region have completed. + * If operation is set to KBASE_MMU_OP_FLUSH_PT then this function will issue + * a cache flush + invalidate to the L2 caches and invalidate the TLBs. * - * If sync is set then accesses in the flushed region will be drained - * before data is flush and invalidated through L1, L2 and into memory, - * after which point this function will return. - * @mmu_sync_info: Indicates whether this call is synchronous wrt MMU ops. - */ -static void -kbase_mmu_flush_invalidate(struct kbase_context *kctx, u64 vpfn, size_t nr, - bool sync, - enum kbase_caller_mmu_sync_info mmu_sync_info); - -/** - * kbase_mmu_flush_invalidate_no_ctx() - Flush and invalidate the GPU caches. - * @kbdev: Device pointer. - * @vpfn: The virtual page frame number to start the flush on. - * @nr: The number of pages to flush. - * @sync: Set if the operation should be synchronous or not. - * @as_nr: GPU address space number for which flush + invalidate is required. - * @mmu_sync_info: Indicates whether this call is synchronous wrt MMU ops. + * If operation is set to KBASE_MMU_OP_FLUSH_MEM then this function will issue + * a cache flush + invalidate to the L2 and GPU Load/Store caches as well as + * invalidating the TLBs. * - * This is used for MMU tables which do not belong to a user space context. + * If operation is set to KBASE_MMU_OP_UNLOCK then this function will only + * invalidate the MMU caches and TLBs. */ -static void kbase_mmu_flush_invalidate_no_ctx( - struct kbase_device *kbdev, u64 vpfn, size_t nr, bool sync, int as_nr, - enum kbase_caller_mmu_sync_info mmu_sync_info); +static void mmu_flush_invalidate(struct kbase_device *kbdev, struct kbase_context *kctx, int as_nr, + const struct kbase_mmu_hw_op_param *op_param) +{ + bool ctx_is_in_runpool; + + /* Early out if there is nothing to do */ + if (op_param->nr == 0) + return; + + /* If no context is provided then MMU operation is performed on address + * space which does not belong to user space context. 
Otherwise, retain + * refcount to context provided and release after flush operation. + */ + if (!kctx) { + mmu_flush_invalidate_as(kbdev, &kbdev->as[as_nr], op_param); + } else { +#if !MALI_USE_CSF + mutex_lock(&kbdev->js_data.queue_mutex); + ctx_is_in_runpool = kbase_ctx_sched_inc_refcount(kctx); + mutex_unlock(&kbdev->js_data.queue_mutex); +#else + ctx_is_in_runpool = kbase_ctx_sched_inc_refcount_if_as_valid(kctx); +#endif /* !MALI_USE_CSF */ + + if (ctx_is_in_runpool) { + KBASE_DEBUG_ASSERT(kctx->as_nr != KBASEP_AS_NR_INVALID); + + mmu_flush_invalidate_as(kbdev, &kbdev->as[kctx->as_nr], op_param); + + release_ctx(kbdev, kctx); + } + } +} /** - * kbase_mmu_sync_pgd() - sync page directory to memory when needed. - * @kbdev: Device pointer. - * @handle: Address of DMA region. - * @size: Size of the region to sync. + * mmu_flush_invalidate_on_gpu_ctrl() - Perform a flush operation on GPU caches via + * the GPU_CONTROL interface + * @kbdev: The Kbase device. + * @kctx: The Kbase context. + * @as_nr: GPU address space number for which flush + invalidate is required. + * @op_param: Non-NULL pointer to struct containing information about the MMU + * operation to perform. * - * This should be called after each page directory update. + * Perform a flush/invalidate on a particular address space via the GPU_CONTROL + * interface. */ -static void kbase_mmu_sync_pgd(struct kbase_device *kbdev, - dma_addr_t handle, size_t size) +static void mmu_flush_invalidate_on_gpu_ctrl(struct kbase_device *kbdev, struct kbase_context *kctx, + int as_nr, const struct kbase_mmu_hw_op_param *op_param) +{ + int err = 0; + unsigned long flags; + + /* AS transaction begin */ + mutex_lock(&kbdev->mmu_hw_mutex); + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + + if (kbdev->pm.backend.gpu_powered && (!kctx || kctx->as_nr >= 0)) { + as_nr = kctx ? kctx->as_nr : as_nr; + err = kbase_mmu_hw_do_flush_on_gpu_ctrl(kbdev, &kbdev->as[as_nr], + op_param); + } + + if (err) { + /* Flush failed to complete, assume the GPU has hung and + * perform a reset to recover. + */ + dev_err(kbdev->dev, + "Flush for GPU page table update did not complete. Issuing GPU soft-reset to recover\n"); + + if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_HWC_UNRECOVERABLE_ERROR)) + kbase_reset_gpu(kbdev); + } + + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + mutex_unlock(&kbdev->mmu_hw_mutex); +} + +static void kbase_mmu_sync_pgd_gpu(struct kbase_device *kbdev, struct kbase_context *kctx, + phys_addr_t phys, size_t size, + enum kbase_mmu_op_type flush_op) +{ +} + +static void kbase_mmu_sync_pgd_cpu(struct kbase_device *kbdev, dma_addr_t handle, size_t size) { /* In non-coherent system, ensure the GPU can read * the pages from memory @@ -224,6 +318,34 @@ static void kbase_mmu_sync_pgd(struct kbase_device *kbdev, DMA_TO_DEVICE); } +/** + * kbase_mmu_sync_pgd() - sync page directory to memory when needed. + * @kbdev: Device pointer. + * @kctx: Context pointer. + * @phys: Starting physical address of the destination region. + * @handle: Address of DMA region. + * @size: Size of the region to sync. + * @flush_op: MMU cache flush operation to perform on the physical address + * range, if GPU control is available. + * + * This function is called whenever the association between a virtual address + * range and a physical address range changes, because a mapping is created or + * destroyed. 
+ * One of the effects of this operation is performing an MMU cache flush + * operation only on the physical address range affected by this function, if + * GPU control is available. + * + * This should be called after each page directory update. + */ +static void kbase_mmu_sync_pgd(struct kbase_device *kbdev, struct kbase_context *kctx, + phys_addr_t phys, dma_addr_t handle, size_t size, + enum kbase_mmu_op_type flush_op) +{ + + kbase_mmu_sync_pgd_cpu(kbdev, handle, size); + kbase_mmu_sync_pgd_gpu(kbdev, kctx, phys, size, flush_op); +} + /* * Definitions: * - PGD: Page Directory. @@ -234,34 +356,89 @@ static void kbase_mmu_sync_pgd(struct kbase_device *kbdev, */ static int kbase_mmu_update_pages_no_flush(struct kbase_context *kctx, u64 vpfn, - struct tagged_addr *phys, size_t nr, - unsigned long flags, int group_id); + struct tagged_addr *phys, size_t nr, unsigned long flags, + int group_id, u64 *dirty_pgds); /** * kbase_mmu_update_and_free_parent_pgds() - Update number of valid entries and * free memory of the page directories * - * @kbdev: Device pointer. - * @mmut: GPU MMU page table. - * @pgds: Physical addresses of page directories to be freed. - * @vpfn: The virtual page frame number. - * @level: The level of MMU page table. + * @kbdev: Device pointer. + * @mmut: GPU MMU page table. + * @pgds: Physical addresses of page directories to be freed. + * @vpfn: The virtual page frame number. + * @level: The level of MMU page table. + * @flush_op: The type of MMU flush operation to perform. + * @dirty_pgds: Flags to track every level where a PGD has been updated. + * @free_pgds_list: Linked list of the page directory pages to free. */ static void kbase_mmu_update_and_free_parent_pgds(struct kbase_device *kbdev, - struct kbase_mmu_table *mmut, - phys_addr_t *pgds, u64 vpfn, - int level); + struct kbase_mmu_table *mmut, phys_addr_t *pgds, + u64 vpfn, int level, + enum kbase_mmu_op_type flush_op, u64 *dirty_pgds, + struct list_head *free_pgds_list); /** * kbase_mmu_free_pgd() - Free memory of the page directory * * @kbdev: Device pointer. * @mmut: GPU MMU page table. * @pgd: Physical address of page directory to be freed. - * @dirty: Flag to indicate whether the page may be dirty in the cache. + * + * This function is supposed to be called with mmu_lock held and after + * ensuring that GPU won't be able to access the page. */ -static void kbase_mmu_free_pgd(struct kbase_device *kbdev, - struct kbase_mmu_table *mmut, phys_addr_t pgd, - bool dirty); +static void kbase_mmu_free_pgd(struct kbase_device *kbdev, struct kbase_mmu_table *mmut, + phys_addr_t pgd) +{ + struct page *p; + + lockdep_assert_held(&mmut->mmu_lock); + + p = pfn_to_page(PFN_DOWN(pgd)); + + kbase_mem_pool_free(&kbdev->mem_pools.small[mmut->group_id], p, true); + + atomic_sub(1, &kbdev->memdev.used_pages); + + /* If MMU tables belong to a context then pages will have been accounted + * against it, so we must decrement the usage counts here. + */ + if (mmut->kctx) { + kbase_process_page_usage_dec(mmut->kctx, 1); + atomic_sub(1, &mmut->kctx->used_pages); + } + + kbase_trace_gpu_mem_usage_dec(kbdev, mmut->kctx, 1); +} + +/** + * kbase_mmu_free_pgds_list() - Free the PGD pages present in the list + * + * @kbdev: Device pointer. + * @mmut: GPU MMU page table. + * @free_pgds_list: Linked list of the page directory pages to free. + * + * This function will call kbase_mmu_free_pgd() on each page directory page + * present in the @free_pgds_list. 
+ * + * The function is supposed to be called after the GPU cache and MMU TLB has + * been invalidated post the teardown loop. + */ +static void kbase_mmu_free_pgds_list(struct kbase_device *kbdev, struct kbase_mmu_table *mmut, + struct list_head *free_pgds_list) +{ + struct page *page, *next_page; + + mutex_lock(&mmut->mmu_lock); + + list_for_each_entry_safe(page, next_page, free_pgds_list, lru) { + list_del_init(&page->lru); + kbase_mmu_free_pgd(kbdev, mmut, page_to_phys(page)); + } + + mutex_unlock(&mmut->mmu_lock); +} + /** * reg_grow_calc_extra_pages() - Calculate the number of backed pages to add to * a region on a GPU page fault @@ -345,8 +522,10 @@ static size_t reg_grow_calc_extra_pages(struct kbase_device *kbdev, static void kbase_gpu_mmu_handle_write_faulting_as(struct kbase_device *kbdev, struct kbase_as *faulting_as, u64 start_pfn, size_t nr, - u32 kctx_id) + u32 kctx_id, u64 dirty_pgds) { + int err; + /* Calls to this function are inherently synchronous, with respect to * MMU operations. */ @@ -359,22 +538,23 @@ static void kbase_gpu_mmu_handle_write_faulting_as(struct kbase_device *kbdev, KBASE_MMU_FAULT_TYPE_PAGE); /* flush L2 and unlock the VA (resumes the MMU) */ - op_param = (struct kbase_mmu_hw_op_param){ - .vpfn = start_pfn, - .nr = nr, - .op = KBASE_MMU_OP_FLUSH_PT, - .kctx_id = kctx_id, - .mmu_sync_info = mmu_sync_info, - }; + op_param.vpfn = start_pfn; + op_param.nr = nr; + op_param.op = KBASE_MMU_OP_FLUSH_PT; + op_param.kctx_id = kctx_id; + op_param.mmu_sync_info = mmu_sync_info; if (mmu_flush_cache_on_gpu_ctrl(kbdev)) { unsigned long irq_flags; spin_lock_irqsave(&kbdev->hwaccess_lock, irq_flags); - mmu_flush_invalidate_on_gpu_ctrl(kbdev, faulting_as, &op_param); + op_param.flush_skip_levels = + pgd_level_to_skip_flush(dirty_pgds); + err = kbase_mmu_hw_do_flush_on_gpu_ctrl(kbdev, faulting_as, + &op_param); spin_unlock_irqrestore(&kbdev->hwaccess_lock, irq_flags); } else { mmu_hw_operation_begin(kbdev); - kbase_mmu_hw_do_operation(kbdev, faulting_as, &op_param); + err = kbase_mmu_hw_do_flush(kbdev, faulting_as, &op_param); mmu_hw_operation_end(kbdev); } @@ -414,6 +594,7 @@ static void kbase_gpu_mmu_handle_write_fault(struct kbase_context *kctx, u64 fault_pfn, pfn_offset; int ret; int as_no; + u64 dirty_pgds = 0; as_no = faulting_as->number; kbdev = container_of(faulting_as, struct kbase_device, as[as_no]); @@ -472,12 +653,11 @@ static void kbase_gpu_mmu_handle_write_fault(struct kbase_context *kctx, } /* Now make this faulting page writable to GPU. 
*/ - ret = kbase_mmu_update_pages_no_flush(kctx, fault_pfn, - fault_phys_addr, - 1, region->flags, region->gpu_alloc->group_id); + ret = kbase_mmu_update_pages_no_flush(kctx, fault_pfn, fault_phys_addr, 1, region->flags, + region->gpu_alloc->group_id, &dirty_pgds); kbase_gpu_mmu_handle_write_faulting_as(kbdev, faulting_as, fault_pfn, 1, - kctx->id); + kctx->id, dirty_pgds); kbase_gpu_vm_unlock(kctx); } @@ -712,18 +892,6 @@ static bool page_fault_try_alloc(struct kbase_context *kctx, return true; } -/* Small wrapper function to factor out GPU-dependent context releasing */ -static void release_ctx(struct kbase_device *kbdev, - struct kbase_context *kctx) -{ -#if MALI_USE_CSF - CSTD_UNUSED(kbdev); - kbase_ctx_sched_release_ctx_lock(kctx); -#else /* MALI_USE_CSF */ - kbasep_js_runpool_release_ctx(kbdev, kctx); -#endif /* MALI_USE_CSF */ -} - void kbase_mmu_page_fault_worker(struct work_struct *data) { u64 fault_pfn; @@ -938,16 +1106,29 @@ page_fault_retry: * transaction (which should cause the other page fault to be * raised again). */ - op_param = (struct kbase_mmu_hw_op_param){ - .vpfn = 0, - .nr = 0, - .op = KBASE_MMU_OP_UNLOCK, - .kctx_id = kctx->id, - .mmu_sync_info = mmu_sync_info, - }; - mmu_hw_operation_begin(kbdev); - kbase_mmu_hw_do_operation(kbdev, faulting_as, &op_param); - mmu_hw_operation_end(kbdev); + op_param.mmu_sync_info = mmu_sync_info; + op_param.kctx_id = kctx->id; + if (!mmu_flush_cache_on_gpu_ctrl(kbdev)) { + mmu_hw_operation_begin(kbdev); + err = kbase_mmu_hw_do_unlock_no_addr(kbdev, faulting_as, + &op_param); + mmu_hw_operation_end(kbdev); + } else { + /* Can safely skip the invalidate for all levels in case + * of duplicate page faults. + */ + op_param.flush_skip_levels = 0xF; + op_param.vpfn = fault_pfn; + op_param.nr = 1; + err = kbase_mmu_hw_do_unlock(kbdev, faulting_as, + &op_param); + } + + if (err) { + dev_err(kbdev->dev, + "Invalidation for MMU did not complete on handling page fault @ 0x%llx", + fault->addr); + } mutex_unlock(&kbdev->mmu_hw_mutex); @@ -975,16 +1156,29 @@ page_fault_retry: KBASE_MMU_FAULT_TYPE_PAGE); /* See comment [1] about UNLOCK usage */ - op_param = (struct kbase_mmu_hw_op_param){ - .vpfn = 0, - .nr = 0, - .op = KBASE_MMU_OP_UNLOCK, - .kctx_id = kctx->id, - .mmu_sync_info = mmu_sync_info, - }; - mmu_hw_operation_begin(kbdev); - kbase_mmu_hw_do_operation(kbdev, faulting_as, &op_param); - mmu_hw_operation_end(kbdev); + op_param.mmu_sync_info = mmu_sync_info; + op_param.kctx_id = kctx->id; + if (!mmu_flush_cache_on_gpu_ctrl(kbdev)) { + mmu_hw_operation_begin(kbdev); + err = kbase_mmu_hw_do_unlock_no_addr(kbdev, faulting_as, + &op_param); + mmu_hw_operation_end(kbdev); + } else { + /* Can safely skip the invalidate for all levels in case + * of duplicate page faults. + */ + op_param.flush_skip_levels = 0xF; + op_param.vpfn = fault_pfn; + op_param.nr = 1; + err = kbase_mmu_hw_do_unlock(kbdev, faulting_as, + &op_param); + } + + if (err) { + dev_err(kbdev->dev, + "Invalidation for MMU did not complete on handling page fault @ 0x%llx", + fault->addr); + } mutex_unlock(&kbdev->mmu_hw_mutex); @@ -1009,6 +1203,7 @@ page_fault_retry: spin_unlock(&kctx->mem_partials_lock); if (grown) { + u64 dirty_pgds = 0; u64 pfn_offset; struct kbase_mmu_hw_op_param op_param; @@ -1027,9 +1222,10 @@ page_fault_retry: * us to unlock the MMU as we see fit. 
*/ err = kbase_mmu_insert_pages_no_flush(kbdev, &kctx->mmu, - region->start_pfn + pfn_offset, - &kbase_get_gpu_phy_pages(region)[pfn_offset], - new_pages, region->flags, region->gpu_alloc->group_id); + region->start_pfn + pfn_offset, + &kbase_get_gpu_phy_pages(region)[pfn_offset], + new_pages, region->flags, + region->gpu_alloc->group_id, &dirty_pgds); if (err) { kbase_free_phy_pages_helper(region->gpu_alloc, new_pages); @@ -1048,7 +1244,7 @@ page_fault_retry: (u64)new_pages); trace_mali_mmu_page_fault_grow(region, fault, new_pages); -#if MALI_INCREMENTAL_RENDERING +#if MALI_INCREMENTAL_RENDERING_JM /* Switch to incremental rendering if we have nearly run out of * memory in a JIT memory allocation. */ @@ -1084,25 +1280,22 @@ page_fault_retry: kbase_mmu_hw_clear_fault(kbdev, faulting_as, KBASE_MMU_FAULT_TYPE_PAGE); - /* flush L2 and unlock the VA (resumes the MMU) */ - op_param = (struct kbase_mmu_hw_op_param){ - .vpfn = fault->addr >> PAGE_SHIFT, - .nr = new_pages, - .op = KBASE_MMU_OP_FLUSH_PT, - .kctx_id = kctx->id, - .mmu_sync_info = mmu_sync_info, - }; + op_param.vpfn = region->start_pfn + pfn_offset; + op_param.nr = new_pages; + op_param.op = KBASE_MMU_OP_FLUSH_PT; + op_param.kctx_id = kctx->id; + op_param.mmu_sync_info = mmu_sync_info; if (mmu_flush_cache_on_gpu_ctrl(kbdev)) { - unsigned long irq_flags; - - spin_lock_irqsave(&kbdev->hwaccess_lock, irq_flags); - err = mmu_flush_invalidate_on_gpu_ctrl(kbdev, faulting_as, - &op_param); - spin_unlock_irqrestore(&kbdev->hwaccess_lock, irq_flags); + /* Unlock to invalidate the TLB (and resume the MMU) */ + op_param.flush_skip_levels = + pgd_level_to_skip_flush(dirty_pgds); + err = kbase_mmu_hw_do_unlock(kbdev, faulting_as, + &op_param); } else { + /* flush L2 and unlock the VA (resumes the MMU) */ mmu_hw_operation_begin(kbdev); - err = kbase_mmu_hw_do_operation(kbdev, faulting_as, - &op_param); + err = kbase_mmu_hw_do_flush(kbdev, faulting_as, + &op_param); mmu_hw_operation_end(kbdev); } @@ -1219,17 +1412,19 @@ static phys_addr_t kbase_mmu_alloc_pgd(struct kbase_device *kbdev, struct kbase_mmu_table *mmut) { u64 *page; - int i; struct page *p; + phys_addr_t pgd; p = kbase_mem_pool_alloc(&kbdev->mem_pools.small[mmut->group_id]); if (!p) - return 0; + return KBASE_MMU_INVALID_PGD_ADDRESS; page = kmap(p); if (page == NULL) goto alloc_free; + pgd = page_to_phys(p); + /* If the MMU tables belong to a context then account the memory usage * to that context, otherwise the MMU tables are device wide and are * only accounted to the device. @@ -1250,26 +1445,28 @@ static phys_addr_t kbase_mmu_alloc_pgd(struct kbase_device *kbdev, kbase_trace_gpu_mem_usage_inc(kbdev, mmut->kctx, 1); - for (i = 0; i < KBASE_MMU_PAGE_ENTRIES; i++) - kbdev->mmu_mode->entry_invalidate(&page[i]); + kbdev->mmu_mode->entries_invalidate(page, KBASE_MMU_PAGE_ENTRIES); - kbase_mmu_sync_pgd(kbdev, kbase_dma_addr(p), PAGE_SIZE); + /* As this page is newly created, therefore there is no content to + * clean or invalidate in the GPU caches. 
+ */ + kbase_mmu_sync_pgd_cpu(kbdev, kbase_dma_addr(p), PAGE_SIZE); kunmap(p); - return page_to_phys(p); + return pgd; alloc_free: kbase_mem_pool_free(&kbdev->mem_pools.small[mmut->group_id], p, false); - return 0; + return KBASE_MMU_INVALID_PGD_ADDRESS; } /* Given PGD PFN for level N, return PGD PFN for level N+1, allocating the * new table from the pool if needed and possible */ -static int mmu_get_next_pgd(struct kbase_device *kbdev, - struct kbase_mmu_table *mmut, - phys_addr_t *pgd, u64 vpfn, int level) +static int mmu_get_next_pgd(struct kbase_device *kbdev, struct kbase_mmu_table *mmut, + phys_addr_t *pgd, u64 vpfn, int level, bool *newly_created_pgd, + u64 *dirty_pgds) { u64 *page; phys_addr_t target_pgd; @@ -1293,21 +1490,49 @@ static int mmu_get_next_pgd(struct kbase_device *kbdev, return -EINVAL; } - target_pgd = kbdev->mmu_mode->pte_to_phy_addr(page[vpfn]); + if (!kbdev->mmu_mode->pte_is_valid(page[vpfn], level)) { + enum kbase_mmu_op_type flush_op = KBASE_MMU_OP_NONE; + unsigned int current_valid_entries; + u64 managed_pte; - if (!target_pgd) { target_pgd = kbase_mmu_alloc_pgd(kbdev, mmut); - if (!target_pgd) { + if (target_pgd == KBASE_MMU_INVALID_PGD_ADDRESS) { dev_dbg(kbdev->dev, "%s: kbase_mmu_alloc_pgd failure\n", __func__); kunmap(p); return -ENOMEM; } - kbdev->mmu_mode->entry_set_pte(page, vpfn, target_pgd); + current_valid_entries = kbdev->mmu_mode->get_num_valid_entries(page); + kbdev->mmu_mode->entry_set_pte(&managed_pte, target_pgd); + page[vpfn] = kbdev->mgm_dev->ops.mgm_update_gpu_pte( + kbdev->mgm_dev, MGM_DEFAULT_PTE_GROUP, level, managed_pte); + kbdev->mmu_mode->set_num_valid_entries(page, current_valid_entries + 1); - kbase_mmu_sync_pgd(kbdev, kbase_dma_addr(p), PAGE_SIZE); /* Rely on the caller to update the address space flags. */ + if (newly_created_pgd && !*newly_created_pgd) { + *newly_created_pgd = true; + /* If code reaches here we know parent PGD of target PGD was + * not newly created and should be flushed. + */ + flush_op = KBASE_MMU_OP_FLUSH_PT; + + if (dirty_pgds) + *dirty_pgds |= 1ULL << level; + } + + /* A new valid entry is added to an existing PGD. Perform the + * invalidate operation for GPU cache as it could be having a + * cacheline that contains the entry (in an invalid form). 
+ */ + kbase_mmu_sync_pgd(kbdev, mmut->kctx, + *pgd + (vpfn * sizeof(u64)), + kbase_dma_addr(p) + (vpfn * sizeof(u64)), + sizeof(u64), flush_op); + } else { + target_pgd = kbdev->mmu_mode->pte_to_phy_addr( + kbdev->mgm_dev->ops.mgm_pte_to_original_pte( + kbdev->mgm_dev, MGM_DEFAULT_PTE_GROUP, level, page[vpfn])); } kunmap(p); @@ -1319,11 +1544,9 @@ static int mmu_get_next_pgd(struct kbase_device *kbdev, /* * Returns the PGD for the specified level of translation */ -static int mmu_get_pgd_at_level(struct kbase_device *kbdev, - struct kbase_mmu_table *mmut, - u64 vpfn, - int level, - phys_addr_t *out_pgd) +static int mmu_get_pgd_at_level(struct kbase_device *kbdev, struct kbase_mmu_table *mmut, u64 vpfn, + int level, phys_addr_t *out_pgd, bool *newly_created_pgd, + u64 *dirty_pgds) { phys_addr_t pgd; int l; @@ -1332,7 +1555,8 @@ static int mmu_get_pgd_at_level(struct kbase_device *kbdev, pgd = mmut->pgd; for (l = MIDGARD_MMU_TOPLEVEL; l < level; l++) { - int err = mmu_get_next_pgd(kbdev, mmut, &pgd, vpfn, l); + int err = + mmu_get_next_pgd(kbdev, mmut, &pgd, vpfn, l, newly_created_pgd, dirty_pgds); /* Handle failure condition */ if (err) { dev_dbg(kbdev->dev, @@ -1347,20 +1571,18 @@ static int mmu_get_pgd_at_level(struct kbase_device *kbdev, return 0; } -static int mmu_get_bottom_pgd(struct kbase_device *kbdev, - struct kbase_mmu_table *mmut, - u64 vpfn, - phys_addr_t *out_pgd) +static int mmu_get_bottom_pgd(struct kbase_device *kbdev, struct kbase_mmu_table *mmut, u64 vpfn, + phys_addr_t *out_pgd, bool *newly_created_pgd, u64 *dirty_pgds) { - return mmu_get_pgd_at_level(kbdev, mmut, vpfn, MIDGARD_MMU_BOTTOMLEVEL, - out_pgd); + return mmu_get_pgd_at_level(kbdev, mmut, vpfn, MIDGARD_MMU_BOTTOMLEVEL, out_pgd, + newly_created_pgd, dirty_pgds); } static void mmu_insert_pages_failure_recovery(struct kbase_device *kbdev, - struct kbase_mmu_table *mmut, - u64 from_vpfn, u64 to_vpfn) + struct kbase_mmu_table *mmut, u64 from_vpfn, + u64 to_vpfn, u64 *dirty_pgds, + struct list_head *free_pgds_list) { - phys_addr_t pgd; u64 vpfn = from_vpfn; struct kbase_mmu_mode const *mmu_mode; @@ -1373,7 +1595,6 @@ static void mmu_insert_pages_failure_recovery(struct kbase_device *kbdev, mmu_mode = kbdev->mmu_mode; while (vpfn < to_vpfn) { - unsigned int i; unsigned int idx = vpfn & 0x1FF; unsigned int count = KBASE_MMU_PAGE_ENTRIES - idx; unsigned int pcount = 0; @@ -1381,6 +1602,8 @@ static void mmu_insert_pages_failure_recovery(struct kbase_device *kbdev, int level; u64 *page; phys_addr_t pgds[MIDGARD_MMU_BOTTOMLEVEL + 1]; + phys_addr_t pgd = mmut->pgd; + struct page *p = phys_to_page(pgd); register unsigned int num_of_valid_entries; @@ -1388,17 +1611,17 @@ static void mmu_insert_pages_failure_recovery(struct kbase_device *kbdev, count = left; /* need to check if this is a 2MB page or a 4kB */ - pgd = mmut->pgd; - for (level = MIDGARD_MMU_TOPLEVEL; level <= MIDGARD_MMU_BOTTOMLEVEL; level++) { idx = (vpfn >> ((3 - level) * 9)) & 0x1FF; pgds[level] = pgd; - page = kmap(phys_to_page(pgd)); + page = kmap(p); if (mmu_mode->ate_is_valid(page[idx], level)) break; /* keep the mapping */ - kunmap(phys_to_page(pgd)); - pgd = mmu_mode->pte_to_phy_addr(page[idx]); + kunmap(p); + pgd = mmu_mode->pte_to_phy_addr(kbdev->mgm_dev->ops.mgm_pte_to_original_pte( + kbdev->mgm_dev, MGM_DEFAULT_PTE_GROUP, level, page[idx])); + p = phys_to_page(pgd); } switch (level) { @@ -1416,38 +1639,79 @@ static void mmu_insert_pages_failure_recovery(struct kbase_device *kbdev, goto next; } + if (dirty_pgds && pcount > 0) + *dirty_pgds |= 1ULL << 
level; + num_of_valid_entries = mmu_mode->get_num_valid_entries(page); if (WARN_ON_ONCE(num_of_valid_entries < pcount)) num_of_valid_entries = 0; else num_of_valid_entries -= pcount; + /* Invalidate the entries we added */ + mmu_mode->entries_invalidate(&page[idx], pcount); + if (!num_of_valid_entries) { - kunmap(phys_to_page(pgd)); + kunmap(p); - kbase_mmu_free_pgd(kbdev, mmut, pgd, true); + list_add(&p->lru, free_pgds_list); - kbase_mmu_update_and_free_parent_pgds(kbdev, mmut, pgds, - vpfn, level); + kbase_mmu_update_and_free_parent_pgds(kbdev, mmut, pgds, vpfn, level, + KBASE_MMU_OP_NONE, dirty_pgds, + free_pgds_list); vpfn += count; continue; } - /* Invalidate the entries we added */ - for (i = 0; i < pcount; i++) - mmu_mode->entry_invalidate(&page[idx + i]); - mmu_mode->set_num_valid_entries(page, num_of_valid_entries); - kbase_mmu_sync_pgd(kbdev, - kbase_dma_addr(phys_to_page(pgd)) + 8 * idx, - 8 * pcount); - kunmap(phys_to_page(pgd)); + /* MMU cache flush strategy is NONE because GPU cache maintenance is + * going to be done by the caller + */ + kbase_mmu_sync_pgd(kbdev, mmut->kctx, pgd + (idx * sizeof(u64)), + kbase_dma_addr(p) + sizeof(u64) * idx, sizeof(u64) * pcount, + KBASE_MMU_OP_NONE); + kunmap(p); next: vpfn += count; } } +static void mmu_flush_invalidate_insert_pages(struct kbase_device *kbdev, + struct kbase_mmu_table *mmut, const u64 vpfn, + size_t nr, u64 dirty_pgds, + enum kbase_caller_mmu_sync_info mmu_sync_info) +{ + struct kbase_mmu_hw_op_param op_param; + int as_nr = 0; + + op_param.vpfn = vpfn; + op_param.nr = nr; + op_param.op = KBASE_MMU_OP_FLUSH_PT; + op_param.mmu_sync_info = mmu_sync_info; + op_param.kctx_id = mmut->kctx ? mmut->kctx->id : 0xFFFFFFFF; + op_param.flush_skip_levels = pgd_level_to_skip_flush(dirty_pgds); + +#if MALI_USE_CSF + as_nr = mmut->kctx ? mmut->kctx->as_nr : MCU_AS_NR; +#else + WARN_ON(!mmut->kctx); +#endif + + /* MMU cache flush strategy depends on whether GPU control commands for + * flushing physical address ranges are supported. The new physical pages + * are not present in GPU caches therefore they don't need any cache + * maintenance, but PGDs in the page table may or may not be created anew. + * + * Operations that affect the whole GPU cache shall only be done if it's + * impossible to update physical ranges. + */ + if (mmu_flush_cache_on_gpu_ctrl(kbdev)) + mmu_invalidate(kbdev, mmut->kctx, as_nr, &op_param); + else + mmu_flush_invalidate(kbdev, mmut->kctx, as_nr, &op_param); +} + /* * Map the single page 'phys' 'nr' of times, starting at GPU PFN 'vpfn' */ @@ -1467,6 +1731,9 @@ int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 vpfn, size_t remain = nr; int err; struct kbase_device *kbdev; + enum kbase_mmu_op_type flush_op; + u64 dirty_pgds = 0; + LIST_HEAD(free_pgds_list); if (WARN_ON(kctx == NULL)) return -EINVAL; @@ -1488,6 +1755,7 @@ int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 vpfn, unsigned int count = KBASE_MMU_PAGE_ENTRIES - index; struct page *p; register unsigned int num_of_valid_entries; + bool newly_created_pgd = false; if (count > remain) count = remain; @@ -1500,8 +1768,8 @@ int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 vpfn, * 256 pages at once (on average). Do we really care? 
*/ do { - err = mmu_get_bottom_pgd(kbdev, &kctx->mmu, - vpfn, &pgd); + err = mmu_get_bottom_pgd(kbdev, &kctx->mmu, vpfn, &pgd, &newly_created_pgd, + &dirty_pgds); if (err != -ENOMEM) break; /* Fill the memory pool with enough pages for @@ -1521,10 +1789,9 @@ int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 vpfn, /* Invalidate the pages we have partially * completed */ - mmu_insert_pages_failure_recovery(kbdev, - &kctx->mmu, - start_vpfn, - start_vpfn + recover_count); + mmu_insert_pages_failure_recovery(kbdev, &kctx->mmu, start_vpfn, + start_vpfn + recover_count, + &dirty_pgds, &free_pgds_list); } goto fail_unlock; } @@ -1537,10 +1804,9 @@ int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 vpfn, /* Invalidate the pages we have partially * completed */ - mmu_insert_pages_failure_recovery(kbdev, - &kctx->mmu, - start_vpfn, - start_vpfn + recover_count); + mmu_insert_pages_failure_recovery(kbdev, &kctx->mmu, start_vpfn, + start_vpfn + recover_count, + &dirty_pgds, &free_pgds_list); } err = -ENOMEM; goto fail_unlock; @@ -1565,9 +1831,21 @@ int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 vpfn, vpfn += count; remain -= count; - kbase_mmu_sync_pgd(kbdev, - kbase_dma_addr(p) + (index * sizeof(u64)), - count * sizeof(u64)); + if (count > 0 && !newly_created_pgd) + dirty_pgds |= 1ULL << MIDGARD_MMU_BOTTOMLEVEL; + + /* MMU cache flush operation here will depend on whether bottom level + * PGD is newly created or not. + * + * If bottom level PGD is newly created then no cache maintenance is + * required as the PGD will not exist in GPU cache. Otherwise GPU cache + * maintenance is required for existing PGD. + */ + flush_op = newly_created_pgd ? KBASE_MMU_OP_NONE : KBASE_MMU_OP_FLUSH_PT; + + kbase_mmu_sync_pgd(kbdev, kctx, pgd + (index * sizeof(u64)), + kbase_dma_addr(p) + (index * sizeof(u64)), count * sizeof(u64), + flush_op); kunmap(p); /* We have started modifying the page table. @@ -1578,39 +1856,20 @@ int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 vpfn, recover_count += count; } mutex_unlock(&kctx->mmu.mmu_lock); - kbase_mmu_flush_invalidate(kctx, start_vpfn, nr, false, mmu_sync_info); + + mmu_flush_invalidate_insert_pages(kbdev, &kctx->mmu, start_vpfn, nr, dirty_pgds, + mmu_sync_info); + return 0; fail_unlock: mutex_unlock(&kctx->mmu.mmu_lock); - kbase_mmu_flush_invalidate(kctx, start_vpfn, nr, false, mmu_sync_info); - return err; -} - -static void kbase_mmu_free_pgd(struct kbase_device *kbdev, - struct kbase_mmu_table *mmut, phys_addr_t pgd, - bool dirty) -{ - struct page *p; - - lockdep_assert_held(&mmut->mmu_lock); - - p = pfn_to_page(PFN_DOWN(pgd)); - - kbase_mem_pool_free(&kbdev->mem_pools.small[mmut->group_id], - p, dirty); - atomic_sub(1, &kbdev->memdev.used_pages); - - /* If MMU tables belong to a context then pages will have been accounted - * against it, so we must decrement the usage counts here. 
- */ - if (mmut->kctx) { - kbase_process_page_usage_dec(mmut->kctx, 1); - atomic_sub(1, &mmut->kctx->used_pages); - } + mmu_flush_invalidate_insert_pages(kbdev, &kctx->mmu, start_vpfn, nr, dirty_pgds, + mmu_sync_info); + kbase_mmu_free_pgds_list(kbdev, &kctx->mmu, &free_pgds_list); - kbase_trace_gpu_mem_usage_dec(kbdev, mmut->kctx, 1); + return err; } u64 kbase_mmu_create_ate(struct kbase_device *const kbdev, @@ -1624,12 +1883,9 @@ u64 kbase_mmu_create_ate(struct kbase_device *const kbdev, group_id, level, entry); } -int kbase_mmu_insert_pages_no_flush(struct kbase_device *kbdev, - struct kbase_mmu_table *mmut, - const u64 start_vpfn, - struct tagged_addr *phys, size_t nr, - unsigned long flags, - int const group_id) +int kbase_mmu_insert_pages_no_flush(struct kbase_device *kbdev, struct kbase_mmu_table *mmut, + const u64 start_vpfn, struct tagged_addr *phys, size_t nr, + unsigned long flags, int const group_id, u64 *dirty_pgds) { phys_addr_t pgd; u64 *pgd_page; @@ -1637,6 +1893,7 @@ int kbase_mmu_insert_pages_no_flush(struct kbase_device *kbdev, size_t remain = nr; int err; struct kbase_mmu_mode const *mmu_mode; + LIST_HEAD(free_pgds_list); /* Note that 0 is a valid start_vpfn */ /* 64-bit address range is the max */ @@ -1657,6 +1914,8 @@ int kbase_mmu_insert_pages_no_flush(struct kbase_device *kbdev, struct page *p; int cur_level; register unsigned int num_of_valid_entries; + enum kbase_mmu_op_type flush_op; + bool newly_created_pgd = false; if (count > remain) count = remain; @@ -1674,8 +1933,8 @@ int kbase_mmu_insert_pages_no_flush(struct kbase_device *kbdev, * 256 pages at once (on average). Do we really care? */ do { - err = mmu_get_pgd_at_level(kbdev, mmut, insert_vpfn, - cur_level, &pgd); + err = mmu_get_pgd_at_level(kbdev, mmut, insert_vpfn, cur_level, &pgd, + &newly_created_pgd, dirty_pgds); if (err != -ENOMEM) break; /* Fill the memory pool with enough pages for @@ -1689,14 +1948,14 @@ int kbase_mmu_insert_pages_no_flush(struct kbase_device *kbdev, } while (!err); if (err) { - dev_warn(kbdev->dev, - "%s: mmu_get_bottom_pgd failure\n", __func__); + dev_warn(kbdev->dev, "%s: mmu_get_pgd_at_level failure\n", __func__); if (insert_vpfn != start_vpfn) { /* Invalidate the pages we have partially * completed */ - mmu_insert_pages_failure_recovery(kbdev, - mmut, start_vpfn, insert_vpfn); + mmu_insert_pages_failure_recovery(kbdev, mmut, start_vpfn, + insert_vpfn, dirty_pgds, + &free_pgds_list); } goto fail_unlock; } @@ -1710,8 +1969,9 @@ int kbase_mmu_insert_pages_no_flush(struct kbase_device *kbdev, /* Invalidate the pages we have partially * completed */ - mmu_insert_pages_failure_recovery(kbdev, - mmut, start_vpfn, insert_vpfn); + mmu_insert_pages_failure_recovery(kbdev, mmut, start_vpfn, + insert_vpfn, dirty_pgds, + &free_pgds_list); } err = -ENOMEM; goto fail_unlock; @@ -1722,18 +1982,8 @@ int kbase_mmu_insert_pages_no_flush(struct kbase_device *kbdev, if (cur_level == MIDGARD_MMU_LEVEL(2)) { int level_index = (insert_vpfn >> 9) & 0x1FF; - u64 *target = &pgd_page[level_index]; - - if (mmu_mode->pte_is_valid(*target, cur_level)) { - kbase_mmu_free_pgd( - kbdev, mmut, - kbdev->mmu_mode->pte_to_phy_addr( - *target), - false); - num_of_valid_entries--; - } - *target = kbase_mmu_create_ate(kbdev, *phys, flags, - cur_level, group_id); + pgd_page[level_index] = + kbase_mmu_create_ate(kbdev, *phys, flags, cur_level, group_id); num_of_valid_entries++; } else { @@ -1758,21 +2008,43 @@ int kbase_mmu_insert_pages_no_flush(struct kbase_device *kbdev, mmu_mode->set_num_valid_entries(pgd_page, 
num_of_valid_entries); + if (dirty_pgds && count > 0 && !newly_created_pgd) + *dirty_pgds |= 1ULL << cur_level; + phys += count; insert_vpfn += count; remain -= count; - kbase_mmu_sync_pgd(kbdev, - kbase_dma_addr(p) + (vindex * sizeof(u64)), - count * sizeof(u64)); + /* For the most part, the creation of a new virtual memory mapping does + * not require cache flush operations, because the operation results + * into the creation of new memory pages which are not present in GPU + * caches. Therefore the defaul operation is NONE. + * + * However, it is quite common for the mapping to start and/or finish + * at an already existing PGD. Moreover, the PTEs modified are not + * necessarily aligned with GPU cache lines. Therefore, GPU cache + * maintenance is required for existing PGDs. + */ + flush_op = newly_created_pgd ? KBASE_MMU_OP_NONE : KBASE_MMU_OP_FLUSH_PT; + + kbase_mmu_sync_pgd(kbdev, mmut->kctx, pgd + (vindex * sizeof(u64)), + kbase_dma_addr(p) + (vindex * sizeof(u64)), count * sizeof(u64), + flush_op); kunmap(p); } - err = 0; + mutex_unlock(&mmut->mmu_lock); + + return 0; fail_unlock: mutex_unlock(&mmut->mmu_lock); + + mmu_flush_invalidate_insert_pages(kbdev, mmut, start_vpfn, nr, *dirty_pgds, + CALLER_MMU_ASYNC); + kbase_mmu_free_pgds_list(kbdev, mmut, &free_pgds_list); + return err; } @@ -1787,24 +2059,27 @@ int kbase_mmu_insert_pages(struct kbase_device *kbdev, enum kbase_caller_mmu_sync_info mmu_sync_info) { int err; + u64 dirty_pgds = 0; + LIST_HEAD(free_pgds_list); - err = kbase_mmu_insert_pages_no_flush(kbdev, mmut, vpfn, - phys, nr, flags, group_id); + /* Early out if there is nothing to do */ + if (nr == 0) + return 0; - if (mmut->kctx) - kbase_mmu_flush_invalidate(mmut->kctx, vpfn, nr, false, - mmu_sync_info); - else - kbase_mmu_flush_invalidate_no_ctx(kbdev, vpfn, nr, false, as_nr, - mmu_sync_info); + err = kbase_mmu_insert_pages_no_flush(kbdev, mmut, vpfn, phys, nr, flags, group_id, + &dirty_pgds); + if (err) + return err; - return err; + mmu_flush_invalidate_insert_pages(kbdev, mmut, vpfn, nr, dirty_pgds, mmu_sync_info); + + return 0; } KBASE_EXPORT_TEST_API(kbase_mmu_insert_pages); /** - * kbase_mmu_flush_invalidate_noretain() - Flush and invalidate the GPU caches + * kbase_mmu_flush_noretain() - Flush and invalidate the GPU caches * without retaining the kbase context. * @kctx: The KBase context. * @vpfn: The virtual page frame number to start the flush on. @@ -1813,17 +2088,15 @@ KBASE_EXPORT_TEST_API(kbase_mmu_insert_pages); * As per kbase_mmu_flush_invalidate but doesn't retain the kctx or do any * other locking. */ -static void kbase_mmu_flush_invalidate_noretain(struct kbase_context *kctx, - u64 vpfn, size_t nr) +static void kbase_mmu_flush_noretain(struct kbase_context *kctx, u64 vpfn, size_t nr) { struct kbase_device *kbdev = kctx->kbdev; - struct kbase_mmu_hw_op_param op_param; int err; - /* Calls to this function are inherently asynchronous, with respect to * MMU operations. 
*/ const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + struct kbase_mmu_hw_op_param op_param; lockdep_assert_held(&kctx->kbdev->hwaccess_lock); lockdep_assert_held(&kctx->kbdev->mmu_hw_mutex); @@ -1833,155 +2106,32 @@ static void kbase_mmu_flush_invalidate_noretain(struct kbase_context *kctx, return; /* flush L2 and unlock the VA (resumes the MMU) */ - op_param = (struct kbase_mmu_hw_op_param){ - .vpfn = vpfn, - .nr = nr, - .op = KBASE_MMU_OP_FLUSH_MEM, - .kctx_id = kctx->id, - .mmu_sync_info = mmu_sync_info, - }; - + op_param.vpfn = vpfn; + op_param.nr = nr; + op_param.op = KBASE_MMU_OP_FLUSH_MEM; + op_param.kctx_id = kctx->id; + op_param.mmu_sync_info = mmu_sync_info; if (mmu_flush_cache_on_gpu_ctrl(kbdev)) { - err = mmu_flush_invalidate_on_gpu_ctrl( - kbdev, &kbdev->as[kctx->as_nr], &op_param); + /* Value used to prevent skipping of any levels when flushing */ + op_param.flush_skip_levels = pgd_level_to_skip_flush(0xF); + err = kbase_mmu_hw_do_flush_on_gpu_ctrl(kbdev, &kbdev->as[kctx->as_nr], + &op_param); } else { - err = kbase_mmu_hw_do_operation(kbdev, &kbdev->as[kctx->as_nr], - &op_param); + err = kbase_mmu_hw_do_flush_locked(kbdev, &kbdev->as[kctx->as_nr], + &op_param); } if (err) { /* Flush failed to complete, assume the * GPU has hung and perform a reset to recover */ - dev_err(kbdev->dev, "Flush for GPU page table update did not complete. Issuing GPU soft-reset to recover\n"); + dev_err(kbdev->dev, "Flush for GPU page table update did not complete. Issuing GPU soft-reset to recover"); if (kbase_prepare_to_reset_gpu_locked(kbdev, RESET_FLAGS_NONE)) kbase_reset_gpu_locked(kbdev); } } -/* Perform a flush/invalidate on a particular address space - */ -static void -kbase_mmu_flush_invalidate_as(struct kbase_device *kbdev, struct kbase_as *as, - u64 vpfn, size_t nr, bool sync, u32 kctx_id, - enum kbase_caller_mmu_sync_info mmu_sync_info) -{ - int err; - bool gpu_powered; - unsigned long flags; - struct kbase_mmu_hw_op_param op_param; - - spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - gpu_powered = kbdev->pm.backend.gpu_powered; - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); - - /* GPU is off so there's no need to perform flush/invalidate. - * But even if GPU is not actually powered down, after gpu_powered flag - * was set to false, it is still safe to skip the flush/invalidate. - * The TLB invalidation will anyways be performed due to AS_COMMAND_UPDATE - * which is sent when address spaces are restored after gpu_powered flag - * is set to true. Flushing of L2 cache is certainly not required as L2 - * cache is definitely off if gpu_powered is false. - */ - if (!gpu_powered) - return; - - if (kbase_pm_context_active_handle_suspend(kbdev, - KBASE_PM_SUSPEND_HANDLER_DONT_REACTIVATE)) { - /* GPU has just been powered off due to system suspend. - * So again, no need to perform flush/invalidate. 
- */ - return; - } - - /* AS transaction begin */ - mutex_lock(&kbdev->mmu_hw_mutex); - - op_param = (struct kbase_mmu_hw_op_param){ - .vpfn = vpfn, - .nr = nr, - .kctx_id = kctx_id, - .mmu_sync_info = mmu_sync_info, - }; - - if (sync) - op_param.op = KBASE_MMU_OP_FLUSH_MEM; - else - op_param.op = KBASE_MMU_OP_FLUSH_PT; - - if (mmu_flush_cache_on_gpu_ctrl(kbdev)) { - spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - err = mmu_flush_invalidate_on_gpu_ctrl(kbdev, as, &op_param); - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); - } else { - mmu_hw_operation_begin(kbdev); - err = kbase_mmu_hw_do_operation(kbdev, as, &op_param); - mmu_hw_operation_end(kbdev); - } - - if (err) { - /* Flush failed to complete, assume the GPU has hung and - * perform a reset to recover - */ - dev_err(kbdev->dev, "Flush for GPU page table update did not complete. Issuing GPU soft-reset to recover\n"); - - if (kbase_prepare_to_reset_gpu( - kbdev, RESET_FLAGS_HWC_UNRECOVERABLE_ERROR)) - kbase_reset_gpu(kbdev); - } - - mutex_unlock(&kbdev->mmu_hw_mutex); - /* AS transaction end */ - - kbase_pm_context_idle(kbdev); -} - -static void -kbase_mmu_flush_invalidate_no_ctx(struct kbase_device *kbdev, u64 vpfn, - size_t nr, bool sync, int as_nr, - enum kbase_caller_mmu_sync_info mmu_sync_info) -{ - /* Skip if there is nothing to do */ - if (nr) { - kbase_mmu_flush_invalidate_as(kbdev, &kbdev->as[as_nr], vpfn, - nr, sync, 0xFFFFFFFF, - mmu_sync_info); - } -} - -static void -kbase_mmu_flush_invalidate(struct kbase_context *kctx, u64 vpfn, size_t nr, - bool sync, - enum kbase_caller_mmu_sync_info mmu_sync_info) -{ - struct kbase_device *kbdev; - bool ctx_is_in_runpool; - - /* Early out if there is nothing to do */ - if (nr == 0) - return; - - kbdev = kctx->kbdev; -#if !MALI_USE_CSF - mutex_lock(&kbdev->js_data.queue_mutex); - ctx_is_in_runpool = kbase_ctx_sched_inc_refcount(kctx); - mutex_unlock(&kbdev->js_data.queue_mutex); -#else - ctx_is_in_runpool = kbase_ctx_sched_inc_refcount_if_as_valid(kctx); -#endif /* !MALI_USE_CSF */ - - if (ctx_is_in_runpool) { - KBASE_DEBUG_ASSERT(kctx->as_nr != KBASEP_AS_NR_INVALID); - - kbase_mmu_flush_invalidate_as(kbdev, &kbdev->as[kctx->as_nr], - vpfn, nr, sync, kctx->id, - mmu_sync_info); - - release_ctx(kbdev, kctx); - } -} - void kbase_mmu_update(struct kbase_device *kbdev, struct kbase_mmu_table *mmut, int as_nr) @@ -2021,7 +2171,7 @@ void kbase_mmu_disable(struct kbase_context *kctx) * The job scheduler code will already be holding the locks and context * so just do the flush. 
*/ - kbase_mmu_flush_invalidate_noretain(kctx, 0, ~0); + kbase_mmu_flush_noretain(kctx, 0, ~0); kctx->kbdev->mmu_mode->disable_as(kctx->kbdev, kctx->as_nr); #if !MALI_USE_CSF @@ -2037,9 +2187,10 @@ void kbase_mmu_disable(struct kbase_context *kctx) KBASE_EXPORT_TEST_API(kbase_mmu_disable); static void kbase_mmu_update_and_free_parent_pgds(struct kbase_device *kbdev, - struct kbase_mmu_table *mmut, - phys_addr_t *pgds, u64 vpfn, - int level) + struct kbase_mmu_table *mmut, phys_addr_t *pgds, + u64 vpfn, int level, + enum kbase_mmu_op_type flush_op, u64 *dirty_pgds, + struct list_head *free_pgds_list) { int current_level; @@ -2047,39 +2198,94 @@ static void kbase_mmu_update_and_free_parent_pgds(struct kbase_device *kbdev, for (current_level = level - 1; current_level >= MIDGARD_MMU_LEVEL(0); current_level--) { - u64 *current_page = kmap(phys_to_page(pgds[current_level])); + phys_addr_t current_pgd = pgds[current_level]; + struct page *p = phys_to_page(current_pgd); + u64 *current_page = kmap(p); unsigned int current_valid_entries = kbdev->mmu_mode->get_num_valid_entries(current_page); + int index = (vpfn >> ((3 - current_level) * 9)) & 0x1FF; + + /* We need to track every level that needs updating */ + if (dirty_pgds) + *dirty_pgds |= 1ULL << current_level; + kbdev->mmu_mode->entries_invalidate(¤t_page[index], 1); if (current_valid_entries == 1 && current_level != MIDGARD_MMU_LEVEL(0)) { - kunmap(phys_to_page(pgds[current_level])); - - kbase_mmu_free_pgd(kbdev, mmut, pgds[current_level], - true); - } else { - int index = (vpfn >> ((3 - current_level) * 9)) & 0x1FF; + kunmap(p); - kbdev->mmu_mode->entry_invalidate(¤t_page[index]); + /* Ensure the cacheline containing the last valid entry + * of PGD is invalidated from the GPU cache, before the + * PGD page is freed. + */ + kbase_mmu_sync_pgd_gpu(kbdev, mmut->kctx, + current_pgd + (index * sizeof(u64)), + sizeof(u64), flush_op); + list_add(&p->lru, free_pgds_list); + } else { current_valid_entries--; kbdev->mmu_mode->set_num_valid_entries( current_page, current_valid_entries); - kbase_mmu_sync_pgd(kbdev, - kbase_dma_addr(phys_to_page( - pgds[current_level])) + - 8 * index, - 8 * 1); + kunmap(p); - kunmap(phys_to_page(pgds[current_level])); + kbase_mmu_sync_pgd(kbdev, mmut->kctx, current_pgd + (index * sizeof(u64)), + kbase_dma_addr(p) + (index * sizeof(u64)), sizeof(u64), + flush_op); break; } } } -/* +/** + * mmu_flush_invalidate_teardown_pages() - Perform flush operation after unmapping pages. + * + * @kbdev: Pointer to kbase device. + * @kctx: Pointer to kbase context. + * @as_nr: Address space number, for GPU cache maintenance operations + * that happen outside a specific kbase context. + * @phys: Array of physical pages to flush. + * @op_param: Non-NULL pointer to struct containing information about the flush + * operation to perform. + * + * This function will do one of three things: + * 1. Invalidate the MMU caches, followed by a partial GPU cache flush of the + * individual pages that were unmapped if feature is supported on GPU. + * 2. Perform a full GPU cache flush through the GPU_CONTROL interface if feature is + * supported on GPU or, + * 3. Perform a full GPU cache flush through the MMU_CONTROL interface. 
+ */ +static void mmu_flush_invalidate_teardown_pages(struct kbase_device *kbdev, + struct kbase_context *kctx, int as_nr, + struct tagged_addr *phys, + struct kbase_mmu_hw_op_param *op_param) +{ + + if (!mmu_flush_cache_on_gpu_ctrl(kbdev)) { + mmu_flush_invalidate(kbdev, kctx, as_nr, op_param); + return; + } else if (op_param->op == KBASE_MMU_OP_FLUSH_MEM) { + mmu_flush_invalidate_on_gpu_ctrl(kbdev, kctx, as_nr, op_param); + return; + } + +} + +/** + * kbase_mmu_teardown_pages - Remove GPU virtual addresses from the MMU page table + * + * @kbdev: Pointer to kbase device. + * @mmut: Pointer to GPU MMU page table. + * @vpfn: Start page frame number of the GPU virtual pages to unmap. + * @phys: Array of physical pages currently mapped to the virtual + * pages to unmap, or NULL. This is only used for GPU cache + * maintenance. + * @nr: Number of pages to unmap. + * @as_nr: Address space number, for GPU cache maintenance operations + * that happen outside a specific kbase context. + * * We actually discard the ATE and free the page table pages if no valid entries * exist in PGD. * @@ -2088,15 +2294,25 @@ static void kbase_mmu_update_and_free_parent_pgds(struct kbase_device *kbdev, * These locks must be taken in the correct order with respect to others * already held by the caller. Refer to kbasep_js_runpool_release_ctx() for more * information. + * + * The @p phys pointer to physical pages is not necessary for unmapping virtual memory, + * but it is used for fine-grained GPU cache maintenance. If @p phys is NULL, + * GPU cache maintenance will be done as usual, that is invalidating the whole GPU caches + * instead of specific physical address ranges. + * + * Return: 0 on success, otherwise an error code. */ -int kbase_mmu_teardown_pages(struct kbase_device *kbdev, - struct kbase_mmu_table *mmut, u64 vpfn, size_t nr, int as_nr) +int kbase_mmu_teardown_pages(struct kbase_device *kbdev, struct kbase_mmu_table *mmut, u64 vpfn, + struct tagged_addr *phys, size_t nr, int as_nr) { - phys_addr_t pgd; u64 start_vpfn = vpfn; size_t requested_nr = nr; + enum kbase_mmu_op_type flush_op = KBASE_MMU_OP_NONE; struct kbase_mmu_mode const *mmu_mode; + struct kbase_mmu_hw_op_param op_param; int err = -EFAULT; + u64 dirty_pgds = 0; + LIST_HEAD(free_pgds_list); /* Calls to this function are inherently asynchronous, with respect to * MMU operations. @@ -2108,12 +2324,25 @@ int kbase_mmu_teardown_pages(struct kbase_device *kbdev, return 0; } + /* MMU cache flush strategy depends on the number of pages to unmap. In both cases + * the operation is invalidate but the granularity of cache maintenance may change + * according to the situation. + * + * If GPU control command operations are present and the number of pages is "small", + * then the optimal strategy is flushing on the physical address range of the pages + * which are affected by the operation. That implies both the PGDs which are modified + * or removed from the page table and the physical pages which are freed from memory. + * + * Otherwise, there's no alternative to invalidating the whole GPU cache. 
+ */ + if (mmu_flush_cache_on_gpu_ctrl(kbdev) && phys && nr <= KBASE_PA_RANGE_THRESHOLD_NR_PAGES) + flush_op = KBASE_MMU_OP_FLUSH_PT; + mutex_lock(&mmut->mmu_lock); mmu_mode = kbdev->mmu_mode; while (nr) { - unsigned int i; unsigned int index = vpfn & 0x1FF; unsigned int count = KBASE_MMU_PAGE_ENTRIES - index; unsigned int pcount; @@ -2121,19 +2350,19 @@ int kbase_mmu_teardown_pages(struct kbase_device *kbdev, u64 *page; phys_addr_t pgds[MIDGARD_MMU_BOTTOMLEVEL + 1]; register unsigned int num_of_valid_entries; + phys_addr_t pgd = mmut->pgd; + struct page *p = phys_to_page(pgd); if (count > nr) count = nr; - /* need to check if this is a 2MB or a 4kB page */ - pgd = mmut->pgd; - + /* need to check if this is a 2MB page or a 4kB */ for (level = MIDGARD_MMU_TOPLEVEL; level <= MIDGARD_MMU_BOTTOMLEVEL; level++) { phys_addr_t next_pgd; index = (vpfn >> ((3 - level) * 9)) & 0x1FF; - page = kmap(phys_to_page(pgd)); + page = kmap(p); if (mmu_mode->ate_is_valid(page[index], level)) break; /* keep the mapping */ else if (!mmu_mode->pte_is_valid(page[index], level)) { @@ -2156,10 +2385,13 @@ int kbase_mmu_teardown_pages(struct kbase_device *kbdev, count = nr; goto next; } - next_pgd = mmu_mode->pte_to_phy_addr(page[index]); + next_pgd = mmu_mode->pte_to_phy_addr( + kbdev->mgm_dev->ops.mgm_pte_to_original_pte( + kbdev->mgm_dev, MGM_DEFAULT_PTE_GROUP, level, page[index])); + kunmap(p); pgds[level] = pgd; - kunmap(phys_to_page(pgd)); pgd = next_pgd; + p = phys_to_page(pgd); } switch (level) { @@ -2168,7 +2400,7 @@ int kbase_mmu_teardown_pages(struct kbase_device *kbdev, dev_warn(kbdev->dev, "%s: No support for ATEs at level %d\n", __func__, level); - kunmap(phys_to_page(pgd)); + kunmap(p); goto out; case MIDGARD_MMU_LEVEL(2): /* can only teardown if count >= 512 */ @@ -2194,50 +2426,66 @@ int kbase_mmu_teardown_pages(struct kbase_device *kbdev, continue; } + if (pcount > 0) + dirty_pgds |= 1ULL << level; + num_of_valid_entries = mmu_mode->get_num_valid_entries(page); if (WARN_ON_ONCE(num_of_valid_entries < pcount)) num_of_valid_entries = 0; else num_of_valid_entries -= pcount; + /* Invalidate the entries we added */ + mmu_mode->entries_invalidate(&page[index], pcount); + if (!num_of_valid_entries) { - kunmap(phys_to_page(pgd)); + kunmap(p); + + /* Ensure the cacheline(s) containing the last valid entries + * of PGD is invalidated from the GPU cache, before the + * PGD page is freed. + */ + kbase_mmu_sync_pgd_gpu(kbdev, mmut->kctx, + pgd + (index * sizeof(u64)), + pcount * sizeof(u64), flush_op); - kbase_mmu_free_pgd(kbdev, mmut, pgd, true); + list_add(&p->lru, &free_pgds_list); - kbase_mmu_update_and_free_parent_pgds(kbdev, mmut, pgds, - vpfn, level); + kbase_mmu_update_and_free_parent_pgds(kbdev, mmut, pgds, vpfn, level, + flush_op, &dirty_pgds, + &free_pgds_list); vpfn += count; nr -= count; continue; } - /* Invalidate the entries we added */ - for (i = 0; i < pcount; i++) - mmu_mode->entry_invalidate(&page[index + i]); - mmu_mode->set_num_valid_entries(page, num_of_valid_entries); - kbase_mmu_sync_pgd( - kbdev, kbase_dma_addr(phys_to_page(pgd)) + 8 * index, - 8 * pcount); + kbase_mmu_sync_pgd(kbdev, mmut->kctx, pgd + (index * sizeof(u64)), + kbase_dma_addr(p) + (index * sizeof(u64)), pcount * sizeof(u64), + flush_op); next: - kunmap(phys_to_page(pgd)); + kunmap(p); vpfn += count; nr -= count; } err = 0; out: mutex_unlock(&mmut->mmu_lock); + /* Set up MMU operation parameters. See above about MMU cache flush strategy. 
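	 * For example, if only the level 2 and level 3 PGDs were modified while
	 * unmapping, dirty_pgds has bits 2 and 3 set, so flush_skip_levels ends up
	 * with bits 0 and 1 set and the flush may skip the untouched top levels on
	 * GPUs that support selective level flushing.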
*/ + op_param = (struct kbase_mmu_hw_op_param){ + .vpfn = start_vpfn, + .nr = requested_nr, + .mmu_sync_info = mmu_sync_info, + .kctx_id = mmut->kctx ? mmut->kctx->id : 0xFFFFFFFF, + .op = (flush_op == KBASE_MMU_OP_FLUSH_PT) ? KBASE_MMU_OP_FLUSH_PT : + KBASE_MMU_OP_FLUSH_MEM, + .flush_skip_levels = pgd_level_to_skip_flush(dirty_pgds), + }; + mmu_flush_invalidate_teardown_pages(kbdev, mmut->kctx, as_nr, phys, &op_param); - if (mmut->kctx) - kbase_mmu_flush_invalidate(mmut->kctx, start_vpfn, requested_nr, - true, mmu_sync_info); - else - kbase_mmu_flush_invalidate_no_ctx(kbdev, start_vpfn, - requested_nr, true, as_nr, - mmu_sync_info); + kbase_mmu_free_pgds_list(kbdev, mmut, &free_pgds_list); return err; } @@ -2257,6 +2505,7 @@ KBASE_EXPORT_TEST_API(kbase_mmu_teardown_pages); * @flags: Flags * @group_id: The physical memory group in which the page was allocated. * Valid range is 0..(MEMORY_GROUP_MANAGER_NR_GROUPS-1). + * @dirty_pgds: Flags to track every level where a PGD has been updated. * * This will update page table entries that already exist on the GPU based on * the new flags that are passed (the physical pages pointed to by the page @@ -2269,8 +2518,8 @@ KBASE_EXPORT_TEST_API(kbase_mmu_teardown_pages); * successfully, otherwise an error code. */ static int kbase_mmu_update_pages_no_flush(struct kbase_context *kctx, u64 vpfn, - struct tagged_addr *phys, size_t nr, - unsigned long flags, int const group_id) + struct tagged_addr *phys, size_t nr, unsigned long flags, + int const group_id, u64 *dirty_pgds) { phys_addr_t pgd; u64 *pgd_page; @@ -2304,7 +2553,8 @@ static int kbase_mmu_update_pages_no_flush(struct kbase_context *kctx, u64 vpfn, if (is_huge(*phys) && (index == index_in_large_page(*phys))) cur_level = MIDGARD_MMU_LEVEL(2); - err = mmu_get_pgd_at_level(kbdev, &kctx->mmu, vpfn, cur_level, &pgd); + err = mmu_get_pgd_at_level(kbdev, &kctx->mmu, vpfn, cur_level, &pgd, NULL, + dirty_pgds); if (WARN_ON(err)) goto fail_unlock; @@ -2331,9 +2581,9 @@ static int kbase_mmu_update_pages_no_flush(struct kbase_context *kctx, u64 vpfn, pgd_page[level_index] = kbase_mmu_create_ate(kbdev, *target_phys, flags, MIDGARD_MMU_LEVEL(2), group_id); - kbase_mmu_sync_pgd(kbdev, - kbase_dma_addr(p) + (level_index * sizeof(u64)), - sizeof(u64)); + kbase_mmu_sync_pgd(kbdev, kctx, pgd + (level_index * sizeof(u64)), + kbase_dma_addr(p) + (level_index * sizeof(u64)), + sizeof(u64), KBASE_MMU_OP_NONE); } else { for (i = 0; i < count; i++) { #ifdef CONFIG_MALI_DEBUG @@ -2345,14 +2595,21 @@ static int kbase_mmu_update_pages_no_flush(struct kbase_context *kctx, u64 vpfn, phys[i], flags, MIDGARD_MMU_BOTTOMLEVEL, group_id); } - kbase_mmu_sync_pgd(kbdev, - kbase_dma_addr(p) + (index * sizeof(u64)), - count * sizeof(u64)); + + /* MMU cache flush strategy is NONE because GPU cache maintenance + * will be done by the caller. + */ + kbase_mmu_sync_pgd(kbdev, kctx, pgd + (index * sizeof(u64)), + kbase_dma_addr(p) + (index * sizeof(u64)), + count * sizeof(u64), KBASE_MMU_OP_NONE); } kbdev->mmu_mode->set_num_valid_entries(pgd_page, num_of_valid_entries); + if (dirty_pgds && count > 0) + *dirty_pgds |= 1ULL << cur_level; + phys += count; vpfn += count; nr -= count; @@ -2373,15 +2630,29 @@ int kbase_mmu_update_pages(struct kbase_context *kctx, u64 vpfn, unsigned long flags, int const group_id) { int err; + struct kbase_mmu_hw_op_param op_param; + u64 dirty_pgds = 0; /* Calls to this function are inherently asynchronous, with respect to * MMU operations. 
*/ const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; - err = kbase_mmu_update_pages_no_flush(kctx, vpfn, phys, nr, flags, - group_id); - kbase_mmu_flush_invalidate(kctx, vpfn, nr, true, mmu_sync_info); + err = kbase_mmu_update_pages_no_flush(kctx, vpfn, phys, nr, flags, group_id, &dirty_pgds); + + op_param = (const struct kbase_mmu_hw_op_param){ + .vpfn = vpfn, + .nr = nr, + .op = KBASE_MMU_OP_FLUSH_MEM, + .kctx_id = kctx->id, + .mmu_sync_info = mmu_sync_info, + .flush_skip_levels = pgd_level_to_skip_flush(dirty_pgds), + }; + + if (mmu_flush_cache_on_gpu_ctrl(kctx->kbdev)) + mmu_flush_invalidate_on_gpu_ctrl(kctx->kbdev, kctx, kctx->as_nr, &op_param); + else + mmu_flush_invalidate(kctx->kbdev, kctx, kctx->as_nr, &op_param); return err; } @@ -2389,47 +2660,45 @@ static void mmu_teardown_level(struct kbase_device *kbdev, struct kbase_mmu_table *mmut, phys_addr_t pgd, int level) { - phys_addr_t target_pgd; u64 *pgd_page; int i; - struct kbase_mmu_mode const *mmu_mode; - u64 *pgd_page_buffer; + struct memory_group_manager_device *mgm_dev = kbdev->mgm_dev; + struct kbase_mmu_mode const *mmu_mode = kbdev->mmu_mode; + u64 *pgd_page_buffer = NULL; lockdep_assert_held(&mmut->mmu_lock); - /* Early-out. No need to kmap to check entries for L3 PGD. */ - if (level == MIDGARD_MMU_BOTTOMLEVEL) { - kbase_mmu_free_pgd(kbdev, mmut, pgd, true); - return; - } - pgd_page = kmap_atomic(pfn_to_page(PFN_DOWN(pgd))); /* kmap_atomic should NEVER fail. */ if (WARN_ON(pgd_page == NULL)) return; - /* Copy the page to our preallocated buffer so that we can minimize - * kmap_atomic usage - */ - pgd_page_buffer = mmut->mmu_teardown_pages[level]; - memcpy(pgd_page_buffer, pgd_page, PAGE_SIZE); + if (level != MIDGARD_MMU_BOTTOMLEVEL) { + /* Copy the page to our preallocated buffer so that we can minimize + * kmap_atomic usage + */ + pgd_page_buffer = mmut->mmu_teardown_pages[level]; + memcpy(pgd_page_buffer, pgd_page, PAGE_SIZE); + } + + /* Invalidate page after copying */ + mmu_mode->entries_invalidate(pgd_page, KBASE_MMU_PAGE_ENTRIES); kunmap_atomic(pgd_page); pgd_page = pgd_page_buffer; - mmu_mode = kbdev->mmu_mode; - - for (i = 0; i < KBASE_MMU_PAGE_ENTRIES; i++) { - target_pgd = mmu_mode->pte_to_phy_addr(pgd_page[i]); - - if (target_pgd) { + if (level != MIDGARD_MMU_BOTTOMLEVEL) { + for (i = 0; i < KBASE_MMU_PAGE_ENTRIES; i++) { if (mmu_mode->pte_is_valid(pgd_page[i], level)) { - mmu_teardown_level(kbdev, mmut, - target_pgd, - level + 1); + phys_addr_t target_pgd = mmu_mode->pte_to_phy_addr( + mgm_dev->ops.mgm_pte_to_original_pte(mgm_dev, + MGM_DEFAULT_PTE_GROUP, + level, pgd_page[i])); + + mmu_teardown_level(kbdev, mmut, target_pgd, level + 1); } } } - kbase_mmu_free_pgd(kbdev, mmut, pgd, true); + kbase_mmu_free_pgd(kbdev, mmut, pgd); } int kbase_mmu_init(struct kbase_device *const kbdev, @@ -2445,7 +2714,7 @@ int kbase_mmu_init(struct kbase_device *const kbdev, mmut->group_id = group_id; mutex_init(&mmut->mmu_lock); mmut->kctx = kctx; - mmut->pgd = 0; + mmut->pgd = KBASE_MMU_INVALID_PGD_ADDRESS; /* Preallocate MMU depth of 3 pages for mmu_teardown_level to use */ for (level = MIDGARD_MMU_TOPLEVEL; @@ -2463,7 +2732,7 @@ int kbase_mmu_init(struct kbase_device *const kbdev, * kbase_mmu_alloc_pgd will allocate out of that pool. This is done to * avoid allocations from the kernel happening with the lock held. 
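	 * Note that mmut->pgd starts out as KBASE_MMU_INVALID_PGD_ADDRESS rather
	 * than 0 (presumably because 0 is a valid physical address), so the loop
	 * below tests against that sentinel when deciding whether the top-level
	 * PGD has been allocated yet.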
*/ - while (!mmut->pgd) { + while (mmut->pgd == KBASE_MMU_INVALID_PGD_ADDRESS) { int err; err = kbase_mem_pool_grow( @@ -2486,7 +2755,7 @@ void kbase_mmu_term(struct kbase_device *kbdev, struct kbase_mmu_table *mmut) { int level; - if (mmut->pgd) { + if (mmut->pgd != KBASE_MMU_INVALID_PGD_ADDRESS) { mutex_lock(&mmut->mmu_lock); mmu_teardown_level(kbdev, mmut, mmut->pgd, MIDGARD_MMU_TOPLEVEL); mutex_unlock(&mmut->mmu_lock); @@ -2510,6 +2779,7 @@ void kbase_mmu_as_term(struct kbase_device *kbdev, int i) destroy_workqueue(kbdev->as[i].pf_wq); } +#if defined(CONFIG_MALI_VECTOR_DUMP) static size_t kbasep_mmu_dump_level(struct kbase_context *kctx, phys_addr_t pgd, int level, char ** const buffer, size_t *size_left) { @@ -2555,7 +2825,9 @@ static size_t kbasep_mmu_dump_level(struct kbase_context *kctx, phys_addr_t pgd, for (i = 0; i < KBASE_MMU_PAGE_ENTRIES; i++) { if (mmu_mode->pte_is_valid(pgd_page[i], level)) { target_pgd = mmu_mode->pte_to_phy_addr( - pgd_page[i]); + kbdev->mgm_dev->ops.mgm_pte_to_original_pte( + kbdev->mgm_dev, MGM_DEFAULT_PTE_GROUP, + level, pgd_page[i])); dump_size = kbasep_mmu_dump_level(kctx, target_pgd, level + 1, @@ -2649,6 +2921,7 @@ fail_free: return NULL; } KBASE_EXPORT_TEST_API(kbase_mmu_dump); +#endif /* defined(CONFIG_MALI_VECTOR_DUMP) */ void kbase_mmu_bus_fault_worker(struct work_struct *data) { diff --git a/mali_kbase/mmu/mali_kbase_mmu.h b/mali_kbase/mmu/mali_kbase_mmu.h index 49665fb..5330306 100644 --- a/mali_kbase/mmu/mali_kbase_mmu.h +++ b/mali_kbase/mmu/mali_kbase_mmu.h @@ -25,6 +25,7 @@ #include <uapi/gpu/arm/midgard/mali_base_kernel.h> #define KBASE_MMU_PAGE_ENTRIES 512 +#define KBASE_MMU_INVALID_PGD_ADDRESS (~(phys_addr_t)0) struct kbase_context; struct kbase_mmu_table; @@ -129,11 +130,9 @@ void kbase_mmu_term(struct kbase_device *kbdev, struct kbase_mmu_table *mmut); u64 kbase_mmu_create_ate(struct kbase_device *kbdev, struct tagged_addr phy, unsigned long flags, int level, int group_id); -int kbase_mmu_insert_pages_no_flush(struct kbase_device *kbdev, - struct kbase_mmu_table *mmut, - const u64 start_vpfn, - struct tagged_addr *phys, size_t nr, - unsigned long flags, int group_id); +int kbase_mmu_insert_pages_no_flush(struct kbase_device *kbdev, struct kbase_mmu_table *mmut, + const u64 start_vpfn, struct tagged_addr *phys, size_t nr, + unsigned long flags, int group_id, u64 *dirty_pgds); int kbase_mmu_insert_pages(struct kbase_device *kbdev, struct kbase_mmu_table *mmut, u64 vpfn, struct tagged_addr *phys, size_t nr, @@ -144,9 +143,8 @@ int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 vpfn, unsigned long flags, int group_id, enum kbase_caller_mmu_sync_info mmu_sync_info); -int kbase_mmu_teardown_pages(struct kbase_device *kbdev, - struct kbase_mmu_table *mmut, u64 vpfn, - size_t nr, int as_nr); +int kbase_mmu_teardown_pages(struct kbase_device *kbdev, struct kbase_mmu_table *mmut, u64 vpfn, + struct tagged_addr *phys, size_t nr, int as_nr); int kbase_mmu_update_pages(struct kbase_context *kctx, u64 vpfn, struct tagged_addr *phys, size_t nr, unsigned long flags, int const group_id); diff --git a/mali_kbase/mmu/mali_kbase_mmu_hw.h b/mali_kbase/mmu/mali_kbase_mmu_hw.h index 31658e0..09b3fa8 100644 --- a/mali_kbase/mmu/mali_kbase_mmu_hw.h +++ b/mali_kbase/mmu/mali_kbase_mmu_hw.h @@ -75,12 +75,14 @@ enum kbase_mmu_op_type { }; /** - * struct kbase_mmu_hw_op_param - parameters for kbase_mmu_hw_do_operation() - * @vpfn: MMU Virtual Page Frame Number to start the operation on. - * @nr: Number of pages to work on. 
- * @op: Operation type (written to ASn_COMMAND). - * @kctx_id: Kernel context ID for MMU command tracepoint - * @mmu_sync_info: Indicates whether this call is synchronous wrt MMU ops. + * struct kbase_mmu_hw_op_param - parameters for kbase_mmu_hw_do_* functions + * @vpfn: MMU Virtual Page Frame Number to start the operation on. + * @nr: Number of pages to work on. + * @op: Operation type (written to ASn_COMMAND). + * @kctx_id: Kernel context ID for MMU command tracepoint. + * @mmu_sync_info: Indicates whether this call is synchronous wrt MMU ops. + * @flush_skip_levels: Page table levels to skip flushing. (Only + * applicable if GPU supports feature) */ struct kbase_mmu_hw_op_param { u64 vpfn; @@ -88,6 +90,7 @@ struct kbase_mmu_hw_op_param { enum kbase_mmu_op_type op; u32 kctx_id; enum kbase_caller_mmu_sync_info mmu_sync_info; + u64 flush_skip_levels; }; /** @@ -102,18 +105,86 @@ void kbase_mmu_hw_configure(struct kbase_device *kbdev, struct kbase_as *as); /** - * kbase_mmu_hw_do_operation - Issue an operation to the MMU. - * @kbdev: kbase device to issue the MMU operation on. - * @as: address space to issue the MMU operation on. - * @op_param: parameters for the operation. + * kbase_mmu_hw_do_unlock_no_addr - Issue UNLOCK command to the MMU without + * programming the LOCKADDR register and wait + * for it to complete before returning. * - * Issue an operation (MMU invalidate, MMU flush, etc) on the address space that - * is associated with the provided kbase_context over the specified range + * @kbdev: Kbase device to issue the MMU operation on. + * @as: Address space to issue the MMU operation on. + * @op_param: Pointer to struct containing information about the MMU + * operation to perform. + * + * Return: 0 if issuing the command was successful, otherwise an error code. + */ +int kbase_mmu_hw_do_unlock_no_addr(struct kbase_device *kbdev, struct kbase_as *as, + const struct kbase_mmu_hw_op_param *op_param); + +/** + * kbase_mmu_hw_do_unlock - Issue UNLOCK command to the MMU and wait for it + * to complete before returning. + * + * @kbdev: Kbase device to issue the MMU operation on. + * @as: Address space to issue the MMU operation on. + * @op_param: Pointer to struct containing information about the MMU + * operation to perform. + * + * Return: 0 if issuing the command was successful, otherwise an error code. + */ +int kbase_mmu_hw_do_unlock(struct kbase_device *kbdev, struct kbase_as *as, + const struct kbase_mmu_hw_op_param *op_param); +/** + * kbase_mmu_hw_do_flush - Issue a flush operation to the MMU. + * + * @kbdev: Kbase device to issue the MMU operation on. + * @as: Address space to issue the MMU operation on. + * @op_param: Pointer to struct containing information about the MMU + * operation to perform. + * + * Issue a flush operation on the address space as per the information + * specified inside @op_param. This function should not be called for + * GPUs where MMU command to flush the cache(s) is deprecated. + * mmu_hw_mutex needs to be held when calling this function. + * + * Return: Zero if the operation was successful, non-zero otherwise. + */ +int kbase_mmu_hw_do_flush(struct kbase_device *kbdev, struct kbase_as *as, + const struct kbase_mmu_hw_op_param *op_param); + +/** + * kbase_mmu_hw_do_flush_locked - Issue a flush operation to the MMU. + * + * @kbdev: Kbase device to issue the MMU operation on. + * @as: Address space to issue the MMU operation on. + * @op_param: Pointer to struct containing information about the MMU + * operation to perform. 
+ * + * Issue a flush operation on the address space as per the information + * specified inside @op_param. This function should not be called for + * GPUs where MMU command to flush the cache(s) is deprecated. + * Both mmu_hw_mutex and hwaccess_lock need to be held when calling this + * function. + * + * Return: Zero if the operation was successful, non-zero otherwise. + */ +int kbase_mmu_hw_do_flush_locked(struct kbase_device *kbdev, struct kbase_as *as, + const struct kbase_mmu_hw_op_param *op_param); + +/** + * kbase_mmu_hw_do_flush_on_gpu_ctrl - Issue a flush operation to the MMU. + * + * @kbdev: Kbase device to issue the MMU operation on. + * @as: Address space to issue the MMU operation on. + * @op_param: Pointer to struct containing information about the MMU + * operation to perform. + * + * Issue a flush operation on the address space as per the information + * specified inside @op_param. GPU command is used to flush the cache(s) + * instead of the MMU command. * * Return: Zero if the operation was successful, non-zero otherwise. */ -int kbase_mmu_hw_do_operation(struct kbase_device *kbdev, struct kbase_as *as, - struct kbase_mmu_hw_op_param *op_param); +int kbase_mmu_hw_do_flush_on_gpu_ctrl(struct kbase_device *kbdev, struct kbase_as *as, + const struct kbase_mmu_hw_op_param *op_param); /** * kbase_mmu_hw_clear_fault - Clear a fault that has been previously reported by diff --git a/mali_kbase/mmu/mali_kbase_mmu_hw_direct.c b/mali_kbase/mmu/mali_kbase_mmu_hw_direct.c index cdf9a84..79a07af 100644 --- a/mali_kbase/mmu/mali_kbase_mmu_hw_direct.c +++ b/mali_kbase/mmu/mali_kbase_mmu_hw_direct.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2014-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2014-2022 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -26,13 +26,17 @@ #include <mali_kbase_mem.h> #include <mmu/mali_kbase_mmu_hw.h> #include <tl/mali_kbase_tracepoints.h> +#include <linux/delay.h> + /** * lock_region() - Generate lockaddr to lock memory region in MMU - * @gpu_props: GPU properties for finding the MMU lock region size - * @pfn: Starting page frame number of the region to lock - * @num_pages: Number of pages to lock. It must be greater than 0. - * @lockaddr: Address and size of memory region to lock + * + * @gpu_props: GPU properties for finding the MMU lock region size. + * @lockaddr: Address and size of memory region to lock. + * @op_param: Pointer to a struct containing the starting page frame number of + * the region to lock, the number of pages to lock and page table + * levels to skip when flushing (if supported). * * The lockaddr value is a combination of the starting address and * the size of the region that encompasses all the memory pages to lock. @@ -63,14 +67,14 @@ * * Return: 0 if success, or an error code on failure. 
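 *
 * Note that lockaddr_base ^ lockaddr_end is a 64-bit quantity: for GPU virtual
 * addresses above 4 GB the highest differing bit can lie above bit 31, which
 * is why the implementation below derives the region size with fls64() rather
 * than the 32-bit fls().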
*/ -static int lock_region(struct kbase_gpu_props const *gpu_props, u64 pfn, u32 num_pages, - u64 *lockaddr) +static int lock_region(struct kbase_gpu_props const *gpu_props, u64 *lockaddr, + const struct kbase_mmu_hw_op_param *op_param) { - const u64 lockaddr_base = pfn << PAGE_SHIFT; - const u64 lockaddr_end = ((pfn + num_pages) << PAGE_SHIFT) - 1; + const u64 lockaddr_base = op_param->vpfn << PAGE_SHIFT; + const u64 lockaddr_end = ((op_param->vpfn + op_param->nr) << PAGE_SHIFT) - 1; u64 lockaddr_size_log2; - if (num_pages == 0) + if (op_param->nr == 0) return -EINVAL; /* The MMU lock region is a self-aligned region whose size @@ -101,7 +105,7 @@ static int lock_region(struct kbase_gpu_props const *gpu_props, u64 pfn, u32 num * therefore the highest bit that differs is bit #16 * and the region size (as a logarithm) is 16 + 1 = 17, i.e. 128 kB. */ - lockaddr_size_log2 = fls(lockaddr_base ^ lockaddr_end); + lockaddr_size_log2 = fls64(lockaddr_base ^ lockaddr_end); /* Cap the size against minimum and maximum values allowed. */ if (lockaddr_size_log2 > KBASE_LOCK_REGION_MAX_SIZE_LOG2) @@ -122,14 +126,13 @@ static int lock_region(struct kbase_gpu_props const *gpu_props, u64 pfn, u32 num */ *lockaddr = lockaddr_base & ~((1ull << lockaddr_size_log2) - 1); *lockaddr |= lockaddr_size_log2 - 1; - return 0; } static int wait_ready(struct kbase_device *kbdev, unsigned int as_nr) { - unsigned int max_loops = KBASE_AS_INACTIVE_MAX_LOOPS; + u32 max_loops = KBASE_AS_INACTIVE_MAX_LOOPS; /* Wait for the MMU status to indicate there is no active command. */ while (--max_loops && @@ -165,6 +168,100 @@ static int write_cmd(struct kbase_device *kbdev, int as_nr, u32 cmd) return status; } +#if MALI_USE_CSF && !IS_ENABLED(CONFIG_MALI_NO_MALI) +static int wait_cores_power_trans_complete(struct kbase_device *kbdev) +{ +#define WAIT_TIMEOUT 1000 /* 1ms timeout */ +#define DELAY_TIME_IN_US 1 + const int max_iterations = WAIT_TIMEOUT; + int loop; + + lockdep_assert_held(&kbdev->hwaccess_lock); + + for (loop = 0; loop < max_iterations; loop++) { + u32 lo = + kbase_reg_read(kbdev, GPU_CONTROL_REG(SHADER_PWRTRANS_LO)); + u32 hi = + kbase_reg_read(kbdev, GPU_CONTROL_REG(SHADER_PWRTRANS_HI)); + + if (!lo && !hi) + break; + + udelay(DELAY_TIME_IN_US); + } + + if (loop == max_iterations) { + dev_warn(kbdev->dev, "SHADER_PWRTRANS set for too long"); + return -ETIMEDOUT; + } + + return 0; +} + +/** + * apply_hw_issue_GPU2019_3901_wa - Apply WA for the HW issue GPU2019_3901 + * + * @kbdev: Kbase device to issue the MMU operation on. + * @mmu_cmd: Pointer to the variable contain the value of MMU command + * that needs to be sent to flush the L2 cache and do an + * implicit unlock. + * @as_nr: Address space number for which MMU command needs to be + * sent. + * @hwaccess_locked: Flag to indicate if hwaccess_lock is held by the caller. + * + * This functions ensures that the flush of LSC is not missed for the pages that + * were unmapped from the GPU, due to the power down transition of shader cores. + * + * Return: 0 if the WA was successfully applied, non-zero otherwise. + */ +static int apply_hw_issue_GPU2019_3901_wa(struct kbase_device *kbdev, + u32 *mmu_cmd, unsigned int as_nr, bool hwaccess_locked) +{ + unsigned long flags = 0; + int ret = 0; + + if (!hwaccess_locked) + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + + /* Check if L2 is OFF. The cores also must be OFF if L2 is not up, so + * the workaround can be safely skipped. 
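	 * If L2 is powered, the sequence below first waits for the caller's LOCK
	 * command to complete, cleans and invalidates the LSC over GPU_CONTROL,
	 * waits for any shader core power transitions to finish, and only then
	 * downgrades the MMU command to AS_COMMAND_FLUSH_PT, since the LSC is
	 * already guaranteed to be clean at that point.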
+ */ + if (kbdev->pm.backend.l2_state != KBASE_L2_OFF) { + if (*mmu_cmd != AS_COMMAND_FLUSH_MEM) { + dev_warn(kbdev->dev, + "Unexpected mmu command received"); + ret = -EINVAL; + goto unlock; + } + + /* Wait for the LOCK MMU command to complete, issued by the caller */ + ret = wait_ready(kbdev, as_nr); + if (ret) + goto unlock; + + ret = kbase_gpu_cache_flush_and_busy_wait(kbdev, + GPU_COMMAND_CACHE_CLN_INV_LSC); + if (ret) + goto unlock; + + ret = wait_cores_power_trans_complete(kbdev); + if (ret) + goto unlock; + + /* As LSC is guaranteed to have been flushed we can use FLUSH_PT + * MMU command to only flush the L2. + */ + *mmu_cmd = AS_COMMAND_FLUSH_PT; + } + +unlock: + if (!hwaccess_locked) + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + return ret; +} +#endif + void kbase_mmu_hw_configure(struct kbase_device *kbdev, struct kbase_as *as) { struct kbase_mmu_setup *current_setup = &as->current_setup; @@ -222,95 +319,245 @@ void kbase_mmu_hw_configure(struct kbase_device *kbdev, struct kbase_as *as) #endif } -int kbase_mmu_hw_do_operation(struct kbase_device *kbdev, struct kbase_as *as, - struct kbase_mmu_hw_op_param *op_param) +/** + * mmu_command_instr - Record an MMU command for instrumentation purposes. + * + * @kbdev: Kbase device used to issue MMU operation on. + * @kctx_id: Kernel context ID for MMU command tracepoint. + * @cmd: Command issued to the MMU. + * @lock_addr: Address of memory region locked for the operation. + * @mmu_sync_info: Indicates whether this call is synchronous wrt MMU ops. + */ +static void mmu_command_instr(struct kbase_device *kbdev, u32 kctx_id, u32 cmd, u64 lock_addr, + enum kbase_caller_mmu_sync_info mmu_sync_info) +{ + u64 lock_addr_base = AS_LOCKADDR_LOCKADDR_BASE_GET(lock_addr); + u32 lock_addr_size = AS_LOCKADDR_LOCKADDR_SIZE_GET(lock_addr); + + bool is_mmu_synchronous = (mmu_sync_info == CALLER_MMU_SYNC); + + KBASE_TLSTREAM_AUX_MMU_COMMAND(kbdev, kctx_id, cmd, is_mmu_synchronous, lock_addr_base, + lock_addr_size); +} + +/* Helper function to program the LOCKADDR register before LOCK/UNLOCK command + * is issued. + */ +static int mmu_hw_set_lock_addr(struct kbase_device *kbdev, int as_nr, u64 *lock_addr, + const struct kbase_mmu_hw_op_param *op_param) +{ + int ret; + + ret = lock_region(&kbdev->gpu_props, lock_addr, op_param); + + if (!ret) { + /* Set the region that needs to be updated */ + kbase_reg_write(kbdev, MMU_AS_REG(as_nr, AS_LOCKADDR_LO), + *lock_addr & 0xFFFFFFFFUL); + kbase_reg_write(kbdev, MMU_AS_REG(as_nr, AS_LOCKADDR_HI), + (*lock_addr >> 32) & 0xFFFFFFFFUL); + } + return ret; +} + +/** + * mmu_hw_do_lock_no_wait - Issue LOCK command to the MMU and return without + * waiting for it's completion. + * + * @kbdev: Kbase device to issue the MMU operation on. + * @as: Address space to issue the MMU operation on. + * @lock_addr: Address of memory region locked for this operation. + * @op_param: Pointer to a struct containing information about the MMU operation. + * + * Return: 0 if issuing the command was successful, otherwise an error code. 
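 *
 * mmu_hw_do_lock() below, for example, follows this call with wait_ready() so
 * that the LOCK is known to have completed before it is reported as
 * successful.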
+ */ +static int mmu_hw_do_lock_no_wait(struct kbase_device *kbdev, struct kbase_as *as, u64 *lock_addr, + const struct kbase_mmu_hw_op_param *op_param) +{ + int ret; + + ret = mmu_hw_set_lock_addr(kbdev, as->number, lock_addr, op_param); + + if (!ret) + write_cmd(kbdev, as->number, AS_COMMAND_LOCK); + + return ret; +} + +static int mmu_hw_do_lock(struct kbase_device *kbdev, struct kbase_as *as, + const struct kbase_mmu_hw_op_param *op_param) { int ret; u64 lock_addr = 0x0; - if (WARN_ON(kbdev == NULL) || - WARN_ON(as == NULL) || - WARN_ON(op_param == NULL)) + if (WARN_ON(kbdev == NULL) || WARN_ON(as == NULL)) return -EINVAL; - lockdep_assert_held(&kbdev->mmu_hw_mutex); + ret = mmu_hw_do_lock_no_wait(kbdev, as, &lock_addr, op_param); + + if (!ret) + ret = wait_ready(kbdev, as->number); - if (op_param->op == KBASE_MMU_OP_UNLOCK) { - /* Unlock doesn't require a lock first */ - ret = write_cmd(kbdev, as->number, AS_COMMAND_UNLOCK); + if (!ret) + mmu_command_instr(kbdev, op_param->kctx_id, AS_COMMAND_LOCK, lock_addr, + op_param->mmu_sync_info); - /* Wait for UNLOCK command to complete */ + return ret; +} + +int kbase_mmu_hw_do_unlock_no_addr(struct kbase_device *kbdev, struct kbase_as *as, + const struct kbase_mmu_hw_op_param *op_param) +{ + int ret = 0; + + if (WARN_ON(kbdev == NULL) || WARN_ON(as == NULL)) + return -EINVAL; + + ret = write_cmd(kbdev, as->number, AS_COMMAND_UNLOCK); + + /* Wait for UNLOCK command to complete */ + if (!ret) ret = wait_ready(kbdev, as->number); - if (!ret) { - /* read MMU_AS_CONTROL.LOCKADDR register */ - lock_addr |= (u64)kbase_reg_read(kbdev, - MMU_AS_REG(as->number, AS_LOCKADDR_HI)) << 32; - lock_addr |= (u64)kbase_reg_read(kbdev, - MMU_AS_REG(as->number, AS_LOCKADDR_LO)); - } - } else if (op_param->op >= KBASE_MMU_OP_FIRST && - op_param->op < KBASE_MMU_OP_COUNT) { - ret = lock_region(&kbdev->gpu_props, op_param->vpfn, op_param->nr, &lock_addr); - - if (!ret) { - /* Lock the region that needs to be updated */ - kbase_reg_write(kbdev, - MMU_AS_REG(as->number, AS_LOCKADDR_LO), - lock_addr & 0xFFFFFFFFUL); - kbase_reg_write(kbdev, - MMU_AS_REG(as->number, AS_LOCKADDR_HI), - (lock_addr >> 32) & 0xFFFFFFFFUL); - write_cmd(kbdev, as->number, AS_COMMAND_LOCK); - - /* Translate and send operation to HW */ - switch (op_param->op) { - case KBASE_MMU_OP_FLUSH_PT: - write_cmd(kbdev, as->number, - AS_COMMAND_FLUSH_PT); - break; - case KBASE_MMU_OP_FLUSH_MEM: - write_cmd(kbdev, as->number, - AS_COMMAND_FLUSH_MEM); - break; - case KBASE_MMU_OP_LOCK: - /* No further operation. */ - break; - default: - dev_warn(kbdev->dev, - "Unsupported MMU operation (op=%d).\n", - op_param->op); - return -EINVAL; - }; - - /* Wait for the command to complete */ - ret = wait_ready(kbdev, as->number); - } - } else { - /* Code should not reach here. 
*/ - dev_warn(kbdev->dev, "Invalid mmu operation (op=%d).\n", - op_param->op); + if (!ret) { + u64 lock_addr = 0x0; + /* read MMU_AS_CONTROL.LOCKADDR register */ + lock_addr |= (u64)kbase_reg_read(kbdev, MMU_AS_REG(as->number, AS_LOCKADDR_HI)) + << 32; + lock_addr |= (u64)kbase_reg_read(kbdev, MMU_AS_REG(as->number, AS_LOCKADDR_LO)); + + mmu_command_instr(kbdev, op_param->kctx_id, AS_COMMAND_UNLOCK, + lock_addr, op_param->mmu_sync_info); + } + + return ret; +} + +int kbase_mmu_hw_do_unlock(struct kbase_device *kbdev, struct kbase_as *as, + const struct kbase_mmu_hw_op_param *op_param) +{ + int ret = 0; + u64 lock_addr = 0x0; + + if (WARN_ON(kbdev == NULL) || WARN_ON(as == NULL)) + return -EINVAL; + + ret = mmu_hw_set_lock_addr(kbdev, as->number, &lock_addr, op_param); + + if (!ret) + ret = kbase_mmu_hw_do_unlock_no_addr(kbdev, as, + op_param); + + return ret; +} + +static int mmu_hw_do_flush(struct kbase_device *kbdev, struct kbase_as *as, + const struct kbase_mmu_hw_op_param *op_param, bool hwaccess_locked) +{ + int ret; + u64 lock_addr = 0x0; + u32 mmu_cmd = AS_COMMAND_FLUSH_MEM; + + if (WARN_ON(kbdev == NULL) || WARN_ON(as == NULL)) + return -EINVAL; + + /* MMU operations can be either FLUSH_PT or FLUSH_MEM, anything else at + * this point would be unexpected. + */ + if (op_param->op != KBASE_MMU_OP_FLUSH_PT && + op_param->op != KBASE_MMU_OP_FLUSH_MEM) { + dev_err(kbdev->dev, "Unexpected flush operation received"); return -EINVAL; } - /* MMU command instrumentation */ - if (!ret) { - u64 lock_addr_base = AS_LOCKADDR_LOCKADDR_BASE_GET(lock_addr); - u32 lock_addr_size = AS_LOCKADDR_LOCKADDR_SIZE_GET(lock_addr); + lockdep_assert_held(&kbdev->mmu_hw_mutex); - bool is_mmu_synchronous = false; + if (op_param->op == KBASE_MMU_OP_FLUSH_PT) + mmu_cmd = AS_COMMAND_FLUSH_PT; - if (op_param->mmu_sync_info == CALLER_MMU_SYNC) - is_mmu_synchronous = true; + /* Lock the region that needs to be updated */ + ret = mmu_hw_do_lock_no_wait(kbdev, as, &lock_addr, op_param); + if (ret) + return ret; - KBASE_TLSTREAM_AUX_MMU_COMMAND(kbdev, op_param->kctx_id, - op_param->op, is_mmu_synchronous, - lock_addr_base, lock_addr_size); +#if MALI_USE_CSF && !IS_ENABLED(CONFIG_MALI_NO_MALI) + /* WA for the BASE_HW_ISSUE_GPU2019_3901. No runtime check is used here + * as the WA is applicable to all CSF GPUs where FLUSH_MEM/PT command is + * supported, and this function doesn't gets called for the GPUs where + * FLUSH_MEM/PT command is deprecated. 
+ */ + if (mmu_cmd == AS_COMMAND_FLUSH_MEM) { + ret = apply_hw_issue_GPU2019_3901_wa(kbdev, &mmu_cmd, + as->number, hwaccess_locked); + if (ret) + return ret; } +#endif + + write_cmd(kbdev, as->number, mmu_cmd); + + /* Wait for the command to complete */ + ret = wait_ready(kbdev, as->number); + + if (!ret) + mmu_command_instr(kbdev, op_param->kctx_id, mmu_cmd, lock_addr, + op_param->mmu_sync_info); return ret; } +int kbase_mmu_hw_do_flush_locked(struct kbase_device *kbdev, struct kbase_as *as, + const struct kbase_mmu_hw_op_param *op_param) +{ + lockdep_assert_held(&kbdev->hwaccess_lock); + + return mmu_hw_do_flush(kbdev, as, op_param, true); +} + +int kbase_mmu_hw_do_flush(struct kbase_device *kbdev, struct kbase_as *as, + const struct kbase_mmu_hw_op_param *op_param) +{ + return mmu_hw_do_flush(kbdev, as, op_param, false); +} + +int kbase_mmu_hw_do_flush_on_gpu_ctrl(struct kbase_device *kbdev, struct kbase_as *as, + const struct kbase_mmu_hw_op_param *op_param) +{ + int ret, ret2; + u32 gpu_cmd = GPU_COMMAND_CACHE_CLN_INV_L2_LSC; + + if (WARN_ON(kbdev == NULL) || WARN_ON(as == NULL)) + return -EINVAL; + + /* MMU operations can be either FLUSH_PT or FLUSH_MEM, anything else at + * this point would be unexpected. + */ + if (op_param->op != KBASE_MMU_OP_FLUSH_PT && + op_param->op != KBASE_MMU_OP_FLUSH_MEM) { + dev_err(kbdev->dev, "Unexpected flush operation received"); + return -EINVAL; + } + + lockdep_assert_held(&kbdev->hwaccess_lock); + lockdep_assert_held(&kbdev->mmu_hw_mutex); + + if (op_param->op == KBASE_MMU_OP_FLUSH_PT) + gpu_cmd = GPU_COMMAND_CACHE_CLN_INV_L2; + + /* 1. Issue MMU_AS_CONTROL.COMMAND.LOCK operation. */ + ret = mmu_hw_do_lock(kbdev, as, op_param); + if (ret) + return ret; + + /* 2. Issue GPU_CONTROL.COMMAND.FLUSH_CACHES operation */ + ret = kbase_gpu_cache_flush_and_busy_wait(kbdev, gpu_cmd); + + /* 3. Issue MMU_AS_CONTROL.COMMAND.UNLOCK operation. */ + ret2 = kbase_mmu_hw_do_unlock_no_addr(kbdev, as, op_param); + + return ret ?: ret2; +} + void kbase_mmu_hw_clear_fault(struct kbase_device *kbdev, struct kbase_as *as, enum kbase_mmu_fault_type type) { diff --git a/mali_kbase/mmu/mali_kbase_mmu_mode_aarch64.c b/mali_kbase/mmu/mali_kbase_mmu_mode_aarch64.c index c061099..fcbccae 100644 --- a/mali_kbase/mmu/mali_kbase_mmu_mode_aarch64.c +++ b/mali_kbase/mmu/mali_kbase_mmu_mode_aarch64.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2010-2014, 2016-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2010-2014, 2016-2022 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -189,35 +189,31 @@ static void set_num_valid_entries(u64 *pgd, unsigned int num_of_valid_entries) << UNUSED_BIT_POSITION_IN_PAGE_DESCRIPTOR); } -static void entry_set_pte(u64 *pgd, u64 vpfn, phys_addr_t phy) +static void entry_set_pte(u64 *entry, phys_addr_t phy) { - unsigned int nr_entries = get_num_valid_entries(pgd); - - page_table_entry_set(&pgd[vpfn], (phy & PAGE_MASK) | ENTRY_ACCESS_BIT | - ENTRY_IS_PTE); - - set_num_valid_entries(pgd, nr_entries + 1); + page_table_entry_set(entry, (phy & PAGE_MASK) | ENTRY_ACCESS_BIT | ENTRY_IS_PTE); } -static void entry_invalidate(u64 *entry) +static void entries_invalidate(u64 *entry, u32 count) { - page_table_entry_set(entry, ENTRY_IS_INVAL); + u32 i; + + for (i = 0; i < count; i++) + page_table_entry_set(entry + i, ENTRY_IS_INVAL); } -static const struct kbase_mmu_mode aarch64_mode = { - .update = mmu_update, - .get_as_setup = kbase_mmu_get_as_setup, - .disable_as = mmu_disable_as, - .pte_to_phy_addr = pte_to_phy_addr, - .ate_is_valid = ate_is_valid, - .pte_is_valid = pte_is_valid, - .entry_set_ate = entry_set_ate, - .entry_set_pte = entry_set_pte, - .entry_invalidate = entry_invalidate, - .get_num_valid_entries = get_num_valid_entries, - .set_num_valid_entries = set_num_valid_entries, - .flags = KBASE_MMU_MODE_HAS_NON_CACHEABLE -}; +static const struct kbase_mmu_mode aarch64_mode = { .update = mmu_update, + .get_as_setup = kbase_mmu_get_as_setup, + .disable_as = mmu_disable_as, + .pte_to_phy_addr = pte_to_phy_addr, + .ate_is_valid = ate_is_valid, + .pte_is_valid = pte_is_valid, + .entry_set_ate = entry_set_ate, + .entry_set_pte = entry_set_pte, + .entries_invalidate = entries_invalidate, + .get_num_valid_entries = get_num_valid_entries, + .set_num_valid_entries = set_num_valid_entries, + .flags = KBASE_MMU_MODE_HAS_NON_CACHEABLE }; struct kbase_mmu_mode const *kbase_mmu_mode_get_aarch64(void) { |