author | Vamsidhar reddy Gaddam <gvamsi@google.com> | 2024-03-13 09:45:22 +0000 |
---|---|---|
committer | Vamsidhar reddy Gaddam <gvamsi@google.com> | 2024-03-13 09:45:22 +0000 |
commit | a999cd8fd398aed7390c8e5d99795e9b735d6ba7 (patch) | |
tree | 79503e1b07ccfd66140fb903be3a0f2e0ace147c | |
parent | 049a542207ed694271316782397b78b2e202086a (diff) | |
download | gpu-a999cd8fd398aed7390c8e5d99795e9b735d6ba7.tar.gz | |
Update KMD to r48p0
Provenance: ipdelivery@02a6b5e039b17fd395ddc13d09efbe440223a56c
Change-Id: Ia6d72d40f5c57508d818ad24e57547c1a411d644
Signed-off-by: Vamsidhar reddy Gaddam <gvamsi@google.com>
91 files changed, 2048 insertions, 1198 deletions
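The most visible uAPI addition in this drop is CS_FAULT reporting to userspace (CSF interface versions 1.25–1.27 in the ioctl header below): queue groups gain a `cs_fault_report_enable` flag at creation time, and a new `KBASE_IOCTL_QUEUE_GROUP_CLEAR_FAULTS` ioctl (command 61) re-enables fault reporting for a set of queues after a fault has been delivered. The following is a minimal userspace sketch of that ioctl, not code from this tree: it assumes the uAPI headers added here are on the include path and that `fd` is an already-opened, version-checked kbase device file descriptor; the helper name and include path are illustrative.

```c
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>

/* uAPI header updated in this commit; path follows the tree layout above (assumed). */
#include <uapi/gpu/arm/midgard/csf/mali_kbase_csf_ioctl.h>

/*
 * Re-enable CS_FAULT reporting for a set of GPU queues.
 * queue_buf_gpu_vas: array of GPU VAs of the buffers backing the queues.
 * Returns 0 on success, -1 with errno set on failure.
 */
static int clear_queue_faults(int fd, const uint64_t *queue_buf_gpu_vas, uint32_t nr_queues)
{
	struct kbase_ioctl_queue_group_clear_faults args;

	memset(&args, 0, sizeof(args));                      /* padding must be zero */
	args.addr = (uint64_t)(uintptr_t)queue_buf_gpu_vas;  /* CPU VA of the GPU-VA array */
	args.nr_queues = nr_queues;

	return ioctl(fd, KBASE_IOCTL_QUEUE_GROUP_CLEAR_FAULTS, &args);
}
```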
diff --git a/common/include/linux/mali_hw_access.h b/common/include/linux/mali_hw_access.h new file mode 100644 index 0000000..ca73036 --- /dev/null +++ b/common/include/linux/mali_hw_access.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2023 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _MALI_HW_ACCESS_H_ +#define _MALI_HW_ACCESS_H_ + +#include <asm/arch_timer.h> +#include <linux/io.h> + + +#define mali_readl(addr) readl(addr) +#define mali_readq(addr) readq(addr) +#define mali_writel(val, addr) writel(val, addr) +#define mali_writeq(val, addr) writeq(val, addr) +#define mali_ioremap(addr, size) ioremap(addr, size) +#define mali_iounmap(addr) iounmap(addr) +#define mali_arch_timer_get_cntfrq() arch_timer_get_cntfrq() + + +#endif /* _MALI_HW_ACCESS_H_ */ diff --git a/common/include/linux/memory_group_manager.h b/common/include/linux/memory_group_manager.h index 3820f1b..e92d3de 100644 --- a/common/include/linux/memory_group_manager.h +++ b/common/include/linux/memory_group_manager.h @@ -32,6 +32,10 @@ typedef int vm_fault_t; #define MEMORY_GROUP_MANAGER_NR_GROUPS (16) +#define PTE_PBHA_SHIFT (59) +#define PTE_PBHA_MASK ((uint64_t)0xf << PTE_PBHA_SHIFT) +#define PTE_RES_BIT_MULTI_AS_SHIFT (63) + struct memory_group_manager_device; struct memory_group_manager_import_data; diff --git a/common/include/linux/version_compat_defs.h b/common/include/linux/version_compat_defs.h index 366b50c..f8594a6 100644 --- a/common/include/linux/version_compat_defs.h +++ b/common/include/linux/version_compat_defs.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2022-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2022-2024 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -348,4 +348,43 @@ static inline long kbase_pin_user_pages_remote(struct task_struct *tsk, struct m #endif /* (KERNEL_VERSION(4, 11, 0) > LINUX_VERSION_CODE) */ +#if (KERNEL_VERSION(4, 16, 0) > LINUX_VERSION_CODE) +/* Null definition */ +#define ALLOW_ERROR_INJECTION(fname, err_type) +#endif /* (KERNEL_VERSION(4, 16, 0) > LINUX_VERSION_CODE) */ + +#if KERNEL_VERSION(6, 0, 0) > LINUX_VERSION_CODE +#define KBASE_REGISTER_SHRINKER(reclaim, name, priv_data) register_shrinker(reclaim) + +#elif ((KERNEL_VERSION(6, 7, 0) > LINUX_VERSION_CODE) && \ + !(defined(__ANDROID_COMMON_KERNEL__) && (KERNEL_VERSION(6, 6, 0) == LINUX_VERSION_CODE))) +#define KBASE_REGISTER_SHRINKER(reclaim, name, priv_data) register_shrinker(reclaim, name) + +#else +#define KBASE_REGISTER_SHRINKER(reclaim, name, priv_data) \ + do { \ + reclaim->private_data = priv_data; \ + shrinker_register(reclaim); \ + } while (0) + +#endif /* KERNEL_VERSION(6, 0, 0) > LINUX_VERSION_CODE */ + +#if ((KERNEL_VERSION(6, 7, 0) > LINUX_VERSION_CODE) && \ + !(defined(__ANDROID_COMMON_KERNEL__) && (KERNEL_VERSION(6, 6, 0) == LINUX_VERSION_CODE))) +#define KBASE_UNREGISTER_SHRINKER(reclaim) unregister_shrinker(&reclaim) +#define KBASE_GET_KBASE_DATA_FROM_SHRINKER(s, type, var) container_of(s, type, var) +#define DEFINE_KBASE_SHRINKER struct shrinker +#define KBASE_INIT_RECLAIM(var, attr, name) (&((var)->attr)) +#define KBASE_SET_RECLAIM(var, attr, reclaim) ((var)->attr = (*reclaim)) + +#else +#define KBASE_UNREGISTER_SHRINKER(reclaim) shrinker_free(reclaim) +#define KBASE_GET_KBASE_DATA_FROM_SHRINKER(s, type, var) s->private_data +#define DEFINE_KBASE_SHRINKER struct shrinker * +#define KBASE_SHRINKER_ALLOC(name) shrinker_alloc(0, name) +#define KBASE_INIT_RECLAIM(var, attr, name) (KBASE_SHRINKER_ALLOC(name)) +#define KBASE_SET_RECLAIM(var, attr, reclaim) ((var)->attr = reclaim) + +#endif + #endif /* _VERSION_COMPAT_DEFS_H_ */ diff --git a/common/include/uapi/gpu/arm/midgard/backend/gpu/mali_kbase_model_dummy.h b/common/include/uapi/gpu/arm/midgard/backend/gpu/mali_kbase_model_dummy.h index 564f477..b80817f 100644 --- a/common/include/uapi/gpu/arm/midgard/backend/gpu/mali_kbase_model_dummy.h +++ b/common/include/uapi/gpu/arm/midgard/backend/gpu/mali_kbase_model_dummy.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2021-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2021-2024 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software diff --git a/common/include/uapi/gpu/arm/midgard/csf/mali_base_csf_kernel.h b/common/include/uapi/gpu/arm/midgard/csf/mali_base_csf_kernel.h index 0fb8242..3fff8b2 100644 --- a/common/include/uapi/gpu/arm/midgard/csf/mali_base_csf_kernel.h +++ b/common/include/uapi/gpu/arm/midgard/csf/mali_base_csf_kernel.h @@ -46,7 +46,11 @@ */ #define BASE_MEM_CSF_EVENT ((base_mem_alloc_flags)1 << 19) -#define BASE_MEM_RESERVED_BIT_20 ((base_mem_alloc_flags)1 << 20) +/* Unused bit for CSF, only used in JM for BASE_MEM_TILER_ALIGN_TOP */ +#define BASE_MEM_UNUSED_BIT_20 ((base_mem_alloc_flags)1 << 20) + +/* Unused bit for CSF, only used in JM for BASE_MEM_FLAG_MAP_FIXED */ +#define BASE_MEM_UNUSED_BIT_27 ((base_mem_alloc_flags)1 << 27) /* Must be FIXABLE memory: its GPU VA will be determined at a later point, * at which time it will be at a fixed GPU VA. @@ -62,9 +66,15 @@ */ #define BASEP_MEM_FLAGS_KERNEL_ONLY (BASEP_MEM_PERMANENT_KERNEL_MAPPING | BASEP_MEM_NO_USER_FREE) -/* A mask of all currently reserved flags - */ -#define BASE_MEM_FLAGS_RESERVED BASE_MEM_RESERVED_BIT_20 +/* A mask of all flags that should not be queried */ +#define BASE_MEM_DONT_QUERY \ + (BASE_MEM_COHERENT_SYSTEM_REQUIRED | BASE_MEM_IMPORT_SHARED | BASE_MEM_SAME_VA) + +/* A mask of all currently reserved flags */ +#define BASE_MEM_FLAGS_RESERVED ((base_mem_alloc_flags)0) + +/* A mask of all bits that are not used by a flag on CSF */ +#define BASE_MEM_FLAGS_UNUSED (BASE_MEM_UNUSED_BIT_20 | BASE_MEM_UNUSED_BIT_27) /* Special base mem handles specific to CSF. */ @@ -474,7 +484,26 @@ struct base_gpu_queue_error_fatal_payload { }; /** - * enum base_gpu_queue_group_error_type - GPU Fatal error type. + * struct base_gpu_queue_error_fault_payload - Recoverable fault + * error information related to GPU command queue. + * + * @sideband: Additional information about this recoverable fault. + * @status: Recoverable fault information. + * This consists of exception type (least significant byte) and + * data (remaining bytes). One example of exception type is + * INSTR_INVALID_PC (0x50). + * @csi_index: Index of the CSF interface the queue is bound to. + * @padding: Padding to make multiple of 64bits + */ +struct base_gpu_queue_error_fault_payload { + __u64 sideband; + __u32 status; + __u8 csi_index; + __u8 padding[3]; +}; + +/** + * enum base_gpu_queue_group_error_type - GPU error type. * * @BASE_GPU_QUEUE_GROUP_ERROR_FATAL: Fatal error associated with GPU * command queue group. @@ -484,7 +513,9 @@ struct base_gpu_queue_error_fatal_payload { * progress timeout. * @BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM: Fatal error due to running out * of tiler heap memory. - * @BASE_GPU_QUEUE_GROUP_ERROR_FATAL_COUNT: The number of fatal error types + * @BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FAULT: Fault error reported for GPU + * command queue. + * @BASE_GPU_QUEUE_GROUP_ERROR_FATAL_COUNT: The number of GPU error types * * This type is used for &struct_base_gpu_queue_group_error.error_type. 
*/ @@ -493,6 +524,7 @@ enum base_gpu_queue_group_error_type { BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL, BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT, BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM, + BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FAULT, BASE_GPU_QUEUE_GROUP_ERROR_FATAL_COUNT }; @@ -512,6 +544,7 @@ struct base_gpu_queue_group_error { union { struct base_gpu_queue_group_error_fatal_payload fatal_group; struct base_gpu_queue_error_fatal_payload fatal_queue; + struct base_gpu_queue_error_fault_payload fault_queue; } payload; }; diff --git a/common/include/uapi/gpu/arm/midgard/csf/mali_kbase_csf_ioctl.h b/common/include/uapi/gpu/arm/midgard/csf/mali_kbase_csf_ioctl.h index 537c90d..c56e071 100644 --- a/common/include/uapi/gpu/arm/midgard/csf/mali_kbase_csf_ioctl.h +++ b/common/include/uapi/gpu/arm/midgard/csf/mali_kbase_csf_ioctl.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2020-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2020-2024 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -95,15 +95,23 @@ * 1.22: * - Add comp_pri_threshold and comp_pri_ratio attributes to * kbase_ioctl_cs_queue_group_create. + * - Made the BASE_MEM_DONT_NEED memory flag queryable. * 1.23: * - Disallows changing the sharability on the GPU of imported dma-bufs to * BASE_MEM_COHERENT_SYSTEM using KBASE_IOCTL_MEM_FLAGS_CHANGE. * 1.24: * - Implement full block state support for hardware counters. + * 1.25: + * - Add support for CS_FAULT reporting to userspace + * 1.26: + * - Made the BASE_MEM_IMPORT_SYNC_ON_MAP_UNMAP and BASE_MEM_KERNEL_SYNC memory + * flags queryable. + * 1.27: + * - Implement support for HWC block state availability. */ #define BASE_UK_VERSION_MAJOR 1 -#define BASE_UK_VERSION_MINOR 24 +#define BASE_UK_VERSION_MINOR 27 /** * struct kbase_ioctl_version_check - Check version compatibility between @@ -340,6 +348,8 @@ union kbase_ioctl_cs_queue_group_create_1_18 { * @in.csi_handlers: Flags to signal that the application intends to use CSI * exception handlers in some linear buffers to deal with * the given exception types. + * @in.cs_fault_report_enable: Flag to indicate reporting of CS_FAULTs + * to userspace. * @in.padding: Currently unused, must be zero * @out: Output parameters * @out.group_handle: Handle of a newly created queue group. @@ -360,7 +370,8 @@ union kbase_ioctl_cs_queue_group_create { /** * @in.reserved: Reserved, currently unused, must be zero. 
*/ - __u16 reserved; + __u8 reserved; + __u8 cs_fault_report_enable; /** * @in.dvs_buf: buffer for deferred vertex shader */ @@ -637,6 +648,22 @@ union kbase_ioctl_read_user_page { #define KBASE_IOCTL_READ_USER_PAGE _IOWR(KBASE_IOCTL_TYPE, 60, union kbase_ioctl_read_user_page) +/** + * struct kbase_ioctl_queue_group_clear_faults - Re-enable CS FAULT reporting for the GPU queues + * + * @addr: CPU VA to an array of GPU VAs of the buffers backing the queues + * @nr_queues: Number of queues in the array + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_queue_group_clear_faults { + __u64 addr; + __u32 nr_queues; + __u8 padding[4]; +}; + +#define KBASE_IOCTL_QUEUE_GROUP_CLEAR_FAULTS \ + _IOW(KBASE_IOCTL_TYPE, 61, struct kbase_ioctl_queue_group_clear_faults) + /*************** * test ioctls * ***************/ diff --git a/common/include/uapi/gpu/arm/midgard/gpu/mali_kbase_gpu_id.h b/common/include/uapi/gpu/arm/midgard/gpu/mali_kbase_gpu_id.h index d347854..d4d12ae 100644 --- a/common/include/uapi/gpu/arm/midgard/gpu/mali_kbase_gpu_id.h +++ b/common/include/uapi/gpu/arm/midgard/gpu/mali_kbase_gpu_id.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2015-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2015-2024 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software diff --git a/common/include/uapi/gpu/arm/midgard/jm/mali_base_jm_kernel.h b/common/include/uapi/gpu/arm/midgard/jm/mali_base_jm_kernel.h index 9478334..25563e4 100644 --- a/common/include/uapi/gpu/arm/midgard/jm/mali_base_jm_kernel.h +++ b/common/include/uapi/gpu/arm/midgard/jm/mali_base_jm_kernel.h @@ -30,15 +30,11 @@ * See base_mem_alloc_flags. */ -/* Used as BASE_MEM_FIXED in other backends */ -#define BASE_MEM_RESERVED_BIT_8 ((base_mem_alloc_flags)1 << 8) +/* Unused bit for JM, only used in CSF for BASE_MEM_FIXED */ +#define BASE_MEM_UNUSED_BIT_8 ((base_mem_alloc_flags)1 << 8) -/** - * BASE_MEM_RESERVED_BIT_19 - Bit 19 is reserved. 
- * - * Do not remove, use the next unreserved bit for new flags - */ -#define BASE_MEM_RESERVED_BIT_19 ((base_mem_alloc_flags)1 << 19) +/* Unused bit for JM, only used in CSF for BASE_CSF_EVENT */ +#define BASE_MEM_UNUSED_BIT_19 ((base_mem_alloc_flags)1 << 19) /** * BASE_MEM_TILER_ALIGN_TOP - Memory starting from the end of the initial commit is aligned @@ -64,9 +60,15 @@ (BASEP_MEM_PERMANENT_KERNEL_MAPPING | BASEP_MEM_NO_USER_FREE | BASE_MEM_FLAG_MAP_FIXED | \ BASEP_MEM_PERFORM_JIT_TRIM) -/* A mask of all currently reserved flags - */ -#define BASE_MEM_FLAGS_RESERVED (BASE_MEM_RESERVED_BIT_8 | BASE_MEM_RESERVED_BIT_19) +/* A mask of all flags that should not be queried */ +#define BASE_MEM_DONT_QUERY \ + (BASE_MEM_COHERENT_SYSTEM_REQUIRED | BASE_MEM_IMPORT_SHARED | BASE_MEM_SAME_VA) + +/* A mask of all currently reserved flags */ +#define BASE_MEM_FLAGS_RESERVED ((base_mem_alloc_flags)0) + +/* A mask of all bits that are not used by a flag on JM */ +#define BASE_MEM_FLAGS_UNUSED (BASE_MEM_UNUSED_BIT_8 | BASE_MEM_UNUSED_BIT_19) /* Similar to BASE_MEM_TILER_ALIGN_TOP, memory starting from the end of the * initial commit is aligned to 'extension' pages, where 'extension' must be a power diff --git a/common/include/uapi/gpu/arm/midgard/jm/mali_kbase_jm_ioctl.h b/common/include/uapi/gpu/arm/midgard/jm/mali_kbase_jm_ioctl.h index 2a7a06a..43e35a7 100644 --- a/common/include/uapi/gpu/arm/midgard/jm/mali_kbase_jm_ioctl.h +++ b/common/include/uapi/gpu/arm/midgard/jm/mali_kbase_jm_ioctl.h @@ -149,15 +149,19 @@ * from the parent process. * 11.40: * - Remove KBASE_IOCTL_HWCNT_READER_SETUP and KBASE_HWCNT_READER_* ioctls. + * - Made the BASE_MEM_DONT_NEED memory flag queryable. * 11.41: * - Disallows changing the sharability on the GPU of imported dma-bufs to * BASE_MEM_COHERENT_SYSTEM using KBASE_IOCTL_MEM_FLAGS_CHANGE. * 11.42: * - Implement full block state support for hardware counters. + * 11.43: + * - Made the BASE_MEM_IMPORT_SYNC_ON_MAP_UNMAP and BASE_MEM_KERNEL_SYNC memory + * flags queryable. */ #define BASE_UK_VERSION_MAJOR 11 -#define BASE_UK_VERSION_MINOR 42 +#define BASE_UK_VERSION_MINOR 43 /** * struct kbase_ioctl_version_check - Check version compatibility between diff --git a/common/include/uapi/gpu/arm/midgard/mali_base_kernel.h b/common/include/uapi/gpu/arm/midgard/mali_base_kernel.h index cb1a1e8..19df2d1 100644 --- a/common/include/uapi/gpu/arm/midgard/mali_base_kernel.h +++ b/common/include/uapi/gpu/arm/midgard/mali_base_kernel.h @@ -89,10 +89,9 @@ typedef __u32 base_mem_alloc_flags; /* A mask of all the flags that can be returned via the base_mem_get_flags() * interface. 
*/ -#define BASE_MEM_FLAGS_QUERYABLE \ - (BASE_MEM_FLAGS_INPUT_MASK & \ - ~(BASE_MEM_SAME_VA | BASE_MEM_COHERENT_SYSTEM_REQUIRED | BASE_MEM_IMPORT_SHARED | \ - BASE_MEM_FLAGS_RESERVED | BASEP_MEM_FLAGS_KERNEL_ONLY)) +#define BASE_MEM_FLAGS_QUERYABLE \ + (BASE_MEM_FLAGS_INPUT_MASK & ~(BASE_MEM_DONT_QUERY | BASE_MEM_FLAGS_RESERVED | \ + BASE_MEM_FLAGS_UNUSED | BASEP_MEM_FLAGS_KERNEL_ONLY)) /** * enum base_mem_import_type - Memory types supported by @a base_mem_import diff --git a/mali_kbase/Kbuild b/mali_kbase/Kbuild index 598b021..8a85857 100644 --- a/mali_kbase/Kbuild +++ b/mali_kbase/Kbuild @@ -69,7 +69,7 @@ endif # # Driver version string which is returned to userspace via an ioctl -MALI_RELEASE_NAME ?= '"r47p0-01eac0"' +MALI_RELEASE_NAME ?= '"r48p0-01eac0"' # Set up defaults if not defined by build system ifeq ($(CONFIG_MALI_DEBUG), y) MALI_UNIT_TEST = 1 diff --git a/mali_kbase/Kconfig b/mali_kbase/Kconfig index c49c49b..57e8e32 100644 --- a/mali_kbase/Kconfig +++ b/mali_kbase/Kconfig @@ -72,7 +72,6 @@ config MALI_NO_MALI_DEFAULT_GPU help This option sets the default GPU to identify as for No Mali builds. - endchoice menu "Platform specific options" @@ -217,16 +216,6 @@ config MALI_CORESTACK If unsure, say N. -comment "Platform options" - depends on MALI_MIDGARD && MALI_EXPERT - -config MALI_ERROR_INJECT - bool "Enable No Mali error injection" - depends on MALI_MIDGARD && MALI_EXPERT && MALI_NO_MALI - default n - help - Enables insertion of errors to test module failure and recovery mechanisms. - comment "Debug options" depends on MALI_MIDGARD && MALI_EXPERT diff --git a/mali_kbase/Makefile b/mali_kbase/Makefile index 07c3019..acbd4fa 100644 --- a/mali_kbase/Makefile +++ b/mali_kbase/Makefile @@ -76,7 +76,6 @@ ifeq ($(MALI_KCONFIG_EXT_PREFIX),) else # Prevent misuse when CONFIG_MALI_NO_MALI=n CONFIG_MALI_REAL_HW = y - CONFIG_MALI_ERROR_INJECT = n endif @@ -108,7 +107,6 @@ ifeq ($(MALI_KCONFIG_EXT_PREFIX),) CONFIG_MALI_JOB_DUMP = n CONFIG_MALI_NO_MALI = n CONFIG_MALI_REAL_HW = y - CONFIG_MALI_ERROR_INJECT = n CONFIG_MALI_HW_ERRATA_1485982_NOT_AFFECTED = n CONFIG_MALI_HW_ERRATA_1485982_USE_CLOCK_ALTERNATIVE = n CONFIG_MALI_PRFCNT_SET_SELECT_VIA_DEBUG_FS = n @@ -171,7 +169,6 @@ ifeq ($(MALI_KCONFIG_EXT_PREFIX),) CONFIG_MALI_PWRSOFT_765 \ CONFIG_MALI_JOB_DUMP \ CONFIG_MALI_NO_MALI \ - CONFIG_MALI_ERROR_INJECT \ CONFIG_MALI_HW_ERRATA_1485982_NOT_AFFECTED \ CONFIG_MALI_HW_ERRATA_1485982_USE_CLOCK_ALTERNATIVE \ CONFIG_MALI_PRFCNT_SET_PRIMARY \ @@ -272,6 +269,8 @@ CFLAGS_MODULE += -Wmissing-field-initializers CFLAGS_MODULE += -Wno-type-limits CFLAGS_MODULE += $(call cc-option, -Wmaybe-uninitialized) CFLAGS_MODULE += $(call cc-option, -Wunused-macros) +# The following ensures the stack frame does not get larger than a page +CFLAGS_MODULE += -Wframe-larger-than=4096 KBUILD_CPPFLAGS += -DKBUILD_EXTRA_WARN2 diff --git a/mali_kbase/Mconfig b/mali_kbase/Mconfig index b3c5323..07bb50e 100644 --- a/mali_kbase/Mconfig +++ b/mali_kbase/Mconfig @@ -64,7 +64,6 @@ config MALI_NO_MALI All calls to the simulated hardware will complete immediately as if the hardware completed the task. - endchoice @@ -206,45 +205,6 @@ config LARGE_PAGE_SUPPORT If in doubt, say Y. -choice - prompt "Error injection level" - depends on MALI_MIDGARD && MALI_EXPERT - default MALI_ERROR_INJECT_NONE - help - Enables insertion of errors to test module failure and recovery mechanisms. - -config MALI_ERROR_INJECT_NONE - bool "disabled" - depends on MALI_MIDGARD && MALI_EXPERT - help - Error injection is disabled. 
- -config MALI_ERROR_INJECT_TRACK_LIST - bool "error track list" - depends on MALI_MIDGARD && MALI_EXPERT && NO_MALI - help - Errors to inject are pre-configured by the user. - -config MALI_ERROR_INJECT_RANDOM - bool "random error injection" - depends on MALI_MIDGARD && MALI_EXPERT && NO_MALI - help - Injected errors are random, rather than user-driven. - -endchoice - -config MALI_ERROR_INJECT_ON - string - depends on MALI_MIDGARD && MALI_EXPERT - default "0" if MALI_ERROR_INJECT_NONE - default "1" if MALI_ERROR_INJECT_TRACK_LIST - default "2" if MALI_ERROR_INJECT_RANDOM - -config MALI_ERROR_INJECT - bool - depends on MALI_MIDGARD && MALI_EXPERT - default y if !MALI_ERROR_INJECT_NONE - config MALI_DEBUG bool "Enable debug build" depends on MALI_MIDGARD && MALI_EXPERT diff --git a/mali_kbase/backend/gpu/Kbuild b/mali_kbase/backend/gpu/Kbuild index a06b15d..c91f147 100644 --- a/mali_kbase/backend/gpu/Kbuild +++ b/mali_kbase/backend/gpu/Kbuild @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note # -# (C) COPYRIGHT 2014-2022 ARM Limited. All rights reserved. +# (C) COPYRIGHT 2014-2023 ARM Limited. All rights reserved. # # This program is free software and is provided to you under the terms of the # GNU General Public License version 2 as published by the Free Software @@ -47,12 +47,7 @@ endif mali_kbase-$(CONFIG_MALI_DEVFREQ) += \ backend/gpu/mali_kbase_devfreq.o -ifneq ($(CONFIG_MALI_REAL_HW),y) - mali_kbase-y += backend/gpu/mali_kbase_model_linux.o -endif +mali_kbase-$(CONFIG_MALI_NO_MALI) += backend/gpu/mali_kbase_model_linux.o # NO_MALI Dummy model interface mali_kbase-$(CONFIG_MALI_NO_MALI) += backend/gpu/mali_kbase_model_dummy.o -# HW error simulation -mali_kbase-$(CONFIG_MALI_NO_MALI) += backend/gpu/mali_kbase_model_error_generator.o - diff --git a/mali_kbase/backend/gpu/mali_kbase_devfreq.c b/mali_kbase/backend/gpu/mali_kbase_devfreq.c index 905d188..2c1feed 100644 --- a/mali_kbase/backend/gpu/mali_kbase_devfreq.c +++ b/mali_kbase/backend/gpu/mali_kbase_devfreq.c @@ -431,7 +431,7 @@ static int kbase_devfreq_init_core_mask_table(struct kbase_device *kbdev) err = of_property_read_u64(node, "opp-hz-real", real_freqs); #endif if (err < 0) { - dev_warn(kbdev->dev, "Failed to read opp-hz-real property with error %d\n", + dev_warn(kbdev->dev, "Failed to read opp-hz-real property with error %d", err); continue; } @@ -439,8 +439,8 @@ static int kbase_devfreq_init_core_mask_table(struct kbase_device *kbdev) err = of_property_read_u32_array(node, "opp-microvolt", opp_volts, kbdev->nr_regulators); if (err < 0) { - dev_warn(kbdev->dev, - "Failed to read opp-microvolt property with error %d\n", err); + dev_warn(kbdev->dev, "Failed to read opp-microvolt property with error %d", + err); continue; } #endif @@ -450,11 +450,12 @@ static int kbase_devfreq_init_core_mask_table(struct kbase_device *kbdev) if (core_mask != shader_present && corestack_driver_control) { dev_warn( kbdev->dev, - "Ignoring OPP %llu - Dynamic Core Scaling not supported on this GPU\n", + "Ignoring OPP %llu - Dynamic Core Scaling not supported on this GPU", opp_freq); continue; } + core_count_p = of_get_property(node, "opp-core-count", NULL); if (core_count_p) { u64 remaining_core_mask = kbdev->gpu_props.shader_present; diff --git a/mali_kbase/backend/gpu/mali_kbase_irq_internal.h b/mali_kbase/backend/gpu/mali_kbase_irq_internal.h index 4374793..4798df9 100644 --- a/mali_kbase/backend/gpu/mali_kbase_irq_internal.h +++ b/mali_kbase/backend/gpu/mali_kbase_irq_internal.h @@ -74,7 +74,7 @@ void 
kbase_synchronize_irqs(struct kbase_device *kbdev); * Return: 0 on success. Error code (negative) on failure. */ int kbase_validate_interrupts(struct kbase_device *const kbdev); -#endif /* CONFIG_MALI_REAL_HW */ +#endif /* IS_ENABLED(CONFIG_MALI_REAL_HW) */ #endif /* CONFIG_MALI_DEBUG */ /** diff --git a/mali_kbase/backend/gpu/mali_kbase_jm_rb.c b/mali_kbase/backend/gpu/mali_kbase_jm_rb.c index 3868799..a8b75f2 100644 --- a/mali_kbase/backend/gpu/mali_kbase_jm_rb.c +++ b/mali_kbase/backend/gpu/mali_kbase_jm_rb.c @@ -1437,7 +1437,7 @@ void kbase_backend_reset(struct kbase_device *kbdev, ktime_t *end_timestamp) * then leave it in the RB and next time we're kicked * it will be processed again from the starting state. */ - if (keep_in_jm_rb) { + if (!kbase_is_gpu_removed(kbdev) && keep_in_jm_rb) { katom->protected_state.exit = KBASE_ATOM_EXIT_PROTECTED_CHECK; /* As the atom was not removed, increment the * index so that we read the correct atom in the diff --git a/mali_kbase/backend/gpu/mali_kbase_model_dummy.c b/mali_kbase/backend/gpu/mali_kbase_model_dummy.c index c340760..b034ffe 100644 --- a/mali_kbase/backend/gpu/mali_kbase_model_dummy.c +++ b/mali_kbase/backend/gpu/mali_kbase_model_dummy.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2014-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2014-2024 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -25,42 +25,8 @@ * insmod'ing mali_kbase.ko with no arguments after a build with "scons * gpu=tXYZ" will yield the expected GPU ID for tXYZ. This can always be * overridden by passing the 'no_mali_gpu' argument to insmod. - * - * - if CONFIG_MALI_ERROR_INJECT is defined the error injection system is - * activated. */ -/* Implementation of failure injection system: - * - * Error conditions are generated by gpu_generate_error(). - * According to CONFIG_MALI_ERROR_INJECT definition gpu_generate_error() either - * generates an error HW condition randomly (CONFIG_MALI_ERROR_INJECT_RANDOM) or - * checks if there is (in error_track_list) an error configuration to be set for - * the current job chain (CONFIG_MALI_ERROR_INJECT_RANDOM not defined). - * Each error condition will trigger a specific "state" for a certain set of - * registers as per Midgard Architecture Specifications doc. 
- * - * According to Midgard Architecture Specifications doc the following registers - * are always affected by error conditions: - * - * JOB Exception: - * JOB_IRQ_RAWSTAT - * JOB<n> STATUS AREA - * - * MMU Exception: - * MMU_IRQ_RAWSTAT - * AS<n>_FAULTSTATUS - * AS<n>_FAULTADDRESS - * - * GPU Exception: - * GPU_IRQ_RAWSTAT - * GPU_FAULTSTATUS - * GPU_FAULTADDRESS - * - * For further clarification on the model behaviour upon specific error - * conditions the user may refer to the Midgard Architecture Specification - * document - */ #include <mali_kbase.h> #include <device/mali_kbase_device.h> #include <hw_access/mali_kbase_hw_access_regmap.h> @@ -126,7 +92,7 @@ struct error_status_t hw_error_status; */ struct control_reg_values_t { const char *name; - u32 gpu_id; + u64 gpu_id; u32 as_present; u32 thread_max_threads; u32 thread_max_workgroup_size; @@ -524,7 +490,7 @@ MODULE_PARM_DESC(no_mali_gpu, "GPU to identify as"); static u32 gpu_model_get_prfcnt_value(enum kbase_ipa_core_type core_type, u32 cnt_idx, bool is_low_word) { - u64 *counters_data; + u64 *counters_data = NULL; u32 core_count = 0; u32 event_index; u64 value = 0; @@ -580,6 +546,9 @@ static u32 gpu_model_get_prfcnt_value(enum kbase_ipa_core_type core_type, u32 cn break; } + if (unlikely(counters_data == NULL)) + return 0; + for (core = 0; core < core_count; core++) { value += counters_data[event_index]; event_index += KBASE_DUMMY_MODEL_COUNTER_PER_CORE; @@ -1172,9 +1141,6 @@ static void midgard_model_update(void *h) /*this job is done assert IRQ lines */ signal_int(dummy, i); -#ifdef CONFIG_MALI_ERROR_INJECT - midgard_set_error(i); -#endif /* CONFIG_MALI_ERROR_INJECT */ update_register_statuses(dummy, i); /*if this job slot returned failures we cannot use it */ if (hw_error_status.job_irq_rawstat & (1u << (i + 16))) { @@ -1564,6 +1530,7 @@ void midgard_model_write_reg(void *h, u32 addr, u32 value) case L2_PWROFF_HI: case PWR_KEY: case PWR_OVERRIDE0: + case PWR_OVERRIDE1: #if MALI_USE_CSF case SHADER_PWRFEATURES: case CSF_CONFIG: @@ -1607,8 +1574,7 @@ void midgard_model_read_reg(void *h, u32 addr, u32 *const value) #else /* !MALI_USE_CSF */ if (addr == GPU_CONTROL_REG(GPU_ID)) { #endif /* !MALI_USE_CSF */ - - *value = dummy->control_reg_values->gpu_id; + *value = dummy->control_reg_values->gpu_id & U32_MAX; } else if (addr == JOB_CONTROL_REG(JOB_IRQ_RAWSTAT)) { *value = hw_error_status.job_irq_rawstat; pr_debug("%s", "JS_IRQ_RAWSTAT being read"); @@ -2166,9 +2132,3 @@ int gpu_model_control(void *model, struct kbase_model_control_params *params) return 0; } - -u64 midgard_model_arch_timer_get_cntfrq(void *h) -{ - CSTD_UNUSED(h); - return arch_timer_get_cntfrq(); -} diff --git a/mali_kbase/backend/gpu/mali_kbase_model_linux.h b/mali_kbase/backend/gpu/mali_kbase_model_linux.h index 65eb620..d38bb88 100644 --- a/mali_kbase/backend/gpu/mali_kbase_model_linux.h +++ b/mali_kbase/backend/gpu/mali_kbase_model_linux.h @@ -48,12 +48,8 @@ /* * Include Model definitions */ - -#if IS_ENABLED(CONFIG_MALI_NO_MALI) #include <backend/gpu/mali_kbase_model_dummy.h> -#endif /* IS_ENABLED(CONFIG_MALI_NO_MALI) */ -#if !IS_ENABLED(CONFIG_MALI_REAL_HW) /** * kbase_gpu_device_create() - Generic create function. * @@ -117,15 +113,6 @@ void midgard_model_write_reg(void *h, u32 addr, u32 value); void midgard_model_read_reg(void *h, u32 addr, u32 *const value); /** - * midgard_model_arch_timer_get_cntfrq - Get Model specific System Timer Frequency - * - * @h: Model handle. 
- * - * Return: Frequency in Hz - */ -u64 midgard_model_arch_timer_get_cntfrq(void *h); - -/** * gpu_device_raise_irq() - Private IRQ raise function. * * @model: Model handle. @@ -155,6 +142,5 @@ void gpu_device_set_data(void *model, void *data); * Return: Pointer to the data carried by model. */ void *gpu_device_get_data(void *model); -#endif /* !IS_ENABLED(CONFIG_MALI_REAL_HW) */ #endif /* _KBASE_MODEL_LINUX_H_ */ diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_backend.c b/mali_kbase/backend/gpu/mali_kbase_pm_backend.c index 030d56a..801db54 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_backend.c +++ b/mali_kbase/backend/gpu/mali_kbase_pm_backend.c @@ -36,6 +36,9 @@ #include <linux/version_compat_defs.h> #include <linux/pm_runtime.h> #include <mali_kbase_reset_gpu.h> +#ifdef CONFIG_MALI_ARBITER_SUPPORT +#include <csf/mali_kbase_csf_scheduler.h> +#endif /* !CONFIG_MALI_ARBITER_SUPPORT */ #endif /* !MALI_USE_CSF */ #include <hwcnt/mali_kbase_hwcnt_context.h> #include <backend/gpu/mali_kbase_pm_internal.h> @@ -860,9 +863,11 @@ void kbase_pm_set_debug_core_mask(struct kbase_device *kbdev, u64 new_core_mask) } KBASE_EXPORT_TEST_API(kbase_pm_set_debug_core_mask); #else -void kbase_pm_set_debug_core_mask(struct kbase_device *kbdev, u64 new_core_mask_js0, - u64 new_core_mask_js1, u64 new_core_mask_js2) +void kbase_pm_set_debug_core_mask(struct kbase_device *kbdev, u64 *new_core_mask, + size_t new_core_mask_size) { + size_t i; + lockdep_assert_held(&kbdev->hwaccess_lock); lockdep_assert_held(&kbdev->pm.lock); @@ -870,13 +875,14 @@ void kbase_pm_set_debug_core_mask(struct kbase_device *kbdev, u64 new_core_mask_ dev_warn_once( kbdev->dev, "Change of core mask not supported for slot 0 as dummy job WA is enabled"); - new_core_mask_js0 = kbdev->pm.debug_core_mask[0]; + new_core_mask[0] = kbdev->pm.debug_core_mask[0]; } - kbdev->pm.debug_core_mask[0] = new_core_mask_js0; - kbdev->pm.debug_core_mask[1] = new_core_mask_js1; - kbdev->pm.debug_core_mask[2] = new_core_mask_js2; - kbdev->pm.debug_core_mask_all = new_core_mask_js0 | new_core_mask_js1 | new_core_mask_js2; + kbdev->pm.debug_core_mask_all = 0; + for (i = 0; i < new_core_mask_size; i++) { + kbdev->pm.debug_core_mask[i] = new_core_mask[i]; + kbdev->pm.debug_core_mask_all |= new_core_mask[i]; + } kbase_pm_update_dynamic_cores_onoff(kbdev); } @@ -962,7 +968,9 @@ void kbase_hwaccess_pm_resume(struct kbase_device *kbdev) void kbase_pm_handle_gpu_lost(struct kbase_device *kbdev) { unsigned long flags; -#if !MALI_USE_CSF +#if MALI_USE_CSF + unsigned long flags_sched; +#else ktime_t end_timestamp = ktime_get_raw(); #endif struct kbase_arbiter_vm_state *arb_vm_state = kbdev->pm.arb_vm_state; @@ -981,24 +989,41 @@ void kbase_pm_handle_gpu_lost(struct kbase_device *kbdev) */ WARN(!kbase_is_gpu_removed(kbdev), "GPU is still available after GPU lost event\n"); - /* Full GPU reset will have been done by hypervisor, so - * cancel - */ +#if MALI_USE_CSF + /* Full GPU reset will have been done by hypervisor, so cancel */ + kbase_reset_gpu_prevent_and_wait(kbdev); + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + kbase_csf_scheduler_spin_lock(kbdev, &flags_sched); + atomic_set(&kbdev->hwaccess.backend.reset_gpu, KBASE_RESET_GPU_NOT_PENDING); + kbase_csf_scheduler_spin_unlock(kbdev, flags_sched); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + kbase_synchronize_irqs(kbdev); + + /* Scheduler reset happens outside of spinlock due to the mutex it acquires */ + kbase_csf_scheduler_reset(kbdev); + + /* Update kbase status */ + 
spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + kbdev->protected_mode = false; + kbase_pm_update_state(kbdev); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); +#else + /* Full GPU reset will have been done by hypervisor, so cancel */ atomic_set(&kbdev->hwaccess.backend.reset_gpu, KBASE_RESET_GPU_NOT_PENDING); hrtimer_cancel(&kbdev->hwaccess.backend.reset_timer); + kbase_synchronize_irqs(kbdev); /* Clear all jobs running on the GPU */ spin_lock_irqsave(&kbdev->hwaccess_lock, flags); kbdev->protected_mode = false; -#if !MALI_USE_CSF kbase_backend_reset(kbdev, &end_timestamp); kbase_pm_metrics_update(kbdev, NULL); -#endif kbase_pm_update_state(kbdev); spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); -#if !MALI_USE_CSF /* Cancel any pending HWC dumps */ spin_lock_irqsave(&kbdev->hwcnt.lock, flags); if (kbdev->hwcnt.backend.state == KBASE_INSTR_STATE_DUMPING || @@ -1008,12 +1033,11 @@ void kbase_pm_handle_gpu_lost(struct kbase_device *kbdev) wake_up(&kbdev->hwcnt.backend.wait); } spin_unlock_irqrestore(&kbdev->hwcnt.lock, flags); -#endif +#endif /* MALI_USE_CSF */ } mutex_unlock(&arb_vm_state->vm_state_lock); mutex_unlock(&kbdev->pm.lock); } - #endif /* CONFIG_MALI_ARBITER_SUPPORT */ #if MALI_USE_CSF && defined(KBASE_PM_RUNTIME) @@ -1069,7 +1093,7 @@ static int pm_handle_mcu_sleep_on_runtime_suspend(struct kbase_device *kbdev) */ spin_lock_irqsave(&kbdev->hwaccess_lock, flags); if (kbdev->pm.backend.gpu_sleep_mode_active && kbdev->pm.backend.exit_gpu_sleep_mode && - !work_pending(&kbdev->csf.scheduler.gpu_idle_work)) { + !atomic_read(&kbdev->csf.scheduler.pending_gpu_idle_work)) { u32 glb_req = kbase_csf_firmware_global_input_read(&kbdev->csf.global_iface, GLB_REQ); u32 glb_ack = kbase_csf_firmware_global_output(&kbdev->csf.global_iface, GLB_ACK); diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_driver.c b/mali_kbase/backend/gpu/mali_kbase_pm_driver.c index f042b48..7bbfef8 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_driver.c +++ b/mali_kbase/backend/gpu/mali_kbase_pm_driver.c @@ -3139,6 +3139,7 @@ static int kbase_set_tiler_quirks(struct kbase_device *kbdev) return 0; } + static int kbase_pm_hw_issues_detect(struct kbase_device *kbdev) { struct device_node *np = kbdev->dev->of_node; @@ -3191,6 +3192,7 @@ static int kbase_pm_hw_issues_detect(struct kbase_device *kbdev) error = kbase_set_mmu_quirks(kbdev); } + return error; } @@ -3210,6 +3212,7 @@ static void kbase_pm_hw_issues_apply(struct kbase_device *kbdev) #else kbase_reg_write32(kbdev, GPU_CONTROL_ENUM(JM_CONFIG), kbdev->hw_quirks_gpu); #endif + } void kbase_pm_cache_snoop_enable(struct kbase_device *kbdev) diff --git a/mali_kbase/backend/gpu/mali_kbase_time.c b/mali_kbase/backend/gpu/mali_kbase_time.c index dfdf469..c403161 100644 --- a/mali_kbase/backend/gpu/mali_kbase_time.c +++ b/mali_kbase/backend/gpu/mali_kbase_time.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2014-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2014-2024 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -30,10 +30,7 @@ #include <mali_kbase_config_defaults.h> #include <linux/version_compat_defs.h> #include <asm/arch_timer.h> - -#if !IS_ENABLED(CONFIG_MALI_REAL_HW) -#include <backend/gpu/mali_kbase_model_linux.h> -#endif +#include <linux/mali_hw_access.h> struct kbase_timeout_info { char *selector_str; @@ -47,6 +44,7 @@ static struct kbase_timeout_info timeout_info[KBASE_TIMEOUT_SELECTOR_COUNT] = { [CSF_PM_TIMEOUT] = { "CSF_PM_TIMEOUT", CSF_PM_TIMEOUT_CYCLES }, [CSF_GPU_RESET_TIMEOUT] = { "CSF_GPU_RESET_TIMEOUT", CSF_GPU_RESET_TIMEOUT_CYCLES }, [CSF_CSG_SUSPEND_TIMEOUT] = { "CSF_CSG_SUSPEND_TIMEOUT", CSF_CSG_SUSPEND_TIMEOUT_CYCLES }, + [CSF_CSG_TERM_TIMEOUT] = { "CSF_CSG_TERM_TIMEOUT", CSF_CSG_TERM_TIMEOUT_CYCLES }, [CSF_FIRMWARE_BOOT_TIMEOUT] = { "CSF_FIRMWARE_BOOT_TIMEOUT", CSF_FIRMWARE_BOOT_TIMEOUT_CYCLES }, [CSF_FIRMWARE_PING_TIMEOUT] = { "CSF_FIRMWARE_PING_TIMEOUT", @@ -307,11 +305,7 @@ static void get_cpu_gpu_time(struct kbase_device *kbdev, u64 *cpu_ts, u64 *gpu_t u64 kbase_arch_timer_get_cntfrq(struct kbase_device *kbdev) { - u64 freq = arch_timer_get_cntfrq(); - -#if !IS_ENABLED(CONFIG_MALI_REAL_HW) - freq = midgard_model_arch_timer_get_cntfrq(kbdev->model); -#endif + u64 freq = mali_arch_timer_get_cntfrq(); dev_dbg(kbdev->dev, "System Timer Freq = %lluHz", freq); diff --git a/mali_kbase/build.bp b/mali_kbase/build.bp index 77e193a..a0570c2 100644 --- a/mali_kbase/build.bp +++ b/mali_kbase/build.bp @@ -71,18 +71,6 @@ bob_defaults { mali_real_hw: { kbuild_options: ["CONFIG_MALI_REAL_HW=y"], }, - mali_error_inject_none: { - kbuild_options: ["CONFIG_MALI_ERROR_INJECT_NONE=y"], - }, - mali_error_inject_track_list: { - kbuild_options: ["CONFIG_MALI_ERROR_INJECT_TRACK_LIST=y"], - }, - mali_error_inject_random: { - kbuild_options: ["CONFIG_MALI_ERROR_INJECT_RANDOM=y"], - }, - mali_error_inject: { - kbuild_options: ["CONFIG_MALI_ERROR_INJECT=y"], - }, mali_debug: { kbuild_options: [ "CONFIG_MALI_DEBUG=y", diff --git a/mali_kbase/context/backend/mali_kbase_context_csf.c b/mali_kbase/context/backend/mali_kbase_context_csf.c index 8b14108..1bf5f9b 100644 --- a/mali_kbase/context/backend/mali_kbase_context_csf.c +++ b/mali_kbase/context/backend/mali_kbase_context_csf.c @@ -187,11 +187,17 @@ void kbase_destroy_context(struct kbase_context *kctx) * Customer side that a hang could occur if context termination is * not blocked until the resume of GPU device. */ +#ifdef CONFIG_MALI_ARBITER_SUPPORT + atomic_inc(&kbdev->pm.gpu_users_waiting); +#endif /* CONFIG_MALI_ARBITER_SUPPORT */ while (kbase_pm_context_active_handle_suspend(kbdev, KBASE_PM_SUSPEND_HANDLER_DONT_INCREASE)) { dev_info(kbdev->dev, "Suspend in progress when destroying context"); wait_event(kbdev->pm.resume_wait, !kbase_pm_is_suspending(kbdev)); } +#ifdef CONFIG_MALI_ARBITER_SUPPORT + atomic_dec(&kbdev->pm.gpu_users_waiting); +#endif /* CONFIG_MALI_ARBITER_SUPPORT */ /* Have synchronized against the System suspend and incremented the * pm.active_count. So any subsequent invocation of System suspend diff --git a/mali_kbase/csf/mali_kbase_csf.c b/mali_kbase/csf/mali_kbase_csf.c index a8b5052..894eac1 100644 --- a/mali_kbase/csf/mali_kbase_csf.c +++ b/mali_kbase/csf/mali_kbase_csf.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2018-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2018-2024 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -534,6 +534,8 @@ static int csf_queue_register_internal(struct kbase_context *kctx, queue->blocked_reason = CS_STATUS_BLOCKED_REASON_REASON_UNBLOCKED; + queue->clear_faults = true; + INIT_LIST_HEAD(&queue->link); atomic_set(&queue->pending_kick, 0); INIT_LIST_HEAD(&queue->pending_kick_link); @@ -729,7 +731,7 @@ out: } /** - * get_bound_queue_group - Get the group to which a queue was bound + * get_bound_queue_group() - Get the group to which a queue was bound * * @queue: Pointer to the queue for this group * @@ -842,6 +844,48 @@ void kbase_csf_ring_cs_kernel_doorbell(struct kbase_device *kbdev, int csi_index kbase_csf_ring_csg_doorbell(kbdev, csg_nr); } +int kbase_csf_queue_group_clear_faults(struct kbase_context *kctx, + struct kbase_ioctl_queue_group_clear_faults *faults) +{ + void __user *user_bufs = u64_to_user_ptr(faults->addr); + u32 i; + int ret = 0; + struct kbase_device *kbdev = kctx->kbdev; + const u32 nr_queues = faults->nr_queues; + + if (unlikely(nr_queues > kbdev->csf.global_iface.groups[0].stream_num)) { + dev_warn(kbdev->dev, "Invalid nr_queues %u", nr_queues); + return -EINVAL; + } + + for (i = 0; i < nr_queues; ++i) { + u64 buf_gpu_addr; + struct kbase_va_region *region; + + if (copy_from_user(&buf_gpu_addr, user_bufs, sizeof(buf_gpu_addr))) + return -EFAULT; + + kbase_gpu_vm_lock(kctx); + region = kbase_region_tracker_find_region_enclosing_address(kctx, buf_gpu_addr); + if (likely(!kbase_is_region_invalid_or_free(region))) { + struct kbase_queue *queue = region->user_data; + + queue->clear_faults = true; + } else { + dev_warn(kbdev->dev, "GPU queue %u without a valid command buffer region", + i); + ret = -EFAULT; + goto out_unlock; + } + kbase_gpu_vm_unlock(kctx); + + user_bufs = (void __user *)((uintptr_t)user_bufs + sizeof(buf_gpu_addr)); + } +out_unlock: + kbase_gpu_vm_unlock(kctx); + return ret; +} + int kbase_csf_queue_kick(struct kbase_context *kctx, struct kbase_ioctl_cs_queue_kick *kick) { struct kbase_device *kbdev = kctx->kbdev; @@ -863,7 +907,7 @@ int kbase_csf_queue_kick(struct kbase_context *kctx, struct kbase_ioctl_cs_queue struct kbase_queue *queue = region->user_data; if (queue && (queue->bind_state == KBASE_CSF_QUEUE_BOUND)) { - spin_lock(&kbdev->csf.pending_gpuq_kicks_lock); + spin_lock(&kbdev->csf.pending_gpuq_kick_queues_lock); if (list_empty(&queue->pending_kick_link)) { /* Queue termination shall block until this * kick has been handled. 
@@ -871,10 +915,12 @@ int kbase_csf_queue_kick(struct kbase_context *kctx, struct kbase_ioctl_cs_queue atomic_inc(&queue->pending_kick); list_add_tail( &queue->pending_kick_link, - &kbdev->csf.pending_gpuq_kicks[queue->group_priority]); - complete(&kbdev->csf.scheduler.kthread_signal); + &kbdev->csf.pending_gpuq_kick_queues[queue->group_priority]); + if (atomic_cmpxchg(&kbdev->csf.pending_gpuq_kicks, false, true) == + false) + complete(&kbdev->csf.scheduler.kthread_signal); } - spin_unlock(&kbdev->csf.pending_gpuq_kicks_lock); + spin_unlock(&kbdev->csf.pending_gpuq_kick_queues_lock); } } else { dev_dbg(kbdev->dev, @@ -1090,12 +1136,11 @@ static int create_normal_suspend_buffer(struct kbase_context *const kctx, } static void timer_event_worker(struct work_struct *data); -static void protm_event_worker(struct work_struct *data); static void term_normal_suspend_buffer(struct kbase_context *const kctx, struct kbase_normal_suspend_buffer *s_buf); /** - * create_suspend_buffers - Setup normal and protected mode + * create_suspend_buffers() - Setup normal and protected mode * suspend buffers. * * @kctx: Address of the kbase context within which the queue group @@ -1201,7 +1246,8 @@ static int create_queue_group(struct kbase_context *const kctx, INIT_LIST_HEAD(&group->link_to_schedule); INIT_LIST_HEAD(&group->error_fatal.link); INIT_WORK(&group->timer_event_work, timer_event_worker); - INIT_WORK(&group->protm_event_work, protm_event_worker); + INIT_LIST_HEAD(&group->protm_event_work); + atomic_set(&group->pending_protm_event_work, 0); bitmap_zero(group->protm_pending_bitmap, MAX_SUPPORTED_STREAMS_PER_GROUP); group->run_state = KBASE_CSF_GROUP_INACTIVE; @@ -1374,7 +1420,7 @@ void kbase_csf_term_descheduled_queue_group(struct kbase_queue_group *group) } /** - * term_queue_group - Terminate a GPU command queue group. + * term_queue_group() - Terminate a GPU command queue group. * * @group: Pointer to GPU command queue group data. * @@ -1402,8 +1448,8 @@ static void term_queue_group(struct kbase_queue_group *group) } /** - * wait_group_deferred_deschedule_completion - Wait for refcount of the group to - * become 0 that was taken when the group deschedule had to be deferred. + * wait_group_deferred_deschedule_completion() - Wait for refcount of the group + * to become 0 that was taken when the group deschedule had to be deferred. * * @group: Pointer to GPU command queue group that is being deleted. * @@ -1432,7 +1478,10 @@ static void wait_group_deferred_deschedule_completion(struct kbase_queue_group * static void cancel_queue_group_events(struct kbase_queue_group *group) { cancel_work_sync(&group->timer_event_work); - cancel_work_sync(&group->protm_event_work); + + /* Drain a pending protected mode request if any */ + kbase_csf_scheduler_wait_for_kthread_pending_work(group->kctx->kbdev, + &group->pending_protm_event_work); } static void remove_pending_group_fatal_error(struct kbase_queue_group *group) @@ -1587,6 +1636,7 @@ int kbase_csf_ctx_init(struct kbase_context *kctx) INIT_LIST_HEAD(&kctx->csf.queue_list); INIT_LIST_HEAD(&kctx->csf.link); + atomic_set(&kctx->csf.pending_sync_update, 0); kbase_csf_event_init(kctx); @@ -1822,7 +1872,7 @@ void kbase_csf_ctx_term(struct kbase_context *kctx) } /** - * handle_oom_event - Handle the OoM event generated by the firmware for the + * handle_oom_event() - Handle the OoM event generated by the firmware for the * CSI. * * @group: Pointer to the CSG group the oom-event belongs to. 
@@ -1892,7 +1942,7 @@ static int handle_oom_event(struct kbase_queue_group *const group, } /** - * report_tiler_oom_error - Report a CSG error due to a tiler heap OOM event + * report_tiler_oom_error() - Report a CSG error due to a tiler heap OOM event * * @group: Pointer to the GPU command queue group that encountered the error */ @@ -1935,7 +1985,7 @@ static void flush_gpu_cache_on_fatal_error(struct kbase_device *kbdev) } /** - * kbase_queue_oom_event - Handle tiler out-of-memory for a GPU command queue. + * kbase_queue_oom_event() - Handle tiler out-of-memory for a GPU command queue. * * @queue: Pointer to queue for which out-of-memory event was received. * @@ -2023,7 +2073,7 @@ unlock: } /** - * oom_event_worker - Tiler out-of-memory handler called from a workqueue. + * oom_event_worker() - Tiler out-of-memory handler called from a workqueue. * * @data: Pointer to a work_struct embedded in GPU command queue data. * @@ -2051,7 +2101,8 @@ static void oom_event_worker(struct work_struct *data) } /** - * report_group_timeout_error - Report the timeout error for the group to userspace. + * report_group_timeout_error() - Report the timeout error for the group to + * userspace. * * @group: Pointer to the group for which timeout error occurred */ @@ -2075,7 +2126,7 @@ static void report_group_timeout_error(struct kbase_queue_group *const group) } /** - * timer_event_worker - Handle the progress timeout error for the group + * timer_event_worker() - Handle the progress timeout error for the group * * @data: Pointer to a work_struct embedded in GPU command queue group data. * @@ -2110,7 +2161,7 @@ static void timer_event_worker(struct work_struct *data) } /** - * handle_progress_timer_event - Progress timer timeout event handler. + * handle_progress_timer_event() - Progress timer timeout event handler. * * @group: Pointer to GPU queue group for which the timeout event is received. * @@ -2201,41 +2252,7 @@ static void report_group_fatal_error(struct kbase_queue_group *const group) } /** - * protm_event_worker - Protected mode switch request event handler - * called from a workqueue. - * - * @data: Pointer to a work_struct embedded in GPU command queue group data. - * - * Request to switch to protected mode. - */ -static void protm_event_worker(struct work_struct *data) -{ - struct kbase_queue_group *const group = - container_of(data, struct kbase_queue_group, protm_event_work); - struct kbase_protected_suspend_buffer *sbuf = &group->protected_suspend_buf; - int err = 0; - - KBASE_KTRACE_ADD_CSF_GRP(group->kctx->kbdev, PROTM_EVENT_WORKER_START, group, 0u); - - err = alloc_grp_protected_suspend_buffer_pages(group); - if (!err) { - kbase_csf_scheduler_group_protm_enter(group); - } else if (err == -ENOMEM && sbuf->alloc_retries <= PROTM_ALLOC_MAX_RETRIES) { - sbuf->alloc_retries++; - /* try again to allocate pages */ - queue_work(group->kctx->csf.wq, &group->protm_event_work); - } else if (sbuf->alloc_retries >= PROTM_ALLOC_MAX_RETRIES || err != -ENOMEM) { - dev_err(group->kctx->kbdev->dev, - "Failed to allocate physical pages for Protected mode suspend buffer for the group %d of context %d_%d", - group->handle, group->kctx->tgid, group->kctx->id); - report_group_fatal_error(group); - } - - KBASE_KTRACE_ADD_CSF_GRP(group->kctx->kbdev, PROTM_EVENT_WORKER_END, group, 0u); -} - -/** - * handle_fault_event - Handler for CS fault. + * handle_fault_event() - Handler for CS fault. * * @queue: Pointer to queue for which fault event was received. 
* @cs_ack: Value of the CS_ACK register in the CS kernel input page used for @@ -2331,7 +2348,7 @@ static void report_queue_fatal_error(struct kbase_queue *const queue, u32 cs_fat } /** - * cs_error_worker - Handle the CS_FATAL/CS_FAULT error for the GPU queue + * cs_error_worker() - Handle the CS_FATAL/CS_FAULT error for the GPU queue * * @data: Pointer to a work_struct embedded in GPU command queue. * @@ -2409,7 +2426,7 @@ unlock: } /** - * handle_fatal_event - Handler for CS fatal. + * handle_fatal_event() - Handler for CS fatal. * * @queue: Pointer to queue for which fatal event was received. * @stream: Pointer to the structure containing info provided by the @@ -2471,7 +2488,7 @@ static void handle_fatal_event(struct kbase_queue *const queue, } /** - * process_cs_interrupts - Process interrupts for a CS. + * process_cs_interrupts() - Process interrupts for a CS. * * @group: Pointer to GPU command queue group data. * @ginfo: The CSG interface provided by the firmware. @@ -2585,7 +2602,7 @@ static void process_cs_interrupts(struct kbase_queue_group *const group, } if (!group->protected_suspend_buf.pma) - queue_work(group->kctx->csf.wq, &group->protm_event_work); + kbase_csf_scheduler_enqueue_protm_event_work(group); if (test_bit(group->csg_nr, scheduler->csg_slots_idle_mask)) { clear_bit(group->csg_nr, scheduler->csg_slots_idle_mask); @@ -2598,7 +2615,7 @@ static void process_cs_interrupts(struct kbase_queue_group *const group, } /** - * process_csg_interrupts - Process interrupts for a CSG. + * process_csg_interrupts() - Process interrupts for a CSG. * * @kbdev: Instance of a GPU platform device that implements a CSF interface. * @csg_nr: CSG number. @@ -2718,7 +2735,7 @@ static void process_csg_interrupts(struct kbase_device *const kbdev, u32 const c } /** - * process_prfcnt_interrupts - Process performance counter interrupts. + * process_prfcnt_interrupts() - Process performance counter interrupts. * * @kbdev: Instance of a GPU platform device that implements a CSF interface. * @glb_req: Global request register value. @@ -2790,7 +2807,7 @@ static void process_prfcnt_interrupts(struct kbase_device *kbdev, u32 glb_req, u } /** - * check_protm_enter_req_complete - Check if PROTM_ENTER request completed + * check_protm_enter_req_complete() - Check if PROTM_ENTER request completed * * @kbdev: Instance of a GPU platform device that implements a CSF interface. * @glb_req: Global request register value. @@ -2824,7 +2841,7 @@ static inline void check_protm_enter_req_complete(struct kbase_device *kbdev, u3 } /** - * process_protm_exit - Handle the protected mode exit interrupt + * process_protm_exit() - Handle the protected mode exit interrupt * * @kbdev: Instance of a GPU platform device that implements a CSF interface. * @glb_ack: Global acknowledge register value. 
@@ -2913,7 +2930,7 @@ static inline void process_tracked_info_for_protm(struct kbase_device *kbdev, if (!tock_triggered) { dev_dbg(kbdev->dev, "Group-%d on slot-%d start protm work\n", group->handle, group->csg_nr); - queue_work(group->kctx->csf.wq, &group->protm_event_work); + kbase_csf_scheduler_enqueue_protm_event_work(group); } } } @@ -2942,6 +2959,46 @@ static void order_job_irq_clear_with_iface_mem_read(void) dmb(osh); } +static const char *const glb_fatal_status_errors[GLB_FATAL_STATUS_VALUE_COUNT] = { + [GLB_FATAL_STATUS_VALUE_OK] = "OK", + [GLB_FATAL_STATUS_VALUE_ASSERT] = "Firmware assert triggered", + [GLB_FATAL_STATUS_VALUE_UNEXPECTED_EXCEPTION] = + "Hardware raised an exception firmware did not expect", + [GLB_FATAL_STATUS_VALUE_HANG] = "Firmware hangs and watchdog timer expired", +}; + +/** + * handle_glb_fatal_event() - Handle the GLB fatal event + * + * @kbdev: Instance of GPU device. + * @global_iface: CSF global interface + */ +static void handle_glb_fatal_event(struct kbase_device *kbdev, + const struct kbase_csf_global_iface *const global_iface) +{ + const char *error_string = NULL; + const u32 fatal_status = kbase_csf_firmware_global_output(global_iface, GLB_FATAL_STATUS); + + lockdep_assert_held(&kbdev->hwaccess_lock); + kbase_csf_scheduler_spin_lock_assert_held(kbdev); + dev_warn(kbdev->dev, "MCU encountered unrecoverable error"); + + if (fatal_status < GLB_FATAL_STATUS_VALUE_COUNT) + error_string = glb_fatal_status_errors[fatal_status]; + else { + dev_err(kbdev->dev, "Invalid GLB_FATAL_STATUS (%u)", fatal_status); + return; + } + + if (fatal_status == GLB_FATAL_STATUS_VALUE_OK) + dev_err(kbdev->dev, "GLB_FATAL_STATUS(OK) must be set with proper reason"); + else { + dev_warn(kbdev->dev, "GLB_FATAL_STATUS: %s", error_string); + if (kbase_prepare_to_reset_gpu_locked(kbdev, RESET_FLAGS_NONE)) + kbase_reset_gpu_locked(kbdev); + } +} + void kbase_csf_interrupt(struct kbase_device *kbdev, u32 val) { bool deferred_handling_glb_idle_irq = false; @@ -3016,6 +3073,9 @@ void kbase_csf_interrupt(struct kbase_device *kbdev, u32 val) deferred_handling_glb_idle_irq = true; } + if (glb_ack & GLB_ACK_FATAL_MASK) + handle_glb_fatal_event(kbdev, global_iface); + process_prfcnt_interrupts(kbdev, glb_req, glb_ack); kbase_csf_scheduler_spin_unlock(kbdev, flags); @@ -3077,6 +3137,11 @@ void kbase_csf_doorbell_mapping_term(struct kbase_device *kbdev) if (kbdev->csf.db_filp) { struct page *page = as_page(kbdev->csf.dummy_db_page); + /* This is a shared dummy sink page for avoiding potential segmentation fault + * to user-side library when a csi is off slot. Additionally, the call is on + * module unload path, so the page can be left uncleared before returning it + * back to kbdev memory pool. 
+ */ kbase_mem_pool_free(&kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW], page, false); fput(kbdev->csf.db_filp); @@ -3108,26 +3173,27 @@ int kbase_csf_doorbell_mapping_init(struct kbase_device *kbdev) return 0; } -void kbase_csf_pending_gpuq_kicks_init(struct kbase_device *kbdev) +void kbase_csf_pending_gpuq_kick_queues_init(struct kbase_device *kbdev) { size_t i; - for (i = 0; i != ARRAY_SIZE(kbdev->csf.pending_gpuq_kicks); ++i) - INIT_LIST_HEAD(&kbdev->csf.pending_gpuq_kicks[i]); - spin_lock_init(&kbdev->csf.pending_gpuq_kicks_lock); + atomic_set(&kbdev->csf.pending_gpuq_kicks, false); + for (i = 0; i != ARRAY_SIZE(kbdev->csf.pending_gpuq_kick_queues); ++i) + INIT_LIST_HEAD(&kbdev->csf.pending_gpuq_kick_queues[i]); + spin_lock_init(&kbdev->csf.pending_gpuq_kick_queues_lock); } -void kbase_csf_pending_gpuq_kicks_term(struct kbase_device *kbdev) +void kbase_csf_pending_gpuq_kick_queues_term(struct kbase_device *kbdev) { size_t i; - spin_lock(&kbdev->csf.pending_gpuq_kicks_lock); - for (i = 0; i != ARRAY_SIZE(kbdev->csf.pending_gpuq_kicks); ++i) { - if (!list_empty(&kbdev->csf.pending_gpuq_kicks[i])) + spin_lock(&kbdev->csf.pending_gpuq_kick_queues_lock); + for (i = 0; i != ARRAY_SIZE(kbdev->csf.pending_gpuq_kick_queues); ++i) { + if (!list_empty(&kbdev->csf.pending_gpuq_kick_queues[i])) dev_warn(kbdev->dev, "Some GPU queue kicks for priority %zu were not handled", i); } - spin_unlock(&kbdev->csf.pending_gpuq_kicks_lock); + spin_unlock(&kbdev->csf.pending_gpuq_kick_queues_lock); } void kbase_csf_free_dummy_user_reg_page(struct kbase_device *kbdev) @@ -3135,6 +3201,11 @@ void kbase_csf_free_dummy_user_reg_page(struct kbase_device *kbdev) if (kbdev->csf.user_reg.filp) { struct page *page = as_page(kbdev->csf.user_reg.dummy_page); + /* This is a shared dummy page in place of the real USER Register page just + * before the GPU is powered down. Additionally, the call is on module unload + * path, so the page can be left uncleared before returning it back to kbdev + * memory pool. + */ kbase_mem_pool_free(&kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW], page, false); fput(kbdev->csf.user_reg.filp); } @@ -3217,17 +3288,17 @@ void kbase_csf_process_queue_kick(struct kbase_queue *queue) if (err == -EBUSY) { retry_kick = true; - spin_lock(&kbdev->csf.pending_gpuq_kicks_lock); + spin_lock(&kbdev->csf.pending_gpuq_kick_queues_lock); if (list_empty(&queue->pending_kick_link)) { /* A failed queue kick shall be pushed to the * back of the queue to avoid potential abuse. 
*/ list_add_tail( &queue->pending_kick_link, - &kbdev->csf.pending_gpuq_kicks[queue->group_priority]); - spin_unlock(&kbdev->csf.pending_gpuq_kicks_lock); + &kbdev->csf.pending_gpuq_kick_queues[queue->group_priority]); + spin_unlock(&kbdev->csf.pending_gpuq_kick_queues_lock); } else { - spin_unlock(&kbdev->csf.pending_gpuq_kicks_lock); + spin_unlock(&kbdev->csf.pending_gpuq_kick_queues_lock); WARN_ON(atomic_read(&queue->pending_kick) == 0); } @@ -3250,3 +3321,27 @@ out_release_queue: WARN_ON(atomic_read(&queue->pending_kick) == 0); atomic_dec(&queue->pending_kick); } + +void kbase_csf_process_protm_event_request(struct kbase_queue_group *group) +{ + struct kbase_protected_suspend_buffer *sbuf = &group->protected_suspend_buf; + int err = 0; + + KBASE_KTRACE_ADD_CSF_GRP(group->kctx->kbdev, PROTM_EVENT_WORKER_START, group, 0u); + + err = alloc_grp_protected_suspend_buffer_pages(group); + if (!err) { + kbase_csf_scheduler_group_protm_enter(group); + } else if (err == -ENOMEM && sbuf->alloc_retries <= PROTM_ALLOC_MAX_RETRIES) { + sbuf->alloc_retries++; + /* try again to allocate pages */ + kbase_csf_scheduler_enqueue_protm_event_work(group); + } else if (sbuf->alloc_retries >= PROTM_ALLOC_MAX_RETRIES || err != -ENOMEM) { + dev_err(group->kctx->kbdev->dev, + "Failed to allocate physical pages for Protected mode suspend buffer for the group %d of context %d_%d", + group->handle, group->kctx->tgid, group->kctx->id); + report_group_fatal_error(group); + } + + KBASE_KTRACE_ADD_CSF_GRP(group->kctx->kbdev, PROTM_EVENT_WORKER_END, group, 0u); +} diff --git a/mali_kbase/csf/mali_kbase_csf.h b/mali_kbase/csf/mali_kbase_csf.h index b2f6ab2..5661363 100644 --- a/mali_kbase/csf/mali_kbase_csf.h +++ b/mali_kbase/csf/mali_kbase_csf.h @@ -244,6 +244,19 @@ struct kbase_queue_group *kbase_csf_find_queue_group(struct kbase_context *kctx, int kbase_csf_queue_group_handle_is_valid(struct kbase_context *kctx, u8 group_handle); /** + * kbase_csf_queue_group_clear_faults - Re-enable CS Fault reporting. + * + * @kctx: Pointer to the kbase context within which the + * CS Faults for the queues has to be re-enabled. + * @clear_faults: Pointer to the structure which contains details of the + * queues for which the CS Fault reporting has to be re-enabled. + * + * Return: 0 on success, or negative on failure. + */ +int kbase_csf_queue_group_clear_faults(struct kbase_context *kctx, + struct kbase_ioctl_queue_group_clear_faults *clear_faults); + +/** * kbase_csf_queue_group_create - Create a GPU command queue group. * * @kctx: Pointer to the kbase context within which the @@ -379,20 +392,20 @@ int kbase_csf_setup_dummy_user_reg_page(struct kbase_device *kbdev); void kbase_csf_free_dummy_user_reg_page(struct kbase_device *kbdev); /** - * kbase_csf_pending_gpuq_kicks_init - Initialize the data used for handling - * GPU queue kicks. + * kbase_csf_pending_gpuq_kick_queues_init - Initialize the data used for handling + * GPU queue kicks. * * @kbdev: Instance of a GPU platform device that implements a CSF interface. */ -void kbase_csf_pending_gpuq_kicks_init(struct kbase_device *kbdev); +void kbase_csf_pending_gpuq_kick_queues_init(struct kbase_device *kbdev); /** - * kbase_csf_pending_gpuq_kicks_term - De-initialize the data used for handling - * GPU queue kicks. + * kbase_csf_pending_gpuq_kick_queues_term - De-initialize the data used for handling + * GPU queue kicks. * * @kbdev: Instance of a GPU platform device that implements a CSF interface. 
*/ -void kbase_csf_pending_gpuq_kicks_term(struct kbase_device *kbdev); +void kbase_csf_pending_gpuq_kick_queues_term(struct kbase_device *kbdev); /** * kbase_csf_ring_csg_doorbell - ring the doorbell for a CSG interface. @@ -546,4 +559,13 @@ static inline u64 kbase_csf_ktrace_gpu_cycle_cnt(struct kbase_device *kbdev) */ void kbase_csf_process_queue_kick(struct kbase_queue *queue); +/** + * kbase_csf_process_protm_event_request - Handle protected mode switch request + * + * @group: The group to handle protected mode request + * + * Request to switch to protected mode. + */ +void kbase_csf_process_protm_event_request(struct kbase_queue_group *group); + #endif /* _KBASE_CSF_H_ */ diff --git a/mali_kbase/csf/mali_kbase_csf_defs.h b/mali_kbase/csf/mali_kbase_csf_defs.h index fdebf55..d5587e3 100644 --- a/mali_kbase/csf/mali_kbase_csf_defs.h +++ b/mali_kbase/csf/mali_kbase_csf_defs.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2018-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2018-2024 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -268,6 +268,7 @@ enum kbase_queue_group_priority { * Shader, L2 and MCU state. * @CSF_GPU_RESET_TIMEOUT: Waiting timeout for GPU reset to complete. * @CSF_CSG_SUSPEND_TIMEOUT: Timeout given for a CSG to be suspended. + * @CSF_CSG_TERM_TIMEOUT: Timeout given for a CSG to be terminated. * @CSF_FIRMWARE_BOOT_TIMEOUT: Maximum time to wait for firmware to boot. * @CSF_FIRMWARE_PING_TIMEOUT: Maximum time to wait for firmware to respond * to a ping from KBase. @@ -290,6 +291,7 @@ enum kbase_timeout_selector { CSF_PM_TIMEOUT, CSF_GPU_RESET_TIMEOUT, CSF_CSG_SUSPEND_TIMEOUT, + CSF_CSG_TERM_TIMEOUT, CSF_FIRMWARE_BOOT_TIMEOUT, CSF_FIRMWARE_PING_TIMEOUT, CSF_SCHED_PROTM_PROGRESS_TIMEOUT, @@ -398,6 +400,7 @@ struct kbase_csf_notification { * @cs_error: Records information about the CS fatal event or * about CS fault event if dump on fault is enabled. * @cs_error_fatal: Flag to track if the CS fault or CS fatal event occurred. + * @clear_faults: Flag to track if the CS fault reporting is enabled for this queue * @extract_ofs: The current EXTRACT offset, this is only updated when handling * the GLB IDLE IRQ if the idle timeout value is non-0 in order * to help detect a queue's true idle status. @@ -441,6 +444,7 @@ struct kbase_queue { u64 cs_error_info; u32 cs_error; bool cs_error_fatal; + bool clear_faults; u64 extract_ofs; u64 saved_cmd_ptr; }; @@ -501,6 +505,8 @@ struct kbase_protected_suspend_buffer { * @compute_max: Maximum number of compute endpoints the group is * allowed to use. * @csi_handlers: Requested CSI exception handler flags for the group. + * @cs_fault_report_enable: Indicated if reporting of CS_FAULTs to + * userspace is enabled. * @tiler_mask: Mask of tiler endpoints the group is allowed to use. * @fragment_mask: Mask of fragment endpoints the group is allowed to use. * @compute_mask: Mask of compute endpoints the group is allowed to use. @@ -531,8 +537,13 @@ struct kbase_protected_suspend_buffer { * @bound_queues: Array of registered queues bound to this queue group. * @doorbell_nr: Index of the hardware doorbell page assigned to the * group. - * @protm_event_work: Work item corresponding to the protected mode entry - * event for this queue. + * @protm_event_work: List item corresponding to the protected mode entry + * event for this queue. 
This would be handled by + * kbase_csf_scheduler_kthread(). + * @pending_protm_event_work: Indicates that kbase_csf_scheduler_kthread() should + * handle PROTM request for this group. This would + * be set to false when the work is done. This is used + * mainly for synchronisation with group termination. * @protm_pending_bitmap: Bit array to keep a track of CSs that * have pending protected mode entry requests. * @error_fatal: An error of type BASE_GPU_QUEUE_GROUP_ERROR_FATAL to be @@ -569,7 +580,7 @@ struct kbase_queue_group { u8 compute_max; u8 csi_handlers; - + __u8 cs_fault_report_enable; u64 tiler_mask; u64 fragment_mask; u64 compute_mask; @@ -588,7 +599,8 @@ struct kbase_queue_group { struct kbase_queue *bound_queues[MAX_SUPPORTED_STREAMS_PER_GROUP]; int doorbell_nr; - struct work_struct protm_event_work; + struct list_head protm_event_work; + atomic_t pending_protm_event_work; DECLARE_BITMAP(protm_pending_bitmap, MAX_SUPPORTED_STREAMS_PER_GROUP); struct kbase_csf_notification error_fatal; @@ -625,6 +637,9 @@ struct kbase_queue_group { * @cmd_seq_num: The sequence number assigned to an enqueued command, * in incrementing order (older commands shall have a * smaller number). + * @kcpu_wq: Work queue to process KCPU commands for all queues in this + * context. This would be used if the context is not prioritised, + * otherwise it would be handled by kbase_csf_scheduler_kthread(). * @jit_lock: Lock to serialise JIT operations. * @jit_cmds_head: A list of the just-in-time memory commands, both * allocate & free, in submission order, protected @@ -640,6 +655,8 @@ struct kbase_csf_kcpu_queue_context { DECLARE_BITMAP(in_use, KBASEP_MAX_KCPU_QUEUES); atomic64_t cmd_seq_num; + struct workqueue_struct *kcpu_wq; + struct mutex jit_lock; struct list_head jit_cmds_head; struct list_head jit_blocked_queues; @@ -747,15 +764,7 @@ struct kbase_csf_ctx_heap_reclaim_info { * GPU command queues are idle and at least one of them * is blocked on a sync wait operation. * @num_idle_wait_grps: Length of the @idle_wait_groups list. - * @sync_update_wq_high_prio: high-priority work queue to process the - * SYNC_UPDATE events by sync_set / sync_add - * instruction execution on command streams bound to - * groups of @idle_wait_groups list. This WQ would - * be used if the context is prioritised. - * @sync_update_wq_normal_prio: similar to sync_update_wq_high_prio, but this - * WQ would be used if the context is not - * prioritised. - * @sync_update_work: Work item to process the SYNC_UPDATE events. + * @sync_update_work: List item to process the SYNC_UPDATE event. * @ngrp_to_schedule: Number of groups added for the context to the * 'groups_to_schedule' list of scheduler instance. * @heap_info: Heap reclaim information data of the kctx. As the @@ -768,9 +777,7 @@ struct kbase_csf_scheduler_context { u32 num_runnable_grps; struct list_head idle_wait_groups; u32 num_idle_wait_grps; - struct workqueue_struct *sync_update_wq_high_prio; - struct workqueue_struct *sync_update_wq_normal_prio; - struct work_struct sync_update_work; + struct list_head sync_update_work; u32 ngrp_to_schedule; struct kbase_csf_ctx_heap_reclaim_info heap_info; }; @@ -865,17 +872,16 @@ struct kbase_csf_user_reg_context { * @wq: Dedicated workqueue to process work items corresponding * to the OoM events raised for chunked tiler heaps being * used by GPU command queues, and progress timeout events. - * @kcpu_wq_high_prio: High-priority work queue to process KCPU commands for - * all queues in this context. 
This WQ would be used if - * the context is prioritised. - * @kcpu_wq_normal_prio: Similar to kcpu_wq_high_prio, but this WQ would be - * used if the context is not prioritised. * @link: Link to this csf context in the 'runnable_kctxs' list of * the scheduler instance * @sched: Object representing the scheduler's context * @cpu_queue: CPU queue information. Only be available when DEBUG_FS * is enabled. * @user_reg: Collective information to support mapping to USER Register page. + * @pending_sync_update: Indicates that kbase_csf_scheduler_kthread() should + * handle SYNC_UPDATE event for this context. This would + * be set to false when the work is done. This is used + * mainly for synchronisation with context termination. */ struct kbase_csf_context { struct list_head event_pages_head; @@ -888,12 +894,11 @@ struct kbase_csf_context { struct kbase_csf_event event; struct kbase_csf_tiler_heap_context tiler_heaps; struct workqueue_struct *wq; - struct workqueue_struct *kcpu_wq_high_prio; - struct workqueue_struct *kcpu_wq_normal_prio; struct list_head link; struct kbase_csf_scheduler_context sched; struct kbase_csf_cpu_queue_context cpu_queue; struct kbase_csf_user_reg_context user_reg; + atomic_t pending_sync_update; }; /** @@ -936,14 +941,15 @@ struct kbase_csf_csg_slot { * struct kbase_csf_sched_heap_reclaim_mgr - Object for managing tiler heap reclaim * kctx lists inside the CSF device's scheduler. * - * @heap_reclaim: Tiler heap reclaim shrinker object. + * @heap_reclaim: Defines Tiler heap reclaim shrinker object. * @ctx_lists: Array of kctx lists, size matching CSG defined priorities. The * lists track the kctxs attached to the reclaim manager. * @unused_pages: Estimated number of unused pages from the @ctxlist array. The * number is indicative for use with reclaim shrinker's count method. */ struct kbase_csf_sched_heap_reclaim_mgr { - struct shrinker heap_reclaim; + DEFINE_KBASE_SHRINKER heap_reclaim; + struct list_head ctx_lists[KBASE_QUEUE_GROUP_PRIORITY_COUNT]; atomic_t unused_pages; }; @@ -1042,10 +1048,29 @@ struct kbase_csf_mcu_shared_regions { * workqueue items (kernel-provided delayed_work * items do not use hrtimer and for some reason do * not provide sufficiently reliable periodicity). - * @pending_tick_work: Indicates that kbase_csf_scheduler_kthread() should perform - * a scheduling tick. - * @pending_tock_work: Indicates that kbase_csf_scheduler_kthread() should perform - * a scheduling tock. + * @pending_sync_update_works: Indicates that kbase_csf_scheduler_kthread() + * should handle SYNC_UPDATE events. + * @sync_update_work_ctxs_lock: Lock protecting the list of contexts that + * require handling SYNC_UPDATE events. + * @sync_update_work_ctxs: The list of contexts that require handling + * SYNC_UPDATE events. + * @pending_protm_event_works: Indicates that kbase_csf_scheduler_kthread() + * should handle PROTM requests. + * @protm_event_work_grps_lock: Lock protecting the list of groups that + * have requested protected mode. + * @protm_event_work_grps: The list of groups that have requested + * protected mode. + * @pending_kcpuq_works: Indicates that kbase_csf_scheduler_kthread() + * should process pending KCPU queue works. + * @kcpuq_work_queues_lock: Lock protecting the list of KCPU queues that + * need to be processed. + * @kcpuq_work_queues: The list of KCPU queue that need to be processed + * @pending_tick_work: Indicates that kbase_csf_scheduler_kthread() should + * perform a scheduling tick. 
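The pending_*_works counters, the per-kind list heads and their spinlocks documented above all follow the same producer pattern: put the object on the relevant list, mark it pending, bump the counter and wake kbase_csf_scheduler_kthread() through the completion it sleeps on. The real enqueue helpers (for example kbase_csf_scheduler_enqueue_sync_update_work()) are only referenced in this patch, not shown, so the fragment below is just a sketch of that pattern for the SYNC_UPDATE case, using the field names documented here and assuming an IRQ-safe lock.

/* Sketch of the producer side for SYNC_UPDATE events; the actual helper is
 * implemented elsewhere in this patch. */
static void enqueue_sync_update_work_sketch(struct kbase_context *kctx)
{
        struct kbase_csf_scheduler *scheduler = &kctx->kbdev->csf.scheduler;
        unsigned long flags;

        spin_lock_irqsave(&scheduler->sync_update_work_ctxs_lock, flags);
        /* Queue the context only once; pending_sync_update is cleared by the
         * kthread once the work has been handled. */
        if (list_empty(&kctx->csf.sched.sync_update_work)) {
                list_add_tail(&kctx->csf.sched.sync_update_work,
                              &scheduler->sync_update_work_ctxs);
                atomic_set(&kctx->csf.pending_sync_update, true);
                atomic_inc(&scheduler->pending_sync_update_works);
        }
        spin_unlock_irqrestore(&scheduler->sync_update_work_ctxs_lock, flags);

        /* Wake the scheduler kthread, which waits on kthread_signal. */
        complete(&scheduler->kthread_signal);
}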
+ * @pending_tock_work: Indicates that kbase_csf_scheduler_kthread() should + * perform a scheduling tock. + * @pending_gpu_idle_work: Indicates that kbase_csf_scheduler_kthread() should + * handle the GPU IDLE event. * @ping_work: Work item that would ping the firmware at regular * intervals, only if there is a single active CSG * slot, to check if firmware is alive and would @@ -1063,10 +1088,6 @@ struct kbase_csf_mcu_shared_regions { * This pointer being set doesn't necessarily indicates * that GPU is in protected mode, kbdev->protected_mode * needs to be checked for that. - * @idle_wq: Workqueue for executing GPU idle notification - * handler. - * @gpu_idle_work: Work item for facilitating the scheduler to bring - * the GPU to a low-power mode on becoming idle. * @fast_gpu_idle_handling: Indicates whether to relax many of the checks * normally done in the GPU idle worker. This is * set to true when handling the GLB IDLE IRQ if the @@ -1109,7 +1130,8 @@ struct kbase_csf_mcu_shared_regions { * thread when a queue needs attention. * @kthread_running: Whether the GPU queue submission thread should keep * executing. - * @gpuq_kthread: High-priority thread used to handle GPU queue + * @gpuq_kthread: Dedicated thread primarily used to handle + * latency-sensitive tasks such as GPU queue * submissions. */ struct kbase_csf_scheduler { @@ -1134,14 +1156,22 @@ struct kbase_csf_scheduler { unsigned long last_schedule; atomic_t timer_enabled; struct hrtimer tick_timer; + atomic_t pending_sync_update_works; + spinlock_t sync_update_work_ctxs_lock; + struct list_head sync_update_work_ctxs; + atomic_t pending_protm_event_works; + spinlock_t protm_event_work_grps_lock; + struct list_head protm_event_work_grps; + atomic_t pending_kcpuq_works; + spinlock_t kcpuq_work_queues_lock; + struct list_head kcpuq_work_queues; atomic_t pending_tick_work; atomic_t pending_tock_work; + atomic_t pending_gpu_idle_work; struct delayed_work ping_work; struct kbase_context *top_kctx; struct kbase_queue_group *top_grp; struct kbase_queue_group *active_protm_grp; - struct workqueue_struct *idle_wq; - struct work_struct gpu_idle_work; bool fast_gpu_idle_handling; atomic_t gpu_no_longer_idle; atomic_t non_idle_offslot_grps; @@ -1653,12 +1683,15 @@ struct kbase_csf_user_reg { * @dof: Structure for dump on fault. * @user_reg: Collective information to support the mapping to * USER Register page for user processes. - * @pending_gpuq_kicks: Lists of GPU queue that have been kicked but not - * yet processed, categorised by queue group's priority. - * @pending_gpuq_kicks_lock: Protect @pending_gpu_kicks and - * kbase_queue.pending_kick_link. + * @pending_gpuq_kicks: Indicates that kbase_csf_scheduler_kthread() + * should handle GPU queue kicks. + * @pending_gpuq_kick_queues: Lists of GPU queued that have been kicked but not + * yet processed, categorised by queue group's priority. + * @pending_gpuq_kick_queues_lock: Protect @pending_gpuq_kick_queues and + * kbase_queue.pending_kick_link. * @quirks_ext: Pointer to an allocated buffer containing the firmware * workarounds configuration. + * @pmode_sync_sem: RW Semaphore to prevent MMU operations during P.Mode entrance. 
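The new pmode_sync_sem replaces the earlier reliance on mmu_hw_mutex around protected-mode entry: the scheduler takes the semaphore for writing while it negotiates P.Mode entry with the firmware (see scheduler_group_check_protm_enter() later in this patch), and paths that update GPU mappings are expected to hold it for reading, which is what the kbase_gpu_vm_lock_with_pmode_sync() calls used by the KCPU import commands suggest. The sketch below shows the two sides; the shape of the read-side wrapper is an assumption, since its definition is outside this hunk.

/* Writer side, mirroring the down_write()/up_write() added to the scheduler. */
static void pmode_enter_writer_sketch(struct kbase_device *kbdev)
{
        down_write(&kbdev->csf.pmode_sync_sem);
        /* ... raise GLB_REQ_PROTM_ENTER and wait for the firmware ACK ... */
        up_write(&kbdev->csf.pmode_sync_sem);
}

/* Assumed reader side of kbase_gpu_vm_lock_with_pmode_sync(); the real helper
 * is not shown in this hunk, so this is only a guess at its shape. */
static void vm_lock_with_pmode_sync_sketch(struct kbase_context *kctx)
{
        down_read(&kctx->kbdev->csf.pmode_sync_sem);
        kbase_gpu_vm_lock(kctx);
}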
*/ struct kbase_csf_device { struct kbase_mmu_table mcu_mmu; @@ -1710,9 +1743,11 @@ struct kbase_csf_device { struct kbase_debug_coresight_device coresight; #endif /* IS_ENABLED(CONFIG_MALI_CORESIGHT) */ struct kbase_csf_user_reg user_reg; - struct list_head pending_gpuq_kicks[KBASE_QUEUE_GROUP_PRIORITY_COUNT]; - spinlock_t pending_gpuq_kicks_lock; + atomic_t pending_gpuq_kicks; + struct list_head pending_gpuq_kick_queues[KBASE_QUEUE_GROUP_PRIORITY_COUNT]; + spinlock_t pending_gpuq_kick_queues_lock; u32 *quirks_ext; + struct rw_semaphore pmode_sync_sem; }; /** diff --git a/mali_kbase/csf/mali_kbase_csf_firmware.c b/mali_kbase/csf/mali_kbase_csf_firmware.c index aec6e65..20f9348 100644 --- a/mali_kbase/csf/mali_kbase_csf_firmware.c +++ b/mali_kbase/csf/mali_kbase_csf_firmware.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2018-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2018-2024 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -1550,7 +1550,6 @@ static bool global_request_complete(struct kbase_device *const kbdev, u32 const unsigned long flags; kbase_csf_scheduler_spin_lock(kbdev, &flags); - if ((kbase_csf_firmware_global_output(global_iface, GLB_ACK) & req_mask) == (kbase_csf_firmware_global_input_read(global_iface, GLB_REQ) & req_mask)) complete = true; @@ -1888,6 +1887,7 @@ static void kbase_csf_firmware_reload_worker(struct work_struct *work) { struct kbase_device *kbdev = container_of(work, struct kbase_device, csf.firmware_reload_work); + unsigned long flags; int err; dev_info(kbdev->dev, "reloading firmware"); @@ -1906,7 +1906,9 @@ static void kbase_csf_firmware_reload_worker(struct work_struct *work) return; /* Reboot the firmware */ + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); kbase_csf_firmware_enable_mcu(kbdev); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); } void kbase_csf_firmware_trigger_reload(struct kbase_device *kbdev) @@ -2253,8 +2255,9 @@ int kbase_csf_firmware_early_init(struct kbase_device *kbdev) kbdev->csf.glb_init_request_pending = true; + init_rwsem(&kbdev->csf.pmode_sync_sem); mutex_init(&kbdev->csf.reg_lock); - kbase_csf_pending_gpuq_kicks_init(kbdev); + kbase_csf_pending_gpuq_kick_queues_init(kbdev); kbdev->csf.fw = (struct kbase_csf_mcu_fw){ .data = NULL }; @@ -2263,7 +2266,7 @@ int kbase_csf_firmware_early_init(struct kbase_device *kbdev) void kbase_csf_firmware_early_term(struct kbase_device *kbdev) { - kbase_csf_pending_gpuq_kicks_term(kbdev); + kbase_csf_pending_gpuq_kick_queues_term(kbdev); mutex_destroy(&kbdev->csf.reg_lock); } @@ -2772,6 +2775,7 @@ int kbase_csf_firmware_ping_wait(struct kbase_device *const kbdev, unsigned int return wait_for_global_request_with_timeout(kbdev, GLB_REQ_PING_MASK, wait_timeout_ms); } + int kbase_csf_firmware_set_timeout(struct kbase_device *const kbdev, u64 const timeout) { const struct kbase_csf_global_iface *const global_iface = &kbdev->csf.global_iface; @@ -2810,8 +2814,6 @@ int kbase_csf_wait_protected_mode_enter(struct kbase_device *kbdev) { int err; - lockdep_assert_held(&kbdev->mmu_hw_mutex); - err = wait_for_global_request(kbdev, GLB_REQ_PROTM_ENTER_MASK); if (!err) { @@ -2877,6 +2879,7 @@ void kbase_csf_firmware_enable_mcu(struct kbase_device *kbdev) { struct kbase_csf_global_iface *iface = &kbdev->csf.global_iface; + lockdep_assert_held(&kbdev->hwaccess_lock); /* Clear the HALT bit before 
triggering the boot of MCU firmware */ kbase_csf_firmware_global_input_mask(iface, GLB_REQ, 0, GLB_REQ_HALT_MASK); @@ -3156,6 +3159,9 @@ void kbase_csf_firmware_mcu_shared_mapping_term(struct kbase_device *kbdev, } if (csf_mapping->phys) { + /* This is on module unload path, so the pages can be left uncleared before + * returning them back to kbdev memory pool. + */ kbase_mem_pool_free_pages(&kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW], csf_mapping->num_pages, csf_mapping->phys, false, false); } diff --git a/mali_kbase/csf/mali_kbase_csf_firmware.h b/mali_kbase/csf/mali_kbase_csf_firmware.h index 9b6f153..4baf2b7 100644 --- a/mali_kbase/csf/mali_kbase_csf_firmware.h +++ b/mali_kbase/csf/mali_kbase_csf_firmware.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2018-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2018-2024 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -618,6 +618,7 @@ void kbase_csf_firmware_trigger_mcu_sleep(struct kbase_device *kbdev); bool kbase_csf_firmware_is_mcu_in_sleep(struct kbase_device *kbdev); #endif + /** * kbase_csf_firmware_trigger_reload() - Trigger the reboot of MCU firmware, for * the cold boot case firmware image would diff --git a/mali_kbase/csf/mali_kbase_csf_firmware_cfg.c b/mali_kbase/csf/mali_kbase_csf_firmware_cfg.c index d08686f..030a1eb 100644 --- a/mali_kbase/csf/mali_kbase_csf_firmware_cfg.c +++ b/mali_kbase/csf/mali_kbase_csf_firmware_cfg.c @@ -367,10 +367,10 @@ int kbase_csf_firmware_cfg_fw_wa_init(struct kbase_device *kbdev) */ entry_count = of_property_count_u32_elems(kbdev->dev->of_node, "quirks-ext"); - if (entry_count == -EINVAL) + if (entry_count < 0) entry_count = of_property_count_u32_elems(kbdev->dev->of_node, "quirks_ext"); - if (entry_count == -EINVAL || entry_count == -ENODATA) + if (entry_count < 0) return 0; entry_bytes = (size_t)entry_count * sizeof(u32); diff --git a/mali_kbase/csf/mali_kbase_csf_firmware_no_mali.c b/mali_kbase/csf/mali_kbase_csf_firmware_no_mali.c index d0599d6..a087388 100644 --- a/mali_kbase/csf/mali_kbase_csf_firmware_no_mali.c +++ b/mali_kbase/csf/mali_kbase_csf_firmware_no_mali.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2018-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2018-2024 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -857,11 +857,11 @@ static void kbase_csf_firmware_reload_worker(struct work_struct *work) container_of(work, struct kbase_device, csf.firmware_reload_work); unsigned long flags; + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); /* Reboot the firmware */ kbase_csf_firmware_enable_mcu(kbdev); /* Tell MCU state machine to transit to next state */ - spin_lock_irqsave(&kbdev->hwaccess_lock, flags); kbdev->csf.firmware_reloaded = true; kbase_pm_update_state(kbdev); spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); @@ -934,7 +934,7 @@ static u32 convert_dur_to_idle_count(struct kbase_device *kbdev, const u32 dur_n /* add the source flag */ reg_val_u32 = GLB_IDLE_TIMER_TIMER_SOURCE_SET( reg_val_u32, (src_system_timestamp ? 
GLB_IDLE_TIMER_TIMER_SOURCE_SYSTEM_TIMESTAMP : - GLB_IDLE_TIMER_TIMER_SOURCE_GPU_COUNTER)); + GLB_IDLE_TIMER_TIMER_SOURCE_GPU_COUNTER)); return reg_val_u32; } @@ -1118,15 +1118,16 @@ int kbase_csf_firmware_early_init(struct kbase_device *kbdev) INIT_WORK(&kbdev->csf.firmware_reload_work, kbase_csf_firmware_reload_worker); INIT_WORK(&kbdev->csf.fw_error_work, firmware_error_worker); + init_rwsem(&kbdev->csf.pmode_sync_sem); mutex_init(&kbdev->csf.reg_lock); - kbase_csf_pending_gpuq_kicks_init(kbdev); + kbase_csf_pending_gpuq_kick_queues_init(kbdev); return 0; } void kbase_csf_firmware_early_term(struct kbase_device *kbdev) { - kbase_csf_pending_gpuq_kicks_term(kbdev); + kbase_csf_pending_gpuq_kick_queues_term(kbdev); mutex_destroy(&kbdev->csf.reg_lock); } @@ -1308,6 +1309,7 @@ int kbase_csf_firmware_ping_wait(struct kbase_device *const kbdev, unsigned int return wait_for_global_request(kbdev, GLB_REQ_PING_MASK); } + int kbase_csf_firmware_set_timeout(struct kbase_device *const kbdev, u64 const timeout) { const struct kbase_csf_global_iface *const global_iface = &kbdev->csf.global_iface; @@ -1370,6 +1372,8 @@ void kbase_csf_firmware_trigger_mcu_halt(struct kbase_device *kbdev) void kbase_csf_firmware_enable_mcu(struct kbase_device *kbdev) { + lockdep_assert_held(&kbdev->hwaccess_lock); + /* Trigger the boot of MCU firmware, Use the AUTO mode as * otherwise on fast reset, to exit protected mode, MCU will * not reboot by itself to enter normal mode. diff --git a/mali_kbase/csf/mali_kbase_csf_kcpu.c b/mali_kbase/csf/mali_kbase_csf_kcpu.c index 6c0c8d1..aa34d88 100644 --- a/mali_kbase/csf/mali_kbase_csf_kcpu.c +++ b/mali_kbase/csf/mali_kbase_csf_kcpu.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2018-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2018-2024 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -39,13 +39,7 @@ static DEFINE_SPINLOCK(kbase_csf_fence_lock); #endif -#ifdef CONFIG_MALI_FENCE_DEBUG #define FENCE_WAIT_TIMEOUT_MS 3000 -#endif - -static void kcpu_queue_process(struct kbase_kcpu_command_queue *kcpu_queue, bool drain_queue); - -static void kcpu_queue_process_worker(struct work_struct *data); static int kbase_kcpu_map_import_prepare(struct kbase_kcpu_command_queue *kcpu_queue, struct base_kcpu_command_import_info *import_info, @@ -445,6 +439,16 @@ static void kbase_kcpu_jit_allocate_finish(struct kbase_kcpu_command_queue *queu kfree(cmd->info.jit_alloc.info); } +static void enqueue_kcpuq_work(struct kbase_kcpu_command_queue *queue) +{ + struct kbase_context *const kctx = queue->kctx; + + if (!atomic_read(&kctx->prioritized)) + queue_work(kctx->csf.kcpu_queues.kcpu_wq, &queue->work); + else + kbase_csf_scheduler_enqueue_kcpuq_work(queue); +} + /** * kbase_kcpu_jit_retry_pending_allocs() - Retry blocked JIT_ALLOC commands * @@ -464,9 +468,7 @@ static void kbase_kcpu_jit_retry_pending_allocs(struct kbase_context *kctx) * kbase_csf_kcpu_queue_context.jit_lock . */ list_for_each_entry(blocked_queue, &kctx->csf.kcpu_queues.jit_blocked_queues, jit_blocked) - queue_work(atomic_read(&kctx->prioritized) ? 
kctx->csf.kcpu_wq_high_prio : - kctx->csf.kcpu_wq_normal_prio, - &blocked_queue->work); + enqueue_kcpuq_work(blocked_queue); } static int kbase_kcpu_jit_free_process(struct kbase_kcpu_command_queue *queue, @@ -717,11 +719,8 @@ static int kbase_csf_queue_group_suspend_process(struct kbase_context *kctx, static enum kbase_csf_event_callback_action event_cqs_callback(void *param) { struct kbase_kcpu_command_queue *kcpu_queue = (struct kbase_kcpu_command_queue *)param; - struct kbase_context *kctx = kcpu_queue->kctx; - queue_work(atomic_read(&kctx->prioritized) ? kctx->csf.kcpu_wq_high_prio : - kctx->csf.kcpu_wq_normal_prio, - &kcpu_queue->work); + enqueue_kcpuq_work(kcpu_queue); return KBASE_CSF_EVENT_CALLBACK_KEEP; } @@ -1322,9 +1321,7 @@ static void kbase_csf_fence_wait_callback(struct dma_fence *fence, struct dma_fe fence->seqno); /* Resume kcpu command queue processing. */ - queue_work(atomic_read(&kctx->prioritized) ? kctx->csf.kcpu_wq_high_prio : - kctx->csf.kcpu_wq_normal_prio, - &kcpu_queue->work); + enqueue_kcpuq_work(kcpu_queue); } static void kbasep_kcpu_fence_wait_cancel(struct kbase_kcpu_command_queue *kcpu_queue, @@ -1360,7 +1357,6 @@ static void kbasep_kcpu_fence_wait_cancel(struct kbase_kcpu_command_queue *kcpu_ fence_info->fence = NULL; } -#ifdef CONFIG_MALI_FENCE_DEBUG /** * fence_timeout_callback() - Timeout callback function for fence-wait * @@ -1399,9 +1395,7 @@ static void fence_timeout_callback(struct timer_list *timer) kbase_sync_fence_info_get(fence, &info); if (info.status == 1) { - queue_work(atomic_read(&kctx->prioritized) ? kctx->csf.kcpu_wq_high_prio : - kctx->csf.kcpu_wq_normal_prio, - &kcpu_queue->work); + enqueue_kcpuq_work(kcpu_queue); } else if (info.status == 0) { dev_warn(kctx->kbdev->dev, "fence has not yet signalled in %ums", FENCE_WAIT_TIMEOUT_MS); @@ -1430,7 +1424,6 @@ static void fence_wait_timeout_start(struct kbase_kcpu_command_queue *cmd) { mod_timer(&cmd->fence_timeout, jiffies + msecs_to_jiffies(FENCE_WAIT_TIMEOUT_MS)); } -#endif /** * kbase_kcpu_fence_wait_process() - Process the kcpu fence wait command @@ -1469,9 +1462,8 @@ static int kbase_kcpu_fence_wait_process(struct kbase_kcpu_command_queue *kcpu_q fence_status = cb_err; if (cb_err == 0) { kcpu_queue->fence_wait_processed = true; -#ifdef CONFIG_MALI_FENCE_DEBUG - fence_wait_timeout_start(kcpu_queue); -#endif + if (IS_ENABLED(CONFIG_MALI_FENCE_DEBUG)) + fence_wait_timeout_start(kcpu_queue); } else if (cb_err == -ENOENT) { fence_status = dma_fence_get_status(fence); if (!fence_status) { @@ -1692,9 +1684,7 @@ static void fence_signal_timeout_cb(struct timer_list *timer) if (atomic_read(&kcpu_queue->fence_signal_pending_cnt) > 1) fence_signal_timeout_start(kcpu_queue); - queue_work(atomic_read(&kctx->prioritized) ? kctx->csf.kcpu_wq_high_prio : - kctx->csf.kcpu_wq_normal_prio, - &kcpu_queue->timeout_work); + queue_work(kctx->csf.kcpu_queues.kcpu_wq, &kcpu_queue->timeout_work); } } @@ -1973,7 +1963,7 @@ static void kcpu_queue_process_worker(struct work_struct *data) container_of(data, struct kbase_kcpu_command_queue, work); mutex_lock(&queue->lock); - kcpu_queue_process(queue, false); + kbase_csf_kcpu_queue_process(queue, false); mutex_unlock(&queue->lock); } @@ -2006,7 +1996,7 @@ static int delete_queue(struct kbase_context *kctx, u32 id) /* Drain the remaining work for this queue first and go past * all the waits. 
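Dropping the CONFIG_MALI_FENCE_DEBUG #ifdef guards around fence_timeout_callback() in favour of `if (IS_ENABLED(CONFIG_MALI_FENCE_DEBUG))` (used a little further down when arming the fence-wait timer) keeps the timer code compiled and type-checked in every configuration, while the compiler still discards the dead branch when the option is off. A minimal sketch of the generic pattern, with a placeholder option name:

#include <linux/kconfig.h>
#include <linux/timer.h>

/* CONFIG_EXAMPLE_OPTION is a placeholder; IS_ENABLED() expands to a
 * compile-time 0 or 1, so the branch is eliminated when the option is
 * disabled, but its body is still parsed and type-checked. */
static void arm_debug_timer_sketch(struct timer_list *timer, unsigned long expiry)
{
        if (IS_ENABLED(CONFIG_EXAMPLE_OPTION))
                mod_timer(timer, expiry);
}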
*/ - kcpu_queue_process(queue, true); + kbase_csf_kcpu_queue_process(queue, true); /* All commands should have been processed */ WARN_ON(queue->num_pending_cmds); @@ -2022,11 +2012,20 @@ static int delete_queue(struct kbase_context *kctx, u32 id) mutex_unlock(&queue->lock); cancel_work_sync(&queue->timeout_work); + + /* + * Drain a pending request to process this queue in + * kbase_csf_scheduler_kthread() if any. By this point the + * queue would be empty so this would be a no-op. + */ + kbase_csf_scheduler_wait_for_kthread_pending_work(kctx->kbdev, + &queue->pending_kick); + cancel_work_sync(&queue->work); mutex_destroy(&queue->lock); - kfree(queue); + vfree(queue); } else { dev_dbg(kctx->kbdev->dev, "Attempt to delete a non-existent KCPU queue"); mutex_unlock(&kctx->csf.kcpu_queues.lock); @@ -2079,7 +2078,7 @@ KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_JIT_FREE_END(struct kbase_device *kbde KBASE_TLSTREAM_TL_KBASE_ARRAY_END_KCPUQUEUE_EXECUTE_JIT_FREE_END(kbdev, queue); } -static void kcpu_queue_process(struct kbase_kcpu_command_queue *queue, bool drain_queue) +void kbase_csf_kcpu_queue_process(struct kbase_kcpu_command_queue *queue, bool drain_queue) { struct kbase_device *kbdev = queue->kctx->kbdev; bool process_next = true; @@ -2199,10 +2198,10 @@ static void kcpu_queue_process(struct kbase_kcpu_command_queue *queue, bool drai KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_MAP_IMPORT_START(kbdev, queue); - kbase_gpu_vm_lock(queue->kctx); + kbase_gpu_vm_lock_with_pmode_sync(queue->kctx); meta = kbase_sticky_resource_acquire(queue->kctx, cmd->info.import.gpu_va); - kbase_gpu_vm_unlock(queue->kctx); + kbase_gpu_vm_unlock_with_pmode_sync(queue->kctx); if (meta == NULL) { queue->has_error = true; @@ -2219,10 +2218,10 @@ static void kcpu_queue_process(struct kbase_kcpu_command_queue *queue, bool drai KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_UNMAP_IMPORT_START(kbdev, queue); - kbase_gpu_vm_lock(queue->kctx); + kbase_gpu_vm_lock_with_pmode_sync(queue->kctx); ret = kbase_sticky_resource_release(queue->kctx, NULL, cmd->info.import.gpu_va); - kbase_gpu_vm_unlock(queue->kctx); + kbase_gpu_vm_unlock_with_pmode_sync(queue->kctx); if (!ret) { queue->has_error = true; @@ -2240,10 +2239,10 @@ static void kcpu_queue_process(struct kbase_kcpu_command_queue *queue, bool drai KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_UNMAP_IMPORT_FORCE_START(kbdev, queue); - kbase_gpu_vm_lock(queue->kctx); + kbase_gpu_vm_lock_with_pmode_sync(queue->kctx); ret = kbase_sticky_resource_release_force(queue->kctx, NULL, cmd->info.import.gpu_va); - kbase_gpu_vm_unlock(queue->kctx); + kbase_gpu_vm_unlock_with_pmode_sync(queue->kctx); if (!ret) { queue->has_error = true; @@ -2642,7 +2641,7 @@ int kbase_csf_kcpu_queue_enqueue(struct kbase_context *kctx, } queue->num_pending_cmds += enq->nr_commands; - kcpu_queue_process(queue, false); + kbase_csf_kcpu_queue_process(queue, false); } out: @@ -2653,23 +2652,14 @@ out: int kbase_csf_kcpu_queue_context_init(struct kbase_context *kctx) { - kctx->csf.kcpu_wq_high_prio = alloc_workqueue("mali_kcpu_wq_%i_high_prio", - WQ_UNBOUND | WQ_HIGHPRI, 0, kctx->tgid); - if (kctx->csf.kcpu_wq_high_prio == NULL) { + kctx->csf.kcpu_queues.kcpu_wq = + alloc_workqueue("mali_kcpu_wq_%i_%i", 0, 0, kctx->tgid, kctx->id); + if (kctx->csf.kcpu_queues.kcpu_wq == NULL) { dev_err(kctx->kbdev->dev, "Failed to initialize KCPU queue high-priority workqueue"); return -ENOMEM; } - kctx->csf.kcpu_wq_normal_prio = - alloc_workqueue("mali_kcpu_wq_%i_normal_prio", 0, 0, kctx->tgid); - if (kctx->csf.kcpu_wq_normal_prio == NULL) 
{ - dev_err(kctx->kbdev->dev, - "Failed to initialize KCPU queue normal-priority workqueue"); - destroy_workqueue(kctx->csf.kcpu_wq_high_prio); - return -ENOMEM; - } - mutex_init(&kctx->csf.kcpu_queues.lock); return 0; @@ -2688,8 +2678,7 @@ void kbase_csf_kcpu_queue_context_term(struct kbase_context *kctx) mutex_destroy(&kctx->csf.kcpu_queues.lock); - destroy_workqueue(kctx->csf.kcpu_wq_normal_prio); - destroy_workqueue(kctx->csf.kcpu_wq_high_prio); + destroy_workqueue(kctx->csf.kcpu_queues.kcpu_wq); } KBASE_EXPORT_TEST_API(kbase_csf_kcpu_queue_context_term); @@ -2699,15 +2688,42 @@ int kbase_csf_kcpu_queue_delete(struct kbase_context *kctx, return delete_queue(kctx, (u32)del->id); } +static struct kbase_kcpu_dma_fence_meta * +kbase_csf_kcpu_queue_metadata_new(struct kbase_context *kctx, u64 fence_context) +{ + int n; + struct kbase_kcpu_dma_fence_meta *metadata = kzalloc(sizeof(*metadata), GFP_KERNEL); + + if (!metadata) + goto early_ret; + + *metadata = (struct kbase_kcpu_dma_fence_meta){ + .kbdev = kctx->kbdev, + .kctx_id = kctx->id, + }; + + /* Please update MAX_TIMELINE_NAME macro when making changes to the string. */ + n = snprintf(metadata->timeline_name, MAX_TIMELINE_NAME, "%u-%d_%u-%llu-kcpu", + kctx->kbdev->id, kctx->tgid, kctx->id, fence_context); + if (WARN_ON(n >= MAX_TIMELINE_NAME)) { + kfree(metadata); + metadata = NULL; + goto early_ret; + } + + kbase_refcount_set(&metadata->refcount, 1); + +early_ret: + return metadata; +} +KBASE_ALLOW_ERROR_INJECTION_TEST_API(kbase_csf_kcpu_queue_metadata_new, ERRNO_NULL); + int kbase_csf_kcpu_queue_new(struct kbase_context *kctx, struct kbase_ioctl_kcpu_queue_new *newq) { struct kbase_kcpu_command_queue *queue; + struct kbase_kcpu_dma_fence_meta *metadata; int idx; - int n; int ret = 0; -#if IS_ENABLED(CONFIG_SYNC_FILE) - struct kbase_kcpu_dma_fence_meta *metadata; -#endif /* The queue id is of u8 type and we use the index of the kcpu_queues * array as an id, so the number of elements in the array can't be * more than 256. 
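The factored-out kbase_csf_kcpu_queue_metadata_new() above relies on snprintf()'s return value to detect truncation of the timeline name: snprintf() returns the length the full string would have had, so a result greater than or equal to the buffer size means the name no longer fits in MAX_TIMELINE_NAME and the allocation is abandoned rather than exposing a clipped name. A standalone illustration of that check, with an arbitrary buffer size and sample values standing in for the device id, tgid, context id and fence context:

#include <stdio.h>

#define NAME_SIZE 16 /* stand-in for MAX_TIMELINE_NAME */

int main(void)
{
        char name[NAME_SIZE];
        /* snprintf() returns the untruncated length, so n >= sizeof(name)
         * signals that the output was clipped. */
        int n = snprintf(name, sizeof(name), "%u-%d_%u-%llu-kcpu",
                         0u, 1234, 7u, 42ULL);

        if (n >= (int)sizeof(name))
                printf("timeline name would be truncated (%d chars needed)\n", n);
        else
                printf("timeline name: %s\n", name);
        return 0;
}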
@@ -2727,54 +2743,48 @@ int kbase_csf_kcpu_queue_new(struct kbase_context *kctx, struct kbase_ioctl_kcpu goto out; } - queue = kzalloc(sizeof(*queue), GFP_KERNEL); - + queue = vzalloc(sizeof(*queue)); if (!queue) { ret = -ENOMEM; goto out; } - bitmap_set(kctx->csf.kcpu_queues.in_use, (unsigned int)idx, 1); - kctx->csf.kcpu_queues.array[idx] = queue; - mutex_init(&queue->lock); - queue->kctx = kctx; - queue->start_offset = 0; - queue->num_pending_cmds = 0; + *queue = (struct kbase_kcpu_command_queue) + { + .kctx = kctx, .start_offset = 0, .num_pending_cmds = 0, .enqueue_failed = false, + .command_started = false, .has_error = false, .id = idx, #if IS_ENABLED(CONFIG_SYNC_FILE) - queue->fence_context = dma_fence_context_alloc(1); - queue->fence_seqno = 0; - queue->fence_wait_processed = false; + .fence_context = dma_fence_context_alloc(1), .fence_seqno = 0, + .fence_wait_processed = false, +#endif /* IS_ENABLED(CONFIG_SYNC_FILE) */ + }; - metadata = kzalloc(sizeof(*metadata), GFP_KERNEL); - if (!metadata) { - kfree(queue); - ret = -ENOMEM; - goto out; - } + mutex_init(&queue->lock); + INIT_WORK(&queue->work, kcpu_queue_process_worker); + INIT_LIST_HEAD(&queue->high_prio_work); + atomic_set(&queue->pending_kick, 0); + INIT_WORK(&queue->timeout_work, kcpu_queue_timeout_worker); + INIT_LIST_HEAD(&queue->jit_blocked); - metadata->kbdev = kctx->kbdev; - metadata->kctx_id = kctx->id; - n = snprintf(metadata->timeline_name, MAX_TIMELINE_NAME, "%u-%d_%u-%llu-kcpu", - kctx->kbdev->id, kctx->tgid, kctx->id, queue->fence_context); - if (WARN_ON(n >= MAX_TIMELINE_NAME)) { - kfree(queue); - kfree(metadata); - ret = -EINVAL; - goto out; + if (IS_ENABLED(CONFIG_SYNC_FILE)) { + metadata = kbase_csf_kcpu_queue_metadata_new(kctx, queue->fence_context); + if (!metadata) { + vfree(queue); + ret = -ENOMEM; + goto out; + } + + queue->metadata = metadata; + atomic_inc(&kctx->kbdev->live_fence_metadata); + atomic_set(&queue->fence_signal_pending_cnt, 0); + kbase_timer_setup(&queue->fence_signal_timeout, fence_signal_timeout_cb); } - kbase_refcount_set(&metadata->refcount, 1); - queue->metadata = metadata; - atomic_inc(&kctx->kbdev->live_fence_metadata); -#endif /* CONFIG_SYNC_FILE */ - queue->enqueue_failed = false; - queue->command_started = false; - INIT_LIST_HEAD(&queue->jit_blocked); - queue->has_error = false; - INIT_WORK(&queue->work, kcpu_queue_process_worker); - INIT_WORK(&queue->timeout_work, kcpu_queue_timeout_worker); - queue->id = idx; + if (IS_ENABLED(CONFIG_MALI_FENCE_DEBUG)) + kbase_timer_setup(&queue->fence_timeout, fence_timeout_callback); + bitmap_set(kctx->csf.kcpu_queues.in_use, (unsigned int)idx, 1); + kctx->csf.kcpu_queues.array[idx] = queue; newq->id = idx; /* Fire the tracepoint with the mutex held to enforce correct ordering @@ -2784,14 +2794,6 @@ int kbase_csf_kcpu_queue_new(struct kbase_context *kctx, struct kbase_ioctl_kcpu queue->num_pending_cmds); KBASE_KTRACE_ADD_CSF_KCPU(kctx->kbdev, KCPU_QUEUE_CREATE, queue, queue->fence_context, 0); -#ifdef CONFIG_MALI_FENCE_DEBUG - kbase_timer_setup(&queue->fence_timeout, fence_timeout_callback); -#endif - -#if IS_ENABLED(CONFIG_SYNC_FILE) - atomic_set(&queue->fence_signal_pending_cnt, 0); - kbase_timer_setup(&queue->fence_signal_timeout, fence_signal_timeout_cb); -#endif out: mutex_unlock(&kctx->csf.kcpu_queues.lock); diff --git a/mali_kbase/csf/mali_kbase_csf_kcpu.h b/mali_kbase/csf/mali_kbase_csf_kcpu.h index a19847e..291509b 100644 --- a/mali_kbase/csf/mali_kbase_csf_kcpu.h +++ b/mali_kbase/csf/mali_kbase_csf_kcpu.h @@ -243,7 +243,19 @@ struct 
kbase_kcpu_command { * @work: struct work_struct which contains a pointer to * the function which handles processing of kcpu * commands enqueued into a kcpu command queue; - * part of kernel API for processing workqueues + * part of kernel API for processing workqueues. + * This would be used if the context is not + * prioritised, otherwise it would be handled by + * kbase_csf_scheduler_kthread(). + * @high_prio_work: A counterpart to @work, this queue would be + * added to a list to be processed by + * kbase_csf_scheduler_kthread() if it is + * prioritised. + * @pending_kick: Indicates that kbase_csf_scheduler_kthread() + * should re-evaluate pending commands for this + * queue. This would be set to false when the work + * is done. This is used mainly for + * synchronisation with queue termination. * @timeout_work: struct work_struct which contains a pointer to the * function which handles post-timeout actions * queue when a fence signal timeout occurs. @@ -287,6 +299,8 @@ struct kbase_kcpu_command_queue { struct kbase_context *kctx; struct kbase_kcpu_command commands[KBASEP_KCPU_QUEUE_SIZE]; struct work_struct work; + struct list_head high_prio_work; + atomic_t pending_kick; struct work_struct timeout_work; u8 start_offset; u8 id; @@ -299,9 +313,7 @@ struct kbase_kcpu_command_queue { bool command_started; struct list_head jit_blocked; bool has_error; -#ifdef CONFIG_MALI_FENCE_DEBUG struct timer_list fence_timeout; -#endif /* CONFIG_MALI_FENCE_DEBUG */ #if IS_ENABLED(CONFIG_SYNC_FILE) struct kbase_kcpu_dma_fence_meta *metadata; #endif /* CONFIG_SYNC_FILE */ @@ -335,6 +347,18 @@ int kbase_csf_kcpu_queue_delete(struct kbase_context *kctx, struct kbase_ioctl_kcpu_queue_delete *del); /** + * kbase_csf_kcpu_queue_process - Process pending KCPU queue commands + * + * @queue: The queue to process pending commands for + * @drain_queue: Whether to skip all blocking commands in the queue. + * This is expected to be set to true on queue + * termination. + * + * Return: None. + */ +void kbase_csf_kcpu_queue_process(struct kbase_kcpu_command_queue *queue, bool drain_queue); + +/** * kbase_csf_kcpu_queue_enqueue - Enqueue a KCPU command into a KCPU command * queue. * diff --git a/mali_kbase/csf/mali_kbase_csf_registers.h b/mali_kbase/csf/mali_kbase_csf_registers.h index d01f307..9a7c6e4 100644 --- a/mali_kbase/csf/mali_kbase_csf_registers.h +++ b/mali_kbase/csf/mali_kbase_csf_registers.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2018-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2018-2024 ARM Limited. All rights reserved.
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -250,7 +250,7 @@ #define GLB_ACK 0x0000 /* () Global acknowledge */ #define GLB_DB_ACK 0x0008 /* () Global doorbell acknowledge */ -#define GLB_HALT_STATUS 0x0010 /* () Global halt status */ +#define GLB_FATAL_STATUS 0x0010 /* () Global fatal error status */ #define GLB_PRFCNT_STATUS 0x0014 /* () Performance counter status */ #define GLB_PRFCNT_INSERT 0x0018 /* () Performance counter buffer insert index */ #define GLB_DEBUG_FWUTF_RESULT GLB_DEBUG_ARG_OUT0 /* () Firmware debug test result */ @@ -1422,6 +1422,12 @@ #define GLB_REQ_PRFCNT_OVERFLOW_SET(reg_val, value) \ (((reg_val) & ~GLB_REQ_PRFCNT_OVERFLOW_MASK) | \ (((value) << GLB_REQ_PRFCNT_OVERFLOW_SHIFT) & GLB_REQ_PRFCNT_OVERFLOW_MASK)) +#define GLB_ACK_FATAL_SHIFT GPU_U(27) +#define GLB_ACK_FATAL_MASK (GPU_U(0x1) << GLB_ACK_FATAL_SHIFT) +#define GLB_ACK_FATAL_GET(reg_val) (((reg_val)&GLB_ACK_FATAL_MASK) >> GLB_ACK_FATAL_SHIFT) +#define GLB_ACK_FATAL_SET(reg_val, value) \ + (~(~(reg_val) | GLB_ACK_FATAL_MASK) | \ + (((value) << GLB_ACK_FATAL_SHIFT) & GLB_ACK_FATAL_MASK)) #define GLB_REQ_DEBUG_CSF_REQ_SHIFT 30 #define GLB_REQ_DEBUG_CSF_REQ_MASK (0x1 << GLB_REQ_DEBUG_CSF_REQ_SHIFT) #define GLB_REQ_DEBUG_CSF_REQ_GET(reg_val) \ @@ -1822,6 +1828,20 @@ (((reg_val) & ~GLB_DEBUG_REQ_RUN_MODE_MASK) | \ (((value) << GLB_DEBUG_REQ_RUN_MODE_SHIFT) & GLB_DEBUG_REQ_RUN_MODE_MASK)) +/* GLB_FATAL_STATUS register */ +#define GLB_FATAL_STATUS_VALUE_SHIFT GPU_U(0) +#define GLB_FATAL_STATUS_VALUE_MASK (GPU_U(0xFFFFFFFF) << GLB_FATAL_STATUS_VALUE_SHIFT) +#define GLB_FATAL_STATUS_VALUE_GET(reg_val) \ + (((reg_val)&GLB_FATAL_STATUS_VALUE_MASK) >> GLB_FATAL_STATUS_VALUE_SHIFT) + +enum glb_fatal_status { + GLB_FATAL_STATUS_VALUE_OK, + GLB_FATAL_STATUS_VALUE_ASSERT, + GLB_FATAL_STATUS_VALUE_UNEXPECTED_EXCEPTION, + GLB_FATAL_STATUS_VALUE_HANG, + GLB_FATAL_STATUS_VALUE_COUNT +}; + /* GLB_DEBUG_ACK register */ #define GLB_DEBUG_ACK_DEBUG_RUN_SHIFT GPU_U(23) #define GLB_DEBUG_ACK_DEBUG_RUN_MASK (GPU_U(0x1) << GLB_DEBUG_ACK_DEBUG_RUN_SHIFT) diff --git a/mali_kbase/csf/mali_kbase_csf_reset_gpu.c b/mali_kbase/csf/mali_kbase_csf_reset_gpu.c index 240397e..b07cc96 100644 --- a/mali_kbase/csf/mali_kbase_csf_reset_gpu.c +++ b/mali_kbase/csf/mali_kbase_csf_reset_gpu.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2019-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2019-2024 ARM Limited. All rights reserved. 
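The GLB_ACK_FATAL_SET() macro introduced above uses the `~(~(reg_val) | MASK)` form rather than the more common `(reg_val) & ~MASK`; the two are equivalent by De Morgan's law, so the macro still means "clear the field, then OR in the shifted value", matching the other *_SET helpers in this header. A small self-contained check of that identity for a single-bit field at position 27, with illustrative local names:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define FATAL_SHIFT 27u
#define FATAL_MASK (UINT32_C(1) << FATAL_SHIFT)
/* Same shape as GLB_ACK_FATAL_SET() in the patch. */
#define FATAL_SET(reg, val) \
        (~(~(reg) | FATAL_MASK) | (((val) << FATAL_SHIFT) & FATAL_MASK))

int main(void)
{
        uint32_t reg = 0xdeadbeef;

        /* De Morgan: ~(~reg | mask) == reg & ~mask, i.e. "clear the field". */
        assert((uint32_t)~(~reg | FATAL_MASK) == (reg & ~FATAL_MASK));

        printf("set  : 0x%08x\n", (uint32_t)FATAL_SET(reg, 1u));
        printf("clear: 0x%08x\n", (uint32_t)FATAL_SET(reg, 0u));
        return 0;
}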
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -224,8 +224,11 @@ static void kbase_csf_reset_end_hw_access(struct kbase_device *kbdev, int err_du static void kbase_csf_debug_dump_registers(struct kbase_device *kbdev) { + unsigned long flags; + kbase_io_history_dump(kbdev); + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); dev_err(kbdev->dev, "Register state:"); dev_err(kbdev->dev, " GPU_IRQ_RAWSTAT=0x%08x GPU_STATUS=0x%08x MCU_STATUS=0x%08x", kbase_reg_read32(kbdev, GPU_CONTROL_ENUM(GPU_IRQ_RAWSTAT)), @@ -251,6 +254,7 @@ static void kbase_csf_debug_dump_registers(struct kbase_device *kbdev) kbase_reg_read32(kbdev, GPU_CONTROL_ENUM(TILER_CONFIG))); } + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); } /** @@ -396,6 +400,7 @@ static int kbase_csf_reset_gpu_now(struct kbase_device *kbdev, bool firmware_ini */ if (likely(firmware_inited)) kbase_csf_scheduler_reset(kbdev); + cancel_work_sync(&kbdev->csf.firmware_reload_work); dev_dbg(kbdev->dev, "Disable GPU hardware counters.\n"); @@ -403,6 +408,7 @@ static int kbase_csf_reset_gpu_now(struct kbase_device *kbdev, bool firmware_ini kbase_hwcnt_context_disable(kbdev->hwcnt_gpu_ctx); ret = kbase_csf_reset_gpu_once(kbdev, firmware_inited, silent); + if (ret == SOFT_RESET_FAILED) { dev_err(kbdev->dev, "Soft-reset failed"); goto err; @@ -490,6 +496,13 @@ static void kbase_csf_reset_gpu_worker(struct work_struct *data) bool kbase_prepare_to_reset_gpu(struct kbase_device *kbdev, unsigned int flags) { +#ifdef CONFIG_MALI_ARBITER_SUPPORT + if (kbase_pm_is_gpu_lost(kbdev)) { + /* GPU access has been removed, reset will be done by Arbiter instead */ + return false; + } +#endif + if (flags & RESET_FLAGS_HWC_UNRECOVERABLE_ERROR) kbase_hwcnt_backend_csf_on_unrecoverable_error(&kbdev->hwcnt_gpu_iface); diff --git a/mali_kbase/csf/mali_kbase_csf_scheduler.c b/mali_kbase/csf/mali_kbase_csf_scheduler.c index 7fba656..5e215ca 100644 --- a/mali_kbase/csf/mali_kbase_csf_scheduler.c +++ b/mali_kbase/csf/mali_kbase_csf_scheduler.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2018-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2018-2024 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -36,6 +36,7 @@ #include "mali_kbase_csf_tiler_heap_reclaim.h" #include "mali_kbase_csf_mcu_shared_reg.h" #include <linux/version_compat_defs.h> +#include <hwcnt/mali_kbase_hwcnt_context.h> #if IS_ENABLED(CONFIG_MALI_TRACE_POWER_GPU_WORK_PERIOD) #include <mali_kbase_gpu_metrics.h> #include <csf/mali_kbase_csf_trace_buffer.h> @@ -84,7 +85,8 @@ scheduler_get_protm_enter_async_group(struct kbase_device *const kbdev, struct kbase_queue_group *const group); static struct kbase_queue_group *get_tock_top_group(struct kbase_csf_scheduler *const scheduler); static void scheduler_enable_tick_timer_nolock(struct kbase_device *kbdev); -static int suspend_active_queue_groups(struct kbase_device *kbdev, unsigned long *slot_mask); +static int suspend_active_queue_groups(struct kbase_device *kbdev, unsigned long *slot_mask, + bool reset); static int suspend_active_groups_on_powerdown(struct kbase_device *kbdev, bool system_suspend); static void schedule_in_cycle(struct kbase_queue_group *group, bool force); static bool queue_group_scheduled_locked(struct kbase_queue_group *group); @@ -788,7 +790,8 @@ static void update_on_slot_queues_offsets(struct kbase_device *kbdev) static void enqueue_gpu_idle_work(struct kbase_csf_scheduler *const scheduler) { atomic_set(&scheduler->gpu_no_longer_idle, false); - queue_work(scheduler->idle_wq, &scheduler->gpu_idle_work); + atomic_inc(&scheduler->pending_gpu_idle_work); + complete(&scheduler->kthread_signal); } bool kbase_csf_scheduler_process_gpu_idle_event(struct kbase_device *kbdev) @@ -800,7 +803,8 @@ bool kbase_csf_scheduler_process_gpu_idle_event(struct kbase_device *kbdev) lockdep_assert_held(&kbdev->hwaccess_lock); lockdep_assert_held(&scheduler->interrupt_lock); - can_suspend_on_idle = kbase_pm_idle_groups_sched_suspendable(kbdev); + can_suspend_on_idle = kbase_pm_idle_groups_sched_suspendable(kbdev) && + !kbase_pm_is_mcu_inactive(kbdev, kbdev->pm.backend.mcu_state); KBASE_KTRACE_ADD(kbdev, SCHEDULER_GPU_IDLE_EVENT_CAN_SUSPEND, NULL, (((u64)can_suspend_on_idle) << 32)); @@ -2410,6 +2414,11 @@ static void cancel_tock_work(struct kbase_csf_scheduler *const scheduler) atomic_set(&scheduler->pending_tock_work, false); } +static void cancel_gpu_idle_work(struct kbase_csf_scheduler *const scheduler) +{ + atomic_set(&scheduler->pending_gpu_idle_work, false); +} + static void remove_group_from_runnable(struct kbase_csf_scheduler *const scheduler, struct kbase_queue_group *group, enum kbase_csf_group_state run_state) @@ -3131,8 +3140,9 @@ static void sched_evict_group(struct kbase_queue_group *group, bool fault, static int term_group_sync(struct kbase_queue_group *group) { struct kbase_device *kbdev = group->kctx->kbdev; - const unsigned int fw_timeout_ms = kbase_get_timeout_ms(kbdev, CSF_FIRMWARE_TIMEOUT); - long remaining = kbase_csf_timeout_in_jiffies(fw_timeout_ms); + const unsigned int group_term_timeout_ms = + kbase_get_timeout_ms(kbdev, CSF_CSG_TERM_TIMEOUT); + long remaining = kbase_csf_timeout_in_jiffies(group_term_timeout_ms); int err = 0; term_csg_slot(group); @@ -3148,7 +3158,7 @@ static int term_group_sync(struct kbase_queue_group *group) dev_warn( kbdev->dev, "[%llu] term request timeout (%d ms) for group %d of context %d_%d on slot %d", - kbase_backend_get_cycle_cnt(kbdev), fw_timeout_ms, group->handle, + kbase_backend_get_cycle_cnt(kbdev), group_term_timeout_ms, group->handle, group->kctx->tgid, 
group->kctx->id, group->csg_nr); if (kbase_csf_firmware_ping_wait(kbdev, FW_PING_AFTER_ERROR_TIMEOUT_MS)) error_type = DF_PING_REQUEST_TIMEOUT; @@ -4138,7 +4148,7 @@ static void scheduler_group_check_protm_enter(struct kbase_device *const kbdev, * entry to protected mode happens with a memory region being locked and * the same region is then accessed by the GPU in protected mode. */ - mutex_lock(&kbdev->mmu_hw_mutex); + down_write(&kbdev->csf.pmode_sync_sem); spin_lock_irqsave(&scheduler->interrupt_lock, flags); /* Check if the previous transition to enter & exit the protected @@ -4204,7 +4214,7 @@ static void scheduler_group_check_protm_enter(struct kbase_device *const kbdev, spin_unlock_irqrestore(&scheduler->interrupt_lock, flags); err = kbase_csf_wait_protected_mode_enter(kbdev); - mutex_unlock(&kbdev->mmu_hw_mutex); + up_write(&kbdev->csf.pmode_sync_sem); if (err) schedule_actions_trigger_df( @@ -4219,7 +4229,7 @@ static void scheduler_group_check_protm_enter(struct kbase_device *const kbdev, } spin_unlock_irqrestore(&scheduler->interrupt_lock, flags); - mutex_unlock(&kbdev->mmu_hw_mutex); + up_write(&kbdev->csf.pmode_sync_sem); } /** @@ -4797,8 +4807,9 @@ static int suspend_active_groups_on_powerdown(struct kbase_device *kbdev, bool s { struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; DECLARE_BITMAP(slot_mask, MAX_SUPPORTED_CSGS) = { 0 }; + int ret; - int ret = suspend_active_queue_groups(kbdev, slot_mask); + ret = suspend_active_queue_groups(kbdev, slot_mask, false); if (unlikely(ret)) { const int csg_nr = ffs(slot_mask[0]) - 1; @@ -4988,14 +4999,14 @@ static bool scheduler_suspend_on_idle(struct kbase_device *kbdev) return true; } -static void gpu_idle_worker(struct work_struct *work) +static void gpu_idle_worker(struct kbase_device *kbdev) { - struct kbase_device *kbdev = - container_of(work, struct kbase_device, csf.scheduler.gpu_idle_work); struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; bool scheduler_is_idle_suspendable = false; bool all_groups_suspended = false; + WARN_ON_ONCE(atomic_read(&scheduler->pending_gpu_idle_work) == 0); + KBASE_KTRACE_ADD(kbdev, SCHEDULER_GPU_IDLE_WORKER_START, NULL, 0u); #define __ENCODE_KTRACE_INFO(reset, idle, all_suspend) \ @@ -5005,7 +5016,7 @@ static void gpu_idle_worker(struct work_struct *work) dev_warn(kbdev->dev, "Quit idle for failing to prevent gpu reset.\n"); KBASE_KTRACE_ADD(kbdev, SCHEDULER_GPU_IDLE_WORKER_END, NULL, __ENCODE_KTRACE_INFO(true, false, false)); - return; + goto exit; } kbase_debug_csf_fault_wait_completion(kbdev); mutex_lock(&scheduler->lock); @@ -5014,7 +5025,7 @@ static void gpu_idle_worker(struct work_struct *work) if (unlikely(scheduler->state == SCHED_BUSY)) { mutex_unlock(&scheduler->lock); kbase_reset_gpu_allow(kbdev); - return; + goto exit; } #endif @@ -5039,6 +5050,9 @@ static void gpu_idle_worker(struct work_struct *work) __ENCODE_KTRACE_INFO(false, scheduler_is_idle_suspendable, all_groups_suspended)); #undef __ENCODE_KTRACE_INFO + +exit: + atomic_dec(&scheduler->pending_gpu_idle_work); } static int scheduler_prepare(struct kbase_device *kbdev) @@ -5662,7 +5676,9 @@ exit_no_schedule_unlock: kbase_reset_gpu_allow(kbdev); } -static int suspend_active_queue_groups(struct kbase_device *kbdev, unsigned long *slot_mask) + +static int suspend_active_queue_groups(struct kbase_device *kbdev, unsigned long *slot_mask, + bool reset) { struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; u32 num_groups = kbdev->csf.global_iface.group_num; @@ -5675,12 +5691,12 @@ 
static int suspend_active_queue_groups(struct kbase_device *kbdev, unsigned long struct kbase_queue_group *group = scheduler->csg_slots[slot_num].resident_group; if (group) { - suspend_queue_group(group); + suspend_queue_group(group); set_bit(slot_num, slot_mask); } } - ret = wait_csg_slots_suspend(kbdev, slot_mask); + ret = wait_csg_slots_suspend(kbdev, slot_mask); return ret; } @@ -5693,7 +5709,7 @@ static int suspend_active_queue_groups_on_reset(struct kbase_device *kbdev) mutex_lock(&scheduler->lock); - ret = suspend_active_queue_groups(kbdev, slot_mask); + ret = suspend_active_queue_groups(kbdev, slot_mask, true); if (ret) { dev_warn( @@ -5830,9 +5846,9 @@ static void scheduler_inner_reset(struct kbase_device *kbdev) WARN_ON(kbase_csf_scheduler_get_nr_active_csgs(kbdev)); /* Cancel any potential queued delayed work(s) */ - cancel_work_sync(&kbdev->csf.scheduler.gpu_idle_work); cancel_tick_work(scheduler); cancel_tock_work(scheduler); + cancel_gpu_idle_work(scheduler); cancel_delayed_work_sync(&scheduler->ping_work); mutex_lock(&scheduler->lock); @@ -5860,12 +5876,13 @@ static void scheduler_inner_reset(struct kbase_device *kbdev) void kbase_csf_scheduler_reset(struct kbase_device *kbdev) { struct kbase_context *kctx; - WARN_ON(!kbase_reset_gpu_is_active(kbdev)); KBASE_KTRACE_ADD(kbdev, SCHEDULER_RESET_START, NULL, 0u); - kbase_debug_csf_fault_wait_completion(kbdev); + if (kbase_reset_gpu_is_active(kbdev)) + kbase_debug_csf_fault_wait_completion(kbdev); + if (scheduler_handle_reset_in_protected_mode(kbdev) && !suspend_active_queue_groups_on_reset(kbdev)) { @@ -6453,8 +6470,8 @@ static void check_sync_update_in_sleep_mode(struct kbase_device *kbdev) * check_group_sync_update_worker() - Check the sync wait condition for all the * blocked queue groups * - * @work: Pointer to the context-specific work item for evaluating the wait - * condition for all the queue groups in idle_wait_groups list. + * @kctx: The context to evaluate the wait condition for all the queue groups + * in idle_wait_groups list. * * This function checks the gpu queues of all the groups present in both * idle_wait_groups list of a context and all on slot idle groups (if GPU @@ -6464,27 +6481,14 @@ static void check_sync_update_in_sleep_mode(struct kbase_device *kbdev) * runnable groups so that Scheduler can consider scheduling the group * in next tick or exit protected mode. */ -static void check_group_sync_update_worker(struct work_struct *work) +static void check_group_sync_update_worker(struct kbase_context *kctx) { - struct kbase_context *const kctx = - container_of(work, struct kbase_context, csf.sched.sync_update_work); struct kbase_device *const kbdev = kctx->kbdev; struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; bool sync_updated = false; mutex_lock(&scheduler->lock); -#if IS_ENABLED(CONFIG_DEBUG_FS) - if (unlikely(scheduler->state == SCHED_BUSY)) { - queue_work(atomic_read(&kctx->prioritized) ? 
- kctx->csf.sched.sync_update_wq_high_prio : - kctx->csf.sched.sync_update_wq_normal_prio, - &kctx->csf.sched.sync_update_work); - mutex_unlock(&scheduler->lock); - return; - } -#endif - KBASE_KTRACE_ADD(kbdev, SCHEDULER_GROUP_SYNC_UPDATE_WORKER_START, kctx, 0u); if (kctx->csf.sched.num_idle_wait_grps != 0) { struct kbase_queue_group *group, *temp; @@ -6522,13 +6526,10 @@ static void check_group_sync_update_worker(struct work_struct *work) static enum kbase_csf_event_callback_action check_group_sync_update_cb(void *param) { struct kbase_context *const kctx = param; - struct workqueue_struct *wq = atomic_read(&kctx->prioritized) ? - kctx->csf.sched.sync_update_wq_high_prio : - kctx->csf.sched.sync_update_wq_normal_prio; KBASE_KTRACE_ADD(kctx->kbdev, SCHEDULER_GROUP_SYNC_UPDATE_EVENT, kctx, 0u); - queue_work(wq, &kctx->csf.sched.sync_update_work); + kbase_csf_scheduler_enqueue_sync_update_work(kctx); return KBASE_CSF_EVENT_CALLBACK_KEEP; } @@ -6539,6 +6540,8 @@ int kbase_csf_scheduler_context_init(struct kbase_context *kctx) int err; struct kbase_device *kbdev = kctx->kbdev; + WARN_ON_ONCE(!kbdev->csf.scheduler.kthread_running); + #if IS_ENABLED(CONFIG_MALI_TRACE_POWER_GPU_WORK_PERIOD) err = gpu_metrics_ctx_init(kctx); if (err) @@ -6551,25 +6554,7 @@ int kbase_csf_scheduler_context_init(struct kbase_context *kctx) INIT_LIST_HEAD(&kctx->csf.sched.idle_wait_groups); - kctx->csf.sched.sync_update_wq_high_prio = alloc_ordered_workqueue( - "mali_sync_wq_%i_high_prio", WQ_UNBOUND | WQ_HIGHPRI, kctx->tgid); - if (kctx->csf.sched.sync_update_wq_high_prio == NULL) { - dev_err(kbdev->dev, - "Failed to initialize scheduler context high-priority workqueue"); - err = -ENOMEM; - goto alloc_high_prio_wq_failed; - } - - kctx->csf.sched.sync_update_wq_normal_prio = - alloc_ordered_workqueue("mali_sync_wq_%i_normal_prio", 0, kctx->tgid); - if (kctx->csf.sched.sync_update_wq_normal_prio == NULL) { - dev_err(kbdev->dev, - "Failed to initialize scheduler context normal-priority workqueue"); - err = -ENOMEM; - goto alloc_normal_prio_wq_failed; - } - - INIT_WORK(&kctx->csf.sched.sync_update_work, check_group_sync_update_worker); + INIT_LIST_HEAD(&kctx->csf.sched.sync_update_work); kbase_csf_tiler_heap_reclaim_ctx_init(kctx); @@ -6583,10 +6568,6 @@ int kbase_csf_scheduler_context_init(struct kbase_context *kctx) return err; event_wait_add_failed: - destroy_workqueue(kctx->csf.sched.sync_update_wq_normal_prio); -alloc_normal_prio_wq_failed: - destroy_workqueue(kctx->csf.sched.sync_update_wq_high_prio); -alloc_high_prio_wq_failed: kbase_ctx_sched_remove_ctx(kctx); #if IS_ENABLED(CONFIG_MALI_TRACE_POWER_GPU_WORK_PERIOD) gpu_metrics_ctx_term(kctx); @@ -6597,9 +6578,10 @@ alloc_high_prio_wq_failed: void kbase_csf_scheduler_context_term(struct kbase_context *kctx) { kbase_csf_event_wait_remove(kctx, check_group_sync_update_cb, kctx); - cancel_work_sync(&kctx->csf.sched.sync_update_work); - destroy_workqueue(kctx->csf.sched.sync_update_wq_normal_prio); - destroy_workqueue(kctx->csf.sched.sync_update_wq_high_prio); + + /* Drain a pending SYNC_UPDATE work if any */ + kbase_csf_scheduler_wait_for_kthread_pending_work(kctx->kbdev, + &kctx->csf.pending_sync_update); kbase_ctx_sched_remove_ctx(kctx); #if IS_ENABLED(CONFIG_MALI_TRACE_POWER_GPU_WORK_PERIOD) @@ -6607,53 +6589,157 @@ void kbase_csf_scheduler_context_term(struct kbase_context *kctx) #endif /* CONFIG_MALI_TRACE_POWER_GPU_WORK_PERIOD */ } +static void handle_pending_sync_update_works(struct kbase_csf_scheduler *scheduler) +{ + struct kbase_context *sync_update_ctx; + 
+ if (atomic_cmpxchg(&scheduler->pending_sync_update_works, true, false) == false) + return; + + do { + unsigned long flags; + + spin_lock_irqsave(&scheduler->sync_update_work_ctxs_lock, flags); + sync_update_ctx = NULL; + if (!list_empty(&scheduler->sync_update_work_ctxs)) { + sync_update_ctx = list_first_entry(&scheduler->sync_update_work_ctxs, + struct kbase_context, + csf.sched.sync_update_work); + list_del_init(&sync_update_ctx->csf.sched.sync_update_work); + } + spin_unlock_irqrestore(&scheduler->sync_update_work_ctxs_lock, flags); + + if (sync_update_ctx != NULL) { + WARN_ON_ONCE(atomic_read(&sync_update_ctx->csf.pending_sync_update) == 0); + check_group_sync_update_worker(sync_update_ctx); + atomic_dec(&sync_update_ctx->csf.pending_sync_update); + } + } while (sync_update_ctx != NULL); +} + +static void handle_pending_protm_requests(struct kbase_csf_scheduler *scheduler) +{ + struct kbase_queue_group *protm_grp; + + if (atomic_cmpxchg(&scheduler->pending_protm_event_works, true, false) == false) + return; + + do { + unsigned long flags; + + spin_lock_irqsave(&scheduler->protm_event_work_grps_lock, flags); + protm_grp = NULL; + if (!list_empty(&scheduler->protm_event_work_grps)) { + protm_grp = list_first_entry(&scheduler->protm_event_work_grps, + struct kbase_queue_group, protm_event_work); + list_del_init(&protm_grp->protm_event_work); + } + spin_unlock_irqrestore(&scheduler->protm_event_work_grps_lock, flags); + + if (protm_grp != NULL) { + WARN_ON_ONCE(atomic_read(&protm_grp->pending_protm_event_work) == 0); + kbase_csf_process_protm_event_request(protm_grp); + atomic_dec(&protm_grp->pending_protm_event_work); + } + } while (protm_grp != NULL); +} + +static void handle_pending_kcpuq_commands(struct kbase_csf_scheduler *scheduler) +{ + struct kbase_kcpu_command_queue *kcpuq; + + if (atomic_cmpxchg(&scheduler->pending_kcpuq_works, true, false) == false) + return; + + do { + unsigned long flags; + + spin_lock_irqsave(&scheduler->kcpuq_work_queues_lock, flags); + kcpuq = NULL; + if (!list_empty(&scheduler->kcpuq_work_queues)) { + kcpuq = list_first_entry(&scheduler->kcpuq_work_queues, + struct kbase_kcpu_command_queue, high_prio_work); + list_del_init(&kcpuq->high_prio_work); + } + spin_unlock_irqrestore(&scheduler->kcpuq_work_queues_lock, flags); + + if (kcpuq != NULL) { + WARN_ON_ONCE(atomic_read(&kcpuq->pending_kick) == 0); + + mutex_lock(&kcpuq->lock); + kbase_csf_kcpu_queue_process(kcpuq, false); + mutex_unlock(&kcpuq->lock); + + atomic_dec(&kcpuq->pending_kick); + } + } while (kcpuq != NULL); +} + +static void handle_pending_queue_kicks(struct kbase_device *kbdev) +{ + struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; + struct kbase_queue *queue; + + if (atomic_cmpxchg(&kbdev->csf.pending_gpuq_kicks, true, false) == false) + return; + + do { + u8 prio; + + spin_lock(&kbdev->csf.pending_gpuq_kick_queues_lock); + queue = NULL; + for (prio = 0; prio != KBASE_QUEUE_GROUP_PRIORITY_COUNT; ++prio) { + if (!list_empty(&kbdev->csf.pending_gpuq_kick_queues[prio])) { + queue = list_first_entry(&kbdev->csf.pending_gpuq_kick_queues[prio], + struct kbase_queue, pending_kick_link); + list_del_init(&queue->pending_kick_link); + break; + } + } + spin_unlock(&kbdev->csf.pending_gpuq_kick_queues_lock); + + if (queue != NULL) { + WARN_ONCE( + prio != queue->group_priority, + "Queue %pK has priority %u but instead its kick was handled at priority %u", + (void *)queue, queue->group_priority, prio); + WARN_ON_ONCE(atomic_read(&queue->pending_kick) == 0); + + 
kbase_csf_process_queue_kick(queue); + + /* Perform a scheduling tock for high-priority queue groups if + * required. + */ + BUILD_BUG_ON(KBASE_QUEUE_GROUP_PRIORITY_REALTIME != 0); + BUILD_BUG_ON(KBASE_QUEUE_GROUP_PRIORITY_HIGH != 1); + if ((prio <= KBASE_QUEUE_GROUP_PRIORITY_HIGH) && + atomic_read(&scheduler->pending_tock_work)) + schedule_on_tock(kbdev); + } + } while (queue != NULL); +} + static int kbase_csf_scheduler_kthread(void *data) { struct kbase_device *const kbdev = data; struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; while (scheduler->kthread_running) { - struct kbase_queue *queue; - if (wait_for_completion_interruptible(&scheduler->kthread_signal) != 0) continue; reinit_completion(&scheduler->kthread_signal); - /* Iterate through queues with pending kicks */ - do { - u8 prio; - - spin_lock(&kbdev->csf.pending_gpuq_kicks_lock); - queue = NULL; - for (prio = 0; prio != KBASE_QUEUE_GROUP_PRIORITY_COUNT; ++prio) { - if (!list_empty(&kbdev->csf.pending_gpuq_kicks[prio])) { - queue = list_first_entry( - &kbdev->csf.pending_gpuq_kicks[prio], - struct kbase_queue, pending_kick_link); - list_del_init(&queue->pending_kick_link); - break; - } - } - spin_unlock(&kbdev->csf.pending_gpuq_kicks_lock); - - if (queue != NULL) { - WARN_ONCE( - prio != queue->group_priority, - "Queue %pK has priority %hhu but instead its kick was handled at priority %hhu", - (void *)queue, queue->group_priority, prio); - - kbase_csf_process_queue_kick(queue); + /* + * The order in which these requests are handled is based on + * how they would influence each other's decisions. As a + * result, the tick & tock requests must be handled after all + * other requests, but before the GPU IDLE work. + */ - /* Perform a scheduling tock for high-priority queue groups if - * required. - */ - BUILD_BUG_ON(KBASE_QUEUE_GROUP_PRIORITY_REALTIME != 0); - BUILD_BUG_ON(KBASE_QUEUE_GROUP_PRIORITY_HIGH != 1); - if ((prio <= KBASE_QUEUE_GROUP_PRIORITY_HIGH) && - atomic_read(&scheduler->pending_tock_work)) - schedule_on_tock(kbdev); - } - } while (queue != NULL); + handle_pending_sync_update_works(scheduler); + handle_pending_protm_requests(scheduler); + handle_pending_kcpuq_commands(scheduler); + handle_pending_queue_kicks(kbdev); /* Check if we need to perform a scheduling tick/tock. A tick * event shall override a tock event but not vice-versa. 
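The enqueue helpers added elsewhere in this patch (kbase_csf_scheduler_enqueue_sync_update_work() and friends) and the handle_pending_*() drains above all follow one handoff pattern: a spinlock-protected list, a per-item pending counter, a per-scheduler boolean flag that collapses repeated enqueues into a single wakeup, and one completion that the kthread sleeps on. A minimal sketch of that pattern, using hypothetical names rather than the kbase symbols:

#include <linux/atomic.h>
#include <linux/completion.h>
#include <linux/list.h>
#include <linux/spinlock.h>

struct work_item {
	struct list_head link; /* empty while not queued */
	atomic_t pending;      /* observed by waiters */
};

struct drain_ctx {
	spinlock_t lock;
	struct list_head items;
	atomic_t any_pending;      /* one wakeup per burst of enqueues */
	struct completion signal;  /* what the kthread waits on */
};

/* Producer: queue an item (if not already queued) and wake the kthread once. */
static void enqueue_item(struct drain_ctx *d, struct work_item *it)
{
	unsigned long flags;

	spin_lock_irqsave(&d->lock, flags);
	if (list_empty(&it->link)) {
		list_add_tail(&it->link, &d->items);
		atomic_inc(&it->pending);
		if (atomic_cmpxchg(&d->any_pending, false, true) == false)
			complete(&d->signal);
	}
	spin_unlock_irqrestore(&d->lock, flags);
}

/* Consumer (kthread): pop one item at a time, process it with the lock
 * dropped, then release its pending count so waiters can make progress.
 */
static void drain_items(struct drain_ctx *d, void (*process)(struct work_item *))
{
	struct work_item *it;

	if (atomic_cmpxchg(&d->any_pending, true, false) == false)
		return;

	do {
		unsigned long flags;

		spin_lock_irqsave(&d->lock, flags);
		it = list_first_entry_or_null(&d->items, struct work_item, link);
		if (it)
			list_del_init(&it->link);
		spin_unlock_irqrestore(&d->lock, flags);

		if (it) {
			process(it);
			atomic_dec(&it->pending);
		}
	} while (it);
}

A waiter such as context termination can then nudge the kthread and sleep until the item's pending count reaches zero, which is what kbase_csf_scheduler_wait_for_kthread_pending_work() does via kbdev->csf.event_wait.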
@@ -6665,6 +6751,10 @@ static int kbase_csf_scheduler_kthread(void *data) schedule_on_tock(kbdev); } + /* Drain pending GPU idle works */ + while (atomic_read(&scheduler->pending_gpu_idle_work) > 0) + gpu_idle_worker(kbdev); + dev_dbg(kbdev->dev, "Waking up for event after a scheduling iteration."); wake_up_all(&kbdev->csf.event_wait); } @@ -6694,7 +6784,7 @@ int kbase_csf_scheduler_init(struct kbase_device *kbdev) scheduler->kthread_running = true; scheduler->gpuq_kthread = kthread_run(&kbase_csf_scheduler_kthread, kbdev, "mali-gpuq-kthread"); - if (!scheduler->gpuq_kthread) { + if (IS_ERR_OR_NULL(scheduler->gpuq_kthread)) { kfree(scheduler->csg_slots); scheduler->csg_slots = NULL; @@ -6734,12 +6824,6 @@ int kbase_csf_scheduler_early_init(struct kbase_device *kbdev) atomic_set(&scheduler->timer_enabled, true); - scheduler->idle_wq = alloc_ordered_workqueue("csf_scheduler_gpu_idle_wq", WQ_HIGHPRI); - if (!scheduler->idle_wq) { - dev_err(kbdev->dev, "Failed to allocate GPU idle scheduler workqueue\n"); - return -ENOMEM; - } - INIT_DEFERRABLE_WORK(&scheduler->ping_work, firmware_aliveness_monitor); mutex_init(&scheduler->lock); @@ -6757,20 +6841,30 @@ int kbase_csf_scheduler_early_init(struct kbase_device *kbdev) KBASE_KTRACE_ADD(kbdev, SCHED_SUSPENDED, NULL, scheduler->state); scheduler->csg_scheduling_period_ms = CSF_SCHEDULER_TIME_TICK_MS; scheduler_doorbell_init(kbdev); - INIT_WORK(&scheduler->gpu_idle_work, gpu_idle_worker); hrtimer_init(&scheduler->tick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); scheduler->tick_timer.function = tick_timer_callback; - kbase_csf_tiler_heap_reclaim_mgr_init(kbdev); + atomic_set(&scheduler->pending_sync_update_works, false); + spin_lock_init(&scheduler->sync_update_work_ctxs_lock); + INIT_LIST_HEAD(&scheduler->sync_update_work_ctxs); + atomic_set(&scheduler->pending_protm_event_works, false); + spin_lock_init(&scheduler->protm_event_work_grps_lock); + INIT_LIST_HEAD(&scheduler->protm_event_work_grps); + atomic_set(&scheduler->pending_kcpuq_works, false); + spin_lock_init(&scheduler->kcpuq_work_queues_lock); + INIT_LIST_HEAD(&scheduler->kcpuq_work_queues); + atomic_set(&scheduler->pending_tick_work, false); + atomic_set(&scheduler->pending_tock_work, false); + atomic_set(&scheduler->pending_gpu_idle_work, 0); - return 0; + return kbase_csf_tiler_heap_reclaim_mgr_init(kbdev); } void kbase_csf_scheduler_term(struct kbase_device *kbdev) { struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; - if (scheduler->gpuq_kthread) { + if (!IS_ERR_OR_NULL(scheduler->gpuq_kthread)) { scheduler->kthread_running = false; complete(&scheduler->kthread_signal); kthread_stop(scheduler->gpuq_kthread); @@ -6784,7 +6878,6 @@ void kbase_csf_scheduler_term(struct kbase_device *kbdev) * to be active at the time of Driver unload. 
*/ WARN_ON(kbase_csf_scheduler_get_nr_active_csgs(kbdev)); - flush_work(&kbdev->csf.scheduler.gpu_idle_work); mutex_lock(&kbdev->csf.scheduler.lock); if (kbdev->csf.scheduler.state != SCHED_SUSPENDED) { @@ -6811,9 +6904,6 @@ void kbase_csf_scheduler_term(struct kbase_device *kbdev) void kbase_csf_scheduler_early_term(struct kbase_device *kbdev) { - if (kbdev->csf.scheduler.idle_wq) - destroy_workqueue(kbdev->csf.scheduler.idle_wq); - kbase_csf_tiler_heap_reclaim_mgr_term(kbdev); mutex_destroy(&kbdev->csf.scheduler.lock); } @@ -7096,6 +7186,65 @@ int kbase_csf_scheduler_handle_runtime_suspend(struct kbase_device *kbdev) return 0; } +void kbase_csf_scheduler_enqueue_sync_update_work(struct kbase_context *kctx) +{ + struct kbase_csf_scheduler *const scheduler = &kctx->kbdev->csf.scheduler; + unsigned long flags; + + spin_lock_irqsave(&scheduler->sync_update_work_ctxs_lock, flags); + if (list_empty(&kctx->csf.sched.sync_update_work)) { + list_add_tail(&kctx->csf.sched.sync_update_work, &scheduler->sync_update_work_ctxs); + atomic_inc(&kctx->csf.pending_sync_update); + if (atomic_cmpxchg(&scheduler->pending_sync_update_works, false, true) == false) + complete(&scheduler->kthread_signal); + } + spin_unlock_irqrestore(&scheduler->sync_update_work_ctxs_lock, flags); +} + +void kbase_csf_scheduler_enqueue_protm_event_work(struct kbase_queue_group *group) +{ + struct kbase_context *const kctx = group->kctx; + struct kbase_csf_scheduler *const scheduler = &kctx->kbdev->csf.scheduler; + unsigned long flags; + + spin_lock_irqsave(&scheduler->protm_event_work_grps_lock, flags); + if (list_empty(&group->protm_event_work)) { + list_add_tail(&group->protm_event_work, &scheduler->protm_event_work_grps); + atomic_inc(&group->pending_protm_event_work); + if (atomic_cmpxchg(&scheduler->pending_protm_event_works, false, true) == false) + complete(&scheduler->kthread_signal); + } + spin_unlock_irqrestore(&scheduler->protm_event_work_grps_lock, flags); +} + +void kbase_csf_scheduler_enqueue_kcpuq_work(struct kbase_kcpu_command_queue *queue) +{ + struct kbase_csf_scheduler *const scheduler = &queue->kctx->kbdev->csf.scheduler; + unsigned long flags; + + spin_lock_irqsave(&scheduler->kcpuq_work_queues_lock, flags); + if (list_empty(&queue->high_prio_work)) { + list_add_tail(&queue->high_prio_work, &scheduler->kcpuq_work_queues); + atomic_inc(&queue->pending_kick); + if (atomic_cmpxchg(&scheduler->pending_kcpuq_works, false, true) == false) + complete(&scheduler->kthread_signal); + } + spin_unlock_irqrestore(&scheduler->kcpuq_work_queues_lock, flags); +} + +void kbase_csf_scheduler_wait_for_kthread_pending_work(struct kbase_device *kbdev, + atomic_t *pending) +{ + /* + * Signal kbase_csf_scheduler_kthread() to allow for the + * eventual completion of the current iteration. Once the work is + * done, the event_wait wait queue shall be signalled. + */ + + complete(&kbdev->csf.scheduler.kthread_signal); + wait_event(kbdev->csf.event_wait, atomic_read(pending) == 0); +} + void kbase_csf_scheduler_reval_idleness_post_sleep(struct kbase_device *kbdev) { u32 csg_nr; diff --git a/mali_kbase/csf/mali_kbase_csf_scheduler.h b/mali_kbase/csf/mali_kbase_csf_scheduler.h index 5047092..2200bdf 100644 --- a/mali_kbase/csf/mali_kbase_csf_scheduler.h +++ b/mali_kbase/csf/mali_kbase_csf_scheduler.h @@ -235,7 +235,8 @@ void kbase_csf_scheduler_early_term(struct kbase_device *kbdev); * No explicit re-initialization is done for CSG & CS interface I/O pages; * instead, that happens implicitly on firmware reload. 
* - * Should be called only after initiating the GPU reset. + * Should be called either after initiating the GPU reset or when MCU reset is + * expected to follow such as GPU_LOST case. */ void kbase_csf_scheduler_reset(struct kbase_device *kbdev); @@ -488,6 +489,48 @@ static inline bool kbase_csf_scheduler_all_csgs_idle(struct kbase_device *kbdev) } /** + * kbase_csf_scheduler_enqueue_sync_update_work() - Add a context to the list + * of contexts to handle + * SYNC_UPDATE events. + * + * @kctx: The context to handle SYNC_UPDATE event + * + * This function wakes up kbase_csf_scheduler_kthread() to handle pending + * SYNC_UPDATE events for all contexts. + */ +void kbase_csf_scheduler_enqueue_sync_update_work(struct kbase_context *kctx); + +/** + * kbase_csf_scheduler_enqueue_protm_event_work() - Add a group to the list + * of groups to handle + * PROTM requests. + * + * @group: The group to handle protected mode request + * + * This function wakes up kbase_csf_scheduler_kthread() to handle pending + * protected mode requests for all groups. + */ +void kbase_csf_scheduler_enqueue_protm_event_work(struct kbase_queue_group *group); + +/** + * kbase_csf_scheduler_enqueue_kcpuq_work() - Wake up kbase_csf_scheduler_kthread() to process + * pending commands for a KCPU queue. + * + * @queue: The queue to process pending commands for + */ +void kbase_csf_scheduler_enqueue_kcpuq_work(struct kbase_kcpu_command_queue *queue); + +/** + * kbase_csf_scheduler_wait_for_kthread_pending_work - Wait until a pending work has completed in + * kbase_csf_scheduler_kthread(). + * + * @kbdev: Instance of a GPU platform device that implements a CSF interface + * @pending: The work to wait for + */ +void kbase_csf_scheduler_wait_for_kthread_pending_work(struct kbase_device *kbdev, + atomic_t *pending); + +/** * kbase_csf_scheduler_invoke_tick() - Invoke the scheduling tick * * @kbdev: Pointer to the device diff --git a/mali_kbase/csf/mali_kbase_csf_tiler_heap.c b/mali_kbase/csf/mali_kbase_csf_tiler_heap.c index 2d148ee..51d665f 100644 --- a/mali_kbase/csf/mali_kbase_csf_tiler_heap.c +++ b/mali_kbase/csf/mali_kbase_csf_tiler_heap.c @@ -218,7 +218,7 @@ static void remove_unlinked_chunk(struct kbase_context *kctx, if (WARN_ON(!list_empty(&chunk->link))) return; - kbase_gpu_vm_lock(kctx); + kbase_gpu_vm_lock_with_pmode_sync(kctx); kbase_vunmap(kctx, &chunk->map); /* KBASE_REG_DONT_NEED regions will be confused with ephemeral regions (inc freed JIT * regions), and so we must clear that flag too before freeing. 
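Note that the protected-mode synchronisation introduced earlier in this patch (down_write(&kbdev->csf.pmode_sync_sem) around protected-mode entry) pairs with the kbase_gpu_vm_lock_with_pmode_sync()/kbase_gpu_vm_unlock_with_pmode_sync() helpers used in the hunk above; their bodies are not part of this diff, but presumably they take the same semaphore on the read side so that region teardown cannot interleave with a protected-mode transition. A hedged sketch of that assumed shape:

/* Assumed read-side pairing - an illustrative sketch, not the kbase code. */
static inline void vm_lock_with_pmode_sync_sketch(struct kbase_context *kctx)
{
	down_read(&kctx->kbdev->csf.pmode_sync_sem);
	kbase_gpu_vm_lock(kctx);
}

static inline void vm_unlock_with_pmode_sync_sketch(struct kbase_context *kctx)
{
	kbase_gpu_vm_unlock(kctx);
	up_read(&kctx->kbdev->csf.pmode_sync_sem);
}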
@@ -231,7 +231,7 @@ static void remove_unlinked_chunk(struct kbase_context *kctx, chunk->region->flags &= ~KBASE_REG_DONT_NEED; #endif kbase_mem_free_region(kctx, chunk->region); - kbase_gpu_vm_unlock(kctx); + kbase_gpu_vm_unlock_with_pmode_sync(kctx); kfree(chunk); } @@ -1058,6 +1058,7 @@ static bool delete_chunk_physical_pages(struct kbase_csf_tiler_heap *heap, u64 c struct kbase_csf_tiler_heap_chunk *chunk = NULL; lockdep_assert_held(&heap->kctx->csf.tiler_heaps.lock); + lockdep_assert_held(&kctx->kbdev->csf.scheduler.lock); chunk = find_chunk(heap, chunk_gpu_va); if (unlikely(!chunk)) { diff --git a/mali_kbase/csf/mali_kbase_csf_tiler_heap_reclaim.c b/mali_kbase/csf/mali_kbase_csf_tiler_heap_reclaim.c index 2fc19de..df4feb7 100644 --- a/mali_kbase/csf/mali_kbase_csf_tiler_heap_reclaim.c +++ b/mali_kbase/csf/mali_kbase_csf_tiler_heap_reclaim.c @@ -331,8 +331,8 @@ static unsigned long kbase_csf_tiler_heap_reclaim_scan_free_pages(struct kbase_d static unsigned long kbase_csf_tiler_heap_reclaim_count_objects(struct shrinker *s, struct shrink_control *sc) { - struct kbase_device *kbdev = - container_of(s, struct kbase_device, csf.scheduler.reclaim_mgr.heap_reclaim); + struct kbase_device *kbdev = KBASE_GET_KBASE_DATA_FROM_SHRINKER( + s, struct kbase_device, csf.scheduler.reclaim_mgr.heap_reclaim); return kbase_csf_tiler_heap_reclaim_count_free_pages(kbdev, sc); } @@ -340,8 +340,8 @@ static unsigned long kbase_csf_tiler_heap_reclaim_count_objects(struct shrinker static unsigned long kbase_csf_tiler_heap_reclaim_scan_objects(struct shrinker *s, struct shrink_control *sc) { - struct kbase_device *kbdev = - container_of(s, struct kbase_device, csf.scheduler.reclaim_mgr.heap_reclaim); + struct kbase_device *kbdev = KBASE_GET_KBASE_DATA_FROM_SHRINKER( + s, struct kbase_device, csf.scheduler.reclaim_mgr.heap_reclaim); return kbase_csf_tiler_heap_reclaim_scan_free_pages(kbdev, sc); } @@ -352,11 +352,17 @@ void kbase_csf_tiler_heap_reclaim_ctx_init(struct kbase_context *kctx) INIT_LIST_HEAD(&kctx->csf.sched.heap_info.mgr_link); } -void kbase_csf_tiler_heap_reclaim_mgr_init(struct kbase_device *kbdev) +int kbase_csf_tiler_heap_reclaim_mgr_init(struct kbase_device *kbdev) { struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; - struct shrinker *reclaim = &scheduler->reclaim_mgr.heap_reclaim; u8 prio; + struct shrinker *reclaim; + + reclaim = + KBASE_INIT_RECLAIM(&(scheduler->reclaim_mgr), heap_reclaim, "mali-csf-tiler-heap"); + if (!reclaim) + return -ENOMEM; + KBASE_SET_RECLAIM(&(scheduler->reclaim_mgr), heap_reclaim, reclaim); for (prio = KBASE_QUEUE_GROUP_PRIORITY_REALTIME; prio < KBASE_QUEUE_GROUP_PRIORITY_COUNT; prio++) @@ -367,13 +373,10 @@ void kbase_csf_tiler_heap_reclaim_mgr_init(struct kbase_device *kbdev) reclaim->seeks = HEAP_SHRINKER_SEEKS; reclaim->batch = HEAP_SHRINKER_BATCH; -#if !defined(CONFIG_MALI_VECTOR_DUMP) -#if KERNEL_VERSION(6, 0, 0) > LINUX_VERSION_CODE - register_shrinker(reclaim); -#else - register_shrinker(reclaim, "mali-csf-tiler-heap"); -#endif -#endif + if (!IS_ENABLED(CONFIG_MALI_VECTOR_DUMP)) + KBASE_REGISTER_SHRINKER(reclaim, "mali-csf-tiler-heap", kbdev); + + return 0; } void kbase_csf_tiler_heap_reclaim_mgr_term(struct kbase_device *kbdev) @@ -381,9 +384,8 @@ void kbase_csf_tiler_heap_reclaim_mgr_term(struct kbase_device *kbdev) struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; u8 prio; -#if !defined(CONFIG_MALI_VECTOR_DUMP) - unregister_shrinker(&scheduler->reclaim_mgr.heap_reclaim); -#endif + if (!IS_ENABLED(CONFIG_MALI_VECTOR_DUMP)) + 
KBASE_UNREGISTER_SHRINKER(scheduler->reclaim_mgr.heap_reclaim); for (prio = KBASE_QUEUE_GROUP_PRIORITY_REALTIME; prio < KBASE_QUEUE_GROUP_PRIORITY_COUNT; prio++) diff --git a/mali_kbase/csf/mali_kbase_csf_tiler_heap_reclaim.h b/mali_kbase/csf/mali_kbase_csf_tiler_heap_reclaim.h index 7880de0..d41b7ba 100644 --- a/mali_kbase/csf/mali_kbase_csf_tiler_heap_reclaim.h +++ b/mali_kbase/csf/mali_kbase_csf_tiler_heap_reclaim.h @@ -66,8 +66,10 @@ void kbase_csf_tiler_heap_reclaim_ctx_init(struct kbase_context *kctx); * @kbdev: Pointer to the device. * * This function must be called only when a kbase device is initialized. + * + * Return: 0 if issuing reclaim_mgr init was successful, otherwise an error code. */ -void kbase_csf_tiler_heap_reclaim_mgr_init(struct kbase_device *kbdev); +int kbase_csf_tiler_heap_reclaim_mgr_init(struct kbase_device *kbdev); /** * kbase_csf_tiler_heap_reclaim_mgr_term - Termination call for the tiler heap reclaim manger. diff --git a/mali_kbase/csf/mali_kbase_csf_tl_reader.c b/mali_kbase/csf/mali_kbase_csf_tl_reader.c index 06163e5..4ee64e1 100644 --- a/mali_kbase/csf/mali_kbase_csf_tl_reader.c +++ b/mali_kbase/csf/mali_kbase_csf_tl_reader.c @@ -152,13 +152,22 @@ static bool tl_reader_overflow_check(struct kbase_csf_tl_reader *self, u16 event * * Reset the reader to the default state, i.e. set all the * mutable fields to zero. + * + * NOTE: this function expects the irq spinlock to be held. */ static void tl_reader_reset(struct kbase_csf_tl_reader *self) { + lockdep_assert_held(&self->read_lock); + self->got_first_event = false; self->is_active = false; self->expected_event_id = 0; self->tl_header.btc = 0; + + /* There might be data left in the trace buffer from the previous + * tracing session. We don't want it to leak into this session. + */ + kbase_csf_firmware_trace_buffer_discard_all(self->trace_buffer); } int kbase_csf_tl_reader_flush_buffer(struct kbase_csf_tl_reader *self) @@ -325,21 +334,16 @@ static int tl_reader_update_enable_bit(struct kbase_csf_tl_reader *self, bool va void kbase_csf_tl_reader_init(struct kbase_csf_tl_reader *self, struct kbase_tlstream *stream) { - self->timer_interval = KBASE_CSF_TL_READ_INTERVAL_DEFAULT; + *self = (struct kbase_csf_tl_reader){ + .timer_interval = KBASE_CSF_TL_READ_INTERVAL_DEFAULT, + .stream = stream, + .kbdev = NULL, /* This will be initialized by tl_reader_init_late() */ + .is_active = false, + }; kbase_timer_setup(&self->read_timer, kbasep_csf_tl_reader_read_callback); - self->stream = stream; - - /* This will be initialized by tl_reader_init_late() */ - self->kbdev = NULL; - self->trace_buffer = NULL; - self->tl_header.data = NULL; - self->tl_header.size = 0; - spin_lock_init(&self->read_lock); - - tl_reader_reset(self); } void kbase_csf_tl_reader_term(struct kbase_csf_tl_reader *self) @@ -349,13 +353,19 @@ void kbase_csf_tl_reader_term(struct kbase_csf_tl_reader *self) int kbase_csf_tl_reader_start(struct kbase_csf_tl_reader *self, struct kbase_device *kbdev) { + unsigned long flags; int rcode; + spin_lock_irqsave(&self->read_lock, flags); + /* If already running, early exit. 
*/ - if (self->is_active) + if (self->is_active) { + spin_unlock_irqrestore(&self->read_lock, flags); return 0; + } if (tl_reader_init_late(self, kbdev)) { + spin_unlock_irqrestore(&self->read_lock, flags); #if IS_ENABLED(CONFIG_MALI_NO_MALI) dev_warn(kbdev->dev, "CSFFW timeline is not available for MALI_NO_MALI builds!"); return 0; @@ -367,6 +377,9 @@ int kbase_csf_tl_reader_start(struct kbase_csf_tl_reader *self, struct kbase_dev tl_reader_reset(self); self->is_active = true; + + spin_unlock_irqrestore(&self->read_lock, flags); + /* Set bytes to copy to the header size. This is to trigger copying * of the header to the user space. */ diff --git a/mali_kbase/csf/mali_kbase_csf_trace_buffer.c b/mali_kbase/csf/mali_kbase_csf_trace_buffer.c index 47a3d21..9c27a71 100644 --- a/mali_kbase/csf/mali_kbase_csf_trace_buffer.c +++ b/mali_kbase/csf/mali_kbase_csf_trace_buffer.c @@ -520,6 +520,14 @@ void kbase_csf_firmware_trace_buffer_discard(struct firmware_trace_buffer *trace } EXPORT_SYMBOL(kbase_csf_firmware_trace_buffer_discard); +void kbase_csf_firmware_trace_buffer_discard_all(struct firmware_trace_buffer *trace_buffer) +{ + if (WARN_ON(!trace_buffer)) + return; + + *(trace_buffer->cpu_va.extract_cpu_va) = *(trace_buffer->cpu_va.insert_cpu_va); +} + static void update_trace_buffer_active_mask64(struct firmware_trace_buffer *tb, u64 mask) { unsigned int i; diff --git a/mali_kbase/csf/mali_kbase_csf_trace_buffer.h b/mali_kbase/csf/mali_kbase_csf_trace_buffer.h index 90dfcb2..35988ea 100644 --- a/mali_kbase/csf/mali_kbase_csf_trace_buffer.h +++ b/mali_kbase/csf/mali_kbase_csf_trace_buffer.h @@ -180,6 +180,15 @@ unsigned int kbase_csf_firmware_trace_buffer_read_data(struct firmware_trace_buf void kbase_csf_firmware_trace_buffer_discard(struct firmware_trace_buffer *trace_buffer); /** + * kbase_csf_firmware_trace_buffer_discard_all - Discard all data from a trace buffer + * + * @trace_buffer: Trace buffer handle + * + * Discard all the data in the trace buffer to make it empty. 
+ */ +void kbase_csf_firmware_trace_buffer_discard_all(struct firmware_trace_buffer *trace_buffer); + +/** * kbase_csf_firmware_trace_buffer_get_active_mask64 - Get trace buffer active mask * * @tb: Trace buffer handle diff --git a/mali_kbase/csf/mali_kbase_csf_util.c b/mali_kbase/csf/mali_kbase_csf_util.c index 7dc32a1..504379e 100644 --- a/mali_kbase/csf/mali_kbase_csf_util.c +++ b/mali_kbase/csf/mali_kbase_csf_util.c @@ -115,7 +115,7 @@ struct kbasep_printer *kbasep_printer_buffer_init(struct kbase_device *kbdev, if (kbpr) { if (kfifo_alloc(&kbpr->fifo, KBASEP_PRINTER_BUFFER_MAX_SIZE, GFP_KERNEL)) { - kfree(kbpr); + vfree(kbpr); return NULL; } kbpr->kbdev = kbdev; diff --git a/mali_kbase/device/backend/mali_kbase_device_csf.c b/mali_kbase/device/backend/mali_kbase_device_csf.c index 680c69d..33db85d 100644 --- a/mali_kbase/device/backend/mali_kbase_device_csf.c +++ b/mali_kbase/device/backend/mali_kbase_device_csf.c @@ -279,10 +279,8 @@ static const struct kbase_device_init dev_init[] = { { kbase_gpu_device_create, kbase_gpu_device_destroy, "Dummy model initialization failed" }, #else /* !IS_ENABLED(CONFIG_MALI_REAL_HW) */ { kbase_get_irqs, NULL, "IRQ search failed" }, -#endif /* !IS_ENABLED(CONFIG_MALI_REAL_HW) */ -#if !IS_ENABLED(CONFIG_MALI_NO_MALI) { registers_map, registers_unmap, "Register map failed" }, -#endif /* !IS_ENABLED(CONFIG_MALI_NO_MALI) */ +#endif /* !IS_ENABLED(CONFIG_MALI_REAL_HW) */ #if IS_ENABLED(CONFIG_MALI_TRACE_POWER_GPU_WORK_PERIOD) { kbase_gpu_metrics_init, kbase_gpu_metrics_term, "GPU metrics initialization failed" }, #endif /* IS_ENABLED(CONFIG_MALI_TRACE_POWER_GPU_WORK_PERIOD) */ diff --git a/mali_kbase/device/backend/mali_kbase_device_hw_csf.c b/mali_kbase/device/backend/mali_kbase_device_hw_csf.c index ab9df01..c5959f3 100644 --- a/mali_kbase/device/backend/mali_kbase_device_hw_csf.c +++ b/mali_kbase/device/backend/mali_kbase_device_hw_csf.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2020-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2020-2024 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -173,6 +173,9 @@ void kbase_gpu_interrupt(struct kbase_device *kbdev, u32 val) kbase_pm_power_changed(kbdev); } + if (val & MCU_STATUS_GPU_IRQ) + wake_up_all(&kbdev->csf.event_wait); + KBASE_KTRACE_ADD(kbdev, CORE_GPU_IRQ_DONE, NULL, val); } KBASE_EXPORT_TEST_API(kbase_gpu_interrupt); diff --git a/mali_kbase/device/backend/mali_kbase_device_jm.c b/mali_kbase/device/backend/mali_kbase_device_jm.c index b2fd8bd..4acf2a5 100644 --- a/mali_kbase/device/backend/mali_kbase_device_jm.c +++ b/mali_kbase/device/backend/mali_kbase_device_jm.c @@ -217,10 +217,8 @@ static const struct kbase_device_init dev_init[] = { { kbase_gpu_device_create, kbase_gpu_device_destroy, "Dummy model initialization failed" }, #else /* !IS_ENABLED(CONFIG_MALI_REAL_HW) */ { kbase_get_irqs, NULL, "IRQ search failed" }, -#endif /* !IS_ENABLED(CONFIG_MALI_REAL_HW) */ -#if !IS_ENABLED(CONFIG_MALI_NO_MALI) { registers_map, registers_unmap, "Register map failed" }, -#endif /* !IS_ENABLED(CONFIG_MALI_NO_MALI) */ +#endif /* !IS_ENABLED(CONFIG_MALI_REAL_HW) */ #if IS_ENABLED(CONFIG_MALI_TRACE_POWER_GPU_WORK_PERIOD) { kbase_gpu_metrics_init, kbase_gpu_metrics_term, "GPU metrics initialization failed" }, #endif /* IS_ENABLED(CONFIG_MALI_TRACE_POWER_GPU_WORK_PERIOD) */ diff --git a/mali_kbase/device/mali_kbase_device.h b/mali_kbase/device/mali_kbase_device.h index 9cca6af..b58f0b5 100644 --- a/mali_kbase/device/mali_kbase_device.h +++ b/mali_kbase/device/mali_kbase_device.h @@ -58,6 +58,9 @@ void kbase_increment_device_id(void); * When a device file is opened for the first time, * load firmware and initialize hardware counter components. * + * It is safe for this function to be called multiple times without ill + * effects. Only the first call would be effective. + * * Return: 0 on success. An error code on failure. */ int kbase_device_firmware_init_once(struct kbase_device *kbdev); diff --git a/mali_kbase/device/mali_kbase_device_hw.c b/mali_kbase/device/mali_kbase_device_hw.c index da597af..8b20c0b 100644 --- a/mali_kbase/device/mali_kbase_device_hw.c +++ b/mali_kbase/device/mali_kbase_device_hw.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2014-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2014-2024 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -32,7 +32,7 @@ bool kbase_is_gpu_removed(struct kbase_device *kbdev) if (!IS_ENABLED(CONFIG_MALI_ARBITER_SUPPORT)) return false; - return (kbase_reg_read32(kbdev, GPU_CONTROL_ENUM(GPU_ID)) == 0); + return (KBASE_REG_READ(kbdev, GPU_CONTROL_ENUM(GPU_ID)) == 0); } /** diff --git a/mali_kbase/hw_access/backend/mali_kbase_hw_access_real_hw.c b/mali_kbase/hw_access/backend/mali_kbase_hw_access_real_hw.c index f4afbf5..21a4fd1 100644 --- a/mali_kbase/hw_access/backend/mali_kbase_hw_access_real_hw.c +++ b/mali_kbase/hw_access/backend/mali_kbase_hw_access_real_hw.c @@ -24,12 +24,13 @@ #include <mali_kbase.h> #include <hw_access/mali_kbase_hw_access.h> +#include <linux/mali_hw_access.h> u64 kbase_reg_get_gpu_id(struct kbase_device *kbdev) { u32 val[2] = { 0 }; - val[0] = readl(kbdev->reg); + val[0] = mali_readl(kbdev->reg); return (u64)val[0] | ((u64)val[1] << 32); @@ -45,7 +46,7 @@ u32 kbase_reg_read32(struct kbase_device *kbdev, u32 reg_enum) KBASE_REGMAP_PERM_READ | KBASE_REGMAP_WIDTH_32_BIT))) return 0; - val = readl(kbdev->regmap.regs[reg_enum]); + val = mali_readl(kbdev->regmap.regs[reg_enum]); #if IS_ENABLED(CONFIG_DEBUG_FS) if (unlikely(kbdev->io_history.enabled)) @@ -69,8 +70,8 @@ u64 kbase_reg_read64(struct kbase_device *kbdev, u32 reg_enum) KBASE_REGMAP_PERM_READ | KBASE_REGMAP_WIDTH_64_BIT))) return 0; - val = (u64)readl(kbdev->regmap.regs[reg_enum]) | - ((u64)readl(kbdev->regmap.regs[reg_enum] + 4) << 32); + val = (u64)mali_readl(kbdev->regmap.regs[reg_enum]) | + ((u64)mali_readl(kbdev->regmap.regs[reg_enum] + 4) << 32); #if IS_ENABLED(CONFIG_DEBUG_FS) if (unlikely(kbdev->io_history.enabled)) { @@ -101,9 +102,9 @@ u64 kbase_reg_read64_coherent(struct kbase_device *kbdev, u32 reg_enum) return 0; do { - hi1 = readl(kbdev->regmap.regs[reg_enum] + 4); - lo = readl(kbdev->regmap.regs[reg_enum]); - hi2 = readl(kbdev->regmap.regs[reg_enum] + 4); + hi1 = mali_readl(kbdev->regmap.regs[reg_enum] + 4); + lo = mali_readl(kbdev->regmap.regs[reg_enum]); + hi2 = mali_readl(kbdev->regmap.regs[reg_enum] + 4); } while (hi1 != hi2); val = lo | (((u64)hi1) << 32); @@ -131,7 +132,7 @@ void kbase_reg_write32(struct kbase_device *kbdev, u32 reg_enum, u32 value) KBASE_REGMAP_PERM_WRITE | KBASE_REGMAP_WIDTH_32_BIT))) return; - writel(value, kbdev->regmap.regs[reg_enum]); + mali_writel(value, kbdev->regmap.regs[reg_enum]); #if IS_ENABLED(CONFIG_DEBUG_FS) if (unlikely(kbdev->io_history.enabled)) @@ -151,8 +152,8 @@ void kbase_reg_write64(struct kbase_device *kbdev, u32 reg_enum, u64 value) KBASE_REGMAP_PERM_WRITE | KBASE_REGMAP_WIDTH_64_BIT))) return; - writel(value & 0xFFFFFFFF, kbdev->regmap.regs[reg_enum]); - writel(value >> 32, kbdev->regmap.regs[reg_enum] + 4); + mali_writel(value & 0xFFFFFFFF, kbdev->regmap.regs[reg_enum]); + mali_writel(value >> 32, kbdev->regmap.regs[reg_enum] + 4); #if IS_ENABLED(CONFIG_DEBUG_FS) if (unlikely(kbdev->io_history.enabled)) { diff --git a/mali_kbase/hw_access/mali_kbase_hw_access_regmap.h b/mali_kbase/hw_access/mali_kbase_hw_access_regmap.h index 9bd646d..1ba2598 100644 --- a/mali_kbase/hw_access/mali_kbase_hw_access_regmap.h +++ b/mali_kbase/hw_access/mali_kbase_hw_access_regmap.h @@ -308,6 +308,16 @@ #define TC_CLOCK_GATE_OVERRIDE (1ul << 0) /* End TILER_CONFIG register */ +/* L2_FEATURES register */ +#define L2_FEATURES_CACHE_SIZE_SHIFT GPU_U(16) +#define L2_FEATURES_CACHE_SIZE_MASK (GPU_U(0xFF) << 
L2_FEATURES_CACHE_SIZE_SHIFT) +#define L2_FEATURES_CACHE_SIZE_GET(reg_val) \ + (((reg_val)&L2_FEATURES_CACHE_SIZE_MASK) >> L2_FEATURES_CACHE_SIZE_SHIFT) +#define L2_FEATURES_CACHE_SIZE_SET(reg_val, value) \ + (~(~(reg_val) | L2_FEATURES_CACHE_SIZE_MASK) | \ + (((value) << L2_FEATURES_CACHE_SIZE_SHIFT) & L2_FEATURES_CACHE_SIZE_MASK)) +/* End L2_FEATURES register */ + /* L2_CONFIG register */ #define L2_CONFIG_SIZE_SHIFT 16 #define L2_CONFIG_SIZE_MASK (0xFFul << L2_CONFIG_SIZE_SHIFT) diff --git a/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf.c b/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf.c index d7911ae..d1290ca 100644 --- a/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf.c +++ b/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2021-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2021-2024 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -21,7 +21,6 @@ #include "hwcnt/backend/mali_kbase_hwcnt_backend_csf.h" #include "hwcnt/mali_kbase_hwcnt_gpu.h" -#include "hwcnt/mali_kbase_hwcnt_types.h" #include <linux/log2.h> #include <linux/kernel.h> @@ -255,7 +254,8 @@ struct kbase_hwcnt_csf_physical_layout { * @hwc_threshold_work: Worker for consuming available samples when * threshold interrupt raised. * @num_l2_slices: Current number of L2 slices allocated to the GPU. - * @shader_present_bitmap: Current shader-present bitmap that is allocated to the GPU. + * @powered_shader_core_mask: The common mask between the debug_core_mask + * and the shader_present_bitmap. */ struct kbase_hwcnt_backend_csf { struct kbase_hwcnt_backend_csf_info *info; @@ -283,7 +283,7 @@ struct kbase_hwcnt_backend_csf { struct work_struct hwc_dump_work; struct work_struct hwc_threshold_work; size_t num_l2_slices; - u64 shader_present_bitmap; + u64 powered_shader_core_mask; }; static bool kbasep_hwcnt_backend_csf_backend_exists(struct kbase_hwcnt_backend_csf_info *csf_info) @@ -296,7 +296,7 @@ static bool kbasep_hwcnt_backend_csf_backend_exists(struct kbase_hwcnt_backend_c } void kbase_hwcnt_backend_csf_set_hw_availability(struct kbase_hwcnt_backend_interface *iface, - size_t num_l2_slices, u64 shader_present_bitmap) + size_t num_l2_slices, u64 powered_shader_core_mask) { struct kbase_hwcnt_backend_csf_info *csf_info; @@ -313,12 +313,12 @@ void kbase_hwcnt_backend_csf_set_hw_availability(struct kbase_hwcnt_backend_inte return; if (WARN_ON(num_l2_slices > csf_info->backend->phys_layout.mmu_l2_cnt) || - WARN_ON((shader_present_bitmap & csf_info->backend->phys_layout.shader_avail_mask) != - shader_present_bitmap)) + WARN_ON((powered_shader_core_mask & csf_info->backend->phys_layout.shader_avail_mask) != + powered_shader_core_mask)) return; csf_info->backend->num_l2_slices = num_l2_slices; - csf_info->backend->shader_present_bitmap = shader_present_bitmap; + csf_info->backend->powered_shader_core_mask = powered_shader_core_mask; } /** @@ -424,7 +424,7 @@ static void kbasep_hwcnt_backend_csf_init_layout( WARN_ON(!prfcnt_info); WARN_ON(!phys_layout); - shader_core_cnt = (size_t)fls64(prfcnt_info->core_mask); + shader_core_cnt = (size_t)fls64(prfcnt_info->sc_core_mask); values_per_block = prfcnt_info->prfcnt_block_size / KBASE_HWCNT_VALUE_HW_BYTES; fw_block_cnt = div_u64(prfcnt_info->prfcnt_fw_size, prfcnt_info->prfcnt_block_size); hw_block_cnt = 
div_u64(prfcnt_info->prfcnt_hw_size, prfcnt_info->prfcnt_block_size); @@ -445,7 +445,7 @@ static void kbasep_hwcnt_backend_csf_init_layout( .fw_block_cnt = fw_block_cnt, .hw_block_cnt = hw_block_cnt, .block_cnt = fw_block_cnt + hw_block_cnt, - .shader_avail_mask = prfcnt_info->core_mask, + .shader_avail_mask = prfcnt_info->sc_core_mask, .headers_per_block = KBASE_HWCNT_V5_HEADERS_PER_BLOCK, .values_per_block = values_per_block, .counters_per_block = values_per_block - KBASE_HWCNT_V5_HEADERS_PER_BLOCK, @@ -517,34 +517,21 @@ static void kbasep_hwcnt_backend_csf_update_user_sample(struct kbase_hwcnt_backe memset(backend_csf->block_states, 0, block_state_bytes); } -/** - * kbasep_hwcnt_backend_csf_update_block_state - Update block state of a block instance with - * information from a sample. - * @phys_layout: Physical memory layout information of HWC - * sample buffer. - * @enable_mask: Counter enable mask for the block whose state is being updated. - * @enable_state: The CSF backend internal enabled state. - * @exiting_protm: Whether or not the sample is taken when the GPU is exiting - * protected mode. - * @block_idx: Index of block within the ringbuffer. - * @block_state: Pointer to existing block state of the block whose state is being - * updated. - * @fw_in_protected_mode: Whether or not GPU is in protected mode during sampling. - */ -static void kbasep_hwcnt_backend_csf_update_block_state( - const struct kbase_hwcnt_csf_physical_layout *phys_layout, const u32 enable_mask, - enum kbase_hwcnt_backend_csf_enable_state enable_state, bool exiting_protm, - size_t block_idx, blk_stt_t *const block_state, bool fw_in_protected_mode) +void kbasep_hwcnt_backend_csf_update_block_state(struct kbase_hwcnt_backend_csf *backend, + const u32 enable_mask, bool exiting_protm, + size_t block_idx, blk_stt_t *const block_state, + bool fw_in_protected_mode) { + const struct kbase_hwcnt_csf_physical_layout *phys_layout = &backend->phys_layout; /* Offset of shader core blocks from the start of the HW blocks in the sample */ size_t shader_core_block_offset = - (size_t)(phys_layout->hw_block_cnt - phys_layout->shader_cnt); + (size_t)(phys_layout->block_cnt - phys_layout->shader_cnt); bool is_shader_core_block; - is_shader_core_block = block_idx >= shader_core_block_offset; + is_shader_core_block = (block_idx >= shader_core_block_offset); /* Set power bits for the block state for the block, for the sample */ - switch (enable_state) { + switch (backend->enable_state) { /* Disabled states */ case KBASE_HWCNT_BACKEND_CSF_DISABLED: case KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_ENABLED: @@ -592,21 +579,45 @@ static void kbasep_hwcnt_backend_csf_update_block_state( KBASE_HWCNT_STATE_NORMAL); else kbase_hwcnt_block_state_append(block_state, KBASE_HWCNT_STATE_NORMAL); + + /* powered_shader_core_mask stored in the backend is a combination of + * the shader present and the debug core mask, so explicit checking of the + * core mask is not required here. 
+ */ + if (is_shader_core_block) { + u64 current_shader_core = 1ULL << (block_idx - shader_core_block_offset); + + WARN_ON_ONCE(backend->phys_layout.shader_cnt > 64); + + if (current_shader_core & backend->info->backend->powered_shader_core_mask) + kbase_hwcnt_block_state_append(block_state, KBASE_HWCNT_STATE_AVAILABLE); + else if (current_shader_core & ~backend->info->backend->powered_shader_core_mask) + kbase_hwcnt_block_state_append(block_state, KBASE_HWCNT_STATE_UNAVAILABLE); + else + WARN_ON_ONCE(true); + } + else + kbase_hwcnt_block_state_append(block_state, KBASE_HWCNT_STATE_AVAILABLE); } -static void kbasep_hwcnt_backend_csf_accumulate_sample( - const struct kbase_hwcnt_csf_physical_layout *phys_layout, size_t dump_bytes, - u64 *accum_buf, const u32 *old_sample_buf, const u32 *new_sample_buf, - blk_stt_t *const block_states, bool clearing_samples, - enum kbase_hwcnt_backend_csf_enable_state enable_state, bool fw_in_protected_mode) +static void kbasep_hwcnt_backend_csf_accumulate_sample(struct kbase_hwcnt_backend_csf *backend, + const u32 *old_sample_buf, + const u32 *new_sample_buf) { + const struct kbase_hwcnt_csf_physical_layout *phys_layout = &backend->phys_layout; + const size_t dump_bytes = backend->info->prfcnt_info.dump_bytes; + const size_t values_per_block = phys_layout->values_per_block; + blk_stt_t *const block_states = backend->block_states; + const bool fw_in_protected_mode = backend->info->fw_in_protected_mode; + const bool clearing_samples = backend->info->prfcnt_info.clearing_samples; + u64 *accum_buf = backend->accum_buf; + size_t block_idx; const u32 *old_block = old_sample_buf; const u32 *new_block = new_sample_buf; u64 *acc_block = accum_buf; /* Flag to indicate whether current sample is exiting protected mode. */ bool exiting_protm = false; - const size_t values_per_block = phys_layout->values_per_block; /* The block pointers now point to the first HW block, which is always a CSHW/front-end * block. 
The counter enable mask for this block can be checked to determine whether this @@ -620,9 +631,8 @@ static void kbasep_hwcnt_backend_csf_accumulate_sample( const u32 old_enable_mask = old_block[phys_layout->enable_mask_offset]; const u32 new_enable_mask = new_block[phys_layout->enable_mask_offset]; /* Update block state with information of the current sample */ - kbasep_hwcnt_backend_csf_update_block_state(phys_layout, new_enable_mask, - enable_state, exiting_protm, block_idx, - &block_states[block_idx], + kbasep_hwcnt_backend_csf_update_block_state(backend, new_enable_mask, exiting_protm, + block_idx, &block_states[block_idx], fw_in_protected_mode); if (!(new_enable_mask & HWCNT_BLOCK_EMPTY_SAMPLE)) { @@ -706,7 +716,6 @@ static void kbasep_hwcnt_backend_csf_accumulate_samples(struct kbase_hwcnt_backe u8 *cpu_dump_base = (u8 *)backend_csf->ring_buf_cpu_base; const size_t ring_buf_cnt = backend_csf->info->ring_buf_cnt; const size_t buf_dump_bytes = backend_csf->info->prfcnt_info.dump_bytes; - bool clearing_samples = backend_csf->info->prfcnt_info.clearing_samples; u32 *old_sample_buf = backend_csf->old_sample_buf; u32 *new_sample_buf = old_sample_buf; const struct kbase_hwcnt_csf_physical_layout *phys_layout = &backend_csf->phys_layout; @@ -740,10 +749,8 @@ static void kbasep_hwcnt_backend_csf_accumulate_samples(struct kbase_hwcnt_backe const u32 buf_idx = raw_idx & (ring_buf_cnt - 1); new_sample_buf = (u32 *)&cpu_dump_base[buf_idx * buf_dump_bytes]; - kbasep_hwcnt_backend_csf_accumulate_sample( - phys_layout, buf_dump_bytes, backend_csf->accum_buf, old_sample_buf, - new_sample_buf, backend_csf->block_states, clearing_samples, - backend_csf->enable_state, backend_csf->info->fw_in_protected_mode); + kbasep_hwcnt_backend_csf_accumulate_sample(backend_csf, old_sample_buf, + new_sample_buf); old_sample_buf = new_sample_buf; } @@ -1457,7 +1464,7 @@ static int kbasep_hwcnt_backend_csf_dump_get(struct kbase_hwcnt_backend *backend ret = kbase_hwcnt_csf_dump_get(dst, backend_csf->to_user_buf, backend_csf->to_user_block_states, dst_enable_map, backend_csf->num_l2_slices, - backend_csf->shader_present_bitmap, accumulate); + backend_csf->powered_shader_core_mask, accumulate); /* If no error occurred (zero ret value), then update block state for all blocks in the * accumulation with the current sample's block state. @@ -2098,7 +2105,7 @@ int kbase_hwcnt_backend_csf_metadata_init(struct kbase_hwcnt_backend_interface * gpu_info.has_fw_counters = csf_info->prfcnt_info.prfcnt_fw_size > 0; gpu_info.l2_count = csf_info->prfcnt_info.l2_count; gpu_info.csg_cnt = csf_info->prfcnt_info.csg_count; - gpu_info.core_mask = csf_info->prfcnt_info.core_mask; + gpu_info.sc_core_mask = csf_info->prfcnt_info.sc_core_mask; gpu_info.clk_cnt = csf_info->prfcnt_info.clk_cnt; gpu_info.prfcnt_values_per_block = csf_info->prfcnt_info.prfcnt_block_size / KBASE_HWCNT_VALUE_HW_BYTES; diff --git a/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf.h b/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf.h index 2487db2..1b4e16d 100644 --- a/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf.h +++ b/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2021-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2021-2024 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -30,8 +30,10 @@ #include "hwcnt/backend/mali_kbase_hwcnt_backend.h" #include "hwcnt/backend/mali_kbase_hwcnt_backend_csf_if.h" #include "hwcnt/mali_kbase_hwcnt_watchdog_if.h" +#include "hwcnt/mali_kbase_hwcnt_types.h" struct kbase_hwcnt_physical_enable_map; +struct kbase_hwcnt_backend_csf; /** * kbase_hwcnt_backend_csf_create() - Create a CSF hardware counter backend @@ -123,11 +125,12 @@ void kbase_hwcnt_backend_csf_on_before_reset(struct kbase_hwcnt_backend_interfac * this function is called. * @iface: Non-NULL pointer to HWC backend interface. * @num_l2_slices: Current number of L2 slices allocated to the GPU. - * @shader_present_bitmap: Current shader-present bitmap that is allocated to the GPU. + * @powered_shader_core_mask: The common mask between the debug_core_mask + * and the shader_present_bitmap. */ void kbase_hwcnt_backend_csf_set_hw_availability(struct kbase_hwcnt_backend_interface *iface, size_t num_l2_slices, - uint64_t shader_present_bitmap); + uint64_t powered_shader_core_mask); /** kbasep_hwcnt_backend_csf_process_enable_map() - Process the enable_map to * guarantee headers are @@ -174,4 +177,21 @@ void kbase_hwcnt_backend_csf_on_prfcnt_enable(struct kbase_hwcnt_backend_interfa */ void kbase_hwcnt_backend_csf_on_prfcnt_disable(struct kbase_hwcnt_backend_interface *iface); +/** + * kbasep_hwcnt_backend_csf_update_block_state - Update block state of a block instance with + * information from a sample. + * @backend: CSF hardware counter backend. + * @enable_mask: Counter enable mask for the block whose state is being updated. + * @exiting_protm: Whether or not the sample is taken when the GPU is exiting + * protected mode. + * @block_idx: Index of block within the ringbuffer. + * @block_state: Pointer to existing block state of the block whose state is being + * updated. + * @fw_in_protected_mode: Whether or not GPU is in protected mode during sampling. + */ +void kbasep_hwcnt_backend_csf_update_block_state(struct kbase_hwcnt_backend_csf *backend, + const u32 enable_mask, bool exiting_protm, + size_t block_idx, blk_stt_t *const block_state, + bool fw_in_protected_mode); + #endif /* _KBASE_HWCNT_BACKEND_CSF_H_ */ diff --git a/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if.h b/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if.h index 65bb965..4ee2c8a 100644 --- a/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if.h +++ b/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if.h @@ -68,7 +68,7 @@ struct kbase_hwcnt_backend_csf_if_enable { * @prfcnt_block_size: Bytes of each performance counter block. * @l2_count: The MMU L2 cache count. * @csg_count: The total number of CSGs in the system - * @core_mask: Shader core mask. + * @sc_core_mask: Shader core mask. * @clk_cnt: Clock domain count in the system. * @clearing_samples: Indicates whether counters are cleared after each sample * is taken. 
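The effect of replacing the raw shader-present bitmap with powered_shader_core_mask (presumably the bitwise AND of shader_present and debug_core_mask, i.e. their common bits) on per-block state can be illustrated with hypothetical values:

/* Illustration only - hypothetical values, not taken from any GPU. */
u64 shader_present  = 0xF; /* cores 0..3 physically present        */
u64 debug_core_mask = 0x5; /* only cores 0 and 2 enabled for debug */
u64 powered_shader_core_mask = shader_present & debug_core_mask; /* 0x5 */

/* In kbasep_hwcnt_backend_csf_update_block_state(), a shader-core block whose
 * bit (1ULL << (block_idx - shader_core_block_offset)) falls inside
 * powered_shader_core_mask gets KBASE_HWCNT_STATE_AVAILABLE appended
 * (cores 0 and 2 here), while the others (cores 1 and 3) get
 * KBASE_HWCNT_STATE_UNAVAILABLE.
 */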
@@ -80,7 +80,7 @@ struct kbase_hwcnt_backend_csf_if_prfcnt_info { size_t prfcnt_block_size; size_t l2_count; u32 csg_count; - u64 core_mask; + u64 sc_core_mask; u8 clk_cnt; bool clearing_samples; }; diff --git a/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if_fw.c b/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if_fw.c index 1b7a116..fe81ce1 100644 --- a/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if_fw.c +++ b/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if_fw.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2021-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2021-2024 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -229,7 +229,7 @@ static void kbasep_hwcnt_backend_csf_if_fw_get_prfcnt_info( *prfcnt_info = (struct kbase_hwcnt_backend_csf_if_prfcnt_info){ .l2_count = KBASE_DUMMY_MODEL_MAX_MEMSYS_BLOCKS, - .core_mask = (1ull << KBASE_DUMMY_MODEL_MAX_SHADER_CORES) - 1, + .sc_core_mask = (1ull << KBASE_DUMMY_MODEL_MAX_SHADER_CORES) - 1, .prfcnt_hw_size = KBASE_DUMMY_MODEL_MAX_NUM_HARDWARE_BLOCKS * KBASE_DUMMY_MODEL_BLOCK_SIZE, .prfcnt_fw_size = @@ -290,12 +290,13 @@ static void kbasep_hwcnt_backend_csf_if_fw_get_prfcnt_info( .dump_bytes = fw_ctx->buf_bytes, .prfcnt_block_size = prfcnt_block_size, .l2_count = kbdev->gpu_props.num_l2_slices, - .core_mask = kbasep_hwcnt_backend_csf_core_mask(&kbdev->gpu_props), + .sc_core_mask = kbasep_hwcnt_backend_csf_core_mask(&kbdev->gpu_props), .csg_count = fw_block_count > 1 ? csg_count : 0, .clk_cnt = fw_ctx->clk_cnt, .clearing_samples = true, }; + /* Block size must be multiple of counter size. */ WARN_ON((prfcnt_info->prfcnt_block_size % KBASE_HWCNT_VALUE_HW_BYTES) != 0); /* Total size must be multiple of block size. */ @@ -513,10 +514,15 @@ kbasep_hwcnt_backend_csf_if_fw_ring_buf_free(struct kbase_hwcnt_backend_csf_if_c fw_ring_buf->phys, fw_ring_buf->num_pages, fw_ring_buf->num_pages, MCU_AS_NR)); + /* Clear the dump ring_buf content to zeros */ + memset(fw_ring_buf->cpu_dump_base, 0, fw_ring_buf->num_pages * PAGE_SIZE); vunmap(fw_ring_buf->cpu_dump_base); + /* After zeroing, the ring_buf pages are dirty so need to pass the 'dirty' flag + * as true when freeing the pages to the Global pool. + */ kbase_mem_pool_free_pages(&fw_ctx->kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW], - fw_ring_buf->num_pages, fw_ring_buf->phys, false, false); + fw_ring_buf->num_pages, fw_ring_buf->phys, true, false); kfree(fw_ring_buf->phys); diff --git a/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_jm.c b/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_jm.c index 4df7dd4..8b337eb 100644 --- a/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_jm.c +++ b/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_jm.c @@ -165,7 +165,7 @@ static int kbasep_hwcnt_backend_jm_gpu_info_init(struct kbase_device *kbdev, #endif info->l2_count = l2_count; - info->core_mask = core_mask; + info->sc_core_mask = core_mask; info->prfcnt_values_per_block = KBASE_HWCNT_V5_DEFAULT_VALUES_PER_BLOCK; /* Determine the number of available clock domains. 
*/ @@ -186,7 +186,7 @@ static void kbasep_hwcnt_backend_jm_init_layout(const struct kbase_hwcnt_gpu_inf WARN_ON(!gpu_info); WARN_ON(!phys_layout); - shader_core_cnt = fls64(gpu_info->core_mask); + shader_core_cnt = fls64(gpu_info->sc_core_mask); *phys_layout = (struct kbase_hwcnt_jm_physical_layout){ .fe_cnt = KBASE_HWCNT_V5_FE_BLOCK_COUNT, @@ -195,7 +195,7 @@ static void kbasep_hwcnt_backend_jm_init_layout(const struct kbase_hwcnt_gpu_inf .shader_cnt = shader_core_cnt, .block_cnt = KBASE_HWCNT_V5_FE_BLOCK_COUNT + KBASE_HWCNT_V5_TILER_BLOCK_COUNT + gpu_info->l2_count + shader_core_cnt, - .shader_avail_mask = gpu_info->core_mask, + .shader_avail_mask = gpu_info->sc_core_mask, .headers_per_block = KBASE_HWCNT_V5_HEADERS_PER_BLOCK, .values_per_block = gpu_info->prfcnt_values_per_block, .counters_per_block = @@ -384,14 +384,12 @@ kbasep_hwcnt_backend_jm_dump_enable_nolock(struct kbase_hwcnt_backend *backend, enable = (struct kbase_instr_hwcnt_enable) { - .fe_bm = phys_enable_map.fe_bm, - .shader_bm = phys_enable_map.shader_bm, - .tiler_bm = phys_enable_map.tiler_bm, - .mmu_l2_bm = phys_enable_map.mmu_l2_bm, + .fe_bm = phys_enable_map.fe_bm, .shader_bm = phys_enable_map.shader_bm, + .tiler_bm = phys_enable_map.tiler_bm, .mmu_l2_bm = phys_enable_map.mmu_l2_bm, .counter_set = phys_counter_set, #if IS_ENABLED(CONFIG_MALI_NO_MALI) /* The dummy model needs the CPU mapping. */ - .dump_buffer = (uintptr_t)backend_jm->cpu_dump_va, + .dump_buffer = (uintptr_t)backend_jm->cpu_dump_va, #else .dump_buffer = backend_jm->gpu_dump_va, #endif /* CONFIG_MALI_NO_MALI */ @@ -411,7 +409,7 @@ kbasep_hwcnt_backend_jm_dump_enable_nolock(struct kbase_hwcnt_backend *backend, backend_jm->debug_core_mask = kbase_pm_ca_get_debug_core_mask(kbdev); backend_jm->max_l2_slices = backend_jm->info->hwcnt_gpu_info.l2_count; - backend_jm->max_core_mask = backend_jm->info->hwcnt_gpu_info.core_mask; + backend_jm->max_core_mask = backend_jm->info->hwcnt_gpu_info.sc_core_mask; backend_jm->pm_core_mask = kbase_pm_ca_get_instr_core_mask(kbdev); diff --git a/mali_kbase/hwcnt/mali_kbase_hwcnt_gpu.c b/mali_kbase/hwcnt/mali_kbase_hwcnt_gpu.c index 5da5645..ffe8449 100644 --- a/mali_kbase/hwcnt/mali_kbase_hwcnt_gpu.c +++ b/mali_kbase/hwcnt/mali_kbase_hwcnt_gpu.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2018-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2018-2024 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -169,7 +169,7 @@ static int kbasep_hwcnt_backend_gpu_metadata_create(const struct kbase_hwcnt_gpu /* Calculate number of block instances that aren't cores */ non_core_block_count = 2 + gpu_info->l2_count; /* Calculate number of block instances that are shader cores */ - sc_block_count = (size_t)fls64(gpu_info->core_mask); + sc_block_count = (size_t)fls64(gpu_info->sc_core_mask); /* Determine the total number of cores */ core_block_count = sc_block_count; @@ -277,7 +277,7 @@ static int kbasep_hwcnt_backend_gpu_metadata_create(const struct kbase_hwcnt_gpu kbase_hwcnt_set_avail_mask(&desc.avail_mask, 0, 0); kbase_hwcnt_set_avail_mask_bits(&desc.avail_mask, 0, non_core_block_count, U64_MAX); kbase_hwcnt_set_avail_mask_bits(&desc.avail_mask, non_core_block_count, sc_block_count, - gpu_info->core_mask); + gpu_info->sc_core_mask); return kbase_hwcnt_metadata_create(&desc, metadata); @@ -294,7 +294,7 @@ static size_t kbasep_hwcnt_backend_jm_dump_bytes(const struct kbase_hwcnt_gpu_in { WARN_ON(!gpu_info); - return (2 + gpu_info->l2_count + (size_t)fls64(gpu_info->core_mask)) * + return (2 + gpu_info->l2_count + (size_t)fls64(gpu_info->sc_core_mask)) * gpu_info->prfcnt_values_per_block * KBASE_HWCNT_VALUE_HW_BYTES; } @@ -384,6 +384,7 @@ bool kbase_hwcnt_is_block_type_shader(const enum kbase_hwcnt_gpu_v5_block_type b return false; } + bool kbase_hwcnt_is_block_type_memsys(const enum kbase_hwcnt_gpu_v5_block_type blk_type) { if (blk_type == KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS || @@ -466,9 +467,7 @@ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src, else hw_res_available = true; - /* - * Skip block if no values in the destination block are enabled. - */ + /* Skip block if no values in the destination block are enabled. */ if (kbase_hwcnt_enable_map_block_enabled(dst_enable_map, blk, blk_inst)) { u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance(dst, blk, blk_inst); const u64 *src_blk = dump_src + src_offset; @@ -592,7 +591,7 @@ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src, int kbase_hwcnt_csf_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src, blk_stt_t *src_block_stt, const struct kbase_hwcnt_enable_map *dst_enable_map, - size_t num_l2_slices, u64 shader_present_bitmap, bool accumulate) + size_t num_l2_slices, u64 powered_shader_core_mask, bool accumulate) { const struct kbase_hwcnt_metadata *metadata; const u64 *dump_src = src; @@ -614,9 +613,7 @@ int kbase_hwcnt_csf_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src, blk_stt_t *dst_blk_stt = kbase_hwcnt_dump_buffer_block_state_instance(dst, blk, blk_inst); - /* - * Skip block if no values in the destination block are enabled. - */ + /* Skip block if no values in the destination block are enabled. */ if (kbase_hwcnt_enable_map_block_enabled(dst_enable_map, blk, blk_inst)) { u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance(dst, blk, blk_inst); const u64 *src_blk = dump_src + src_offset; diff --git a/mali_kbase/hwcnt/mali_kbase_hwcnt_gpu.h b/mali_kbase/hwcnt/mali_kbase_hwcnt_gpu.h index 4339fdd..570aad7 100644 --- a/mali_kbase/hwcnt/mali_kbase_hwcnt_gpu.h +++ b/mali_kbase/hwcnt/mali_kbase_hwcnt_gpu.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2018-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2018-2024 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -169,7 +169,7 @@ enum kbase_hwcnt_physical_set { /** * struct kbase_hwcnt_gpu_info - Information about hwcnt blocks on the GPUs. * @l2_count: L2 cache count. - * @core_mask: Shader core mask. May be sparse. + * @sc_core_mask: Shader core mask. May be sparse. * @clk_cnt: Number of clock domains available. * @csg_cnt: Number of CSGs available. * @prfcnt_values_per_block: Total entries (header + counters) of performance @@ -178,7 +178,7 @@ enum kbase_hwcnt_physical_set { */ struct kbase_hwcnt_gpu_info { size_t l2_count; - u64 core_mask; + u64 sc_core_mask; u8 clk_cnt; u8 csg_cnt; size_t prfcnt_values_per_block; @@ -327,15 +327,16 @@ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src, * kbase_hwcnt_csf_dump_get() - Copy or accumulate enabled counters from the raw * dump buffer in src into the dump buffer * abstraction in dst. - * @dst: Non-NULL pointer to destination dump buffer. - * @src: Non-NULL pointer to source raw dump buffer, of same length - * as dump_buf_bytes in the metadata of dst dump buffer. - * @src_block_stt: Non-NULL pointer to source block state buffer. - * @dst_enable_map: Non-NULL pointer to enable map specifying enabled values. - * @num_l2_slices: Current number of L2 slices allocated to the GPU. - * @shader_present_bitmap: Current shader-present bitmap that is allocated to the GPU. - * @accumulate: True if counters in src should be accumulated into - * destination, rather than copied. + * @dst: Non-NULL pointer to destination dump buffer. + * @src: Non-NULL pointer to source raw dump buffer, of same length + * as dump_buf_bytes in the metadata of dst dump buffer. + * @src_block_stt: Non-NULL pointer to source block state buffer. + * @dst_enable_map: Non-NULL pointer to enable map specifying enabled values. + * @num_l2_slices: Current number of L2 slices allocated to the GPU. + * @powered_shader_core_mask: The common mask between the debug_core_mask + * and the shader_present_bitmap. + * @accumulate: True if counters in src should be accumulated into + * destination, rather than copied. * * The dst and dst_enable_map MUST have been created from the same metadata as * returned from the call to kbase_hwcnt_csf_metadata_create as was used to get @@ -346,7 +347,7 @@ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src, int kbase_hwcnt_csf_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src, blk_stt_t *src_block_stt, const struct kbase_hwcnt_enable_map *dst_enable_map, - size_t num_l2_slices, u64 shader_present_bitmap, bool accumulate); + size_t num_l2_slices, u64 powered_shader_core_mask, bool accumulate); /** * kbase_hwcnt_backend_gpu_block_map_to_physical() - Convert from a block @@ -453,6 +454,7 @@ bool kbase_hwcnt_is_block_type_memsys(const enum kbase_hwcnt_gpu_v5_block_type b bool kbase_hwcnt_is_block_type_tiler(const enum kbase_hwcnt_gpu_v5_block_type blk_type); bool kbase_hwcnt_is_block_type_fe(const enum kbase_hwcnt_gpu_v5_block_type blk_type); + /** * kbase_hwcnt_gpu_enable_map_from_cm() - Builds enable map abstraction from * counter selection bitmasks. 
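The renamed @powered_shader_core_mask parameter documented above is the common mask between the debug core mask and the shader-present bitmap. A minimal sketch of deriving such a value is given below; the helper name and the standalone types are assumptions made only for this illustration and are not part of the patch.

#include <stdint.h>

/* Sketch only: intersect the user-requested debug core mask with the cores
 * physically present, mirroring the kerneldoc description of
 * @powered_shader_core_mask. uint64_t stands in for the kernel's u64.
 */
static uint64_t example_powered_shader_core_mask(uint64_t debug_core_mask,
                                                 uint64_t shader_present_bitmap)
{
    return debug_core_mask & shader_present_bitmap;
}

The result has a bit set only for shader cores that both exist in the hardware and are allowed by the debug core mask, which is what the updated kbase_hwcnt_csf_dump_get() parameter describes.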
diff --git a/mali_kbase/mali_base_hwconfig_features.h b/mali_kbase/mali_base_hwconfig_features.h index 1f32fc9..dd76be3 100644 --- a/mali_kbase/mali_base_hwconfig_features.h +++ b/mali_kbase/mali_base_hwconfig_features.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2014-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2014-2024 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software diff --git a/mali_kbase/mali_base_hwconfig_issues.h b/mali_kbase/mali_base_hwconfig_issues.h index 4426bd7..d01977f 100644 --- a/mali_kbase/mali_base_hwconfig_issues.h +++ b/mali_kbase/mali_base_hwconfig_issues.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2014-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2014-2024 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -462,22 +462,6 @@ __attribute__((unused)) static const enum base_hw_issue base_hw_issues_tBAx_r0p2 BASE_HW_ISSUE_END }; -__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tBAx_r1p0[] = { - BASE_HW_ISSUE_9435, - BASE_HW_ISSUE_TSIX_2033, - BASE_HW_ISSUE_TTRX_1337, - BASE_HW_ISSUE_TTRX_2968_TTRX_3162, - BASE_HW_ISSUE_TTRX_921, - BASE_HW_ISSUE_TTRX_3414, - BASE_HW_ISSUE_TTRX_3083, - BASE_HW_ISSUE_TTRX_3470, - BASE_HW_ISSUE_TTRX_3464, - BASE_HW_ISSUE_TITANHW_2710, - BASE_HW_ISSUE_GPU2022PRO_148, - BASE_HW_ISSUE_TITANHW_2938, - BASE_HW_ISSUE_END -}; - __attribute__((unused)) static const enum base_hw_issue base_hw_issues_model_tBAx[] = { BASE_HW_ISSUE_5736, BASE_HW_ISSUE_9435, BASE_HW_ISSUE_TSIX_2033, BASE_HW_ISSUE_TTRX_1337, BASE_HW_ISSUE_TTRX_3414, BASE_HW_ISSUE_TTRX_3083, @@ -512,7 +496,13 @@ __attribute__((unused)) static const enum base_hw_issue base_hw_issues_model_tGR __attribute__((unused)) static const enum base_hw_issue base_hw_issues_tVAx_r0p0[] = { BASE_HW_ISSUE_TSIX_2033, BASE_HW_ISSUE_TTRX_1337, BASE_HW_ISSUE_GPU2019_3878, BASE_HW_ISSUE_GPU2019_3901, BASE_HW_ISSUE_TITANHW_2710, BASE_HW_ISSUE_GPU2022PRO_148, - BASE_HW_ISSUE_TITANHW_2938, BASE_HW_ISSUE_TITANHW_2938, BASE_HW_ISSUE_END + BASE_HW_ISSUE_TITANHW_2938, BASE_HW_ISSUE_END +}; + +__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tVAx_r0p1[] = { + BASE_HW_ISSUE_TSIX_2033, BASE_HW_ISSUE_TTRX_1337, BASE_HW_ISSUE_GPU2019_3878, + BASE_HW_ISSUE_GPU2019_3901, BASE_HW_ISSUE_TITANHW_2710, BASE_HW_ISSUE_GPU2022PRO_148, + BASE_HW_ISSUE_TITANHW_2938, BASE_HW_ISSUE_END }; __attribute__((unused)) static const enum base_hw_issue base_hw_issues_model_tVAx[] = { diff --git a/mali_kbase/mali_kbase_caps.h b/mali_kbase/mali_kbase_caps.h index a92569d..c458ac1 100644 --- a/mali_kbase/mali_kbase_caps.h +++ b/mali_kbase/mali_kbase_caps.h @@ -33,15 +33,22 @@ * * @MALI_KBASE_CAP_SYSTEM_MONITOR: System Monitor * @MALI_KBASE_CAP_JIT_PRESSURE_LIMIT: JIT Pressure limit + * @MALI_KBASE_CAP_MEM_DONT_NEED: Not needed physical memory * @MALI_KBASE_CAP_MEM_GROW_ON_GPF: Memory grow on page fault * @MALI_KBASE_CAP_MEM_PROTECTED: Protected memory + * @MALI_KBASE_CAP_MEM_IMPORT_SYNC_ON_MAP_UNMAP: CPU cache maintenance required when + * imported GPU memory is mapped/unmapped + * @MALI_KBASE_CAP_MEM_KERNEL_SYNC: Kernel side cache sync ops required * @MALI_KBASE_NUM_CAPS: Delimiter */ 
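The capability list above is queried through per-capability inline helpers added later in this file's diff, each of which tests the user/kernel API version. A hedged usage sketch follows; the wrapper name is an assumption for illustration and mali_kbase_caps.h is assumed to be included.

/* Sketch only: a caller holding the API version negotiated at setup time
 * could gate new DONT_NEED handling on the capability check added here.
 */
static bool example_allow_dont_need(unsigned long api_version)
{
    return mali_kbase_supports_mem_dont_need(api_version);
}

Per the kbase_caps_table entries later in the patch, this check passes from API version 1.22 onward on CSF GPUs and from 11.40 onward on JM GPUs.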
enum mali_kbase_cap { MALI_KBASE_CAP_SYSTEM_MONITOR = 0, MALI_KBASE_CAP_JIT_PRESSURE_LIMIT, + MALI_KBASE_CAP_MEM_DONT_NEED, MALI_KBASE_CAP_MEM_GROW_ON_GPF, MALI_KBASE_CAP_MEM_PROTECTED, + MALI_KBASE_CAP_MEM_IMPORT_SYNC_ON_MAP_UNMAP, + MALI_KBASE_CAP_MEM_KERNEL_SYNC, MALI_KBASE_NUM_CAPS }; @@ -57,6 +64,11 @@ static inline bool mali_kbase_supports_jit_pressure_limit(unsigned long api_vers return mali_kbase_supports_cap(api_version, MALI_KBASE_CAP_JIT_PRESSURE_LIMIT); } +static inline bool mali_kbase_supports_mem_dont_need(unsigned long api_version) +{ + return mali_kbase_supports_cap(api_version, MALI_KBASE_CAP_MEM_DONT_NEED); +} + static inline bool mali_kbase_supports_mem_grow_on_gpf(unsigned long api_version) { return mali_kbase_supports_cap(api_version, MALI_KBASE_CAP_MEM_GROW_ON_GPF); @@ -67,4 +79,14 @@ static inline bool mali_kbase_supports_mem_protected(unsigned long api_version) return mali_kbase_supports_cap(api_version, MALI_KBASE_CAP_MEM_PROTECTED); } +static inline bool mali_kbase_supports_mem_import_sync_on_map_unmap(unsigned long api_version) +{ + return mali_kbase_supports_cap(api_version, MALI_KBASE_CAP_MEM_IMPORT_SYNC_ON_MAP_UNMAP); +} + +static inline bool mali_kbase_supports_mem_kernel_sync(unsigned long api_version) +{ + return mali_kbase_supports_cap(api_version, MALI_KBASE_CAP_MEM_KERNEL_SYNC); +} + #endif /* __KBASE_CAPS_H_ */ diff --git a/mali_kbase/mali_kbase_config_defaults.h b/mali_kbase/mali_kbase_config_defaults.h index 5ce06dd..0b983b4 100644 --- a/mali_kbase/mali_kbase_config_defaults.h +++ b/mali_kbase/mali_kbase_config_defaults.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2013-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2013-2024 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -194,9 +194,22 @@ enum { */ #define CSF_CSG_SUSPEND_TIMEOUT_CYCLES (3100000000ull) +/* Waiting timeout in clock cycles for GPU suspend to complete. */ +#define CSF_GPU_SUSPEND_TIMEOUT_CYCLES (CSF_CSG_SUSPEND_TIMEOUT_CYCLES) + /* Waiting timeout in clock cycles for GPU reset to complete. */ #define CSF_GPU_RESET_TIMEOUT_CYCLES (CSF_CSG_SUSPEND_TIMEOUT_CYCLES * 2) +/* Waiting timeout in clock cycles for a CSG to be terminated. + * + * Based on 0.6s timeout at 100MHZ, scaled from 0.1s at 600Mhz GPU frequency + * which is the timeout defined in FW to wait for iterator to complete the + * transitioning to DISABLED state. + * More cycles (0.4s @ 100Mhz = 40000000) are added up to ensure that + * host timeout is always bigger than FW timeout. + */ +#define CSF_CSG_TERM_TIMEOUT_CYCLES (100000000) + /* Waiting timeout in clock cycles for GPU firmware to boot. * * Based on 250ms timeout at 100MHz, scaled from a 50MHz GPU system. diff --git a/mali_kbase/mali_kbase_core_linux.c b/mali_kbase/mali_kbase_core_linux.c index 9ea5d74..dad363d 100644 --- a/mali_kbase/mali_kbase_core_linux.c +++ b/mali_kbase/mali_kbase_core_linux.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2010-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2010-2024 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -106,6 +106,7 @@ #include <linux/clk-provider.h> #include <linux/delay.h> #include <linux/log2.h> +#include <linux/mali_hw_access.h> #include <mali_kbase_config.h> @@ -151,13 +152,19 @@ static const struct mali_kbase_capability_def kbase_caps_table[MALI_KBASE_NUM_CA #if MALI_USE_CSF { 1, 0 }, /* SYSTEM_MONITOR */ { 1, 0 }, /* JIT_PRESSURE_LIMIT */ + { 1, 22 }, /* MEM_DONT_NEED */ { 1, 0 }, /* MEM_GROW_ON_GPF */ - { 1, 0 } /* MEM_PROTECTED */ + { 1, 0 }, /* MEM_PROTECTED */ + { 1, 26 }, /* MEM_IMPORT_SYNC_ON_MAP_UNMAP */ + { 1, 26 } /* MEM_KERNEL_SYNC */ #else { 11, 15 }, /* SYSTEM_MONITOR */ { 11, 25 }, /* JIT_PRESSURE_LIMIT */ + { 11, 40 }, /* MEM_DONT_NEED */ { 11, 2 }, /* MEM_GROW_ON_GPF */ - { 11, 2 } /* MEM_PROTECTED */ + { 11, 2 }, /* MEM_PROTECTED */ + { 11, 43 }, /* MEM_IMPORT_SYNC_ON_MAP_UNMAP */ + { 11, 43 } /* MEM_KERNEL_SYNC */ #endif }; @@ -1412,7 +1419,7 @@ static int kbase_api_sticky_resource_map(struct kbase_context *kctx, if (ret != 0) return -EFAULT; - kbase_gpu_vm_lock(kctx); + kbase_gpu_vm_lock_with_pmode_sync(kctx); for (i = 0; i < map->count; i++) { if (!kbase_sticky_resource_acquire(kctx, gpu_addr[i])) { @@ -1429,7 +1436,7 @@ static int kbase_api_sticky_resource_map(struct kbase_context *kctx, } } - kbase_gpu_vm_unlock(kctx); + kbase_gpu_vm_unlock_with_pmode_sync(kctx); return ret; } @@ -1449,7 +1456,7 @@ static int kbase_api_sticky_resource_unmap(struct kbase_context *kctx, if (ret != 0) return -EFAULT; - kbase_gpu_vm_lock(kctx); + kbase_gpu_vm_lock_with_pmode_sync(kctx); for (i = 0; i < unmap->count; i++) { if (!kbase_sticky_resource_release_force(kctx, NULL, gpu_addr[i])) { @@ -1458,7 +1465,7 @@ static int kbase_api_sticky_resource_unmap(struct kbase_context *kctx, } } - kbase_gpu_vm_unlock(kctx); + kbase_gpu_vm_unlock_with_pmode_sync(kctx); return ret; } @@ -1516,6 +1523,12 @@ static int kbasep_cs_queue_kick(struct kbase_context *kctx, struct kbase_ioctl_c return kbase_csf_queue_kick(kctx, kick); } +static int kbasep_queue_group_clear_faults(struct kbase_context *kctx, + struct kbase_ioctl_queue_group_clear_faults *faults) +{ + return kbase_csf_queue_group_clear_faults(kctx, faults); +} + static int kbasep_cs_queue_group_create_1_6(struct kbase_context *kctx, union kbase_ioctl_cs_queue_group_create_1_6 *create) { @@ -1585,6 +1598,8 @@ static int kbasep_cs_queue_group_create_1_18(struct kbase_context *kctx, static int kbasep_cs_queue_group_create(struct kbase_context *kctx, union kbase_ioctl_cs_queue_group_create *create) { + /* create->in.reserved only present pre-TDRX configuration. 
*/ + if (create->in.reserved != 0) { dev_warn(kctx->kbdev->dev, "Invalid reserved field not 0 in queue group create\n"); return -EINVAL; @@ -2086,6 +2101,11 @@ static long kbase_kfile_ioctl(struct kbase_file *kfile, unsigned int cmd, unsign KBASE_HANDLE_IOCTL_IN(KBASE_IOCTL_KCPU_QUEUE_ENQUEUE, kbasep_kcpu_queue_enqueue, struct kbase_ioctl_kcpu_queue_enqueue, kctx); break; + case KBASE_IOCTL_QUEUE_GROUP_CLEAR_FAULTS: + KBASE_HANDLE_IOCTL_IN(KBASE_IOCTL_QUEUE_GROUP_CLEAR_FAULTS, + kbasep_queue_group_clear_faults, + struct kbase_ioctl_queue_group_clear_faults, kctx); + break; case KBASE_IOCTL_CS_TILER_HEAP_INIT: KBASE_HANDLE_IOCTL_INOUT(KBASE_IOCTL_CS_TILER_HEAP_INIT, kbasep_cs_tiler_heap_init, union kbase_ioctl_cs_tiler_heap_init, kctx); @@ -2543,6 +2563,9 @@ static ssize_t core_mask_show(struct device *dev, struct device_attribute *attr, struct kbase_device *kbdev; unsigned long flags; ssize_t ret = 0; +#if !MALI_USE_CSF + size_t i; +#endif CSTD_UNUSED(attr); @@ -2561,154 +2584,191 @@ static ssize_t core_mask_show(struct device *dev, struct device_attribute *attr, ret += scnprintf(buf + ret, (size_t)(PAGE_SIZE - ret), "Current in use core mask : 0x%llX\n", kbdev->pm.backend.shaders_avail); #else - ret += scnprintf(buf + ret, (size_t)(PAGE_SIZE - ret), "Current core mask (JS0) : 0x%llX\n", - kbdev->pm.debug_core_mask[0]); - ret += scnprintf(buf + ret, (size_t)(PAGE_SIZE - ret), "Current core mask (JS1) : 0x%llX\n", - kbdev->pm.debug_core_mask[1]); - ret += scnprintf(buf + ret, (size_t)(PAGE_SIZE - ret), "Current core mask (JS2) : 0x%llX\n", - kbdev->pm.debug_core_mask[2]); + for (i = 0; i < BASE_JM_MAX_NR_SLOTS; i++) { + if (PAGE_SIZE < ret) + goto out_unlock; + + ret += scnprintf(buf + ret, (size_t)(PAGE_SIZE - ret), + "Current core mask (JS%zu) : 0x%llX\n", i, + kbdev->pm.debug_core_mask[i]); + } #endif /* MALI_USE_CSF */ ret += scnprintf(buf + ret, (size_t)(PAGE_SIZE - ret), "Available core mask : 0x%llX\n", kbdev->gpu_props.shader_present); - +#if !MALI_USE_CSF +out_unlock: +#endif spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); return ret; } -/** - * core_mask_store - Store callback for the core_mask sysfs file. - * - * @dev: The device with sysfs file is for - * @attr: The attributes of the sysfs file - * @buf: The value written to the sysfs file - * @count: The number of bytes to write to the sysfs file - * - * This function is called when the core_mask sysfs file is written to. - * - * Return: @count if the function succeeded. An error code on failure. 
- */ -static ssize_t core_mask_store(struct device *dev, struct device_attribute *attr, const char *buf, - size_t count) -{ - struct kbase_device *kbdev; #if MALI_USE_CSF +struct kbase_core_mask { u64 new_core_mask; -#else - u64 new_core_mask[3]; - u64 group_core_mask; - int i; -#endif /* MALI_USE_CSF */ - - int items; - ssize_t err = (ssize_t)count; - unsigned long flags; - u64 shader_present; - - CSTD_UNUSED(attr); - - kbdev = to_kbase_device(dev); - - if (!kbdev) - return -ENODEV; - -#if MALI_USE_CSF - items = sscanf(buf, "%llx", &new_core_mask); +}; - if (items != 1) { - dev_err(kbdev->dev, "Couldn't process core mask write operation.\n" - "Use format <core_mask>\n"); - err = -EINVAL; - goto end; - } -#else - items = sscanf(buf, "%llx %llx %llx", &new_core_mask[0], &new_core_mask[1], - &new_core_mask[2]); +static int core_mask_parse(struct kbase_device *const kbdev, const char *const buf, + struct kbase_core_mask *const mask) +{ + int err = kstrtou64(buf, 0, &mask->new_core_mask); - if (items != 1 && items != 3) { - dev_err(kbdev->dev, "Couldn't process core mask write operation.\n" - "Use format <core_mask>\n" - "or <core_mask_js0> <core_mask_js1> <core_mask_js2>\n"); - err = -EINVAL; - goto end; - } + if (err) + dev_err(kbdev->dev, "Couldn't process core mask write operation.\n"); - if (items == 1) - new_core_mask[1] = new_core_mask[2] = new_core_mask[0]; -#endif + return err; +} - mutex_lock(&kbdev->pm.lock); - spin_lock_irqsave(&kbdev->hwaccess_lock, flags); +static int core_mask_set(struct kbase_device *kbdev, struct kbase_core_mask *const new_mask) +{ + u64 new_core_mask = new_mask->new_core_mask; + u64 shader_present = kbdev->gpu_props.shader_present; - shader_present = kbdev->gpu_props.shader_present; + lockdep_assert_held(&kbdev->pm.lock); + lockdep_assert_held(&kbdev->hwaccess_lock); -#if MALI_USE_CSF if ((new_core_mask & shader_present) != new_core_mask) { - dev_err(dev, + dev_err(kbdev->dev, "Invalid core mask 0x%llX: Includes non-existent cores (present = 0x%llX)", new_core_mask, shader_present); - err = -EINVAL; - goto unlock; + return -EINVAL; } else if (!(new_core_mask & shader_present & kbdev->pm.backend.ca_cores_enabled)) { - dev_err(dev, - "Invalid core mask 0x%llX: No intersection with currently available cores (present = 0x%llX, CA enabled = 0x%llX\n", + dev_err(kbdev->dev, + "Invalid core mask 0x%llX: No intersection with currently available cores (present = 0x%llX, CA enabled = 0x%llX)", new_core_mask, kbdev->gpu_props.shader_present, kbdev->pm.backend.ca_cores_enabled); - err = -EINVAL; - goto unlock; + return -EINVAL; } + if (kbdev->pm.debug_core_mask != new_core_mask) kbase_pm_set_debug_core_mask(kbdev, new_core_mask); + + return 0; +} #else - group_core_mask = kbdev->gpu_props.coherency_info.group.core_mask; +struct kbase_core_mask { + u64 new_core_mask[BASE_JM_MAX_NR_SLOTS]; +}; + +static int core_mask_parse(struct kbase_device *const kbdev, const char *const buf, + struct kbase_core_mask *const mask) +{ + int items; + + items = sscanf(buf, "%llx %llx %llx", &mask->new_core_mask[0], &mask->new_core_mask[1], + &mask->new_core_mask[2]); - for (i = 0; i < 3; ++i) { + if (items != 1 && items != BASE_JM_MAX_NR_SLOTS) { + dev_err(kbdev->dev, "Couldn't process core mask write operation.\n" + "Use format <core_mask>\n" + "or <core_mask_js0> <core_mask_js1> <core_mask_js2>\n"); + return -EINVAL; + } + + /* If only one value was provided, set all other core masks equal to the value. 
*/ + if (items == 1) { + size_t i; + + for (i = 1; i < BASE_JM_MAX_NR_SLOTS; i++) + mask->new_core_mask[i] = mask->new_core_mask[0]; + } + + return 0; +} + +static int core_mask_set(struct kbase_device *kbdev, struct kbase_core_mask *const new_mask) +{ + u64 shader_present = kbdev->gpu_props.shader_present; + u64 group_core_mask = kbdev->gpu_props.coherency_info.group.core_mask; + u64 *new_core_mask = &new_mask->new_core_mask[0]; + size_t i; + + for (i = 0; i < BASE_JM_MAX_NR_SLOTS; ++i) { if ((new_core_mask[i] & shader_present) != new_core_mask[i]) { - dev_err(dev, - "Invalid core mask 0x%llX for JS %d: Includes non-existent cores (present = 0x%llX)", + dev_err(kbdev->dev, + "Invalid core mask 0x%llX for JS %zu: Includes non-existent cores (present = 0x%llX)", new_core_mask[i], i, shader_present); - err = -EINVAL; - goto unlock; + return -EINVAL; } else if (!(new_core_mask[i] & shader_present & kbdev->pm.backend.ca_cores_enabled)) { - dev_err(dev, - "Invalid core mask 0x%llX for JS %d: No intersection with currently available cores (present = 0x%llX, CA enabled = 0x%llX\n", + dev_err(kbdev->dev, + "Invalid core mask 0x%llX for JS %zu: No intersection with currently available cores (present = 0x%llX, CA enabled = 0x%llX)", new_core_mask[i], i, kbdev->gpu_props.shader_present, kbdev->pm.backend.ca_cores_enabled); - err = -EINVAL; - goto unlock; + return -EINVAL; } else if (!(new_core_mask[i] & group_core_mask)) { - dev_err(dev, - "Invalid core mask 0x%llX for JS %d: No intersection with group 0 core mask 0x%llX\n", + dev_err(kbdev->dev, + "Invalid core mask 0x%llX for JS %zu: No intersection with group 0 core mask 0x%llX", new_core_mask[i], i, group_core_mask); - err = -EINVAL; - goto unlock; + return -EINVAL; } else if (!(new_core_mask[i] & kbdev->gpu_props.curr_config.shader_present)) { - dev_err(dev, - "Invalid core mask 0x%llX for JS %d: No intersection with current core mask 0x%llX\n", + dev_err(kbdev->dev, + "Invalid core mask 0x%llX for JS %zu: No intersection with current core mask 0x%llX", new_core_mask[i], i, kbdev->gpu_props.curr_config.shader_present); - err = -EINVAL; - goto unlock; + return -EINVAL; } } - if (kbdev->pm.debug_core_mask[0] != new_core_mask[0] || - kbdev->pm.debug_core_mask[1] != new_core_mask[1] || - kbdev->pm.debug_core_mask[2] != new_core_mask[2]) { - kbase_pm_set_debug_core_mask(kbdev, new_core_mask[0], new_core_mask[1], - new_core_mask[2]); + for (i = 0; i < BASE_JM_MAX_NR_SLOTS; i++) { + if (kbdev->pm.debug_core_mask[i] != new_core_mask[i]) { + kbase_pm_set_debug_core_mask(kbdev, new_core_mask, BASE_JM_MAX_NR_SLOTS); + break; + } } -#endif /* MALI_USE_CSF */ -unlock: + return 0; +} + +#endif + +/** + * core_mask_store - Store callback for the core_mask sysfs file. + * + * @dev: The device with sysfs file is for + * @attr: The attributes of the sysfs file + * @buf: The value written to the sysfs file + * @count: The number of bytes to write to the sysfs file + * + * This function is called when the core_mask sysfs file is written to. + * + * Return: @count if the function succeeded. An error code on failure. 
+ */ +static ssize_t core_mask_store(struct device *dev, struct device_attribute *attr, const char *buf, + size_t count) +{ + struct kbase_device *kbdev; + struct kbase_core_mask core_mask = {}; + + int err; + unsigned long flags; + + CSTD_UNUSED(attr); + + kbdev = to_kbase_device(dev); + + if (!kbdev) + return -ENODEV; + + err = core_mask_parse(kbdev, buf, &core_mask); + if (err) + return err; + + mutex_lock(&kbdev->pm.lock); + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + + err = core_mask_set(kbdev, &core_mask); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); mutex_unlock(&kbdev->pm.lock); -end: - return err; + + if (err) + return err; + + return count; } /* @@ -3477,12 +3537,8 @@ int kbase_pm_gpu_freq_init(struct kbase_device *kbdev) /* convert found frequency to KHz */ found_freq /= 1000; - /* If lowest frequency in OPP table is still higher - * than the reference, then keep the reference frequency - * as the one to use for scaling . - */ - if (found_freq < lowest_freq_khz) - lowest_freq_khz = found_freq; + /* always use the lowest freqency from opp table */ + lowest_freq_khz = found_freq; } #else dev_err(kbdev->dev, "No operating-points-v2 node or operating-points property in DT"); @@ -4465,7 +4521,7 @@ static int kbase_common_reg_map(struct kbase_device *kbdev) goto out_region; } - kbdev->reg = ioremap(kbdev->reg_start, kbdev->reg_size); + kbdev->reg = mali_ioremap(kbdev->reg_start, kbdev->reg_size); if (!kbdev->reg) { dev_err(kbdev->dev, "Can't remap register window\n"); err = -EINVAL; @@ -4483,7 +4539,7 @@ out_region: static void kbase_common_reg_unmap(struct kbase_device *const kbdev) { if (kbdev->reg) { - iounmap(kbdev->reg); + mali_iounmap(kbdev->reg); release_mem_region(kbdev->reg_start, kbdev->reg_size); kbdev->reg = NULL; kbdev->reg_start = 0; @@ -5086,6 +5142,7 @@ static struct dentry *init_debugfs(struct kbase_device *kbdev) return dentry; } + dentry = debugfs_ctx_defaults_init(kbdev); if (IS_ERR_OR_NULL(dentry)) return dentry; diff --git a/mali_kbase/mali_kbase_defs.h b/mali_kbase/mali_kbase_defs.h index 30ea787..f3c8340 100644 --- a/mali_kbase/mali_kbase_defs.h +++ b/mali_kbase/mali_kbase_defs.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2011-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2011-2024 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -171,16 +171,11 @@ struct kbase_gpu_metrics { * * @link: Links the object in kbase_device::gpu_metrics::active_list * or kbase_device::gpu_metrics::inactive_list. - * @first_active_start_time: Records the time at which the application first became + * @active_start_time: Records the time at which the application first became * active in the current work period. - * @last_active_start_time: Records the time at which the application last became - * active in the current work period. - * @last_active_end_time: Records the time at which the application last became - * inactive in the current work period. - * @total_active: Tracks the time for which application has been active - * in the current work period. - * @prev_wp_active_end_time: Records the time at which the application last became - * inactive in the previous work period. 
+ * @active_end_time: Records the time at which the application last became + * inactive in the current work period, or the time of the end of + * previous work period if the application remained active. * @aid: Unique identifier for an application. * @kctx_count: Counter to keep a track of the number of Kbase contexts * created for an application. There may be multiple Kbase @@ -188,19 +183,14 @@ struct kbase_gpu_metrics { * metrics context. * @active_cnt: Counter that is updated every time the GPU activity starts * and ends in the current work period for an application. - * @flags: Flags to track the state of GPU metrics context. */ struct kbase_gpu_metrics_ctx { struct list_head link; - u64 first_active_start_time; - u64 last_active_start_time; - u64 last_active_end_time; - u64 total_active; - u64 prev_wp_active_end_time; + u64 active_start_time; + u64 active_end_time; unsigned int aid; unsigned int kctx_count; u8 active_cnt; - u8 flags; }; #endif @@ -548,7 +538,7 @@ struct kbase_mem_pool { u8 group_id; spinlock_t pool_lock; struct list_head page_list; - struct shrinker reclaim; + DEFINE_KBASE_SHRINKER reclaim; atomic_t isolation_in_progress_cnt; struct kbase_mem_pool *next_pool; @@ -839,8 +829,6 @@ struct kbase_mem_migrate { * @as_free: Bitpattern of free/available GPU address spaces. * @mmu_mask_change: Lock to serialize the access to MMU interrupt mask * register used in the handling of Bus & Page faults. - * @pagesize_2mb: Boolean to determine whether 2MiB page sizes are - * supported and used where possible. * @gpu_props: Object containing complete information about the * configuration/properties of GPU HW device in use. * @hw_issues_mask: List of SW workarounds for HW issues @@ -1141,8 +1129,6 @@ struct kbase_device { spinlock_t mmu_mask_change; - bool pagesize_2mb; - struct kbase_gpu_props gpu_props; unsigned long hw_issues_mask[(BASE_HW_ISSUE_END + BITS_PER_LONG - 1) / BITS_PER_LONG]; @@ -1986,7 +1972,8 @@ struct kbase_context { struct kbase_mem_pool_group mem_pools; - struct shrinker reclaim; + DEFINE_KBASE_SHRINKER reclaim; + struct list_head evict_list; atomic_t evict_nents; diff --git a/mali_kbase/mali_kbase_fence.h b/mali_kbase/mali_kbase_fence.h index 06690d4..d45a0fe 100644 --- a/mali_kbase/mali_kbase_fence.h +++ b/mali_kbase/mali_kbase_fence.h @@ -35,8 +35,37 @@ #include <linux/version_compat_defs.h> #if MALI_USE_CSF +/* Number of digits needed to express the max value of given unsigned type. + * + * Details: The number of digits needed to express the max value of given type is log10(t_max) + 1 + * sizeof(t) == log2(t_max)/8 + * log10(t_max) == log2(t_max) / log2(10) + * log2(t_max) == sizeof(type) * 8 + * 1/log2(10) is approx (1233 >> 12) + * Hence, number of digits for given type == log10(t_max) + 1 == sizeof(type) * 8 * (1233 >> 12) + 1 + */ +#define MAX_DIGITS_FOR_UNSIGNED_TYPE(t) ((((sizeof(t) * BITS_PER_BYTE) * 1233) >> 12) + 1) + +/* Number of digits needed to express the max value of given signed type, + * including the sign character, + */ +#define MAX_DIGITS_FOR_SIGNED_TYPE(t) (MAX_DIGITS_FOR_UNSIGNED_TYPE(t) + 1) + +/* Max number of characters for id member of kbase_device struct. */ +#define MAX_KBDEV_ID_LEN MAX_DIGITS_FOR_UNSIGNED_TYPE(u32) +/* Max number of characters for tgid member of kbase_context struct. */ +#define MAX_KCTX_TGID_LEN MAX_DIGITS_FOR_SIGNED_TYPE(pid_t) +/* Max number of characters for id member of kbase_context struct. 
*/ +#define MAX_KCTX_ID_LEN MAX_DIGITS_FOR_UNSIGNED_TYPE(u32) +/* Max number of characters for fence_context member of kbase_kcpu_command_queue struct. */ +#define MAX_KCTX_QUEUE_FENCE_CTX_LEN MAX_DIGITS_FOR_UNSIGNED_TYPE(u64) +/* Max number of characters for timeline name fixed format, including null character. */ +#define FIXED_FORMAT_LEN (9) + /* Maximum number of characters in DMA fence timeline name. */ -#define MAX_TIMELINE_NAME (32) +#define MAX_TIMELINE_NAME \ + (MAX_KBDEV_ID_LEN + MAX_KCTX_TGID_LEN + MAX_KCTX_ID_LEN + MAX_KCTX_QUEUE_FENCE_CTX_LEN + \ + FIXED_FORMAT_LEN) /** * struct kbase_kcpu_dma_fence_meta - Metadata structure for dma fence objects containing diff --git a/mali_kbase/mali_kbase_gpu_metrics.c b/mali_kbase/mali_kbase_gpu_metrics.c index d404c69..e5dcde9 100644 --- a/mali_kbase/mali_kbase_gpu_metrics.c +++ b/mali_kbase/mali_kbase_gpu_metrics.c @@ -29,46 +29,12 @@ #include <linux/module.h> #include <linux/slab.h> -/** - * enum gpu_metrics_ctx_flags - Flags for the GPU metrics context - * - * @ACTIVE_INTERVAL_IN_WP: Flag set when the application first becomes active in - * the current work period. - * - * @INSIDE_ACTIVE_LIST: Flag to track if object is in kbase_device::gpu_metrics::active_list - * - * All members need to be separate bits. This enum is intended for use in a - * bitmask where multiple values get OR-ed together. - */ -enum gpu_metrics_ctx_flags { - ACTIVE_INTERVAL_IN_WP = 1 << 0, - INSIDE_ACTIVE_LIST = 1 << 1, -}; - static unsigned long gpu_metrics_tp_emit_interval_ns = DEFAULT_GPU_METRICS_TP_EMIT_INTERVAL_NS; module_param(gpu_metrics_tp_emit_interval_ns, ulong, 0444); MODULE_PARM_DESC(gpu_metrics_tp_emit_interval_ns, "Time interval in nano seconds at which GPU metrics tracepoints are emitted"); -static inline bool gpu_metrics_ctx_flag(struct kbase_gpu_metrics_ctx *gpu_metrics_ctx, - enum gpu_metrics_ctx_flags flag) -{ - return (gpu_metrics_ctx->flags & flag); -} - -static inline void gpu_metrics_ctx_flag_set(struct kbase_gpu_metrics_ctx *gpu_metrics_ctx, - enum gpu_metrics_ctx_flags flag) -{ - gpu_metrics_ctx->flags |= flag; -} - -static inline void gpu_metrics_ctx_flag_clear(struct kbase_gpu_metrics_ctx *gpu_metrics_ctx, - enum gpu_metrics_ctx_flags flag) -{ - gpu_metrics_ctx->flags &= ~flag; -} - static inline void validate_tracepoint_data(struct kbase_gpu_metrics_ctx *gpu_metrics_ctx, u64 start_time, u64 end_time, u64 total_active) { @@ -82,43 +48,30 @@ static inline void validate_tracepoint_data(struct kbase_gpu_metrics_ctx *gpu_me WARN(total_active > (end_time - start_time), "total_active %llu > end_time %llu - start_time %llu for aid %u active_cnt %u", total_active, end_time, start_time, gpu_metrics_ctx->aid, gpu_metrics_ctx->active_cnt); - - WARN(gpu_metrics_ctx->prev_wp_active_end_time > start_time, - "prev_wp_active_end_time %llu > start_time %llu for aid %u active_cnt %u", - gpu_metrics_ctx->prev_wp_active_end_time, start_time, gpu_metrics_ctx->aid, - gpu_metrics_ctx->active_cnt); #endif } static void emit_tracepoint_for_active_gpu_metrics_ctx( struct kbase_device *kbdev, struct kbase_gpu_metrics_ctx *gpu_metrics_ctx, u64 current_time) { - const u64 start_time = gpu_metrics_ctx->first_active_start_time; - u64 total_active = gpu_metrics_ctx->total_active; - u64 end_time; + const u64 start_time = gpu_metrics_ctx->active_start_time; + u64 total_active, end_time = current_time; /* Check if the GPU activity is currently ongoing */ if (gpu_metrics_ctx->active_cnt) { /* The following check is to handle the race on CSF GPUs that can happen between * 
the draining of trace buffer and FW emitting the ACT=1 event . */ - if (unlikely(current_time == gpu_metrics_ctx->last_active_start_time)) - current_time++; - end_time = current_time; - total_active += end_time - gpu_metrics_ctx->last_active_start_time; - - gpu_metrics_ctx->first_active_start_time = current_time; - gpu_metrics_ctx->last_active_start_time = current_time; - } else { - end_time = gpu_metrics_ctx->last_active_end_time; - gpu_metrics_ctx_flag_clear(gpu_metrics_ctx, ACTIVE_INTERVAL_IN_WP); + if (unlikely(end_time == start_time)) + end_time++; + gpu_metrics_ctx->active_start_time = end_time; } + total_active = end_time - start_time; trace_gpu_work_period(kbdev->id, gpu_metrics_ctx->aid, start_time, end_time, total_active); validate_tracepoint_data(gpu_metrics_ctx, start_time, end_time, total_active); - gpu_metrics_ctx->prev_wp_active_end_time = end_time; - gpu_metrics_ctx->total_active = 0; + gpu_metrics_ctx->active_end_time = end_time; } void kbase_gpu_metrics_ctx_put(struct kbase_device *kbdev, @@ -131,7 +84,8 @@ void kbase_gpu_metrics_ctx_put(struct kbase_device *kbdev, if (gpu_metrics_ctx->kctx_count) return; - if (gpu_metrics_ctx_flag(gpu_metrics_ctx, ACTIVE_INTERVAL_IN_WP)) + /* Generate a tracepoint if there's still activity */ + if (gpu_metrics_ctx->active_cnt) emit_tracepoint_for_active_gpu_metrics_ctx(kbdev, gpu_metrics_ctx, ktime_get_raw_ns()); @@ -166,12 +120,11 @@ struct kbase_gpu_metrics_ctx *kbase_gpu_metrics_ctx_get(struct kbase_device *kbd void kbase_gpu_metrics_ctx_init(struct kbase_device *kbdev, struct kbase_gpu_metrics_ctx *gpu_metrics_ctx, unsigned int aid) { + gpu_metrics_ctx->active_start_time = 0; + gpu_metrics_ctx->active_end_time = 0; gpu_metrics_ctx->aid = aid; - gpu_metrics_ctx->total_active = 0; gpu_metrics_ctx->kctx_count = 1; gpu_metrics_ctx->active_cnt = 0; - gpu_metrics_ctx->prev_wp_active_end_time = 0; - gpu_metrics_ctx->flags = 0; list_add_tail(&gpu_metrics_ctx->link, &kbdev->gpu_metrics.inactive_list); } @@ -180,17 +133,9 @@ void kbase_gpu_metrics_ctx_start_activity(struct kbase_context *kctx, u64 timest struct kbase_gpu_metrics_ctx *gpu_metrics_ctx = kctx->gpu_metrics_ctx; gpu_metrics_ctx->active_cnt++; - if (gpu_metrics_ctx->active_cnt == 1) - gpu_metrics_ctx->last_active_start_time = timestamp_ns; - - if (!gpu_metrics_ctx_flag(gpu_metrics_ctx, ACTIVE_INTERVAL_IN_WP)) { - gpu_metrics_ctx->first_active_start_time = timestamp_ns; - gpu_metrics_ctx_flag_set(gpu_metrics_ctx, ACTIVE_INTERVAL_IN_WP); - } - - if (!gpu_metrics_ctx_flag(gpu_metrics_ctx, INSIDE_ACTIVE_LIST)) { + if (gpu_metrics_ctx->active_cnt == 1) { + gpu_metrics_ctx->active_start_time = timestamp_ns; list_move_tail(&gpu_metrics_ctx->link, &kctx->kbdev->gpu_metrics.active_list); - gpu_metrics_ctx_flag_set(gpu_metrics_ctx, INSIDE_ACTIVE_LIST); } } @@ -201,22 +146,22 @@ void kbase_gpu_metrics_ctx_end_activity(struct kbase_context *kctx, u64 timestam if (WARN_ON_ONCE(!gpu_metrics_ctx->active_cnt)) return; + /* Do not emit tracepoint if GPU activity still continues. 
*/ if (--gpu_metrics_ctx->active_cnt) return; - if (likely(timestamp_ns > gpu_metrics_ctx->last_active_start_time)) { - gpu_metrics_ctx->last_active_end_time = timestamp_ns; - gpu_metrics_ctx->total_active += - timestamp_ns - gpu_metrics_ctx->last_active_start_time; + if (likely(timestamp_ns > gpu_metrics_ctx->active_start_time)) { + emit_tracepoint_for_active_gpu_metrics_ctx(kctx->kbdev, gpu_metrics_ctx, + timestamp_ns); return; } /* Due to conversion from system timestamp to CPU timestamp (which involves rounding) * the value for start and end timestamp could come as same on CSF GPUs. */ - if (timestamp_ns == gpu_metrics_ctx->last_active_start_time) { - gpu_metrics_ctx->last_active_end_time = timestamp_ns + 1; - gpu_metrics_ctx->total_active += 1; + if (timestamp_ns == gpu_metrics_ctx->active_start_time) { + emit_tracepoint_for_active_gpu_metrics_ctx(kctx->kbdev, gpu_metrics_ctx, + timestamp_ns + 1); return; } @@ -224,12 +169,9 @@ void kbase_gpu_metrics_ctx_end_activity(struct kbase_context *kctx, u64 timestam * visible to the Kbase even though the system timestamp value sampled by FW was less than * the system timestamp value sampled by Kbase just before the draining of trace buffer. */ - if (gpu_metrics_ctx->last_active_start_time == gpu_metrics_ctx->first_active_start_time && - gpu_metrics_ctx->prev_wp_active_end_time == gpu_metrics_ctx->first_active_start_time) { - WARN_ON_ONCE(gpu_metrics_ctx->total_active); - gpu_metrics_ctx->last_active_end_time = - gpu_metrics_ctx->prev_wp_active_end_time + 1; - gpu_metrics_ctx->total_active = 1; + if (gpu_metrics_ctx->active_end_time == gpu_metrics_ctx->active_start_time) { + emit_tracepoint_for_active_gpu_metrics_ctx(kctx->kbdev, gpu_metrics_ctx, + gpu_metrics_ctx->active_end_time + 1); return; } @@ -242,15 +184,12 @@ void kbase_gpu_metrics_emit_tracepoint(struct kbase_device *kbdev, u64 ts) struct kbase_gpu_metrics_ctx *gpu_metrics_ctx, *tmp; list_for_each_entry_safe(gpu_metrics_ctx, tmp, &gpu_metrics->active_list, link) { - if (!gpu_metrics_ctx_flag(gpu_metrics_ctx, ACTIVE_INTERVAL_IN_WP)) { - WARN_ON(!gpu_metrics_ctx_flag(gpu_metrics_ctx, INSIDE_ACTIVE_LIST)); - WARN_ON(gpu_metrics_ctx->active_cnt); - list_move_tail(&gpu_metrics_ctx->link, &gpu_metrics->inactive_list); - gpu_metrics_ctx_flag_clear(gpu_metrics_ctx, INSIDE_ACTIVE_LIST); + if (gpu_metrics_ctx->active_cnt) { + emit_tracepoint_for_active_gpu_metrics_ctx(kbdev, gpu_metrics_ctx, ts); continue; } - emit_tracepoint_for_active_gpu_metrics_ctx(kbdev, gpu_metrics_ctx, ts); + list_move_tail(&gpu_metrics_ctx->link, &gpu_metrics->inactive_list); } } diff --git a/mali_kbase/mali_kbase_gpu_metrics.h b/mali_kbase/mali_kbase_gpu_metrics.h index c445dff..658cf1c 100644 --- a/mali_kbase/mali_kbase_gpu_metrics.h +++ b/mali_kbase/mali_kbase_gpu_metrics.h @@ -106,7 +106,7 @@ void kbase_gpu_metrics_ctx_init(struct kbase_device *kbdev, * @kctx: Pointer to the Kbase context contributing data to the GPU metrics context. * @timestamp_ns: CPU timestamp at which the GPU activity started. * - * The provided timestamp would be later used as the "start_time_ns" for the + * The provided timestamp is used as the "start_time_ns" for the * power/gpu_work_period tracepoint if this is the first GPU activity for the GPU * metrics context in the current work period. * @@ -122,9 +122,9 @@ void kbase_gpu_metrics_ctx_start_activity(struct kbase_context *kctx, u64 timest * @kctx: Pointer to the Kbase context contributing data to the GPU metrics context. * @timestamp_ns: CPU timestamp at which the GPU activity ended. 
* - * The provided timestamp would be later used as the "end_time_ns" for the - * power/gpu_work_period tracepoint if this is the last GPU activity for the GPU - * metrics context in the current work period. + * The provided timestamp is used as the "end_time_ns" for the power/gpu_work_period + * tracepoint if this is the last GPU activity for the GPU metrics context + * in the current work period. * * Note: The caller must appropriately serialize the call to this function with the * call to other GPU metrics functions declared in this file. @@ -138,8 +138,8 @@ void kbase_gpu_metrics_ctx_end_activity(struct kbase_context *kctx, u64 timestam * @kbdev: Pointer to the GPU device. * @ts: Timestamp at which the tracepoint is being emitted. * - * This function would loop through all the active GPU metrics contexts and emit a - * power/gpu_work_period tracepoint for them. + * This function would loop through all GPU metrics contexts in the active list and + * emit a power/gpu_work_period tracepoint if the GPU work in the context still active. * The GPU metrics context that is found to be inactive since the last tracepoint * was emitted would be moved to the inactive list. * The current work period would be considered as over and a new work period would diff --git a/mali_kbase/mali_kbase_gpuprops.c b/mali_kbase/mali_kbase_gpuprops.c index 3ac1c45..e6c31d4 100644 --- a/mali_kbase/mali_kbase_gpuprops.c +++ b/mali_kbase/mali_kbase_gpuprops.c @@ -357,6 +357,7 @@ enum l2_config_override_result { /** * kbase_read_l2_config_from_dt - Read L2 configuration * @kbdev: The kbase device for which to get the L2 configuration. + * @regdump: Pointer to struct kbase_gpuprops_regdump structure. * * Check for L2 configuration overrides in module parameters and device tree. * Override values in module parameters take priority over override values in @@ -366,9 +367,16 @@ enum l2_config_override_result { * overridden, L2_CONFIG_OVERRIDE_NONE if no overrides are provided. * L2_CONFIG_OVERRIDE_FAIL otherwise. */ -static enum l2_config_override_result kbase_read_l2_config_from_dt(struct kbase_device *const kbdev) +static enum l2_config_override_result +kbase_read_l2_config_from_dt(struct kbase_device *const kbdev, + struct kbasep_gpuprops_regdump *regdump) { struct device_node *np = kbdev->dev->of_node; + /* + * CACHE_SIZE bit fields in L2_FEATURES register, default value after the reset/powerup + * holds the maximum size of the cache that can be programmed in L2_CONFIG register. + */ + const u8 l2_size_max = L2_FEATURES_CACHE_SIZE_GET(regdump->l2_features); if (!np) return L2_CONFIG_OVERRIDE_NONE; @@ -378,8 +386,12 @@ static enum l2_config_override_result kbase_read_l2_config_from_dt(struct kbase_ else if (of_property_read_u8(np, "l2-size", &kbdev->l2_size_override)) kbdev->l2_size_override = 0; - if (kbdev->l2_size_override != 0 && kbdev->l2_size_override < OVERRIDE_L2_SIZE_MIN_LOG2) + if (kbdev->l2_size_override != 0 && (kbdev->l2_size_override < OVERRIDE_L2_SIZE_MIN_LOG2 || + kbdev->l2_size_override > l2_size_max)) { + dev_err(kbdev->dev, "Invalid Cache Size in %s", + override_l2_size ? "Module parameters" : "Device tree node"); return L2_CONFIG_OVERRIDE_FAIL; + } /* Check overriding value is supported, if not will result in * undefined behavior. 
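The new regdump parameter above lets the device-tree or module-parameter override be validated against the maximum cache size reported in L2_FEATURES. Assuming both values encode log2 of the L2 size in bytes (consistent with the OVERRIDE_L2_SIZE_MIN_LOG2 naming), a worked sketch of the range check is shown below; the function and parameter names are illustrative assumptions, not driver API.

#include <stdbool.h>
#include <stdint.h>

/* Sketch only: an override of 0 means "no override requested"; any other value
 * must lie between the minimum supported size and the hardware maximum taken
 * from L2_FEATURES. For example, an override of 18 requests 1 << 18 = 262144
 * bytes (256 KiB) and is accepted only if the hardware maximum allows it.
 */
static bool example_l2_size_override_ok(uint8_t override_log2, uint8_t min_log2,
                                        uint8_t hw_max_log2)
{
    if (override_log2 == 0)
        return true;
    return override_log2 >= min_log2 && override_log2 <= hw_max_log2;
}

This mirrors the patch's check, which reports L2_CONFIG_OVERRIDE_FAIL when a non-zero override falls below OVERRIDE_L2_SIZE_MIN_LOG2 or above the maximum read from the register dump.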
@@ -429,7 +441,7 @@ int kbase_gpuprops_update_l2_features(struct kbase_device *kbdev) struct kbasep_gpuprops_regdump *regdump = &PRIV_DATA_REGDUMP(kbdev); /* Check for L2 cache size & hash overrides */ - switch (kbase_read_l2_config_from_dt(kbdev)) { + switch (kbase_read_l2_config_from_dt(kbdev, regdump)) { case L2_CONFIG_OVERRIDE_FAIL: err = -EIO; goto exit; diff --git a/mali_kbase/mali_kbase_gwt.c b/mali_kbase/mali_kbase_gwt.c index c92d54c..5e59bf6 100644 --- a/mali_kbase/mali_kbase_gwt.c +++ b/mali_kbase/mali_kbase_gwt.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2010-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2010-2024 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -30,9 +30,10 @@ #include <linux/module.h> static inline void kbase_gpu_gwt_setup_page_permission(struct kbase_context *kctx, - unsigned long flag, struct rb_node *node) + unsigned long flag, + struct kbase_reg_zone *zone) { - struct rb_node *rbnode = node; + struct rb_node *rbnode = rb_first(&zone->reg_rbtree); while (rbnode) { struct kbase_va_region *reg; @@ -55,17 +56,15 @@ static inline void kbase_gpu_gwt_setup_page_permission(struct kbase_context *kct static void kbase_gpu_gwt_setup_pages(struct kbase_context *kctx, unsigned long flag) { - kbase_gpu_gwt_setup_page_permission(kctx, flag, - rb_first(&kctx->reg_zone[SAME_VA_ZONE].reg_rbtree)); - kbase_gpu_gwt_setup_page_permission(kctx, flag, - rb_first(&kctx->reg_zone[CUSTOM_VA_ZONE].reg_rbtree)); + kbase_gpu_gwt_setup_page_permission(kctx, flag, &kctx->reg_zone[SAME_VA_ZONE]); + kbase_gpu_gwt_setup_page_permission(kctx, flag, &kctx->reg_zone[CUSTOM_VA_ZONE]); } int kbase_gpu_gwt_start(struct kbase_context *kctx) { - kbase_gpu_vm_lock(kctx); + kbase_gpu_vm_lock_with_pmode_sync(kctx); if (kctx->gwt_enabled) { - kbase_gpu_vm_unlock(kctx); + kbase_gpu_vm_unlock_with_pmode_sync(kctx); return -EBUSY; } @@ -91,7 +90,7 @@ int kbase_gpu_gwt_start(struct kbase_context *kctx) kbase_gpu_gwt_setup_pages(kctx, ~KBASE_REG_GPU_WR); - kbase_gpu_vm_unlock(kctx); + kbase_gpu_vm_unlock_with_pmode_sync(kctx); return 0; } diff --git a/mali_kbase/mali_kbase_hw.c b/mali_kbase/mali_kbase_hw.c index 7d4200e..1fde75b 100644 --- a/mali_kbase/mali_kbase_hw.c +++ b/mali_kbase/mali_kbase_hw.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2012-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2012-2024 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -225,6 +225,8 @@ static const enum base_hw_issue *kbase_hw_get_issues_for_new_id(struct kbase_dev { GPU_ID_PRODUCT_TVAX, { { GPU_ID_VERSION_MAKE(0, 0, 0), base_hw_issues_tVAx_r0p0 }, + { GPU_ID_VERSION_MAKE(0, 0, 5), base_hw_issues_tVAx_r0p0 }, + { GPU_ID_VERSION_MAKE(0, 1, 0), base_hw_issues_tVAx_r0p1 }, { U32_MAX, NULL } } }, { GPU_ID_PRODUCT_TTUX, @@ -334,6 +336,8 @@ static const enum base_hw_issue *kbase_hw_get_issues_for_new_id(struct kbase_dev gpu_id->version_id = fallback_version; } } + + return issues; } diff --git a/mali_kbase/mali_kbase_hwaccess_pm.h b/mali_kbase/mali_kbase_hwaccess_pm.h index 7a0ea49..982547d 100644 --- a/mali_kbase/mali_kbase_hwaccess_pm.h +++ b/mali_kbase/mali_kbase_hwaccess_pm.h @@ -129,14 +129,14 @@ void kbase_pm_set_debug_core_mask(struct kbase_device *kbdev, u64 new_core_mask) * kbase_pm_set_debug_core_mask - Set the debug core mask. * * @kbdev: The kbase device structure for the device (must be a valid pointer) - * @new_core_mask_js0: The core mask to use for job slot 0 - * @new_core_mask_js1: The core mask to use for job slot 1 - * @new_core_mask_js2: The core mask to use for job slot 2 + * @new_core_mask: The core mask to use, as an array where each element refers + * to a job slot. + * @new_core_mask_size: Number of elements in the core mask array. * * This determines which cores the power manager is allowed to use. */ -void kbase_pm_set_debug_core_mask(struct kbase_device *kbdev, u64 new_core_mask_js0, - u64 new_core_mask_js1, u64 new_core_mask_js2); +void kbase_pm_set_debug_core_mask(struct kbase_device *kbdev, u64 *new_core_mask, + size_t new_core_mask_size); #endif /* MALI_USE_CSF */ /** diff --git a/mali_kbase/mali_kbase_linux.h b/mali_kbase/mali_kbase_linux.h index 9195be3..cb55d4b 100644 --- a/mali_kbase/mali_kbase_linux.h +++ b/mali_kbase/mali_kbase_linux.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2010-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2010-2024 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -35,8 +35,13 @@ #if IS_ENABLED(MALI_KERNEL_TEST_API) #define KBASE_EXPORT_TEST_API(func) EXPORT_SYMBOL(func) +/* Note: due to the 2-layer macro translation, using the NULL _etype does not + * compile, and one workaround is to use ERRNO_NULL instead. + */ +#define KBASE_ALLOW_ERROR_INJECTION_TEST_API(func, etype) ALLOW_ERROR_INJECTION(func, etype) #else #define KBASE_EXPORT_TEST_API(func) +#define KBASE_ALLOW_ERROR_INJECTION_TEST_API(func, etype) #endif #define KBASE_EXPORT_SYMBOL(func) EXPORT_SYMBOL(func) diff --git a/mali_kbase/mali_kbase_mem.c b/mali_kbase/mali_kbase_mem.c index ddf6ea3..71088dd 100644 --- a/mali_kbase/mali_kbase_mem.c +++ b/mali_kbase/mali_kbase_mem.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2010-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2010-2024 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -46,6 +46,9 @@ #include <mali_kbase_trace_gpu_mem.h> #include <linux/version_compat_defs.h> +/* Static key used to determine if large pages are enabled or not */ +static DEFINE_STATIC_KEY_FALSE(large_pages_static_key); + #define VA_REGION_SLAB_NAME_PREFIX "va-region-slab-" #define VA_REGION_SLAB_NAME_SIZE (DEVNAME_SIZE + sizeof(VA_REGION_SLAB_NAME_PREFIX) + 1) @@ -143,20 +146,20 @@ MODULE_PARM_DESC(large_page_conf, "User override for large page usage on support static void kbasep_mem_page_size_init(struct kbase_device *kbdev) { if (!IS_ENABLED(CONFIG_LARGE_PAGE_SUPPORT)) { - kbdev->pagesize_2mb = false; dev_info(kbdev->dev, "Large page support was disabled at compile-time!"); return; } switch (large_page_conf) { case LARGE_PAGE_AUTO: { - kbdev->pagesize_2mb = kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_LARGE_PAGE_ALLOC); + if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_LARGE_PAGE_ALLOC)) + static_branch_enable(&large_pages_static_key); dev_info(kbdev->dev, "Large page allocation set to %s after hardware feature check", - kbdev->pagesize_2mb ? "true" : "false"); + static_branch_unlikely(&large_pages_static_key) ? "true" : "false"); break; } case LARGE_PAGE_ON: { - kbdev->pagesize_2mb = true; + static_branch_enable(&large_pages_static_key); if (!kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_LARGE_PAGE_ALLOC)) dev_warn(kbdev->dev, "Enabling large page allocations on unsupporting GPU!"); @@ -165,12 +168,10 @@ static void kbasep_mem_page_size_init(struct kbase_device *kbdev) break; } case LARGE_PAGE_OFF: { - kbdev->pagesize_2mb = false; dev_info(kbdev->dev, "Large page allocation override: turned off\n"); break; } default: { - kbdev->pagesize_2mb = false; dev_info(kbdev->dev, "Invalid large page override, turning off large pages\n"); break; } @@ -180,12 +181,18 @@ static void kbasep_mem_page_size_init(struct kbase_device *kbdev) * so that userspace could read it to figure out the state of the configuration * if necessary. 
*/ - if (kbdev->pagesize_2mb) + if (static_branch_unlikely(&large_pages_static_key)) large_page_conf = LARGE_PAGE_ON; else large_page_conf = LARGE_PAGE_OFF; } +inline bool kbase_is_large_pages_enabled(void) +{ + return static_branch_unlikely(&large_pages_static_key); +} +KBASE_EXPORT_TEST_API(kbase_is_large_pages_enabled); + int kbase_mem_init(struct kbase_device *kbdev) { int err = 0; @@ -672,7 +679,9 @@ void kbase_sync_single(struct kbase_context *kctx, struct tagged_addr t_cpu_pa, dma_addr_t dma_addr; WARN_ON(!cpu_page); - WARN_ON((size_t)offset + size > PAGE_SIZE); + + if ((size_t)offset + size > PAGE_SIZE) + dev_warn(kctx->kbdev->dev, "Size and offset exceed page size"); dma_addr = kbase_dma_addr_from_tagged(t_cpu_pa) + (dma_addr_t)offset; @@ -942,7 +951,7 @@ int kbase_mem_free(struct kbase_context *kctx, u64 gpu_addr) __func__); return -EINVAL; } - kbase_gpu_vm_lock(kctx); + kbase_gpu_vm_lock_with_pmode_sync(kctx); if (gpu_addr >= BASE_MEM_COOKIE_BASE && gpu_addr < BASE_MEM_FIRST_FREE_ADDRESS) { unsigned int cookie = PFN_DOWN(gpu_addr - BASE_MEM_COOKIE_BASE); @@ -981,7 +990,7 @@ int kbase_mem_free(struct kbase_context *kctx, u64 gpu_addr) } out_unlock: - kbase_gpu_vm_unlock(kctx); + kbase_gpu_vm_unlock_with_pmode_sync(kctx); return err; } @@ -1156,7 +1165,7 @@ int kbase_alloc_phy_pages_helper(struct kbase_mem_phy_alloc *alloc, size_t nr_pa /* Check if we have enough pages requested so we can allocate a large * page (512 * 4KB = 2MB ) */ - if (kbdev->pagesize_2mb && nr_left >= NUM_PAGES_IN_2MB_LARGE_PAGE) { + if (kbase_is_large_pages_enabled() && nr_left >= NUM_PAGES_IN_2MB_LARGE_PAGE) { size_t nr_lp = nr_left / NUM_PAGES_IN_2MB_LARGE_PAGE; res = kbase_mem_pool_alloc_pages(&kctx->mem_pools.large[alloc->group_id], @@ -1307,6 +1316,7 @@ alloc_failed: invalid_request: return -ENOMEM; } +KBASE_EXPORT_TEST_API(kbase_alloc_phy_pages_helper); static size_t free_partial_locked(struct kbase_context *kctx, struct kbase_mem_pool *pool, struct tagged_addr tp) @@ -1363,7 +1373,7 @@ struct tagged_addr *kbase_alloc_phy_pages_helper_locked(struct kbase_mem_phy_all kctx = alloc->imported.native.kctx; kbdev = kctx->kbdev; - if (!kbdev->pagesize_2mb) + if (!kbase_is_large_pages_enabled()) WARN_ON(pool->order); if (alloc->reg) { @@ -1386,7 +1396,7 @@ struct tagged_addr *kbase_alloc_phy_pages_helper_locked(struct kbase_mem_phy_all tp = alloc->pages + alloc->nents; new_pages = tp; - if (kbdev->pagesize_2mb && pool->order) { + if (kbase_is_large_pages_enabled() && pool->order) { size_t nr_lp = nr_left / NUM_PAGES_IN_2MB_LARGE_PAGE; res = kbase_mem_pool_alloc_pages_locked(pool, nr_lp * NUM_PAGES_IN_2MB_LARGE_PAGE, @@ -1503,7 +1513,7 @@ alloc_failed: struct tagged_addr *start_free = alloc->pages + alloc->nents; - if (kbdev->pagesize_2mb && pool->order) { + if (kbase_is_large_pages_enabled() && pool->order) { while (nr_pages_to_free) { if (is_huge_head(*start_free)) { kbase_mem_pool_free_pages_locked( @@ -1659,6 +1669,7 @@ int kbase_free_phy_pages_helper(struct kbase_mem_phy_alloc *alloc, size_t nr_pag return 0; } +KBASE_EXPORT_TEST_API(kbase_free_phy_pages_helper); void kbase_free_phy_pages_helper_locked(struct kbase_mem_phy_alloc *alloc, struct kbase_mem_pool *pool, struct tagged_addr *pages, @@ -2156,17 +2167,31 @@ void kbase_gpu_vm_lock(struct kbase_context *kctx) KBASE_DEBUG_ASSERT(kctx != NULL); mutex_lock(&kctx->reg_lock); } - KBASE_EXPORT_TEST_API(kbase_gpu_vm_lock); +void kbase_gpu_vm_lock_with_pmode_sync(struct kbase_context *kctx) +{ +#if MALI_USE_CSF + down_read(&kctx->kbdev->csf.pmode_sync_sem); 
+#endif + kbase_gpu_vm_lock(kctx); +} + void kbase_gpu_vm_unlock(struct kbase_context *kctx) { KBASE_DEBUG_ASSERT(kctx != NULL); mutex_unlock(&kctx->reg_lock); } - KBASE_EXPORT_TEST_API(kbase_gpu_vm_unlock); +void kbase_gpu_vm_unlock_with_pmode_sync(struct kbase_context *kctx) +{ + kbase_gpu_vm_unlock(kctx); +#if MALI_USE_CSF + up_read(&kctx->kbdev->csf.pmode_sync_sem); +#endif +} + #if IS_ENABLED(CONFIG_DEBUG_FS) struct kbase_jit_debugfs_data { int (*func)(struct kbase_jit_debugfs_data *data); @@ -2708,7 +2733,7 @@ static int kbase_jit_grow(struct kbase_context *kctx, const struct base_jit_allo delta = info->commit_pages - reg->gpu_alloc->nents; pages_required = delta; - if (kctx->kbdev->pagesize_2mb && pages_required >= NUM_PAGES_IN_2MB_LARGE_PAGE) { + if (kbase_is_large_pages_enabled() && pages_required >= NUM_PAGES_IN_2MB_LARGE_PAGE) { pool = &kctx->mem_pools.large[kctx->jit_group_id]; /* Round up to number of 2 MB pages required */ pages_required += (NUM_PAGES_IN_2MB_LARGE_PAGE - 1); @@ -2746,10 +2771,10 @@ static int kbase_jit_grow(struct kbase_context *kctx, const struct base_jit_allo kbase_mem_pool_lock(pool); } - if (reg->gpu_alloc->nents > info->commit_pages) { + if (reg->gpu_alloc->nents >= info->commit_pages) { kbase_mem_pool_unlock(pool); spin_unlock(&kctx->mem_partials_lock); - dev_warn( + dev_info( kctx->kbdev->dev, "JIT alloc grown beyond the required number of initially required pages, this grow no longer needed."); goto done; @@ -2999,7 +3024,7 @@ struct kbase_va_region *kbase_jit_allocate(struct kbase_context *kctx, if (!jit_allow_allocate(kctx, info, ignore_pressure_limit)) return NULL; - if (kctx->kbdev->pagesize_2mb) { + if (kbase_is_large_pages_enabled()) { /* Preallocate memory for the sub-allocation structs */ for (i = 0; i != ARRAY_SIZE(prealloc_sas); ++i) { prealloc_sas[i] = kmalloc(sizeof(*prealloc_sas[i]), GFP_KERNEL); @@ -3008,7 +3033,7 @@ struct kbase_va_region *kbase_jit_allocate(struct kbase_context *kctx, } } - kbase_gpu_vm_lock(kctx); + kbase_gpu_vm_lock_with_pmode_sync(kctx); mutex_lock(&kctx->jit_evict_lock); /* @@ -3086,7 +3111,7 @@ struct kbase_va_region *kbase_jit_allocate(struct kbase_context *kctx, kbase_jit_done_phys_increase(kctx, needed_pages); #endif /* MALI_JIT_PRESSURE_LIMIT_BASE */ - kbase_gpu_vm_unlock(kctx); + kbase_gpu_vm_unlock_with_pmode_sync(kctx); if (ret) { /* @@ -3147,7 +3172,7 @@ struct kbase_va_region *kbase_jit_allocate(struct kbase_context *kctx, #endif /* MALI_JIT_PRESSURE_LIMIT_BASE */ mutex_unlock(&kctx->jit_evict_lock); - kbase_gpu_vm_unlock(kctx); + kbase_gpu_vm_unlock_with_pmode_sync(kctx); reg = kbase_mem_alloc(kctx, info->va_pages, info->commit_pages, info->extension, &flags, &gpu_addr, mmu_sync_info); @@ -3249,9 +3274,9 @@ void kbase_jit_free(struct kbase_context *kctx, struct kbase_va_region *reg) u64 delta = old_pages - new_size; if (delta) { - mutex_lock(&kctx->reg_lock); + kbase_gpu_vm_lock_with_pmode_sync(kctx); kbase_mem_shrink(kctx, reg, old_pages - delta); - mutex_unlock(&kctx->reg_lock); + kbase_gpu_vm_unlock_with_pmode_sync(kctx); } } @@ -3356,8 +3381,7 @@ void kbase_jit_term(struct kbase_context *kctx) struct kbase_va_region *walker; /* Free all allocations for this context */ - - kbase_gpu_vm_lock(kctx); + kbase_gpu_vm_lock_with_pmode_sync(kctx); mutex_lock(&kctx->jit_evict_lock); /* Free all allocations from the pool */ while (!list_empty(&kctx->jit_pool_head)) { @@ -3398,7 +3422,7 @@ void kbase_jit_term(struct kbase_context *kctx) WARN_ON(kctx->jit_phys_pages_to_be_allocated); #endif 
mutex_unlock(&kctx->jit_evict_lock); - kbase_gpu_vm_unlock(kctx); + kbase_gpu_vm_unlock_with_pmode_sync(kctx); /* * Flush the freeing of allocations whose backing has been freed diff --git a/mali_kbase/mali_kbase_mem.h b/mali_kbase/mali_kbase_mem.h index 6277814..d4c3aee 100644 --- a/mali_kbase/mali_kbase_mem.h +++ b/mali_kbase/mali_kbase_mem.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2010-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2010-2024 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -1409,11 +1409,29 @@ int kbase_update_region_flags(struct kbase_context *kctx, struct kbase_va_region void kbase_gpu_vm_lock(struct kbase_context *kctx); /** + * kbase_gpu_vm_lock_with_pmode_sync() - Wrapper of kbase_gpu_vm_lock. + * @kctx: KBase context + * + * Same as kbase_gpu_vm_lock for JM GPU. + * Additionally acquire P.mode read-write semaphore for CSF GPU. + */ +void kbase_gpu_vm_lock_with_pmode_sync(struct kbase_context *kctx); + +/** * kbase_gpu_vm_unlock() - Release the per-context region list lock * @kctx: KBase context */ void kbase_gpu_vm_unlock(struct kbase_context *kctx); +/** + * kbase_gpu_vm_unlock_with_pmode_sync() - Wrapper of kbase_gpu_vm_unlock. + * @kctx: KBase context + * + * Same as kbase_gpu_vm_unlock for JM GPU. + * Additionally release P.mode read-write semaphore for CSF GPU. + */ +void kbase_gpu_vm_unlock_with_pmode_sync(struct kbase_context *kctx); + int kbase_alloc_phy_pages(struct kbase_va_region *reg, size_t vsize, size_t size); /** @@ -1651,7 +1669,7 @@ int kbase_alloc_phy_pages_helper(struct kbase_mem_phy_alloc *alloc, size_t nr_pa * * @prealloc_sa: Information about the partial allocation if the amount of memory requested * is not a multiple of 2MB. One instance of struct kbase_sub_alloc must be - * allocated by the caller if kbdev->pagesize_2mb is enabled. + * allocated by the caller if large pages are enabled. * * Allocates @nr_pages_requested and updates the alloc object. This function does not allocate new * pages from the kernel, and therefore will never trigger the OoM killer. Therefore, it can be @@ -1679,9 +1697,9 @@ int kbase_alloc_phy_pages_helper(struct kbase_mem_phy_alloc *alloc, size_t nr_pa * This ensures that the pool can be grown to the required size and that the allocation can * complete without another thread using the newly grown pages. * - * If kbdev->pagesize_2mb is enabled and the allocation is >= 2MB, then @pool must be one of the - * pools from alloc->imported.native.kctx->mem_pools.large[]. Otherwise it must be one of the - * mempools from alloc->imported.native.kctx->mem_pools.small[]. + * If large (2MiB) pages are enabled and the allocation is >= 2MiB, then @pool + * must be one of the pools from alloc->imported.native.kctx->mem_pools.large[]. Otherwise it + * must be one of the mempools from alloc->imported.native.kctx->mem_pools.small[]. * * @prealloc_sa is used to manage the non-2MB sub-allocation. It has to be pre-allocated because we * must not sleep (due to the usage of kmalloc()) whilst holding pool->pool_lock. 
@prealloc_sa @@ -2595,4 +2613,7 @@ static inline base_mem_alloc_flags kbase_mem_group_id_set(int id) { return BASE_MEM_GROUP_ID_SET(id); } + +bool kbase_is_large_pages_enabled(void); + #endif /* _KBASE_MEM_H_ */ diff --git a/mali_kbase/mali_kbase_mem_linux.c b/mali_kbase/mali_kbase_mem_linux.c index 6838fba..7801441 100644 --- a/mali_kbase/mali_kbase_mem_linux.c +++ b/mali_kbase/mali_kbase_mem_linux.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2010-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2010-2024 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -46,6 +46,7 @@ #include <mali_kbase_caps.h> #include <mali_kbase_trace_gpu_mem.h> #include <mali_kbase_reset_gpu.h> +#include <linux/version_compat_defs.h> #if ((KERNEL_VERSION(5, 3, 0) <= LINUX_VERSION_CODE) || \ (KERNEL_VERSION(5, 0, 0) > LINUX_VERSION_CODE)) @@ -433,7 +434,7 @@ struct kbase_va_region *kbase_mem_alloc(struct kbase_context *kctx, u64 va_pages } reg->initial_commit = commit_pages; - kbase_gpu_vm_lock(kctx); + kbase_gpu_vm_lock_with_pmode_sync(kctx); if (reg->flags & KBASE_REG_PERMANENT_KERNEL_MAPPING) { /* Permanent kernel mappings must happen as soon as @@ -443,7 +444,7 @@ struct kbase_va_region *kbase_mem_alloc(struct kbase_context *kctx, u64 va_pages */ int err = kbase_phy_alloc_mapping_init(kctx, reg, va_pages, commit_pages); if (err < 0) { - kbase_gpu_vm_unlock(kctx); + kbase_gpu_vm_unlock_with_pmode_sync(kctx); goto no_kern_mapping; } } @@ -455,7 +456,7 @@ struct kbase_va_region *kbase_mem_alloc(struct kbase_context *kctx, u64 va_pages /* Bind to a cookie */ if (bitmap_empty(kctx->cookies, BITS_PER_LONG)) { dev_err(dev, "No cookies available for allocation!"); - kbase_gpu_vm_unlock(kctx); + kbase_gpu_vm_unlock_with_pmode_sync(kctx); goto no_cookie; } /* return a cookie */ @@ -472,7 +473,7 @@ struct kbase_va_region *kbase_mem_alloc(struct kbase_context *kctx, u64 va_pages } else /* we control the VA */ { size_t align = 1; - if (kctx->kbdev->pagesize_2mb) { + if (kbase_is_large_pages_enabled()) { /* If there's enough (> 33 bits) of GPU VA space, align to 2MB * boundaries. The similar condition is used for mapping from * the SAME_VA zone inside kbase_context_get_unmapped_area(). 
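
The pagesize_2mb checks replaced throughout this patch are routed through kbase_is_large_pages_enabled(), which is backed by a static key so hot allocation paths pay a runtime-patched branch instead of loading a per-device flag. A minimal sketch of that pattern, assuming the standard jump-label API; the example_* init hook is illustrative and not the driver's actual probe path:

#include <linux/jump_label.h>
#include <linux/types.h>

/* Disabled by default; flipped once at init when 2MiB pages are usable. */
static DEFINE_STATIC_KEY_FALSE(large_pages_example_key);

/* Illustrative init-time hook. */
static void example_enable_large_pages(void)
{
        static_branch_enable(&large_pages_example_key);
}

/* Hot-path query: compiles to a no-op/jump patched at runtime, so callers
 * avoid a memory load on every allocation.
 */
static inline bool example_large_pages_enabled(void)
{
        return static_branch_unlikely(&large_pages_example_key);
}
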
@@ -490,7 +491,7 @@ struct kbase_va_region *kbase_mem_alloc(struct kbase_context *kctx, u64 va_pages } if (kbase_gpu_mmap(kctx, reg, *gpu_va, va_pages, align, mmu_sync_info) != 0) { dev_warn(dev, "Failed to map memory on GPU"); - kbase_gpu_vm_unlock(kctx); + kbase_gpu_vm_unlock_with_pmode_sync(kctx); goto no_mmap; } /* return real GPU VA */ @@ -508,7 +509,7 @@ struct kbase_va_region *kbase_mem_alloc(struct kbase_context *kctx, u64 va_pages } #endif /* MALI_JIT_PRESSURE_LIMIT_BASE */ - kbase_gpu_vm_unlock(kctx); + kbase_gpu_vm_unlock_with_pmode_sync(kctx); #if MALI_USE_CSF if (*flags & BASE_MEM_FIXABLE) @@ -596,8 +597,10 @@ int kbase_mem_query(struct kbase_context *kctx, u64 gpu_addr, u64 query, u64 *co *out |= BASE_MEM_COHERENT_SYSTEM; if (KBASE_REG_SHARE_IN & reg->flags) *out |= BASE_MEM_COHERENT_LOCAL; - if (KBASE_REG_DONT_NEED & reg->flags) - *out |= BASE_MEM_DONT_NEED; + if (mali_kbase_supports_mem_dont_need(kctx->api_version)) { + if (KBASE_REG_DONT_NEED & reg->flags) + *out |= BASE_MEM_DONT_NEED; + } if (mali_kbase_supports_mem_grow_on_gpf(kctx->api_version)) { /* Prior to this version, this was known about by * user-side but we did not return them. Returning @@ -634,9 +637,19 @@ int kbase_mem_query(struct kbase_context *kctx, u64 gpu_addr, u64 query, u64 *co else *out |= BASE_MEM_FIXABLE; } -#endif +#endif /* MALI_USE_CSF */ if (KBASE_REG_GPU_VA_SAME_4GB_PAGE & reg->flags) *out |= BASE_MEM_GPU_VA_SAME_4GB_PAGE; + if (mali_kbase_supports_mem_import_sync_on_map_unmap(kctx->api_version)) { + if (reg->gpu_alloc->type == KBASE_MEM_TYPE_IMPORTED_UMM) { + if (reg->gpu_alloc->imported.umm.need_sync) + *out |= BASE_MEM_IMPORT_SYNC_ON_MAP_UNMAP; + } + } + if (mali_kbase_supports_mem_kernel_sync(kctx->api_version)) { + if (unlikely(reg->cpu_alloc != reg->gpu_alloc)) + *out |= BASE_MEM_KERNEL_SYNC; + } *out |= kbase_mem_group_id_set(reg->cpu_alloc->group_id); @@ -667,7 +680,9 @@ out_unlock: static unsigned long kbase_mem_evictable_reclaim_count_objects(struct shrinker *s, struct shrink_control *sc) { - struct kbase_context *kctx = container_of(s, struct kbase_context, reclaim); + struct kbase_context *kctx = + KBASE_GET_KBASE_DATA_FROM_SHRINKER(s, struct kbase_context, reclaim); + int evict_nents = atomic_read(&kctx->evict_nents); unsigned long nr_freeable_items; @@ -717,8 +732,15 @@ static unsigned long kbase_mem_evictable_reclaim_scan_objects(struct shrinker *s struct kbase_mem_phy_alloc *tmp; unsigned long freed = 0; - kctx = container_of(s, struct kbase_context, reclaim); + kctx = KBASE_GET_KBASE_DATA_FROM_SHRINKER(s, struct kbase_context, reclaim); +#if MALI_USE_CSF + if (!down_read_trylock(&kctx->kbdev->csf.pmode_sync_sem)) { + dev_warn(kctx->kbdev->dev, + "Can't shrink GPU memory when P.Mode entrance is in progress"); + return 0; + } +#endif mutex_lock(&kctx->jit_evict_lock); list_for_each_entry_safe(alloc, tmp, &kctx->evict_list, evict_node) { @@ -757,32 +779,36 @@ static unsigned long kbase_mem_evictable_reclaim_scan_objects(struct shrinker *s } mutex_unlock(&kctx->jit_evict_lock); - +#if MALI_USE_CSF + up_read(&kctx->kbdev->csf.pmode_sync_sem); +#endif return freed; } int kbase_mem_evictable_init(struct kbase_context *kctx) { + struct shrinker *reclaim; + INIT_LIST_HEAD(&kctx->evict_list); mutex_init(&kctx->jit_evict_lock); - kctx->reclaim.count_objects = kbase_mem_evictable_reclaim_count_objects; - kctx->reclaim.scan_objects = kbase_mem_evictable_reclaim_scan_objects; - kctx->reclaim.seeks = DEFAULT_SEEKS; - /* Kernel versions prior to 3.1 : - * struct shrinker does not define batch 
- */ -#if KERNEL_VERSION(6, 0, 0) > LINUX_VERSION_CODE - register_shrinker(&kctx->reclaim); -#else - register_shrinker(&kctx->reclaim, "mali-mem"); -#endif + reclaim = KBASE_INIT_RECLAIM(kctx, reclaim, "mali-mem"); + if (!reclaim) + return -ENOMEM; + KBASE_SET_RECLAIM(kctx, reclaim, reclaim); + + reclaim->count_objects = kbase_mem_evictable_reclaim_count_objects; + reclaim->scan_objects = kbase_mem_evictable_reclaim_scan_objects; + reclaim->seeks = DEFAULT_SEEKS; + + KBASE_REGISTER_SHRINKER(reclaim, "mali-mem", kctx); + return 0; } void kbase_mem_evictable_deinit(struct kbase_context *kctx) { - unregister_shrinker(&kctx->reclaim); + KBASE_UNREGISTER_SHRINKER(kctx->reclaim); } /** @@ -1058,7 +1084,7 @@ int kbase_mem_flags_change(struct kbase_context *kctx, u64 gpu_addr, unsigned in /* Lock down the context, and find the region */ down_write(kbase_mem_get_process_mmap_lock()); - kbase_gpu_vm_lock(kctx); + kbase_gpu_vm_lock_with_pmode_sync(kctx); /* Validate the region */ reg = kbase_region_tracker_find_region_base_address(kctx, gpu_addr); @@ -1110,7 +1136,7 @@ int kbase_mem_flags_change(struct kbase_context *kctx, u64 gpu_addr, unsigned in } out_unlock: - kbase_gpu_vm_unlock(kctx); + kbase_gpu_vm_unlock_with_pmode_sync(kctx); up_write(kbase_mem_get_process_mmap_lock()); return ret; @@ -1791,7 +1817,7 @@ u64 kbase_mem_alias(struct kbase_context *kctx, u64 *flags, u64 stride, u64 nent if (!reg->gpu_alloc->imported.alias.aliased) goto no_aliased_array; - kbase_gpu_vm_lock(kctx); + kbase_gpu_vm_lock_with_pmode_sync(kctx); /* validate and add src handles */ for (i = 0; i < nents; i++) { @@ -1901,7 +1927,7 @@ u64 kbase_mem_alias(struct kbase_context *kctx, u64 *flags, u64 stride, u64 nent reg->flags &= ~KBASE_REG_FREE; reg->flags &= ~KBASE_REG_GROWABLE; - kbase_gpu_vm_unlock(kctx); + kbase_gpu_vm_unlock_with_pmode_sync(kctx); return gpu_va; @@ -1912,7 +1938,7 @@ bad_handle: * them is handled by putting reg's allocs, so no rollback of those * actions is done here. */ - kbase_gpu_vm_unlock(kctx); + kbase_gpu_vm_unlock_with_pmode_sync(kctx); no_aliased_array: invalid_flags: kbase_mem_phy_alloc_put(reg->cpu_alloc); @@ -2013,7 +2039,7 @@ int kbase_mem_import(struct kbase_context *kctx, enum base_mem_import_type type, if (!reg) goto no_reg; - kbase_gpu_vm_lock(kctx); + kbase_gpu_vm_lock_with_pmode_sync(kctx); /* mmap needed to setup VA? 
*/ if (*flags & (BASE_MEM_SAME_VA | BASE_MEM_NEED_MMAP)) { @@ -2047,13 +2073,13 @@ int kbase_mem_import(struct kbase_context *kctx, enum base_mem_import_type type, /* clear out private flags */ *flags &= ((1UL << BASE_MEM_FLAGS_NR_BITS) - 1); - kbase_gpu_vm_unlock(kctx); + kbase_gpu_vm_unlock_with_pmode_sync(kctx); return 0; no_gpu_va: no_cookie: - kbase_gpu_vm_unlock(kctx); + kbase_gpu_vm_unlock_with_pmode_sync(kctx); kbase_mem_phy_alloc_put(reg->cpu_alloc); kbase_mem_phy_alloc_put(reg->gpu_alloc); kfree(reg); @@ -2139,7 +2165,7 @@ int kbase_mem_commit(struct kbase_context *kctx, u64 gpu_addr, u64 new_pages) } down_write(kbase_mem_get_process_mmap_lock()); - kbase_gpu_vm_lock(kctx); + kbase_gpu_vm_lock_with_pmode_sync(kctx); /* Validate the region */ reg = kbase_region_tracker_find_region_base_address(kctx, gpu_addr); @@ -2247,7 +2273,7 @@ int kbase_mem_commit(struct kbase_context *kctx, u64 gpu_addr, u64 new_pages) } out_unlock: - kbase_gpu_vm_unlock(kctx); + kbase_gpu_vm_unlock_with_pmode_sync(kctx); if (read_locked) up_read(kbase_mem_get_process_mmap_lock()); else @@ -2271,11 +2297,16 @@ int kbase_mem_shrink(struct kbase_context *const kctx, struct kbase_va_region *c return -EINVAL; old_pages = kbase_reg_current_backed_size(reg); - if (WARN_ON(old_pages < new_pages)) + if (old_pages < new_pages) { + dev_warn( + kctx->kbdev->dev, + "Requested number of pages (%llu) is larger than the current number of pages (%llu)", + new_pages, old_pages); return -EINVAL; + } delta = old_pages - new_pages; - if (kctx->kbdev->pagesize_2mb) { + if (kbase_is_large_pages_enabled()) { struct tagged_addr *start_free = reg->gpu_alloc->pages + new_pages; /* Move the end of new commited range to a valid location. @@ -2329,7 +2360,7 @@ static void kbase_cpu_vm_close(struct vm_area_struct *vma) KBASE_DEBUG_ASSERT(map->kctx); KBASE_DEBUG_ASSERT(map->alloc); - kbase_gpu_vm_lock(map->kctx); + kbase_gpu_vm_lock_with_pmode_sync(map->kctx); if (map->free_on_close) { KBASE_DEBUG_ASSERT(kbase_bits_to_zone(map->region->flags) == SAME_VA_ZONE); @@ -2343,7 +2374,7 @@ static void kbase_cpu_vm_close(struct vm_area_struct *vma) list_del(&map->mappings_list); kbase_va_region_alloc_put(map->kctx, map->region); - kbase_gpu_vm_unlock(map->kctx); + kbase_gpu_vm_unlock_with_pmode_sync(map->kctx); kbase_mem_phy_alloc_put(map->alloc); kbase_file_dec_cpu_mapping_count(map->kctx->kfile); @@ -2746,7 +2777,7 @@ int kbase_context_mmap(struct kbase_context *const kctx, struct vm_area_struct * goto out; } - kbase_gpu_vm_lock(kctx); + kbase_gpu_vm_lock_with_pmode_sync(kctx); if (vma->vm_pgoff == PFN_DOWN(BASE_MEM_MAP_TRACKING_HANDLE)) { /* The non-mapped tracking helper page */ @@ -2781,11 +2812,11 @@ int kbase_context_mmap(struct kbase_context *const kctx, struct vm_area_struct * #endif /* defined(CONFIG_MALI_VECTOR_DUMP) */ #if MALI_USE_CSF case PFN_DOWN(BASEP_MEM_CSF_USER_REG_PAGE_HANDLE): - kbase_gpu_vm_unlock(kctx); + kbase_gpu_vm_unlock_with_pmode_sync(kctx); err = kbase_csf_cpu_mmap_user_reg_page(kctx, vma); goto out; case PFN_DOWN(BASEP_MEM_CSF_USER_IO_PAGES_HANDLE)... 
PFN_DOWN(BASE_MEM_COOKIE_BASE) - 1: { - kbase_gpu_vm_unlock(kctx); + kbase_gpu_vm_unlock_with_pmode_sync(kctx); mutex_lock(&kctx->csf.lock); err = kbase_csf_cpu_mmap_user_io_pages(kctx, vma); mutex_unlock(&kctx->csf.lock); @@ -2879,7 +2910,7 @@ int kbase_context_mmap(struct kbase_context *const kctx, struct vm_area_struct * } #endif /* defined(CONFIG_MALI_VECTOR_DUMP) */ out_unlock: - kbase_gpu_vm_unlock(kctx); + kbase_gpu_vm_unlock_with_pmode_sync(kctx); out: if (err) dev_err(dev, "mmap failed %d\n", err); diff --git a/mali_kbase/mali_kbase_mem_linux.h b/mali_kbase/mali_kbase_mem_linux.h index 2866603..037bdfe 100644 --- a/mali_kbase/mali_kbase_mem_linux.h +++ b/mali_kbase/mali_kbase_mem_linux.h @@ -57,6 +57,8 @@ struct kbase_va_region *kbase_mem_alloc(struct kbase_context *kctx, u64 va_pages /** * kbase_mem_query - Query properties of a GPU memory region * + * Note: Does not currently report the BASE_MEM_SAME_VA flag for any memory allocation. + * * @kctx: The kernel context * @gpu_addr: A GPU address contained within the memory region * @query: The type of query, from KBASE_MEM_QUERY_* flags, which could be diff --git a/mali_kbase/mali_kbase_mem_migrate.c b/mali_kbase/mali_kbase_mem_migrate.c index dbd340a..26ddeed 100644 --- a/mali_kbase/mali_kbase_mem_migrate.c +++ b/mali_kbase/mali_kbase_mem_migrate.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2022-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2022-2024 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -28,6 +28,9 @@ #include <mali_kbase_mem_migrate.h> #include <mmu/mali_kbase_mmu.h> +/* Static key used to determine if page migration is enabled or not */ +static DEFINE_STATIC_KEY_FALSE(page_migration_static_key); + /* Global integer used to determine if module parameter value has been * provided and if page migration feature is enabled. * Feature is disabled on all platforms by default. @@ -50,15 +53,6 @@ MODULE_PARM_DESC(kbase_page_migration_enabled, KBASE_EXPORT_TEST_API(kbase_page_migration_enabled); -bool kbase_is_page_migration_enabled(void) -{ - /* Handle uninitialised int case */ - if (kbase_page_migration_enabled < 0) - return false; - return IS_ENABLED(CONFIG_PAGE_MIGRATION_SUPPORT) && kbase_page_migration_enabled; -} -KBASE_EXPORT_SYMBOL(kbase_is_page_migration_enabled); - #if (KERNEL_VERSION(6, 0, 0) <= LINUX_VERSION_CODE) static const struct movable_operations movable_ops; #endif @@ -225,7 +219,7 @@ static int kbasep_migrate_page_pt_mapped(struct page *old_page, struct page *new * This blocks the CPU page fault handler from remapping pages. * Only MCU's mmut is device wide, i.e. no corresponding kctx. */ - kbase_gpu_vm_lock(kctx); + kbase_gpu_vm_lock_with_pmode_sync(kctx); ret = kbase_mmu_migrate_page( as_tagged(page_to_phys(old_page)), as_tagged(page_to_phys(new_page)), old_dma_addr, @@ -252,7 +246,7 @@ static int kbasep_migrate_page_pt_mapped(struct page *old_page, struct page *new dma_unmap_page(kbdev->dev, new_dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL); /* Page fault handler for CPU mapping unblocked. */ - kbase_gpu_vm_unlock(kctx); + kbase_gpu_vm_unlock_with_pmode_sync(kctx); return ret; } @@ -291,7 +285,7 @@ static int kbasep_migrate_page_allocated_mapped(struct page *old_page, struct pa /* Lock context to protect access to array of pages in physical allocation. 
* This blocks the CPU page fault handler from remapping pages. */ - kbase_gpu_vm_lock(kctx); + kbase_gpu_vm_lock_with_pmode_sync(kctx); /* Unmap the old physical range. */ unmap_mapping_range(kctx->kfile->filp->f_inode->i_mapping, @@ -328,7 +322,7 @@ static int kbasep_migrate_page_allocated_mapped(struct page *old_page, struct pa dma_unmap_page(kctx->kbdev->dev, new_dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL); /* Page fault handler for CPU mapping unblocked. */ - kbase_gpu_vm_unlock(kctx); + kbase_gpu_vm_unlock_with_pmode_sync(kctx); return ret; } @@ -679,11 +673,15 @@ void kbase_mem_migrate_init(struct kbase_device *kbdev) * integer for a negative value to see if insmod parameter was * passed in at all (it will override the default negative value). */ - if (kbase_page_migration_enabled < 0) - kbase_page_migration_enabled = kbdev->pagesize_2mb ? 1 : 0; - else + if (kbase_page_migration_enabled < 0) { + if (kbase_is_large_pages_enabled()) + static_branch_enable(&page_migration_static_key); + } else { dev_info(kbdev->dev, "Page migration support explicitly %s at insmod.", kbase_page_migration_enabled ? "enabled" : "disabled"); + if (kbase_page_migration_enabled) + static_branch_enable(&page_migration_static_key); + } spin_lock_init(&mem_migrate->free_pages_lock); INIT_LIST_HEAD(&mem_migrate->free_pages_list); @@ -708,3 +706,9 @@ void kbase_mem_migrate_term(struct kbase_device *kbdev) iput(mem_migrate->inode); #endif } + +bool kbase_is_page_migration_enabled(void) +{ + return static_branch_unlikely(&page_migration_static_key); +} +KBASE_EXPORT_TEST_API(kbase_is_page_migration_enabled); diff --git a/mali_kbase/mali_kbase_mem_migrate.h b/mali_kbase/mali_kbase_mem_migrate.h index ece8734..70c3135 100644 --- a/mali_kbase/mali_kbase_mem_migrate.h +++ b/mali_kbase/mali_kbase_mem_migrate.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2022-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2022-2024 ARM Limited. All rights reserved. 
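
The kbase_mem_migrate_init() change above keeps the insmod parameter as a tri-state: a negative value means "not set", in which case page migration defaults to whatever large-page support implies, while an explicit 0/1 overrides that before the static key is enabled. A small self-contained sketch of that decision, using illustrative example_* names rather than the driver's own symbols:

#include <linux/module.h>
#include <linux/jump_label.h>

/* -1 = not given on the insmod command line, 0 = force off, 1 = force on. */
static int example_migration_param = -1;
module_param(example_migration_param, int, 0444);
MODULE_PARM_DESC(example_migration_param, "Enable page migration (default follows large-page support)");

static DEFINE_STATIC_KEY_FALSE(example_migration_key);

static void example_migration_init(bool large_pages_supported)
{
        bool enable;

        /* Fall back to the platform default only when no override was given. */
        if (example_migration_param < 0)
                enable = large_pages_supported;
        else
                enable = example_migration_param != 0;

        if (enable)
                static_branch_enable(&example_migration_key);
}
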
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software diff --git a/mali_kbase/mali_kbase_mem_pool.c b/mali_kbase/mali_kbase_mem_pool.c index cb862d5..5984730 100644 --- a/mali_kbase/mali_kbase_mem_pool.c +++ b/mali_kbase/mali_kbase_mem_pool.c @@ -480,7 +480,7 @@ static unsigned long kbase_mem_pool_reclaim_count_objects(struct shrinker *s, CSTD_UNUSED(sc); - pool = container_of(s, struct kbase_mem_pool, reclaim); + pool = KBASE_GET_KBASE_DATA_FROM_SHRINKER(s, struct kbase_mem_pool, reclaim); kbase_mem_pool_lock(pool); if (pool->dont_reclaim && !pool->dying) { @@ -502,7 +502,7 @@ static unsigned long kbase_mem_pool_reclaim_scan_objects(struct shrinker *s, struct kbase_mem_pool *pool; unsigned long freed; - pool = container_of(s, struct kbase_mem_pool, reclaim); + pool = KBASE_GET_KBASE_DATA_FROM_SHRINKER(s, struct kbase_mem_pool, reclaim); kbase_mem_pool_lock(pool); if (pool->dont_reclaim && !pool->dying) { @@ -528,6 +528,8 @@ int kbase_mem_pool_init(struct kbase_mem_pool *pool, const struct kbase_mem_pool unsigned int order, int group_id, struct kbase_device *kbdev, struct kbase_mem_pool *next_pool) { + struct shrinker *reclaim; + if (WARN_ON(group_id < 0) || WARN_ON(group_id >= MEMORY_GROUP_MANAGER_NR_GROUPS)) { return -EINVAL; } @@ -544,18 +546,17 @@ int kbase_mem_pool_init(struct kbase_mem_pool *pool, const struct kbase_mem_pool spin_lock_init(&pool->pool_lock); INIT_LIST_HEAD(&pool->page_list); - pool->reclaim.count_objects = kbase_mem_pool_reclaim_count_objects; - pool->reclaim.scan_objects = kbase_mem_pool_reclaim_scan_objects; - pool->reclaim.seeks = DEFAULT_SEEKS; - /* Kernel versions prior to 3.1 : - * struct shrinker does not define batch - */ - pool->reclaim.batch = 0; -#if KERNEL_VERSION(6, 0, 0) > LINUX_VERSION_CODE - register_shrinker(&pool->reclaim); -#else - register_shrinker(&pool->reclaim, "mali-mem-pool"); -#endif + reclaim = KBASE_INIT_RECLAIM(pool, reclaim, "mali-mem-pool"); + if (!reclaim) + return -ENOMEM; + KBASE_SET_RECLAIM(pool, reclaim, reclaim); + + reclaim->count_objects = kbase_mem_pool_reclaim_count_objects; + reclaim->scan_objects = kbase_mem_pool_reclaim_scan_objects; + reclaim->seeks = DEFAULT_SEEKS; + reclaim->batch = 0; + + KBASE_REGISTER_SHRINKER(reclaim, "mali-mem-pool", pool); pool_dbg(pool, "initialized\n"); @@ -581,7 +582,7 @@ void kbase_mem_pool_term(struct kbase_mem_pool *pool) pool_dbg(pool, "terminate()\n"); - unregister_shrinker(&pool->reclaim); + KBASE_UNREGISTER_SHRINKER(pool->reclaim); kbase_mem_pool_lock(pool); pool->max_size = 0; diff --git a/mali_kbase/mali_kbase_native_mgm.c b/mali_kbase/mali_kbase_native_mgm.c index 5e3d1ee..d688509 100644 --- a/mali_kbase/mali_kbase_native_mgm.c +++ b/mali_kbase/mali_kbase_native_mgm.c @@ -121,44 +121,20 @@ static vm_fault_t kbase_native_mgm_vmf_insert_pfn_prot(struct memory_group_manag return vmf_insert_pfn_prot(vma, addr, pfn, pgprot); } -/** - * kbase_native_mgm_update_gpu_pte - Native method to modify a GPU page table - * entry - * - * @mgm_dev: The memory group manager the request is being made through. - * @group_id: A physical memory group ID, which must be valid but is not used. - * Its valid range is 0 .. MEMORY_GROUP_MANAGER_NR_GROUPS-1. - * @mmu_level: The level of the MMU page table where the page is getting mapped. - * @pte: The prepared page table entry. - * - * This function simply returns the @pte without modification. - * - * Return: A GPU page table entry to be stored in a page table. 
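
The mem-pool shrinker rework above drops the direct register_shrinker() calls in favour of the KBASE_* compatibility wrappers, because from roughly kernel 6.7 the shrinker is heap-allocated and registered in two steps. A hedged sketch of what that newer upstream flow looks like on its own (older kernels keep the embedded struct shrinker and register_shrinker(); all example_* names are illustrative):

#include <linux/shrinker.h>
#include <linux/errno.h>

struct example_pool {
        struct shrinker *reclaim;
        /* ... pool state ... */
};

static unsigned long example_count(struct shrinker *s, struct shrink_control *sc)
{
        struct example_pool *pool = s->private_data;

        /* Return the number of reclaimable objects held by @pool. */
        return 0;
}

static unsigned long example_scan(struct shrinker *s, struct shrink_control *sc)
{
        /* Nothing freed in this sketch. */
        return SHRINK_STOP;
}

static int example_pool_register(struct example_pool *pool)
{
        pool->reclaim = shrinker_alloc(0, "example-pool");
        if (!pool->reclaim)
                return -ENOMEM;

        pool->reclaim->count_objects = example_count;
        pool->reclaim->scan_objects = example_scan;
        pool->reclaim->seeks = DEFAULT_SEEKS;
        pool->reclaim->private_data = pool;
        shrinker_register(pool->reclaim);
        return 0;
}

static void example_pool_unregister(struct example_pool *pool)
{
        shrinker_free(pool->reclaim);
}
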
- */ static u64 kbase_native_mgm_update_gpu_pte(struct memory_group_manager_device *mgm_dev, unsigned int group_id, int mmu_level, u64 pte) { - CSTD_UNUSED(mgm_dev); - CSTD_UNUSED(group_id); - CSTD_UNUSED(mmu_level); + if (WARN_ON(group_id >= MEMORY_GROUP_MANAGER_NR_GROUPS)) + return pte; + + pte |= ((u64)group_id << PTE_PBHA_SHIFT) & PTE_PBHA_MASK; + + /* Address could be translated into a different bus address here */ + pte |= ((u64)1 << PTE_RES_BIT_MULTI_AS_SHIFT); return pte; } -/** - * kbase_native_mgm_pte_to_original_pte - Native method to undo changes done in - * kbase_native_mgm_update_gpu_pte() - * - * @mgm_dev: The memory group manager the request is being made through. - * @group_id: A physical memory group ID, which must be valid but is not used. - * Its valid range is 0 .. MEMORY_GROUP_MANAGER_NR_GROUPS-1. - * @mmu_level: The level of the MMU page table where the page is getting mapped. - * @pte: The prepared page table entry. - * - * This function simply returns the @pte without modification. - * - * Return: A GPU page table entry to be stored in a page table. - */ static u64 kbase_native_mgm_pte_to_original_pte(struct memory_group_manager_device *mgm_dev, unsigned int group_id, int mmu_level, u64 pte) { @@ -166,6 +142,11 @@ static u64 kbase_native_mgm_pte_to_original_pte(struct memory_group_manager_devi CSTD_UNUSED(group_id); CSTD_UNUSED(mmu_level); + /* Undo the group ID modification */ + pte &= ~PTE_PBHA_MASK; + /* Undo the bit set */ + pte &= ~((u64)1 << PTE_RES_BIT_MULTI_AS_SHIFT); + return pte; } diff --git a/mali_kbase/mali_kbase_pbha.c b/mali_kbase/mali_kbase_pbha.c index 341ea90..c5b6fad 100644 --- a/mali_kbase/mali_kbase_pbha.c +++ b/mali_kbase/mali_kbase_pbha.c @@ -277,16 +277,16 @@ static int kbase_pbha_read_int_id_override_property(struct kbase_device *kbdev, static int kbase_pbha_read_propagate_bits_property(struct kbase_device *kbdev, const struct device_node *pbha_node) { - u32 bits = 0; + u8 bits = 0; int err; if (!kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_PBHA_HWU)) return 0; - err = of_property_read_u32(pbha_node, "propagate-bits", &bits); + err = of_property_read_u8(pbha_node, "propagate-bits", &bits); if (err == -EINVAL) { - err = of_property_read_u32(pbha_node, "propagate_bits", &bits); + err = of_property_read_u8(pbha_node, "propagate_bits", &bits); } if (err < 0) { diff --git a/mali_kbase/mali_kbase_pbha_debugfs.c b/mali_kbase/mali_kbase_pbha_debugfs.c index f1d2794..8ab0d18 100644 --- a/mali_kbase/mali_kbase_pbha_debugfs.c +++ b/mali_kbase/mali_kbase_pbha_debugfs.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2021-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2021-2024 ARM Limited. All rights reserved. 
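
The native memory-group manager above now encodes the group ID into the PBHA field of the page-table entry and sets the reserved multi-AS bit, with the inverse hook stripping both again. The same packing expressed as a stand-alone pair of helpers; the helper names are illustrative, the masks are the ones referenced by the patch and assumed to come from the memory_group_manager.h header:

#include <linux/types.h>
#include <linux/memory_group_manager.h>

/* Pack a memory-group ID into the PBHA bits of a GPU PTE and mark the
 * reserved bit, mirroring the update_gpu_pte hook above.
 */
static inline u64 example_pte_pack_group(u64 pte, unsigned int group_id)
{
        pte |= ((u64)group_id << PTE_PBHA_SHIFT) & PTE_PBHA_MASK;
        pte |= (u64)1 << PTE_RES_BIT_MULTI_AS_SHIFT;
        return pte;
}

/* Inverse of the above, as done by the pte_to_original_pte hook. */
static inline u64 example_pte_unpack_group(u64 pte)
{
        pte &= ~PTE_PBHA_MASK;
        pte &= ~((u64)1 << PTE_RES_BIT_MULTI_AS_SHIFT);
        return pte;
}
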
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -50,8 +50,8 @@ static int int_id_overrides_show(struct seq_file *sfile, void *data) #endif /* MALI_USE_CSF */ for (j = 0; j < sizeof(u32); ++j) { - u8 r_val; - u8 w_val; + u8 r_val = 0; + u8 w_val = 0; switch (j) { case 0: diff --git a/mali_kbase/mali_kbase_pm.c b/mali_kbase/mali_kbase_pm.c index ff71524..c17e302 100644 --- a/mali_kbase/mali_kbase_pm.c +++ b/mali_kbase/mali_kbase_pm.c @@ -200,19 +200,24 @@ int kbase_pm_driver_suspend(struct kbase_device *kbdev) mutex_unlock(&kbdev->pm.lock); #ifdef CONFIG_MALI_ARBITER_SUPPORT -#if !MALI_USE_CSF if (kbdev->arb.arb_if) { - unsigned int i; unsigned long flags; +#if MALI_USE_CSF + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + kbase_disjoint_state_up(kbdev); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); +#else + unsigned int i; + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); kbdev->js_data.runpool_irq.submit_allowed = 0; kbase_disjoint_state_up(kbdev); for (i = 0; i < kbdev->gpu_props.num_job_slots; i++) kbase_job_slot_softstop(kbdev, i, NULL); spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); +#endif } -#endif /* !MALI_USE_CSF */ #endif /* CONFIG_MALI_ARBITER_SUPPORT */ /* From now on, the active count will drop towards zero. Sometimes, diff --git a/mali_kbase/mmu/mali_kbase_mmu.c b/mali_kbase/mmu/mali_kbase_mmu.c index cc4de07..2783e04 100644 --- a/mali_kbase/mmu/mali_kbase_mmu.c +++ b/mali_kbase/mmu/mali_kbase_mmu.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2010-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2010-2024 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -776,7 +776,7 @@ static bool page_fault_try_alloc(struct kbase_context *kctx, struct kbase_va_reg return false; } - if (kctx->kbdev->pagesize_2mb && new_pages >= NUM_PAGES_IN_2MB_LARGE_PAGE) { + if (kbase_is_large_pages_enabled() && new_pages >= NUM_PAGES_IN_2MB_LARGE_PAGE) { root_pool = &kctx->mem_pools.large[region->gpu_alloc->group_id]; *grow_2mb_pool = true; } else { @@ -923,7 +923,7 @@ void kbase_mmu_page_fault_worker(struct work_struct *data) int err; bool grown = false; size_t pages_to_grow; - bool grow_2mb_pool; + bool grow_2mb_pool = false; struct kbase_sub_alloc *prealloc_sas[2] = { NULL, NULL }; int i; size_t current_backed_size; @@ -1093,7 +1093,7 @@ void kbase_mmu_page_fault_worker(struct work_struct *data) } page_fault_retry: - if (kbdev->pagesize_2mb) { + if (kbase_is_large_pages_enabled()) { /* Preallocate (or re-allocate) memory for the sub-allocation structs if necessary */ for (i = 0; i != ARRAY_SIZE(prealloc_sas); ++i) { if (!prealloc_sas[i]) { @@ -1180,10 +1180,14 @@ page_fault_retry: */ op_param.mmu_sync_info = mmu_sync_info; op_param.kctx_id = kctx->id; - /* Can safely skip the invalidate for all levels in case - * of duplicate page faults. + /* Usually it is safe to skip the MMU cache invalidate for all levels + * in case of duplicate page faults. But for the pathological scenario + * where the faulty VA gets mapped by the time page fault worker runs it + * becomes imperative to invalidate MMU cache for all levels, otherwise + * there is a possibility of repeated page faults on GPUs which supports + * fine grained MMU cache invalidation. 
*/ - op_param.flush_skip_levels = 0xF; + op_param.flush_skip_levels = 0x0; op_param.vpfn = fault_pfn; op_param.nr = 1; spin_lock_irqsave(&kbdev->hwaccess_lock, hwaccess_flags); @@ -1217,10 +1221,14 @@ page_fault_retry: /* See comment [1] about UNLOCK usage */ op_param.mmu_sync_info = mmu_sync_info; op_param.kctx_id = kctx->id; - /* Can safely skip the invalidate for all levels in case - * of duplicate page faults. + /* Usually it is safe to skip the MMU cache invalidate for all levels + * in case of duplicate page faults. But for the pathological scenario + * where the faulty VA gets mapped by the time page fault worker runs it + * becomes imperative to invalidate MMU cache for all levels, otherwise + * there is a possibility of repeated page faults on GPUs which supports + * fine grained MMU cache invalidation. */ - op_param.flush_skip_levels = 0xF; + op_param.flush_skip_levels = 0x0; op_param.vpfn = fault_pfn; op_param.nr = 1; spin_lock_irqsave(&kbdev->hwaccess_lock, hwaccess_flags); @@ -1382,7 +1390,7 @@ page_fault_retry: * Otherwise fail the allocation. */ if (pages_to_grow > 0) { - if (kbdev->pagesize_2mb && grow_2mb_pool) { + if (kbase_is_large_pages_enabled() && grow_2mb_pool) { /* Round page requirement up to nearest 2 MB */ struct kbase_mem_pool *const lp_mem_pool = &kctx->mem_pools.large[group_id]; @@ -1595,6 +1603,7 @@ static int mmu_get_lowest_valid_pgd(struct kbase_device *kbdev, struct kbase_mmu return err; } +KBASE_ALLOW_ERROR_INJECTION_TEST_API(mmu_get_lowest_valid_pgd, ERRNO); /* * On success, sets out_pgd to the PGD for the specified level of translation @@ -1700,8 +1709,16 @@ static void mmu_insert_pages_failure_recovery(struct kbase_device *kbdev, mmu_mode->entries_invalidate(&page[idx], pcount); if (!num_of_valid_entries) { + mmu_mode->set_num_valid_entries(page, 0); + kbase_kunmap(p, page); + /* No CPU and GPU cache maintenance is done here as caller would do the + * complete flush of GPU cache and invalidation of TLB before the PGD + * page is freed. CPU cache flush would be done when the PGD page is + * returned to the memory pool. + */ + kbase_mmu_add_to_free_pgds_list(mmut, p); kbase_mmu_update_and_free_parent_pgds(kbdev, mmut, pgds, vpfn, level, @@ -1728,7 +1745,8 @@ next: * going to happen to these pages at this stage. They might return * movable once they are returned to a memory pool. 
*/ - if (kbase_is_page_migration_enabled() && !ignore_page_migration && phys) { + if (kbase_is_page_migration_enabled() && !ignore_page_migration && phys && + !is_huge(*phys) && !is_partial(*phys)) { const u64 num_pages = (to_vpfn - from_vpfn) / GPU_PAGES_PER_CPU_PAGE; u64 i; @@ -2525,6 +2543,7 @@ int kbase_mmu_insert_pages(struct kbase_device *kbdev, struct kbase_mmu_table *m } KBASE_EXPORT_TEST_API(kbase_mmu_insert_pages); +KBASE_ALLOW_ERROR_INJECTION_TEST_API(kbase_mmu_insert_pages, ERRNO); int kbase_mmu_insert_pages_skip_status_update(struct kbase_device *kbdev, struct kbase_mmu_table *mmut, u64 vpfn, @@ -2582,6 +2601,7 @@ int kbase_mmu_insert_aliased_pages(struct kbase_device *kbdev, struct kbase_mmu_ return 0; } +KBASE_ALLOW_ERROR_INJECTION_TEST_API(kbase_mmu_insert_aliased_pages, ERRNO); void kbase_mmu_update(struct kbase_device *kbdev, struct kbase_mmu_table *mmut, int as_nr) { @@ -2720,15 +2740,25 @@ static void kbase_mmu_update_and_free_parent_pgds(struct kbase_device *kbdev, kbdev->mmu_mode->entries_invalidate(¤t_page[index], 1); if (current_valid_entries == 1 && current_level != MIDGARD_MMU_LEVEL(0)) { + kbdev->mmu_mode->set_num_valid_entries(current_page, 0); + kbase_kunmap(p, current_page); - /* Ensure the cacheline containing the last valid entry - * of PGD is invalidated from the GPU cache, before the - * PGD page is freed. - */ - kbase_mmu_sync_pgd_gpu(kbdev, mmut->kctx, - current_pgd + (index * sizeof(u64)), sizeof(u64), - flush_op); + /* Check if fine grained GPU cache maintenance is being used */ + if (flush_op == KBASE_MMU_OP_FLUSH_PT) { + /* Ensure the invalidated PTE is visible in memory right away */ + kbase_mmu_sync_pgd_cpu(kbdev, + kbase_dma_addr(p) + (index * sizeof(u64)), + sizeof(u64)); + /* Invalidate the GPU cache for the whole PGD page and not just for + * the cacheline containing the invalidated PTE, as the PGD page is + * going to be freed. There is an extremely remote possibility that + * other cachelines (containing all invalid PTEs) of PGD page are + * also present in the GPU cache. 
+ */ + kbase_mmu_sync_pgd_gpu(kbdev, mmut->kctx, current_pgd, + 512 * sizeof(u64), KBASE_MMU_OP_FLUSH_PT); + } kbase_mmu_add_to_free_pgds_list(mmut, p); } else { @@ -2832,8 +2862,7 @@ static int kbase_mmu_teardown_pgd_pages(struct kbase_device *kbdev, struct kbase phys_addr_t pgd = mmut->pgd; struct page *p = phys_to_page(pgd); - if (count > nr) - count = nr; + count = MIN(nr, count); /* need to check if this is a 2MB page or a small page */ for (level = MIDGARD_MMU_TOPLEVEL; level <= MIDGARD_MMU_BOTTOMLEVEL; level++) { @@ -2844,23 +2873,12 @@ static int kbase_mmu_teardown_pgd_pages(struct kbase_device *kbdev, struct kbase if (mmu_mode->ate_is_valid(page[index], level)) break; /* keep the mapping */ else if (!mmu_mode->pte_is_valid(page[index], level)) { - /* nothing here, advance */ - switch (level) { - case MIDGARD_MMU_LEVEL(0): - count = 134217728; - break; - case MIDGARD_MMU_LEVEL(1): - count = 262144; - break; - case MIDGARD_MMU_LEVEL(2): - count = 512; - break; - case MIDGARD_MMU_LEVEL(3): - count = 1; - break; - } - if (count > nr) - count = nr; + dev_warn(kbdev->dev, "Invalid PTE found @ level %d for VA %llx", + level, vpfn << PAGE_SHIFT); + /* nothing here, advance to the next PTE of the current level */ + count = (1 << ((3 - level) * 9)); + count -= (vpfn & (count - 1)); + count = MIN(nr, count); goto next; } next_pgd = mmu_mode->pte_to_phy_addr( @@ -2915,14 +2933,25 @@ static int kbase_mmu_teardown_pgd_pages(struct kbase_device *kbdev, struct kbase mmu_mode->entries_invalidate(&page[index], pcount); if (!num_of_valid_entries) { + mmu_mode->set_num_valid_entries(page, 0); + kbase_kunmap(p, page); - /* Ensure the cacheline(s) containing the last valid entries - * of PGD is invalidated from the GPU cache, before the - * PGD page is freed. - */ - kbase_mmu_sync_pgd_gpu(kbdev, mmut->kctx, pgd + (index * sizeof(u64)), - pcount * sizeof(u64), flush_op); + /* Check if fine grained GPU cache maintenance is being used */ + if (flush_op == KBASE_MMU_OP_FLUSH_PT) { + /* Ensure the invalidated ATEs are visible in memory right away */ + kbase_mmu_sync_pgd_cpu(kbdev, + kbase_dma_addr(p) + (index * sizeof(u64)), + pcount * sizeof(u64)); + /* Invalidate the GPU cache for the whole PGD page and not just for + * the cachelines containing the invalidated ATEs, as the PGD page + * is going to be freed. There is an extremely remote possibility + * that other cachelines (containing all invalid ATEs) of PGD page + * are also present in the GPU cache. 
+ */ + kbase_mmu_sync_pgd_gpu(kbdev, mmut->kctx, pgd, 512 * sizeof(u64), + KBASE_MMU_OP_FLUSH_PT); + } kbase_mmu_add_to_free_pgds_list(mmut, p); @@ -3069,6 +3098,7 @@ int kbase_mmu_teardown_pages(struct kbase_device *kbdev, struct kbase_mmu_table return mmu_teardown_pages(kbdev, mmut, vpfn, phys, nr_phys_pages, nr_virt_pages, as_nr, false); } +KBASE_EXPORT_TEST_API(kbase_mmu_teardown_pages); int kbase_mmu_teardown_imported_pages(struct kbase_device *kbdev, struct kbase_mmu_table *mmut, u64 vpfn, struct tagged_addr *phys, size_t nr_phys_pages, diff --git a/mali_kbase/mmu/mali_kbase_mmu_mode_aarch64.c b/mali_kbase/mmu/mali_kbase_mmu_mode_aarch64.c index d19579d..e3ad78d 100644 --- a/mali_kbase/mmu/mali_kbase_mmu_mode_aarch64.c +++ b/mali_kbase/mmu/mali_kbase_mmu_mode_aarch64.c @@ -32,7 +32,7 @@ */ #define ENTRY_IS_ATE_L3 3ULL #define ENTRY_IS_ATE_L02 1ULL -#define ENTRY_IS_INVAL 2ULL +#define ENTRY_IS_INVAL 0ULL #define ENTRY_IS_PTE 3ULL #define ENTRY_ACCESS_RW (1ULL << 6) /* bits 6:7 */ diff --git a/mali_kbase/platform/devicetree/mali_kbase_runtime_pm.c b/mali_kbase/platform/devicetree/mali_kbase_runtime_pm.c index 7c92505..9e3f789 100644 --- a/mali_kbase/platform/devicetree/mali_kbase_runtime_pm.c +++ b/mali_kbase/platform/devicetree/mali_kbase_runtime_pm.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2015-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2015-2024 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software diff --git a/mali_kbase/tests/include/kutf/kutf_kprobe.h b/mali_kbase/tests/include/kutf/kutf_kprobe.h index f75cd77..cdcaa46 100644 --- a/mali_kbase/tests/include/kutf/kutf_kprobe.h +++ b/mali_kbase/tests/include/kutf/kutf_kprobe.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2023-2024 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -22,6 +22,8 @@ #ifndef _KUTF_KPROBE_H_ #define _KUTF_KPROBE_H_ +struct dentry; + int kutf_kprobe_init(struct dentry *base_dir); void kutf_kprobe_exit(void); diff --git a/mali_kbase/thirdparty/mali_kbase_mmap.c b/mali_kbase/thirdparty/mali_kbase_mmap.c index 1592eab..9fad54d 100644 --- a/mali_kbase/thirdparty/mali_kbase_mmap.c +++ b/mali_kbase/thirdparty/mali_kbase_mmap.c @@ -20,18 +20,84 @@ * kbase_context_get_unmapped_area() interface. */ +#if (KERNEL_VERSION(6, 1, 0) <= LINUX_VERSION_CODE) +/** + * move_mt_gap() - Search the maple tree for an existing gap of a particular size + * immediately before another pre-identified gap. + * @gap_start: Pre-identified gap starting address. + * @gap_end: Pre-identified gap ending address. + * @size: Size of the new gap needed before gap_start. + * + * This function will search the calling process' maple tree + * for another gap, one that is immediately preceding the pre-identified + * gap, for a specific size, and upon success it will decrement gap_end + * by the specified size, and replace gap_start with the new gap_start of + * the newly identified gap. + * + * Return: true if large enough preceding gap is found, false otherwise. 
+ */ +static bool move_mt_gap(unsigned long *gap_start, unsigned long *gap_end, unsigned long size) +{ + unsigned long new_gap_start, new_gap_end; + + MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); + + if (*gap_end < size) + return false; + + /* Calculate the gap end for the new, resultant gap */ + new_gap_end = *gap_end - size; + + /* If the new gap_end (i.e. new VA start address) is larger than gap_start, than the + * pre-identified gap already has space to shrink to accommodate the decrease in + * gap_end. + */ + if (new_gap_end >= *gap_start) { + /* Pre-identified gap already has space - just patch gap_end to new + * lower value and exit. + */ + *gap_end = new_gap_end; + return true; + } + + /* Since the new VA start address (new_gap_end) is below the start of the pre-identified + * gap in the maple tree, see if there is a free gap directly before the existing gap, of + * the same size as the alignment shift, such that the effective gap found is "extended". + * This may be larger than needed but leaves the same distance between gap_end and gap_start + * that currently exists. + */ + new_gap_start = *gap_start - size; + if (mas_empty_area_rev(&mas, new_gap_start, *gap_start - 1, size)) { + /* There's no gap between the new start address needed and the + * current start address - so return false to find a new + * gap from the maple tree. + */ + return false; + } + /* Suitable gap found - replace gap_start and gap_end with new values. gap_start takes the + * value of the start of new gap found, which now correctly precedes gap_end, and gap_end + * takes on the new aligned value that has now been decremented by the requested size. + */ + *gap_start = mas.index; + *gap_end = new_gap_end; + return true; +} + /** * align_and_check() - Align the specified pointer to the provided alignment and - * check that it is still in range. - * @gap_end: Highest possible start address for allocation (end of gap in - * address space) - * @gap_start: Start address of current memory area / gap in address space - * @info: vm_unmapped_area_info structure passed to caller, containing - * alignment, length and limits for the allocation - * @is_shader_code: True if the allocation is for shader code (which has - * additional alignment requirements) - * @is_same_4gb_page: True if the allocation needs to reside completely within - * a 4GB chunk + * check that it is still in range. On kernel 6.1 onwards + * this function does not require that the initial requested + * gap is extended with the maximum size needed to guarantee + * an alignment. 
+ * @gap_end: Highest possible start address for allocation (end of gap in + * address space) + * @gap_start: Start address of current memory area / gap in address space + * @info: vm_unmapped_area_info structure passed to caller, containing + * alignment, length and limits for the allocation + * @is_shader_code: True if the allocation is for shader code (which has + * additional alignment requirements) + * @is_same_4gb_page: True if the allocation needs to reside completely within + * a 4GB chunk * * Return: true if gap_end is now aligned correctly and is still in range, * false otherwise @@ -40,9 +106,94 @@ static bool align_and_check(unsigned long *gap_end, unsigned long gap_start, struct vm_unmapped_area_info *info, bool is_shader_code, bool is_same_4gb_page) { + unsigned long alignment_shift; + /* Compute highest gap address at the desired alignment */ - (*gap_end) -= info->length; - (*gap_end) -= (*gap_end - info->align_offset) & info->align_mask; + *gap_end -= info->length; + alignment_shift = (*gap_end - info->align_offset) & info->align_mask; + + /* Align desired start VA (gap_end) by calculated alignment shift amount */ + if (!move_mt_gap(&gap_start, gap_end, alignment_shift)) + return false; + /* Alignment is done so far - check for further alignment requirements */ + + if (is_shader_code) { + /* Shader code allocations must not start or end on a 4GB boundary */ + alignment_shift = info->align_offset ? info->align_offset : info->length; + if (0 == (*gap_end & BASE_MEM_MASK_4GB)) { + if (!move_mt_gap(&gap_start, gap_end, alignment_shift)) + return false; + } + if (0 == ((*gap_end + info->length) & BASE_MEM_MASK_4GB)) { + if (!move_mt_gap(&gap_start, gap_end, alignment_shift)) + return false; + } + + if (!(*gap_end & BASE_MEM_MASK_4GB) || + !((*gap_end + info->length) & BASE_MEM_MASK_4GB)) + return false; + } else if (is_same_4gb_page) { + unsigned long start = *gap_end; + unsigned long end = *gap_end + info->length; + unsigned long mask = ~((unsigned long)U32_MAX); + + /* Check if 4GB boundary is straddled */ + if ((start & mask) != ((end - 1) & mask)) { + unsigned long offset = end - (end & mask); + /* This is to ensure that alignment doesn't get + * disturbed in an attempt to prevent straddling at + * 4GB boundary. The GPU VA is aligned to 2MB when the + * allocation size is > 2MB and there is enough CPU & + * GPU virtual space. + */ + unsigned long rounded_offset = ALIGN(offset, info->align_mask + 1); + + if (!move_mt_gap(&gap_start, gap_end, rounded_offset)) + return false; + /* Re-calculate start and end values */ + start = *gap_end; + end = *gap_end + info->length; + + /* The preceding 4GB boundary shall not get straddled, + * even after accounting for the alignment, as the + * size of allocation is limited to 4GB and the initial + * start location was already aligned. + */ + WARN_ON((start & mask) != ((end - 1) & mask)); + } + } + + if ((*gap_end < info->low_limit) || (*gap_end < gap_start)) + return false; + + return true; +} +#else +/** + * align_and_check() - Align the specified pointer to the provided alignment and + * check that it is still in range. For Kernel versions below + * 6.1, it requires that the length of the alignment is already + * extended by a worst-case alignment mask. 
+ * @gap_end: Highest possible start address for allocation (end of gap in + * address space) + * @gap_start: Start address of current memory area / gap in address space + * @info: vm_unmapped_area_info structure passed to caller, containing + * alignment, length and limits for the allocation + * @is_shader_code: True if the allocation is for shader code (which has + * additional alignment requirements) + * @is_same_4gb_page: True if the allocation needs to reside completely within + * a 4GB chunk + * + * Return: true if gap_end is now aligned correctly and is still in range, + * false otherwise + */ +static bool align_and_check(unsigned long *gap_end, unsigned long gap_start, + struct vm_unmapped_area_info *info, bool is_shader_code, + bool is_same_4gb_page) +{ + /* Compute highest gap address at the desired alignment */ + *gap_end -= info->length; + *gap_end -= (*gap_end - info->align_offset) & info->align_mask; if (is_shader_code) { /* Check for 4GB boundary */ @@ -73,6 +224,7 @@ static bool align_and_check(unsigned long *gap_end, unsigned long gap_start, start -= rounded_offset; end -= rounded_offset; + /* Patch gap_end to use new starting address for VA region */ *gap_end = start; /* The preceding 4GB boundary shall not get straddled, @@ -89,6 +241,7 @@ static bool align_and_check(unsigned long *gap_end, unsigned long gap_start, return true; } +#endif /** * kbase_unmapped_area_topdown() - allocates new areas top-down from @@ -218,31 +371,27 @@ check_current: } } #else - unsigned long length, high_limit, gap_start, gap_end; + unsigned long high_limit, gap_start, gap_end; MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); - /* Adjust search length to account for worst case alignment overhead */ - length = info->length + info->align_mask; - if (length < info->length) - return -ENOMEM; /* * Adjust search limits by the desired length. * See implementation comment at top of unmapped_area(). */ gap_end = info->high_limit; - if (gap_end < length) + if (gap_end < info->length) return -ENOMEM; - high_limit = gap_end - length; + high_limit = gap_end - info->length; if (info->low_limit > high_limit) return -ENOMEM; while (true) { - if (mas_empty_area_rev(&mas, info->low_limit, info->high_limit - 1, length)) + if (mas_empty_area_rev(&mas, info->low_limit, info->high_limit - 1, info->length)) return -ENOMEM; gap_end = mas.last + 1; - gap_start = mas.min; + gap_start = mas.index; if (align_and_check(&gap_end, gap_start, info, is_shader_code, is_same_4gb_page)) return gap_end; diff --git a/mali_kbase/tl/mali_kbase_tracepoints.c b/mali_kbase/tl/mali_kbase_tracepoints.c index 7427358..34cabbd 100644 --- a/mali_kbase/tl/mali_kbase_tracepoints.c +++ b/mali_kbase/tl/mali_kbase_tracepoints.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2010-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2010-2024 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software diff --git a/mali_kbase/tl/mali_kbase_tracepoints.h b/mali_kbase/tl/mali_kbase_tracepoints.h index f5b5b39..dd23f97 100644 --- a/mali_kbase/tl/mali_kbase_tracepoints.h +++ b/mali_kbase/tl/mali_kbase_tracepoints.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2010-2023 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2010-2024 ARM Limited. All rights reserved. 
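
The kbase_unmapped_area_topdown() changes above stop padding the search length with the worst-case alignment mask and instead let align_and_check()/move_mt_gap() borrow space from a neighbouring maple-tree gap. The underlying primitive is mas_empty_area_rev(); below is a minimal sketch of a plain top-down gap lookup with it, assuming kernel >= 6.1, mmap_lock held by the caller, and none of the retry or alignment handling of the real code:

#include <linux/maple_tree.h>
#include <linux/mm_types.h>
#include <linux/errno.h>

/* Return the highest base address for a @length-byte mapping within
 * [low_limit, high_limit), or -ENOMEM if no gap is large enough.
 */
static unsigned long example_topdown_gap(struct mm_struct *mm, unsigned long low_limit,
                                         unsigned long high_limit, unsigned long length)
{
        MA_STATE(mas, &mm->mm_mt, 0, 0);

        if (high_limit < length || low_limit > high_limit - length)
                return -ENOMEM;

        /* Non-zero return means no free range of @length bytes was found. */
        if (mas_empty_area_rev(&mas, low_limit, high_limit - 1, length))
                return -ENOMEM;

        /* mas.index..mas.last is the gap; place the mapping at its top. */
        return (mas.last + 1) - length;
}
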
*
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
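
Taken together, the *_with_pmode_sync() wrappers and the down_read_trylock() in the reclaim path follow a standard rw-semaphore pattern: memory-management paths take the CSF pmode_sync_sem for read so that, by implication from the "P.Mode entrance is in progress" warning above, whatever serialises protected-mode entry can take it for write and exclude them all at once. A generic sketch of that pattern, with example_* names standing in for the driver's own:

#include <linux/rwsem.h>

static DECLARE_RWSEM(example_pmode_sync_sem);

/* Many concurrent memory operations may hold the read side together. */
static void example_mem_path(void)
{
        down_read(&example_pmode_sync_sem);
        /* ... walk or modify GPU memory state ... */
        up_read(&example_pmode_sync_sem);
}

/* A reclaim-style path can refuse to wait rather than stall a pending writer. */
static bool example_try_mem_path(void)
{
        if (!down_read_trylock(&example_pmode_sync_sem))
                return false;
        /* ... best-effort work ... */
        up_read(&example_pmode_sync_sem);
        return true;
}

/* The exclusive phase (protected-mode entry, by implication) blocks until
 * every reader has dropped the semaphore.
 */
static void example_exclusive_phase(void)
{
        down_write(&example_pmode_sync_sem);
        /* ... state change that must not race with the readers ... */
        up_write(&example_pmode_sync_sem);
}
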